493 files changed, 97010 insertions, 404 deletions
diff --git a/core/input/input_event.h b/core/input/input_event.h
index a1e7df5969..94aa68db33 100644
--- a/core/input/input_event.h
+++ b/core/input/input_event.h
@@ -33,7 +33,6 @@
 
 #include "core/io/resource.h"
 #include "core/math/transform_2d.h"
-#include "core/os/copymem.h"
 #include "core/string/ustring.h"
 #include "core/typedefs.h"
 
diff --git a/core/io/compression.cpp b/core/io/compression.cpp
index 980234cbfc..6de626db99 100644
--- a/core/io/compression.cpp
+++ b/core/io/compression.cpp
@@ -32,7 +32,6 @@
 
 #include "core/config/project_settings.h"
 #include "core/io/zip_io.h"
-#include "core/os/copymem.h"
 
 #include "thirdparty/misc/fastlz.h"
 
@@ -44,8 +43,8 @@ int Compression::compress(uint8_t *p_dst, const uint8_t *p_src, int p_src_size,
 		case MODE_FASTLZ: {
 			if (p_src_size < 16) {
 				uint8_t src[16];
-				zeromem(&src[p_src_size], 16 - p_src_size);
-				copymem(src, p_src, p_src_size);
+				memset(&src[p_src_size], 0, 16 - p_src_size);
+				memcpy(src, p_src, p_src_size);
 				return fastlz_compress(src, 16, p_dst);
 			} else {
 				return fastlz_compress(p_src, p_src_size, p_dst);
@@ -136,7 +135,7 @@ int Compression::decompress(uint8_t *p_dst, int p_dst_max_size, const uint8_t *p
 			if (p_dst_max_size < 16) {
 				uint8_t dst[16];
 				ret_size = fastlz_decompress(p_src, p_src_size, dst, 16);
-				copymem(p_dst, dst, p_dst_max_size);
+				memcpy(p_dst, dst, p_dst_max_size);
 			} else {
 				ret_size = fastlz_decompress(p_src, p_src_size, p_dst, p_dst_max_size);
 			}
diff --git a/core/io/file_access_encrypted.cpp b/core/io/file_access_encrypted.cpp
index 8ace897f18..13377a3a25 100644
--- a/core/io/file_access_encrypted.cpp
+++ b/core/io/file_access_encrypted.cpp
@@ -31,7 +31,6 @@
 #include "file_access_encrypted.h"
 
 #include "core/crypto/crypto_core.h"
-#include "core/os/copymem.h"
 #include "core/string/print_string.h"
 #include "core/variant/variant.h"
 
@@ -151,7 +150,7 @@ void FileAccessEncrypted::_release() {
 		ERR_FAIL_COND(CryptoCore::md5(data.ptr(), data.size(), hash) != OK); // Bug?
 
 		compressed.resize(len);
-		zeromem(compressed.ptrw(), len);
+		memset(compressed.ptrw(), 0, len);
 		for (int i = 0; i < data.size(); i++) {
 			compressed.write[i] = data[i];
 		}
diff --git a/core/io/file_access_memory.cpp b/core/io/file_access_memory.cpp
index 58670d5246..af155a77a8 100644
--- a/core/io/file_access_memory.cpp
+++ b/core/io/file_access_memory.cpp
@@ -31,7 +31,6 @@
 #include "file_access_memory.h"
 
 #include "core/config/project_settings.h"
-#include "core/os/copymem.h"
 #include "core/os/dir_access.h"
 #include "core/templates/map.h"
 
@@ -149,7 +148,7 @@ int FileAccessMemory::get_buffer(uint8_t *p_dst, int p_length) const {
 		WARN_PRINT("Reading less data than requested");
 	}
 
-	copymem(p_dst, &data[pos], read);
+	memcpy(p_dst, &data[pos], read);
 	pos += p_length;
 
 	return read;
@@ -176,6 +175,6 @@ void FileAccessMemory::store_buffer(const uint8_t *p_src, int p_length) {
 		WARN_PRINT("Writing less data than requested");
 	}
 
-	copymem(&data[pos], p_src, write);
+	memcpy(&data[pos], p_src, write);
 	pos += p_length;
 }
diff --git a/core/io/file_access_zip.cpp b/core/io/file_access_zip.cpp
index 586c988974..397b577612 100644
--- a/core/io/file_access_zip.cpp
+++ b/core/io/file_access_zip.cpp
@@ -32,7 +32,6 @@
 
 #include "file_access_zip.h"
 
-#include "core/os/copymem.h"
 #include "core/os/file_access.h"
 
 ZipArchive *ZipArchive::instance = nullptr;
@@ -120,7 +119,7 @@ unzFile ZipArchive::get_file_handle(String p_file) const {
 	ERR_FAIL_COND_V_MSG(!f, nullptr, "Cannot open file '" + packages[file.package].filename + "'.");
 
 	zlib_filefunc_def io;
-	zeromem(&io, sizeof(io));
+	memset(&io, 0, sizeof(io));
 
 	io.opaque = f;
 	io.zopen_file = godot_open;
diff --git a/core/io/http_client.cpp b/core/io/http_client.cpp
index 3863dce0f6..4b053d576c 100644
--- a/core/io/http_client.cpp
+++ b/core/io/http_client.cpp
@@ -633,7 +633,7 @@ PackedByteArray HTTPClient::read_response_body_chunk() {
 
 					ret.resize(chunk.size() - 2);
 					uint8_t *w = ret.ptrw();
-					copymem(w, chunk.ptr(), chunk.size() - 2);
+					memcpy(w, chunk.ptr(), chunk.size() - 2);
 					chunk.clear();
 				}
 
diff --git a/core/io/image.cpp b/core/io/image.cpp
index 873eb66f33..c36fa6e45f 100644
--- a/core/io/image.cpp
+++ b/core/io/image.cpp
@@ -34,7 +34,6 @@
 #include "core/io/image_loader.h"
 #include "core/io/resource_loader.h"
 #include "core/math/math_funcs.h"
-#include "core/os/copymem.h"
 #include "core/string/print_string.h"
 #include "core/templates/hash_map.h"
 
@@ -1537,7 +1536,7 @@ void Image::shrink_x2() {
 			uint8_t *w = new_img.ptrw();
 			const uint8_t *r = data.ptr();
 
-			copymem(w, &r[ofs], new_size);
+			memcpy(w, &r[ofs], new_size);
 		}
 
 		width = MAX(width / 2, 1);
@@ -1932,7 +1931,7 @@ Error Image::generate_mipmap_roughness(RoughnessChannel p_roughness_channel, con
 
 
 			uint8_t* wr = imgdata.ptrw();
-			copymem(wr.ptr(), ptr, size);
+			memcpy(wr.ptr(), ptr, size);
 			wr = uint8_t*();
 			Ref<Image> im;
 			im.instance();
@@ -1982,7 +1981,7 @@ void Image::create(int p_width, int p_height, bool p_use_mipmaps, Format p_forma
 
 	{
 		uint8_t *w = data.ptrw();
-		zeromem(w, size);
+		memset(w, 0, size);
 	}
 
 	width = p_width;
@@ -3295,7 +3294,7 @@ Ref<Image> Image::get_image_from_mipmap(int p_mipamp) const {
 	{
 		uint8_t *wr = new_data.ptrw();
 		const uint8_t *rd = data.ptr();
-		copymem(wr, rd + ofs, size);
+		memcpy(wr, rd + ofs, size);
 	}
 
 	Ref<Image> image;
@@ -3622,5 +3621,5 @@ Ref<Resource> Image::duplicate(bool p_subresources) const {
 }
 
 void Image::set_as_black() {
-	zeromem(data.ptrw(), data.size());
+	memset(data.ptrw(), 0, data.size()); // Overwrite all data.size() bytes of `data` with zero.
 }
diff --git a/core/io/marshalls.cpp b/core/io/marshalls.cpp
index 218a612da2..0282609270 100644
--- a/core/io/marshalls.cpp
+++ b/core/io/marshalls.cpp
@@ -851,7 +851,7 @@ static void _encode_string(const String &p_string, uint8_t *&buf, int &r_len) {
 	if (buf) {
 		encode_uint32(utf8.length(), buf);
 		buf += 4;
-		copymem(buf, utf8.get_data(), utf8.length());
+		memcpy(buf, utf8.get_data(), utf8.length());
 		buf += utf8.length();
 	}
 
@@ -995,7 +995,7 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 				if (buf) {
 					encode_uint32(utf8.length(), buf);
 					buf += 4;
-					copymem(buf, utf8.get_data(), utf8.length());
+					memcpy(buf, utf8.get_data(), utf8.length());
 					buf += pad + utf8.length();
 				}
 
@@ -1079,7 +1079,7 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 				Transform2D val = p_variant;
 				for (int i = 0; i < 3; i++) {
 					for (int j = 0; j < 2; j++) {
-						copymem(&buf[(i * 2 + j) * 4], &val.elements[i][j], sizeof(float));
+						memcpy(&buf[(i * 2 + j) * 4], &val.elements[i][j], sizeof(float));
 					}
 				}
 			}
@@ -1130,7 +1130,7 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 				Basis val = p_variant;
 				for (int i = 0; i < 3; i++) {
 					for (int j = 0; j < 3; j++) {
-						copymem(&buf[(i * 3 + j) * 4], &val.elements[i][j], sizeof(float));
+						memcpy(&buf[(i * 3 + j) * 4], &val.elements[i][j], sizeof(float));
 					}
 				}
 			}
@@ -1143,7 +1143,7 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 				Transform val = p_variant;
 				for (int i = 0; i < 3; i++) {
 					for (int j = 0; j < 3; j++) {
-						copymem(&buf[(i * 3 + j) * 4], &val.basis.elements[i][j], sizeof(float));
+						memcpy(&buf[(i * 3 + j) * 4], &val.basis.elements[i][j], sizeof(float));
 					}
 				}
 
@@ -1258,7 +1258,7 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 				if (buf) {
 					encode_uint32(utf8.length()+1,buf);
 					buf+=4;
-					copymem(buf,utf8.get_data(),utf8.length()+1);
+					memcpy(buf,utf8.get_data(),utf8.length()+1);
 				}
 
 				r_len+=4+utf8.length()+1;
@@ -1314,7 +1314,7 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 				encode_uint32(datalen, buf);
 				buf += 4;
 				const uint8_t *r = data.ptr();
-				copymem(buf, &r[0], datalen * datasize);
+				memcpy(buf, &r[0], datalen * datasize);
 				buf += datalen * datasize;
 			}
 
@@ -1412,7 +1412,7 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 				if (buf) {
 					encode_uint32(utf8.length() + 1, buf);
 					buf += 4;
-					copymem(buf, utf8.get_data(), utf8.length() + 1);
+					memcpy(buf, utf8.get_data(), utf8.length() + 1);
 					buf += utf8.length() + 1;
 				}
 
diff --git a/core/io/multiplayer_api.cpp b/core/io/multiplayer_api.cpp
index 94060cfe0b..8414ee7c0c 100644
--- a/core/io/multiplayer_api.cpp
+++ b/core/io/multiplayer_api.cpp
@@ -897,7 +897,7 @@ void MultiplayerAPI::_send_rpc(Node *p_from, int p_to, bool p_unreliable, bool p
 			// Special optimization when only the byte vector is sent.
 			const Vector<uint8_t> data = *p_arg[0];
 			MAKE_ROOM(ofs + data.size());
-			copymem(&(packet_cache.write[ofs]), data.ptr(), sizeof(uint8_t) * data.size());
+			memcpy(&(packet_cache.write[ofs]), data.ptr(), sizeof(uint8_t) * data.size());
 			ofs += data.size();
 		} else {
 			// Arguments
diff --git a/core/io/packed_data_container.cpp b/core/io/packed_data_container.cpp
index a0b97772e6..c6354b11b7 100644
--- a/core/io/packed_data_container.cpp
+++ b/core/io/packed_data_container.cpp
@@ -317,7 +317,7 @@ Error PackedDataContainer::pack(const Variant &p_data) {
 	datalen = tmpdata.size();
 	data.resize(tmpdata.size());
 	uint8_t *w = data.ptrw();
-	copymem(w, tmpdata.ptr(), tmpdata.size());
+	memcpy(w, tmpdata.ptr(), tmpdata.size());
 
 	return OK;
 }
diff --git a/core/io/stream_peer.cpp b/core/io/stream_peer.cpp
index 8407d55196..74154321b3 100644
--- a/core/io/stream_peer.cpp
+++ b/core/io/stream_peer.cpp
@@ -433,7 +433,7 @@ Error StreamPeerBuffer::put_data(const uint8_t *p_data, int p_bytes) {
 	}
 
 	uint8_t *w = data.ptrw();
-	copymem(&w[pointer], p_data, p_bytes);
+	memcpy(&w[pointer], p_data, p_bytes);
 
 	pointer += p_bytes;
 	return OK;
@@ -466,7 +466,7 @@ Error StreamPeerBuffer::get_partial_data(uint8_t *p_buffer, int p_bytes, int &r_
 	}
 
 	const uint8_t *r = data.ptr();
-	copymem(p_buffer, r + pointer, r_received);
+	memcpy(p_buffer, r + pointer, r_received);
 
 	pointer += r_received;
 	// FIXME: return what? OK or ERR_*
diff --git a/core/io/xml_parser.cpp b/core/io/xml_parser.cpp
index d5eb32513b..a1f8e79adc 100644
--- a/core/io/xml_parser.cpp
+++ b/core/io/xml_parser.cpp
@@ -433,7 +433,7 @@ Error XMLParser::open_buffer(const Vector<uint8_t> &p_buffer) {
 
 	length = p_buffer.size();
 	data = memnew_arr(char, length + 1);
-	copymem(data, p_buffer.ptr(), length);
+	memcpy(data, p_buffer.ptr(), length);
 	data[length] = 0;
 	P = data;
 	return OK;
diff --git a/core/io/zip_io.cpp b/core/io/zip_io.cpp
index 4b4a46e198..fe46868dd0 100644
--- a/core/io/zip_io.cpp
+++ b/core/io/zip_io.cpp
@@ -30,8 +30,6 @@
 
 #include "zip_io.h"
 
-#include "core/os/copymem.h"
-
 void *zipio_open(void *data, const char *p_fname, int mode) {
 	FileAccess *&f = *(FileAccess **)data;
 
@@ -103,7 +101,7 @@ int zipio_testerror(voidpf opaque, voidpf stream) {
 
 voidpf zipio_alloc(voidpf opaque, uInt items, uInt size) {
 	voidpf ptr = memalloc(items * size);
-	zeromem(ptr, items * size);
+	memset(ptr, 0, items * size); // Zero the whole items * size allocation. NOTE(review): ptr is not null-checked before this call - confirm memalloc cannot fail here.
 	return ptr;
 }
 
diff --git a/core/math/basis.cpp b/core/math/basis.cpp
index cc2b7c6611..50299902eb 100644
--- a/core/math/basis.cpp
+++ b/core/math/basis.cpp
@@ -31,7 +31,6 @@
 #include "basis.h"
 
 #include "core/math/math_funcs.h"
-#include "core/os/copymem.h"
 #include "core/string/print_string.h"
 
 #define cofac(row1, col1, row2, col2) \
diff --git a/core/math/dynamic_bvh.h b/core/math/dynamic_bvh.h
index 3fb22515a2..0b6286cd9d 100644
--- a/core/math/dynamic_bvh.h
+++ b/core/math/dynamic_bvh.h
@@ -343,7 +343,7 @@ void DynamicBVH::aabb_query(const AABB &p_box, QueryResult &r_result) {
 				if (depth > threshold) {
 					if (aux_stack.is_empty()) {
 						aux_stack.resize(ALLOCA_STACK_SIZE * 2);
-						copymem(aux_stack.ptr(), stack, ALLOCA_STACK_SIZE * sizeof(const Node *));
+						memcpy(aux_stack.ptr(), stack, ALLOCA_STACK_SIZE * sizeof(const Node *));
 					} else {
 						aux_stack.resize(aux_stack.size() * 2);
 					}
@@ -399,7 +399,7 @@ void DynamicBVH::convex_query(const Plane *p_planes, int p_plane_count, const Ve
 				if (depth > threshold) {
 					if (aux_stack.is_empty()) {
 						aux_stack.resize(ALLOCA_STACK_SIZE * 2);
-						copymem(aux_stack.ptr(), stack, ALLOCA_STACK_SIZE * sizeof(const Node *));
+						memcpy(aux_stack.ptr(), stack, ALLOCA_STACK_SIZE * sizeof(const Node *));
 					} else {
 						aux_stack.resize(aux_stack.size() * 2);
 					}
@@ -456,7 +456,7 @@ void DynamicBVH::ray_query(const Vector3 &p_from, const Vector3 &p_to, QueryResu
 				if (depth > threshold) {
 					if (aux_stack.is_empty()) {
 						aux_stack.resize(ALLOCA_STACK_SIZE * 2);
-						copymem(aux_stack.ptr(), stack, ALLOCA_STACK_SIZE * sizeof(const Node *));
+						memcpy(aux_stack.ptr(), stack, ALLOCA_STACK_SIZE * sizeof(const Node *));
 					} else {
 						aux_stack.resize(aux_stack.size() * 2);
 					}
diff --git a/core/math/face3.cpp b/core/math/face3.cpp
index beb0a8e405..20c316c322 100644
--- a/core/math/face3.cpp
+++ b/core/math/face3.cpp
@@ -169,7 +169,7 @@ Vector3 Face3::get_median_point() const {
 }
 
 real_t Face3::get_area() const {
-	return vec3_cross(vertex[0] - vertex[1], vertex[0] - vertex[2]).length();
+	return vec3_cross(vertex[0] - vertex[1], vertex[0] - vertex[2]).length() * 0.5; // The cross product's length is the area of the parallelogram spanned by the two edges; the triangle is half of that.
 }
 
 ClockDirection Face3::get_clock_dir() const {
diff --git a/core/math/geometry_2d.cpp b/core/math/geometry_2d.cpp
index feb1fb2fb8..7b2630b4ff 100644
--- a/core/math/geometry_2d.cpp
+++ b/core/math/geometry_2d.cpp
@@ -358,7 +358,7 @@ Vector<Point2i> Geometry2D::pack_rects(const Vector<Size2i> &p_sizes, const Size
 Vector<Vector3i> Geometry2D::partial_pack_rects(const Vector<Vector2i> &p_sizes, const Size2i &p_atlas_size) {
 	Vector<stbrp_node> nodes;
 	nodes.resize(p_atlas_size.width);
-	zeromem(nodes.ptrw(), sizeof(stbrp_node) * nodes.size());
+	memset(nodes.ptrw(), 0, sizeof(stbrp_node) * nodes.size());
 
 	stbrp_context context;
 	stbrp_init_target(&context, p_atlas_size.width, p_atlas_size.height, nodes.ptrw(), p_atlas_size.width);
diff --git a/core/math/math_funcs.h b/core/math/math_funcs.h
index 267f6a4fe2..8cf13efdb6 100644
--- a/core/math/math_funcs.h
+++ b/core/math/math_funcs.h
@@ -103,6 +103,9 @@ public:
 	static _ALWAYS_INLINE_ double log(double p_x) { return ::log(p_x); }
 	static _ALWAYS_INLINE_ float log(float p_x) { return ::logf(p_x); }
 
+	static _ALWAYS_INLINE_ double log2(double p_x) { return ::log2(p_x); } // Base-2 logarithm, double overload; forwards to ::log2.
+	static _ALWAYS_INLINE_ float log2(float p_x) { return ::log2f(p_x); } // Base-2 logarithm, float overload; forwards to ::log2f.
+
 	static _ALWAYS_INLINE_ double exp(double p_x) { return ::exp(p_x); }
 	static _ALWAYS_INLINE_ float exp(float p_x) { return ::expf(p_x); }
 
diff --git a/core/math/transform.cpp b/core/math/transform.cpp
index fab5d124fa..d4d7ff6d28 100644
--- a/core/math/transform.cpp
+++ b/core/math/transform.cpp
@@ -31,7 +31,6 @@
 #include "transform.h"
 
 #include "core/math/math_funcs.h"
-#include "core/os/copymem.h"
 #include "core/string/print_string.h"
 
 void Transform::affine_invert() {
diff --git a/core/object/callable_method_pointer.h b/core/object/callable_method_pointer.h
index 115797a00c..8ba01be4e4 100644
--- a/core/object/callable_method_pointer.h
+++ b/core/object/callable_method_pointer.h
@@ -32,7 +32,6 @@
 #define CALLABLE_METHOD_POINTER_H
 
 #include "core/object/object.h"
-#include "core/os/copymem.h"
 #include "core/templates/hashfuncs.h"
 #include "core/templates/simple_type.h"
 #include "core/variant/binder_common.h"
@@ -98,7 +97,7 @@ public:
 	}
 
 	CallableCustomMethodPointer(T *p_instance, void (T::*p_method)(P...)) {
-		zeromem(&data, sizeof(Data)); // Clear beforehand, may have padding bytes.
+		memset(&data, 0, sizeof(Data)); // Clear beforehand, may have padding bytes.
 		data.instance = p_instance;
 #ifdef DEBUG_ENABLED
 		data.object_id = p_instance->get_instance_id();
@@ -153,7 +152,7 @@ public:
 	}
 
 	CallableCustomMethodPointerRet(T *p_instance, R (T::*p_method)(P...)) {
-		zeromem(&data, sizeof(Data)); // Clear beforehand, may have padding bytes.
+		memset(&data, 0, sizeof(Data)); // Clear beforehand, may have padding bytes.
 		data.instance = p_instance;
 #ifdef DEBUG_ENABLED
 		data.object_id = p_instance->get_instance_id();
@@ -208,7 +207,7 @@ public:
 	}
 
 	CallableCustomMethodPointerRetC(T *p_instance, R (T::*p_method)(P...) const) {
-		zeromem(&data, sizeof(Data)); // Clear beforehand, may have padding bytes.
+		memset(&data, 0, sizeof(Data)); // Clear beforehand, may have padding bytes.
 		data.instance = p_instance;
 #ifdef DEBUG_ENABLED
 		data.object_id = p_instance->get_instance_id();
diff --git a/core/os/memory.cpp b/core/os/memory.cpp
index 5910cb0e7b..a756c1d5dd 100644
--- a/core/os/memory.cpp
+++ b/core/os/memory.cpp
@@ -31,7 +31,6 @@
 #include "memory.h"
 
 #include "core/error/error_macros.h"
-#include "core/os/copymem.h"
 #include "core/templates/safe_refcount.h"
 
 #include <stdio.h>
diff --git a/core/os/pool_allocator.cpp b/core/os/pool_allocator.cpp
index 9be3a62e2f..74e9c24e04 100644
--- a/core/os/pool_allocator.cpp
+++ b/core/os/pool_allocator.cpp
@@ -31,7 +31,6 @@
 #include "pool_allocator.h"
 
 #include "core/error/error_macros.h"
-#include "core/os/copymem.h"
 #include "core/os/memory.h"
 #include "core/os/os.h"
 #include "core/string/print_string.h"
@@ -42,7 +41,7 @@
 	do {                                                      \
 		void *_dst = &((unsigned char *)pool)[m_to_pos];      \
 		void *_src = &((unsigned char *)pool)[(m_entry).pos]; \
-		movemem(_dst, _src, aligned((m_entry).len));          \
+		memmove(_dst, _src, aligned((m_entry).len));          \
 		(m_entry).pos = m_to_pos;                             \
 	} while (0);
 
diff --git a/core/string/ustring.cpp b/core/string/ustring.cpp
index cf0040353d..c8d71c3236 100644
--- a/core/string/ustring.cpp
+++ b/core/string/ustring.cpp
@@ -4765,7 +4765,7 @@ Vector<uint8_t> String::to_ascii_buffer() const {
 	size_t len = charstr.length();
 	retval.resize(len);
 	uint8_t *w = retval.ptrw();
-	copymem(w, charstr.ptr(), len);
+	memcpy(w, charstr.ptr(), len);
 
 	return retval;
 }
@@ -4781,7 +4781,7 @@ Vector<uint8_t> String::to_utf8_buffer() const {
 	size_t len = charstr.length();
 	retval.resize(len);
 	uint8_t *w = retval.ptrw();
-	copymem(w, charstr.ptr(), len);
+	memcpy(w, charstr.ptr(), len);
 
 	return retval;
 }
@@ -4797,7 +4797,7 @@ Vector<uint8_t> String::to_utf16_buffer() const {
 	size_t len = charstr.length() * sizeof(char16_t);
 	retval.resize(len);
 	uint8_t *w = retval.ptrw();
-	copymem(w, (const void *)charstr.ptr(), len);
+	memcpy(w, (const void *)charstr.ptr(), len);
 
 	return retval;
 }
@@ -4812,7 +4812,7 @@ Vector<uint8_t> String::to_utf32_buffer() const {
 	size_t len = s->length() * sizeof(char32_t);
 	retval.resize(len);
 	uint8_t *w = retval.ptrw();
-	copymem(w, (const void *)s->ptr(), len);
+	memcpy(w, (const void *)s->ptr(), len);
 
 	return retval;
 }
diff --git a/core/templates/local_vector.h b/core/templates/local_vector.h
index ffd17b7ee9..5f22e08eb8 100644
--- a/core/templates/local_vector.h
+++ b/core/templates/local_vector.h
@@ -32,7 +32,6 @@
 #define LOCAL_VECTOR_H
 
 #include "core/error/error_macros.h"
-#include "core/os/copymem.h"
 #include "core/os/memory.h"
 #include "core/templates/sort_array.h"
 #include "core/templates/vector.h"
@@ -216,7 +215,7 @@ public:
 		Vector<T> ret;
 		ret.resize(size());
 		T *w = ret.ptrw();
-		copymem(w, data, sizeof(T) * count);
+		memcpy(w, data, sizeof(T) * count);
 		return ret;
 	}
 
@@ -224,7 +223,7 @@ public:
 		Vector<uint8_t> ret;
 		ret.resize(count * sizeof(T));
 		uint8_t *w = ret.ptrw();
-		copymem(w, data, sizeof(T) * count);
+		memcpy(w, data, sizeof(T) * count);
 		return ret;
 	}
 
diff --git a/core/templates/oa_hash_map.h b/core/templates/oa_hash_map.h
index 1d4176eb10..2c7c64cd78 100644
--- a/core/templates/oa_hash_map.h
+++ b/core/templates/oa_hash_map.h
@@ -32,7 +32,6 @@
 #define OA_HASH_MAP_H
 
 #include "core/math/math_funcs.h"
-#include "core/os/copymem.h"
 #include "core/os/memory.h"
 #include "core/templates/hashfuncs.h"
 
diff --git a/core/templates/vector.h b/core/templates/vector.h
index a56a941dbc..dae8874a87 100644
--- a/core/templates/vector.h
+++ b/core/templates/vector.h
@@ -38,7 +38,6 @@
 */
 
 #include "core/error/error_macros.h"
-#include "core/os/copymem.h"
 #include "core/os/memory.h"
 #include "core/templates/cowdata.h"
 #include "core/templates/sort_array.h"
@@ -66,6 +65,7 @@ private:
 public:
 	bool push_back(T p_elem);
 	_FORCE_INLINE_ bool append(const T &p_elem) { return push_back(p_elem); } //alias
+	void fill(T p_elem);
 
 	void remove(int p_index) { _cowdata.remove(p_index); }
 	void erase(const T &p_val) {
@@ -134,7 +134,7 @@ public:
 	Vector<uint8_t> to_byte_array() const {
 		Vector<uint8_t> ret;
 		ret.resize(size() * sizeof(T));
-		copymem(ret.ptrw(), ptr(), sizeof(T) * size());
+		memcpy(ret.ptrw(), ptr(), sizeof(T) * size());
 		return ret;
 	}
 
@@ -223,4 +223,12 @@ bool Vector<T>::push_back(T p_elem) {
 	return false;
 }
 
+template <class T>
+void Vector<T>::fill(T p_elem) { // Assigns a copy of p_elem to each of the size() elements.
+	T *p = ptrw(); // Writable element pointer, fetched once before the loop.
+	for (int i = 0; i < size(); i++) {
+		p[i] = p_elem;
+	}
+}
+
 #endif // VECTOR_H
diff --git a/core/variant/array.cpp b/core/variant/array.cpp
index 2ad728ec5e..2fb2dd4a30 100644
--- a/core/variant/array.cpp
+++ b/core/variant/array.cpp
@@ -208,6 +208,11 @@ void Array::insert(int p_pos, const Variant &p_value) {
 	_p->array.insert(p_pos, p_value);
 }
 
+void Array::fill(const Variant &p_value) { // Validates p_value, then forwards it to fill() on the underlying _p->array.
+	ERR_FAIL_COND(!_p->typed.validate(p_value, "fill")); // Check p_value against the array's `typed` info first; "fill" is presumably the operation name used in the error report - confirm.
+	_p->array.fill(p_value);
+}
+
 void Array::erase(const Variant &p_value) {
 	ERR_FAIL_COND(!_p->typed.validate(p_value, "erase"));
 	_p->array.erase(p_value);
diff --git a/core/variant/array.h b/core/variant/array.h
index 6b58ed12cb..5ce977ee4b 100644
--- a/core/variant/array.h
+++ b/core/variant/array.h
@@ -74,6 +74,7 @@ public:
 
 	void insert(int p_pos, const Variant &p_value);
 	void remove(int p_pos);
+	void fill(const Variant &p_value);
 
 	Variant front() const;
 	Variant back() const;
diff --git a/core/variant/binder_common.h b/core/variant/binder_common.h
index 86bbf43266..830e0a5cbd 100644
--- a/core/variant/binder_common.h
+++ b/core/variant/binder_common.h
@@ -122,6 +122,18 @@ struct VariantObjectClassChecker {
 	}
 };
 
+template <typename T>
+class Ref; // Forward declaration so the specialization below can name Ref<T>.
+
+template <typename T>
+struct VariantObjectClassChecker<const Ref<T> &> { // Specialization for arguments of type `const Ref<T> &`.
+	static _FORCE_INLINE_ bool check(const Variant &p_variant) {
+		Object *obj = p_variant; // The variant converted to a plain Object pointer.
+		const Ref<T> node = p_variant; // The same variant converted to a Ref<T>.
+		return node.ptr() || !obj; // Passes when the Ref<T> conversion is non-null, or when the Object pointer was null to begin with.
+	}
+};
+
 template <>
 struct VariantObjectClassChecker<Node *> {
 	static _FORCE_INLINE_ bool check(const Variant &p_variant) {
diff --git a/core/variant/variant_call.cpp b/core/variant/variant_call.cpp
index 7f83e27dfe..deaccc6304 100644
--- a/core/variant/variant_call.cpp
+++ b/core/variant/variant_call.cpp
@@ -500,7 +500,7 @@ struct _VariantCall {
 			const uint8_t *r = p_instance->ptr();
 			CharString cs;
 			cs.resize(p_instance->size() + 1);
-			copymem(cs.ptrw(), r, p_instance->size());
+			memcpy(cs.ptrw(), r, p_instance->size());
 			cs[p_instance->size()] = 0;
 
 			s = cs.get_data();
@@ -1647,6 +1647,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(Array, resize, sarray("size"), varray());
 	bind_method(Array, insert, sarray("position", "value"), varray());
 	bind_method(Array, remove, sarray("position"), varray());
+	bind_method(Array, fill, sarray("value"), varray());
 	bind_method(Array, erase, sarray("value"), varray());
 	bind_method(Array, front, sarray(), varray());
 	bind_method(Array, back, sarray(), varray());
@@ -1677,6 +1678,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(PackedByteArray, append_array, sarray("array"), varray());
 	bind_method(PackedByteArray, remove, sarray("index"), varray());
 	bind_method(PackedByteArray, insert, sarray("at_index", "value"), varray());
+	bind_method(PackedByteArray, fill, sarray("value"), varray());
 	bind_method(PackedByteArray, resize, sarray("new_size"), varray());
 	bind_method(PackedByteArray, has, sarray("value"), varray());
 	bind_method(PackedByteArray, reverse, sarray(), varray());
@@ -1731,6 +1733,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(PackedInt32Array, append_array, sarray("array"), varray());
 	bind_method(PackedInt32Array, remove, sarray("index"), varray());
 	bind_method(PackedInt32Array, insert, sarray("at_index", "value"), varray());
+	bind_method(PackedInt32Array, fill, sarray("value"), varray());
 	bind_method(PackedInt32Array, resize, sarray("new_size"), varray());
 	bind_method(PackedInt32Array, has, sarray("value"), varray());
 	bind_method(PackedInt32Array, reverse, sarray(), varray());
@@ -1749,6 +1752,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(PackedInt64Array, append_array, sarray("array"), varray());
 	bind_method(PackedInt64Array, remove, sarray("index"), varray());
 	bind_method(PackedInt64Array, insert, sarray("at_index", "value"), varray());
+	bind_method(PackedInt64Array, fill, sarray("value"), varray());
 	bind_method(PackedInt64Array, resize, sarray("new_size"), varray());
 	bind_method(PackedInt64Array, has, sarray("value"), varray());
 	bind_method(PackedInt64Array, reverse, sarray(), varray());
@@ -1767,6 +1771,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(PackedFloat32Array, append_array, sarray("array"), varray());
 	bind_method(PackedFloat32Array, remove, sarray("index"), varray());
 	bind_method(PackedFloat32Array, insert, sarray("at_index", "value"), varray());
+	bind_method(PackedFloat32Array, fill, sarray("value"), varray());
 	bind_method(PackedFloat32Array, resize, sarray("new_size"), varray());
 	bind_method(PackedFloat32Array, has, sarray("value"), varray());
 	bind_method(PackedFloat32Array, reverse, sarray(), varray());
@@ -1785,6 +1790,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(PackedFloat64Array, append_array, sarray("array"), varray());
 	bind_method(PackedFloat64Array, remove, sarray("index"), varray());
 	bind_method(PackedFloat64Array, insert, sarray("at_index", "value"), varray());
+	bind_method(PackedFloat64Array, fill, sarray("value"), varray());
 	bind_method(PackedFloat64Array, resize, sarray("new_size"), varray());
 	bind_method(PackedFloat64Array, has, sarray("value"), varray());
 	bind_method(PackedFloat64Array, reverse, sarray(), varray());
@@ -1803,6 +1809,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(PackedStringArray, append_array, sarray("array"), varray());
 	bind_method(PackedStringArray, remove, sarray("index"), varray());
 	bind_method(PackedStringArray, insert, sarray("at_index", "value"), varray());
+	bind_method(PackedStringArray, fill, sarray("value"), varray());
 	bind_method(PackedStringArray, resize, sarray("new_size"), varray());
 	bind_method(PackedStringArray, has, sarray("value"), varray());
 	bind_method(PackedStringArray, reverse, sarray(), varray());
@@ -1821,6 +1828,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(PackedVector2Array, append_array, sarray("array"), varray());
 	bind_method(PackedVector2Array, remove, sarray("index"), varray());
 	bind_method(PackedVector2Array, insert, sarray("at_index", "value"), varray());
+	bind_method(PackedVector2Array, fill, sarray("value"), varray());
 	bind_method(PackedVector2Array, resize, sarray("new_size"), varray());
 	bind_method(PackedVector2Array, has, sarray("value"), varray());
 	bind_method(PackedVector2Array, reverse, sarray(), varray());
@@ -1839,6 +1847,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(PackedVector3Array, append_array, sarray("array"), varray());
 	bind_method(PackedVector3Array, remove, sarray("index"), varray());
 	bind_method(PackedVector3Array, insert, sarray("at_index", "value"), varray());
+	bind_method(PackedVector3Array, fill, sarray("value"), varray());
 	bind_method(PackedVector3Array, resize, sarray("new_size"), varray());
 	bind_method(PackedVector3Array, has, sarray("value"), varray());
 	bind_method(PackedVector3Array, reverse, sarray(), varray());
@@ -1857,6 +1866,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(PackedColorArray, append_array, sarray("array"), varray());
 	bind_method(PackedColorArray, remove, sarray("index"), varray());
 	bind_method(PackedColorArray, insert, sarray("at_index", "value"), varray());
+	bind_method(PackedColorArray, fill, sarray("value"), varray());
 	bind_method(PackedColorArray, resize, sarray("new_size"), varray());
 	bind_method(PackedColorArray, has, sarray("value"), varray());
 	bind_method(PackedColorArray, reverse, sarray(), varray());
diff --git a/core/variant/variant_op.cpp b/core/variant/variant_op.cpp
index e0a3cf4215..6cbc98d14d 100644
--- a/core/variant/variant_op.cpp
+++ b/core/variant/variant_op.cpp
@@ -1365,10 +1365,10 @@ void register_op(Variant::Operator p_op, Variant::Type p_type_a, Variant::Type p
 }
 
 void Variant::_register_variant_operators() {
-	zeromem(operator_return_type_table, sizeof(operator_return_type_table));
-	zeromem(operator_evaluator_table, sizeof(operator_evaluator_table));
-	zeromem(validated_operator_evaluator_table, sizeof(validated_operator_evaluator_table));
-	zeromem(ptr_operator_evaluator_table, sizeof(ptr_operator_evaluator_table));
+	memset(operator_return_type_table, 0, sizeof(operator_return_type_table));
+	memset(operator_evaluator_table, 0, sizeof(operator_evaluator_table));
+	memset(validated_operator_evaluator_table, 0, sizeof(validated_operator_evaluator_table));
+	memset(ptr_operator_evaluator_table, 0, sizeof(ptr_operator_evaluator_table));
 
 	register_op<OperatorEvaluatorAdd<int64_t, int64_t, int64_t>>(Variant::OP_ADD, Variant::INT, Variant::INT);
 	register_op<OperatorEvaluatorAdd<double, int64_t, double>>(Variant::OP_ADD, Variant::INT, Variant::FLOAT);
diff --git a/doc/classes/Array.xml b/doc/classes/Array.xml
index 54bbe7a94b..38b74cb436 100644
--- a/doc/classes/Array.xml
+++ b/doc/classes/Array.xml
@@ -237,6 +237,27 @@
 				[b]Note:[/b] On large arrays, this method will be slower if the removed element is close to the beginning of the array (index 0). This is because all elements placed after the removed element have to be reindexed.
 			</description>
 		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="Variant">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements:
+				[codeblocks]
+				[gdscript]
+				var array = []
+				array.resize(10)
+				array.fill(0) # Initialize the 10 elements to 0.
+				[/gdscript]
+				[csharp]
+				var array = new Godot.Collections.Array{};
+				array.Resize(10);
+				array.Fill(0); // Initialize the 10 elements to 0.
+				[/csharp]
+				[/codeblocks]
+			</description>
+		</method>
 		<method name="find" qualifiers="const">
 			<return type="int">
 			</return>
diff --git a/doc/classes/GeometryInstance3D.xml b/doc/classes/GeometryInstance3D.xml
index 631a30abab..b2c3bfc3ed 100644
--- a/doc/classes/GeometryInstance3D.xml
+++ b/doc/classes/GeometryInstance3D.xml
@@ -48,6 +48,8 @@
 		</member>
 		<member name="gi_mode" type="int" setter="set_gi_mode" getter="get_gi_mode" enum="GeometryInstance3D.GIMode" default="0">
 		</member>
+		<member name="ignore_occlusion_culling" type="bool" setter="set_ignore_occlusion_culling" getter="is_ignoring_occlusion_culling" default="false">
+		</member>
 		<member name="lod_bias" type="float" setter="set_lod_bias" getter="get_lod_bias" default="1.0">
 		</member>
 		<member name="lod_max_distance" type="float" setter="set_lod_max_distance" getter="get_lod_max_distance" default="0.0">
diff --git a/doc/classes/Occluder3D.xml b/doc/classes/Occluder3D.xml
new file mode 100644
index 0000000000..fc676c2b49
--- /dev/null
+++ b/doc/classes/Occluder3D.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<class name="Occluder3D" inherits="Resource" version="4.0">
+	<brief_description>
+	</brief_description>
+	<description>
+	</description>
+	<tutorials>
+	</tutorials>
+	<methods>
+	</methods>
+	<members>
+		<member name="indices" type="PackedInt32Array" setter="set_indices" getter="get_indices" default="PackedInt32Array(  )">
+		</member>
+		<member name="vertices" type="PackedVector3Array" setter="set_vertices" getter="get_vertices" default="PackedVector3Array(  )">
+		</member>
+	</members>
+	<constants>
+	</constants>
+</class>
diff --git a/doc/classes/OccluderInstance3D.xml b/doc/classes/OccluderInstance3D.xml
new file mode 100644
index 0000000000..76b784d21d
--- /dev/null
+++ b/doc/classes/OccluderInstance3D.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<class name="OccluderInstance3D" inherits="Node3D" version="4.0">
+	<brief_description>
+	</brief_description>
+	<description>
+	</description>
+	<tutorials>
+	</tutorials>
+	<methods>
+		<method name="get_bake_mask_bit" qualifiers="const">
+			<return type="bool">
+			</return>
+			<argument index="0" name="layer" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="set_bake_mask_bit">
+			<return type="void">
+			</return>
+			<argument index="0" name="layer" type="int">
+			</argument>
+			<argument index="1" name="enabled" type="bool">
+			</argument>
+			<description>
+			</description>
+		</method>
+	</methods>
+	<members>
+		<member name="bake_mask" type="int" setter="set_bake_mask" getter="get_bake_mask" default="4294967295">
+		</member>
+		<member name="occluder" type="Occluder3D" setter="set_occluder" getter="get_occluder">
+		</member>
+	</members>
+	<constants>
+	</constants>
+</class>
diff --git a/doc/classes/PackedByteArray.xml b/doc/classes/PackedByteArray.xml
index 24178c3ff6..0652cf0aa1 100644
--- a/doc/classes/PackedByteArray.xml
+++ b/doc/classes/PackedByteArray.xml
@@ -322,6 +322,15 @@
 			<description>
 			</description>
 		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="int">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements.
+			</description>
+		</method>
 		<method name="get_string_from_ascii" qualifiers="const">
 			<return type="String">
 			</return>
diff --git a/doc/classes/PackedColorArray.xml b/doc/classes/PackedColorArray.xml
index 38240b3154..19cfcd7c87 100644
--- a/doc/classes/PackedColorArray.xml
+++ b/doc/classes/PackedColorArray.xml
@@ -59,6 +59,15 @@
 				Creates a copy of the array, and returns it.
 			</description>
 		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="Color">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements.
+			</description>
+		</method>
 		<method name="has">
 			<return type="bool">
 			</return>
diff --git a/doc/classes/PackedFloat32Array.xml b/doc/classes/PackedFloat32Array.xml
index 5e0008852c..ab97c9a695 100644
--- a/doc/classes/PackedFloat32Array.xml
+++ b/doc/classes/PackedFloat32Array.xml
@@ -60,6 +60,15 @@
 				Creates a copy of the array, and returns it.
 			</description>
 		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="float">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements.
+			</description>
+		</method>
 		<method name="has">
 			<return type="bool">
 			</return>
diff --git a/doc/classes/PackedFloat64Array.xml b/doc/classes/PackedFloat64Array.xml
index fb7817cb41..ad20801b01 100644
--- a/doc/classes/PackedFloat64Array.xml
+++ b/doc/classes/PackedFloat64Array.xml
@@ -60,6 +60,15 @@
 				Creates a copy of the array, and returns it.
 			</description>
 		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="float">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements.
+			</description>
+		</method>
 		<method name="has">
 			<return type="bool">
 			</return>
diff --git a/doc/classes/PackedInt32Array.xml b/doc/classes/PackedInt32Array.xml
index 4ee428dfbc..ff4729082e 100644
--- a/doc/classes/PackedInt32Array.xml
+++ b/doc/classes/PackedInt32Array.xml
@@ -60,6 +60,15 @@
 				Creates a copy of the array, and returns it.
 			</description>
 		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="int">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements.
+			</description>
+		</method>
 		<method name="has">
 			<return type="bool">
 			</return>
diff --git a/doc/classes/PackedInt64Array.xml b/doc/classes/PackedInt64Array.xml
index 51948fcbc8..195b12b129 100644
--- a/doc/classes/PackedInt64Array.xml
+++ b/doc/classes/PackedInt64Array.xml
@@ -60,6 +60,15 @@
 				Creates a copy of the array, and returns it.
 			</description>
 		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="int">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements.
+			</description>
+		</method>
 		<method name="has">
 			<return type="bool">
 			</return>
diff --git a/doc/classes/PackedStringArray.xml b/doc/classes/PackedStringArray.xml
index 9748301dae..22458832da 100644
--- a/doc/classes/PackedStringArray.xml
+++ b/doc/classes/PackedStringArray.xml
@@ -60,6 +60,15 @@
 				Creates a copy of the array, and returns it.
 			</description>
 		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="String">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements.
+			</description>
+		</method>
 		<method name="has">
 			<return type="bool">
 			</return>
diff --git a/doc/classes/PackedVector2Array.xml b/doc/classes/PackedVector2Array.xml
index 1b3201b072..6c8791f988 100644
--- a/doc/classes/PackedVector2Array.xml
+++ b/doc/classes/PackedVector2Array.xml
@@ -60,6 +60,15 @@
 				Creates a copy of the array, and returns it.
 			</description>
 		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="Vector2">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements.
+			</description>
+		</method>
 		<method name="has">
 			<return type="bool">
 			</return>
diff --git a/doc/classes/PackedVector3Array.xml b/doc/classes/PackedVector3Array.xml
index 25d854016a..85d41d7519 100644
--- a/doc/classes/PackedVector3Array.xml
+++ b/doc/classes/PackedVector3Array.xml
@@ -59,6 +59,15 @@
 				Creates a copy of the array, and returns it.
 			</description>
 		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="Vector3">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements.
+			</description>
+		</method>
 		<method name="has">
 			<return type="bool">
 			</return>
diff --git a/doc/classes/ProjectSettings.xml b/doc/classes/ProjectSettings.xml
index 2bfe7ad48f..e090e20d9f 100644
--- a/doc/classes/ProjectSettings.xml
+++ b/doc/classes/ProjectSettings.xml
@@ -1469,6 +1469,12 @@
 		</member>
 		<member name="rendering/mesh_lod/lod_change/threshold_pixels" type="float" setter="" getter="" default="1.0">
 		</member>
+		<member name="rendering/occlusion_culling/bvh_build_quality" type="int" setter="" getter="" default="2">
+		</member>
+		<member name="rendering/occlusion_culling/occlusion_rays_per_thread" type="int" setter="" getter="" default="512">
+		</member>
+		<member name="rendering/occlusion_culling/use_occlusion_culling" type="bool" setter="" getter="" default="false">
+		</member>
 		<member name="rendering/reflections/reflection_atlas/reflection_count" type="int" setter="" getter="" default="64">
 			Number of cubemaps to store in the reflection atlas. The number of [ReflectionProbe]s in a scene will be limited by this amount. A higher number requires more VRAM.
 		</member>
diff --git a/doc/classes/RenderingServer.xml b/doc/classes/RenderingServer.xml
index d6eaa1b88b..638b0bb297 100644
--- a/doc/classes/RenderingServer.xml
+++ b/doc/classes/RenderingServer.xml
@@ -1999,6 +1999,24 @@
 				Sets the number of instances visible at a given time. If -1, all instances that have been allocated are drawn. Equivalent to [member MultiMesh.visible_instance_count].
 			</description>
 		</method>
+		<method name="occluder_create">
+			<return type="RID">
+			</return>
+			<description>
+			</description>
+		</method>
+		<method name="occluder_set_mesh">
+			<return type="void">
+			</return>
+			<argument index="0" name="arg0" type="RID">
+			</argument>
+			<argument index="1" name="arg1" type="PackedVector3Array">
+			</argument>
+			<argument index="2" name="arg2" type="PackedInt32Array">
+			</argument>
+			<description>
+			</description>
+		</method>
 		<method name="omni_light_create">
 			<return type="RID">
 			</return>
@@ -2412,6 +2430,16 @@
 				The scenario is the 3D world that all the visual instances exist in.
 			</description>
 		</method>
+		<method name="scenario_set_camera_effects">
+			<return type="void">
+			</return>
+			<argument index="0" name="scenario" type="RID">
+			</argument>
+			<argument index="1" name="effects" type="RID">
+			</argument>
+			<description>
+			</description>
+		</method>
 		<method name="scenario_set_debug">
 			<return type="void">
 			</return>
@@ -2897,6 +2925,22 @@
 				Sets the anti-aliasing mode. See [enum ViewportMSAA] for options.
 			</description>
 		</method>
+		<method name="viewport_set_occlusion_culling_build_quality">
+			<return type="void">
+			</return>
+			<argument index="0" name="quality" type="int" enum="RenderingServer.ViewportOcclusionCullingBuildQuality">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="viewport_set_occlusion_rays_per_thread">
+			<return type="void">
+			</return>
+			<argument index="0" name="rays_per_thread" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
 		<method name="viewport_set_parent_viewport">
 			<return type="void">
 			</return>
@@ -3002,6 +3046,16 @@
 			<description>
 			</description>
 		</method>
+		<method name="viewport_set_use_occlusion_culling">
+			<return type="void">
+			</return>
+			<argument index="0" name="viewport" type="RID">
+			</argument>
+			<argument index="1" name="enable" type="bool">
+			</argument>
+			<description>
+			</description>
+		</method>
 		<method name="viewport_set_use_xr">
 			<return type="void">
 			</return>
@@ -3454,6 +3508,8 @@
 		</constant>
 		<constant name="VIEWPORT_DEBUG_DRAW_GI_BUFFER" value="17" enum="ViewportDebugDraw">
 		</constant>
+		<constant name="VIEWPORT_DEBUG_DRAW_OCCLUDERS" value="23" enum="ViewportDebugDraw">
+		</constant>
 		<constant name="SKY_MODE_QUALITY" value="1" enum="SkyMode">
 			Uses high quality importance sampling to process the radiance map. In general, this results in much higher quality than [constant Sky.PROCESS_MODE_REALTIME] but takes much longer to generate. This should not be used if you plan on changing the sky at runtime. If you are finding that the reflection is not blurry enough and is showing sparkles or fireflies, try increasing [member ProjectSettings.rendering/reflections/sky_reflections/ggx_samples].
 		</constant>
@@ -3606,6 +3662,12 @@
 		<constant name="SCENARIO_DEBUG_SHADELESS" value="3" enum="ScenarioDebugMode">
 			Draw all objects without shading. Equivalent to setting all objects shaders to [code]unshaded[/code].
 		</constant>
+		<constant name="VIEWPORT_OCCLUSION_BUILD_QUALITY_LOW" value="0" enum="ViewportOcclusionCullingBuildQuality">
+		</constant>
+		<constant name="VIEWPORT_OCCLUSION_BUILD_QUALITY_MEDIUM" value="1" enum="ViewportOcclusionCullingBuildQuality">
+		</constant>
+		<constant name="VIEWPORT_OCCLUSION_BUILD_QUALITY_HIGH" value="2" enum="ViewportOcclusionCullingBuildQuality">
+		</constant>
 		<constant name="INSTANCE_NONE" value="0" enum="InstanceType">
 			The instance does not have a type.
 		</constant>
@@ -3638,7 +3700,9 @@
 		<constant name="INSTANCE_LIGHTMAP" value="10" enum="InstanceType">
 			The instance is a lightmap.
 		</constant>
-		<constant name="INSTANCE_MAX" value="11" enum="InstanceType">
+		<constant name="INSTANCE_OCCLUDER" value="11" enum="InstanceType">
+		</constant>
+		<constant name="INSTANCE_MAX" value="12" enum="InstanceType">
 			Represents the size of the [enum InstanceType] enum.
 		</constant>
 		<constant name="INSTANCE_GEOMETRY_MASK" value="30" enum="InstanceType">
@@ -3653,7 +3717,9 @@
 		<constant name="INSTANCE_FLAG_DRAW_NEXT_FRAME_IF_VISIBLE" value="2" enum="InstanceFlags">
 			When set, manually requests to draw geometry on next frame.
 		</constant>
-		<constant name="INSTANCE_FLAG_MAX" value="3" enum="InstanceFlags">
+		<constant name="INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING" value="3" enum="InstanceFlags">
+		</constant>
+		<constant name="INSTANCE_FLAG_MAX" value="4" enum="InstanceFlags">
 			Represents the size of the [enum InstanceFlags] enum.
 		</constant>
 		<constant name="SHADOW_CASTING_SETTING_OFF" value="0" enum="ShadowCastingSetting">
diff --git a/doc/classes/Tabs.xml b/doc/classes/Tabs.xml
index 79fa8896e3..d784585e20 100644
--- a/doc/classes/Tabs.xml
+++ b/doc/classes/Tabs.xml
@@ -389,8 +389,6 @@
 		<theme_item name="outline_size" type="int" default="0">
 			The size of the tab text outline.
 		</theme_item>
-		<theme_item name="panel" type="StyleBox">
-		</theme_item>
 		<theme_item name="tab_disabled" type="StyleBox">
 			The style of disabled tabs.
 		</theme_item>
diff --git a/doc/classes/Viewport.xml b/doc/classes/Viewport.xml
index 471d21374d..cce5705379 100644
--- a/doc/classes/Viewport.xml
+++ b/doc/classes/Viewport.xml
@@ -267,6 +267,8 @@
 		</member>
 		<member name="use_debanding" type="bool" setter="set_use_debanding" getter="is_using_debanding" default="false">
 		</member>
+		<member name="use_occlusion_culling" type="bool" setter="set_use_occlusion_culling" getter="is_using_occlusion_culling" default="false">
+		</member>
 		<member name="world_2d" type="World2D" setter="set_world_2d" getter="get_world_2d">
 			The custom [World2D] which can be used as 2D environment source.
 		</member>
@@ -419,6 +421,8 @@
 		</constant>
 		<constant name="DEBUG_DRAW_CLUSTER_REFLECTION_PROBES" value="22" enum="DebugDraw">
 		</constant>
+		<constant name="DEBUG_DRAW_OCCLUDERS" value="23" enum="DebugDraw">
+		</constant>
 		<constant name="DEFAULT_CANVAS_ITEM_TEXTURE_FILTER_NEAREST" value="0" enum="DefaultCanvasItemTextureFilter">
 			The texture filter reads from the nearest pixel only. The simplest and fastest method of filtering, but the texture will look pixelized.
 		</constant>
diff --git a/drivers/coreaudio/audio_driver_coreaudio.cpp b/drivers/coreaudio/audio_driver_coreaudio.cpp
index f40036d628..4139727422 100644
--- a/drivers/coreaudio/audio_driver_coreaudio.cpp
+++ b/drivers/coreaudio/audio_driver_coreaudio.cpp
@@ -70,7 +70,7 @@ OSStatus AudioDriverCoreAudio::output_device_address_cb(AudioObjectID inObjectID
 
 Error AudioDriverCoreAudio::init() {
 	AudioComponentDescription desc;
-	zeromem(&desc, sizeof(desc));
+	memset(&desc, 0, sizeof(desc));
 	desc.componentType = kAudioUnitType_Output;
 #ifdef OSX_ENABLED
 	desc.componentSubType = kAudioUnitSubType_HALOutput;
@@ -97,7 +97,7 @@ Error AudioDriverCoreAudio::init() {
 
 	AudioStreamBasicDescription strdesc;
 
-	zeromem(&strdesc, sizeof(strdesc));
+	memset(&strdesc, 0, sizeof(strdesc));
 	UInt32 size = sizeof(strdesc);
 	result = AudioUnitGetProperty(audio_unit, kAudioUnitProperty_StreamFormat, kAudioUnitScope_Output, kOutputBus, &strdesc, &size);
 	ERR_FAIL_COND_V(result != noErr, FAILED);
@@ -118,7 +118,7 @@ Error AudioDriverCoreAudio::init() {
 
 	mix_rate = GLOBAL_GET("audio/driver/mix_rate");
 
-	zeromem(&strdesc, sizeof(strdesc));
+	memset(&strdesc, 0, sizeof(strdesc));
 	strdesc.mFormatID = kAudioFormatLinearPCM;
 	strdesc.mFormatFlags = kLinearPCMFormatFlagIsSignedInteger | kLinearPCMFormatFlagIsPacked;
 	strdesc.mChannelsPerFrame = channels;
@@ -148,7 +148,7 @@ Error AudioDriverCoreAudio::init() {
 	print_verbose("CoreAudio: audio buffer frames: " + itos(buffer_frames) + " calculated latency: " + itos(buffer_frames * 1000 / mix_rate) + "ms");
 
 	AURenderCallbackStruct callback;
-	zeromem(&callback, sizeof(AURenderCallbackStruct));
+	memset(&callback, 0, sizeof(AURenderCallbackStruct));
 	callback.inputProc = &AudioDriverCoreAudio::output_callback;
 	callback.inputProcRefCon = this;
 	result = AudioUnitSetProperty(audio_unit, kAudioUnitProperty_SetRenderCallback, kAudioUnitScope_Input, kOutputBus, &callback, sizeof(callback));
@@ -173,7 +173,7 @@ OSStatus AudioDriverCoreAudio::output_callback(void *inRefCon,
 	if (!ad->active || !ad->try_lock()) {
 		for (unsigned int i = 0; i < ioData->mNumberBuffers; i++) {
 			AudioBuffer *abuf = &ioData->mBuffers[i];
-			zeromem(abuf->mData, abuf->mDataByteSize);
+			memset(abuf->mData, 0, abuf->mDataByteSize);
 		};
 		return 0;
 	};
@@ -293,7 +293,7 @@ void AudioDriverCoreAudio::finish() {
 		lock();
 
 		AURenderCallbackStruct callback;
-		zeromem(&callback, sizeof(AURenderCallbackStruct));
+		memset(&callback, 0, sizeof(AURenderCallbackStruct));
 		result = AudioUnitSetProperty(audio_unit, kAudioUnitProperty_SetRenderCallback, kAudioUnitScope_Input, kOutputBus, &callback, sizeof(callback));
 		if (result != noErr) {
 			ERR_PRINT("AudioUnitSetProperty failed");
@@ -337,7 +337,7 @@ void AudioDriverCoreAudio::finish() {
 
 Error AudioDriverCoreAudio::capture_init() {
 	AudioComponentDescription desc;
-	zeromem(&desc, sizeof(desc));
+	memset(&desc, 0, sizeof(desc));
 	desc.componentType = kAudioUnitType_Output;
 #ifdef OSX_ENABLED
 	desc.componentSubType = kAudioUnitSubType_HALOutput;
@@ -383,7 +383,7 @@ Error AudioDriverCoreAudio::capture_init() {
 #endif
 
 	AudioStreamBasicDescription strdesc;
-	zeromem(&strdesc, sizeof(strdesc));
+	memset(&strdesc, 0, sizeof(strdesc));
 	size = sizeof(strdesc);
 	result = AudioUnitGetProperty(input_unit, kAudioUnitProperty_StreamFormat, kAudioUnitScope_Output, kInputBus, &strdesc, &size);
 	ERR_FAIL_COND_V(result != noErr, FAILED);
@@ -405,7 +405,7 @@ Error AudioDriverCoreAudio::capture_init() {
 
 	mix_rate = GLOBAL_GET("audio/driver/mix_rate");
 
-	zeromem(&strdesc, sizeof(strdesc));
+	memset(&strdesc, 0, sizeof(strdesc));
 	strdesc.mFormatID = kAudioFormatLinearPCM;
 	strdesc.mFormatFlags = kLinearPCMFormatFlagIsSignedInteger | kLinearPCMFormatFlagIsPacked;
 	strdesc.mChannelsPerFrame = capture_channels;
@@ -419,7 +419,7 @@ Error AudioDriverCoreAudio::capture_init() {
 	ERR_FAIL_COND_V(result != noErr, FAILED);
 
 	AURenderCallbackStruct callback;
-	zeromem(&callback, sizeof(AURenderCallbackStruct));
+	memset(&callback, 0, sizeof(AURenderCallbackStruct));
 	callback.inputProc = &AudioDriverCoreAudio::input_callback;
 	callback.inputProcRefCon = this;
 	result = AudioUnitSetProperty(input_unit, kAudioOutputUnitProperty_SetInputCallback, kAudioUnitScope_Global, kInputBus, &callback, sizeof(callback));
@@ -436,7 +436,7 @@ void AudioDriverCoreAudio::capture_finish() {
 		lock();
 
 		AURenderCallbackStruct callback;
-		zeromem(&callback, sizeof(AURenderCallbackStruct));
+		memset(&callback, 0, sizeof(AURenderCallbackStruct));
 		OSStatus result = AudioUnitSetProperty(input_unit, kAudioOutputUnitProperty_SetInputCallback, kAudioUnitScope_Global, 0, &callback, sizeof(callback));
 		if (result != noErr) {
 			ERR_PRINT("AudioUnitSetProperty failed");
diff --git a/drivers/dummy/rasterizer_dummy.h b/drivers/dummy/rasterizer_dummy.h
index 9d6be1a802..f9a76d1603 100644
--- a/drivers/dummy/rasterizer_dummy.h
+++ b/drivers/dummy/rasterizer_dummy.h
@@ -548,6 +548,12 @@ public:
 	void lightmap_set_probe_capture_update_speed(float p_speed) override {}
 	float lightmap_get_probe_capture_update_speed() const override { return 0; }
 
+	/* OCCLUDER */
+
+	RID occluder_allocate() override { return RID(); }
+	void occluder_initialize(RID p_rid) override {}
+	void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) override {} // No-op stub.
+
 	/* PARTICLES */
 
 	RID particles_allocate() override { return RID(); }
diff --git a/drivers/png/image_loader_png.cpp b/drivers/png/image_loader_png.cpp
index 854c6706e6..ded6bbc53e 100644
--- a/drivers/png/image_loader_png.cpp
+++ b/drivers/png/image_loader_png.cpp
@@ -88,7 +88,7 @@ Vector<uint8_t> ImageLoaderPNG::lossless_pack_png(const Ref<Image> &p_image) {
 	{
 		// must be closed before call to image_to_png
 		uint8_t *writer = out_buffer.ptrw();
-		copymem(writer, "PNG ", 4);
+		memcpy(writer, "PNG ", 4);
 	}
 
 	Error err = PNGDriverCommon::image_to_png(p_image, out_buffer);
diff --git a/drivers/png/png_driver_common.cpp b/drivers/png/png_driver_common.cpp
index 9e848a2253..412e17c6b7 100644
--- a/drivers/png/png_driver_common.cpp
+++ b/drivers/png/png_driver_common.cpp
@@ -60,7 +60,7 @@ static bool check_error(const png_image &image) {
 
 Error png_to_image(const uint8_t *p_source, size_t p_size, bool p_force_linear, Ref<Image> p_image) {
 	png_image png_img;
-	zeromem(&png_img, sizeof(png_img));
+	memset(&png_img, 0, sizeof(png_img));
 	png_img.version = PNG_IMAGE_VERSION;
 
 	// fetch image properties
@@ -134,7 +134,7 @@ Error image_to_png(const Ref<Image> &p_image, Vector<uint8_t> &p_buffer) {
 	ERR_FAIL_COND_V(source_image->is_compressed(), FAILED);
 
 	png_image png_img;
-	zeromem(&png_img, sizeof(png_img));
+	memset(&png_img, 0, sizeof(png_img));
 	png_img.version = PNG_IMAGE_VERSION;
 	png_img.width = source_image->get_width();
 	png_img.height = source_image->get_height();
diff --git a/drivers/unix/net_socket_posix.cpp b/drivers/unix/net_socket_posix.cpp
index bbf96d8239..e2ad352c10 100644
--- a/drivers/unix/net_socket_posix.cpp
+++ b/drivers/unix/net_socket_posix.cpp
@@ -106,7 +106,7 @@ size_t NetSocketPosix::_set_addr_storage(struct sockaddr_storage *p_addr, const
 		addr6->sin6_family = AF_INET6;
 		addr6->sin6_port = htons(p_port);
 		if (p_ip.is_valid()) {
-			copymem(&addr6->sin6_addr.s6_addr, p_ip.get_ipv6(), 16);
+			memcpy(&addr6->sin6_addr.s6_addr, p_ip.get_ipv6(), 16);
 		} else {
 			addr6->sin6_addr = in6addr_any;
 		}
@@ -121,7 +121,7 @@ size_t NetSocketPosix::_set_addr_storage(struct sockaddr_storage *p_addr, const
 		addr4->sin_port = htons(p_port); // short, network byte order
 
 		if (p_ip.is_valid()) {
-			copymem(&addr4->sin_addr.s_addr, p_ip.get_ipv4(), 4);
+			memcpy(&addr4->sin_addr.s_addr, p_ip.get_ipv4(), 4);
 		} else {
 			addr4->sin_addr.s_addr = INADDR_ANY;
 		}
@@ -283,13 +283,13 @@ _FORCE_INLINE_ Error NetSocketPosix::_change_multicast_group(IP_Address p_ip, St
 		ERR_FAIL_COND_V(!if_ip.is_valid(), ERR_INVALID_PARAMETER);
 		struct ip_mreq greq;
 		int sock_opt = p_add ? IP_ADD_MEMBERSHIP : IP_DROP_MEMBERSHIP;
-		copymem(&greq.imr_multiaddr, p_ip.get_ipv4(), 4);
-		copymem(&greq.imr_interface, if_ip.get_ipv4(), 4);
+		memcpy(&greq.imr_multiaddr, p_ip.get_ipv4(), 4);
+		memcpy(&greq.imr_interface, if_ip.get_ipv4(), 4);
 		ret = setsockopt(_sock, level, sock_opt, (const char *)&greq, sizeof(greq));
 	} else {
 		struct ipv6_mreq greq;
 		int sock_opt = p_add ? IPV6_ADD_MEMBERSHIP : IPV6_DROP_MEMBERSHIP;
-		copymem(&greq.ipv6mr_multiaddr, p_ip.get_ipv6(), 16);
+		memcpy(&greq.ipv6mr_multiaddr, p_ip.get_ipv6(), 16);
 		greq.ipv6mr_interface = if_v6id;
 		ret = setsockopt(_sock, level, sock_opt, (const char *)&greq, sizeof(greq));
 	}
diff --git a/drivers/vulkan/rendering_device_vulkan.cpp b/drivers/vulkan/rendering_device_vulkan.cpp
index 09e2b4546a..b664ccdd3c 100644
--- a/drivers/vulkan/rendering_device_vulkan.cpp
+++ b/drivers/vulkan/rendering_device_vulkan.cpp
@@ -1600,7 +1600,7 @@ Error RenderingDeviceVulkan::_buffer_update(Buffer *p_buffer, size_t p_offset, c
 		}
 
 		//copy to staging buffer
-		copymem(((uint8_t *)data_ptr) + block_write_offset, p_data + submit_from, block_write_amount);
+		memcpy(((uint8_t *)data_ptr) + block_write_offset, p_data + submit_from, block_write_amount);
 
 		//unmap
 		vmaUnmapMemory(allocator, staging_buffer_blocks[staging_buffer_current].allocation);
@@ -2558,7 +2558,7 @@ Vector<uint8_t> RenderingDeviceVulkan::_texture_get_data_from_image(Texture *tex
 						const uint8_t *rptr = slice_read_ptr + y * layout.rowPitch;
 						uint8_t *wptr = write_ptr + y * line_width;
 
-						copymem(wptr, rptr, line_width);
+						memcpy(wptr, rptr, line_width);
 					}
 
 				} else {
@@ -2566,7 +2566,7 @@ Vector<uint8_t> RenderingDeviceVulkan::_texture_get_data_from_image(Texture *tex
 					for (uint32_t y = 0; y < height; y++) {
 						const uint8_t *rptr = slice_read_ptr + y * layout.rowPitch;
 						uint8_t *wptr = write_ptr + y * pixel_size * width;
-						copymem(wptr, rptr, (uint64_t)pixel_size * width);
+						memcpy(wptr, rptr, (uint64_t)pixel_size * width);
 					}
 				}
 			}
@@ -2699,7 +2699,7 @@ Vector<uint8_t> RenderingDeviceVulkan::texture_get_data(RID p_texture, uint32_t
 		{
 			buffer_data.resize(buffer_size);
 			uint8_t *w = buffer_data.ptrw();
-			copymem(w, buffer_mem, buffer_size);
+			memcpy(w, buffer_mem, buffer_size);
 		}
 
 		vmaUnmapMemory(allocator, tmp_buffer.allocation);
@@ -5359,7 +5359,7 @@ Vector<uint8_t> RenderingDeviceVulkan::buffer_get_data(RID p_buffer) {
 	{
 		buffer_data.resize(buffer->size);
 		uint8_t *w = buffer_data.ptrw();
-		copymem(w, buffer_mem, buffer->size);
+		memcpy(w, buffer_mem, buffer->size);
 	}
 
 	vmaUnmapMemory(allocator, tmp_buffer.allocation);
diff --git a/editor/editor_node.cpp b/editor/editor_node.cpp
index 055baeb81e..6137617564 100644
--- a/editor/editor_node.cpp
+++ b/editor/editor_node.cpp
@@ -143,6 +143,7 @@
 #include "editor/plugins/multimesh_editor_plugin.h"
 #include "editor/plugins/navigation_polygon_editor_plugin.h"
 #include "editor/plugins/node_3d_editor_plugin.h"
+#include "editor/plugins/occluder_instance_3d_editor_plugin.h"
 #include "editor/plugins/ot_features_plugin.h"
 #include "editor/plugins/packed_scene_translation_parser_plugin.h"
 #include "editor/plugins/path_2d_editor_plugin.h"
@@ -6800,6 +6801,7 @@ EditorNode::EditorNode() {
 	add_editor_plugin(memnew(TextureRegionEditorPlugin(this)));
 	add_editor_plugin(memnew(GIProbeEditorPlugin(this)));
 	add_editor_plugin(memnew(BakedLightmapEditorPlugin(this)));
+	add_editor_plugin(memnew(OccluderInstance3DEditorPlugin(this)));
 	add_editor_plugin(memnew(Path2DEditorPlugin(this)));
 	add_editor_plugin(memnew(Path3DEditorPlugin(this)));
 	add_editor_plugin(memnew(Line2DEditorPlugin(this)));
diff --git a/editor/node_3d_editor_gizmos.cpp b/editor/node_3d_editor_gizmos.cpp
index 7dcabafece..afafd7d195 100644
--- a/editor/node_3d_editor_gizmos.cpp
+++ b/editor/node_3d_editor_gizmos.cpp
@@ -47,6 +47,7 @@
 #include "scene/3d/listener_3d.h"
 #include "scene/3d/mesh_instance_3d.h"
 #include "scene/3d/navigation_region_3d.h"
+#include "scene/3d/occluder_instance_3d.h"
 #include "scene/3d/physics_joint_3d.h"
 #include "scene/3d/position_3d.h"
 #include "scene/3d/ray_cast_3d.h"
@@ -176,6 +177,7 @@ void EditorNode3DGizmo::Instance::create_instance(Node3D *p_base, bool p_hidden)
 	RS::get_singleton()->instance_geometry_set_cast_shadows_setting(instance, RS::SHADOW_CASTING_SETTING_OFF);
 	int layer = p_hidden ? 0 : 1 << Node3DEditorViewport::GIZMO_EDIT_LAYER;
 	RS::get_singleton()->instance_set_layer_mask(instance, layer); //gizmos are 26
+	RS::get_singleton()->instance_geometry_set_flag(instance, RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 }
 
 void EditorNode3DGizmo::add_mesh(const Ref<ArrayMesh> &p_mesh, bool p_billboard, const Ref<SkinReference> &p_skin_reference, const Ref<Material> &p_material) {
@@ -1464,6 +1466,44 @@ void MeshInstance3DGizmoPlugin::redraw(EditorNode3DGizmo *p_gizmo) {
 }
 
 /////
+
+OccluderInstance3DGizmoPlugin::OccluderInstance3DGizmoPlugin() { // Creates the "line_material" that redraw() looks up.
+	create_material("line_material", EDITOR_DEF("editors/3d_gizmos/gizmo_colors/occluder", Color(0.8, 0.5, 1)));
+}
+
+bool OccluderInstance3DGizmoPlugin::has_gizmo(Node3D *p_spatial) { // True only when p_spatial casts to OccluderInstance3D.
+	return Object::cast_to<OccluderInstance3D>(p_spatial) != nullptr;
+}
+
+String OccluderInstance3DGizmoPlugin::get_gizmo_name() const { // Name reported for this gizmo plugin.
+	return "OccluderInstance3D";
+}
+
+int OccluderInstance3DGizmoPlugin::get_priority() const { // NOTE(review): how -1 ranks against other gizmo plugins is not visible here - confirm.
+	return -1;
+}
+
+void OccluderInstance3DGizmoPlugin::redraw(EditorNode3DGizmo *p_gizmo) {
+	// Drop the previous geometry, then rebuild it from the occluder's debug lines (if any).
+	p_gizmo->clear();
+	OccluderInstance3D *node = Object::cast_to<OccluderInstance3D>(p_gizmo->get_spatial_node());
+	Ref<Occluder3D> occluder = node->get_occluder();
+	if (occluder.is_null()) {
+		return; // No occluder resource assigned: leave the gizmo empty.
+	}
+
+	Vector<Vector3> segments = occluder->get_debug_lines();
+	if (segments.is_empty()) {
+		return;
+	}
+
+	Ref<Material> line_material = get_material("line_material", p_gizmo);
+	p_gizmo->add_lines(segments, line_material);
+	p_gizmo->add_collision_segments(segments);
+}
+
+/////
+
 Sprite3DGizmoPlugin::Sprite3DGizmoPlugin() {
 }
 
diff --git a/editor/node_3d_editor_gizmos.h b/editor/node_3d_editor_gizmos.h
index 6f98d3a08c..95344176ad 100644
--- a/editor/node_3d_editor_gizmos.h
+++ b/editor/node_3d_editor_gizmos.h
@@ -100,6 +100,18 @@ public:
 	MeshInstance3DGizmoPlugin();
 };
 
+class OccluderInstance3DGizmoPlugin : public EditorNode3DGizmoPlugin { // Editor gizmo plugin for OccluderInstance3D nodes.
+	GDCLASS(OccluderInstance3DGizmoPlugin, EditorNode3DGizmoPlugin);
+
+public:
+	bool has_gizmo(Node3D *p_spatial) override;
+	String get_gizmo_name() const override;
+	int get_priority() const override;
+	void redraw(EditorNode3DGizmo *p_gizmo) override;
+
+	OccluderInstance3DGizmoPlugin();
+};
+
 class Sprite3DGizmoPlugin : public EditorNode3DGizmoPlugin {
 	GDCLASS(Sprite3DGizmoPlugin, EditorNode3DGizmoPlugin);
 
diff --git a/editor/plugins/gpu_particles_3d_editor_plugin.cpp b/editor/plugins/gpu_particles_3d_editor_plugin.cpp
index 433a5ae51c..89d6aaa5f9 100644
--- a/editor/plugins/gpu_particles_3d_editor_plugin.cpp
+++ b/editor/plugins/gpu_particles_3d_editor_plugin.cpp
@@ -346,7 +346,7 @@ void GPUParticles3DEditor::_generate_emission_points() {
 
 	{
 		uint8_t *iw = point_img.ptrw();
-		zeromem(iw, w * h * 3 * sizeof(float));
+		memset(iw, 0, w * h * 3 * sizeof(float));
 		const Vector3 *r = points.ptr();
 		float *wf = (float *)iw;
 		for (int i = 0; i < point_count; i++) {
@@ -374,7 +374,7 @@ void GPUParticles3DEditor::_generate_emission_points() {
 
 		{
 			uint8_t *iw = point_img2.ptrw();
-			zeromem(iw, w * h * 3 * sizeof(float));
+			memset(iw, 0, w * h * 3 * sizeof(float));
 			const Vector3 *r = normals.ptr();
 			float *wf = (float *)iw;
 			for (int i = 0; i < point_count; i++) {
diff --git a/editor/plugins/node_3d_editor_plugin.cpp b/editor/plugins/node_3d_editor_plugin.cpp
index 13c7814dac..023d91be30 100644
--- a/editor/plugins/node_3d_editor_plugin.cpp
+++ b/editor/plugins/node_3d_editor_plugin.cpp
@@ -2368,6 +2368,9 @@ void Node3DEditorViewport::_project_settings_changed() {
 	viewport->set_screen_space_aa(Viewport::ScreenSpaceAA(ssaa_mode));
 	const bool use_debanding = GLOBAL_GET("rendering/anti_aliasing/quality/use_debanding");
 	viewport->set_use_debanding(use_debanding);
+
+	const bool use_occlusion_culling = GLOBAL_GET("rendering/occlusion_culling/use_occlusion_culling");
+	viewport->set_use_occlusion_culling(use_occlusion_culling);
 }
 
 void Node3DEditorViewport::_notification(int p_what) {
@@ -3071,7 +3074,8 @@ void Node3DEditorViewport::_menu_option(int p_option) {
 		case VIEW_DISPLAY_DEBUG_CLUSTER_OMNI_LIGHTS:
 		case VIEW_DISPLAY_DEBUG_CLUSTER_SPOT_LIGHTS:
 		case VIEW_DISPLAY_DEBUG_CLUSTER_DECALS:
-		case VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES: {
+		case VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES:
+		case VIEW_DISPLAY_DEBUG_OCCLUDERS: {
 			static const int display_options[] = {
 				VIEW_DISPLAY_NORMAL,
 				VIEW_DISPLAY_WIREFRAME,
@@ -3097,6 +3101,7 @@ void Node3DEditorViewport::_menu_option(int p_option) {
 				VIEW_DISPLAY_DEBUG_CLUSTER_SPOT_LIGHTS,
 				VIEW_DISPLAY_DEBUG_CLUSTER_DECALS,
 				VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES,
+				VIEW_DISPLAY_DEBUG_OCCLUDERS,
 				VIEW_MAX
 			};
 			static const Viewport::DebugDraw debug_draw_modes[] = {
@@ -3124,6 +3129,7 @@ void Node3DEditorViewport::_menu_option(int p_option) {
 				Viewport::DEBUG_DRAW_CLUSTER_SPOT_LIGHTS,
 				Viewport::DEBUG_DRAW_CLUSTER_DECALS,
 				Viewport::DEBUG_DRAW_CLUSTER_REFLECTION_PROBES,
+				Viewport::DEBUG_DRAW_OCCLUDERS,
 			};
 
 			int idx = 0;
@@ -3173,6 +3179,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) {
 		RS::get_singleton()->instance_set_visible(move_gizmo_instance[i], false);
 		RS::get_singleton()->instance_geometry_set_cast_shadows_setting(move_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF);
 		RS::get_singleton()->instance_set_layer_mask(move_gizmo_instance[i], layer);
+		RS::get_singleton()->instance_geometry_set_flag(move_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 
 		move_plane_gizmo_instance[i] = RS::get_singleton()->instance_create();
 		RS::get_singleton()->instance_set_base(move_plane_gizmo_instance[i], spatial_editor->get_move_plane_gizmo(i)->get_rid());
@@ -3180,6 +3187,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) {
 		RS::get_singleton()->instance_set_visible(move_plane_gizmo_instance[i], false);
 		RS::get_singleton()->instance_geometry_set_cast_shadows_setting(move_plane_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF);
 		RS::get_singleton()->instance_set_layer_mask(move_plane_gizmo_instance[i], layer);
+		RS::get_singleton()->instance_geometry_set_flag(move_plane_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 
 		rotate_gizmo_instance[i] = RS::get_singleton()->instance_create();
 		RS::get_singleton()->instance_set_base(rotate_gizmo_instance[i], spatial_editor->get_rotate_gizmo(i)->get_rid());
@@ -3187,6 +3195,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) {
 		RS::get_singleton()->instance_set_visible(rotate_gizmo_instance[i], false);
 		RS::get_singleton()->instance_geometry_set_cast_shadows_setting(rotate_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF);
 		RS::get_singleton()->instance_set_layer_mask(rotate_gizmo_instance[i], layer);
+		RS::get_singleton()->instance_geometry_set_flag(rotate_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 
 		scale_gizmo_instance[i] = RS::get_singleton()->instance_create();
 		RS::get_singleton()->instance_set_base(scale_gizmo_instance[i], spatial_editor->get_scale_gizmo(i)->get_rid());
@@ -3194,6 +3203,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) {
 		RS::get_singleton()->instance_set_visible(scale_gizmo_instance[i], false);
 		RS::get_singleton()->instance_geometry_set_cast_shadows_setting(scale_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF);
 		RS::get_singleton()->instance_set_layer_mask(scale_gizmo_instance[i], layer);
+		RS::get_singleton()->instance_geometry_set_flag(scale_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 
 		scale_plane_gizmo_instance[i] = RS::get_singleton()->instance_create();
 		RS::get_singleton()->instance_set_base(scale_plane_gizmo_instance[i], spatial_editor->get_scale_plane_gizmo(i)->get_rid());
@@ -3201,6 +3211,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) {
 		RS::get_singleton()->instance_set_visible(scale_plane_gizmo_instance[i], false);
 		RS::get_singleton()->instance_geometry_set_cast_shadows_setting(scale_plane_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF);
 		RS::get_singleton()->instance_set_layer_mask(scale_plane_gizmo_instance[i], layer);
+		RS::get_singleton()->instance_geometry_set_flag(scale_plane_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 	}
 
 	// Rotation white outline
@@ -3210,6 +3221,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) {
 	RS::get_singleton()->instance_set_visible(rotate_gizmo_instance[3], false);
 	RS::get_singleton()->instance_geometry_set_cast_shadows_setting(rotate_gizmo_instance[3], RS::SHADOW_CASTING_SETTING_OFF);
 	RS::get_singleton()->instance_set_layer_mask(rotate_gizmo_instance[3], layer);
+	RS::get_singleton()->instance_geometry_set_flag(rotate_gizmo_instance[3], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 }
 
 void Node3DEditorViewport::_finish_gizmo_instances() {
@@ -4043,6 +4055,7 @@ Node3DEditorViewport::Node3DEditorViewport(Node3DEditor *p_spatial_editor, Edito
 	display_submenu->add_radio_check_item(TTR("Spot Light Cluster"), VIEW_DISPLAY_DEBUG_CLUSTER_SPOT_LIGHTS);
 	display_submenu->add_radio_check_item(TTR("Decal Cluster"), VIEW_DISPLAY_DEBUG_CLUSTER_DECALS);
 	display_submenu->add_radio_check_item(TTR("Reflection Probe Cluster"), VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES);
+	display_submenu->add_radio_check_item(TTR("Occlusion Culling Buffer"), VIEW_DISPLAY_DEBUG_OCCLUDERS);
 
 	display_submenu->set_name("display_advanced");
 	view_menu->get_popup()->add_submenu_item(TTR("Display Advanced..."), "display_advanced", VIEW_DISPLAY_ADVANCED);
@@ -4625,6 +4638,7 @@ Object *Node3DEditor::_get_editor_data(Object *p_what) {
 			si->sbox_instance,
 			RS::SHADOW_CASTING_SETTING_OFF);
 	RS::get_singleton()->instance_set_layer_mask(si->sbox_instance, 1 << Node3DEditorViewport::MISC_TOOL_LAYER);
+	RS::get_singleton()->instance_geometry_set_flag(si->sbox_instance, RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 	si->sbox_instance_xray = RenderingServer::get_singleton()->instance_create2(
 			selection_box_xray->get_rid(),
 			sp->get_world_3d()->get_scenario());
@@ -4632,6 +4646,7 @@ Object *Node3DEditor::_get_editor_data(Object *p_what) {
 			si->sbox_instance_xray,
 			RS::SHADOW_CASTING_SETTING_OFF);
 	RS::get_singleton()->instance_set_layer_mask(si->sbox_instance_xray, 1 << Node3DEditorViewport::MISC_TOOL_LAYER);
+	RS::get_singleton()->instance_geometry_set_flag(si->sbox_instance_xray, RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true); // The x-ray selection box must ignore occlusion culling as well.
 
 	return si;
 }
@@ -5403,6 +5418,7 @@ void Node3DEditor::_init_indicators() {
 
 		origin_instance = RenderingServer::get_singleton()->instance_create2(origin, get_tree()->get_root()->get_world_3d()->get_scenario());
 		RS::get_singleton()->instance_set_layer_mask(origin_instance, 1 << Node3DEditorViewport::GIZMO_GRID_LAYER);
+		RS::get_singleton()->instance_geometry_set_flag(origin_instance, RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 
 		RenderingServer::get_singleton()->instance_geometry_set_cast_shadows_setting(origin_instance, RS::SHADOW_CASTING_SETTING_OFF);
 	}
@@ -5964,6 +5980,7 @@ void Node3DEditor::_init_grid() {
 		RenderingServer::get_singleton()->instance_set_visible(grid_instance[c], grid_visible[a]);
 		RenderingServer::get_singleton()->instance_geometry_set_cast_shadows_setting(grid_instance[c], RS::SHADOW_CASTING_SETTING_OFF);
 		RS::get_singleton()->instance_set_layer_mask(grid_instance[c], 1 << Node3DEditorViewport::GIZMO_GRID_LAYER);
+		RS::get_singleton()->instance_geometry_set_flag(grid_instance[c], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 	}
 }
 
@@ -6465,6 +6482,7 @@ void Node3DEditor::_register_all_gizmos() {
 	add_gizmo_plugin(Ref<Light3DGizmoPlugin>(memnew(Light3DGizmoPlugin)));
 	add_gizmo_plugin(Ref<AudioStreamPlayer3DGizmoPlugin>(memnew(AudioStreamPlayer3DGizmoPlugin)));
 	add_gizmo_plugin(Ref<MeshInstance3DGizmoPlugin>(memnew(MeshInstance3DGizmoPlugin)));
+	add_gizmo_plugin(Ref<OccluderInstance3DGizmoPlugin>(memnew(OccluderInstance3DGizmoPlugin)));
 	add_gizmo_plugin(Ref<SoftBody3DGizmoPlugin>(memnew(SoftBody3DGizmoPlugin)));
 	add_gizmo_plugin(Ref<Sprite3DGizmoPlugin>(memnew(Sprite3DGizmoPlugin)));
 	add_gizmo_plugin(Ref<Skeleton3DGizmoPlugin>(memnew(Skeleton3DGizmoPlugin)));
@@ -7340,6 +7358,7 @@ void EditorNode3DGizmoPlugin::create_material(const String &p_name, const Color
 		material->set_shading_mode(StandardMaterial3D::SHADING_MODE_UNSHADED);
 		material->set_transparency(StandardMaterial3D::TRANSPARENCY_ALPHA);
 		material->set_render_priority(StandardMaterial3D::RENDER_PRIORITY_MIN + 1);
+		material->set_cull_mode(StandardMaterial3D::CULL_DISABLED);
 
 		if (p_use_vertex_color) {
 			material->set_flag(StandardMaterial3D::FLAG_ALBEDO_FROM_VERTEX_COLOR, true);
diff --git a/editor/plugins/node_3d_editor_plugin.h b/editor/plugins/node_3d_editor_plugin.h
index 70329f90c7..33f4c32471 100644
--- a/editor/plugins/node_3d_editor_plugin.h
+++ b/editor/plugins/node_3d_editor_plugin.h
@@ -221,6 +221,7 @@ class Node3DEditorViewport : public Control {
 		VIEW_DISPLAY_DEBUG_CLUSTER_SPOT_LIGHTS,
 		VIEW_DISPLAY_DEBUG_CLUSTER_DECALS,
 		VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES,
+		VIEW_DISPLAY_DEBUG_OCCLUDERS,
 
 		VIEW_LOCK_ROTATION,
 		VIEW_CINEMATIC_PREVIEW,
diff --git a/editor/plugins/occluder_instance_3d_editor_plugin.cpp b/editor/plugins/occluder_instance_3d_editor_plugin.cpp
new file mode 100644
index 0000000000..0821f140b3
--- /dev/null
+++ b/editor/plugins/occluder_instance_3d_editor_plugin.cpp
@@ -0,0 +1,117 @@
+/*************************************************************************/
+/*  occluder_instance_3d_editor_plugin.cpp                               */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#include "occluder_instance_3d_editor_plugin.h"
+
+void OccluderInstance3DEditorPlugin::_bake_select_file(const String &p_file) { // Bakes the edited instance with p_file as save path and reports bake errors.
+	if (occluder_instance) { // Nothing to do until edit() has stored an instance.
+		OccluderInstance3D::BakeError err;
+		if (get_tree()->get_edited_scene_root() && get_tree()->get_edited_scene_root() == occluder_instance) {
+			err = occluder_instance->bake(occluder_instance, p_file); // The instance is the edited scene root: bake from itself.
+		} else {
+			err = occluder_instance->bake(occluder_instance->get_parent(), p_file); // Otherwise bake from its parent node.
+		}
+		// Only the two errors below get user feedback; every other result is ignored.
+		switch (err) {
+			case OccluderInstance3D::BAKE_ERROR_NO_SAVE_PATH: {
+				String scene_path = occluder_instance->get_filename();
+				if (scene_path == String() && occluder_instance->get_owner()) { // get_owner() can be null (e.g. for a scene root); don't dereference it then.
+					scene_path = occluder_instance->get_owner()->get_filename();
+				}
+				if (scene_path == String()) {
+					EditorNode::get_singleton()->show_warning(TTR("Can't determine a save path for the occluder.\nSave your scene and try again."));
+					break;
+				}
+				scene_path = scene_path.get_basename() + ".occ"; // Suggest "<scene basename>.occ" as the default bake file.
+
+				file_dialog->set_current_path(scene_path);
+				file_dialog->popup_file_dialog(); // The dialog's "file_selected" signal calls this function again with the chosen path.
+
+			} break;
+			case OccluderInstance3D::BAKE_ERROR_NO_MESHES: {
+				EditorNode::get_singleton()->show_warning(TTR("No meshes to bake."));
+				break;
+			}
+			default: {
+			}
+		}
+	}
+}
+
+void OccluderInstance3DEditorPlugin::_bake() {
+	_bake_select_file(""); // Start a bake with an empty save path.
+}
+
+void OccluderInstance3DEditorPlugin::edit(Object *p_object) {
+	// Remember p_object as the node to bake, but only when it really is an
+	// OccluderInstance3D; any other object leaves the stored instance untouched.
+	OccluderInstance3D *candidate = Object::cast_to<OccluderInstance3D>(p_object);
+	if (candidate != nullptr) {
+		occluder_instance = candidate;
+	}
+}
+
+bool OccluderInstance3DEditorPlugin::handles(Object *p_object) const {
+	return p_object->is_class("OccluderInstance3D"); // Matches objects by the class name "OccluderInstance3D".
+}
+
+void OccluderInstance3DEditorPlugin::make_visible(bool p_visible) {
+	if (!p_visible) {
+		bake->hide();
+		return;
+	}
+	bake->show(); // The bake button is the only control this plugin toggles.
+}
+
+void OccluderInstance3DEditorPlugin::_bind_methods() {
+	ClassDB::bind_method("_bake", &OccluderInstance3DEditorPlugin::_bake); // Registers _bake under the name used by the button's Callable(this, "_bake").
+}
+
+OccluderInstance3DEditorPlugin::OccluderInstance3DEditorPlugin(EditorNode *p_node) {
+	editor = p_node;
+	bake = memnew(Button); // Bake button added to the spatial editor menu container below.
+	bake->set_flat(true);
+	bake->set_icon(editor->get_gui_base()->get_theme_icon("Bake", "EditorIcons"));
+	bake->set_text(TTR("Bake Occluders"));
+	bake->hide(); // Starts hidden; make_visible() shows or hides it.
+	bake->connect("pressed", Callable(this, "_bake")); // Pressing the button runs _bake().
+	add_control_to_container(CONTAINER_SPATIAL_EDITOR_MENU, bake);
+	occluder_instance = nullptr; // No instance is being edited until edit() stores one.
+
+	file_dialog = memnew(EditorFileDialog); // Save dialog used to choose the bake output file.
+	file_dialog->set_file_mode(EditorFileDialog::FILE_MODE_SAVE_FILE);
+	file_dialog->add_filter("*.occ ; Occluder3D");
+	file_dialog->set_title(TTR("Select occluder bake file:"));
+	file_dialog->connect("file_selected", callable_mp(this, &OccluderInstance3DEditorPlugin::_bake_select_file)); // The chosen path is passed to _bake_select_file().
+	bake->add_child(file_dialog); // The dialog is parented to the bake button.
+}
+
+OccluderInstance3DEditorPlugin::~OccluderInstance3DEditorPlugin() {
+} // Empty body: no explicit cleanup is performed here.
diff --git a/editor/plugins/occluder_instance_3d_editor_plugin.h b/editor/plugins/occluder_instance_3d_editor_plugin.h
new file mode 100644
index 0000000000..161b17811c
--- /dev/null
+++ b/editor/plugins/occluder_instance_3d_editor_plugin.h
@@ -0,0 +1,66 @@
+/*************************************************************************/
+/*  occluder_instance_3d_editor_plugin.h                                 */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifndef OCCLUDER_INSTANCE_3D_EDITOR_PLUGIN_H
+#define OCCLUDER_INSTANCE_3D_EDITOR_PLUGIN_H
+
+#include "editor/editor_node.h"
+#include "editor/editor_plugin.h"
+#include "scene/3d/occluder_instance_3d.h"
+#include "scene/resources/material.h"
+
+class OccluderInstance3DEditorPlugin : public EditorPlugin {
+	GDCLASS(OccluderInstance3DEditorPlugin, EditorPlugin);
+	// Editor plugin that adds a "Bake Occluders" button for OccluderInstance3D nodes.
+	OccluderInstance3D *occluder_instance; // Instance stored by edit(); nullptr until then.
+
+	Button *bake; // The "Bake Occluders" button.
+	EditorNode *editor; // Editor node passed to the constructor.
+
+	EditorFileDialog *file_dialog; // Save dialog for choosing the bake output file.
+
+	void _bake_select_file(const String &p_file); // Runs the bake with p_file as the save path and reports errors.
+	void _bake(); // Calls _bake_select_file("").
+
+protected:
+	static void _bind_methods();
+
+public:
+	virtual String get_name() const override { return "OccluderInstance3D"; }
+	bool has_main_screen() const override { return false; }
+	virtual void edit(Object *p_object) override; // Stores p_object when it is an OccluderInstance3D.
+	virtual bool handles(Object *p_object) const override; // True for objects of class "OccluderInstance3D".
+	virtual void make_visible(bool p_visible) override; // Shows or hides the bake button.
+
+	OccluderInstance3DEditorPlugin(EditorNode *p_node);
+	~OccluderInstance3DEditorPlugin();
+};
+
+#endif
diff --git a/modules/csg/doc_classes/CSGMesh3D.xml b/modules/csg/doc_classes/CSGMesh3D.xml
index 1bab8f4ee9..babdac0e98 100644
--- a/modules/csg/doc_classes/CSGMesh3D.xml
+++ b/modules/csg/doc_classes/CSGMesh3D.xml
@@ -16,6 +16,7 @@
 		</member>
 		<member name="mesh" type="Mesh" setter="set_mesh" getter="get_mesh">
 			The [Mesh] resource to use as a CSG shape.
+			[b]Note:[/b] When using an [ArrayMesh], avoid meshes with vertex normals unless a flat shader is required. By default, CSGMesh will ignore the mesh's vertex normals and use a smooth shader calculated using the faces' normals. If a flat shader is required, ensure that all faces' vertex normals are parallel.
 		</member>
 	</members>
 	<constants>
diff --git a/modules/enet/networked_multiplayer_enet.cpp b/modules/enet/networked_multiplayer_enet.cpp
index 36ffef3967..1cf77b307d 100644
--- a/modules/enet/networked_multiplayer_enet.cpp
+++ b/modules/enet/networked_multiplayer_enet.cpp
@@ -553,7 +553,7 @@ Error NetworkedMultiplayerENet::put_packet(const uint8_t *p_buffer, int p_buffer
 	ENetPacket *packet = enet_packet_create(nullptr, p_buffer_size + 8, packet_flags);
 	encode_uint32(unique_id, &packet->data[0]); // Source ID
 	encode_uint32(target_peer, &packet->data[4]); // Dest ID
-	copymem(&packet->data[8], p_buffer, p_buffer_size);
+	memcpy(&packet->data[8], p_buffer, p_buffer_size);
 
 	if (server) {
 		if (target_peer == 0) {
@@ -664,7 +664,7 @@ size_t NetworkedMultiplayerENet::enet_compress(void *context, const ENetBuffer *
 	while (total) {
 		for (size_t i = 0; i < inBufferCount; i++) {
 			int to_copy = MIN(total, int(inBuffers[i].dataLength));
-			copymem(&enet->src_compressor_mem.write[ofs], inBuffers[i].data, to_copy);
+			memcpy(&enet->src_compressor_mem.write[ofs], inBuffers[i].data, to_copy);
 			ofs += to_copy;
 			total -= to_copy;
 		}
@@ -701,7 +701,7 @@ size_t NetworkedMultiplayerENet::enet_compress(void *context, const ENetBuffer *
 		return 0; // Do not bother
 	}
 
-	copymem(outData, enet->dst_compressor_mem.ptr(), ret);
+	memcpy(outData, enet->dst_compressor_mem.ptr(), ret);
 
 	return ret;
 }
diff --git a/modules/gdnative/nativescript/nativescript.h b/modules/gdnative/nativescript/nativescript.h
index d6ba2bbec1..4bd54f9c46 100644
--- a/modules/gdnative/nativescript/nativescript.h
+++ b/modules/gdnative/nativescript/nativescript.h
@@ -90,8 +90,8 @@ struct NativeScriptDesc {
 	bool is_tool = false;
 
 	inline NativeScriptDesc() {
-		zeromem(&create_func, sizeof(godot_nativescript_instance_create_func));
-		zeromem(&destroy_func, sizeof(godot_nativescript_instance_destroy_func));
+		memset(&create_func, 0, sizeof(godot_nativescript_instance_create_func));
+		memset(&destroy_func, 0, sizeof(godot_nativescript_instance_destroy_func));
 	}
 };
 
diff --git a/modules/gdscript/gdscript.h b/modules/gdscript/gdscript.h
index 12c909fd4f..98da5ad4cb 100644
--- a/modules/gdscript/gdscript.h
+++ b/modules/gdscript/gdscript.h
@@ -270,6 +270,7 @@ public:
 class GDScriptInstance : public ScriptInstance {
 	friend class GDScript;
 	friend class GDScriptFunction;
+	friend class GDScriptLambdaCallable;
 	friend class GDScriptCompiler;
 	friend struct GDScriptUtilityFunctionsDefinitions;
 
diff --git a/modules/gdscript/gdscript_analyzer.cpp b/modules/gdscript/gdscript_analyzer.cpp
index 5da2bb5cc1..17ae52f3ab 100644
--- a/modules/gdscript/gdscript_analyzer.cpp
+++ b/modules/gdscript/gdscript_analyzer.cpp
@@ -856,6 +856,7 @@ void GDScriptAnalyzer::resolve_node(GDScriptParser::Node *p_node) {
 		case GDScriptParser::Node::DICTIONARY:
 		case GDScriptParser::Node::GET_NODE:
 		case GDScriptParser::Node::IDENTIFIER:
+		case GDScriptParser::Node::LAMBDA:
 		case GDScriptParser::Node::LITERAL:
 		case GDScriptParser::Node::PRELOAD:
 		case GDScriptParser::Node::SELF:
@@ -1458,6 +1459,9 @@ void GDScriptAnalyzer::reduce_expression(GDScriptParser::ExpressionNode *p_expre
 		case GDScriptParser::Node::IDENTIFIER:
 			reduce_identifier(static_cast<GDScriptParser::IdentifierNode *>(p_expression));
 			break;
+		case GDScriptParser::Node::LAMBDA:
+			reduce_lambda(static_cast<GDScriptParser::LambdaNode *>(p_expression));
+			break;
 		case GDScriptParser::Node::LITERAL:
 			reduce_literal(static_cast<GDScriptParser::LiteralNode *>(p_expression));
 			break;
@@ -2061,6 +2065,12 @@ void GDScriptAnalyzer::reduce_call(GDScriptParser::CallNode *p_call, bool is_awa
 		is_self = true;
 	} else if (callee_type == GDScriptParser::Node::SUBSCRIPT) {
 		GDScriptParser::SubscriptNode *subscript = static_cast<GDScriptParser::SubscriptNode *>(p_call->callee);
+		if (subscript->base == nullptr) {
+			// Invalid syntax, error already set on parser.
+			p_call->set_datatype(call_type);
+			mark_node_unsafe(p_call);
+			return;
+		}
 		if (!subscript->is_attribute) {
 			// Invalid call. Error already sent in parser.
 			// TODO: Could check if Callable here.
@@ -2097,6 +2107,8 @@ void GDScriptAnalyzer::reduce_call(GDScriptParser::CallNode *p_call, bool is_awa
 
 		if (is_self && parser->current_function != nullptr && parser->current_function->is_static && !is_static) {
 			push_error(vformat(R"*(Cannot call non-static function "%s()" from static function "%s()".)*", p_call->function_name, parser->current_function->identifier->name), p_call->callee);
+		} else if (is_self && !is_static && !lambda_stack.is_empty()) {
+			push_error(vformat(R"*(Cannot call non-static function "%s()" from a lambda function.)*", p_call->function_name), p_call->callee);
 		}
 
 		call_type = return_type;
@@ -2219,6 +2231,8 @@ void GDScriptAnalyzer::reduce_get_node(GDScriptParser::GetNodeNode *p_get_node)
 
 	if (!ClassDB::is_parent_class(GDScriptParser::get_real_class_name(parser->current_class->base_type.native_type), result.native_type)) {
 		push_error(R"*(Cannot use shorthand "get_node()" notation ("$") on a class that isn't a node.)*", p_get_node);
+	} else if (!lambda_stack.is_empty()) {
+		push_error(R"*(Cannot use shorthand "get_node()" notation ("$") inside a lambda. Use a captured variable instead.)*", p_get_node);
 	}
 
 	p_get_node->set_datatype(result);
@@ -2346,6 +2360,7 @@ void GDScriptAnalyzer::reduce_identifier_from_base(GDScriptParser::IdentifierNod
 				case GDScriptParser::ClassNode::Member::ENUM_VALUE:
 					p_identifier->is_constant = true;
 					p_identifier->reduced_value = member.enum_value.value;
+					p_identifier->source = GDScriptParser::IdentifierNode::MEMBER_CONSTANT;
 					break;
 				case GDScriptParser::ClassNode::Member::VARIABLE:
 					p_identifier->source = GDScriptParser::IdentifierNode::MEMBER_VARIABLE;
@@ -2446,42 +2461,65 @@ void GDScriptAnalyzer::reduce_identifier(GDScriptParser::IdentifierNode *p_ident
 		}
 	}
 
+	bool found_source = false;
 	// Check if identifier is local.
 	// If that's the case, the declaration already was solved before.
 	switch (p_identifier->source) {
 		case GDScriptParser::IdentifierNode::FUNCTION_PARAMETER:
 			p_identifier->set_datatype(p_identifier->parameter_source->get_datatype());
-			return;
+			found_source = true;
+			break;
 		case GDScriptParser::IdentifierNode::LOCAL_CONSTANT:
 		case GDScriptParser::IdentifierNode::MEMBER_CONSTANT:
 			p_identifier->set_datatype(p_identifier->constant_source->get_datatype());
 			p_identifier->is_constant = true;
 			// TODO: Constant should have a value on the node itself.
 			p_identifier->reduced_value = p_identifier->constant_source->initializer->reduced_value;
-			return;
+			found_source = true;
+			break;
 		case GDScriptParser::IdentifierNode::MEMBER_VARIABLE:
 			p_identifier->variable_source->usages++;
 			[[fallthrough]];
 		case GDScriptParser::IdentifierNode::LOCAL_VARIABLE:
 			p_identifier->set_datatype(p_identifier->variable_source->get_datatype());
-			return;
+			found_source = true;
+			break;
 		case GDScriptParser::IdentifierNode::LOCAL_ITERATOR:
 			p_identifier->set_datatype(p_identifier->bind_source->get_datatype());
-			return;
+			found_source = true;
+			break;
 		case GDScriptParser::IdentifierNode::LOCAL_BIND: {
 			GDScriptParser::DataType result = p_identifier->bind_source->get_datatype();
 			result.is_constant = true;
 			p_identifier->set_datatype(result);
-			return;
-		}
+			found_source = true;
+		} break;
 		case GDScriptParser::IdentifierNode::UNDEFINED_SOURCE:
 			break;
 	}
 
 	// Not a local, so check members.
-	reduce_identifier_from_base(p_identifier);
-	if (p_identifier->get_datatype().is_set()) {
-		// Found.
+	if (!found_source) {
+		reduce_identifier_from_base(p_identifier);
+		if (p_identifier->source != GDScriptParser::IdentifierNode::UNDEFINED_SOURCE || p_identifier->get_datatype().is_set()) {
+			// Found.
+			found_source = true;
+		}
+	}
+
+	if (found_source) {
+		// If the identifier is local, check if it's any kind of capture by comparing their source function.
+		// Only capture locals and members and enum values. Constants are still accessible from the lambda using the script reference.
+		if (p_identifier->source == GDScriptParser::IdentifierNode::UNDEFINED_SOURCE || p_identifier->source == GDScriptParser::IdentifierNode::MEMBER_CONSTANT || lambda_stack.is_empty()) {
+			return;
+		}
+
+		GDScriptParser::FunctionNode *function_test = lambda_stack.back()->get()->function;
+		while (function_test != nullptr && function_test != p_identifier->source_function && function_test->source_lambda != nullptr && !function_test->source_lambda->captures_indices.has(p_identifier->name)) {
+			function_test->source_lambda->captures_indices[p_identifier->name] = function_test->source_lambda->captures.size();
+			function_test->source_lambda->captures.push_back(p_identifier);
+			function_test = function_test->source_lambda->parent_function;
+		}
 		return;
 	}
 
@@ -2563,6 +2601,57 @@ void GDScriptAnalyzer::reduce_identifier(GDScriptParser::IdentifierNode *p_ident
 	p_identifier->set_datatype(dummy); // Just so type is set to something.
 }
 
+void GDScriptAnalyzer::reduce_lambda(GDScriptParser::LambdaNode *p_lambda) {
+	// Types the lambda as a Callable, resolves its parameters and body, then prepends its captures as parameters.
+	GDScriptParser::DataType lambda_type;
+	lambda_type.type_source = GDScriptParser::DataType::ANNOTATED_INFERRED;
+	lambda_type.kind = GDScriptParser::DataType::BUILTIN;
+	lambda_type.builtin_type = Variant::CALLABLE;
+	p_lambda->set_datatype(lambda_type); // The datatype is set even when there is no function node to analyze.
+
+	if (p_lambda->function == nullptr) {
+		return; // Nothing further to resolve without a function node.
+	}
+
+	GDScriptParser::FunctionNode *previous_function = parser->current_function;
+	parser->current_function = p_lambda->function; // Resolve the body with the lambda as current function; restored at the end.
+
+	lambda_stack.push_back(p_lambda); // A non-empty stack is how the other reduce_* functions detect "inside a lambda".
+
+	for (int i = 0; i < p_lambda->function->parameters.size(); i++) {
+		resolve_parameter(p_lambda->function->parameters[i]);
+	}
+
+	resolve_suite(p_lambda->function->body); // NOTE(review): captures are presumably recorded by reduce_identifier() during this call; confirm.
+
+	int captures_amount = p_lambda->captures.size();
+	if (captures_amount > 0) {
+		// Make room for the captures at the front of the parameter list.
+		// At the beginning to not mess with optional parameters.
+		int param_count = p_lambda->function->parameters.size();
+		p_lambda->function->parameters.resize(param_count + captures_amount);
+		for (int i = param_count - 1; i >= 0; i--) { // Walk backwards so no parameter is overwritten before it has been moved.
+			p_lambda->function->parameters.write[i + captures_amount] = p_lambda->function->parameters[i];
+			p_lambda->function->parameters_indices[p_lambda->function->parameters[i]->identifier->name] = i + captures_amount;
+		}
+
+		// Add captures as extra parameters at the beginning.
+		for (int i = 0; i < p_lambda->captures.size(); i++) {
+			GDScriptParser::IdentifierNode *capture = p_lambda->captures[i];
+			GDScriptParser::ParameterNode *capture_param = parser->alloc_node<GDScriptParser::ParameterNode>();
+			capture_param->identifier = capture; // The parameter reuses the captured identifier node and copies its usages and datatype.
+			capture_param->usages = capture->usages;
+			capture_param->set_datatype(capture->get_datatype());
+
+			p_lambda->function->parameters.write[i] = capture_param;
+			p_lambda->function->parameters_indices[capture->name] = i;
+		}
+	}
+
+	lambda_stack.pop_back();
+	parser->current_function = previous_function;
+}
+
 void GDScriptAnalyzer::reduce_literal(GDScriptParser::LiteralNode *p_literal) {
 	p_literal->reduced_value = p_literal->value;
 	p_literal->is_constant = true;
diff --git a/modules/gdscript/gdscript_analyzer.h b/modules/gdscript/gdscript_analyzer.h
index 8430d3f4a5..aabf407c76 100644
--- a/modules/gdscript/gdscript_analyzer.h
+++ b/modules/gdscript/gdscript_analyzer.h
@@ -42,6 +42,7 @@ class GDScriptAnalyzer {
 	HashMap<String, Ref<GDScriptParserRef>> depended_parsers;
 
 	const GDScriptParser::EnumNode *current_enum = nullptr;
+	List<const GDScriptParser::LambdaNode *> lambda_stack;
 
 	Error resolve_inheritance(GDScriptParser::ClassNode *p_class, bool p_recursive = true);
 	GDScriptParser::DataType resolve_datatype(GDScriptParser::TypeNode *p_type);
@@ -82,6 +83,7 @@ class GDScriptAnalyzer {
 	void reduce_get_node(GDScriptParser::GetNodeNode *p_get_node);
 	void reduce_identifier(GDScriptParser::IdentifierNode *p_identifier, bool can_be_builtin = false);
 	void reduce_identifier_from_base(GDScriptParser::IdentifierNode *p_identifier, GDScriptParser::DataType *p_base = nullptr);
+	void reduce_lambda(GDScriptParser::LambdaNode *p_lambda);
 	void reduce_literal(GDScriptParser::LiteralNode *p_literal);
 	void reduce_preload(GDScriptParser::PreloadNode *p_preload);
 	void reduce_self(GDScriptParser::SelfNode *p_self);
diff --git a/modules/gdscript/gdscript_byte_codegen.cpp b/modules/gdscript/gdscript_byte_codegen.cpp
index 89c5f5482b..0da99ccee3 100644
--- a/modules/gdscript/gdscript_byte_codegen.cpp
+++ b/modules/gdscript/gdscript_byte_codegen.cpp
@@ -383,6 +383,18 @@ GDScriptFunction *GDScriptByteCodeGenerator::write_end() {
 		function->_methods_count = 0;
 	}
 
+	if (lambdas_map.size()) { // Export the lambdas registered for this function as an index-addressable table.
+		function->lambdas.resize(lambdas_map.size());
+		function->_lambdas_ptr = function->lambdas.ptrw();
+		function->_lambdas_count = lambdas_map.size();
+		for (const Map<GDScriptFunction *, int>::Element *E = lambdas_map.front(); E; E = E->next()) {
+			function->lambdas.write[E->get()] = E->key(); // The map value is the table slot; the key is the lambda function.
+		}
+	} else { // No lambdas were registered: leave the raw table empty.
+		function->_lambdas_ptr = nullptr;
+		function->_lambdas_count = 0;
+	}
+
 	if (debug_stack) {
 		function->stack_debug = stack_debug;
 	}
@@ -1118,6 +1130,17 @@ void GDScriptByteCodeGenerator::write_call_script_function(const Address &p_targ
 	append(p_function_name);
 }
 
+void GDScriptByteCodeGenerator::write_lambda(const Address &p_target, GDScriptFunction *p_function, const Vector<Address> &p_captures) { // Emits an OPCODE_CREATE_LAMBDA instruction for p_function.
+	append(GDScriptFunction::OPCODE_CREATE_LAMBDA, 1 + p_captures.size()); // Variable argument count: every capture plus the target.
+	for (int i = 0; i < p_captures.size(); i++) {
+		append(p_captures[i]); // Capture addresses are written first, in order.
+	}
+
+	append(p_target); // Then the target address.
+	append(p_captures.size()); // Then the number of captures.
+	append(p_function); // Finally the position of p_function in the lambdas map.
+}
+
 void GDScriptByteCodeGenerator::write_construct(const Address &p_target, Variant::Type p_type, const Vector<Address> &p_arguments) {
 	// Try to find an appropriate constructor.
 	bool all_have_type = true;
diff --git a/modules/gdscript/gdscript_byte_codegen.h b/modules/gdscript/gdscript_byte_codegen.h
index 17d681d7bb..c060476f39 100644
--- a/modules/gdscript/gdscript_byte_codegen.h
+++ b/modules/gdscript/gdscript_byte_codegen.h
@@ -93,6 +93,7 @@ class GDScriptByteCodeGenerator : public GDScriptCodeGenerator {
 	Map<Variant::ValidatedUtilityFunction, int> utilities_map;
 	Map<GDScriptUtilityFunctions::FunctionPtr, int> gds_utilities_map;
 	Map<MethodBind *, int> method_bind_map;
+	Map<GDScriptFunction *, int> lambdas_map;
 
 	// Lists since these can be nested.
 	List<int> if_jmp_addrs;
@@ -293,6 +294,15 @@ class GDScriptByteCodeGenerator : public GDScriptCodeGenerator {
 		return pos;
 	}
 
+	int get_lambda_function_pos(GDScriptFunction *p_lambda_function) { // Returns the position of p_lambda_function in lambdas_map, registering it on first use.
+		if (lambdas_map.has(p_lambda_function)) {
+			return lambdas_map[p_lambda_function]; // Already registered: reuse its position.
+		}
+		int pos = lambdas_map.size(); // New entries take the current map size as their position.
+		lambdas_map[p_lambda_function] = pos;
+		return pos;
+	}
+
 	void alloc_ptrcall(int p_params) {
 		if (p_params >= ptrcall_max) {
 			ptrcall_max = p_params;
@@ -386,6 +396,10 @@ class GDScriptByteCodeGenerator : public GDScriptCodeGenerator {
 		opcodes.push_back(get_method_bind_pos(p_method));
 	}
 
+	void append(GDScriptFunction *p_lambda_function) { // Pushes the lambda's position (not the pointer) onto the opcode stream.
+		opcodes.push_back(get_lambda_function_pos(p_lambda_function));
+	}
+
 	void patch_jump(int p_address) {
 		opcodes.write[p_address] = opcodes.size();
 	}
@@ -452,6 +466,7 @@ public:
 	virtual void write_call_self(const Address &p_target, const StringName &p_function_name, const Vector<Address> &p_arguments) override;
 	virtual void write_call_self_async(const Address &p_target, const StringName &p_function_name, const Vector<Address> &p_arguments) override;
 	virtual void write_call_script_function(const Address &p_target, const Address &p_base, const StringName &p_function_name, const Vector<Address> &p_arguments) override;
+	virtual void write_lambda(const Address &p_target, GDScriptFunction *p_function, const Vector<Address> &p_captures) override;
 	virtual void write_construct(const Address &p_target, Variant::Type p_type, const Vector<Address> &p_arguments) override;
 	virtual void write_construct_array(const Address &p_target, const Vector<Address> &p_arguments) override;
 	virtual void write_construct_typed_array(const Address &p_target, const GDScriptDataType &p_element_type, const Vector<Address> &p_arguments) override;
diff --git a/modules/gdscript/gdscript_codegen.h b/modules/gdscript/gdscript_codegen.h
index b377beefdb..ae9a8ede5e 100644
--- a/modules/gdscript/gdscript_codegen.h
+++ b/modules/gdscript/gdscript_codegen.h
@@ -127,6 +127,7 @@ public:
 	virtual void write_call_self(const Address &p_target, const StringName &p_function_name, const Vector<Address> &p_arguments) = 0;
 	virtual void write_call_self_async(const Address &p_target, const StringName &p_function_name, const Vector<Address> &p_arguments) = 0;
 	virtual void write_call_script_function(const Address &p_target, const Address &p_base, const StringName &p_function_name, const Vector<Address> &p_arguments) = 0;
+	virtual void write_lambda(const Address &p_target, GDScriptFunction *p_function, const Vector<Address> &p_captures) = 0;
 	virtual void write_construct(const Address &p_target, Variant::Type p_type, const Vector<Address> &p_arguments) = 0;
 	virtual void write_construct_array(const Address &p_target, const Vector<Address> &p_arguments) = 0;
 	virtual void write_construct_typed_array(const Address &p_target, const GDScriptDataType &p_element_type, const Vector<Address> &p_arguments) = 0;
diff --git a/modules/gdscript/gdscript_compiler.cpp b/modules/gdscript/gdscript_compiler.cpp
index 9b718db7cf..37ce8ae2cb 100644
--- a/modules/gdscript/gdscript_compiler.cpp
+++ b/modules/gdscript/gdscript_compiler.cpp
@@ -1091,6 +1091,34 @@ GDScriptCodeGenerator::Address GDScriptCompiler::_parse_expression(CodeGen &code
 			}
 			return GDScriptCodeGenerator::Address(); // Assignment does not return a value.
 		} break;
+		case GDScriptParser::Node::LAMBDA: { // Compiles the lambda's function and emits a write_lambda() for it.
+			const GDScriptParser::LambdaNode *lambda = static_cast<const GDScriptParser::LambdaNode *>(p_expression);
+			GDScriptCodeGenerator::Address result = codegen.add_temporary(_gdtype_from_datatype(lambda->get_datatype())); // Temporary returned as the value of this expression.
+
+			Vector<GDScriptCodeGenerator::Address> captures;
+			captures.resize(lambda->captures.size());
+			for (int i = 0; i < lambda->captures.size(); i++) {
+				captures.write[i] = _parse_expression(codegen, r_error, lambda->captures[i]); // Each capture is compiled as an expression with the current codegen.
+				if (r_error) {
+					return GDScriptCodeGenerator::Address();
+				}
+			}
+
+			GDScriptFunction *function = _parse_function(r_error, codegen.script, codegen.class_node, lambda->function, false, true); // p_for_ready = false, p_for_lambda = true.
+			if (r_error) {
+				return GDScriptCodeGenerator::Address();
+			}
+
+			gen->write_lambda(result, function, captures);
+
+			for (int i = 0; i < captures.size(); i++) {
+				if (captures[i].mode == GDScriptCodeGenerator::Address::TEMPORARY) {
+					gen->pop_temporary(); // One pop per capture that was placed in a temporary.
+				}
+			}
+
+			return result;
+		} break;
 		default: {
 			ERR_FAIL_V_MSG(GDScriptCodeGenerator::Address(), "Bug in bytecode compiler, unexpected node in parse tree while parsing expression."); // Unreachable code.
 		} break;
@@ -1804,8 +1832,8 @@ Error GDScriptCompiler::_parse_block(CodeGen &codegen, const GDScriptParser::Sui
 	return OK;
 }
 
-Error GDScriptCompiler::_parse_function(GDScript *p_script, const GDScriptParser::ClassNode *p_class, const GDScriptParser::FunctionNode *p_func, bool p_for_ready) {
-	Error error = OK;
+GDScriptFunction *GDScriptCompiler::_parse_function(Error &r_error, GDScript *p_script, const GDScriptParser::ClassNode *p_class, const GDScriptParser::FunctionNode *p_func, bool p_for_ready, bool p_for_lambda) {
+	r_error = OK;
 	CodeGen codegen;
 	codegen.generator = memnew(GDScriptByteCodeGenerator);
 
@@ -1822,7 +1850,11 @@ Error GDScriptCompiler::_parse_function(GDScript *p_script, const GDScriptParser
 	return_type.builtin_type = Variant::NIL;
 
 	if (p_func) {
-		func_name = p_func->identifier->name;
+		if (p_func->identifier) {
+			func_name = p_func->identifier->name;
+		} else {
+			func_name = "<anonymous lambda>";
+		}
 		is_static = p_func->is_static;
 		rpc_mode = p_func->rpc_mode;
 		return_type = _gdtype_from_datatype(p_func->get_datatype(), p_script);
@@ -1853,11 +1885,11 @@ Error GDScriptCompiler::_parse_function(GDScript *p_script, const GDScriptParser
 	}
 
 	// Parse initializer if applies.
-	bool is_implicit_initializer = !p_for_ready && !p_func;
-	bool is_initializer = p_func && String(p_func->identifier->name) == GDScriptLanguage::get_singleton()->strings._init;
-	bool is_for_ready = p_for_ready || (p_func && String(p_func->identifier->name) == "_ready");
+	bool is_implicit_initializer = !p_for_ready && !p_func && !p_for_lambda;
+	bool is_initializer = p_func && !p_for_lambda && String(p_func->identifier->name) == GDScriptLanguage::get_singleton()->strings._init;
+	bool is_for_ready = p_for_ready || (p_func && !p_for_lambda && String(p_func->identifier->name) == "_ready");
 
-	if (is_implicit_initializer || is_for_ready) {
+	if (!p_for_lambda && (is_implicit_initializer || is_for_ready)) {
 		// Initialize class fields.
 		for (int i = 0; i < p_class->members.size(); i++) {
 			if (p_class->members[i].type != GDScriptParser::ClassNode::Member::VARIABLE) {
@@ -1884,10 +1916,10 @@ Error GDScriptCompiler::_parse_function(GDScript *p_script, const GDScriptParser
 						codegen.generator->write_construct_array(dst_address, Vector<GDScriptCodeGenerator::Address>());
 					}
 				}
-				GDScriptCodeGenerator::Address src_address = _parse_expression(codegen, error, field->initializer, false, true);
-				if (error) {
+				GDScriptCodeGenerator::Address src_address = _parse_expression(codegen, r_error, field->initializer, false, true);
+				if (r_error) {
 					memdelete(codegen.generator);
-					return error;
+					return nullptr;
 				}
 
 				codegen.generator->write_assign(dst_address, src_address);
@@ -1914,10 +1946,10 @@ Error GDScriptCompiler::_parse_function(GDScript *p_script, const GDScriptParser
 			codegen.generator->start_parameters();
 			for (int i = p_func->parameters.size() - optional_parameters; i < p_func->parameters.size(); i++) {
 				const GDScriptParser::ParameterNode *parameter = p_func->parameters[i];
-				GDScriptCodeGenerator::Address src_addr = _parse_expression(codegen, error, parameter->default_value, true);
-				if (error) {
+				GDScriptCodeGenerator::Address src_addr = _parse_expression(codegen, r_error, parameter->default_value, true);
+				if (r_error) {
 					memdelete(codegen.generator);
-					return error;
+					return nullptr;
 				}
 				GDScriptCodeGenerator::Address dst_addr = codegen.parameters[parameter->identifier->name];
 				codegen.generator->write_assign_default_parameter(dst_addr, src_addr);
@@ -1928,10 +1960,10 @@ Error GDScriptCompiler::_parse_function(GDScript *p_script, const GDScriptParser
 			codegen.generator->end_parameters();
 		}
 
-		Error err = _parse_block(codegen, p_func->body);
-		if (err) {
+		r_error = _parse_block(codegen, p_func->body);
+		if (r_error) {
 			memdelete(codegen.generator);
-			return err;
+			return nullptr;
 		}
 	}
 
@@ -1957,6 +1989,10 @@ Error GDScriptCompiler::_parse_function(GDScript *p_script, const GDScriptParser
 			signature += "::" + String(func_name);
 		}
 
+		if (p_for_lambda) {
+			signature += "(lambda)";
+		}
+
 		codegen.generator->set_signature(signature);
 	}
 #endif
@@ -1964,8 +2000,10 @@ Error GDScriptCompiler::_parse_function(GDScript *p_script, const GDScriptParser
 	if (p_func) {
 		codegen.generator->set_initial_line(p_func->start_line);
 #ifdef TOOLS_ENABLED
-		p_script->member_lines[func_name] = p_func->start_line;
-		p_script->doc_functions[func_name] = p_func->doc_description;
+		if (!p_for_lambda) {
+			p_script->member_lines[func_name] = p_func->start_line;
+			p_script->doc_functions[func_name] = p_func->doc_description;
+		}
 #endif
 	} else {
 		codegen.generator->set_initial_line(0);
@@ -1994,11 +2032,13 @@ Error GDScriptCompiler::_parse_function(GDScript *p_script, const GDScriptParser
 #endif
 	}
 
-	p_script->member_functions[func_name] = gd_function;
+	if (!p_for_lambda) {
+		p_script->member_functions[func_name] = gd_function;
+	}
 
 	memdelete(codegen.generator);
 
-	return OK;
+	return gd_function;
 }
 
 Error GDScriptCompiler::_parse_setter_getter(GDScript *p_script, const GDScriptParser::ClassNode *p_class, const GDScriptParser::VariableNode *p_variable, bool p_is_setter) {
@@ -2391,7 +2431,8 @@ Error GDScriptCompiler::_parse_class_blocks(GDScript *p_script, const GDScriptPa
 			if (!has_ready && function->identifier->name == "_ready") {
 				has_ready = true;
 			}
-			Error err = _parse_function(p_script, p_class, function);
+			Error err = OK;
+			_parse_function(err, p_script, p_class, function);
 			if (err) {
 				return err;
 			}
@@ -2416,7 +2457,8 @@ Error GDScriptCompiler::_parse_class_blocks(GDScript *p_script, const GDScriptPa
 
 	{
 		// Create an implicit constructor in any case.
-		Error err = _parse_function(p_script, p_class, nullptr);
+		Error err = OK;
+		_parse_function(err, p_script, p_class, nullptr);
 		if (err) {
 			return err;
 		}
@@ -2424,7 +2466,8 @@ Error GDScriptCompiler::_parse_class_blocks(GDScript *p_script, const GDScriptPa
 
 	if (!has_ready && p_class->onready_used) {
 		//create a _ready constructor
-		Error err = _parse_function(p_script, p_class, nullptr, true);
+		Error err = OK;
+		_parse_function(err, p_script, p_class, nullptr, true);
 		if (err) {
 			return err;
 		}
diff --git a/modules/gdscript/gdscript_compiler.h b/modules/gdscript/gdscript_compiler.h
index c405eadb07..7d5bee93ac 100644
--- a/modules/gdscript/gdscript_compiler.h
+++ b/modules/gdscript/gdscript_compiler.h
@@ -128,7 +128,7 @@ class GDScriptCompiler {
 	GDScriptCodeGenerator::Address _parse_match_pattern(CodeGen &codegen, Error &r_error, const GDScriptParser::PatternNode *p_pattern, const GDScriptCodeGenerator::Address &p_value_addr, const GDScriptCodeGenerator::Address &p_type_addr, const GDScriptCodeGenerator::Address &p_previous_test, bool p_is_first, bool p_is_nested);
 	void _add_locals_in_block(CodeGen &codegen, const GDScriptParser::SuiteNode *p_block);
 	Error _parse_block(CodeGen &codegen, const GDScriptParser::SuiteNode *p_block, bool p_add_locals = true);
-	Error _parse_function(GDScript *p_script, const GDScriptParser::ClassNode *p_class, const GDScriptParser::FunctionNode *p_func, bool p_for_ready = false);
+	GDScriptFunction *_parse_function(Error &r_error, GDScript *p_script, const GDScriptParser::ClassNode *p_class, const GDScriptParser::FunctionNode *p_func, bool p_for_ready = false, bool p_for_lambda = false);
 	Error _parse_setter_getter(GDScript *p_script, const GDScriptParser::ClassNode *p_class, const GDScriptParser::VariableNode *p_variable, bool p_is_setter);
 	Error _parse_class_level(GDScript *p_script, const GDScriptParser::ClassNode *p_class, bool p_keep_state);
 	Error _parse_class_blocks(GDScript *p_script, const GDScriptParser::ClassNode *p_class, bool p_keep_state);
diff --git a/modules/gdscript/gdscript_disassembler.cpp b/modules/gdscript/gdscript_disassembler.cpp
index 0d0afcc741..789af57b4c 100644
--- a/modules/gdscript/gdscript_disassembler.cpp
+++ b/modules/gdscript/gdscript_disassembler.cpp
@@ -721,7 +721,7 @@ void GDScriptFunction::disassemble(const Vector<String> &p_code_lines) const {
 				text += "await ";
 				text += DADDR(1);
 
-				incr += 2;
+				incr = 2;
 			} break;
 			case OPCODE_AWAIT_RESUME: {
 				text += "await resume ";
@@ -729,6 +729,25 @@ void GDScriptFunction::disassemble(const Vector<String> &p_code_lines) const {
 
 				incr = 2;
 			} break;
+			case OPCODE_CREATE_LAMBDA: { // Words after the opcode: captures..., target, captures count, lambda index.
+				int captures_count = _code_ptr[ip + 1 + instr_var_args];
+				GDScriptFunction *lambda = _lambdas_ptr[_code_ptr[ip + 2 + instr_var_args]];
+
+				text += DADDR(1 + captures_count); // The target address follows the capture addresses.
+				text += " = create lambda from ";
+				text += lambda->name.operator String();
+				text += " function, captures (";
+
+				for (int i = 0; i < captures_count; i++) {
+					if (i > 0) {
+						text += ", ";
+					}
+					text += DADDR(1 + i);
+				}
+				text += ")";
+
+				incr = 3 + instr_var_args; // Step past the lambda index, the last word read above (ip + 2 + instr_var_args).
+			} break;
 			case OPCODE_JUMP: {
 				text += "jump ";
 				text += itos(_code_ptr[ip + 1]);
diff --git a/modules/gdscript/gdscript_function.cpp b/modules/gdscript/gdscript_function.cpp
index 7b37aa40a2..78399114a5 100644
--- a/modules/gdscript/gdscript_function.cpp
+++ b/modules/gdscript/gdscript_function.cpp
@@ -150,6 +150,10 @@ GDScriptFunction::GDScriptFunction() {
 }
 
 GDScriptFunction::~GDScriptFunction() {
+	for (int i = 0; i < lambdas.size(); i++) {
+		memdelete(lambdas[i]); // Every function stored in the lambdas table is deleted together with this function.
+	}
+
 #ifdef DEBUG_ENABLED
 
 	MutexLock lock(GDScriptLanguage::get_singleton()->lock);
diff --git a/modules/gdscript/gdscript_function.h b/modules/gdscript/gdscript_function.h
index fbec734a28..70b62ced6d 100644
--- a/modules/gdscript/gdscript_function.h
+++ b/modules/gdscript/gdscript_function.h
@@ -301,6 +301,7 @@ public:
 		OPCODE_CALL_PTRCALL_PACKED_COLOR_ARRAY,
 		OPCODE_AWAIT,
 		OPCODE_AWAIT_RESUME,
+		OPCODE_CREATE_LAMBDA,
 		OPCODE_JUMP,
 		OPCODE_JUMP_IF,
 		OPCODE_JUMP_IF_NOT,
@@ -459,6 +460,8 @@ private:
 	const GDScriptUtilityFunctions::FunctionPtr *_gds_utilities_ptr = nullptr;
 	int _methods_count = 0;
 	MethodBind **_methods_ptr = nullptr;
+	int _lambdas_count = 0;
+	GDScriptFunction **_lambdas_ptr = nullptr;
 	const int *_code_ptr = nullptr;
 	int _code_size = 0;
 	int _argument_count = 0;
@@ -488,6 +491,7 @@ private:
 	Vector<Variant::ValidatedUtilityFunction> utilities;
 	Vector<GDScriptUtilityFunctions::FunctionPtr> gds_utilities;
 	Vector<MethodBind *> methods;
+	Vector<GDScriptFunction *> lambdas;
 	Vector<int> code;
 	Vector<GDScriptDataType> argument_types;
 	GDScriptDataType return_type;
diff --git a/modules/gdscript/gdscript_lambda_callable.cpp b/modules/gdscript/gdscript_lambda_callable.cpp
new file mode 100644
index 0000000000..0bc109b6e1
--- /dev/null
+++ b/modules/gdscript/gdscript_lambda_callable.cpp
@@ -0,0 +1,95 @@
+/*************************************************************************/
+/*  gdscript_lambda_callable.cpp                                         */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#include "gdscript_lambda_callable.h"
+
+#include "core/templates/hashfuncs.h"
+#include "gdscript.h"
+
+bool GDScriptLambdaCallable::compare_equal(const CallableCustom *p_a, const CallableCustom *p_b) {
+	// Lambda callables are only compared by reference: equal means both pointers are the same object.
+	return p_a == p_b;
+}
+
+bool GDScriptLambdaCallable::compare_less(const CallableCustom *p_a, const CallableCustom *p_b) {
+	// Lambda callables are only compared by reference: ordering follows the pointer values.
+	return p_a < p_b;
+}
+
+uint32_t GDScriptLambdaCallable::hash() const {
+	return h; // Computed once in the constructor from this object's address.
+}
+
+String GDScriptLambdaCallable::get_as_text() const { // Text form of the callable: the function name tagged with "(lambda)".
+	if (function->get_name() != StringName()) {
+		return function->get_name().operator String() + "(lambda)";
+	}
+	return "(anonymous lambda)"; // Fallback when the function name is empty.
+}
+
+CallableCustom::CompareEqualFunc GDScriptLambdaCallable::get_compare_equal_func() const {
+	return compare_equal; // The by-reference equality function defined in this class.
+}
+
+CallableCustom::CompareLessFunc GDScriptLambdaCallable::get_compare_less_func() const {
+	return compare_less; // The by-reference ordering function defined in this class.
+}
+
+ObjectID GDScriptLambdaCallable::get_object() const {
+	return script->get_instance_id(); // The object reported for this callable is the script it was created with.
+}
+
+void GDScriptLambdaCallable::call(const Variant **p_arguments, int p_argcount, Variant &r_return_value, Callable::CallError &r_call_error) const { // Calls the function with the captured values placed before the caller's arguments.
+	int captures_amount = captures.size();
+
+	if (captures_amount > 0) {
+		Vector<const Variant *> args; // Combined argument list: captures first, then p_arguments.
+		args.resize(p_argcount + captures_amount);
+		for (int i = 0; i < captures_amount; i++) {
+			args.write[i] = &captures[i];
+		}
+		for (int i = 0; i < p_argcount; i++) {
+			args.write[i + captures_amount] = p_arguments[i];
+		}
+
+		r_return_value = function->call(nullptr, args.ptrw(), args.size(), r_call_error); // NOTE(review): the nullptr is presumably the instance, i.e. the lambda runs without one - confirm.
+		r_call_error.argument -= captures_amount; // Applied whatever the error code; presumably so the value refers to the caller's own arguments - confirm.
+	} else {
+		r_return_value = function->call(nullptr, p_arguments, p_argcount, r_call_error); // Nothing captured: forward the arguments unchanged.
+	}
+}
+
+GDScriptLambdaCallable::GDScriptLambdaCallable(Ref<GDScript> p_script, GDScriptFunction *p_function, const Vector<Variant> &p_captures) { // Stores the script, function and captured values, then precomputes the hash.
+	script = p_script;
+	function = p_function;
+	captures = p_captures;
+
+	h = (uint32_t)hash_djb2_one_64((uint64_t)this); // Hash of this object's address, in line with the by-reference comparisons.
+}
diff --git a/modules/gdscript/gdscript_lambda_callable.h b/modules/gdscript/gdscript_lambda_callable.h
new file mode 100644
index 0000000000..357c845250
--- /dev/null
+++ b/modules/gdscript/gdscript_lambda_callable.h
@@ -0,0 +1,65 @@
+/*************************************************************************/
+/*  gdscript_lambda_callable.h                                           */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifndef GDSCRIPT_LAMBDA_CALLABLE
+#define GDSCRIPT_LAMBDA_CALLABLE
+
+#include "core/object/reference.h"
+#include "core/templates/vector.h"
+#include "core/variant/callable.h"
+#include "core/variant/variant.h"
+
+class GDScript;
+class GDScriptFunction;
+class GDScriptInstance;
+
+class GDScriptLambdaCallable : public CallableCustom { // Custom callable that pairs a GDScript function with a list of captured values.
+	GDScriptFunction *function = nullptr; // Function invoked by call().
+	Ref<GDScript> script; // Script reference; its instance ID is what get_object() returns.
+	uint32_t h; // Value returned by hash(); assigned in the constructor.
+
+	Vector<Variant> captures; // Captured values, passed to the function ahead of the caller's arguments.
+
+	static bool compare_equal(const CallableCustom *p_a, const CallableCustom *p_b);
+	static bool compare_less(const CallableCustom *p_a, const CallableCustom *p_b);
+
+public:
+	uint32_t hash() const override;
+	String get_as_text() const override;
+	CompareEqualFunc get_compare_equal_func() const override;
+	CompareLessFunc get_compare_less_func() const override;
+	ObjectID get_object() const override;
+	void call(const Variant **p_arguments, int p_argcount, Variant &r_return_value, Callable::CallError &r_call_error) const override;
+
+	GDScriptLambdaCallable(Ref<GDScript> p_script, GDScriptFunction *p_function, const Vector<Variant> &p_captures);
+	virtual ~GDScriptLambdaCallable() = default;
+};
+
+#endif // GDSCRIPT_LAMBDA_CALLABLE
diff --git a/modules/gdscript/gdscript_parser.cpp b/modules/gdscript/gdscript_parser.cpp
index d910137510..f9027c3a87 100644
--- a/modules/gdscript/gdscript_parser.cpp
+++ b/modules/gdscript/gdscript_parser.cpp
@@ -402,6 +402,8 @@ Error GDScriptParser::parse(const String &p_source_code, const String &p_script_
 }
 
 GDScriptTokenizer::Token GDScriptParser::advance() {
+	lambda_ended = false; // Empty marker since we're past the end in any case.
+
 	if (current.type == GDScriptTokenizer::Token::TK_EOF) {
 		ERR_FAIL_COND_V_MSG(current.type == GDScriptTokenizer::Token::TK_EOF, current, "GDScript parser bug: Trying to advance past the end of stream.");
 	}
@@ -428,7 +430,7 @@ bool GDScriptParser::match(GDScriptTokenizer::Token::Type p_token_type) {
 	return true;
 }
 
-bool GDScriptParser::check(GDScriptTokenizer::Token::Type p_token_type) {
+bool GDScriptParser::check(GDScriptTokenizer::Token::Type p_token_type) const {
 	if (p_token_type == GDScriptTokenizer::Token::IDENTIFIER) {
 		return current.is_identifier();
 	}
@@ -443,7 +445,7 @@ bool GDScriptParser::consume(GDScriptTokenizer::Token::Type p_token_type, const
 	return false;
 }
 
-bool GDScriptParser::is_at_end() {
+bool GDScriptParser::is_at_end() const {
 	return check(GDScriptTokenizer::Token::TK_EOF);
 }
 
@@ -494,16 +496,34 @@ void GDScriptParser::pop_multiline() {
 	tokenizer.set_multiline_mode(multiline_stack.size() > 0 ? multiline_stack.back()->get() : false);
 }
 
-bool GDScriptParser::is_statement_end() {
+bool GDScriptParser::is_statement_end_token() const { // True when the current token is NEWLINE, SEMICOLON or TK_EOF.
 	return check(GDScriptTokenizer::Token::NEWLINE) || check(GDScriptTokenizer::Token::SEMICOLON) || check(GDScriptTokenizer::Token::TK_EOF);
 }
 
+bool GDScriptParser::is_statement_end() const { // True on a statement-ending token, or when either lambda flag (lambda_ended, in_lambda) is set.
+	return lambda_ended || in_lambda || is_statement_end_token();
+}
+
 void GDScriptParser::end_statement(const String &p_context) {
 	bool found = false;
 	while (is_statement_end() && !is_at_end()) {
 		// Remove sequential newlines/semicolons.
+		if (is_statement_end_token()) {
+			// Only consume if this is an actual token.
+			advance();
+		} else if (lambda_ended) {
+			lambda_ended = false; // Consume this "token".
+			found = true;
+			break;
+		} else {
+			if (!found) {
+				lambda_ended = true; // Mark the lambda as done since we found something else to end the statement.
+				found = true;
+			}
+			break;
+		}
+
 		found = true;
-		advance();
 	}
 	if (!found && !is_at_end()) {
 		push_error(vformat(R"(Expected end of statement after %s, found "%s" instead.)", p_context, current.get_name()));
@@ -1182,36 +1202,7 @@ GDScriptParser::EnumNode *GDScriptParser::parse_enum() {
 	return enum_node;
 }
 
-GDScriptParser::FunctionNode *GDScriptParser::parse_function() {
-	bool _static = false;
-	if (previous.type == GDScriptTokenizer::Token::STATIC) {
-		// TODO: Improve message if user uses "static" with "var" or "const"
-		if (!consume(GDScriptTokenizer::Token::FUNC, R"(Expected "func" after "static".)")) {
-			return nullptr;
-		}
-		_static = true;
-	}
-
-	FunctionNode *function = alloc_node<FunctionNode>();
-	make_completion_context(COMPLETION_OVERRIDE_METHOD, function);
-
-	if (!consume(GDScriptTokenizer::Token::IDENTIFIER, R"(Expected function name after "func".)")) {
-		return nullptr;
-	}
-
-	FunctionNode *previous_function = current_function;
-	current_function = function;
-
-	function->identifier = parse_identifier();
-	function->is_static = _static;
-
-	push_multiline(true);
-	consume(GDScriptTokenizer::Token::PARENTHESIS_OPEN, R"(Expected opening "(" after function name.)");
-
-	SuiteNode *body = alloc_node<SuiteNode>();
-	SuiteNode *previous_suite = current_suite;
-	current_suite = body;
-
+void GDScriptParser::parse_function_signature(FunctionNode *p_function, SuiteNode *p_body, const String &p_type) {
 	if (!check(GDScriptTokenizer::Token::PARENTHESIS_CLOSE) && !is_at_end()) {
 		bool default_used = false;
 		do {
@@ -1231,29 +1222,61 @@ GDScriptParser::FunctionNode *GDScriptParser::parse_function() {
 					continue;
 				}
 			}
-			if (function->parameters_indices.has(parameter->identifier->name)) {
-				push_error(vformat(R"(Parameter with name "%s" was already declared for this function.)", parameter->identifier->name));
+			if (p_function->parameters_indices.has(parameter->identifier->name)) {
+				push_error(vformat(R"(Parameter with name "%s" was already declared for this %s.)", parameter->identifier->name, p_type));
 			} else {
-				function->parameters_indices[parameter->identifier->name] = function->parameters.size();
-				function->parameters.push_back(parameter);
-				body->add_local(parameter);
+				p_function->parameters_indices[parameter->identifier->name] = p_function->parameters.size();
+				p_function->parameters.push_back(parameter);
+				p_body->add_local(parameter, current_function);
 			}
 		} while (match(GDScriptTokenizer::Token::COMMA));
 	}
 
 	pop_multiline();
-	consume(GDScriptTokenizer::Token::PARENTHESIS_CLOSE, R"*(Expected closing ")" after function parameters.)*");
+	consume(GDScriptTokenizer::Token::PARENTHESIS_CLOSE, vformat(R"*(Expected closing ")" after %s parameters.)*", p_type));
 
 	if (match(GDScriptTokenizer::Token::FORWARD_ARROW)) {
-		make_completion_context(COMPLETION_TYPE_NAME_OR_VOID, function);
-		function->return_type = parse_type(true);
-		if (function->return_type == nullptr) {
+		make_completion_context(COMPLETION_TYPE_NAME_OR_VOID, p_function);
+		p_function->return_type = parse_type(true);
+		if (p_function->return_type == nullptr) {
 			push_error(R"(Expected return type or "void" after "->".)");
 		}
 	}
 
 	// TODO: Improve token consumption so it synchronizes to a statement boundary. This way we can get into the function body with unrecognized tokens.
-	consume(GDScriptTokenizer::Token::COLON, R"(Expected ":" after function declaration.)");
+	consume(GDScriptTokenizer::Token::COLON, vformat(R"(Expected ":" after %s declaration.)", p_type));
+}
+
+GDScriptParser::FunctionNode *GDScriptParser::parse_function() {
+	bool _static = false;
+	if (previous.type == GDScriptTokenizer::Token::STATIC) {
+		// TODO: Improve message if user uses "static" with "var" or "const"
+		if (!consume(GDScriptTokenizer::Token::FUNC, R"(Expected "func" after "static".)")) {
+			return nullptr;
+		}
+		_static = true;
+	}
+
+	FunctionNode *function = alloc_node<FunctionNode>();
+	make_completion_context(COMPLETION_OVERRIDE_METHOD, function);
+
+	if (!consume(GDScriptTokenizer::Token::IDENTIFIER, R"(Expected function name after "func".)")) {
+		return nullptr;
+	}
+
+	FunctionNode *previous_function = current_function;
+	current_function = function;
+
+	function->identifier = parse_identifier();
+	function->is_static = _static;
+
+	SuiteNode *body = alloc_node<SuiteNode>();
+	SuiteNode *previous_suite = current_suite;
+	current_suite = body;
+
+	push_multiline(true);
+	consume(GDScriptTokenizer::Token::PARENTHESIS_OPEN, R"(Expected opening "(" after function name.)");
+	parse_function_signature(function, body, "function");
 
 	current_suite = previous_suite;
 	function->body = parse_suite("function declaration", body);
@@ -1339,29 +1362,34 @@ bool GDScriptParser::register_annotation(const MethodInfo &p_info, uint32_t p_ta
 	return true;
 }
 
-GDScriptParser::SuiteNode *GDScriptParser::parse_suite(const String &p_context, SuiteNode *p_suite) {
+GDScriptParser::SuiteNode *GDScriptParser::parse_suite(const String &p_context, SuiteNode *p_suite, bool p_for_lambda) {
 	SuiteNode *suite = p_suite != nullptr ? p_suite : alloc_node<SuiteNode>();
 	suite->parent_block = current_suite;
+	suite->parent_function = current_function;
 	current_suite = suite;
 
 	bool multiline = false;
 
-	if (check(GDScriptTokenizer::Token::NEWLINE)) {
+	if (match(GDScriptTokenizer::Token::NEWLINE)) {
 		multiline = true;
 	}
 
 	if (multiline) {
-		consume(GDScriptTokenizer::Token::NEWLINE, vformat(R"(Expected newline after %s.)", p_context));
-
 		if (!consume(GDScriptTokenizer::Token::INDENT, vformat(R"(Expected indented block after %s.)", p_context))) {
 			current_suite = suite->parent_block;
 			return suite;
 		}
 	}
 
+	int error_count = 0;
+
 	do {
 		Node *statement = parse_statement();
 		if (statement == nullptr) {
+			if (error_count++ > 100) {
+				push_error("Too many statement errors.", suite);
+				break;
+			}
 			continue;
 		}
 		suite->statements.push_back(statement);
@@ -1374,7 +1402,7 @@ GDScriptParser::SuiteNode *GDScriptParser::parse_suite(const String &p_context,
 				if (local.type != SuiteNode::Local::UNDEFINED) {
 					push_error(vformat(R"(There is already a %s named "%s" declared in this scope.)", local.get_name(), variable->identifier->name));
 				}
-				current_suite->add_local(variable);
+				current_suite->add_local(variable, current_function);
 				break;
 			}
 			case Node::CONSTANT: {
@@ -1389,19 +1417,29 @@ GDScriptParser::SuiteNode *GDScriptParser::parse_suite(const String &p_context,
 					}
 					push_error(vformat(R"(There is already a %s named "%s" declared in this scope.)", name, constant->identifier->name));
 				}
-				current_suite->add_local(constant);
+				current_suite->add_local(constant, current_function);
 				break;
 			}
 			default:
 				break;
 		}
 
-	} while (multiline && !check(GDScriptTokenizer::Token::DEDENT) && !is_at_end());
+	} while (multiline && !check(GDScriptTokenizer::Token::DEDENT) && !lambda_ended && !is_at_end());
 
 	if (multiline) {
-		consume(GDScriptTokenizer::Token::DEDENT, vformat(R"(Missing unindent at the end of %s.)", p_context));
+		if (!lambda_ended) {
+			consume(GDScriptTokenizer::Token::DEDENT, vformat(R"(Missing unindent at the end of %s.)", p_context));
+
+		} else {
+			match(GDScriptTokenizer::Token::DEDENT);
+		}
+	} else if (previous.type == GDScriptTokenizer::Token::SEMICOLON) {
+		consume(GDScriptTokenizer::Token::NEWLINE, vformat(R"(Expected newline after ";" at the end of %s.)", p_context));
 	}
 
+	if (p_for_lambda) {
+		lambda_ended = true;
+	}
 	current_suite = suite->parent_block;
 	return suite;
 }
@@ -1458,6 +1496,10 @@ GDScriptParser::Node *GDScriptParser::parse_statement() {
 					push_error(R"(Constructor cannot return a value.)");
 				}
 				n_return->return_value = parse_expression(false);
+			} else if (in_lambda && !is_statement_end_token()) {
+				// Try to parse it anyway as this might not be the statement end in a lambda.
+				// If this fails the expression will be nullptr, but that's the same as no return, so it's fine.
+				n_return->return_value = parse_expression(false);
 			}
 			result = n_return;
 
@@ -1486,10 +1528,18 @@ GDScriptParser::Node *GDScriptParser::parse_statement() {
 		default: {
 			// Expression statement.
 			ExpressionNode *expression = parse_expression(true); // Allow assignment here.
+			bool has_ended_lambda = false;
 			if (expression == nullptr) {
-				push_error(vformat(R"(Expected statement, found "%s" instead.)", previous.get_name()));
+				if (in_lambda) {
+					// If this is not the beginning of a valid expression, it might be the continuation of the outer expression that contains this lambda.
+					lambda_ended = true;
+					has_ended_lambda = true;
+				} else {
+					push_error(vformat(R"(Expected statement, found "%s" instead.)", previous.get_name()));
+				}
 			}
 			end_statement("expression");
+			lambda_ended = lambda_ended || has_ended_lambda;
 			result = expression;
 
 #ifdef DEBUG_ENABLED
@@ -1513,7 +1563,7 @@ GDScriptParser::Node *GDScriptParser::parse_statement() {
 	if (unreachable && result != nullptr) {
 		current_suite->has_unreachable_code = true;
 		if (current_function) {
-			push_warning(result, GDScriptWarning::UNREACHABLE_CODE, current_function->identifier->name);
+			push_warning(result, GDScriptWarning::UNREACHABLE_CODE, current_function->identifier ? current_function->identifier->name : "<anonymous lambda>");
 		} else {
 			// TODO: Properties setters and getters with unreachable code are not being warned
 		}
@@ -1598,7 +1648,7 @@ GDScriptParser::ForNode *GDScriptParser::parse_for() {
 
 	SuiteNode *suite = alloc_node<SuiteNode>();
 	if (n_for->variable) {
-		suite->add_local(SuiteNode::Local(n_for->variable));
+		suite->add_local(SuiteNode::Local(n_for->variable, current_function));
 	}
 	suite->parent_for = n_for;
 
@@ -1753,7 +1803,7 @@ GDScriptParser::MatchBranchNode *GDScriptParser::parse_match_branch() {
 		branch->patterns[0]->binds.get_key_list(&binds);
 
 		for (List<StringName>::Element *E = binds.front(); E != nullptr; E = E->next()) {
-			SuiteNode::Local local(branch->patterns[0]->binds[E->get()]);
+			SuiteNode::Local local(branch->patterns[0]->binds[E->get()], current_function);
 			suite->add_local(local);
 		}
 	}
@@ -1953,7 +2003,7 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_precedence(Precedence p_pr
 	// Completion can appear whenever an expression is expected.
 	make_completion_context(COMPLETION_IDENTIFIER, nullptr);
 
-	GDScriptTokenizer::Token token = advance();
+	GDScriptTokenizer::Token token = current;
 	ParseFunction prefix_rule = get_rule(token.type)->prefix;
 
 	if (prefix_rule == nullptr) {
@@ -1961,6 +2011,8 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_precedence(Precedence p_pr
 		return nullptr;
 	}
 
+	advance(); // Only consume the token if there's a valid rule.
+
 	ExpressionNode *previous_operand = (this->*prefix_rule)(nullptr, p_can_assign);
 
 	while (p_precedence <= get_rule(current.type)->precedence) {
@@ -2002,6 +2054,8 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_identifier(ExpressionNode
 
 	if (current_suite != nullptr && current_suite->has_local(identifier->name)) {
 		const SuiteNode::Local &declaration = current_suite->get_local(identifier->name);
+
+		identifier->source_function = declaration.source_function;
 		switch (declaration.type) {
 			case SuiteNode::Local::CONSTANT:
 				identifier->source = IdentifierNode::LOCAL_CONSTANT;
@@ -2055,6 +2109,9 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_self(ExpressionNode *p_pre
 	if (current_function && current_function->is_static) {
 		push_error(R"(Cannot use "self" inside a static function.)");
 	}
+	if (in_lambda) {
+		push_error(R"(Cannot use "self" inside a lambda.)");
+	}
 	SelfNode *self = alloc_node<SelfNode>();
 	self->current_class = current_class;
 	return self;
@@ -2488,7 +2545,7 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_attribute(ExpressionNode *
 
 	if (for_completion) {
 		bool is_builtin = false;
-		if (p_previous_operand->type == Node::IDENTIFIER) {
+		if (p_previous_operand && p_previous_operand->type == Node::IDENTIFIER) {
 			const IdentifierNode *id = static_cast<const IdentifierNode *>(p_previous_operand);
 			Variant::Type builtin_type = get_builtin_type(id->name);
 			if (builtin_type < Variant::VARIANT_MAX) {
@@ -2675,6 +2732,65 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_preload(ExpressionNode *p_
 	return preload;
 }
 
+GDScriptParser::ExpressionNode *GDScriptParser::parse_lambda(ExpressionNode *p_previous_operand, bool p_can_assign) {
+	LambdaNode *lambda = alloc_node<LambdaNode>();
+	lambda->parent_function = current_function;
+	FunctionNode *function = alloc_node<FunctionNode>();
+	function->source_lambda = lambda;
+
+	function->is_static = current_function != nullptr ? current_function->is_static : false;
+
+	if (match(GDScriptTokenizer::Token::IDENTIFIER)) {
+		function->identifier = parse_identifier();
+	}
+
+	bool multiline_context = multiline_stack.back()->get();
+
+	// Push a non-multiline state, since we don't want the enclosing multiline mode to apply inside the lambda body.
+	push_multiline(false);
+	if (multiline_context) {
+		tokenizer.push_expression_indented_block();
+	}
+
+	push_multiline(true); // For the parameters.
+	if (function->identifier) {
+		consume(GDScriptTokenizer::Token::PARENTHESIS_OPEN, R"(Expected opening "(" after lambda name.)");
+	} else {
+		consume(GDScriptTokenizer::Token::PARENTHESIS_OPEN, R"(Expected opening "(" after "func".)");
+	}
+
+	FunctionNode *previous_function = current_function;
+	current_function = function;
+
+	SuiteNode *body = alloc_node<SuiteNode>();
+	SuiteNode *previous_suite = current_suite;
+	current_suite = body;
+
+	parse_function_signature(function, body, "lambda");
+
+	current_suite = previous_suite;
+
+	bool previous_in_lambda = in_lambda;
+	in_lambda = true;
+
+	function->body = parse_suite("lambda declaration", body, true);
+
+	pop_multiline();
+
+	if (multiline_context) {
+		// If the enclosing context is in multiline mode, we want to skip the spurious DEDENT, INDENT and NEWLINE tokens.
+		while (check(GDScriptTokenizer::Token::DEDENT) || check(GDScriptTokenizer::Token::INDENT) || check(GDScriptTokenizer::Token::NEWLINE)) {
+			current = tokenizer.scan(); // Not advance() since we don't want to change the previous token.
+		}
+		tokenizer.pop_expression_indented_block();
+	}
+
+	current_function = previous_function;
+	in_lambda = previous_in_lambda;
+	lambda->function = function;
+	return lambda;
+}
+
 GDScriptParser::ExpressionNode *GDScriptParser::parse_invalid_token(ExpressionNode *p_previous_operand, bool p_can_assign) {
 	// Just for better error messages.
 	GDScriptTokenizer::Token::Type invalid = previous.type;
@@ -3019,7 +3135,7 @@ GDScriptParser::ParseRule *GDScriptParser::get_rule(GDScriptTokenizer::Token::Ty
 		{ nullptr,                                          nullptr,                                        PREC_NONE }, // CONST,
 		{ nullptr,                                          nullptr,                                        PREC_NONE }, // ENUM,
 		{ nullptr,                                          nullptr,                                        PREC_NONE }, // EXTENDS,
-		{ nullptr,                                          nullptr,                                        PREC_NONE }, // FUNC,
+		{ &GDScriptParser::parse_lambda,                    nullptr,                                        PREC_NONE }, // FUNC,
 		{ nullptr,                                          &GDScriptParser::parse_binary_operator,      	PREC_CONTENT_TEST }, // IN,
 		{ nullptr,                                          &GDScriptParser::parse_binary_operator,      	PREC_TYPE_TEST }, // IS,
 		{ nullptr,                                          nullptr,                                        PREC_NONE }, // NAMESPACE,
@@ -3755,6 +3871,10 @@ void GDScriptParser::TreePrinter::print_dictionary(DictionaryNode *p_dictionary)
 }
 
 void GDScriptParser::TreePrinter::print_expression(ExpressionNode *p_expression) {
+	if (p_expression == nullptr) {
+		push_text("<invalid expression>");
+		return;
+	}
 	switch (p_expression->type) {
 		case Node::ARRAY:
 			print_array(static_cast<ArrayNode *>(p_expression));
@@ -3783,6 +3903,9 @@ void GDScriptParser::TreePrinter::print_expression(ExpressionNode *p_expression)
 		case Node::IDENTIFIER:
 			print_identifier(static_cast<IdentifierNode *>(p_expression));
 			break;
+		case Node::LAMBDA:
+			print_lambda(static_cast<LambdaNode *>(p_expression));
+			break;
 		case Node::LITERAL:
 			print_literal(static_cast<LiteralNode *>(p_expression));
 			break;
@@ -3842,12 +3965,17 @@ void GDScriptParser::TreePrinter::print_for(ForNode *p_for) {
 	decrease_indent();
 }
 
-void GDScriptParser::TreePrinter::print_function(FunctionNode *p_function) {
+void GDScriptParser::TreePrinter::print_function(FunctionNode *p_function, const String &p_context) {
 	for (const List<AnnotationNode *>::Element *E = p_function->annotations.front(); E != nullptr; E = E->next()) {
 		print_annotation(E->get());
 	}
-	push_text("Function ");
-	print_identifier(p_function->identifier);
+	push_text(p_context);
+	push_text(" ");
+	if (p_function->identifier) {
+		print_identifier(p_function->identifier);
+	} else {
+		push_text("<anonymous>");
+	}
 	push_text("( ");
 	for (int i = 0; i < p_function->parameters.size(); i++) {
 		if (i > 0) {
@@ -3901,6 +4029,18 @@ void GDScriptParser::TreePrinter::print_if(IfNode *p_if, bool p_is_elif) {
 	}
 }
 
+void GDScriptParser::TreePrinter::print_lambda(LambdaNode *p_lambda) {
+	print_function(p_lambda->function, "Lambda");
+	push_text("| captures [ ");
+	for (int i = 0; i < p_lambda->captures.size(); i++) {
+		if (i > 0) {
+			push_text(" , ");
+		}
+		push_text(p_lambda->captures[i]->name.operator String());
+	}
+	push_line(" ]");
+}
+
 void GDScriptParser::TreePrinter::print_literal(LiteralNode *p_literal) {
 	// Prefix for string types.
 	switch (p_literal->value.get_type()) {
diff --git a/modules/gdscript/gdscript_parser.h b/modules/gdscript/gdscript_parser.h
index 272d21ffce..b1b29a7bd1 100644
--- a/modules/gdscript/gdscript_parser.h
+++ b/modules/gdscript/gdscript_parser.h
@@ -76,6 +76,7 @@ public:
 	struct GetNodeNode;
 	struct IdentifierNode;
 	struct IfNode;
+	struct LambdaNode;
 	struct LiteralNode;
 	struct MatchNode;
 	struct MatchBranchNode;
@@ -267,6 +268,7 @@ public:
 			GET_NODE,
 			IDENTIFIER,
 			IF,
+			LAMBDA,
 			LITERAL,
 			MATCH,
 			MATCH_BRANCH,
@@ -728,6 +730,7 @@ public:
 		bool is_coroutine = false;
 		MultiplayerAPI::RPCMode rpc_mode = MultiplayerAPI::RPC_MODE_DISABLED;
 		MethodInfo info;
+		LambdaNode *source_lambda = nullptr;
 #ifdef TOOLS_ENABLED
 		Vector<Variant> default_arg_values;
 		String doc_description;
@@ -771,6 +774,7 @@ public:
 			VariableNode *variable_source;
 			IdentifierNode *bind_source;
 		};
+		FunctionNode *source_function = nullptr;
 
 		int usages = 0; // Useful for binds/iterator variable.
 
@@ -789,6 +793,21 @@ public:
 		}
 	};
 
+	struct LambdaNode : public ExpressionNode {
+		FunctionNode *function = nullptr;
+		FunctionNode *parent_function = nullptr;
+		Vector<IdentifierNode *> captures;
+		Map<StringName, int> captures_indices;
+
+		bool has_name() const {
+			return function && function->identifier;
+		}
+
+		LambdaNode() {
+			type = LAMBDA;
+		}
+	};
+
 	struct LiteralNode : public ExpressionNode {
 		Variant value;
 
@@ -942,6 +961,7 @@ public:
 				IdentifierNode *bind;
 			};
 			StringName name;
+			FunctionNode *source_function = nullptr;
 
 			int start_line = 0, end_line = 0;
 			int start_column = 0, end_column = 0;
@@ -951,10 +971,11 @@ public:
 			String get_name() const;
 
 			Local() {}
-			Local(ConstantNode *p_constant) {
+			Local(ConstantNode *p_constant, FunctionNode *p_source_function) {
 				type = CONSTANT;
 				constant = p_constant;
 				name = p_constant->identifier->name;
+				source_function = p_source_function;
 
 				start_line = p_constant->start_line;
 				end_line = p_constant->end_line;
@@ -963,10 +984,11 @@ public:
 				leftmost_column = p_constant->leftmost_column;
 				rightmost_column = p_constant->rightmost_column;
 			}
-			Local(VariableNode *p_variable) {
+			Local(VariableNode *p_variable, FunctionNode *p_source_function) {
 				type = VARIABLE;
 				variable = p_variable;
 				name = p_variable->identifier->name;
+				source_function = p_source_function;
 
 				start_line = p_variable->start_line;
 				end_line = p_variable->end_line;
@@ -975,10 +997,11 @@ public:
 				leftmost_column = p_variable->leftmost_column;
 				rightmost_column = p_variable->rightmost_column;
 			}
-			Local(ParameterNode *p_parameter) {
+			Local(ParameterNode *p_parameter, FunctionNode *p_source_function) {
 				type = PARAMETER;
 				parameter = p_parameter;
 				name = p_parameter->identifier->name;
+				source_function = p_source_function;
 
 				start_line = p_parameter->start_line;
 				end_line = p_parameter->end_line;
@@ -987,10 +1010,11 @@ public:
 				leftmost_column = p_parameter->leftmost_column;
 				rightmost_column = p_parameter->rightmost_column;
 			}
-			Local(IdentifierNode *p_identifier) {
+			Local(IdentifierNode *p_identifier, FunctionNode *p_source_function) {
 				type = FOR_VARIABLE;
 				bind = p_identifier;
 				name = p_identifier->name;
+				source_function = p_source_function;
 
 				start_line = p_identifier->start_line;
 				end_line = p_identifier->end_line;
@@ -1015,9 +1039,9 @@ public:
 		bool has_local(const StringName &p_name) const;
 		const Local &get_local(const StringName &p_name) const;
 		template <class T>
-		void add_local(T *p_local) {
+		void add_local(T *p_local, FunctionNode *p_source_function) {
 			locals_indices[p_local->identifier->name] = locals.size();
-			locals.push_back(Local(p_local));
+			locals.push_back(Local(p_local, p_source_function));
 		}
 		void add_local(const Local &p_local) {
 			locals_indices[p_local.name] = locals.size();
@@ -1191,6 +1215,8 @@ private:
 	CompletionCall completion_call;
 	List<CompletionCall> completion_call_stack;
 	bool passed_cursor = false;
+	bool in_lambda = false;
+	bool lambda_ended = false; // Marker for when a lambda ends, to apply an end of statement if needed.
 
 	typedef bool (GDScriptParser::*AnnotationAction)(const AnnotationNode *p_annotation, Node *p_target);
 	struct AnnotationInfo {
@@ -1278,10 +1304,11 @@ private:
 
 	GDScriptTokenizer::Token advance();
 	bool match(GDScriptTokenizer::Token::Type p_token_type);
-	bool check(GDScriptTokenizer::Token::Type p_token_type);
+	bool check(GDScriptTokenizer::Token::Type p_token_type) const;
 	bool consume(GDScriptTokenizer::Token::Type p_token_type, const String &p_error_message);
-	bool is_at_end();
-	bool is_statement_end();
+	bool is_at_end() const;
+	bool is_statement_end_token() const;
+	bool is_statement_end() const;
 	void end_statement(const String &p_context);
 	void synchronize();
 	void push_multiline(bool p_state);
@@ -1299,7 +1326,8 @@ private:
 	EnumNode *parse_enum();
 	ParameterNode *parse_parameter();
 	FunctionNode *parse_function();
-	SuiteNode *parse_suite(const String &p_context, SuiteNode *p_suite = nullptr);
+	void parse_function_signature(FunctionNode *p_function, SuiteNode *p_body, const String &p_type);
+	SuiteNode *parse_suite(const String &p_context, SuiteNode *p_suite = nullptr, bool p_for_lambda = false);
 	// Annotations
 	AnnotationNode *parse_annotation(uint32_t p_valid_targets);
 	bool register_annotation(const MethodInfo &p_info, uint32_t p_target_kinds, AnnotationAction p_apply, int p_optional_arguments = 0, bool p_is_vararg = false);
@@ -1354,6 +1382,7 @@ private:
 	ExpressionNode *parse_await(ExpressionNode *p_previous_operand, bool p_can_assign);
 	ExpressionNode *parse_attribute(ExpressionNode *p_previous_operand, bool p_can_assign);
 	ExpressionNode *parse_subscript(ExpressionNode *p_previous_operand, bool p_can_assign);
+	ExpressionNode *parse_lambda(ExpressionNode *p_previous_operand, bool p_can_assign);
 	ExpressionNode *parse_invalid_token(ExpressionNode *p_previous_operand, bool p_can_assign);
 	TypeNode *parse_type(bool p_allow_void = false);
 #ifdef TOOLS_ENABLED
@@ -1415,10 +1444,11 @@ public:
 		void print_expression(ExpressionNode *p_expression);
 		void print_enum(EnumNode *p_enum);
 		void print_for(ForNode *p_for);
-		void print_function(FunctionNode *p_function);
+		void print_function(FunctionNode *p_function, const String &p_context = "Function");
 		void print_get_node(GetNodeNode *p_get_node);
 		void print_if(IfNode *p_if, bool p_is_elif = false);
 		void print_identifier(IdentifierNode *p_identifier);
+		void print_lambda(LambdaNode *p_lambda);
 		void print_literal(LiteralNode *p_literal);
 		void print_match(MatchNode *p_match);
 		void print_match_branch(MatchBranchNode *p_match_branch);
diff --git a/modules/gdscript/gdscript_tokenizer.cpp b/modules/gdscript/gdscript_tokenizer.cpp
index e432dfc891..2e6388d92f 100644
--- a/modules/gdscript/gdscript_tokenizer.cpp
+++ b/modules/gdscript/gdscript_tokenizer.cpp
@@ -242,6 +242,16 @@ void GDScriptTokenizer::set_multiline_mode(bool p_state) {
 	multiline_mode = p_state;
 }
 
+void GDScriptTokenizer::push_expression_indented_block() {
+	indent_stack_stack.push_back(indent_stack);
+}
+
+void GDScriptTokenizer::pop_expression_indented_block() {
+	ERR_FAIL_COND(indent_stack_stack.size() == 0);
+	indent_stack = indent_stack_stack.back()->get();
+	indent_stack_stack.pop_back();
+}
+
 int GDScriptTokenizer::get_cursor_line() const {
 	return cursor_line;
 }
diff --git a/modules/gdscript/gdscript_tokenizer.h b/modules/gdscript/gdscript_tokenizer.h
index bea4b14019..84b82c07f0 100644
--- a/modules/gdscript/gdscript_tokenizer.h
+++ b/modules/gdscript/gdscript_tokenizer.h
@@ -217,6 +217,7 @@ private:
 	Token last_newline;
 	int pending_indents = 0;
 	List<int> indent_stack;
+	List<List<int>> indent_stack_stack; // For lambdas, which require manipulating the indentation point.
 	List<char32_t> paren_stack;
 	char32_t indent_char = '\0';
 	int position = 0;
@@ -263,6 +264,8 @@ public:
 	void set_multiline_mode(bool p_state);
 	bool is_past_cursor() const;
 	static String get_token_name(Token::Type p_token_type);
+	void push_expression_indented_block(); // For lambdas, or blocks inside expressions.
+	void pop_expression_indented_block(); // For lambdas, or blocks inside expressions.
 
 	GDScriptTokenizer();
 };
diff --git a/modules/gdscript/gdscript_vm.cpp b/modules/gdscript/gdscript_vm.cpp
index b47a4eb992..4757ec6ca9 100644
--- a/modules/gdscript/gdscript_vm.cpp
+++ b/modules/gdscript/gdscript_vm.cpp
@@ -33,6 +33,7 @@
 #include "core/core_string_names.h"
 #include "core/os/os.h"
 #include "gdscript.h"
+#include "gdscript_lambda_callable.h"
 
 Variant *GDScriptFunction::_get_variant(int p_address, GDScriptInstance *p_instance, Variant *p_stack, String &r_error) const {
 	int address = p_address & ADDR_MASK;
@@ -232,6 +233,7 @@ String GDScriptFunction::_get_call_error(const Callable::CallError &p_err, const
 		&&OPCODE_CALL_PTRCALL_PACKED_COLOR_ARRAY,    \
 		&&OPCODE_AWAIT,                              \
 		&&OPCODE_AWAIT_RESUME,                       \
+		&&OPCODE_CREATE_LAMBDA,                      \
 		&&OPCODE_JUMP,                               \
 		&&OPCODE_JUMP_IF,                            \
 		&&OPCODE_JUMP_IF_NOT,                        \
@@ -1452,13 +1454,17 @@ Variant GDScriptFunction::call(GDScriptInstance *p_instance, const Variant **p_a
 				if (err.error != Callable::CallError::CALL_OK) {
 					String methodstr = *methodname;
 					String basestr = _get_var_type(base);
+					bool is_callable = false;
 
 					if (methodstr == "call") {
-						if (argc >= 1) {
+						if (argc >= 1 && base->get_type() != Variant::CALLABLE) {
 							methodstr = String(*argptrs[0]) + " (via call)";
 							if (err.error == Callable::CallError::CALL_ERROR_INVALID_ARGUMENT) {
 								err.argument += 1;
 							}
+						} else {
+							methodstr = base->operator String() + " (Callable)";
+							is_callable = true;
 						}
 					} else if (methodstr == "free") {
 						if (err.error == Callable::CallError::CALL_ERROR_INVALID_METHOD) {
@@ -1478,7 +1484,7 @@ Variant GDScriptFunction::call(GDScriptInstance *p_instance, const Variant **p_a
 							}
 						}
 					}
-					err_text = _get_call_error(err, "function '" + methodstr + "' in base '" + basestr + "'", (const Variant **)argptrs);
+					err_text = _get_call_error(err, "function '" + methodstr + (is_callable ? "" : "' in base '" + basestr) + "'", (const Variant **)argptrs);
 					OPCODE_BREAK;
 				}
 #endif
@@ -2057,6 +2063,34 @@ Variant GDScriptFunction::call(GDScriptInstance *p_instance, const Variant **p_a
 			}
 			DISPATCH_OPCODE;
 
+			OPCODE(OPCODE_CREATE_LAMBDA) {
+				CHECK_SPACE(2 + instr_arg_count);
+
+				ip += instr_arg_count;
+
+				int captures_count = _code_ptr[ip + 1];
+				GD_ERR_BREAK(captures_count < 0);
+
+				int lambda_index = _code_ptr[ip + 2];
+				GD_ERR_BREAK(lambda_index < 0 || lambda_index >= _lambdas_count);
+				GDScriptFunction *lambda = _lambdas_ptr[lambda_index];
+
+				Vector<Variant> captures;
+				captures.resize(captures_count);
+				for (int i = 0; i < captures_count; i++) {
+					GET_INSTRUCTION_ARG(arg, i);
+					captures.write[i] = *arg;
+				}
+
+				GDScriptLambdaCallable *callable = memnew(GDScriptLambdaCallable(Ref<GDScript>(script), lambda, captures));
+
+				GET_INSTRUCTION_ARG(result, captures_count);
+				*result = Callable(callable);
+
+				ip += 3;
+			}
+			DISPATCH_OPCODE;
+
 			OPCODE(OPCODE_JUMP) {
 				CHECK_SPACE(2);
 				int to = _code_ptr[ip + 1];
diff --git a/modules/gdscript/language_server/gdscript_language_protocol.cpp b/modules/gdscript/language_server/gdscript_language_protocol.cpp
index 912c9a174e..0432e7caea 100644
--- a/modules/gdscript/language_server/gdscript_language_protocol.cpp
+++ b/modules/gdscript/language_server/gdscript_language_protocol.cpp
@@ -32,7 +32,6 @@
 
 #include "core/config/project_settings.h"
 #include "core/io/json.h"
-#include "core/os/copymem.h"
 #include "editor/doc_tools.h"
 #include "editor/editor_log.h"
 #include "editor/editor_node.h"
diff --git a/modules/gdscript/tests/test_gdscript.cpp b/modules/gdscript/tests/test_gdscript.cpp
index e70f221c0a..36da64bbaa 100644
--- a/modules/gdscript/tests/test_gdscript.cpp
+++ b/modules/gdscript/tests/test_gdscript.cpp
@@ -66,7 +66,7 @@ static void test_tokenizer(const String &p_code, const Vector<String> &p_lines)
 		StringBuilder token;
 		token += " --> "; // Padding for line number.
 
-		for (int l = current.start_line; l <= current.end_line; l++) {
+		for (int l = current.start_line; l <= current.end_line && l <= p_lines.size(); l++) {
 			print_line(vformat("%04d %s", l, p_lines[l - 1]).replace("\t", tab));
 		}
 
@@ -118,6 +118,18 @@ static void test_parser(const String &p_code, const String &p_script_path, const
 			print_line(vformat("%02d:%02d: %s", error.line, error.column, error.message));
 		}
 	}
+
+	GDScriptAnalyzer analyzer(&parser);
+	analyzer.analyze();
+
+	if (err != OK) {
+		const List<GDScriptParser::ParserError> &errors = parser.get_errors();
+		for (const List<GDScriptParser::ParserError>::Element *E = errors.front(); E != nullptr; E = E->next()) {
+			const GDScriptParser::ParserError &error = E->get();
+			print_line(vformat("%02d:%02d: %s", error.line, error.column, error.message));
+		}
+	}
+
 #ifdef TOOLS_ENABLED
 	GDScriptParser::TreePrinter printer;
 	printer.print_tree(parser);
diff --git a/modules/glslang/register_types.cpp b/modules/glslang/register_types.cpp
index 14135265b9..4331daadfc 100644
--- a/modules/glslang/register_types.cpp
+++ b/modules/glslang/register_types.cpp
@@ -173,7 +173,7 @@ static Vector<uint8_t> _compile_shader_glsl(RenderingDevice::ShaderStage p_stage
 	ret.resize(SpirV.size() * sizeof(uint32_t));
 	{
 		uint8_t *w = ret.ptrw();
-		copymem(w, &SpirV[0], SpirV.size() * sizeof(uint32_t));
+		memcpy(w, &SpirV[0], SpirV.size() * sizeof(uint32_t));
 	}
 
 	return ret;
diff --git a/modules/gltf/gltf_document.cpp b/modules/gltf/gltf_document.cpp
index 027a054b70..5cb8e2974b 100644
--- a/modules/gltf/gltf_document.cpp
+++ b/modules/gltf/gltf_document.cpp
@@ -1157,7 +1157,7 @@ Error GLTFDocument::_encode_buffer_view(Ref<GLTFState> state, const double *src,
 			}
 			int64_t old_size = gltf_buffer.size();
 			gltf_buffer.resize(old_size + (buffer.size() * sizeof(int8_t)));
-			copymem(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(int8_t));
+			memcpy(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(int8_t));
 			bv->byte_length = buffer.size() * sizeof(int8_t);
 		} break;
 		case COMPONENT_TYPE_UNSIGNED_BYTE: {
@@ -1203,7 +1203,7 @@ Error GLTFDocument::_encode_buffer_view(Ref<GLTFState> state, const double *src,
 			}
 			int64_t old_size = gltf_buffer.size();
 			gltf_buffer.resize(old_size + (buffer.size() * sizeof(int16_t)));
-			copymem(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(int16_t));
+			memcpy(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(int16_t));
 			bv->byte_length = buffer.size() * sizeof(int16_t);
 		} break;
 		case COMPONENT_TYPE_UNSIGNED_SHORT: {
@@ -1227,7 +1227,7 @@ Error GLTFDocument::_encode_buffer_view(Ref<GLTFState> state, const double *src,
 			}
 			int64_t old_size = gltf_buffer.size();
 			gltf_buffer.resize(old_size + (buffer.size() * sizeof(uint16_t)));
-			copymem(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(uint16_t));
+			memcpy(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(uint16_t));
 			bv->byte_length = buffer.size() * sizeof(uint16_t);
 		} break;
 		case COMPONENT_TYPE_INT: {
@@ -1247,7 +1247,7 @@ Error GLTFDocument::_encode_buffer_view(Ref<GLTFState> state, const double *src,
 			}
 			int64_t old_size = gltf_buffer.size();
 			gltf_buffer.resize(old_size + (buffer.size() * sizeof(int32_t)));
-			copymem(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(int32_t));
+			memcpy(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(int32_t));
 			bv->byte_length = buffer.size() * sizeof(int32_t);
 		} break;
 		case COMPONENT_TYPE_FLOAT: {
@@ -1267,7 +1267,7 @@ Error GLTFDocument::_encode_buffer_view(Ref<GLTFState> state, const double *src,
 			}
 			int64_t old_size = gltf_buffer.size();
 			gltf_buffer.resize(old_size + (buffer.size() * sizeof(float)));
-			copymem(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(float));
+			memcpy(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(float));
 			bv->byte_length = buffer.size() * sizeof(float);
 		} break;
 	}
@@ -2864,7 +2864,7 @@ Error GLTFDocument::_serialize_images(Ref<GLTFState> state, const String &p_path
 
 			bv->byte_length = buffer.size();
 			state->buffers.write[bi].resize(state->buffers[bi].size() + bv->byte_length);
-			copymem(&state->buffers.write[bi].write[bv->byte_offset], buffer.ptr(), buffer.size());
+			memcpy(&state->buffers.write[bi].write[bv->byte_offset], buffer.ptr(), buffer.size());
 			ERR_FAIL_COND_V(bv->byte_offset + bv->byte_length > state->buffers[bi].size(), ERR_FILE_CORRUPT);
 
 			state->buffer_views.push_back(bv);
diff --git a/modules/lightmapper_rd/lightmapper_rd.cpp b/modules/lightmapper_rd/lightmapper_rd.cpp
index 61ebabdfb6..9394e5c47e 100644
--- a/modules/lightmapper_rd/lightmapper_rd.cpp
+++ b/modules/lightmapper_rd/lightmapper_rd.cpp
@@ -432,10 +432,10 @@ void LightmapperRD::_create_acceleration_structures(RenderingDevice *rd, Size2i
 	triangle_indices.resize(triangle_sort.size());
 	Vector<uint32_t> grid_indices;
 	grid_indices.resize(grid_size * grid_size * grid_size * 2);
-	zeromem(grid_indices.ptrw(), grid_indices.size() * sizeof(uint32_t));
+	memset(grid_indices.ptrw(), 0, grid_indices.size() * sizeof(uint32_t));
 	Vector<bool> solid;
 	solid.resize(grid_size * grid_size * grid_size);
-	zeromem(solid.ptrw(), solid.size() * sizeof(bool));
+	memset(solid.ptrw(), 0, solid.size() * sizeof(bool));
 
 	{
 		uint32_t *tiw = triangle_indices.ptrw();
@@ -1674,7 +1674,7 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d
 	if (probe_positions.size() > 0) {
 		probe_values.resize(probe_positions.size() * 9);
 		Vector<uint8_t> probe_data = rd->buffer_get_data(light_probe_buffer);
-		copymem(probe_values.ptrw(), probe_data.ptr(), probe_data.size());
+		memcpy(probe_values.ptrw(), probe_data.ptr(), probe_data.size());
 		rd->free(light_probe_buffer);
 
 #ifdef DEBUG_TEXTURES
@@ -1743,7 +1743,7 @@ Vector<Color> LightmapperRD::get_bake_probe_sh(int p_probe) const {
 	ERR_FAIL_INDEX_V(p_probe, probe_positions.size(), Vector<Color>());
 	Vector<Color> ret;
 	ret.resize(9);
-	copymem(ret.ptrw(), &probe_values[p_probe * 9], sizeof(Color) * 9);
+	memcpy(ret.ptrw(), &probe_values[p_probe * 9], sizeof(Color) * 9);
 	return ret;
 }
 
diff --git a/modules/mbedtls/crypto_mbedtls.cpp b/modules/mbedtls/crypto_mbedtls.cpp
index 73931b0365..987306af2a 100644
--- a/modules/mbedtls/crypto_mbedtls.cpp
+++ b/modules/mbedtls/crypto_mbedtls.cpp
@@ -409,7 +409,7 @@ Vector<uint8_t> CryptoMbedTLS::sign(HashingContext::HashType p_hash_type, Vector
 	int ret = mbedtls_pk_sign(&(key->pkey), type, p_hash.ptr(), size, buf, &sig_size, mbedtls_ctr_drbg_random, &ctr_drbg);
 	ERR_FAIL_COND_V_MSG(ret, out, "Error while signing: " + itos(ret));
 	out.resize(sig_size);
-	copymem(out.ptrw(), buf, sig_size);
+	memcpy(out.ptrw(), buf, sig_size);
 	return out;
 }
 
@@ -432,7 +432,7 @@ Vector<uint8_t> CryptoMbedTLS::encrypt(Ref<CryptoKey> p_key, Vector<uint8_t> p_p
 	int ret = mbedtls_pk_encrypt(&(key->pkey), p_plaintext.ptr(), p_plaintext.size(), buf, &size, sizeof(buf), mbedtls_ctr_drbg_random, &ctr_drbg);
 	ERR_FAIL_COND_V_MSG(ret, out, "Error while encrypting: " + itos(ret));
 	out.resize(size);
-	copymem(out.ptrw(), buf, size);
+	memcpy(out.ptrw(), buf, size);
 	return out;
 }
 
@@ -446,6 +446,6 @@ Vector<uint8_t> CryptoMbedTLS::decrypt(Ref<CryptoKey> p_key, Vector<uint8_t> p_c
 	int ret = mbedtls_pk_decrypt(&(key->pkey), p_ciphertext.ptr(), p_ciphertext.size(), buf, &size, sizeof(buf), mbedtls_ctr_drbg_random, &ctr_drbg);
 	ERR_FAIL_COND_V_MSG(ret, out, "Error while decrypting: " + itos(ret));
 	out.resize(size);
-	copymem(out.ptrw(), buf, size);
+	memcpy(out.ptrw(), buf, size);
 	return out;
 }
diff --git a/modules/mbedtls/packet_peer_mbed_dtls.cpp b/modules/mbedtls/packet_peer_mbed_dtls.cpp
index 8a6cdfb131..342ded6ea1 100644
--- a/modules/mbedtls/packet_peer_mbed_dtls.cpp
+++ b/modules/mbedtls/packet_peer_mbed_dtls.cpp
@@ -74,7 +74,7 @@ int PacketPeerMbedDTLS::bio_recv(void *ctx, unsigned char *buf, size_t len) {
 	if (err != OK) {
 		return MBEDTLS_ERR_SSL_INTERNAL_ERROR;
 	}
-	copymem(buf, buffer, buffer_size);
+	memcpy(buf, buffer, buffer_size);
 	return buffer_size;
 }
 
@@ -89,8 +89,8 @@ int PacketPeerMbedDTLS::_set_cookie() {
 	uint8_t client_id[18];
 	IP_Address addr = base->get_packet_address();
 	uint16_t port = base->get_packet_port();
-	copymem(client_id, addr.get_ipv6(), 16);
-	copymem(&client_id[16], (uint8_t *)&port, 2);
+	memcpy(client_id, addr.get_ipv6(), 16);
+	memcpy(&client_id[16], (uint8_t *)&port, 2);
 	return mbedtls_ssl_set_client_transport_id(ssl_ctx->get_context(), client_id, 18);
 }
 
diff --git a/modules/minimp3/audio_stream_mp3.cpp b/modules/minimp3/audio_stream_mp3.cpp
index aaa05a910c..24ec206191 100644
--- a/modules/minimp3/audio_stream_mp3.cpp
+++ b/modules/minimp3/audio_stream_mp3.cpp
@@ -172,7 +172,7 @@ void AudioStreamMP3::set_data(const Vector<uint8_t> &p_data) {
 	clear_data();
 
 	data = memalloc(src_data_len);
-	copymem(data, src_datar, src_data_len);
+	memcpy(data, src_datar, src_data_len);
 	data_len = src_data_len;
 }
 
@@ -183,7 +183,7 @@ Vector<uint8_t> AudioStreamMP3::get_data() const {
 		vdata.resize(data_len);
 		{
 			uint8_t *w = vdata.ptrw();
-			copymem(w, data, data_len);
+			memcpy(w, data, data_len);
 		}
 	}
 
diff --git a/modules/pvr/image_compress_pvrtc.cpp b/modules/pvr/image_compress_pvrtc.cpp
index d2d8976694..6cb9837f49 100644
--- a/modules/pvr/image_compress_pvrtc.cpp
+++ b/modules/pvr/image_compress_pvrtc.cpp
@@ -65,7 +65,7 @@ static void _compress_pvrtc1_4bpp(Image *p_img) {
 			img->get_mipmap_offset_size_and_dimensions(i, ofs, size, w, h);
 			Javelin::RgbaBitmap bm(w, h);
 			void *dst = (void *)bm.GetData();
-			copymem(dst, &r[ofs], size);
+			memcpy(dst, &r[ofs], size);
 			Javelin::ColorRgba<unsigned char> *dp = bm.GetData();
 			for (int j = 0; j < size / 4; j++) {
 				// Red and blue colors are swapped.
diff --git a/modules/raycast/SCsub b/modules/raycast/SCsub
new file mode 100644
index 0000000000..68e9df5263
--- /dev/null
+++ b/modules/raycast/SCsub
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+
+Import("env")
+Import("env_modules")
+
+embree_src = [
+    "common/sys/sysinfo.cpp",
+    "common/sys/alloc.cpp",
+    "common/sys/filename.cpp",
+    "common/sys/library.cpp",
+    "common/sys/thread.cpp",
+    "common/sys/string.cpp",
+    "common/sys/regression.cpp",
+    "common/sys/mutex.cpp",
+    "common/sys/condition.cpp",
+    "common/sys/barrier.cpp",
+    "common/math/constants.cpp",
+    "common/simd/sse.cpp",
+    "common/lexers/stringstream.cpp",
+    "common/lexers/tokenstream.cpp",
+    "common/tasking/taskschedulerinternal.cpp",
+    "common/algorithms/parallel_for.cpp",
+    "common/algorithms/parallel_reduce.cpp",
+    "common/algorithms/parallel_prefix_sum.cpp",
+    "common/algorithms/parallel_for_for.cpp",
+    "common/algorithms/parallel_for_for_prefix_sum.cpp",
+    "common/algorithms/parallel_partition.cpp",
+    "common/algorithms/parallel_sort.cpp",
+    "common/algorithms/parallel_set.cpp",
+    "common/algorithms/parallel_map.cpp",
+    "common/algorithms/parallel_filter.cpp",
+    "kernels/common/device.cpp",
+    "kernels/common/stat.cpp",
+    "kernels/common/acceln.cpp",
+    "kernels/common/accelset.cpp",
+    "kernels/common/state.cpp",
+    "kernels/common/rtcore.cpp",
+    "kernels/common/rtcore_builder.cpp",
+    "kernels/common/scene.cpp",
+    "kernels/common/alloc.cpp",
+    "kernels/common/geometry.cpp",
+    "kernels/common/scene_triangle_mesh.cpp",
+    "kernels/geometry/primitive4.cpp",
+    "kernels/builders/primrefgen.cpp",
+    "kernels/bvh/bvh.cpp",
+    "kernels/bvh/bvh_statistics.cpp",
+    "kernels/bvh/bvh4_factory.cpp",
+    "kernels/bvh/bvh8_factory.cpp",
+    "kernels/bvh/bvh_collider.cpp",
+    "kernels/bvh/bvh_rotate.cpp",
+    "kernels/bvh/bvh_refit.cpp",
+    "kernels/bvh/bvh_builder.cpp",
+    "kernels/bvh/bvh_builder_morton.cpp",
+    "kernels/bvh/bvh_builder_sah.cpp",
+    "kernels/bvh/bvh_builder_sah_spatial.cpp",
+    "kernels/bvh/bvh_builder_sah_mb.cpp",
+    "kernels/bvh/bvh_builder_twolevel.cpp",
+    "kernels/bvh/bvh_intersector1_bvh4.cpp",
+]
+
+embree_dir = "#thirdparty/embree-aarch64/"
+# The vendored Embree sources get their own cloned environment with Embree-specific include paths and defines.
+env_embree = env_modules.Clone()
+embree_sources = [embree_dir + file for file in embree_src]
+env_embree.Prepend(CPPPATH=[embree_dir, embree_dir + "include"])
+env_embree.Append(CPPFLAGS=["-DEMBREE_TARGET_SSE2", "-DEMBREE_LOWEST_ISA", "-DTASKING_INTERNAL", "-DNDEBUG"])
+# NOTE(review): these x86 SIMD flags are added for every non-MSVC build, yet config.py also allows android arm64v8.
+if not env_embree.msvc:
+    env_embree.Append(CPPFLAGS=["-msse2", "-mxsave"])
+    if env["platform"] == "windows":
+        env_embree.Append(CPPFLAGS=["-mstackrealign"])
+# Windows builds link psapi through the main env; MSVC additionally gets the SSE macros defined explicitly.
+if env["platform"] == "windows":
+    if env.msvc:
+        env.Append(LINKFLAGS=["psapi.lib"])
+        env_embree.Append(CPPFLAGS=["-D__SSE2__", "-D__SSE__"])
+    else:
+        env.Append(LIBS=["psapi"])
+
+env_embree.disable_warnings()  # Warnings are turned off for the third-party sources only.
+env_embree.add_source_files(env.modules_sources, embree_sources)
+# The module's own *.cpp files use a second clone whose include path also covers Embree's "common" directory.
+env_raycast = env_modules.Clone()
+env_raycast.Prepend(CPPPATH=[embree_dir, embree_dir + "include", embree_dir + "common"])
+
+env_raycast.add_source_files(env.modules_sources, "*.cpp")
diff --git a/modules/raycast/config.py b/modules/raycast/config.py
new file mode 100644
index 0000000000..26493da41b
--- /dev/null
+++ b/modules/raycast/config.py
@@ -0,0 +1,12 @@
+def can_build(env, platform):
+    """Return whether the raycast module can be built for `platform`."""
+    if platform == "javascript":
+        return False  # No SIMD support yet
+    if platform != "android":
+        return True
+    # On Android the module is only built for these architectures.
+    return env["android_arch"] in ["arm64v8", "x86", "x86_64"]
+
+
+def configure(env):
+    pass  # No-op: nothing is changed on `env` here.
diff --git a/modules/raycast/godot_update_embree.py b/modules/raycast/godot_update_embree.py
new file mode 100644
index 0000000000..db4fa95c21
--- /dev/null
+++ b/modules/raycast/godot_update_embree.py
@@ -0,0 +1,260 @@
+import glob, os, shutil, subprocess, re
+
+include_dirs = [
+    "common/tasking",
+    "kernels/bvh",
+    "kernels/builders",
+    "common/sys",
+    "kernels",
+    "kernels/common",
+    "common/math",
+    "common/algorithms",
+    "common/lexers",
+    "common/simd",
+    "include/embree3",
+    "kernels/subdiv",
+    "kernels/geometry",
+]
+
+cpp_files = [
+    "common/sys/sysinfo.cpp",
+    "common/sys/alloc.cpp",
+    "common/sys/filename.cpp",
+    "common/sys/library.cpp",
+    "common/sys/thread.cpp",
+    "common/sys/string.cpp",
+    "common/sys/regression.cpp",
+    "common/sys/mutex.cpp",
+    "common/sys/condition.cpp",
+    "common/sys/barrier.cpp",
+    "common/math/constants.cpp",
+    "common/simd/sse.cpp",
+    "common/lexers/stringstream.cpp",
+    "common/lexers/tokenstream.cpp",
+    "common/tasking/taskschedulerinternal.cpp",
+    "common/algorithms/parallel_for.cpp",
+    "common/algorithms/parallel_reduce.cpp",
+    "common/algorithms/parallel_prefix_sum.cpp",
+    "common/algorithms/parallel_for_for.cpp",
+    "common/algorithms/parallel_for_for_prefix_sum.cpp",
+    "common/algorithms/parallel_partition.cpp",
+    "common/algorithms/parallel_sort.cpp",
+    "common/algorithms/parallel_set.cpp",
+    "common/algorithms/parallel_map.cpp",
+    "common/algorithms/parallel_filter.cpp",
+    "kernels/common/device.cpp",
+    "kernels/common/stat.cpp",
+    "kernels/common/acceln.cpp",
+    "kernels/common/accelset.cpp",
+    "kernels/common/state.cpp",
+    "kernels/common/rtcore.cpp",
+    "kernels/common/rtcore_builder.cpp",
+    "kernels/common/scene.cpp",
+    "kernels/common/alloc.cpp",
+    "kernels/common/geometry.cpp",
+    "kernels/common/scene_triangle_mesh.cpp",
+    "kernels/geometry/primitive4.cpp",
+    "kernels/builders/primrefgen.cpp",
+    "kernels/bvh/bvh.cpp",
+    "kernels/bvh/bvh_statistics.cpp",
+    "kernels/bvh/bvh4_factory.cpp",
+    "kernels/bvh/bvh8_factory.cpp",
+    "kernels/bvh/bvh_collider.cpp",
+    "kernels/bvh/bvh_rotate.cpp",
+    "kernels/bvh/bvh_refit.cpp",
+    "kernels/bvh/bvh_builder.cpp",
+    "kernels/bvh/bvh_builder_morton.cpp",
+    "kernels/bvh/bvh_builder_sah.cpp",
+    "kernels/bvh/bvh_builder_sah_spatial.cpp",
+    "kernels/bvh/bvh_builder_sah_mb.cpp",
+    "kernels/bvh/bvh_builder_twolevel.cpp",
+    "kernels/bvh/bvh_intersector1.cpp",
+    "kernels/bvh/bvh_intersector1_bvh4.cpp",
+]
+
+os.chdir("../../thirdparty")
+
+# Clone first and fail loudly (check=True): the vendored copy is only removed once the new sources exist.
+subprocess.run(["git", "clone", "https://github.com/lighttransport/embree-aarch64.git", "embree-tmp"], check=True)
+
+dir_name = "embree-aarch64"
+if os.path.exists(dir_name):
+    shutil.rmtree(dir_name)
+
+os.chdir("embree-tmp")
+commit_hash = str(subprocess.check_output(["git", "rev-parse", "HEAD"], universal_newlines=True)).strip()
+
+# Copy the selected .cpp files plus every header found directly inside the include_dirs entries.
+all_files = set(cpp_files)
+dest_dir = os.path.join("..", dir_name)
+for include_dir in include_dirs:
+    headers = glob.iglob(os.path.join(include_dir, "*.h"))
+    all_files.update(headers)
+
+for f in all_files:
+    d = os.path.join(dest_dir, os.path.dirname(f))
+    os.makedirs(d, exist_ok=True)
+    shutil.copy2(f, d)
+
+with open(os.path.join(dest_dir, "kernels/hash.h"), "w") as hash_file:
+    hash_file.write(
+        f"""
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#define RTC_HASH "{commit_hash}"
+"""
+    )
+
+with open(os.path.join(dest_dir, "kernels/config.h"), "w") as config_file:
+    config_file.write(
+        """
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+/* #undef EMBREE_RAY_MASK */
+/* #undef EMBREE_STAT_COUNTERS */
+/* #undef EMBREE_BACKFACE_CULLING */
+/* #undef EMBREE_BACKFACE_CULLING_CURVES */
+#define EMBREE_FILTER_FUNCTION
+/* #undef EMBREE_IGNORE_INVALID_RAYS */
+#define EMBREE_GEOMETRY_TRIANGLE
+/* #undef EMBREE_GEOMETRY_QUAD */
+/* #undef EMBREE_GEOMETRY_CURVE */
+/* #undef EMBREE_GEOMETRY_SUBDIVISION */
+/* #undef EMBREE_GEOMETRY_USER */
+/* #undef EMBREE_GEOMETRY_INSTANCE */
+/* #undef EMBREE_GEOMETRY_GRID */
+/* #undef EMBREE_GEOMETRY_POINT */
+/* #undef EMBREE_RAY_PACKETS */
+/* #undef EMBREE_COMPACT_POLYS */
+
+#define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+  #define IF_ENABLED_TRIS(x) x
+#else
+  #define IF_ENABLED_TRIS(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+  #define IF_ENABLED_QUADS(x) x
+#else
+  #define IF_ENABLED_QUADS(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT)
+  #define IF_ENABLED_CURVES_OR_POINTS(x) x
+#else
+  #define IF_ENABLED_CURVES_OR_POINTS(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_CURVE)
+  #define IF_ENABLED_CURVES(x) x
+#else
+  #define IF_ENABLED_CURVES(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_POINT)
+  #define IF_ENABLED_POINTS(x) x
+#else
+  #define IF_ENABLED_POINTS(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+  #define IF_ENABLED_SUBDIV(x) x
+#else
+  #define IF_ENABLED_SUBDIV(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+  #define IF_ENABLED_USER(x) x
+#else
+  #define IF_ENABLED_USER(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+  #define IF_ENABLED_INSTANCE(x) x
+#else
+  #define IF_ENABLED_INSTANCE(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_GRID)
+  #define IF_ENABLED_GRIDS(x) x
+#else
+  #define IF_ENABLED_GRIDS(x)
+#endif
+"""
+    )
+
+
+with open("CMakeLists.txt", "r") as cmake_file:  # Read from the clone: the working directory is still "embree-tmp".
+    cmake_content = cmake_file.read()  # Each version part below is the first "EMBREE_VERSION_<PART> <digits>" match.
+    major_version = int(re.compile(r"EMBREE_VERSION_MAJOR\s(\d+)").findall(cmake_content)[0])
+    minor_version = int(re.compile(r"EMBREE_VERSION_MINOR\s(\d+)").findall(cmake_content)[0])
+    patch_version = int(re.compile(r"EMBREE_VERSION_PATCH\s(\d+)").findall(cmake_content)[0])
+
+with open(os.path.join(dest_dir, "include/embree3/rtcore_config.h"), "w") as config_file:
+    config_file.write(
+        f"""
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define RTC_VERSION_MAJOR {major_version}
+#define RTC_VERSION_MINOR {minor_version}
+#define RTC_VERSION_PATCH {patch_version}
+#define RTC_VERSION {major_version}{minor_version:02d}{patch_version:02d}
+#define RTC_VERSION_STRING "{major_version}.{minor_version}.{patch_version}"
+
+#define RTC_MAX_INSTANCE_LEVEL_COUNT 1
+
+#define EMBREE_MIN_WIDTH 0
+#define RTC_MIN_WIDTH EMBREE_MIN_WIDTH
+
+#define EMBREE_STATIC_LIB
+/* #undef EMBREE_API_NAMESPACE */
+
+#if defined(EMBREE_API_NAMESPACE)
+#  define RTC_NAMESPACE
+#  define RTC_NAMESPACE_BEGIN namespace  {{
+#  define RTC_NAMESPACE_END }}
+#  define RTC_NAMESPACE_USE using namespace ;
+#  define RTC_API_EXTERN_C
+#  undef EMBREE_API_NAMESPACE
+#else
+#  define RTC_NAMESPACE_BEGIN
+#  define RTC_NAMESPACE_END
+#  define RTC_NAMESPACE_USE
+#  if defined(__cplusplus)
+#    define RTC_API_EXTERN_C extern "C"
+#  else
+#    define RTC_API_EXTERN_C
+#  endif
+#endif
+
+#if defined(ISPC)
+#  define RTC_API_IMPORT extern "C" unmasked
+#  define RTC_API_EXPORT extern "C" unmasked
+#elif defined(EMBREE_STATIC_LIB)
+#  define RTC_API_IMPORT RTC_API_EXTERN_C
+#  define RTC_API_EXPORT RTC_API_EXTERN_C
+#elif defined(_WIN32)
+#  define RTC_API_IMPORT RTC_API_EXTERN_C __declspec(dllimport)
+#  define RTC_API_EXPORT RTC_API_EXTERN_C __declspec(dllexport)
+#else
+#  define RTC_API_IMPORT RTC_API_EXTERN_C
+#  define RTC_API_EXPORT RTC_API_EXTERN_C __attribute__ ((visibility ("default")))
+#endif
+
+#if defined(RTC_EXPORT_API)
+#  define RTC_API RTC_API_EXPORT
+#else
+#  define RTC_API RTC_API_IMPORT
+#endif
+"""
+    )
+
+os.chdir("..")  # Leave the clone directory before deleting it.
+shutil.rmtree("embree-tmp")  # Remove the temporary clone; the selected files were copied out earlier.
diff --git a/modules/raycast/lightmap_raycaster.cpp b/modules/raycast/lightmap_raycaster.cpp
new file mode 100644
index 0000000000..9039622d3d
--- /dev/null
+++ b/modules/raycast/lightmap_raycaster.cpp
@@ -0,0 +1,202 @@
+/*************************************************************************/
+/*  lightmap_raycaster.cpp                                               */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifdef TOOLS_ENABLED
+
+#include "lightmap_raycaster.h"
+
+// From Embree.
+#include <math/vec2.h>
+#include <math/vec3.h>
+
+#include <pmmintrin.h>
+
+using namespace embree;
+
+LightmapRaycaster *LightmapRaycasterEmbree::create_embree_raycaster() {
+	return memnew(LightmapRaycasterEmbree); // Factory installed by make_default_raycaster(); returns a new instance.
+}
+
+void LightmapRaycasterEmbree::make_default_raycaster() {
+	create_function = create_embree_raycaster; // Point create_function at the Embree factory.
+}
+
+void LightmapRaycasterEmbree::filter_function(const struct RTCFilterFunctionNArguments *p_args) {
+	RTCHit *hit = (RTCHit *)p_args->hit;
+	// Intersect filter installed by add_mesh(): maps the hit to UV2, rejects low-alpha texels, interpolates slot 1.
+	unsigned int geomID = hit->geomID;
+	float u = hit->u; // u/v are saved because the first rtcInterpolate0() below writes into hit->u onwards.
+	float v = hit->v;
+	// geometryUserPtr is the raycaster that add_mesh() registered through rtcSetGeometryUserData().
+	LightmapRaycasterEmbree *scene = (LightmapRaycasterEmbree *)p_args->geometryUserPtr;
+	RTCGeometry geom = rtcGetGeometry(scene->embree_scene, geomID);
+	// Attribute slot 0 holds the UV2s uploaded by add_mesh(); 2 floats are written starting at &hit->u.
+	rtcInterpolate0(geom, hit->primID, hit->u, hit->v, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 0, &hit->u, 2);
+	// When the mesh has an alpha texture, a sampled value below 128 invalidates the hit (valid[0] = 0).
+	if (scene->alpha_textures.has(geomID)) {
+		const AlphaTextureData &alpha_texture = scene->alpha_textures[geomID];
+
+		if (alpha_texture.sample(hit->u, hit->v) < 128) {
+			p_args->valid[0] = 0;
+			return;
+		}
+	}
+	// Slot 1 (normals) is interpolated at the saved u/v into hit->Ng_x onwards. NOTE(review): add_mesh() only fills slot 1 when normals are given.
+	rtcInterpolate0(geom, hit->primID, u, v, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 1, &hit->Ng_x, 3);
+}
+
+bool LightmapRaycasterEmbree::intersect(Ray &r_ray) {
+	// Single-ray query against embree_scene; r_ray is handed to Embree as an RTCRayHit and updated in place.
+	RTCIntersectContext intersect_context;
+	rtcInitIntersectContext(&intersect_context);
+	rtcIntersect1(embree_scene, &intersect_context, (RTCRayHit *)&r_ray);
+	const bool hit_something = r_ray.geomID != RTC_INVALID_GEOMETRY_ID;
+	return hit_something;
+}
+
+void LightmapRaycasterEmbree::intersect(Vector<Ray> &r_rays) {
+	// Batch overload: runs the single-ray intersect() on every element, in order and in place.
+	for (Ray *ray = r_rays.ptrw(), *last = ray + r_rays.size(); ray != last; ++ray) {
+		intersect(*ray);
+	}
+}
+
+void LightmapRaycasterEmbree::set_mesh_alpha_texture(Ref<Image> p_alpha_texture, unsigned int p_id) {
+	// Invalid references and images whose size equals Vector2i() are ignored; otherwise data and size are stored under p_id.
+	if (!p_alpha_texture.is_valid() || p_alpha_texture->get_size() == Vector2i()) {
+		return;
+	}
+	AlphaTextureData entry = { p_alpha_texture->get_data(), p_alpha_texture->get_size() };
+	alpha_textures.insert(p_id, entry);
+}
+
+float blerp(float c00, float c10, float c01, float c11, float tx, float ty) {
+	return Math::lerp(Math::lerp(c00, c10, tx), Math::lerp(c01, c11, tx), ty); // Bilinear blend: lerp each pair by tx, then the two results by ty.
+}
+
+uint8_t LightmapRaycasterEmbree::AlphaTextureData::sample(float u, float v) const {
+	// Bilinear lookup: scale (u, v) by the texture size and blend the 2x2 block of texels starting there.
+	const float px = u * size.x;
+	const float py = v * size.y;
+	const int base_x = (int)px;
+	const int base_y = (int)py;
+	// Texels are gathered in the order (0,0), (1,0), (0,1), (1,1); coordinates are clamped to the texture.
+	uint8_t quad[4];
+	for (int dy = 0; dy < 2; ++dy) {
+		const int row = CLAMP(base_y + dy, 0, size.y - 1);
+		for (int dx = 0; dx < 2; ++dx) {
+			quad[dy * 2 + dx] = data[row * size.x + CLAMP(base_x + dx, 0, size.x - 1)];
+		}
+	}
+	return Math::round(blerp(quad[0], quad[1], quad[2], quad[3], px - base_x, py - base_y));
+}
+
+void LightmapRaycasterEmbree::add_mesh(const Vector<Vector3> &p_vertices, const Vector<Vector3> &p_normals, const Vector<Vector2> &p_uv2s, unsigned int p_id) {
+	// Uploads a triangle list (3 consecutive vertices per triangle) to Embree and attaches it to the scene as p_id.
+	// All inputs are validated before the geometry is created, so a failed check cannot leak the geometry handle.
+	int vertex_count = p_vertices.size();
+	ERR_FAIL_COND(vertex_count % 3 != 0);
+	ERR_FAIL_COND(vertex_count != p_uv2s.size());
+	ERR_FAIL_COND(!p_normals.is_empty() && vertex_count != p_normals.size()); // Normals are optional but must match.
+
+	RTCGeometry embree_mesh = rtcNewGeometry(embree_device, RTC_GEOMETRY_TYPE_TRIANGLE);
+	rtcSetGeometryVertexAttributeCount(embree_mesh, 2); // Slot 0: UV2, slot 1: normals (both read by filter_function()).
+	Vec3fa *embree_vertices = (Vec3fa *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_VERTEX, 0, RTC_FORMAT_FLOAT3, sizeof(Vec3fa), vertex_count);
+	Vec2fa *embree_light_uvs = (Vec2fa *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 0, RTC_FORMAT_FLOAT2, sizeof(Vec2fa), vertex_count);
+	uint32_t *embree_triangles = (uint32_t *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT3, sizeof(uint32_t) * 3, vertex_count / 3);
+
+	Vec3fa *embree_normals = nullptr;
+	if (!p_normals.is_empty()) {
+		embree_normals = (Vec3fa *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 1, RTC_FORMAT_FLOAT3, sizeof(Vec3fa), vertex_count);
+	}
+
+	for (int i = 0; i < vertex_count; i++) {
+		embree_vertices[i] = Vec3fa(p_vertices[i].x, p_vertices[i].y, p_vertices[i].z);
+		embree_light_uvs[i] = Vec2fa(p_uv2s[i].x, p_uv2s[i].y);
+		if (embree_normals != nullptr) {
+			embree_normals[i] = Vec3fa(p_normals[i].x, p_normals[i].y, p_normals[i].z);
+		}
+		embree_triangles[i] = i; // Vertices are not shared between triangles: index i refers to vertex i.
+	}
+
+	rtcCommitGeometry(embree_mesh);
+	rtcSetGeometryIntersectFilterFunction(embree_mesh, filter_function);
+	rtcSetGeometryUserData(embree_mesh, this);
+	rtcAttachGeometryByID(embree_scene, embree_mesh, p_id);
+	rtcReleaseGeometry(embree_mesh); // Drop the local handle once the geometry is attached to the scene.
+}
+
+void LightmapRaycasterEmbree::commit() {
+	rtcCommitScene(embree_scene); // Commits the scene's pending changes (e.g. geometries attached by add_mesh()).
+}
+
+void LightmapRaycasterEmbree::set_mesh_filter(const Set<int> &p_mesh_ids) {
+	for (Set<int>::Element *id_elem = p_mesh_ids.front(); id_elem != nullptr; id_elem = id_elem->next()) {
+		rtcDisableGeometry(rtcGetGeometry(embree_scene, id_elem->get())); // Disable each listed geometry.
+	}
+	rtcCommitScene(embree_scene);
+	filter_meshes = p_mesh_ids; // Remembered so clear_mesh_filter() can re-enable these ids.
+}
+
+void LightmapRaycasterEmbree::clear_mesh_filter() {
+	for (Set<int>::Element *id_elem = filter_meshes.front(); id_elem != nullptr; id_elem = id_elem->next()) {
+		rtcEnableGeometry(rtcGetGeometry(embree_scene, id_elem->get())); // Re-enable what set_mesh_filter() disabled.
+	}
+	rtcCommitScene(embree_scene);
+	filter_meshes.clear(); // Nothing is filtered any more.
+}
+
+void embree_error_handler(void *p_user_data, RTCError p_code, const char *p_str) {
+	print_error("Embree error: " + String(p_str)); // Only the message is reported; p_user_data and p_code are unused.
+}
+
+LightmapRaycasterEmbree::LightmapRaycasterEmbree() {
+	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // Flush-to-zero and denormals-are-zero are switched on here
+	_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); // and switched off again by the destructor.
+	// Create the device with a nullptr config, register the error callback, then create the scene on that device.
+	embree_device = rtcNewDevice(nullptr);
+	rtcSetDeviceErrorFunction(embree_device, &embree_error_handler, nullptr);
+	embree_scene = rtcNewScene(embree_device);
+}
+
+LightmapRaycasterEmbree::~LightmapRaycasterEmbree() {
+	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
+	_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
+	// NOTE(review): both modes are forced OFF rather than restored to their pre-constructor values -- confirm intended.
+	if (embree_scene != nullptr) {
+		rtcReleaseScene(embree_scene);
+	}
+	// The scene is released before the device it was created from.
+	if (embree_device != nullptr) {
+		rtcReleaseDevice(embree_device);
+	}
+}
+
+#endif
diff --git a/modules/raycast/lightmap_raycaster.h b/modules/raycast/lightmap_raycaster.h
new file mode 100644
index 0000000000..4c3de27837
--- /dev/null
+++ b/modules/raycast/lightmap_raycaster.h
@@ -0,0 +1,77 @@
+/*************************************************************************/
+/*  lightmap_raycaster.h                                                 */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifdef TOOLS_ENABLED
+
+#include "core/object/object.h"
+#include "scene/3d/lightmapper.h"
+#include "scene/resources/mesh.h"
+
+#include <embree3/rtcore.h>
+
+class LightmapRaycasterEmbree : public LightmapRaycaster {
+	GDCLASS(LightmapRaycasterEmbree, LightmapRaycaster);
+	// LightmapRaycaster implementation backed by an Embree device and scene (see <embree3/rtcore.h>).
+private:
+	struct AlphaTextureData {
+		Vector<uint8_t> data; // Bytes copied from Image::get_data() in set_mesh_alpha_texture().
+		Vector2i size; // Image size; sample() indexes data as data[y * size.x + x].
+
+		uint8_t sample(float u, float v) const; // Bilinear lookup at (u, v) scaled by size.
+	};
+
+	RTCDevice embree_device; // Created in the constructor, released in the destructor.
+	RTCScene embree_scene; // Scene on embree_device that add_mesh() attaches geometries to.
+
+	static void filter_function(const struct RTCFilterFunctionNArguments *p_args); // Intersect filter set on every geometry by add_mesh().
+
+	Map<unsigned int, AlphaTextureData> alpha_textures; // Keyed by the id given to set_mesh_alpha_texture().
+	Set<int> filter_meshes; // Ids disabled by set_mesh_filter(), re-enabled by clear_mesh_filter().
+
+public:
+	virtual bool intersect(Ray &p_ray) override; // Returns true when the ray hit a geometry.
+	// Batch overload: runs the single-ray intersect() on every element in place.
+	virtual void intersect(Vector<Ray> &r_rays) override;
+
+	virtual void add_mesh(const Vector<Vector3> &p_vertices, const Vector<Vector3> &p_normals, const Vector<Vector2> &p_uv2s, unsigned int p_id) override;
+	virtual void set_mesh_alpha_texture(Ref<Image> p_alpha_texture, unsigned int p_id) override;
+	virtual void commit() override; // Commits the Embree scene.
+
+	virtual void set_mesh_filter(const Set<int> &p_mesh_ids) override;
+	virtual void clear_mesh_filter() override;
+
+	static LightmapRaycaster *create_embree_raycaster(); // Factory returning a new LightmapRaycasterEmbree.
+	static void make_default_raycaster(); // Assigns the factory above to create_function.
+
+	LightmapRaycasterEmbree();
+	~LightmapRaycasterEmbree();
+};
+
+#endif
diff --git a/modules/raycast/raycast_occlusion_cull.cpp b/modules/raycast/raycast_occlusion_cull.cpp
new file mode 100644
index 0000000000..66558efa8c
--- /dev/null
+++ b/modules/raycast/raycast_occlusion_cull.cpp
@@ -0,0 +1,583 @@
+/*************************************************************************/
+/*  raycast_occlusion_cull.cpp                                           */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#include "raycast_occlusion_cull.h"
+#include "core/config/project_settings.h"
+#include "core/templates/local_vector.h"
+
+#ifdef __SSE2__
+#include <pmmintrin.h>
+#endif
+
+RaycastOcclusionCull *RaycastOcclusionCull::raycast_singleton = nullptr; // Set to `this` by the constructor and reset to nullptr by the destructor.
+
+void RaycastOcclusionCull::RaycastHZBuffer::clear() {
+	// Reset the base buffer first, then drop the ray-specific state layered on top of it.
+	HZBuffer::clear();
+	packs_size = Size2i();
+	camera_ray_masks.clear();
+	camera_rays.clear();
+}
+
+void RaycastOcclusionCull::RaycastHZBuffer::resize(const Size2i &p_size) {
+	if (p_size == Size2i()) {
+		clear(); // A zero size just resets the buffer.
+		return;
+	}
+	const bool same_size = !sizes.is_empty() && p_size == sizes[0];
+	if (same_size) {
+		return; // Nothing to reallocate.
+	}
+
+	HZBuffer::resize(p_size);
+	// One packet per TILE_SIZE x TILE_SIZE pixel tile; partially covered tiles still get a whole packet.
+	packs_size = Size2i(Math::ceil(p_size.x / (float)TILE_SIZE), Math::ceil(p_size.y / (float)TILE_SIZE));
+	const int tile_count = packs_size.x * packs_size.y;
+	camera_rays.resize(tile_count);
+	camera_ray_masks.resize(tile_count * TILE_SIZE * TILE_SIZE);
+}
+
+void RaycastOcclusionCull::RaycastHZBuffer::update_camera_rays(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_work_pool) {
+	// Job description handed to every worker invocation; the workers only read from it.
+	CameraRayThreadData job;
+	job.thread_count = p_thread_work_pool.get_thread_count();
+	job.camera_orthogonal = p_cam_orthogonal;
+	job.camera_transform = p_cam_transform;
+	job.camera_matrix = p_cam_projection;
+	p_thread_work_pool.do_work(job.thread_count, this, &RaycastHZBuffer::_camera_rays_threaded, &job);
+}
+
+void RaycastOcclusionCull::RaycastHZBuffer::_camera_rays_threaded(uint32_t p_thread, RaycastOcclusionCull::RaycastHZBuffer::CameraRayThreadData *p_data) {
+	const uint32_t worker_count = p_data->thread_count;
+	const uint32_t packet_count = camera_rays.size();
+	const uint32_t first = p_thread * packet_count / worker_count;
+	const uint32_t last = (p_thread + 1 == worker_count) ? packet_count : ((p_thread + 1) * packet_count / worker_count); // The final worker absorbs the rounding remainder.
+	_generate_camera_rays(p_data->camera_transform, p_data->camera_matrix, p_data->camera_orthogonal, first, last);
+}
+
+void RaycastOcclusionCull::RaycastHZBuffer::_generate_camera_rays(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, int p_from, int p_to) { // Fills ray packets [p_from, p_to) and their per-ray masks for the given camera.
+	Size2i buffer_size = sizes[0];
+
+	CameraMatrix inv_camera_matrix = p_cam_projection.inverse();
+	float z_far = p_cam_projection.get_z_far() * 1.05f;
+	debug_tex_range = z_far; // Every worker stores the same value here.
+
+	RayPacket *ray_packets = camera_rays.ptr();
+	uint32_t *ray_masks = camera_ray_masks.ptr();
+
+	for (int i = p_from; i < p_to; i++) {
+		RayPacket &packet = ray_packets[i];
+		int tile_x = (i % packs_size.x) * TILE_SIZE;
+		int tile_y = (i / packs_size.x) * TILE_SIZE;
+
+		for (int j = 0; j < TILE_RAYS; j++) {
+			float x = tile_x + j % TILE_SIZE;
+			float y = tile_y + j / TILE_SIZE;
+
+			ray_masks[i * TILE_RAYS + j] = ~0U;
+
+			if (x >= buffer_size.x || y >= buffer_size.y) {
+				ray_masks[i * TILE_RAYS + j] = 0U; // Pixel lies outside the buffer: disable this ray.
+			} else {
+				// Map the pixel to [0, 1]. A 1-pixel-wide or -tall buffer would compute 0 / 0 (NaN), so it samples the center instead.
+				float u = buffer_size.x > 1 ? x / (buffer_size.x - 1) : 0.5f;
+				float v = buffer_size.y > 1 ? y / (buffer_size.y - 1) : 0.5f;
+				u = u * 2.0f - 1.0f;
+				v = v * 2.0f - 1.0f;
+				Plane pixel_proj = Plane(u, v, -1.0, 1.0);
+				Plane pixel_view = inv_camera_matrix.xform4(pixel_proj);
+				Vector3 pixel_world = p_cam_transform.xform(pixel_view.normal);
+
+				Vector3 dir;
+				if (p_cam_orthogonal) {
+					dir = -p_cam_transform.basis.get_axis(2);
+				} else {
+					dir = (pixel_world - p_cam_transform.origin).normalized();
+				}
+
+				packet.ray.org_x[j] = pixel_world.x;
+				packet.ray.org_y[j] = pixel_world.y;
+				packet.ray.org_z[j] = pixel_world.z;
+
+				packet.ray.dir_x[j] = dir.x;
+				packet.ray.dir_y[j] = dir.y;
+				packet.ray.dir_z[j] = dir.z;
+
+				packet.ray.tnear[j] = 0.0f;
+
+				packet.ray.time[j] = 0.0f;
+
+				packet.ray.flags[j] = 0;
+				packet.ray.mask[j] = -1;
+				packet.hit.geomID[j] = RTC_INVALID_GEOMETRY_ID;
+			}
+
+			packet.ray.tfar[j] = z_far; // Written for masked-out rays too, so sort_rays() always reads a defined distance.
+		}
+	}
+}
+
+void RaycastOcclusionCull::RaycastHZBuffer::sort_rays() {
+	if (is_empty()) {
+		return;
+	}
+
+	// Scatter each packet's tile-ordered tfar values back into the row-major mip 0 image.
+	const Size2i extent = sizes[0];
+	for (int tile_row = 0; tile_row < packs_size.y; tile_row++) {
+		for (int tile_col = 0; tile_col < packs_size.x; tile_col++) {
+			const int packet_index = tile_row * packs_size.x + tile_col;
+			const int origin_x = tile_col * TILE_SIZE;
+			const int origin_y = tile_row * TILE_SIZE;
+			for (int ray = 0; ray < TILE_RAYS; ray++) {
+				const int px = origin_x + ray % TILE_SIZE;
+				const int py = origin_y + ray / TILE_SIZE;
+				// Tiles on the right and bottom edges can extend past the buffer; those rays have no texel.
+				if (px < extent.x && py < extent.y) {
+					mips[0][py * extent.x + px] = camera_rays[packet_index].ray.tfar[ray];
+				}
+			}
+		}
+	}
+}
+
+////////////////////////////////////////////////////////
+
+bool RaycastOcclusionCull::is_occluder(RID p_rid) { // Reports whether p_rid is owned by this object's occluder_owner.
+	return occluder_owner.owns(p_rid);
+}
+
+RID RaycastOcclusionCull::occluder_allocate() { // Allocates an RID only; occluder_initialize() is what attaches an Occluder object to an RID.
+	return occluder_owner.allocate_rid();
+}
+
+void RaycastOcclusionCull::occluder_initialize(RID p_occluder) { // Creates a default Occluder and registers it under p_occluder (presumably an RID from occluder_allocate()).
+	Occluder *occluder = memnew(Occluder);
+	occluder_owner.initialize_rid(p_occluder, occluder);
+}
+
+void RaycastOcclusionCull::occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) {
+	Occluder *occluder = occluder_owner.getornull(p_occluder);
+	ERR_FAIL_COND(!occluder);
+
+	occluder->indices = p_indices;
+	occluder->vertices = p_vertices;
+	// Queue every instance that uses this occluder so its transformed copy of the mesh gets rebuilt.
+	for (Set<InstanceID>::Element *E = occluder->users.front(); E; E = E->next()) {
+		const RID scenario_rid = E->get().scenario;
+		const RID instance_rid = E->get().instance;
+		ERR_CONTINUE(!scenarios.has(scenario_rid));
+		Scenario &scenario = scenarios[scenario_rid];
+		ERR_CONTINUE(!scenario.instances.has(instance_rid));
+		if (scenario.dirty_instances.has(instance_rid)) {
+			continue; // Already queued.
+		}
+		scenario.dirty_instances.insert(instance_rid);
+		scenario.dirty_instances_array.push_back(instance_rid);
+	}
+}
+
+void RaycastOcclusionCull::free_occluder(RID p_occluder) { // Destroys the Occluder stored under p_occluder and frees the RID; reports an error and returns if the RID is unknown.
+	Occluder *occluder = occluder_owner.getornull(p_occluder);
+	ERR_FAIL_COND(!occluder);
+	memdelete(occluder); // NOTE(review): instances still listed in occluder->users are not marked dirty here — confirm that callers handle this.
+	occluder_owner.free(p_occluder);
+}
+
+////////////////////////////////////////////////////////
+
+void RaycastOcclusionCull::add_scenario(RID p_scenario) {
+	if (!scenarios.has(p_scenario)) {
+		scenarios[p_scenario] = Scenario();
+		return;
+	}
+	scenarios[p_scenario].removed = false; // Still present (e.g. flagged by remove_scenario()): cancel the pending removal.
+}
+
+void RaycastOcclusionCull::remove_scenario(RID p_scenario) {
+	ERR_FAIL_COND(!scenarios.has(p_scenario));
+	// Only flag it here; buffer_update() erases the entry once Scenario::update() reports the removal.
+	scenarios[p_scenario].removed = true;
+}
+
+void RaycastOcclusionCull::scenario_set_instance(RID p_scenario, RID p_instance, RID p_occluder, const Transform &p_xform, bool p_enabled) { // Creates or updates an occluder instance and queues whatever rebuild the change requires.
+	ERR_FAIL_COND(!scenarios.has(p_scenario));
+	Scenario &scenario = scenarios[p_scenario];
+	if (!scenario.instances.has(p_instance)) {
+		scenario.instances[p_instance] = OccluderInstance();
+	}
+	OccluderInstance &instance = scenario.instances[p_instance];
+	bool changed = false;
+
+	if (instance.removed) {
+		// Revived before the pending removal ran: scenario_remove_instance() dropped it from its occluder's users, so re-register it and force a refresh.
+		instance.removed = false;
+		scenario.removed_instances.erase(p_instance);
+		Occluder *revived_occluder = occluder_owner.getornull(instance.occluder);
+		if (revived_occluder) {
+			revived_occluder->users.insert(InstanceID(p_scenario, p_instance));
+		}
+		changed = true;
+	}
+
+	if (instance.occluder != p_occluder) {
+		Occluder *old_occluder = occluder_owner.getornull(instance.occluder);
+		if (old_occluder) {
+			old_occluder->users.erase(InstanceID(p_scenario, p_instance));
+		}
+		instance.occluder = p_occluder;
+		if (p_occluder.is_valid()) {
+			Occluder *occluder = occluder_owner.getornull(p_occluder);
+			ERR_FAIL_COND(!occluder);
+			occluder->users.insert(InstanceID(p_scenario, p_instance));
+		}
+		changed = true;
+	}
+
+	if (instance.xform != p_xform) {
+		instance.xform = p_xform;
+		changed = true;
+	}
+	if (instance.enabled != p_enabled) {
+		instance.enabled = p_enabled;
+		scenario.dirty = true; // The scenario needs a scene re-build, but the instance doesn't need update
+	}
+
+	if (changed && !scenario.dirty_instances.has(p_instance)) {
+		scenario.dirty_instances.insert(p_instance);
+		scenario.dirty_instances_array.push_back(p_instance);
+		scenario.dirty = true;
+	}
+}
+
+void RaycastOcclusionCull::scenario_remove_instance(RID p_scenario, RID p_instance) {
+	ERR_FAIL_COND(!scenarios.has(p_scenario));
+	Scenario &scenario = scenarios[p_scenario];
+	OccluderInstance *target = scenario.instances.getptr(p_instance);
+	if (!target || target->removed) {
+		return; // Unknown instance, or already queued for removal.
+	}
+
+	// Detach from the occluder so that later mesh changes no longer reach this instance.
+	Occluder *used_occluder = occluder_owner.getornull(target->occluder);
+	if (used_occluder) {
+		used_occluder->users.erase(InstanceID(p_scenario, p_instance));
+	}
+
+	// The map entry itself is erased later, by Scenario::update().
+	scenario.removed_instances.push_back(p_instance);
+	target->removed = true;
+}
+
+void RaycastOcclusionCull::Scenario::_update_dirty_instance_thread(int p_idx, RID *p_instances) { // Work-pool adapter: updates one instance without a thread pool, so its vertex transform is not split further.
+	_update_dirty_instance(p_idx, p_instances, nullptr);
+}
+
+void RaycastOcclusionCull::Scenario::_update_dirty_instance(int p_idx, RID *p_instances, ThreadWorkPool *p_thread_pool) {
+	// Skip entries whose instance or occluder can no longer be found.
+	OccluderInstance *inst = instances.getptr(p_instances[p_idx]);
+	if (!inst) {
+		return;
+	}
+	Occluder *mesh = raycast_singleton->occluder_owner.getornull(inst->occluder);
+	if (!mesh) {
+		return;
+	}
+
+	// Embree requires the last element to be readable by a 16-byte SSE load instruction, so one padding vertex is appended.
+	const int vertex_count = mesh->vertices.size();
+	inst->xformed_vertices.resize(vertex_count + 1);
+	const Vector3 *src = mesh->vertices.ptr();
+	Vector3 *dst = inst->xformed_vertices.ptr();
+
+	const bool split_across_threads = p_thread_pool && vertex_count > 1024;
+	if (!split_across_threads) {
+		_transform_vertices_range(src, dst, inst->xform, 0, vertex_count);
+	} else {
+		// Large mesh and a pool is available: each pool thread transforms one slice of the vertices.
+		TransformThreadData job;
+		job.thread_count = p_thread_pool->get_thread_count();
+		job.vertex_count = vertex_count;
+		job.xform = inst->xform;
+		job.read = src;
+		job.write = dst;
+		p_thread_pool->do_work(job.thread_count, this, &Scenario::_transform_vertices_thread, &job);
+	}
+
+	// Indices need no transform and are copied byte for byte.
+	const int index_count = mesh->indices.size();
+	inst->indices.resize(index_count);
+	memcpy(inst->indices.ptr(), mesh->indices.ptr(), index_count * sizeof(int32_t));
+}
+
+void RaycastOcclusionCull::Scenario::_transform_vertices_thread(uint32_t p_thread, TransformThreadData *p_data) {
+	const uint32_t worker_count = p_data->thread_count;
+	const uint32_t vertex_count = p_data->vertex_count;
+	const uint32_t first = p_thread * vertex_count / worker_count;
+	const uint32_t last = (p_thread + 1 == worker_count) ? vertex_count : ((p_thread + 1) * vertex_count / worker_count); // The final worker absorbs the rounding remainder.
+	_transform_vertices_range(p_data->read, p_data->write, p_data->xform, first, last);
+}
+
+void RaycastOcclusionCull::Scenario::_transform_vertices_range(const Vector3 *p_read, Vector3 *p_write, const Transform &p_xform, int p_from, int p_to) {
+	for (int vertex = p_from; vertex < p_to; ++vertex) { // Half-open range [p_from, p_to).
+		p_write[vertex] = p_xform.xform(p_read[vertex]);
+	}
+}
+
+void RaycastOcclusionCull::Scenario::_commit_scene(void *p_ud) {
+	// Entry point of commit_thread: commits the scene that is not current, then flags completion for update().
+	Scenario *self = static_cast<Scenario *>(p_ud);
+	rtcCommitScene(self->ebr_scene[1 - self->current_scene_idx]);
+	self->commit_done = true;
+}
+
+bool RaycastOcclusionCull::Scenario::update(ThreadWorkPool &p_thread_pool) { // Returns true only when the scenario was flagged as removed and has released its resources; false otherwise.
+	ERR_FAIL_COND_V(singleton == nullptr, false);
+
+	if (commit_thread == nullptr) {
+		commit_thread = memnew(Thread);
+	}
+	if (commit_thread->is_started()) {
+		if (commit_done) {
+			commit_thread->wait_to_finish();
+			current_scene_idx = 1 - current_scene_idx; // The freshly committed scene becomes the one used for raycasts.
+		} else {
+			return false; // A commit is still running; try again on a later call.
+		}
+	}
+
+	if (removed) {
+		// No commit is running at this point, so the scenes and the thread object can be freed; the caller erases this scenario next.
+		for (int i = 0; i < 2; i++) {
+			if (ebr_scene[i]) {
+				rtcReleaseScene(ebr_scene[i]);
+				ebr_scene[i] = nullptr;
+			}
+		}
+		memdelete(commit_thread);
+		commit_thread = nullptr;
+		return true;
+	}
+
+	if (!dirty && removed_instances.is_empty() && dirty_instances_array.is_empty()) {
+		return false;
+	}
+
+	for (unsigned int i = 0; i < removed_instances.size(); i++) {
+		instances.erase(removed_instances[i]);
+	}
+
+	if (dirty_instances_array.size() / p_thread_pool.get_thread_count() > 128) {
+		// Lots of instances, use per-instance threading
+		p_thread_pool.do_work(dirty_instances_array.size(), this, &Scenario::_update_dirty_instance_thread, dirty_instances_array.ptr());
+	} else {
+		// Few instances, use threading on the vertex transforms
+		for (unsigned int i = 0; i < dirty_instances_array.size(); i++) {
+			_update_dirty_instance(i, dirty_instances_array.ptr(), &p_thread_pool);
+		}
+	}
+
+	dirty_instances.clear();
+	dirty_instances_array.clear();
+	removed_instances.clear();
+
+	if (raycast_singleton->ebr_device == nullptr) {
+		raycast_singleton->_init_embree();
+	}
+
+	int next_scene_idx = 1 - current_scene_idx;
+	RTCScene &next_scene = ebr_scene[next_scene_idx];
+	if (next_scene) {
+		rtcReleaseScene(next_scene);
+	}
+
+	next_scene = rtcNewScene(raycast_singleton->ebr_device);
+	rtcSetSceneBuildQuality(next_scene, RTCBuildQuality(raycast_singleton->build_quality));
+
+	const RID *inst_rid = nullptr;
+	while ((inst_rid = instances.next(inst_rid))) {
+		OccluderInstance *occ_inst = instances.getptr(*inst_rid);
+		Occluder *occ = raycast_singleton->occluder_owner.getornull(occ_inst->occluder);
+		if (!occ || !occ_inst->enabled) {
+			continue;
+		}
+
+		RTCGeometry geom = rtcNewGeometry(raycast_singleton->ebr_device, RTC_GEOMETRY_TYPE_TRIANGLE);
+		rtcSetSharedGeometryBuffer(geom, RTC_BUFFER_TYPE_VERTEX, 0, RTC_FORMAT_FLOAT3, occ_inst->xformed_vertices.ptr(), 0, sizeof(Vector3), occ_inst->xformed_vertices.size());
+		rtcSetSharedGeometryBuffer(geom, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT3, occ_inst->indices.ptr(), 0, sizeof(uint32_t) * 3, occ_inst->indices.size() / 3);
+		rtcCommitGeometry(geom);
+		rtcAttachGeometry(next_scene, geom);
+		rtcReleaseGeometry(geom);
+	}
+
+	dirty = false;
+	commit_done = false;
+	commit_thread->start(&Scenario::_commit_scene, this);
+	return false;
+}
+
+void RaycastOcclusionCull::Scenario::_raycast(uint32_t p_idx, const RaycastThreadData *p_raycast_data) const {
+	RTCIntersectContext context;
+	rtcInitIntersectContext(&context);
+	context.flags = RTC_INTERSECT_CONTEXT_FLAG_COHERENT;
+	const uint32_t *packet_masks = p_raycast_data->masks + p_idx * TILE_RAYS; // TILE_RAYS mask words belong to each packet.
+	rtcIntersect16((const int *)packet_masks, ebr_scene[current_scene_idx], &context, p_raycast_data->rays + p_idx);
+}
+
+void RaycastOcclusionCull::Scenario::raycast(LocalVector<RayPacket> &r_rays, const LocalVector<uint32_t> p_valid_masks, ThreadWorkPool &p_thread_pool) const {
+	ERR_FAIL_COND(singleton == nullptr);
+	// Embree is initialized on demand when there is some scenario with occluders in it,
+	// and the current scene stays null until a first commit has been swapped in.
+	const bool embree_ready = raycast_singleton->ebr_device != nullptr;
+	if (!embree_ready || !ebr_scene[current_scene_idx]) {
+		return;
+	}
+
+	// NOTE(review): p_valid_masks is taken by value, so the mask vector is copied on every call.
+	RaycastThreadData job;
+	job.masks = p_valid_masks.ptr();
+	job.rays = r_rays.ptr();
+	// One work item per ray packet.
+	p_thread_pool.do_work(r_rays.size(), this, &Scenario::_raycast, &job);
+}
+
+////////////////////////////////////////////////////////
+
+void RaycastOcclusionCull::add_buffer(RID p_buffer) { // Registers a default-constructed buffer under p_buffer; registering the same RID twice is reported as an error.
+	ERR_FAIL_COND(buffers.has(p_buffer));
+	buffers[p_buffer] = RaycastHZBuffer();
+}
+
+void RaycastOcclusionCull::remove_buffer(RID p_buffer) { // Erases the buffer stored under p_buffer; an unknown RID is reported as an error.
+	ERR_FAIL_COND(!buffers.has(p_buffer));
+	buffers.erase(p_buffer);
+}
+
+void RaycastOcclusionCull::buffer_set_scenario(RID p_buffer, RID p_scenario) { // Points the buffer at a scenario; an invalid p_scenario is accepted and stored as-is.
+	ERR_FAIL_COND(!buffers.has(p_buffer));
+	ERR_FAIL_COND(p_scenario.is_valid() && !scenarios.has(p_scenario)); // A valid RID must refer to a registered scenario.
+	buffers[p_buffer].scenario_rid = p_scenario;
+}
+
+void RaycastOcclusionCull::buffer_set_size(RID p_buffer, const Vector2i &p_size) { // Forwards to RaycastHZBuffer::resize(), which clears the buffer for a zero size and ignores an unchanged size.
+	ERR_FAIL_COND(!buffers.has(p_buffer));
+	buffers[p_buffer].resize(p_size);
+}
+
+void RaycastOcclusionCull::buffer_update(RID p_buffer, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_pool) {
+	RaycastHZBuffer *target = buffers.getptr(p_buffer);
+	if (!target) {
+		return;
+	}
+	if (target->is_empty()) {
+		return; // Nothing to fill.
+	}
+
+	Scenario *source = scenarios.getptr(target->scenario_rid);
+	if (!source) {
+		return;
+	}
+
+	// update() reports true once a scenario flagged for removal has released its scenes; drop it then.
+	if (source->update(p_thread_pool)) {
+		scenarios.erase(target->scenario_rid);
+		return;
+	}
+
+	// Regenerate the camera rays, trace them, copy the distances into mip 0 and refresh the remaining mips.
+	target->update_camera_rays(p_cam_transform, p_cam_projection, p_cam_orthogonal, p_thread_pool);
+	source->raycast(target->camera_rays, target->camera_ray_masks, p_thread_pool);
+	target->sort_rays();
+	target->update_mips();
+}
+
+RaycastOcclusionCull::HZBuffer *RaycastOcclusionCull::buffer_get_ptr(RID p_buffer) {
+	// Returns the buffer registered under p_buffer as its HZBuffer base, or nullptr when the RID is unknown.
+	// No error is raised for unknown RIDs.
+	RaycastHZBuffer *found = buffers.getptr(p_buffer);
+	return found;
+}
+
+RID RaycastOcclusionCull::buffer_get_debug_texture(RID p_buffer) { // Returns the buffer's debug texture RID, or an empty RID (with an error) when p_buffer is unknown.
+	ERR_FAIL_COND_V(!buffers.has(p_buffer), RID());
+	return buffers[p_buffer].get_debug_texture();
+}
+
+////////////////////////////////////////////////////////
+
+void RaycastOcclusionCull::set_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality) {
+	if (build_quality == p_quality) {
+		return; // Unchanged: keep the existing scenes.
+	}
+	build_quality = p_quality;
+
+	// The quality is only applied when a scene is created, so every scenario is flagged for a rebuild.
+	for (const RID *key = scenarios.next(nullptr); key; key = scenarios.next(key)) {
+		Scenario &scenario = scenarios[*key];
+		scenario.dirty = true;
+	}
+}
+
+void RaycastOcclusionCull::_init_embree() {
+#ifdef __SSE2__
+	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+	_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+#endif
+	// Ask Embree for two threads fewer than the reported processor count, but never fewer than one.
+	const String device_config = vformat("threads=%d", MAX(1, OS::get_singleton()->get_processor_count() - 2));
+	ebr_device = rtcNewDevice(device_config.utf8().ptr());
+}
+
+RaycastOcclusionCull::RaycastOcclusionCull() { // Registers this object as raycast_singleton and loads the default build quality; the Embree device is not created here.
+	raycast_singleton = this;
+	int default_quality = GLOBAL_GET("rendering/occlusion_culling/bvh_build_quality"); // Project setting, read as an int.
+	build_quality = RS::ViewportOcclusionCullingBuildQuality(default_quality);
+}
+
+RaycastOcclusionCull::~RaycastOcclusionCull() { // Joins and frees every scenario's commit thread, releases the Embree device if one was created, and clears the singleton pointer.
+	const RID *scenario_rid = nullptr;
+	while ((scenario_rid = scenarios.next(scenario_rid))) {
+		Scenario &scenario = scenarios[*scenario_rid];
+		if (scenario.commit_thread) {
+			scenario.commit_thread->wait_to_finish(); // Wait for the commit thread before the Thread object is deleted.
+			memdelete(scenario.commit_thread);
+		} // NOTE(review): scenario.ebr_scene[] is not released in this loop — confirm whether the scenes should be freed before the device.
+	}
+
+	if (ebr_device != nullptr) {
+#ifdef __SSE2__
+		_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // Reverts the mode switched on by _init_embree().
+		_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF); // Reverts the mode switched on by _init_embree().
+#endif
+		rtcReleaseDevice(ebr_device);
+	}
+
+	raycast_singleton = nullptr;
+}
diff --git a/modules/raycast/raycast_occlusion_cull.h b/modules/raycast/raycast_occlusion_cull.h
new file mode 100644
index 0000000000..acaceb9459
--- /dev/null
+++ b/modules/raycast/raycast_occlusion_cull.h
@@ -0,0 +1,184 @@
+/*************************************************************************/
+/*  raycast_occlusion_cull.h                                             */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifndef OCCLUSION_CULL_RAYCASTER_H
+#define OCCLUSION_CULL_RAYCASTER_H
+
+#include "core/io/image.h"
+#include "core/math/camera_matrix.h"
+#include "core/object/object.h"
+#include "core/object/reference.h"
+#include "core/templates/local_vector.h"
+#include "core/templates/rid_owner.h"
+#include "scene/resources/mesh.h"
+#include "servers/rendering/renderer_scene_occlusion_cull.h"
+
+#include <embree3/rtcore.h>
+
+class RaycastOcclusionCull : public RendererSceneOcclusionCull { // RendererSceneOcclusionCull implementation that fills HZ buffers by tracing camera rays through Embree scenes.
+	typedef RTCRayHit16 RayPacket; // Embree's 16-wide ray + hit packet.
+
+public:
+	class RaycastHZBuffer : public HZBuffer { // HZBuffer whose mip 0 is filled from ray packet results.
+	private:
+		Size2i packs_size; // Number of ray packets (tiles) along x and y.
+
+		struct CameraRayThreadData { // Inputs shared by all workers started from update_camera_rays().
+			CameraMatrix camera_matrix;
+			Transform camera_transform;
+			bool camera_orthogonal;
+			int thread_count;
+			Size2i buffer_size; // NOTE(review): update_camera_rays() in the .cpp never assigns this field — possibly unused.
+		};
+
+		void _camera_rays_threaded(uint32_t p_thread, CameraRayThreadData *p_data); // Worker entry: generates the slice of packets belonging to thread p_thread.
+		void _generate_camera_rays(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, int p_from, int p_to); // Fills packets in [p_from, p_to).
+
+	public:
+		LocalVector<RayPacket> camera_rays; // One packet per tile.
+		LocalVector<uint32_t> camera_ray_masks; // One mask word per ray; 0 disables rays that fall outside the buffer.
+		RID scenario_rid; // Scenario this buffer is traced against; set by buffer_set_scenario().
+
+		virtual void clear() override;
+		virtual void resize(const Size2i &p_size) override;
+		void sort_rays(); // Copies per-ray tfar values from the packets into mip 0.
+		void update_camera_rays(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_work_pool);
+	};
+
+private:
+	struct InstanceID { // (scenario, instance) pair identifying one user of an occluder.
+		RID scenario;
+		RID instance;
+
+		bool operator<(const InstanceID &rhs) const { // Orders by instance first; ties are broken by scenario, compared in the opposite direction.
+			if (instance == rhs.instance) {
+				return rhs.scenario < scenario;
+			}
+			return instance < rhs.instance;
+		}
+
+		InstanceID() {}
+		InstanceID(RID s, RID i) :
+				scenario(s), instance(i) {}
+	};
+
+	struct Occluder { // Source mesh plus the instances that currently reference it.
+		PackedVector3Array vertices;
+		PackedInt32Array indices;
+		Set<InstanceID> users;
+	};
+
+	struct OccluderInstance { // Per-instance copy of an occluder mesh, transformed by xform.
+		RID occluder;
+		LocalVector<uint32_t> indices;
+		LocalVector<Vector3> xformed_vertices; // Holds one extra padding element beyond the occluder's vertex count.
+		Transform xform;
+		bool enabled = true;
+		bool removed = false; // Set by scenario_remove_instance(); the entry is erased later by Scenario::update().
+	};
+
+	struct Scenario {
+		struct RaycastThreadData { // Inputs shared by all _raycast() work items.
+			RayPacket *rays;
+			const uint32_t *masks;
+		};
+
+		struct TransformThreadData { // Inputs shared by all _transform_vertices_thread() workers.
+			uint32_t thread_count;
+			uint32_t vertex_count;
+			Transform xform;
+			const Vector3 *read;
+			Vector3 *write;
+		};
+
+		Thread *commit_thread = nullptr; // Created lazily by update(); runs _commit_scene().
+		bool commit_done = true; // Written by _commit_scene() and polled by update(); NOTE(review): plain bool, no synchronization is visible here.
+		bool dirty = false; // Forces a scene rebuild on the next update().
+		bool removed = false; // Set by remove_scenario(); update() then releases the scenes and returns true.
+
+		RTCScene ebr_scene[2] = { nullptr, nullptr }; // The scene at current_scene_idx is raycast against while the other one is rebuilt.
+		int current_scene_idx = 0;
+
+		HashMap<RID, OccluderInstance> instances;
+		Set<RID> dirty_instances; // To avoid duplicates
+		LocalVector<RID> dirty_instances_array; // To iterate and split into threads
+		LocalVector<RID> removed_instances; // Instances waiting to be erased by update().
+
+		void _update_dirty_instance_thread(int p_idx, RID *p_instances);
+		void _update_dirty_instance(int p_idx, RID *p_instances, ThreadWorkPool *p_thread_pool);
+		void _transform_vertices_thread(uint32_t p_thread, TransformThreadData *p_data);
+		void _transform_vertices_range(const Vector3 *p_read, Vector3 *p_write, const Transform &p_xform, int p_from, int p_to);
+		static void _commit_scene(void *p_ud); // Thread entry point; p_ud is the Scenario.
+		bool update(ThreadWorkPool &p_thread_pool); // Returns true only when the scenario was flagged as removed.
+
+		void _raycast(uint32_t p_thread, const RaycastThreadData *p_raycast_data) const; // NOTE: the definition names the first parameter p_idx; it is used as a packet index.
+		void raycast(LocalVector<RayPacket> &r_rays, const LocalVector<uint32_t> p_valid_masks, ThreadWorkPool &p_thread_pool) const; // NOTE(review): p_valid_masks is passed by value, i.e. copied per call.
+	};
+
+	static RaycastOcclusionCull *raycast_singleton; // Set in the constructor, cleared in the destructor.
+
+	static const int TILE_SIZE = 4; // Tiles are TILE_SIZE x TILE_SIZE pixels.
+	static const int TILE_RAYS = TILE_SIZE * TILE_SIZE; // 16 rays per tile, matching the 16-wide RayPacket.
+
+	RTCDevice ebr_device = nullptr; // Created on demand by _init_embree().
+	RID_PtrOwner<Occluder> occluder_owner;
+	HashMap<RID, Scenario> scenarios;
+	HashMap<RID, RaycastHZBuffer> buffers;
+	RS::ViewportOcclusionCullingBuildQuality build_quality; // Initialized in the constructor from a project setting.
+
+	void _init_embree();
+
+public:
+	virtual bool is_occluder(RID p_rid) override;
+	virtual RID occluder_allocate() override;
+	virtual void occluder_initialize(RID p_occluder) override;
+	virtual void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) override;
+	virtual void free_occluder(RID p_occluder) override;
+
+	virtual void add_scenario(RID p_scenario) override;
+	virtual void remove_scenario(RID p_scenario) override;
+	virtual void scenario_set_instance(RID p_scenario, RID p_instance, RID p_occluder, const Transform &p_xform, bool p_enabled) override;
+	virtual void scenario_remove_instance(RID p_scenario, RID p_instance) override;
+
+	virtual void add_buffer(RID p_buffer) override;
+	virtual void remove_buffer(RID p_buffer) override;
+	virtual HZBuffer *buffer_get_ptr(RID p_buffer) override;
+	virtual void buffer_set_scenario(RID p_buffer, RID p_scenario) override;
+	virtual void buffer_set_size(RID p_buffer, const Vector2i &p_size) override;
+	virtual void buffer_update(RID p_buffer, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_pool) override;
+	virtual RID buffer_get_debug_texture(RID p_buffer) override;
+
+	virtual void set_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality) override;
+
+	RaycastOcclusionCull();
+	~RaycastOcclusionCull();
+};
+
+#endif // OCCLUSION_CULL_RAYCASTER_H
diff --git a/modules/raycast/register_types.cpp b/modules/raycast/register_types.cpp
new file mode 100644
index 0000000000..78ca91309f
--- /dev/null
+++ b/modules/raycast/register_types.cpp
@@ -0,0 +1,49 @@
+/*************************************************************************/
+/*  register_types.cpp                                                   */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#include "register_types.h"
+
+#include "lightmap_raycaster.h"
+#include "raycast_occlusion_cull.h"
+
+RaycastOcclusionCull *raycast_occlusion_cull = nullptr; // Created in register_raycast_types() and deleted in unregister_raycast_types().
+
+void register_raycast_types() { // Module setup: installs the Embree lightmap raycaster (tools builds only) and creates the occlusion culler.
+#ifdef TOOLS_ENABLED
+	LightmapRaycasterEmbree::make_default_raycaster();
+#endif
+	raycast_occlusion_cull = memnew(RaycastOcclusionCull);
+}
+
+void unregister_raycast_types() { // Module teardown: deletes the occlusion culler if it was created.
+	if (raycast_occlusion_cull) {
+		memdelete(raycast_occlusion_cull); // NOTE(review): the pointer is not reset to nullptr afterwards.
+	}
+}
diff --git a/core/os/copymem.h b/modules/raycast/register_types.h
index 6fd559356c..789604a491 100644
--- a/core/os/copymem.h
+++ b/modules/raycast/register_types.h
@@ -1,5 +1,5 @@
 /*************************************************************************/
-/*  copymem.h                                                            */
+/*  register_types.h                                                     */
 /*************************************************************************/
 /*                       This file is part of:                           */
 /*                           GODOT ENGINE                                */
@@ -28,23 +28,5 @@
 /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
 /*************************************************************************/
 
-#ifndef COPYMEM_H
-#define COPYMEM_H
-
-#include "core/typedefs.h"
-
-#ifdef PLATFORM_COPYMEM
-
-#include "platform_copymem.h" // included from platform/<current_platform>/platform_copymem.h"
-
-#else
-
-#include <string.h>
-
-#define copymem(to, from, count) memcpy(to, from, count)
-#define zeromem(to, count) memset(to, 0, count)
-#define movemem(to, from, count) memmove(to, from, count)
-
-#endif
-
-#endif // COPYMEM_H
+void register_raycast_types(); // Module setup; defined in register_types.cpp.
+void unregister_raycast_types(); // Module teardown; deletes the occlusion culler created by register_raycast_types().
diff --git a/modules/stb_vorbis/audio_stream_ogg_vorbis.cpp b/modules/stb_vorbis/audio_stream_ogg_vorbis.cpp
index 6732078efc..e8e481de2d 100644
--- a/modules/stb_vorbis/audio_stream_ogg_vorbis.cpp
+++ b/modules/stb_vorbis/audio_stream_ogg_vorbis.cpp
@@ -204,7 +204,7 @@ void AudioStreamOGGVorbis::set_data(const Vector<uint8_t> &p_data) {
 			clear_data();
 
 			data = memalloc(src_data_len);
-			copymem(data, src_datar, src_data_len);
+			memcpy(data, src_datar, src_data_len);
 			data_len = src_data_len;
 
 			break;
@@ -221,7 +221,7 @@ Vector<uint8_t> AudioStreamOGGVorbis::get_data() const {
 		vdata.resize(data_len);
 		{
 			uint8_t *w = vdata.ptrw();
-			copymem(w, data, data_len);
+			memcpy(w, data, data_len);
 		}
 	}
 
diff --git a/modules/theora/video_stream_theora.cpp b/modules/theora/video_stream_theora.cpp
index 19f26c87cd..54f5b3f424 100644
--- a/modules/theora/video_stream_theora.cpp
+++ b/modules/theora/video_stream_theora.cpp
@@ -225,7 +225,7 @@ void VideoStreamPlaybackTheora::set_file(const String &p_file) {
 			/* identify the codec: try theora */
 			if (!theora_p && th_decode_headerin(&ti, &tc, &ts, &op) >= 0) {
 				/* it is theora */
-				copymem(&to, &test, sizeof(test));
+				memcpy(&to, &test, sizeof(test));
 				theora_p = 1;
 			} else if (!vorbis_p && vorbis_synthesis_headerin(&vi, &vc, &op) >= 0) {
 				/* it is vorbis */
@@ -238,7 +238,7 @@ void VideoStreamPlaybackTheora::set_file(const String &p_file) {
 
 					audio_track_skip--;
 				} else {
-					copymem(&vo, &test, sizeof(test));
+					memcpy(&vo, &test, sizeof(test));
 					vorbis_p = 1;
 				}
 			} else {
diff --git a/modules/visual_script/visual_script.cpp b/modules/visual_script/visual_script.cpp
index 6d5fff88d9..05ecdee9fc 100644
--- a/modules/visual_script/visual_script.cpp
+++ b/modules/visual_script/visual_script.cpp
@@ -1537,7 +1537,7 @@ Variant VisualScriptInstance::_call_internal(const StringName &p_method, void *p
 				state->flow_stack_pos = flow_stack_pos;
 				state->stack.resize(p_stack_size);
 				state->pass = p_pass;
-				copymem(state->stack.ptrw(), p_stack, p_stack_size);
+				memcpy(state->stack.ptrw(), p_stack, p_stack_size);
 				// Step 2, run away, return directly.
 				r_error.error = Callable::CallError::CALL_OK;
 
@@ -1802,7 +1802,7 @@ Variant VisualScriptInstance::call(const StringName &p_method, const Variant **p
 		sequence_bits[i] = false; // All starts as false.
 	}
 
-	zeromem(pass_stack, f->pass_stack_size * sizeof(int));
+	memset(pass_stack, 0, f->pass_stack_size * sizeof(int));
 
 	Map<int, VisualScriptNodeInstance *>::Element *E = instances.find(f->node);
 	if (!E) {
diff --git a/modules/webp/image_loader_webp.cpp b/modules/webp/image_loader_webp.cpp
index b304c4824f..6e62840a3e 100644
--- a/modules/webp/image_loader_webp.cpp
+++ b/modules/webp/image_loader_webp.cpp
@@ -68,7 +68,7 @@ static Vector<uint8_t> _webp_lossy_pack(const Ref<Image> &p_image, float p_quali
 	w[1] = 'E';
 	w[2] = 'B';
 	w[3] = 'P';
-	copymem(&w[4], dst_buff, dst_size);
+	memcpy(&w[4], dst_buff, dst_size);
 	free(dst_buff);
 
 	return dst;
diff --git a/modules/websocket/packet_buffer.h b/modules/websocket/packet_buffer.h
index ed756363cf..e99a379767 100644
--- a/modules/websocket/packet_buffer.h
+++ b/modules/websocket/packet_buffer.h
@@ -31,7 +31,6 @@
 #ifndef PACKET_BUFFER_H
 #define PACKET_BUFFER_H
 
-#include "core/os/copymem.h"
 #include "core/templates/ring_buffer.h"
 
 template <class T>
@@ -66,7 +65,7 @@ public:
 		if (p_info) {
 			_Packet p;
 			p.size = p_size;
-			copymem(&p.info, p_info, sizeof(T));
+			memcpy(&p.info, p_info, sizeof(T));
 			_packets.write(p);
 		}
 
@@ -86,7 +85,7 @@ public:
 		ERR_FAIL_COND_V(p_bytes < (int)p.size, ERR_OUT_OF_MEMORY);
 
 		r_read = p.size;
-		copymem(r_info, &p.info, sizeof(T));
+		memcpy(r_info, &p.info, sizeof(T));
 		_payload.read(r_payload, p.size);
 		return OK;
 	}
diff --git a/modules/websocket/websocket_multiplayer_peer.cpp b/modules/websocket/websocket_multiplayer_peer.cpp
index 758ed66c80..369d53bca4 100644
--- a/modules/websocket/websocket_multiplayer_peer.cpp
+++ b/modules/websocket/websocket_multiplayer_peer.cpp
@@ -168,10 +168,10 @@ Vector<uint8_t> WebSocketMultiplayerPeer::_make_pkt(uint8_t p_type, int32_t p_fr
 	out.resize(PROTO_SIZE + p_data_size);
 
 	uint8_t *w = out.ptrw();
-	copymem(&w[0], &p_type, 1);
-	copymem(&w[1], &p_from, 4);
-	copymem(&w[5], &p_to, 4);
-	copymem(&w[PROTO_SIZE], p_data, p_data_size);
+	memcpy(&w[0], &p_type, 1);
+	memcpy(&w[1], &p_from, 4);
+	memcpy(&w[5], &p_to, 4);
+	memcpy(&w[PROTO_SIZE], p_data, p_data_size);
 
 	return out;
 }
@@ -211,7 +211,7 @@ void WebSocketMultiplayerPeer::_store_pkt(int32_t p_source, int32_t p_dest, cons
 	packet.size = p_data_size;
 	packet.source = p_source;
 	packet.destination = p_dest;
-	copymem(packet.data, &p_data[PROTO_SIZE], p_data_size);
+	memcpy(packet.data, &p_data[PROTO_SIZE], p_data_size);
 	_incoming_packets.push_back(packet);
 	emit_signal("peer_packet", p_source);
 }
@@ -263,9 +263,9 @@ void WebSocketMultiplayerPeer::_process_multiplayer(Ref<WebSocketPeer> p_peer, u
 	uint8_t type = 0;
 	uint32_t from = 0;
 	int32_t to = 0;
-	copymem(&type, in_buffer, 1);
-	copymem(&from, &in_buffer[1], 4);
-	copymem(&to, &in_buffer[5], 4);
+	memcpy(&type, in_buffer, 1);
+	memcpy(&from, &in_buffer[1], 4);
+	memcpy(&to, &in_buffer[5], 4);
 
 	if (is_server()) { // Server can resend
 
@@ -299,7 +299,7 @@ void WebSocketMultiplayerPeer::_process_multiplayer(Ref<WebSocketPeer> p_peer, u
 		// System message
 		ERR_FAIL_COND(data_size < 4);
 		int id = 0;
-		copymem(&id, &in_buffer[PROTO_SIZE], 4);
+		memcpy(&id, &in_buffer[PROTO_SIZE], 4);
 
 		switch (type) {
 			case SYS_ADD: // Add peer
diff --git a/modules/webxr/native/library_godot_webxr.js b/modules/webxr/native/library_godot_webxr.js
index 8e9ef8a73c..6e19a8ac6e 100644
--- a/modules/webxr/native/library_godot_webxr.js
+++ b/modules/webxr/native/library_godot_webxr.js
@@ -71,10 +71,8 @@ const GodotWebXR = {
 			// enabled or disabled. When using the WebXR API Emulator, this
 			// gets picked up automatically, however, in the Oculus Browser
 			// on the Quest, we need to pause and resume the main loop.
-			Browser.pauseAsyncCallbacks();
 			Browser.mainLoop.pause();
 			window.setTimeout(function () {
-				Browser.resumeAsyncCallbacks();
 				Browser.mainLoop.resume();
 			}, 0);
 		},
diff --git a/platform/android/export/export.cpp b/platform/android/export/export.cpp
index ba92b81b49..cd3f00f935 100644
--- a/platform/android/export/export.cpp
+++ b/platform/android/export/export.cpp
@@ -2247,7 +2247,7 @@ public:
 				}
 				r_command_line_flags.resize(base + 4 + length);
 				encode_uint32(length, &r_command_line_flags.write[base]);
-				copymem(&r_command_line_flags.write[base + 4], command_line_argument.ptr(), length);
+				memcpy(&r_command_line_flags.write[base + 4], command_line_argument.ptr(), length);
 			}
 		}
 	}
diff --git a/platform/android/java/app/config.gradle b/platform/android/java/app/config.gradle
index ad9a19e2af..b278d15bdf 100644
--- a/platform/android/java/app/config.gradle
+++ b/platform/android/java/app/config.gradle
@@ -3,7 +3,7 @@ ext.versions = [
     compileSdk         : 29,
     minSdk             : 18,
     targetSdk          : 29,
-    buildTools         : '30.0.1',
+    buildTools         : '30.0.3',
     supportCoreUtils   : '1.0.0',
     kotlinVersion      : '1.4.10',
     v4Support          : '1.0.0',
diff --git a/platform/javascript/http_client_javascript.cpp b/platform/javascript/http_client_javascript.cpp
index b79c965854..a6cf4b0eb8 100644
--- a/platform/javascript/http_client_javascript.cpp
+++ b/platform/javascript/http_client_javascript.cpp
@@ -209,7 +209,7 @@ PackedByteArray HTTPClient::read_response_body_chunk() {
 		return chunk;
 	}
 	chunk.resize(read);
-	copymem(chunk.ptrw(), response_buffer.ptr(), read);
+	memcpy(chunk.ptrw(), response_buffer.ptr(), read);
 	return chunk;
 }
 
diff --git a/platform/osx/export/export.cpp b/platform/osx/export/export.cpp
index aca9471849..51204bc8f6 100644
--- a/platform/osx/export/export.cpp
+++ b/platform/osx/export/export.cpp
@@ -215,7 +215,7 @@ void _rgba8_to_packbits_encode(int p_ch, int p_size, Vector<uint8_t> &p_source,
 			if ((p_source.ptr()[(i + 1) * 4 + p_ch] == cur) && (p_source.ptr()[(i + 2) * 4 + p_ch] == cur)) {
 				if (buf_size > 0) {
 					result.write[res_size++] = (uint8_t)(buf_size - 1);
-					copymem(&result.write[res_size], &buf, buf_size);
+					memcpy(&result.write[res_size], &buf, buf_size);
 					res_size += buf_size;
 					buf_size = 0;
 				}
@@ -241,7 +241,7 @@ void _rgba8_to_packbits_encode(int p_ch, int p_size, Vector<uint8_t> &p_source,
 				buf[buf_size++] = cur;
 				if (buf_size == 128) {
 					result.write[res_size++] = (uint8_t)(buf_size - 1);
-					copymem(&result.write[res_size], &buf, buf_size);
+					memcpy(&result.write[res_size], &buf, buf_size);
 					res_size += buf_size;
 					buf_size = 0;
 				}
@@ -249,7 +249,7 @@ void _rgba8_to_packbits_encode(int p_ch, int p_size, Vector<uint8_t> &p_source,
 		} else {
 			buf[buf_size++] = cur;
 			result.write[res_size++] = (uint8_t)(buf_size - 1);
-			copymem(&result.write[res_size], &buf, buf_size);
+			memcpy(&result.write[res_size], &buf, buf_size);
 			res_size += buf_size;
 			buf_size = 0;
 		}
@@ -259,7 +259,7 @@ void _rgba8_to_packbits_encode(int p_ch, int p_size, Vector<uint8_t> &p_source,
 
 	int ofs = p_dest.size();
 	p_dest.resize(p_dest.size() + res_size);
-	copymem(&p_dest.write[ofs], result.ptr(), res_size);
+	memcpy(&p_dest.write[ofs], result.ptr(), res_size);
 }
 
 void EditorExportPlatformOSX::_make_icon(const Ref<Image> &p_icon, Vector<uint8_t> &p_data) {
@@ -318,7 +318,7 @@ void EditorExportPlatformOSX::_make_icon(const Ref<Image> &p_icon, Vector<uint8_
 			memdelete(f);
 			len += 8;
 			len = BSWAP32(len);
-			copymem(&data.write[ofs], icon_infos[i].name, 4);
+			memcpy(&data.write[ofs], icon_infos[i].name, 4);
 			encode_uint32(len, &data.write[ofs + 4]);
 
 			// Clean up generated file.
@@ -338,7 +338,7 @@ void EditorExportPlatformOSX::_make_icon(const Ref<Image> &p_icon, Vector<uint8_
 
 				int len = data.size() - ofs;
 				len = BSWAP32(len);
-				copymem(&data.write[ofs], icon_infos[i].name, 4);
+				memcpy(&data.write[ofs], icon_infos[i].name, 4);
 				encode_uint32(len, &data.write[ofs + 4]);
 			}
 
@@ -353,7 +353,7 @@ void EditorExportPlatformOSX::_make_icon(const Ref<Image> &p_icon, Vector<uint8_
 				}
 				len += 8;
 				len = BSWAP32(len);
-				copymem(&data.write[ofs], icon_infos[i].mask_name, 4);
+				memcpy(&data.write[ofs], icon_infos[i].mask_name, 4);
 				encode_uint32(len, &data.write[ofs + 4]);
 			}
 		}
diff --git a/platform/uwp/export/export.cpp b/platform/uwp/export/export.cpp
index 800a728033..217c119978 100644
--- a/platform/uwp/export/export.cpp
+++ b/platform/uwp/export/export.cpp
@@ -1336,7 +1336,7 @@ public:
 			int base = clf.size();
 			clf.resize(base + 4 + txt.length());
 			encode_uint32(txt.length(), &clf.write[base]);
-			copymem(&clf.write[base + 4], txt.ptr(), txt.length());
+			memcpy(&clf.write[base + 4], txt.ptr(), txt.length());
 			print_line(itos(i) + " param: " + cl[i]);
 		}
 
diff --git a/platform/windows/display_server_windows.cpp b/platform/windows/display_server_windows.cpp
index 86f20f1dd7..907d2b75d2 100644
--- a/platform/windows/display_server_windows.cpp
+++ b/platform/windows/display_server_windows.cpp
@@ -332,7 +332,7 @@ static BOOL CALLBACK _MonitorEnumProcUsableSize(HMONITOR hMonitor, HDC hdcMonito
 	EnumRectData *data = (EnumRectData *)dwData;
 	if (data->count == data->screen) {
 		MONITORINFO minfo;
-		zeromem(&minfo, sizeof(MONITORINFO));
+		memset(&minfo, 0, sizeof(MONITORINFO));
 		minfo.cbSize = sizeof(MONITORINFO);
 		GetMonitorInfoA(hMonitor, &minfo);
 
diff --git a/scene/2d/cpu_particles_2d.cpp b/scene/2d/cpu_particles_2d.cpp
index 5f2efeb8ca..1578643d14 100644
--- a/scene/2d/cpu_particles_2d.cpp
+++ b/scene/2d/cpu_particles_2d.cpp
@@ -976,7 +976,7 @@ void CPUParticles2D::_update_particle_data_buffer() {
 			ptr[7] = t.elements[2][1];
 
 		} else {
-			zeromem(ptr, sizeof(float) * 8);
+			memset(ptr, 0, sizeof(float) * 8);
 		}
 
 		Color c = r[idx].color;
@@ -1080,7 +1080,7 @@ void CPUParticles2D::_notification(int p_what) {
 						ptr[7] = t.elements[2][1];
 
 					} else {
-						zeromem(ptr, sizeof(float) * 8);
+						memset(ptr, 0, sizeof(float) * 8);
 					}
 
 					ptr += 16;
diff --git a/scene/3d/baked_lightmap.cpp b/scene/3d/baked_lightmap.cpp
index 2e1b77dfe5..ef648a126e 100644
--- a/scene/3d/baked_lightmap.cpp
+++ b/scene/3d/baked_lightmap.cpp
@@ -619,10 +619,6 @@ void BakedLightmap::_gen_new_positions_from_octree(const GenProbesOctree *p_cell
 }
 
 BakedLightmap::BakeError BakedLightmap::bake(Node *p_from_node, String p_image_data_path, Lightmapper::BakeStepFunc p_bake_step, void *p_bake_userdata) {
-	if (p_image_data_path == "" && (get_light_data().is_null() || !get_light_data()->get_path().is_resource_file())) {
-		return BAKE_ERROR_NO_SAVE_PATH;
-	}
-
 	if (p_image_data_path == "") {
 		if (get_light_data().is_null()) {
 			return BAKE_ERROR_NO_SAVE_PATH;
diff --git a/scene/3d/cpu_particles_3d.cpp b/scene/3d/cpu_particles_3d.cpp
index 780773bb57..aa29728c73 100644
--- a/scene/3d/cpu_particles_3d.cpp
+++ b/scene/3d/cpu_particles_3d.cpp
@@ -1050,7 +1050,7 @@ void CPUParticles3D::_update_particle_data_buffer() {
 			ptr[10] = t.basis.elements[2][2];
 			ptr[11] = t.origin.z;
 		} else {
-			zeromem(ptr, sizeof(float) * 12);
+			memset(ptr, 0, sizeof(float) * 12);
 		}
 
 		Color c = r[idx].color;
@@ -1155,7 +1155,7 @@ void CPUParticles3D::_notification(int p_what) {
 					ptr[10] = t.basis.elements[2][2];
 					ptr[11] = t.origin.z;
 				} else {
-					zeromem(ptr, sizeof(float) * 12);
+					memset(ptr, 0, sizeof(float) * 12);
 				}
 
 				ptr += 20;
diff --git a/scene/3d/lightmapper.cpp b/scene/3d/lightmapper.cpp
index c17ac52aa2..9e5078ba95 100644
--- a/scene/3d/lightmapper.cpp
+++ b/scene/3d/lightmapper.cpp
@@ -39,6 +39,15 @@ Ref<LightmapDenoiser> LightmapDenoiser::create() {
 	return Ref<LightmapDenoiser>();
 }
 
+LightmapRaycaster *(*LightmapRaycaster::create_function)() = nullptr;
+
+Ref<LightmapRaycaster> LightmapRaycaster::create() {
+	// Ask the registered factory (if any) for a backend instance. Wrapping a
+	// null pointer gives the same empty reference as default construction.
+	LightmapRaycaster *backend = create_function ? create_function() : nullptr;
+	return Ref<LightmapRaycaster>(backend);
+}
+
 Lightmapper::CreateFunc Lightmapper::create_custom = nullptr;
 Lightmapper::CreateFunc Lightmapper::create_gpu = nullptr;
 Lightmapper::CreateFunc Lightmapper::create_cpu = nullptr;
diff --git a/scene/3d/lightmapper.h b/scene/3d/lightmapper.h
index a07a964c01..f63515f666 100644
--- a/scene/3d/lightmapper.h
+++ b/scene/3d/lightmapper.h
@@ -34,6 +34,16 @@
 #include "scene/resources/mesh.h"
 #include "servers/rendering/rendering_device.h"
 
+#if !defined(__aligned) // keep any definition that is already in effect
+
+#if defined(_WIN32) && defined(_MSC_VER)
+#define __aligned(...) __declspec(align(__VA_ARGS__)) // MSVC spelling of an alignment request
+#else
+#define __aligned(...) __attribute__((aligned(__VA_ARGS__))) // attribute spelling used everywhere else
+#endif
+
+#endif
+
 class LightmapDenoiser : public Reference {
 	GDCLASS(LightmapDenoiser, Reference)
 protected:
@@ -44,6 +54,73 @@ public:
 	static Ref<LightmapDenoiser> create();
 };
 
+class LightmapRaycaster : public Reference {
+	GDCLASS(LightmapRaycaster, Reference)
+protected:
+	static LightmapRaycaster *(*create_function)(); // factory hook consulted by create(); may be null
+
+public:
+	// Combined ray + hit record; the field order is meant to stay compatible with embree3 rays.
+	struct __aligned(16) Ray {
+		const static unsigned int INVALID_GEOMETRY_ID = ((unsigned int)-1); // from rtcore_common.h
+		/*! Default construction only marks the ray as "no hit"; the other scalar fields are left unset. */
+		_FORCE_INLINE_ Ray() :
+				geomID(INVALID_GEOMETRY_ID) {}
+
+		/*! Constructs a ray from origin, direction, and ray segment. Near
+		 *  has to be smaller than far. */
+		_FORCE_INLINE_ Ray(const Vector3 &org,
+				const Vector3 &dir,
+				float tnear = 0.0f,
+				float tfar = INFINITY) :
+				org(org),
+				tnear(tnear),
+				dir(dir),
+				time(0.0f),
+				tfar(tfar),
+				mask(-1),
+				id(0), // was left indeterminate; give the backend a defined value
+				flags(0), // was left indeterminate; give the backend a defined value
+				u(0.0),
+				v(0.0),
+				primID(INVALID_GEOMETRY_ID),
+				geomID(INVALID_GEOMETRY_ID),
+				instID(INVALID_GEOMETRY_ID) {}
+
+		/*! Tests if we hit something. */
+		_FORCE_INLINE_ explicit operator bool() const { return geomID != INVALID_GEOMETRY_ID; }
+
+		Vector3 org; //!< Ray origin + tnear
+		float tnear; //!< Start of ray segment
+		Vector3 dir; //!< Ray direction + tfar
+		float time; //!< Time of this ray for motion blur.
+		float tfar; //!< End of ray segment
+		unsigned int mask; //!< used to mask out objects during traversal
+		unsigned int id; //!< ray ID
+		unsigned int flags; //!< ray flags
+
+		Vector3 normal; //!< Not normalized geometry normal
+		float u; //!< Barycentric u coordinate of hit
+		float v; //!< Barycentric v coordinate of hit
+		unsigned int primID; //!< primitive ID
+		unsigned int geomID; //!< geometry ID
+		unsigned int instID; //!< instance ID
+	};
+
+	virtual bool intersect(Ray &p_ray) = 0; // single-ray query; hit data is written back into p_ray
+
+	virtual void intersect(Vector<Ray> &r_rays) = 0; // batch variant operating on r_rays in place
+
+	virtual void add_mesh(const Vector<Vector3> &p_vertices, const Vector<Vector3> &p_normals, const Vector<Vector2> &p_uv2s, unsigned int p_id) = 0;
+	virtual void set_mesh_alpha_texture(Ref<Image> p_alpha_texture, unsigned int p_id) = 0;
+	virtual void commit() = 0; // NOTE(review): presumably finalizes the added meshes before queries — confirm in the backend
+
+	virtual void set_mesh_filter(const Set<int> &p_mesh_ids) = 0;
+	virtual void clear_mesh_filter() = 0;
+
+	static Ref<LightmapRaycaster> create(); // empty reference when no factory has been registered
+};
+
 class Lightmapper : public Reference {
 	GDCLASS(Lightmapper, Reference)
 public:
diff --git a/scene/3d/mesh_instance_3d.cpp b/scene/3d/mesh_instance_3d.cpp
index 99fa8f1f3e..27d5487a1a 100644
--- a/scene/3d/mesh_instance_3d.cpp
+++ b/scene/3d/mesh_instance_3d.cpp
@@ -356,6 +356,7 @@ Ref<Material> MeshInstance3D::get_active_material(int p_surface) const {
 void MeshInstance3D::_mesh_changed() {
 	ERR_FAIL_COND(mesh.is_null());
 	surface_override_materials.resize(mesh->get_surface_count());
+	update_gizmo();
 }
 
 void MeshInstance3D::create_debug_tangents() {
diff --git a/scene/3d/occluder_instance_3d.cpp b/scene/3d/occluder_instance_3d.cpp
new file mode 100644
index 0000000000..d3a256db34
--- /dev/null
+++ b/scene/3d/occluder_instance_3d.cpp
@@ -0,0 +1,335 @@
+/*************************************************************************/
+/*  occluder_instance_3d.cpp                                             */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#include "occluder_instance_3d.h"
+#include "core/core_string_names.h"
+#include "scene/3d/mesh_instance_3d.h"
+
+RID Occluder3D::get_rid() const { // const is possible because the cached RID is a mutable member
+	if (!occluder.is_valid()) { // first request: create the server-side occluder lazily
+		occluder = RS::get_singleton()->occluder_create();
+		RS::get_singleton()->occluder_set_mesh(occluder, vertices, indices); // upload the geometry stored so far
+	}
+	return occluder;
+}
+
+void Occluder3D::set_vertices(PackedVector3Array p_vertices) { // replaces the stored vertex array
+	vertices = p_vertices;
+	if (occluder.is_valid()) { // only push to the server once get_rid() has created the occluder
+		RS::get_singleton()->occluder_set_mesh(occluder, vertices, indices);
+	}
+	_update_changes(); // recompute the AABB, drop the debug caches and call emit_changed()
+}
+
+PackedVector3Array Occluder3D::get_vertices() const { // getter paired with set_vertices()
+	return vertices;
+}
+
+void Occluder3D::set_indices(PackedInt32Array p_indices) { // replaces the stored index array
+	indices = p_indices;
+	if (occluder.is_valid()) { // only push to the server once get_rid() has created the occluder
+		RS::get_singleton()->occluder_set_mesh(occluder, vertices, indices);
+	}
+	_update_changes(); // recompute the AABB, drop the debug caches and call emit_changed()
+}
+
+PackedInt32Array Occluder3D::get_indices() const { // getter paired with set_indices()
+	return indices;
+}
+
+void Occluder3D::_update_changes() {
+	// Seed the bounds with the first vertex: growing a default AABB() would always drag the local origin into the box.
+	aabb = vertices.is_empty() ? AABB() : AABB(vertices[0], Vector3());
+	const Vector3 *ptr = vertices.ptr();
+	for (int i = 0; i < vertices.size(); i++) {
+		aabb.expand_to(ptr[i]);
+	}
+
+	debug_lines.clear(); // cached debug geometry no longer matches the arrays
+	debug_mesh.unref();
+
+	emit_changed();
+}
+
+Vector<Vector3> Occluder3D::get_debug_lines() const {
+	if (!debug_lines.is_empty()) {
+		return debug_lines; // cached by an earlier call
+	}
+	if (indices.size() % 3 != 0) {
+		return Vector<Vector3>(); // not a whole number of triangles
+	}
+	Vector<Vector3> lines; // built locally so a failed index check cannot leave a partial cache behind
+	for (int i = 0; i < indices.size() / 3; i++) {
+		for (int j = 0; j < 3; j++) {
+			int a = indices[i * 3 + j];
+			int b = indices[i * 3 + (j + 1) % 3];
+			ERR_FAIL_INDEX_V_MSG(a, vertices.size(), Vector<Vector3>(), "Occluder indices are out of range.");
+			ERR_FAIL_INDEX_V_MSG(b, vertices.size(), Vector<Vector3>(), "Occluder indices are out of range.");
+			lines.push_back(vertices[a]); // one triangle edge: a -> b
+			lines.push_back(vertices[b]);
+		}
+	}
+	debug_lines = lines; // publish to the mutable cache only once every index was validated
+	return debug_lines;
+}
+
+Ref<ArrayMesh> Occluder3D::get_debug_mesh() const {
+	// Hand back the cached mesh when there is one. If the index count is not a
+	// multiple of three nothing is built and the (still null) cache is returned.
+	if (debug_mesh.is_valid() || indices.size() % 3 != 0) {
+		return debug_mesh;
+	}
+
+	// Only the vertex and index slots of the surface arrays are filled in.
+	Array surface_arrays;
+	surface_arrays.resize(Mesh::ARRAY_MAX);
+	surface_arrays[Mesh::ARRAY_INDEX] = indices;
+	surface_arrays[Mesh::ARRAY_VERTEX] = vertices;
+
+	// Build the single triangle surface and keep it in the mutable cache.
+	debug_mesh.instance();
+	debug_mesh->add_surface_from_arrays(Mesh::PRIMITIVE_TRIANGLES, surface_arrays);
+	return debug_mesh;
+}
+
+AABB Occluder3D::get_aabb() const { // bounds last computed by _update_changes()
+	return aabb;
+}
+
+void Occluder3D::_bind_methods() { // registers the accessors and the two array properties with ClassDB
+	ClassDB::bind_method(D_METHOD("set_vertices", "vertices"), &Occluder3D::set_vertices);
+	ClassDB::bind_method(D_METHOD("get_vertices"), &Occluder3D::get_vertices);
+
+	ClassDB::bind_method(D_METHOD("set_indices", "indices"), &Occluder3D::set_indices);
+	ClassDB::bind_method(D_METHOD("get_indices"), &Occluder3D::get_indices);
+
+	ADD_PROPERTY(PropertyInfo(Variant::PACKED_VECTOR3_ARRAY, "vertices", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NOEDITOR), "set_vertices", "get_vertices"); // registered with the NOEDITOR usage flag
+	ADD_PROPERTY(PropertyInfo(Variant::PACKED_INT32_ARRAY, "indices", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NOEDITOR), "set_indices", "get_indices"); // registered with the NOEDITOR usage flag
+}
+
+Occluder3D::Occluder3D() { // intentionally empty: the server-side occluder is created on demand in get_rid()
+}
+
+Occluder3D::~Occluder3D() { // releases the server-side occluder, if one was ever created
+	if (occluder.is_valid()) { // stays invalid when get_rid() was never called
+		RS::get_singleton()->free(occluder);
+	}
+}
+/////////////////////////////////////////////////
+
+AABB OccluderInstance3D::get_aabb() const {
+	if (occluder.is_null()) {
+		return AABB(); // no resource assigned: default-constructed bounds
+	}
+	return occluder->get_aabb(); // otherwise defer to the occluder resource
+}
+
+Vector<Face3> OccluderInstance3D::get_faces(uint32_t p_usage_flags) const { // p_usage_flags is ignored
+	return Vector<Face3>(); // always an empty list
+}
+
+void OccluderInstance3D::set_occluder(const Ref<Occluder3D> &p_occluder) {
+	if (occluder == p_occluder) {
+		return; // same resource: nothing to rewire
+	}
+
+	if (occluder.is_valid()) { // stop listening to the resource that is being replaced
+		occluder->disconnect(CoreStringNames::get_singleton()->changed, callable_mp(this, &OccluderInstance3D::_occluder_changed));
+	}
+
+	occluder = p_occluder;
+
+	if (occluder.is_valid()) {
+		set_base(occluder->get_rid()); // get_rid() creates the server-side occluder on first use
+		occluder->connect(CoreStringNames::get_singleton()->changed, callable_mp(this, &OccluderInstance3D::_occluder_changed)); // refresh the gizmo whenever the resource changes
+	} else {
+		set_base(RID()); // cleared: hand the instance an empty base
+	}
+
+	update_gizmo();
+}
+
+void OccluderInstance3D::_occluder_changed() { // connected to the occluder resource's changed signal in set_occluder()
+	update_gizmo();
+}
+
+Ref<Occluder3D> OccluderInstance3D::get_occluder() const { // may be a null reference when nothing is assigned
+	return occluder;
+}
+
+void OccluderInstance3D::set_bake_mask(uint32_t p_mask) { // mask is tested against each mesh's layer mask in _bake_node()
+	bake_mask = p_mask;
+}
+
+uint32_t OccluderInstance3D::get_bake_mask() const { // getter paired with set_bake_mask()
+	return bake_mask;
+}
+
+void OccluderInstance3D::set_bake_mask_bit(int p_layer, bool p_enable) {
+	ERR_FAIL_INDEX(p_layer, 32); // valid layers are 0..31
+	if (p_enable) {
+		set_bake_mask(bake_mask | (1u << p_layer)); // unsigned literal: `1 << 31` overflows a signed int
+	} else {
+		set_bake_mask(bake_mask & (~(1u << p_layer))); // clear only the requested bit
+	}
+}
+
+bool OccluderInstance3D::get_bake_mask_bit(int p_layer) const {
+	ERR_FAIL_INDEX_V(p_layer, 32, false); // valid layers are 0..31; out-of-range reports false
+	return (bake_mask & (1u << p_layer)); // unsigned literal: `1 << 31` overflows a signed int
+}
+
+bool OccluderInstance3D::_bake_material_check(Ref<Material> p_material) {
+	// Anything that does not cast to StandardMaterial3D passes the check; a
+	// StandardMaterial3D passes only when its transparency is disabled.
+	StandardMaterial3D *std_material = Object::cast_to<StandardMaterial3D>(p_material.ptr());
+	const bool is_opaque = !std_material || std_material->get_transparency() == StandardMaterial3D::TRANSPARENCY_DISABLED;
+	return is_opaque;
+}
+
+void OccluderInstance3D::_bake_node(Node *p_node, PackedVector3Array &r_vertices, PackedInt32Array &r_indices) { // appends p_node's geometry, then recurses into its children
+	MeshInstance3D *mi = Object::cast_to<MeshInstance3D>(p_node);
+	if (mi && mi->is_visible_in_tree()) { // only visible mesh instances contribute geometry
+		Ref<Mesh> mesh = mi->get_mesh();
+		bool valid = true;
+
+		if (mesh.is_null()) {
+			valid = false;
+		}
+
+		if (valid && !_bake_material_check(mi->get_material_override())) { // a rejected override material excludes the whole instance
+			valid = false;
+		}
+
+		if ((mi->get_layer_mask() & bake_mask) == 0) { // instance shares no layer with the bake mask
+			valid = false;
+		}
+
+		if (valid) {
+			Transform global_to_local = get_global_transform().affine_inverse() * mi->get_global_transform(); // maps the mesh's local space into this node's local space
+
+			for (int i = 0; i < mesh->get_surface_count(); i++) {
+				if (mesh->surface_get_primitive_type(i) != Mesh::PRIMITIVE_TRIANGLES) {
+					continue; // only triangle surfaces are baked
+				}
+
+				if (mi->get_surface_override_material(i).is_valid()) { // a per-surface override takes precedence over the mesh's own material
+					if (!_bake_material_check(mi->get_surface_override_material(i))) {
+						continue;
+					}
+				} else {
+					if (!_bake_material_check(mesh->surface_get_material(i))) {
+						continue;
+					}
+				}
+
+				Array arrays = mesh->surface_get_arrays(i);
+
+				int vertex_offset = r_vertices.size(); // where this surface's vertices start in the output
+				PackedVector3Array vertices = arrays[Mesh::ARRAY_VERTEX];
+				r_vertices.resize(r_vertices.size() + vertices.size());
+
+				Vector3 *vtx_ptr = r_vertices.ptrw();
+				for (int j = 0; j < vertices.size(); j++) {
+					vtx_ptr[vertex_offset + j] = global_to_local.xform(vertices[j]);
+				}
+
+				int index_offset = r_indices.size();
+				PackedInt32Array indices = arrays[Mesh::ARRAY_INDEX]; // NOTE(review): a surface without an index array would add vertices but no triangles — confirm intended
+				r_indices.resize(r_indices.size() + indices.size());
+
+				int *idx_ptr = r_indices.ptrw();
+				for (int j = 0; j < indices.size(); j++) {
+					idx_ptr[index_offset + j] = vertex_offset + indices[j]; // rebase onto the combined vertex array
+				}
+			}
+		}
+	}
+
+	for (int i = 0; i < p_node->get_child_count(); i++) { // p_node is dereferenced here without a null check
+		Node *child = p_node->get_child(i);
+		if (!child->get_owner()) {
+			continue; // children without an owner are skipped (maybe a helper node)
+		}
+
+		_bake_node(child, r_vertices, r_indices);
+	}
+}
+
+OccluderInstance3D::BakeError OccluderInstance3D::bake(Node *p_from_node, String p_occluder_path) {
+	// With neither an assigned occluder to overwrite nor a path for a new one
+	// there is nowhere to store the result.
+	if (p_occluder_path == "" && get_occluder().is_null()) {
+		return BAKE_ERROR_NO_SAVE_PATH;
+	}
+
+	// Gather geometry from p_from_node and its descendants.
+	PackedVector3Array baked_vertices;
+	PackedInt32Array baked_indices;
+	_bake_node(p_from_node, baked_vertices, baked_indices);
+
+	if (baked_vertices.is_empty() || baked_indices.is_empty()) {
+		return BAKE_ERROR_NO_MESHES;
+	}
+
+	// Reuse the current occluder resource when there is one; otherwise create
+	// a fresh resource and give it the requested path.
+	Ref<Occluder3D> target = get_occluder();
+	if (target.is_null()) {
+		target.instance();
+		target->set_path(p_occluder_path);
+	}
+
+	target->set_vertices(baked_vertices);
+	target->set_indices(baked_indices);
+	set_occluder(target);
+
+	return BAKE_ERROR_OK;
+}
+
+void OccluderInstance3D::_bind_methods() { // registers the accessors and properties with ClassDB; bake() is not bound here
+	ClassDB::bind_method(D_METHOD("set_bake_mask", "mask"), &OccluderInstance3D::set_bake_mask);
+	ClassDB::bind_method(D_METHOD("get_bake_mask"), &OccluderInstance3D::get_bake_mask);
+	ClassDB::bind_method(D_METHOD("set_bake_mask_bit", "layer", "enabled"), &OccluderInstance3D::set_bake_mask_bit);
+	ClassDB::bind_method(D_METHOD("get_bake_mask_bit", "layer"), &OccluderInstance3D::get_bake_mask_bit);
+
+	ClassDB::bind_method(D_METHOD("set_occluder", "occluder"), &OccluderInstance3D::set_occluder);
+	ClassDB::bind_method(D_METHOD("get_occluder"), &OccluderInstance3D::get_occluder);
+
+	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "occluder", PROPERTY_HINT_RESOURCE_TYPE, "Occluder3D"), "set_occluder", "get_occluder");
+	ADD_GROUP("Bake", "bake_"); // properties prefixed with "bake_" are listed under the "Bake" group
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "bake_mask", PROPERTY_HINT_LAYERS_3D_RENDER), "set_bake_mask", "get_bake_mask");
+}
+
+OccluderInstance3D::OccluderInstance3D() { // intentionally empty: bake_mask gets its default from the in-class initializer
+}
+
+OccluderInstance3D::~OccluderInstance3D() { // intentionally empty: the occluder is held through a Ref member
+}
diff --git a/scene/3d/occluder_instance_3d.h b/scene/3d/occluder_instance_3d.h
new file mode 100644
index 0000000000..4bb468274d
--- /dev/null
+++ b/scene/3d/occluder_instance_3d.h
@@ -0,0 +1,108 @@
+/*************************************************************************/
+/*  occluder_instance_3d.h                                               */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifndef OCCLUDER_INSTANCE_3D_H
+#define OCCLUDER_INSTANCE_3D_H
+
+#include "scene/3d/visual_instance_3d.h"
+
+class Occluder3D : public Resource { // resource holding an indexed triangle list used as occlusion geometry
+	GDCLASS(Occluder3D, Resource);
+	RES_BASE_EXTENSION("occ");
+
+	mutable RID occluder; // server-side occluder, created lazily by the const get_rid()
+	mutable Ref<ArrayMesh> debug_mesh; // cache filled by get_debug_mesh(), reset in _update_changes()
+	mutable Vector<Vector3> debug_lines; // cache filled by get_debug_lines(), reset in _update_changes()
+	AABB aabb; // recomputed from the vertices in _update_changes()
+
+	PackedVector3Array vertices;
+	PackedInt32Array indices; // three entries per triangle, each indexing into vertices
+
+	void _update_changes(); // called by both setters after the arrays change
+
+protected:
+	static void _bind_methods();
+
+public:
+	void set_vertices(PackedVector3Array p_vertices);
+	PackedVector3Array get_vertices() const;
+
+	void set_indices(PackedInt32Array p_indices);
+	PackedInt32Array get_indices() const;
+
+	Vector<Vector3> get_debug_lines() const; // pairs of points, one pair per triangle edge
+	Ref<ArrayMesh> get_debug_mesh() const; // null reference when the index count is not a multiple of three
+	AABB get_aabb() const;
+
+	virtual RID get_rid() const override;
+	Occluder3D();
+	~Occluder3D();
+};
+
+class OccluderInstance3D : public VisualInstance3D { // scene node that instances an Occluder3D resource and can bake one from meshes
+	GDCLASS(OccluderInstance3D, VisualInstance3D); // must name the direct base class, not a grandparent
+
+private:
+	Ref<Occluder3D> occluder;
+	uint32_t bake_mask = 0xFFFFFFFF; // all 32 layers enabled by default
+
+	void _occluder_changed(); // connected to the occluder resource's changed signal
+
+	bool _bake_material_check(Ref<Material> p_material); // false for StandardMaterial3D with transparency enabled
+	void _bake_node(Node *p_node, PackedVector3Array &r_vertices, PackedInt32Array &r_indices); // recursive geometry gathering used by bake()
+
+protected:
+	static void _bind_methods();
+
+public:
+	enum BakeError {
+		BAKE_ERROR_OK,
+		BAKE_ERROR_NO_SAVE_PATH, // no occluder assigned and no path given
+		BAKE_ERROR_NO_MESHES, // nothing bakeable was found
+	};
+
+	void set_occluder(const Ref<Occluder3D> &p_occluder);
+	Ref<Occluder3D> get_occluder() const;
+
+	virtual AABB get_aabb() const override;
+	virtual Vector<Face3> get_faces(uint32_t p_usage_flags) const override;
+
+	void set_bake_mask(uint32_t p_mask);
+	uint32_t get_bake_mask() const;
+
+	void set_bake_mask_bit(int p_layer, bool p_enable);
+	bool get_bake_mask_bit(int p_layer) const;
+	BakeError bake(Node *p_from_node, String p_occluder_path = ""); // p_occluder_path is only used when no occluder is assigned yet
+
+	OccluderInstance3D();
+	~OccluderInstance3D();
+};
+
+#endif
diff --git a/scene/3d/soft_body_3d.cpp b/scene/3d/soft_body_3d.cpp
index 98ac6aa65e..9a7984b06a 100644
--- a/scene/3d/soft_body_3d.cpp
+++ b/scene/3d/soft_body_3d.cpp
@@ -85,11 +85,11 @@ void SoftBodyRenderingServerHandler::commit_changes() {
 }
 
 void SoftBodyRenderingServerHandler::set_vertex(int p_vertex_id, const void *p_vector3) {
-	copymem(&write_buffer[p_vertex_id * stride + offset_vertices], p_vector3, sizeof(float) * 3);
+	memcpy(&write_buffer[p_vertex_id * stride + offset_vertices], p_vector3, sizeof(float) * 3);
 }
 
 void SoftBodyRenderingServerHandler::set_normal(int p_vertex_id, const void *p_vector3) {
-	copymem(&write_buffer[p_vertex_id * stride + offset_normal], p_vector3, sizeof(float) * 3);
+	memcpy(&write_buffer[p_vertex_id * stride + offset_normal], p_vector3, sizeof(float) * 3);
 }
 
 void SoftBodyRenderingServerHandler::set_aabb(const AABB &p_aabb) {
diff --git a/scene/3d/visual_instance_3d.cpp b/scene/3d/visual_instance_3d.cpp
index 394c67e873..d81b09b86c 100644
--- a/scene/3d/visual_instance_3d.cpp
+++ b/scene/3d/visual_instance_3d.cpp
@@ -338,6 +338,15 @@ GeometryInstance3D::GIMode GeometryInstance3D::get_gi_mode() const {
 	return gi_mode;
 }
 
+void GeometryInstance3D::set_ignore_occlusion_culling(bool p_enabled) {
+	ignore_occlusion_culling = p_enabled;
+	RS::get_singleton()->instance_geometry_set_flag(get_instance(), RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, ignore_occlusion_culling);
+}
+
+bool GeometryInstance3D::is_ignoring_occlusion_culling() {
+	return ignore_occlusion_culling;
+}
+
 void GeometryInstance3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_material_override", "material"), &GeometryInstance3D::set_material_override);
 	ClassDB::bind_method(D_METHOD("get_material_override"), &GeometryInstance3D::get_material_override);
@@ -345,21 +354,24 @@ void GeometryInstance3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_cast_shadows_setting", "shadow_casting_setting"), &GeometryInstance3D::set_cast_shadows_setting);
 	ClassDB::bind_method(D_METHOD("get_cast_shadows_setting"), &GeometryInstance3D::get_cast_shadows_setting);
 
+	ClassDB::bind_method(D_METHOD("set_lod_bias", "bias"), &GeometryInstance3D::set_lod_bias);
+	ClassDB::bind_method(D_METHOD("get_lod_bias"), &GeometryInstance3D::get_lod_bias);
+
 	ClassDB::bind_method(D_METHOD("set_lod_max_hysteresis", "mode"), &GeometryInstance3D::set_lod_max_hysteresis);
 	ClassDB::bind_method(D_METHOD("get_lod_max_hysteresis"), &GeometryInstance3D::get_lod_max_hysteresis);
 
 	ClassDB::bind_method(D_METHOD("set_lod_max_distance", "mode"), &GeometryInstance3D::set_lod_max_distance);
 	ClassDB::bind_method(D_METHOD("get_lod_max_distance"), &GeometryInstance3D::get_lod_max_distance);
 
-	ClassDB::bind_method(D_METHOD("set_shader_instance_uniform", "uniform", "value"), &GeometryInstance3D::set_shader_instance_uniform);
-	ClassDB::bind_method(D_METHOD("get_shader_instance_uniform", "uniform"), &GeometryInstance3D::get_shader_instance_uniform);
-
 	ClassDB::bind_method(D_METHOD("set_lod_min_hysteresis", "mode"), &GeometryInstance3D::set_lod_min_hysteresis);
 	ClassDB::bind_method(D_METHOD("get_lod_min_hysteresis"), &GeometryInstance3D::get_lod_min_hysteresis);
 
 	ClassDB::bind_method(D_METHOD("set_lod_min_distance", "mode"), &GeometryInstance3D::set_lod_min_distance);
 	ClassDB::bind_method(D_METHOD("get_lod_min_distance"), &GeometryInstance3D::get_lod_min_distance);
 
+	ClassDB::bind_method(D_METHOD("set_shader_instance_uniform", "uniform", "value"), &GeometryInstance3D::set_shader_instance_uniform);
+	ClassDB::bind_method(D_METHOD("get_shader_instance_uniform", "uniform"), &GeometryInstance3D::get_shader_instance_uniform);
+
 	ClassDB::bind_method(D_METHOD("set_extra_cull_margin", "margin"), &GeometryInstance3D::set_extra_cull_margin);
 	ClassDB::bind_method(D_METHOD("get_extra_cull_margin"), &GeometryInstance3D::get_extra_cull_margin);
 
@@ -369,8 +381,8 @@ void GeometryInstance3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_gi_mode", "mode"), &GeometryInstance3D::set_gi_mode);
 	ClassDB::bind_method(D_METHOD("get_gi_mode"), &GeometryInstance3D::get_gi_mode);
 
-	ClassDB::bind_method(D_METHOD("set_lod_bias", "bias"), &GeometryInstance3D::set_lod_bias);
-	ClassDB::bind_method(D_METHOD("get_lod_bias"), &GeometryInstance3D::get_lod_bias);
+	ClassDB::bind_method(D_METHOD("set_ignore_occlusion_culling", "ignore_culling"), &GeometryInstance3D::set_ignore_occlusion_culling);
+	ClassDB::bind_method(D_METHOD("is_ignoring_occlusion_culling"), &GeometryInstance3D::is_ignoring_occlusion_culling);
 
 	ClassDB::bind_method(D_METHOD("set_custom_aabb", "aabb"), &GeometryInstance3D::set_custom_aabb);
 
@@ -381,6 +393,7 @@ void GeometryInstance3D::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "cast_shadow", PROPERTY_HINT_ENUM, "Off,On,Double-Sided,Shadows Only"), "set_cast_shadows_setting", "get_cast_shadows_setting");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "extra_cull_margin", PROPERTY_HINT_RANGE, "0,16384,0.01"), "set_extra_cull_margin", "get_extra_cull_margin");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "lod_bias", PROPERTY_HINT_RANGE, "0.001,128,0.001"), "set_lod_bias", "get_lod_bias");
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "ignore_occlusion_culling"), "set_ignore_occlusion_culling", "is_ignoring_occlusion_culling");
 	ADD_GROUP("Global Illumination", "gi_");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "gi_mode", PROPERTY_HINT_ENUM, "Disabled,Baked,Dynamic"), "set_gi_mode", "get_gi_mode");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "gi_lightmap_scale", PROPERTY_HINT_ENUM, "1x,2x,4x,8x"), "set_lightmap_scale", "get_lightmap_scale");
diff --git a/scene/3d/visual_instance_3d.h b/scene/3d/visual_instance_3d.h
index 7fed8095ef..68d29ef81e 100644
--- a/scene/3d/visual_instance_3d.h
+++ b/scene/3d/visual_instance_3d.h
@@ -120,6 +120,7 @@ private:
 	float extra_cull_margin = 0.0;
 	LightmapScale lightmap_scale = LIGHTMAP_SCALE_1X;
 	GIMode gi_mode = GI_MODE_DISABLED;
+	bool ignore_occlusion_culling = false;
 
 	const StringName *_instance_uniform_get_remap(const StringName p_name) const;
 
@@ -167,6 +168,9 @@ public:
 
 	void set_custom_aabb(AABB aabb);
 
+	void set_ignore_occlusion_culling(bool p_enabled);
+	bool is_ignoring_occlusion_culling();
+
 	GeometryInstance3D();
 };
 
diff --git a/scene/gui/line_edit.cpp b/scene/gui/line_edit.cpp
index 124a07fa65..1aff5d5390 100644
--- a/scene/gui/line_edit.cpp
+++ b/scene/gui/line_edit.cpp
@@ -155,6 +155,7 @@ void LineEdit::_backspace(bool p_word, bool p_all_to_left) {
 		for (int i = words.size() - 1; i >= 0; i--) {
 			if (words[i].x < cc) {
 				cc = words[i].x;
+				break;
 			}
 		}
 
@@ -202,6 +203,7 @@ void LineEdit::_delete(bool p_word, bool p_all_to_right) {
 		}
 
 		delete_text(caret_column, cc);
+		set_caret_column(caret_column);
 	} else {
 		if (caret_mid_grapheme_enabled) {
 			set_caret_column(caret_column + 1);
diff --git a/scene/main/http_request.cpp b/scene/main/http_request.cpp
index 64df37654b..08ab71e7fa 100644
--- a/scene/main/http_request.cpp
+++ b/scene/main/http_request.cpp
@@ -123,7 +123,7 @@ Error HTTPRequest::request(const String &p_url, const Vector<String> &p_custom_h
 	size_t len = charstr.length();
 	raw_data.resize(len);
 	uint8_t *w = raw_data.ptrw();
-	copymem(w, charstr.ptr(), len);
+	memcpy(w, charstr.ptr(), len);
 
 	return request_raw(p_url, p_custom_headers, p_ssl_validate_domain, p_method, raw_data);
 }
diff --git a/scene/main/scene_tree.cpp b/scene/main/scene_tree.cpp
index a62c4ff770..387af3703b 100644
--- a/scene/main/scene_tree.cpp
+++ b/scene/main/scene_tree.cpp
@@ -1378,6 +1378,9 @@ SceneTree::SceneTree() {
 	const bool use_debanding = GLOBAL_DEF("rendering/anti_aliasing/quality/use_debanding", false);
 	root->set_use_debanding(use_debanding);
 
+	const bool use_occlusion_culling = GLOBAL_DEF("rendering/occlusion_culling/use_occlusion_culling", false);
+	root->set_use_occlusion_culling(use_occlusion_culling);
+
 	float lod_threshold = GLOBAL_DEF("rendering/mesh_lod/lod_change/threshold_pixels", 1.0);
 	ProjectSettings::get_singleton()->set_custom_property_info("rendering/mesh_lod/lod_change/threshold_pixels", PropertyInfo(Variant::FLOAT, "rendering/mesh_lod/lod_change/threshold_pixels", PROPERTY_HINT_RANGE, "0,1024,0.1"));
 	root->set_lod_threshold(lod_threshold);
diff --git a/scene/main/viewport.cpp b/scene/main/viewport.cpp
index 4c9ebe016e..f1613f2fe5 100644
--- a/scene/main/viewport.cpp
+++ b/scene/main/viewport.cpp
@@ -3242,6 +3242,21 @@ float Viewport::get_lod_threshold() const {
 	return lod_threshold;
 }
 
+void Viewport::set_use_occlusion_culling(bool p_use_occlusion_culling) {
+	if (use_occlusion_culling == p_use_occlusion_culling) {
+		return;
+	}
+
+	use_occlusion_culling = p_use_occlusion_culling;
+	RS::get_singleton()->viewport_set_use_occlusion_culling(viewport, p_use_occlusion_culling);
+
+	notify_property_list_changed();
+}
+
+bool Viewport::is_using_occlusion_culling() const {
+	return use_occlusion_culling;
+}
+
 void Viewport::set_debug_draw(DebugDraw p_debug_draw) {
 	debug_draw = p_debug_draw;
 	RS::get_singleton()->viewport_set_debug_draw(viewport, RS::ViewportDebugDraw(p_debug_draw));
@@ -3331,9 +3346,6 @@ bool Viewport::is_handling_input_locally() const {
 	return handle_input_locally;
 }
 
-void Viewport::_validate_property(PropertyInfo &property) const {
-}
-
 void Viewport::set_default_canvas_item_texture_filter(DefaultCanvasItemTextureFilter p_filter) {
 	ERR_FAIL_INDEX(p_filter, DEFAULT_CANVAS_ITEM_TEXTURE_FILTER_MAX);
 
@@ -3478,6 +3490,9 @@ void Viewport::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_use_debanding", "enable"), &Viewport::set_use_debanding);
 	ClassDB::bind_method(D_METHOD("is_using_debanding"), &Viewport::is_using_debanding);
 
+	ClassDB::bind_method(D_METHOD("set_use_occlusion_culling", "enable"), &Viewport::set_use_occlusion_culling);
+	ClassDB::bind_method(D_METHOD("is_using_occlusion_culling"), &Viewport::is_using_occlusion_culling);
+
 	ClassDB::bind_method(D_METHOD("set_debug_draw", "debug_draw"), &Viewport::set_debug_draw);
 	ClassDB::bind_method(D_METHOD("get_debug_draw"), &Viewport::get_debug_draw);
 
@@ -3574,6 +3589,7 @@ void Viewport::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "msaa", PROPERTY_HINT_ENUM, "Disabled,2x,4x,8x,16x,AndroidVR 2x,AndroidVR 4x"), "set_msaa", "get_msaa");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "screen_space_aa", PROPERTY_HINT_ENUM, "Disabled,FXAA"), "set_screen_space_aa", "get_screen_space_aa");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "use_debanding"), "set_use_debanding", "is_using_debanding");
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "use_occlusion_culling"), "set_use_occlusion_culling", "is_using_occlusion_culling");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "lod_threshold", PROPERTY_HINT_RANGE, "0,1024,0.1"), "set_lod_threshold", "get_lod_threshold");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "debug_draw", PROPERTY_HINT_ENUM, "Disabled,Unshaded,Overdraw,Wireframe"), "set_debug_draw", "get_debug_draw");
 	ADD_GROUP("Canvas Items", "canvas_item_");
@@ -3655,6 +3671,7 @@ void Viewport::_bind_methods() {
 	BIND_ENUM_CONSTANT(DEBUG_DRAW_CLUSTER_SPOT_LIGHTS);
 	BIND_ENUM_CONSTANT(DEBUG_DRAW_CLUSTER_DECALS);
 	BIND_ENUM_CONSTANT(DEBUG_DRAW_CLUSTER_REFLECTION_PROBES);
+	BIND_ENUM_CONSTANT(DEBUG_DRAW_OCCLUDERS);
 
 	BIND_ENUM_CONSTANT(DEFAULT_CANVAS_ITEM_TEXTURE_FILTER_NEAREST);
 	BIND_ENUM_CONSTANT(DEFAULT_CANVAS_ITEM_TEXTURE_FILTER_LINEAR);
diff --git a/scene/main/viewport.h b/scene/main/viewport.h
index e8a88debf1..6786b70a6b 100644
--- a/scene/main/viewport.h
+++ b/scene/main/viewport.h
@@ -147,6 +147,7 @@ public:
 		DEBUG_DRAW_CLUSTER_SPOT_LIGHTS,
 		DEBUG_DRAW_CLUSTER_DECALS,
 		DEBUG_DRAW_CLUSTER_REFLECTION_PROBES,
+		DEBUG_DRAW_OCCLUDERS,
 	};
 
 	enum DefaultCanvasItemTextureFilter {
@@ -304,6 +305,7 @@ private:
 	ScreenSpaceAA screen_space_aa = SCREEN_SPACE_AA_DISABLED;
 	bool use_debanding = false;
 	float lod_threshold = 1.0;
+	bool use_occlusion_culling = false;
 
 	Ref<ViewportTexture> default_texture;
 	Set<ViewportTexture *> viewport_textures;
@@ -480,7 +482,6 @@ protected:
 	void _notification(int p_what);
 	void _process_picking();
 	static void _bind_methods();
-	virtual void _validate_property(PropertyInfo &property) const override;
 
 public:
 	uint64_t get_processed_events_count() const { return event_count; }
@@ -556,6 +557,9 @@ public:
 	void set_lod_threshold(float p_pixels);
 	float get_lod_threshold() const;
 
+	void set_use_occlusion_culling(bool p_use_occlusion_culling);
+	bool is_using_occlusion_culling() const;
+
 	Vector2 get_camera_coords(const Vector2 &p_viewport_coords) const;
 	Vector2 get_camera_rect_size() const;
 
diff --git a/scene/register_scene_types.cpp b/scene/register_scene_types.cpp
index 232ad278dd..1b3be13039 100644
--- a/scene/register_scene_types.cpp
+++ b/scene/register_scene_types.cpp
@@ -208,6 +208,7 @@
 #include "scene/3d/navigation_agent_3d.h"
 #include "scene/3d/navigation_obstacle_3d.h"
 #include "scene/3d/navigation_region_3d.h"
+#include "scene/3d/occluder_instance_3d.h"
 #include "scene/3d/path_3d.h"
 #include "scene/3d/physics_body_3d.h"
 #include "scene/3d/physics_joint_3d.h"
@@ -442,6 +443,8 @@ void register_scene_types() {
 	ClassDB::register_class<XRAnchor3D>();
 	ClassDB::register_class<XROrigin3D>();
 	ClassDB::register_class<MeshInstance3D>();
+	ClassDB::register_class<OccluderInstance3D>();
+	ClassDB::register_class<Occluder3D>();
 	ClassDB::register_class<ImmediateGeometry3D>();
 	ClassDB::register_virtual_class<SpriteBase3D>();
 	ClassDB::register_class<Sprite3D>();
diff --git a/scene/resources/audio_stream_sample.cpp b/scene/resources/audio_stream_sample.cpp
index 06a91fb2f8..9a9f019dda 100644
--- a/scene/resources/audio_stream_sample.cpp
+++ b/scene/resources/audio_stream_sample.cpp
@@ -490,9 +490,9 @@ void AudioStreamSample::set_data(const Vector<uint8_t> &p_data) {
 		const uint8_t *r = p_data.ptr();
 		int alloc_len = datalen + DATA_PAD * 2;
 		data = memalloc(alloc_len); //alloc with some padding for interpolation
-		zeromem(data, alloc_len);
+		memset(data, 0, alloc_len);
 		uint8_t *dataptr = (uint8_t *)data;
-		copymem(dataptr + DATA_PAD, r, datalen);
+		memcpy(dataptr + DATA_PAD, r, datalen);
 		data_bytes = datalen;
 	}
 
@@ -507,7 +507,7 @@ Vector<uint8_t> AudioStreamSample::get_data() const {
 		{
 			uint8_t *w = pv.ptrw();
 			uint8_t *dataptr = (uint8_t *)data;
-			copymem(w, dataptr + DATA_PAD, data_bytes);
+			memcpy(w, dataptr + DATA_PAD, data_bytes);
 		}
 	}
 
diff --git a/scene/resources/bit_map.cpp b/scene/resources/bit_map.cpp
index 3cc1af59ae..e9bfac3653 100644
--- a/scene/resources/bit_map.cpp
+++ b/scene/resources/bit_map.cpp
@@ -39,7 +39,7 @@ void BitMap::create(const Size2 &p_size) {
 	width = p_size.width;
 	height = p_size.height;
 	bitmask.resize(((width * height) / 8) + 1);
-	zeromem(bitmask.ptrw(), bitmask.size());
+	memset(bitmask.ptrw(), 0, bitmask.size());
 }
 
 void BitMap::create_from_image_alpha(const Ref<Image> &p_image, float p_threshold) {
diff --git a/scene/resources/default_theme/default_theme.cpp b/scene/resources/default_theme/default_theme.cpp
index b9f4a7a741..7c00c6d146 100644
--- a/scene/resources/default_theme/default_theme.cpp
+++ b/scene/resources/default_theme/default_theme.cpp
@@ -830,7 +830,6 @@ void fill_default_theme(Ref<Theme> &theme, const Ref<Font> &default_font, const
 	theme->set_stylebox("tab_selected", "Tabs", sb_expand(make_stylebox(tab_current_png, 4, 3, 4, 1, 16, 3, 16, 2), 2, 2, 2, 2));
 	theme->set_stylebox("tab_unselected", "Tabs", sb_expand(make_stylebox(tab_behind_png, 5, 4, 5, 1, 16, 5, 16, 2), 3, 3, 3, 3));
 	theme->set_stylebox("tab_disabled", "Tabs", sb_expand(make_stylebox(tab_disabled_png, 5, 5, 5, 1, 16, 6, 16, 4), 3, 0, 3, 3));
-	theme->set_stylebox("panel", "Tabs", tc_sb);
 	theme->set_stylebox("button_pressed", "Tabs", make_stylebox(button_pressed_png, 4, 4, 4, 4));
 	theme->set_stylebox("button", "Tabs", make_stylebox(button_normal_png, 4, 4, 4, 4));
 
diff --git a/scene/resources/particles_material.cpp b/scene/resources/particles_material.cpp
index bb47eebe9b..59e699326d 100644
--- a/scene/resources/particles_material.cpp
+++ b/scene/resources/particles_material.cpp
@@ -615,7 +615,7 @@ void ParticlesMaterial::_update_shader() {
 		}
 		// turn particle by rotation in Y
 		if (particle_flags[PARTICLE_FLAG_ROTATE_Y]) {
-			code += "	TRANSFORM = TRANSFORM * mat4(vec4(cos(CUSTOM.x), 0.0, -sin(CUSTOM.x), 0.0), vec4(0.0, 1.0, 0.0, 0.0), vec4(sin(CUSTOM.x), 0.0, cos(CUSTOM.x), 0.0), vec4(0.0, 0.0, 0.0, 1.0));\n";
+			code += "	TRANSFORM = mat4(vec4(cos(CUSTOM.x), 0.0, -sin(CUSTOM.x), 0.0), vec4(0.0, 1.0, 0.0, 0.0), vec4(sin(CUSTOM.x), 0.0, cos(CUSTOM.x), 0.0), vec4(0.0, 0.0, 0.0, 1.0));\n";
 		}
 	}
 	//scale by scale
diff --git a/scene/resources/surface_tool.cpp b/scene/resources/surface_tool.cpp
index 3d3900ecc5..c30bd7927d 100644
--- a/scene/resources/surface_tool.cpp
+++ b/scene/resources/surface_tool.cpp
@@ -1105,7 +1105,7 @@ void SurfaceTool::optimize_indices_for_cache() {
 	ERR_FAIL_COND(index_array.size() == 0);
 
 	LocalVector old_index_array = index_array;
-	zeromem(index_array.ptr(), index_array.size() * sizeof(int));
+	memset(index_array.ptr(), 0, index_array.size() * sizeof(int));
 	optimize_vertex_cache_func((unsigned int *)index_array.ptr(), (unsigned int *)old_index_array.ptr(), old_index_array.size(), vertex_array.size());
 }
 
diff --git a/scene/resources/texture.cpp b/scene/resources/texture.cpp
index b6a2f24b8b..771365152d 100644
--- a/scene/resources/texture.cpp
+++ b/scene/resources/texture.cpp
@@ -410,7 +410,7 @@ Ref<Image> StreamTexture2D::load_image_from_file(FileAccess *f, int p_size_limit
 					Vector<uint8_t> id = mipmap_images[i]->get_data();
 					int len = id.size();
 					const uint8_t *r = id.ptr();
-					copymem(&wr[ofs], r, len);
+					memcpy(&wr[ofs], r, len);
 					ofs += len;
 				}
 			}
diff --git a/scene/resources/world_3d.cpp b/scene/resources/world_3d.cpp
index f067771d58..e811cbf57a 100644
--- a/scene/resources/world_3d.cpp
+++ b/scene/resources/world_3d.cpp
@@ -321,7 +321,7 @@ void World3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("get_environment"), &World3D::get_environment);
 	ClassDB::bind_method(D_METHOD("set_fallback_environment", "env"), &World3D::set_fallback_environment);
 	ClassDB::bind_method(D_METHOD("get_fallback_environment"), &World3D::get_fallback_environment);
-	ClassDB::bind_method(D_METHOD("set_camera_effects", "env"), &World3D::set_camera_effects);
+	ClassDB::bind_method(D_METHOD("set_camera_effects", "effects"), &World3D::set_camera_effects);
 	ClassDB::bind_method(D_METHOD("get_camera_effects"), &World3D::get_camera_effects);
 	ClassDB::bind_method(D_METHOD("get_direct_space_state"), &World3D::get_direct_space_state);
 	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "environment", PROPERTY_HINT_RESOURCE_TYPE, "Environment"), "set_environment", "get_environment");
diff --git a/servers/rendering/renderer_rd/cluster_builder_rd.cpp b/servers/rendering/renderer_rd/cluster_builder_rd.cpp
index 0fdd864d47..2669a73014 100644
--- a/servers/rendering/renderer_rd/cluster_builder_rd.cpp
+++ b/servers/rendering/renderer_rd/cluster_builder_rd.cpp
@@ -86,13 +86,13 @@ ClusterBuilderSharedDataRD::ClusterBuilderSharedDataRD() {
 
 		Vector<uint8_t> vertex_data;
 		vertex_data.resize(sizeof(float) * icosphere_vertex_count * 3);
-		copymem(vertex_data.ptrw(), icosphere_vertices, vertex_data.size());
+		memcpy(vertex_data.ptrw(), icosphere_vertices, vertex_data.size());
 
 		sphere_vertex_buffer = RD::get_singleton()->vertex_buffer_create(vertex_data.size(), vertex_data);
 
 		Vector<uint8_t> index_data;
 		index_data.resize(sizeof(uint32_t) * icosphere_triangle_count * 3);
-		copymem(index_data.ptrw(), icosphere_triangle_indices, index_data.size());
+		memcpy(index_data.ptrw(), icosphere_triangle_indices, index_data.size());
 
 		sphere_index_buffer = RD::get_singleton()->index_buffer_create(icosphere_triangle_count * 3, RD::INDEX_BUFFER_FORMAT_UINT32, index_data);
 
@@ -130,13 +130,13 @@ ClusterBuilderSharedDataRD::ClusterBuilderSharedDataRD() {
 
 		Vector<uint8_t> vertex_data;
 		vertex_data.resize(sizeof(float) * cone_vertex_count * 3);
-		copymem(vertex_data.ptrw(), cone_vertices, vertex_data.size());
+		memcpy(vertex_data.ptrw(), cone_vertices, vertex_data.size());
 
 		cone_vertex_buffer = RD::get_singleton()->vertex_buffer_create(vertex_data.size(), vertex_data);
 
 		Vector<uint8_t> index_data;
 		index_data.resize(sizeof(uint32_t) * cone_triangle_count * 3);
-		copymem(index_data.ptrw(), cone_triangle_indices, index_data.size());
+		memcpy(index_data.ptrw(), cone_triangle_indices, index_data.size());
 
 		cone_index_buffer = RD::get_singleton()->index_buffer_create(cone_triangle_count * 3, RD::INDEX_BUFFER_FORMAT_UINT32, index_data);
 
@@ -184,13 +184,13 @@ ClusterBuilderSharedDataRD::ClusterBuilderSharedDataRD() {
 
 		Vector<uint8_t> vertex_data;
 		vertex_data.resize(sizeof(float) * box_vertex_count * 3);
-		copymem(vertex_data.ptrw(), box_vertices, vertex_data.size());
+		memcpy(vertex_data.ptrw(), box_vertices, vertex_data.size());
 
 		box_vertex_buffer = RD::get_singleton()->vertex_buffer_create(vertex_data.size(), vertex_data);
 
 		Vector<uint8_t> index_data;
 		index_data.resize(sizeof(uint32_t) * box_triangle_count * 3);
-		copymem(index_data.ptrw(), box_triangle_indices, index_data.size());
+		memcpy(index_data.ptrw(), box_triangle_indices, index_data.size());
 
 		box_index_buffer = RD::get_singleton()->index_buffer_create(box_triangle_count * 3, RD::INDEX_BUFFER_FORMAT_UINT32, index_data);
 
diff --git a/servers/rendering/renderer_rd/effects_rd.cpp b/servers/rendering/renderer_rd/effects_rd.cpp
index bc304aedd8..563e08fdcb 100644
--- a/servers/rendering/renderer_rd/effects_rd.cpp
+++ b/servers/rendering/renderer_rd/effects_rd.cpp
@@ -226,7 +226,7 @@ RID EffectsRD::_get_compute_uniform_set_from_image_pair(RID p_texture1, RID p_te
 }
 
 void EffectsRD::copy_to_atlas_fb(RID p_source_rd_texture, RID p_dest_framebuffer, const Rect2 &p_uv_rect, RD::DrawListID p_draw_list, bool p_flip_y, bool p_panorama) {
-	zeromem(&copy_to_fb.push_constant, sizeof(CopyToFbPushConstant));
+	memset(&copy_to_fb.push_constant, 0, sizeof(CopyToFbPushConstant));
 
 	copy_to_fb.push_constant.use_section = true;
 	copy_to_fb.push_constant.section[0] = p_uv_rect.position.x;
@@ -247,7 +247,7 @@ void EffectsRD::copy_to_atlas_fb(RID p_source_rd_texture, RID p_dest_framebuffer
 }
 
 void EffectsRD::copy_to_fb_rect(RID p_source_rd_texture, RID p_dest_framebuffer, const Rect2i &p_rect, bool p_flip_y, bool p_force_luminance, bool p_alpha_to_zero, bool p_srgb, RID p_secondary) {
-	zeromem(&copy_to_fb.push_constant, sizeof(CopyToFbPushConstant));
+	memset(&copy_to_fb.push_constant, 0, sizeof(CopyToFbPushConstant));
 
 	if (p_flip_y) {
 		copy_to_fb.push_constant.flip_y = true;
@@ -275,7 +275,7 @@ void EffectsRD::copy_to_fb_rect(RID p_source_rd_texture, RID p_dest_framebuffer,
 }
 
 void EffectsRD::copy_to_rect(RID p_source_rd_texture, RID p_dest_texture, const Rect2i &p_rect, bool p_flip_y, bool p_force_luminance, bool p_all_source, bool p_8_bit_dst, bool p_alpha_to_one) {
-	zeromem(&copy.push_constant, sizeof(CopyPushConstant));
+	memset(&copy.push_constant, 0, sizeof(CopyPushConstant));
 	if (p_flip_y) {
 		copy.push_constant.flags |= COPY_FLAG_FLIP_Y;
 	}
@@ -309,7 +309,7 @@ void EffectsRD::copy_to_rect(RID p_source_rd_texture, RID p_dest_texture, const
 }
 
 void EffectsRD::copy_cubemap_to_panorama(RID p_source_cube, RID p_dest_panorama, const Size2i &p_panorama_size, float p_lod, bool p_is_array) {
-	zeromem(&copy.push_constant, sizeof(CopyPushConstant));
+	memset(&copy.push_constant, 0, sizeof(CopyPushConstant));
 
 	copy.push_constant.section[0] = 0;
 	copy.push_constant.section[1] = 0;
@@ -329,7 +329,7 @@ void EffectsRD::copy_cubemap_to_panorama(RID p_source_cube, RID p_dest_panorama,
 }
 
 void EffectsRD::copy_depth_to_rect_and_linearize(RID p_source_rd_texture, RID p_dest_texture, const Rect2i &p_rect, bool p_flip_y, float p_z_near, float p_z_far) {
-	zeromem(&copy.push_constant, sizeof(CopyPushConstant));
+	memset(&copy.push_constant, 0, sizeof(CopyPushConstant));
 	if (p_flip_y) {
 		copy.push_constant.flags |= COPY_FLAG_FLIP_Y;
 	}
@@ -353,7 +353,7 @@ void EffectsRD::copy_depth_to_rect_and_linearize(RID p_source_rd_texture, RID p_
 }
 
 void EffectsRD::copy_depth_to_rect(RID p_source_rd_texture, RID p_dest_texture, const Rect2i &p_rect, bool p_flip_y) {
-	zeromem(&copy.push_constant, sizeof(CopyPushConstant));
+	memset(&copy.push_constant, 0, sizeof(CopyPushConstant));
 	if (p_flip_y) {
 		copy.push_constant.flags |= COPY_FLAG_FLIP_Y;
 	}
@@ -375,7 +375,7 @@ void EffectsRD::copy_depth_to_rect(RID p_source_rd_texture, RID p_dest_texture,
 }
 
 void EffectsRD::set_color(RID p_dest_texture, const Color &p_color, const Rect2i &p_region, bool p_8bit_dst) {
-	zeromem(&copy.push_constant, sizeof(CopyPushConstant));
+	memset(&copy.push_constant, 0, sizeof(CopyPushConstant));
 
 	copy.push_constant.section[0] = 0;
 	copy.push_constant.section[1] = 0;
@@ -397,7 +397,7 @@ void EffectsRD::set_color(RID p_dest_texture, const Color &p_color, const Rect2i
 }
 
 void EffectsRD::gaussian_blur(RID p_source_rd_texture, RID p_texture, RID p_back_texture, const Rect2i &p_region, bool p_8bit_dst) {
-	zeromem(&copy.push_constant, sizeof(CopyPushConstant));
+	memset(&copy.push_constant, 0, sizeof(CopyPushConstant));
 
 	uint32_t base_flags = 0;
 	copy.push_constant.section[0] = p_region.position.x;
@@ -430,7 +430,7 @@ void EffectsRD::gaussian_blur(RID p_source_rd_texture, RID p_texture, RID p_back
 }
 
 void EffectsRD::gaussian_glow(RID p_source_rd_texture, RID p_back_texture, const Size2i &p_size, float p_strength, bool p_high_quality, bool p_first_pass, float p_luminance_cap, float p_exposure, float p_bloom, float p_hdr_bleed_treshold, float p_hdr_bleed_scale, RID p_auto_exposure, float p_auto_exposure_grey) {
-	zeromem(&copy.push_constant, sizeof(CopyPushConstant));
+	memset(&copy.push_constant, 0, sizeof(CopyPushConstant));
 
 	CopyMode copy_mode = p_first_pass && p_auto_exposure.is_valid() ? COPY_MODE_GAUSSIAN_GLOW_AUTO_EXPOSURE : COPY_MODE_GAUSSIAN_GLOW;
 	uint32_t base_flags = 0;
@@ -657,7 +657,7 @@ void EffectsRD::merge_specular(RID p_dest_framebuffer, RID p_specular, RID p_bas
 }
 
 void EffectsRD::make_mipmap(RID p_source_rd_texture, RID p_dest_texture, const Size2i &p_size) {
-	zeromem(&copy.push_constant, sizeof(CopyPushConstant));
+	memset(&copy.push_constant, 0, sizeof(CopyPushConstant));
 
 	copy.push_constant.section[0] = 0;
 	copy.push_constant.section[1] = 0;
@@ -694,7 +694,7 @@ void EffectsRD::copy_cubemap_to_dp(RID p_source_rd_texture, RID p_dst_framebuffe
 }
 
 void EffectsRD::tonemapper(RID p_source_color, RID p_dst_framebuffer, const TonemapSettings &p_settings) {
-	zeromem(&tonemap.push_constant, sizeof(TonemapPushConstant));
+	memset(&tonemap.push_constant, 0, sizeof(TonemapPushConstant));
 
 	tonemap.push_constant.use_bcs = p_settings.use_bcs;
 	tonemap.push_constant.bcs[0] = p_settings.brightness;
@@ -1294,7 +1294,7 @@ void EffectsRD::roughness_limit(RID p_source_normal, RID p_roughness, const Size
 }
 
 void EffectsRD::cubemap_roughness(RID p_source_rd_texture, RID p_dest_framebuffer, uint32_t p_face_id, uint32_t p_sample_count, float p_roughness, float p_size) {
-	zeromem(&roughness.push_constant, sizeof(CubemapRoughnessPushConstant));
+	memset(&roughness.push_constant, 0, sizeof(CubemapRoughnessPushConstant));
 
 	roughness.push_constant.face_id = p_face_id > 9 ? 0 : p_face_id;
 	roughness.push_constant.roughness = p_roughness;
@@ -1368,7 +1368,7 @@ void EffectsRD::cubemap_filter(RID p_source_cubemap, Vector<RID> p_dest_cubemap,
 void EffectsRD::render_sky(RD::DrawListID p_list, float p_time, RID p_fb, RID p_samplers, RID p_fog, PipelineCacheRD *p_pipeline, RID p_uniform_set, RID p_texture_set, const CameraMatrix &p_camera, const Basis &p_orientation, float p_multiplier, const Vector3 &p_position) {
 	SkyPushConstant sky_push_constant;
 
-	zeromem(&sky_push_constant, sizeof(SkyPushConstant));
+	memset(&sky_push_constant, 0, sizeof(SkyPushConstant));
 
 	sky_push_constant.proj[0] = p_camera.matrix[2][0];
 	sky_push_constant.proj[1] = p_camera.matrix[0][0];
@@ -1510,7 +1510,7 @@ EffectsRD::EffectsRD() {
 		copy_modes.push_back("\n#define MODE_CUBEMAP_ARRAY_TO_PANORAMA\n");
 
 		copy.shader.initialize(copy_modes);
-		zeromem(&copy.push_constant, sizeof(CopyPushConstant));
+		memset(&copy.push_constant, 0, sizeof(CopyPushConstant));
 		copy.shader_version = copy.shader.version_create();
 
 		for (int i = 0; i < COPY_MODE_MAX; i++) {
diff --git a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp
index 9be0dc0d15..2c63eed4f2 100644
--- a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp
+++ b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp
@@ -2756,7 +2756,7 @@ void RenderForwardClustered::geometry_instance_set_lightmap_capture(GeometryInst
 			ginstance->lightmap_sh = geometry_instance_lightmap_sh.alloc();
 		}
 
-		copymem(ginstance->lightmap_sh->sh, p_sh9, sizeof(Color) * 9);
+		memcpy(ginstance->lightmap_sh->sh, p_sh9, sizeof(Color) * 9);
 	} else {
 		if (ginstance->lightmap_sh != nullptr) {
 			geometry_instance_lightmap_sh.free(ginstance->lightmap_sh);
diff --git a/servers/rendering/renderer_rd/renderer_canvas_render_rd.cpp b/servers/rendering/renderer_rd/renderer_canvas_render_rd.cpp
index 3c76c91a67..377b0fd72d 100644
--- a/servers/rendering/renderer_rd/renderer_canvas_render_rd.cpp
+++ b/servers/rendering/renderer_rd/renderer_canvas_render_rd.cpp
@@ -304,7 +304,7 @@ RendererCanvasRender::PolygonID RendererCanvasRenderRD::request_polygon(const Ve
 		index_buffer.resize(p_indices.size() * sizeof(int32_t));
 		{
 			uint8_t *w = index_buffer.ptrw();
-			copymem(w, p_indices.ptr(), sizeof(int32_t) * p_indices.size());
+			memcpy(w, p_indices.ptr(), sizeof(int32_t) * p_indices.size());
 		}
 		pb.index_buffer = RD::get_singleton()->index_buffer_create(p_indices.size(), RD::INDEX_BUFFER_FORMAT_UINT32, index_buffer);
 		pb.indices = RD::get_singleton()->index_array_create(pb.index_buffer, 0, p_indices.size());
diff --git a/servers/rendering/renderer_rd/renderer_scene_gi_rd.cpp b/servers/rendering/renderer_rd/renderer_scene_gi_rd.cpp
index 3856f38457..2b0e93f734 100644
--- a/servers/rendering/renderer_rd/renderer_scene_gi_rd.cpp
+++ b/servers/rendering/renderer_rd/renderer_scene_gi_rd.cpp
@@ -1564,7 +1564,7 @@ void RendererSceneGIRD::SDFGI::render_region(RID p_render_buffers, int p_region,
 		//clear dispatch indirect data
 
 		SDFGIShader::PreprocessPushConstant push_constant;
-		zeromem(&push_constant, sizeof(SDFGIShader::PreprocessPushConstant));
+		memset(&push_constant, 0, sizeof(SDFGIShader::PreprocessPushConstant));
 
 		RENDER_TIMESTAMP("Scroll SDF");
 
@@ -2602,7 +2602,7 @@ void RendererSceneGIRD::GIProbeInstance::update(bool p_update_light_instances, c
 				p_scene_render->_render_material(to_world_xform * xform, cm, true, p_scene_render->cull_argument, dynamic_maps[0].fb, Rect2i(Vector2i(), rect.size));
 
 				GIProbeDynamicPushConstant push_constant;
-				zeromem(&push_constant, sizeof(GIProbeDynamicPushConstant));
+				memset(&push_constant, 0, sizeof(GIProbeDynamicPushConstant));
 				push_constant.limits[0] = octree_size.x;
 				push_constant.limits[1] = octree_size.y;
 				push_constant.limits[2] = octree_size.z;
diff --git a/servers/rendering/renderer_rd/renderer_scene_render_rd.cpp b/servers/rendering/renderer_rd/renderer_scene_render_rd.cpp
index a98f67f163..a742c4cc28 100644
--- a/servers/rendering/renderer_rd/renderer_scene_render_rd.cpp
+++ b/servers/rendering/renderer_rd/renderer_scene_render_rd.cpp
@@ -1873,7 +1873,7 @@ void RendererSceneRenderRD::_render_buffers_post_process_and_tonemap(RID p_rende
 	storage->render_target_disable_clear_request(rb->render_target);
 }
 
-void RendererSceneRenderRD::_render_buffers_debug_draw(RID p_render_buffers, RID p_shadow_atlas) {
+void RendererSceneRenderRD::_render_buffers_debug_draw(RID p_render_buffers, RID p_shadow_atlas, RID p_occlusion_buffer) {
 	EffectsRD *effects = storage->get_effects();
 
 	RenderBuffers *rb = render_buffers_owner.getornull(p_render_buffers);
@@ -1932,6 +1932,13 @@ void RendererSceneRenderRD::_render_buffers_debug_draw(RID p_render_buffers, RID
 		RID reflection_texture = rb->reflection_buffer;
 		effects->copy_to_fb_rect(ambient_texture, storage->render_target_get_rd_framebuffer(rb->render_target), Rect2(Vector2(), rtsize), false, false, false, true, reflection_texture);
 	}
+
+	if (debug_draw == RS::VIEWPORT_DEBUG_DRAW_OCCLUDERS) { // Occluder debug view: copy the occlusion buffer texture to the render target's framebuffer.
+		if (p_occlusion_buffer.is_valid()) { // Nothing is drawn when no valid occlusion buffer RID was passed in.
+			Size2 rtsize = storage->render_target_get_size(rb->render_target);
+			effects->copy_to_fb_rect(storage->texture_get_rd_texture(p_occlusion_buffer), storage->render_target_get_rd_framebuffer(rb->render_target), Rect2i(Vector2(), rtsize), true, false);
+		}
+	}
 }
 
 void RendererSceneRenderRD::environment_set_adjustment(RID p_env, bool p_enable, float p_brightness, float p_contrast, float p_saturation, bool p_use_1d_color_correction, RID p_color_correction) {
@@ -3516,7 +3523,7 @@ void RendererSceneRenderRD::_pre_opaque_render(bool p_use_ssao, bool p_use_gi, R
 	}
 }
 
-void RendererSceneRenderRD::render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data) {
+void RendererSceneRenderRD::render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_occluder_debug_tex, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data) {
 	// getting this here now so we can direct call a bunch of things more easily
 	RenderBuffers *rb = nullptr;
 	if (p_render_buffers.is_valid()) {
@@ -3643,7 +3650,7 @@ void RendererSceneRenderRD::render_scene(RID p_render_buffers, const Transform &
 		RENDER_TIMESTAMP("Tonemap");
 
 		_render_buffers_post_process_and_tonemap(p_render_buffers, p_environment, p_camera_effects, p_cam_projection);
-		_render_buffers_debug_draw(p_render_buffers, p_shadow_atlas);
+		_render_buffers_debug_draw(p_render_buffers, p_shadow_atlas, p_occluder_debug_tex);
 		if (debug_draw == RS::VIEWPORT_DEBUG_DRAW_SDFGI && rb != nullptr && rb->sdfgi != nullptr) {
 			rb->sdfgi->debug_draw(p_cam_projection, p_cam_transform, rb->width, rb->height, rb->render_target, rb->texture);
 		}
diff --git a/servers/rendering/renderer_rd/renderer_scene_render_rd.h b/servers/rendering/renderer_rd/renderer_scene_render_rd.h
index 4bf0818206..8c01b69b91 100644
--- a/servers/rendering/renderer_rd/renderer_scene_render_rd.h
+++ b/servers/rendering/renderer_rd/renderer_scene_render_rd.h
@@ -441,7 +441,7 @@ private:
 	void _allocate_blur_textures(RenderBuffers *rb);
 	void _allocate_luminance_textures(RenderBuffers *rb);
 
-	void _render_buffers_debug_draw(RID p_render_buffers, RID p_shadow_atlas);
+	void _render_buffers_debug_draw(RID p_render_buffers, RID p_shadow_atlas, RID p_occlusion_buffer);
 	void _render_buffers_post_process_and_tonemap(RID p_render_buffers, RID p_environment, RID p_camera_effects, const CameraMatrix &p_projection);
 
 	/* Cluster */
@@ -1125,7 +1125,7 @@ public:
 	float render_buffers_get_volumetric_fog_end(RID p_render_buffers);
 	float render_buffers_get_volumetric_fog_detail_spread(RID p_render_buffers);
 
-	void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr);
+	void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_occluder_debug_tex, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr);
 
 	void render_material(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, RID p_framebuffer, const Rect2i &p_region);
 
diff --git a/servers/rendering/renderer_rd/renderer_storage_rd.cpp b/servers/rendering/renderer_rd/renderer_storage_rd.cpp
index 540b32481b..250b694f74 100644
--- a/servers/rendering/renderer_rd/renderer_storage_rd.cpp
+++ b/servers/rendering/renderer_rd/renderer_storage_rd.cpp
@@ -756,7 +756,7 @@ void RendererStorageRD::texture_3d_initialize(RID p_texture, Image::Format p_for
 		for (int i = 0; i < p_data.size(); i++) {
 			uint32_t s = images[i]->get_data().size();
 
-			copymem(&all_data.write[offset], images[i]->get_data().ptr(), s);
+			memcpy(&all_data.write[offset], images[i]->get_data().ptr(), s);
 			{
 				Texture::BufferSlice3D slice;
 				slice.size.width = images[i]->get_width();
@@ -919,7 +919,7 @@ void RendererStorageRD::texture_3d_update(RID p_texture, const Vector<Ref<Image>
 
 		for (int i = 0; i < p_data.size(); i++) {
 			uint32_t s = images[i]->get_data().size();
-			copymem(&all_data.write[offset], images[i]->get_data().ptr(), s);
+			memcpy(&all_data.write[offset], images[i]->get_data().ptr(), s);
 			offset += s;
 		}
 	}
@@ -2108,13 +2108,13 @@ _FORCE_INLINE_ static void _fill_std140_ubo_empty(ShaderLanguage::DataType type,
 		case ShaderLanguage::TYPE_INT:
 		case ShaderLanguage::TYPE_UINT:
 		case ShaderLanguage::TYPE_FLOAT: {
-			zeromem(data, 4);
+			memset(data, 0, 4);
 		} break;
 		case ShaderLanguage::TYPE_BVEC2:
 		case ShaderLanguage::TYPE_IVEC2:
 		case ShaderLanguage::TYPE_UVEC2:
 		case ShaderLanguage::TYPE_VEC2: {
-			zeromem(data, 8);
+			memset(data, 0, 8);
 		} break;
 		case ShaderLanguage::TYPE_BVEC3:
 		case ShaderLanguage::TYPE_IVEC3:
@@ -2124,16 +2124,16 @@ _FORCE_INLINE_ static void _fill_std140_ubo_empty(ShaderLanguage::DataType type,
 		case ShaderLanguage::TYPE_IVEC4:
 		case ShaderLanguage::TYPE_UVEC4:
 		case ShaderLanguage::TYPE_VEC4: {
-			zeromem(data, 16);
+			memset(data, 0, 16);
 		} break;
 		case ShaderLanguage::TYPE_MAT2: {
-			zeromem(data, 32);
+			memset(data, 0, 32);
 		} break;
 		case ShaderLanguage::TYPE_MAT3: {
-			zeromem(data, 48);
+			memset(data, 0, 48);
 		} break;
 		case ShaderLanguage::TYPE_MAT4: {
-			zeromem(data, 64);
+			memset(data, 0, 64);
 		} break;
 
 		default: {
@@ -3412,10 +3412,10 @@ void RendererStorageRD::_multimesh_make_local(MultiMesh *multimesh) const {
 			Vector<uint8_t> buffer = RD::get_singleton()->buffer_get_data(multimesh->buffer);
 			{
 				const uint8_t *r = buffer.ptr();
-				copymem(w, r, buffer.size());
+				memcpy(w, r, buffer.size());
 			}
 		} else {
-			zeromem(w, multimesh->instances * multimesh->stride_cache * sizeof(float));
+			memset(w, 0, multimesh->instances * multimesh->stride_cache * sizeof(float));
 		}
 	}
 	uint32_t data_cache_dirty_region_count = (multimesh->instances - 1) / MULTIMESH_DIRTY_REGION_SIZE + 1;
@@ -3771,7 +3771,7 @@ Vector<float> RendererStorageRD::multimesh_get_buffer(RID p_multimesh) const {
 		{
 			float *w = ret.ptrw();
 			const uint8_t *r = buffer.ptr();
-			copymem(w, r, buffer.size());
+			memcpy(w, r, buffer.size());
 		}
 
 		return ret;
@@ -4068,7 +4068,7 @@ void RendererStorageRD::_particles_allocate_emission_buffer(Particles *particles
 	ERR_FAIL_COND(particles->emission_buffer != nullptr);
 
 	particles->emission_buffer_data.resize(sizeof(ParticleEmissionBuffer::Data) * particles->amount + sizeof(uint32_t) * 4);
-	zeromem(particles->emission_buffer_data.ptrw(), particles->emission_buffer_data.size());
+	memset(particles->emission_buffer_data.ptrw(), 0, particles->emission_buffer_data.size());
 	particles->emission_buffer = (ParticleEmissionBuffer *)particles->emission_buffer_data.ptrw();
 	particles->emission_buffer->particle_max = particles->amount;
 
@@ -5230,7 +5230,7 @@ void RendererStorageRD::skeleton_allocate_data(RID p_skeleton, int p_bones, bool
 	if (skeleton->size) {
 		skeleton->data.resize(skeleton->size * (skeleton->use_2d ? 8 : 12));
 		skeleton->buffer = RD::get_singleton()->storage_buffer_create(skeleton->data.size() * sizeof(float));
-		zeromem(skeleton->data.ptrw(), skeleton->data.size() * sizeof(float));
+		memset(skeleton->data.ptrw(), 0, skeleton->data.size() * sizeof(float));
 
 		_skeleton_make_dirty(skeleton);
 
@@ -6872,7 +6872,7 @@ RID RendererStorageRD::render_target_get_sdf_texture(RID p_render_target) {
 
 		Vector<uint8_t> pv;
 		pv.resize(16 * 4);
-		zeromem(pv.ptrw(), 16 * 4);
+		memset(pv.ptrw(), 0, 16 * 4);
 		Vector<Vector<uint8_t>> vpv;
 
 		rt->sdf_buffer_read = RD::get_singleton()->texture_create(tformat, RD::TextureView(), vpv);
@@ -7359,7 +7359,7 @@ void RendererStorageRD::_update_decal_atlas() {
 			v_offsetsv.resize(base_size);
 
 			int *v_offsets = v_offsetsv.ptrw();
-			zeromem(v_offsets, sizeof(int) * base_size);
+			memset(v_offsets, 0, sizeof(int) * base_size);
 
 			int max_height = 0;
 
@@ -8115,7 +8115,7 @@ void RendererStorageRD::_update_global_variables() {
 		if (total_regions / global_variables.buffer_dirty_region_count <= 4) {
 			// 25% of regions dirty, just update all buffer
 			RD::get_singleton()->buffer_update(global_variables.buffer, 0, sizeof(GlobalVariables::Value) * global_variables.buffer_size, global_variables.buffer_values);
-			zeromem(global_variables.buffer_dirty_regions, sizeof(bool) * total_regions);
+			memset(global_variables.buffer_dirty_regions, 0, sizeof(bool) * total_regions);
 		} else {
 			uint32_t region_byte_size = sizeof(GlobalVariables::Value) * GlobalVariables::BUFFER_DIRTY_REGION_SIZE;
 
@@ -8403,10 +8403,10 @@ RendererStorageRD::RendererStorageRD() {
 	global_variables.buffer_size = GLOBAL_GET("rendering/limits/global_shader_variables/buffer_size");
 	global_variables.buffer_size = MAX(4096, global_variables.buffer_size);
 	global_variables.buffer_values = memnew_arr(GlobalVariables::Value, global_variables.buffer_size);
-	zeromem(global_variables.buffer_values, sizeof(GlobalVariables::Value) * global_variables.buffer_size);
+	memset(global_variables.buffer_values, 0, sizeof(GlobalVariables::Value) * global_variables.buffer_size);
 	global_variables.buffer_usage = memnew_arr(GlobalVariables::ValueUsage, global_variables.buffer_size);
 	global_variables.buffer_dirty_regions = memnew_arr(bool, global_variables.buffer_size / GlobalVariables::BUFFER_DIRTY_REGION_SIZE);
-	zeromem(global_variables.buffer_dirty_regions, sizeof(bool) * global_variables.buffer_size / GlobalVariables::BUFFER_DIRTY_REGION_SIZE);
+	memset(global_variables.buffer_dirty_regions, 0, sizeof(bool) * global_variables.buffer_size / GlobalVariables::BUFFER_DIRTY_REGION_SIZE);
 	global_variables.buffer = RD::get_singleton()->storage_buffer_create(sizeof(GlobalVariables::Value) * global_variables.buffer_size);
 
 	material_update_list = nullptr;
diff --git a/servers/rendering/renderer_scene.h b/servers/rendering/renderer_scene.h
index 551d4f4240..db1e3d1377 100644
--- a/servers/rendering/renderer_scene.h
+++ b/servers/rendering/renderer_scene.h
@@ -49,6 +49,10 @@ public:
 	virtual void camera_set_use_vertical_aspect(RID p_camera, bool p_enable) = 0;
 	virtual bool is_camera(RID p_camera) const = 0;
 
+	virtual RID occluder_allocate() = 0;
+	virtual void occluder_initialize(RID p_occluder) = 0;
+	virtual void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) = 0;
+
 	virtual RID scenario_allocate() = 0;
 	virtual void scenario_initialize(RID p_rid) = 0;
 
@@ -197,8 +201,8 @@ public:
 	virtual void sdfgi_set_debug_probe_select(const Vector3 &p_position, const Vector3 &p_dir) = 0;
 
 	virtual void render_empty_scene(RID p_render_buffers, RID p_scenario, RID p_shadow_atlas) = 0;
-	virtual void render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_lod_threshold, RID p_shadow_atlas) = 0;
-	virtual void render_camera(RID p_render_buffers, Ref<XRInterface> &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_lod_threshold, RID p_shadow_atlas) = 0;
+	virtual void render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_lod_threshold, RID p_shadow_atlas) = 0;
+	virtual void render_camera(RID p_render_buffers, Ref<XRInterface> &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_lod_threshold, RID p_shadow_atlas) = 0;
 
 	virtual void update() = 0;
 	virtual void render_probes() = 0;
diff --git a/servers/rendering/renderer_scene_cull.cpp b/servers/rendering/renderer_scene_cull.cpp
index c7caf71fcc..5417695ad3 100644
--- a/servers/rendering/renderer_scene_cull.cpp
+++ b/servers/rendering/renderer_scene_cull.cpp
@@ -109,6 +109,20 @@ bool RendererSceneCull::is_camera(RID p_camera) const {
 	return camera_owner.owns(p_camera);
 }
 
+/* OCCLUDER API */
+
+RID RendererSceneCull::occluder_allocate() { // Delegates occluder RID allocation to the RendererSceneOcclusionCull singleton.
+	return RendererSceneOcclusionCull::get_singleton()->occluder_allocate();
+}
+
+void RendererSceneCull::occluder_initialize(RID p_occluder) { // Forwards initialization of the occluder RID to the RendererSceneOcclusionCull singleton.
+	RendererSceneOcclusionCull::get_singleton()->occluder_initialize(p_occluder);
+}
+
+void RendererSceneCull::occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) { // Forwards the vertex/index arrays unchanged to the RendererSceneOcclusionCull singleton.
+	RendererSceneOcclusionCull::get_singleton()->occluder_set_mesh(p_occluder, p_vertices, p_indices);
+}
+
 /* SCENARIO API */
 
 void RendererSceneCull::_instance_pair(Instance *p_A, Instance *p_B) {
@@ -310,6 +324,8 @@ void RendererSceneCull::scenario_initialize(RID p_rid) {
 	scenario->instance_aabbs.set_page_pool(&instance_aabb_page_pool);
 	scenario->instance_data.set_page_pool(&instance_data_page_pool);
 
+	RendererSceneOcclusionCull::get_singleton()->add_scenario(p_rid);
+
 	scenario_owner.initialize_rid(p_rid, scenario);
 }
 
@@ -497,6 +513,11 @@ void RendererSceneCull::instance_set_base(RID p_instance, RID p_base) {
 				scene_render->free(gi_probe->probe_instance);
 
 			} break;
+			case RS::INSTANCE_OCCLUDER: {
+				if (scenario && instance->visible) {
+					RendererSceneOcclusionCull::get_singleton()->scenario_remove_instance(instance->scenario->self, p_instance);
+				}
+			} break;
 			default: {
 			}
 		}
@@ -514,6 +535,11 @@ void RendererSceneCull::instance_set_base(RID p_instance, RID p_base) {
 
 	if (p_base.is_valid()) {
 		instance->base_type = RSG::storage->get_base_type(p_base);
+
+		if (instance->base_type == RS::INSTANCE_NONE && RendererSceneOcclusionCull::get_singleton()->is_occluder(p_base)) { // get_base_type() returned INSTANCE_NONE; ask the occlusion culler whether the base is an occluder RID.
+			instance->base_type = RS::INSTANCE_OCCLUDER;
+		}
+
 		ERR_FAIL_COND(instance->base_type == RS::INSTANCE_NONE);
 
 		switch (instance->base_type) {
@@ -588,6 +614,11 @@ void RendererSceneCull::instance_set_base(RID p_instance, RID p_base) {
 				gi_probe->probe_instance = scene_render->gi_probe_instance_create(p_base);
 
 			} break;
+			case RS::INSTANCE_OCCLUDER: {
+				if (scenario) {
+					RendererSceneOcclusionCull::get_singleton()->scenario_set_instance(scenario->self, p_instance, p_base, instance->transform, instance->visible);
+				}
+			} break;
 			default: {
 			}
 		}
@@ -655,6 +686,11 @@ void RendererSceneCull::instance_set_scenario(RID p_instance, RID p_scenario) {
 					gi_probe_update_list.remove(&gi_probe->update_element);
 				}
 			} break;
+			case RS::INSTANCE_OCCLUDER: {
+				if (instance->visible) {
+					RendererSceneOcclusionCull::get_singleton()->scenario_remove_instance(instance->scenario->self, p_instance);
+				}
+			} break;
 			default: {
 			}
 		}
@@ -684,6 +720,9 @@ void RendererSceneCull::instance_set_scenario(RID p_instance, RID p_scenario) {
 					gi_probe_update_list.add(&gi_probe->update_element);
 				}
 			} break;
+			case RS::INSTANCE_OCCLUDER: {
+				RendererSceneOcclusionCull::get_singleton()->scenario_set_instance(scenario->self, p_instance, instance->base, instance->transform, instance->visible);
+			} break;
 			default: {
 			}
 		}
@@ -801,6 +840,12 @@ void RendererSceneCull::instance_set_visible(RID p_instance, bool p_visible) {
 		InstanceParticlesCollisionData *collision = static_cast<InstanceParticlesCollisionData *>(instance->base_data);
 		RSG::storage->particles_collision_instance_set_active(collision->instance, p_visible);
 	}
+
+	if (instance->base_type == RS::INSTANCE_OCCLUDER) {
+		if (instance->scenario) {
+			RendererSceneOcclusionCull::get_singleton()->scenario_set_instance(instance->scenario->self, p_instance, instance->base, instance->transform, p_visible);
+		}
+	}
 }
 
 inline bool is_geometry_instance(RenderingServer::InstanceType p_type) {
@@ -998,6 +1043,18 @@ void RendererSceneCull::instance_geometry_set_flag(RID p_instance, RS::InstanceF
 			}
 
 		} break;
+		case RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING: {
+			instance->ignore_occlusion_culling = p_enabled; // Stored on the instance; _update_instance() also reads it when it builds the flag word.
+
+			if (instance->scenario && instance->array_index >= 0) { // Also update the scenario's instance_data entry when the instance currently has one.
+				InstanceData &idata = instance->scenario->instance_data[instance->array_index];
+				if (instance->ignore_occlusion_culling) {
+					idata.flags |= InstanceData::FLAG_IGNORE_OCCLUSION_CULLING;
+				} else {
+					idata.flags &= ~uint32_t(InstanceData::FLAG_IGNORE_OCCLUSION_CULLING); // Clear only this bit; the remaining flags are kept.
+				}
+			}
+		} break;
 		default: {
 		}
 	}
@@ -1210,6 +1267,10 @@ void RendererSceneCull::_update_instance(Instance *p_instance) {
 			heightfield_particle_colliders_update_list.insert(p_instance);
 		}
 		RSG::storage->particles_collision_instance_set_transform(collision->instance, p_instance->transform);
+	} else if (p_instance->base_type == RS::INSTANCE_OCCLUDER) {
+		if (p_instance->scenario) {
+			RendererSceneOcclusionCull::get_singleton()->scenario_set_instance(p_instance->scenario->self, p_instance->self, p_instance->base, p_instance->transform, p_instance->visible);
+		}
 	}
 
 	if (p_instance->aabb.has_no_surface()) {
@@ -1337,6 +1398,9 @@ void RendererSceneCull::_update_instance(Instance *p_instance) {
 		if (p_instance->mesh_instance.is_valid()) {
 			idata.flags |= InstanceData::FLAG_USES_MESH_INSTANCE;
 		}
+		if (p_instance->ignore_occlusion_culling) {
+			idata.flags |= InstanceData::FLAG_IGNORE_OCCLUSION_CULLING;
+		}
 
 		p_instance->scenario->instance_data.push_back(idata);
 		p_instance->scenario->instance_aabbs.push_back(InstanceBounds(p_instance->transformed_aabb));
@@ -2119,7 +2183,7 @@ bool RendererSceneCull::_light_instance_update_shadow(Instance *p_instance, cons
 	return animated_material_found;
 }
 
-void RendererSceneCull::render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas) {
+void RendererSceneCull::render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas) {
 // render to mono camera
 #ifndef _3D_DISABLED
 
@@ -2164,11 +2228,14 @@ void RendererSceneCull::render_camera(RID p_render_buffers, RID p_camera, RID p_
 
 	RID environment = _render_get_environment(p_camera, p_scenario);
 
-	_render_scene(camera->transform, camera_matrix, ortho, camera->vaspect, p_render_buffers, environment, camera->effects, camera->visible_layers, p_scenario, p_shadow_atlas, RID(), -1, p_screen_lod_threshold);
+	RENDER_TIMESTAMP("Update occlusion buffer");
+	RendererSceneOcclusionCull::get_singleton()->buffer_update(p_viewport, camera->transform, camera_matrix, ortho, RendererThreadPool::singleton->thread_work_pool); // Update this viewport's occlusion buffer before _render_scene() fetches it for culling.
+
+	_render_scene(camera->transform, camera_matrix, ortho, camera->vaspect, p_render_buffers, environment, camera->effects, camera->visible_layers, p_scenario, p_viewport, p_shadow_atlas, RID(), -1, p_screen_lod_threshold);
 #endif
 }
 
-void RendererSceneCull::render_camera(RID p_render_buffers, Ref<XRInterface> &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas) {
+void RendererSceneCull::render_camera(RID p_render_buffers, Ref<XRInterface> &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas) {
 	// render for AR/VR interface
 #if 0
 	Camera *camera = camera_owner.getornull(p_camera);
@@ -2253,7 +2320,7 @@ void RendererSceneCull::render_camera(RID p_render_buffers, Ref<XRInterface> &p_
 #endif
 };
 
-void RendererSceneCull::_frustum_cull_threaded(uint32_t p_thread, FrustumCullData *cull_data) {
+void RendererSceneCull::_frustum_cull_threaded(uint32_t p_thread, CullData *cull_data) {
 	uint32_t cull_total = cull_data->scenario->instance_data.size();
 	uint32_t total_threads = RendererThreadPool::singleton->thread_work_pool.get_thread_count();
 	uint32_t cull_from = p_thread * cull_total / total_threads;
@@ -2262,7 +2329,7 @@ void RendererSceneCull::_frustum_cull_threaded(uint32_t p_thread, FrustumCullDat
 	_frustum_cull(*cull_data, frustum_cull_result_threads[p_thread], cull_from, cull_to);
 }
 
-void RendererSceneCull::_frustum_cull(FrustumCullData &cull_data, FrustumCullResult &cull_result, uint64_t p_from, uint64_t p_to) {
+void RendererSceneCull::_frustum_cull(CullData &cull_data, FrustumCullResult &cull_result, uint64_t p_from, uint64_t p_to) {
 	uint64_t frame_number = RSG::rasterizer->get_frame_number();
 	float lightmap_probe_update_speed = RSG::storage->lightmap_get_probe_capture_update_speed() * RSG::rasterizer->get_frame_delta_time();
 
@@ -2271,10 +2338,14 @@ void RendererSceneCull::_frustum_cull(FrustumCullData &cull_data, FrustumCullRes
 
 	RID instance_pair_buffer[MAX_INSTANCE_PAIRS];
 
+	Transform inv_cam_transform = cull_data.cam_transform.inverse(); // Computed once here and passed to every is_occluded() call in the loop.
+	float z_near = cull_data.camera_matrix->get_z_near(); // Dereferences camera_matrix unconditionally, so the caller must have set it.
+
 	for (uint64_t i = p_from; i < p_to; i++) {
 		bool mesh_visible = false;
 
-		if (cull_data.scenario->instance_aabbs[i].in_frustum(cull_data.cull->frustum)) {
+		if (cull_data.scenario->instance_aabbs[i].in_frustum(cull_data.cull->frustum) && (cull_data.occlusion_buffer == nullptr || cull_data.scenario->instance_data[i].flags & InstanceData::FLAG_IGNORE_OCCLUSION_CULLING ||
+																								 !cull_data.occlusion_buffer->is_occluded(cull_data.scenario->instance_aabbs[i].bounds, cull_data.cam_transform.origin, inv_cam_transform, *cull_data.camera_matrix, z_near))) {
 			InstanceData &idata = cull_data.scenario->instance_data[i];
 			uint32_t base_type = idata.flags & InstanceData::FLAG_BASE_TYPE_MASK;
 
@@ -2469,7 +2540,7 @@ void RendererSceneCull::_frustum_cull(FrustumCullData &cull_data, FrustumCullRes
 	}
 }
 
-void RendererSceneCull::_render_scene(const Transform p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID p_environment, RID p_force_camera_effects, uint32_t p_visible_layers, RID p_scenario, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, bool p_using_shadows) {
+void RendererSceneCull::_render_scene(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID p_environment, RID p_force_camera_effects, uint32_t p_visible_layers, RID p_scenario, RID p_viewport, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, bool p_using_shadows) {
 	// Note, in stereo rendering:
 	// - p_cam_transform will be a transform in the middle of our two eyes
 	// - p_cam_projection is a wider frustrum that encompasses both eyes
@@ -2566,7 +2637,7 @@ void RendererSceneCull::_render_scene(const Transform p_cam_transform, const Cam
 		uint64_t cull_from = 0;
 		uint64_t cull_to = scenario->instance_data.size();
 
-		FrustumCullData cull_data;
+		CullData cull_data;
 
 		//prepare for eventual thread usage
 		cull_data.cull = &cull;
@@ -2575,6 +2646,8 @@ void RendererSceneCull::_render_scene(const Transform p_cam_transform, const Cam
 		cull_data.cam_transform = p_cam_transform;
 		cull_data.visible_layers = p_visible_layers;
 		cull_data.render_reflection_probe = render_reflection_probe;
+		cull_data.occlusion_buffer = RendererSceneOcclusionCull::get_singleton()->buffer_get_ptr(p_viewport);
+		cull_data.camera_matrix = &p_cam_projection;
 //#define DEBUG_CULL_TIME
 #ifdef DEBUG_CULL_TIME
 		uint64_t time_from = OS::get_singleton()->get_ticks_usec();
@@ -2781,8 +2854,13 @@ void RendererSceneCull::_render_scene(const Transform p_cam_transform, const Cam
 	}
 	/* PROCESS GEOMETRY AND DRAW SCENE */
 
+	RID occluders_tex; // Stays default-constructed when no valid viewport RID is given (e.g. the reflection probe path passes RID()).
+	if (p_viewport.is_valid()) {
+		occluders_tex = RSG::viewport->viewport_get_occluder_debug_texture(p_viewport);
+	}
+
 	RENDER_TIMESTAMP("Render Scene ");
-	scene_render->render_scene(p_render_buffers, p_cam_transform, p_cam_projection, p_cam_orthogonal, frustum_cull_result.geometry_instances, frustum_cull_result.light_instances, frustum_cull_result.reflections, frustum_cull_result.gi_probes, frustum_cull_result.decals, frustum_cull_result.lightmaps, p_environment, camera_effects, p_shadow_atlas, p_reflection_probe.is_valid() ? RID() : scenario->reflection_atlas, p_reflection_probe, p_reflection_probe_pass, p_screen_lod_threshold, render_shadow_data, max_shadows_used, render_sdfgi_data, cull.sdfgi.region_count, &sdfgi_update_data);
+	scene_render->render_scene(p_render_buffers, p_cam_transform, p_cam_projection, p_cam_orthogonal, frustum_cull_result.geometry_instances, frustum_cull_result.light_instances, frustum_cull_result.reflections, frustum_cull_result.gi_probes, frustum_cull_result.decals, frustum_cull_result.lightmaps, p_environment, camera_effects, p_shadow_atlas, occluders_tex, p_reflection_probe.is_valid() ? RID() : scenario->reflection_atlas, p_reflection_probe, p_reflection_probe_pass, p_screen_lod_threshold, render_shadow_data, max_shadows_used, render_sdfgi_data, cull.sdfgi.region_count, &sdfgi_update_data);
 
 	for (uint32_t i = 0; i < max_shadows_used; i++) {
 		render_shadow_data[i].instances.clear();
@@ -2829,7 +2907,7 @@ void RendererSceneCull::render_empty_scene(RID p_render_buffers, RID p_scenario,
 		environment = scenario->fallback_environment;
 	}
 	RENDER_TIMESTAMP("Render Empty Scene ");
-	scene_render->render_scene(p_render_buffers, Transform(), CameraMatrix(), true, PagedArray<RendererSceneRender::GeometryInstance *>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), RID(), RID(), p_shadow_atlas, scenario->reflection_atlas, RID(), 0, 0, nullptr, 0, nullptr, 0, nullptr);
+	scene_render->render_scene(p_render_buffers, Transform(), CameraMatrix(), true, PagedArray<RendererSceneRender::GeometryInstance *>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), RID(), RID(), p_shadow_atlas, RID(), scenario->reflection_atlas, RID(), 0, 0, nullptr, 0, nullptr, 0, nullptr);
 #endif
 }
 
@@ -2899,7 +2977,7 @@ bool RendererSceneCull::_render_reflection_probe_step(Instance *p_instance, int
 		}
 
 		RENDER_TIMESTAMP("Render Reflection Probe, Step " + itos(p_step));
-		_render_scene(xform, cm, false, false, RID(), environment, RID(), RSG::storage->reflection_probe_get_cull_mask(p_instance->base), p_instance->scenario->self, shadow_atlas, reflection_probe->instance, p_step, lod_threshold, use_shadows);
+		_render_scene(xform, cm, false, false, RID(), environment, RID(), RSG::storage->reflection_probe_get_cull_mask(p_instance->base), p_instance->scenario->self, RID(), shadow_atlas, reflection_probe->instance, p_step, lod_threshold, use_shadows);
 
 	} else {
 		//do roughness postprocess step until it believes it's done
@@ -3473,8 +3551,11 @@ bool RendererSceneCull::free(RID p_rid) {
 		scene_render->free(scenario->reflection_probe_shadow_atlas);
 		scene_render->free(scenario->reflection_atlas);
 		scenario_owner.free(p_rid);
+		RendererSceneOcclusionCull::get_singleton()->remove_scenario(p_rid);
 		memdelete(scenario);
 
+	} else if (RendererSceneOcclusionCull::get_singleton()->is_occluder(p_rid)) {
+		RendererSceneOcclusionCull::get_singleton()->free_occluder(p_rid);
 	} else if (instance_owner.owns(p_rid)) {
 		// delete the instance
 
@@ -3543,6 +3624,8 @@ RendererSceneCull::RendererSceneCull() {
 	indexer_update_iterations = GLOBAL_GET("rendering/limits/spatial_indexer/update_iterations_per_frame");
 	thread_cull_threshold = GLOBAL_GET("rendering/limits/spatial_indexer/threaded_cull_minimum_instances");
 	thread_cull_threshold = MAX(thread_cull_threshold, (uint32_t)RendererThreadPool::singleton->thread_work_pool.get_thread_count()); //make sure there is at least one thread per CPU
+
+	dummy_occlusion_culling = memnew(RendererSceneOcclusionCull);
 }
 
 RendererSceneCull::~RendererSceneCull() {
@@ -3561,4 +3644,8 @@ RendererSceneCull::~RendererSceneCull() {
 		frustum_cull_result_threads[i].reset();
 	}
 	frustum_cull_result_threads.clear();
+
+	if (dummy_occlusion_culling) {
+		memdelete(dummy_occlusion_culling);
+	}
 }
diff --git a/servers/rendering/renderer_scene_cull.h b/servers/rendering/renderer_scene_cull.h
index d7d59665ec..36eece9dd8 100644
--- a/servers/rendering/renderer_scene_cull.h
+++ b/servers/rendering/renderer_scene_cull.h
@@ -45,8 +45,10 @@
 #include "core/templates/rid_owner.h"
 #include "core/templates/self_list.h"
 #include "servers/rendering/renderer_scene.h"
+#include "servers/rendering/renderer_scene_occlusion_cull.h"
 #include "servers/rendering/renderer_scene_render.h"
 #include "servers/xr/xr_interface.h"
+
 class RendererSceneCull : public RendererScene {
 public:
 	RendererSceneRender *scene_render;
@@ -109,6 +111,14 @@ public:
 	virtual void camera_set_use_vertical_aspect(RID p_camera, bool p_enable);
 	virtual bool is_camera(RID p_camera) const;
 
+	/* OCCLUDER API */
+
+	virtual RID occluder_allocate();
+	virtual void occluder_initialize(RID p_occluder);
+	virtual void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices);
+
+	RendererSceneOcclusionCull *dummy_occlusion_culling = nullptr; // Allocated with memnew() in the constructor and released in the destructor.
+
 	/* SCENARIO API */
 
 	struct Instance;
@@ -248,6 +258,7 @@ public:
 			FLAG_USES_BAKED_LIGHT = (1 << 16),
 			FLAG_USES_MESH_INSTANCE = (1 << 17),
 			FLAG_REFLECTION_PROBE_DIRTY = (1 << 18),
+			FLAG_IGNORE_OCCLUSION_CULLING = (1 << 19),
 		};
 
 		uint32_t flags = 0;
@@ -346,6 +357,8 @@ public:
 
 		float lod_bias;
 
+		bool ignore_occlusion_culling;
+
 		Vector<RID> materials;
 
 		RS::ShadowCastingSetting cast_shadows;
@@ -470,6 +483,7 @@ public:
 			lightmap = nullptr;
 			lightmap_cull_index = 0;
 			lod_bias = 1.0;
+			ignore_occlusion_culling = false;
 
 			scenario = nullptr;
 
@@ -921,24 +935,26 @@ public:
 		Frustum frustum;
 	} cull;
 
-	struct FrustumCullData {
+	struct CullData {
 		Cull *cull;
 		Scenario *scenario;
 		RID shadow_atlas;
 		Transform cam_transform;
 		uint32_t visible_layers;
 		Instance *render_reflection_probe;
+		const RendererSceneOcclusionCull::HZBuffer *occlusion_buffer;
+		const CameraMatrix *camera_matrix;
 	};
 
-	void _frustum_cull_threaded(uint32_t p_thread, FrustumCullData *cull_data);
-	void _frustum_cull(FrustumCullData &cull_data, FrustumCullResult &cull_result, uint64_t p_from, uint64_t p_to);
+	void _frustum_cull_threaded(uint32_t p_thread, CullData *cull_data);
+	void _frustum_cull(CullData &cull_data, FrustumCullResult &cull_result, uint64_t p_from, uint64_t p_to);
 
 	bool _render_reflection_probe_step(Instance *p_instance, int p_step);
-	void _render_scene(const Transform p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID p_environment, RID p_force_camera_effects, uint32_t p_visible_layers, RID p_scenario, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, bool p_using_shadows = true);
+	void _render_scene(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID p_environment, RID p_force_camera_effects, uint32_t p_visible_layers, RID p_scenario, RID p_viewport, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, bool p_using_shadows = true);
 	void render_empty_scene(RID p_render_buffers, RID p_scenario, RID p_shadow_atlas);
 
-	void render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas);
-	void render_camera(RID p_render_buffers, Ref<XRInterface> &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas);
+	void render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas);
+	void render_camera(RID p_render_buffers, Ref<XRInterface> &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas);
 	void update_dirty_instances();
 
 	void render_particle_colliders();
diff --git a/servers/rendering/renderer_scene_occlusion_cull.cpp b/servers/rendering/renderer_scene_occlusion_cull.cpp
new file mode 100644
index 0000000000..c491ccbe7a
--- /dev/null
+++ b/servers/rendering/renderer_scene_occlusion_cull.cpp
@@ -0,0 +1,192 @@
+/*************************************************************************/
+/*  renderer_scene_occlusion_cull.cpp                                    */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#include "renderer_scene_occlusion_cull.h"
+
+RendererSceneOcclusionCull *RendererSceneOcclusionCull::singleton = nullptr;
+
+const Vector3 RendererSceneOcclusionCull::HZBuffer::corners[8] = { // Every 0/1 combination per axis; is_occluded() uses them as weights to pick one bound or the other on each axis.
+	Vector3(0, 0, 0),
+	Vector3(0, 0, 1),
+	Vector3(0, 1, 0),
+	Vector3(0, 1, 1),
+	Vector3(1, 0, 0),
+	Vector3(1, 0, 1),
+	Vector3(1, 1, 0),
+	Vector3(1, 1, 1)
+};
+
+bool RendererSceneOcclusionCull::HZBuffer::is_empty() const {
+	return sizes.is_empty(); // No recorded mip sizes: resize() has not allocated anything, or clear() released it.
+}
+
+void RendererSceneOcclusionCull::HZBuffer::clear() {
+	if (sizes.is_empty()) {
+		return; // Already cleared
+	}
+	// Release the depth mips together with the debug resources derived from them.
+	data.clear();
+	sizes.clear();
+	mips.clear();
+	debug_data.clear();
+	debug_image.unref(); // Calling unref() on an already-null Ref leaves it null.
+	if (debug_texture.is_valid()) {
+		RS::get_singleton()->free(debug_texture);
+		debug_texture = RID(); // Forget the freed handle so a later resize() does not free it a second time.
+	}
+}
+
+// Allocates the depth mip chain for a p_size base level and resets every texel to FLT_MAX.
+void RendererSceneOcclusionCull::HZBuffer::resize(const Size2i &p_size) {
+	// A zero or negative dimension cannot hold depth texels: treat it like the empty size instead of building a zero-area mip 0.
+	if (p_size.x <= 0 || p_size.y <= 0) {
+		clear();
+		return;
+	}
+
+	if (!sizes.is_empty() && p_size == sizes[0]) {
+		return; // Size didn't change
+	}
+
+	int mip_count = 0;
+	int data_size = 0;
+	int w = p_size.x;
+	int h = p_size.y;
+	// First pass: count the mips and the total number of floats they need.
+	while (true) {
+		data_size += h * w;
+		mip_count++;
+		w = MAX(1, w >> 1);
+		h = MAX(1, h >> 1);
+		if (w == 1 && h == 1) {
+			// The final 1x1 mip is accounted for here because the loop stops before visiting it.
+			data_size += 1;
+			mip_count++;
+			break;
+		}
+	}
+
+	data.resize(data_size);
+	mips.resize(mip_count);
+	sizes.resize(mip_count);
+
+	// Second pass: record each mip's size and where it starts inside the shared allocation.
+	w = p_size.x;
+	h = p_size.y;
+	float *ptr = data.ptr();
+	for (int i = 0; i < mip_count; i++) {
+		sizes[i] = Size2i(w, h);
+		mips[i] = ptr;
+		ptr = &ptr[w * h];
+		w = MAX(1, w >> 1);
+		h = MAX(1, h >> 1);
+	}
+
+	for (int i = 0; i < data_size; i++) {
+		data[i] = FLT_MAX;
+	}
+
+	// The debug byte buffer matches mip 0; an existing debug texture is freed because the size changed.
+	debug_data.resize(sizes[0].x * sizes[0].y);
+	if (debug_texture.is_valid()) {
+		RS::get_singleton()->free(debug_texture);
+		debug_texture = RID();
+	}
+}
+
+// Rebuilds every mip above 0 by storing, per texel, the maximum depth of the previous-mip texels it covers.
+void RendererSceneOcclusionCull::HZBuffer::update_mips() {
+	if (sizes.is_empty()) {
+		return;
+	}
+
+// Folds the source texel at (src_x + xx, src_y + yy), clamped to the source bounds, into `farthest`.
+#define FOLD_TEXEL(xx, yy) farthest = MAX(farthest, src[MIN(src_h - 1, src_y + (yy)) * src_w + MIN(src_w - 1, src_x + (xx))])
+
+	for (uint32_t level = 1; level < mips.size(); level++) {
+		const float *src = mips[level - 1];
+		float *dst = mips[level];
+		const int src_w = sizes[level - 1].width;
+		const int src_h = sizes[level - 1].height;
+		const int dst_w = sizes[level].x;
+		const int dst_h = sizes[level].y;
+		// An odd source dimension leaves a trailing column/row that the 2x2 footprint would miss.
+		const bool extra_column = (src_w % 2) != 0;
+		const bool extra_row = (src_h % 2) != 0;
+		for (int y = 0; y < dst_h; y++) {
+			const int src_y = y * 2;
+			for (int x = 0; x < dst_w; x++) {
+				const int src_x = x * 2;
+				float farthest = src[src_y * src_w + src_x];
+				FOLD_TEXEL(0, 1);
+				FOLD_TEXEL(1, 0);
+				FOLD_TEXEL(1, 1);
+				if (extra_column) {
+					FOLD_TEXEL(2, 0);
+					FOLD_TEXEL(2, 1);
+				}
+				if (extra_row) {
+					FOLD_TEXEL(0, 2);
+					FOLD_TEXEL(1, 2);
+				}
+				if (extra_column && extra_row) {
+					FOLD_TEXEL(2, 2);
+				}
+				dst[y * dst_w + x] = farthest;
+			}
+		}
+	}
+#undef FOLD_TEXEL
+}
+
+// Converts mip 0 into an L8 image (depth divided by debug_tex_range, capped at 1, times 255) and uploads it to the debug texture.
+RID RendererSceneOcclusionCull::HZBuffer::get_debug_texture() {
+	if (sizes.is_empty() || sizes[0] == Size2i()) {
+		return RID();
+	}
+	if (debug_image.is_null()) {
+		debug_image.instance();
+	}
+
+	const float *depth = mips[0];
+	unsigned char *pixels = debug_data.ptrw();
+	const int pixel_count = debug_data.size();
+	for (int i = 0; i < pixel_count; i++) {
+		pixels[i] = MIN(depth[i] / debug_tex_range, 1.0) * 255;
+	}
+	debug_image->create(sizes[0].x, sizes[0].y, false, Image::FORMAT_L8, debug_data);
+	// Create the texture on first use; afterwards only refresh its contents.
+	if (debug_texture.is_valid()) {
+		RS::get_singleton()->texture_2d_update_immediate(debug_texture, debug_image);
+	} else {
+		debug_texture = RS::get_singleton()->texture_2d_create(debug_image);
+	}
+	return debug_texture;
+}
diff --git a/servers/rendering/renderer_scene_occlusion_cull.h b/servers/rendering/renderer_scene_occlusion_cull.h
new file mode 100644
index 0000000000..390bbaa64b
--- /dev/null
+++ b/servers/rendering/renderer_scene_occlusion_cull.h
@@ -0,0 +1,201 @@
+/*************************************************************************/
+/*  renderer_scene_occlusion_cull.h                                      */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifndef RENDERER_SCENE_OCCLUSION_CULL_H
+#define RENDERER_SCENE_OCCLUSION_CULL_H
+
+#include "core/math/camera_matrix.h"
+#include "core/templates/local_vector.h"
+#include "servers/rendering_server.h"
+
+class RendererSceneOcclusionCull { // Occlusion culling interface; the default virtual implementations below do nothing beyond (for some) a one-time warning.
+protected:
+	static RendererSceneOcclusionCull *singleton; // Set to `this` by the constructor and reset to nullptr by the destructor.
+
+public:
+	class HZBuffer { // Depth mip chain queried by is_occluded(); update_mips() stores in each coarser texel the maximum depth of the texels it covers.
+	protected:
+		static const Vector3 corners[8]; // 0/1 selectors used to enumerate the corners of a bounding box.
+
+		LocalVector<float> data; // Depth values of all mips, stored back to back.
+		LocalVector<Size2i> sizes; // Dimensions of each mip; sizes[0] is the full-resolution level.
+		LocalVector<float *> mips; // Start of each mip inside `data`.
+
+		RID debug_texture;
+		Ref<Image> debug_image;
+		PackedByteArray debug_data; // One byte per mip 0 texel, filled by get_debug_texture().
+		float debug_tex_range = 0.0f; // Depth at or beyond which get_debug_texture() writes 255. NOTE(review): not assigned anywhere in this class; presumably set by subclasses.
+
+	public:
+		bool is_empty() const;
+		virtual void clear();
+		virtual void resize(const Size2i &p_size);
+
+		void update_mips();
+		// Tests the box p_bounds = {min.x, min.y, min.z, max.x, max.y, max.z}. Returns true only when no sampled texel of some mip is farther than the box; false otherwise, including when the test is abandoned.
+		_FORCE_INLINE_ bool is_occluded(const float p_bounds[6], const Vector3 &p_cam_position, const Transform &p_cam_inv_transform, const CameraMatrix &p_cam_projection, float p_near) const {
+			if (is_empty()) {
+				return false;
+			}
+			// Point of the box nearest to the camera; its depth is used below as the box's minimum depth.
+			Vector3 closest_point = Vector3(CLAMP(p_cam_position.x, p_bounds[0], p_bounds[3]), CLAMP(p_cam_position.y, p_bounds[1], p_bounds[4]), CLAMP(p_cam_position.z, p_bounds[2], p_bounds[5]));
+
+			if (closest_point == p_cam_position) {
+				return false; // The camera position lies within the box.
+			}
+
+			Vector3 closest_point_view = p_cam_inv_transform.xform(closest_point);
+			if (closest_point_view.z > -p_near) {
+				return false; // The nearest point is not beyond the near plane (view space looks down -Z).
+			}
+			// Depth of the nearest point measured from the near plane: along Z when orthogonal, along the ray through the view origin otherwise.
+			float min_depth;
+			if (p_cam_projection.is_orthogonal()) {
+				min_depth = (-closest_point_view.z) - p_near;
+			} else {
+				float r = -p_near / closest_point_view.z;
+				Vector3 closest_point_proj = Vector3(closest_point_view.x * r, closest_point_view.y * r, -p_near);
+				min_depth = closest_point_proj.distance_to(closest_point_view);
+			}
+			// Bounding rectangle of the eight projected corners, remapped with v * 0.5 + 0.5 and then clamped to [0, 1].
+			Vector2 rect_min = Vector2(FLT_MAX, FLT_MAX);
+			Vector2 rect_max = Vector2(FLT_MIN, FLT_MIN); // NOTE(review): FLT_MIN is the smallest positive float, not the lowest value; confirm this is intended.
+
+			for (int j = 0; j < 8; j++) {
+				Vector3 c = RendererSceneOcclusionCull::HZBuffer::corners[j];
+				Vector3 nc = Vector3(1, 1, 1) - c;
+				Vector3 corner = Vector3(p_bounds[0] * c.x + p_bounds[3] * nc.x, p_bounds[1] * c.y + p_bounds[4] * nc.y, p_bounds[2] * c.z + p_bounds[5] * nc.z);
+				Vector3 view = p_cam_inv_transform.xform(corner);
+
+				Vector3 projected = p_cam_projection.xform(view);
+				Vector2 normalized = Vector2(projected.x * 0.5f + 0.5f, projected.y * 0.5f + 0.5f);
+				rect_min = rect_min.min(normalized);
+				rect_max = rect_max.max(normalized);
+			}
+
+			rect_max = rect_max.min(Vector2(1, 1));
+			rect_min = rect_min.max(Vector2(0, 0));
+
+			int mip_count = mips.size();
+			// Starting mip: ceil(log2(side)) of the rectangle's larger side in mip 0 texels, clamped to the available mips.
+			Vector2 screen_diagonal = (rect_max - rect_min) * sizes[0];
+			float size = MAX(screen_diagonal.x, screen_diagonal.y);
+			float l = Math::ceil(Math::log2(size));
+			int lod = CLAMP(l, 0, mip_count - 1);
+
+			const int max_samples = 512;
+			int sample_count = 0;
+			bool visible = true;
+
+			for (; lod >= 0; lod--) {
+				int w = sizes[lod].x;
+				int h = sizes[lod].y;
+
+				int minx = CLAMP(rect_min.x * w - 1, 0, w - 1);
+				int maxx = CLAMP(rect_max.x * w + 1, 0, w - 1);
+
+				int miny = CLAMP(rect_min.y * h - 1, 0, h - 1);
+				int maxy = CLAMP(rect_max.y * h + 1, 0, h - 1);
+
+				sample_count += (maxx - minx + 1) * (maxy - miny + 1);
+
+				if (sample_count > max_samples) {
+					return false; // Sample budget (accumulated over all visited mips) exceeded: give up and report "not occluded".
+				}
+
+				visible = false;
+				for (int y = miny; y <= maxy; y++) {
+					for (int x = minx; x <= maxx; x++) {
+						float depth = mips[lod][y * w + x];
+						if (depth > min_depth) {
+							visible = true;
+							break;
+						}
+					}
+					if (visible) {
+						break;
+					}
+				}
+
+				if (!visible) {
+					return true; // No sampled texel at this mip is farther than min_depth.
+				}
+			}
+
+			return !visible;
+		}
+
+		RID get_debug_texture();
+
+		virtual ~HZBuffer(){};
+	};
+
+	static RendererSceneOcclusionCull *get_singleton() { return singleton; }
+
+	void _print_warining() { // Helper used by the default implementations below to report that the feature is unavailable.
+		WARN_PRINT_ONCE("Occlusion culling is disabled at build time.");
+	}
+
+	virtual bool is_occluder(RID p_rid) { return false; }
+	virtual RID occluder_allocate() { return RID(); }
+	virtual void occluder_initialize(RID p_occluder) {}
+	virtual void free_occluder(RID p_occluder) { _print_warining(); }
+	virtual void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) { _print_warining(); }
+
+	virtual void add_scenario(RID p_scenario) {}
+	virtual void remove_scenario(RID p_scenario) {}
+	virtual void scenario_set_instance(RID p_scenario, RID p_instance, RID p_occluder, const Transform &p_xform, bool p_enabled) { _print_warining(); }
+	virtual void scenario_remove_instance(RID p_scenario, RID p_instance) { _print_warining(); }
+
+	virtual void add_buffer(RID p_buffer) { _print_warining(); }
+	virtual void remove_buffer(RID p_buffer) { _print_warining(); }
+	virtual HZBuffer *buffer_get_ptr(RID p_buffer) {
+		return nullptr;
+	}
+	virtual void buffer_set_scenario(RID p_buffer, RID p_scenario) { _print_warining(); }
+	virtual void buffer_set_size(RID p_buffer, const Vector2i &p_size) { _print_warining(); }
+	virtual void buffer_update(RID p_buffer, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_pool) {}
+	virtual RID buffer_get_debug_texture(RID p_buffer) {
+		_print_warining();
+		return RID();
+	}
+
+	virtual void set_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality) {}
+
+	RendererSceneOcclusionCull() {
+		singleton = this; // The most recently constructed instance becomes the singleton.
+	};
+
+	virtual ~RendererSceneOcclusionCull() {
+		singleton = nullptr;
+	};
+};
+
+#endif //RENDERER_SCENE_OCCLUSION_CULL_H
diff --git a/servers/rendering/renderer_scene_render.h b/servers/rendering/renderer_scene_render.h
index 9ca9574f6f..3f28fac549 100644
--- a/servers/rendering/renderer_scene_render.h
+++ b/servers/rendering/renderer_scene_render.h
@@ -216,7 +216,7 @@ public:
 		uint32_t positional_light_count;
 	};
 
-	virtual void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr) = 0;
+	virtual void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_occluder_debug_tex, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr) = 0;
 
 	virtual void render_material(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, RID p_framebuffer, const Rect2i &p_region) = 0;
 	virtual void render_particle_collider_heightfield(RID p_collider, const Transform &p_transform, const PagedArray<GeometryInstance *> &p_instances) = 0;
diff --git a/servers/rendering/renderer_viewport.cpp b/servers/rendering/renderer_viewport.cpp
index a5d5033c18..f7be6c6c60 100644
--- a/servers/rendering/renderer_viewport.cpp
+++ b/servers/rendering/renderer_viewport.cpp
@@ -79,11 +79,26 @@ void RendererViewport::_draw_3d(Viewport *p_viewport, XRInterface::Eyes p_eye) {
 		xr_interface = XRServer::get_singleton()->get_primary_interface();
 	}
 
+	if (p_viewport->use_occlusion_culling) {
+		if (p_viewport->occlusion_buffer_dirty) {
+			float aspect = p_viewport->size.aspect();
+			int max_size = occlusion_rays_per_thread * RendererThreadPool::singleton->thread_work_pool.get_thread_count();
+
+			int viewport_size = p_viewport->size.width * p_viewport->size.height;
+			max_size = CLAMP(max_size, viewport_size / (32 * 32), viewport_size / (2 * 2)); // At least one depth pixel for every 32x32 region. At most one depth pixel for every 2x2 region.
+
+			float height = Math::sqrt(max_size / aspect);
+			Size2i new_size = Size2i(height * aspect, height);
+			RendererSceneOcclusionCull::get_singleton()->buffer_set_size(p_viewport->self, new_size);
+			p_viewport->occlusion_buffer_dirty = false;
+		}
+	}
+
 	float screen_lod_threshold = p_viewport->lod_threshold / float(p_viewport->size.width);
 	if (p_viewport->use_xr && xr_interface.is_valid()) {
-		RSG::scene->render_camera(p_viewport->render_buffers, xr_interface, p_eye, p_viewport->camera, p_viewport->scenario, p_viewport->size, screen_lod_threshold, p_viewport->shadow_atlas);
+		RSG::scene->render_camera(p_viewport->render_buffers, xr_interface, p_eye, p_viewport->camera, p_viewport->scenario, p_viewport->self, p_viewport->size, screen_lod_threshold, p_viewport->shadow_atlas);
 	} else {
-		RSG::scene->render_camera(p_viewport->render_buffers, p_viewport->camera, p_viewport->scenario, p_viewport->size, screen_lod_threshold, p_viewport->shadow_atlas);
+		RSG::scene->render_camera(p_viewport->render_buffers, p_viewport->camera, p_viewport->scenario, p_viewport->self, p_viewport->size, screen_lod_threshold, p_viewport->shadow_atlas);
 	}
 	RENDER_TIMESTAMP("<End Rendering 3D Scene");
 }
@@ -647,6 +662,8 @@ void RendererViewport::viewport_set_size(RID p_viewport, int p_width, int p_heig
 			RSG::scene->render_buffers_configure(viewport->render_buffers, viewport->render_target, viewport->size.width, viewport->size.height, viewport->msaa, viewport->screen_space_aa, viewport->use_debanding);
 		}
 	}
+
+	viewport->occlusion_buffer_dirty = true;
 }
 
 void RendererViewport::viewport_set_active(RID p_viewport, bool p_active) {
@@ -655,6 +672,7 @@ void RendererViewport::viewport_set_active(RID p_viewport, bool p_active) {
 
 	if (p_active) {
 		ERR_FAIL_COND(active_viewports.find(viewport) != -1); //already active
+		viewport->occlusion_buffer_dirty = true;
 		active_viewports.push_back(viewport);
 	} else {
 		active_viewports.erase(viewport);
@@ -739,6 +757,16 @@ RID RendererViewport::viewport_get_texture(RID p_viewport) const {
 	return RSG::storage->render_target_get_texture(viewport->render_target);
 }
 
+// Returns the occlusion buffer's debug texture, or a null RID unless the viewport both culls and uses the occluders debug draw mode.
+RID RendererViewport::viewport_get_occluder_debug_texture(RID p_viewport) const {
+	const Viewport *vp = viewport_owner.getornull(p_viewport);
+	ERR_FAIL_COND_V(!vp, RID());
+	if (!vp->use_occlusion_culling || vp->debug_draw != RenderingServer::VIEWPORT_DEBUG_DRAW_OCCLUDERS) {
+		return RID();
+	}
+	return RendererSceneOcclusionCull::get_singleton()->buffer_get_debug_texture(p_viewport);
+}
+
 void RendererViewport::viewport_set_hide_scenario(RID p_viewport, bool p_hide) {
 	Viewport *viewport = viewport_owner.getornull(p_viewport);
 	ERR_FAIL_COND(!viewport);
@@ -772,6 +800,9 @@ void RendererViewport::viewport_set_scenario(RID p_viewport, RID p_scenario) {
 	ERR_FAIL_COND(!viewport);
 
 	viewport->scenario = p_scenario;
+	if (viewport->use_occlusion_culling) {
+		RendererSceneOcclusionCull::get_singleton()->buffer_set_scenario(p_viewport, p_scenario);
+	}
 }
 
 void RendererViewport::viewport_attach_canvas(RID p_viewport, RID p_canvas) {
@@ -888,6 +919,41 @@ void RendererViewport::viewport_set_use_debanding(RID p_viewport, bool p_use_deb
 	}
 }
 
+// Toggles occlusion culling for a viewport, registering or unregistering its occlusion buffer (keyed by the viewport RID).
+void RendererViewport::viewport_set_use_occlusion_culling(RID p_viewport, bool p_use_occlusion_culling) {
+	Viewport *vp = viewport_owner.getornull(p_viewport);
+	ERR_FAIL_COND(!vp);
+	if (vp->use_occlusion_culling == p_use_occlusion_culling) {
+		return; // Nothing to change.
+	}
+
+	RendererSceneOcclusionCull *culler = RendererSceneOcclusionCull::get_singleton();
+	vp->use_occlusion_culling = p_use_occlusion_culling;
+	if (p_use_occlusion_culling) {
+		culler->add_buffer(p_viewport);
+		culler->buffer_set_scenario(p_viewport, vp->scenario);
+	} else {
+		culler->remove_buffer(p_viewport);
+	}
+	vp->occlusion_buffer_dirty = true; // Makes _draw_3d() recompute the buffer size on the next draw.
+}
+
+// Stores the new ray budget and flags every active viewport so its occlusion buffer size is recomputed.
+void RendererViewport::viewport_set_occlusion_rays_per_thread(int p_rays_per_thread) {
+	if (occlusion_rays_per_thread != p_rays_per_thread) {
+		occlusion_rays_per_thread = p_rays_per_thread;
+		// Inactive viewports are flagged when they become active, so only the active list is walked.
+		const int viewport_count = active_viewports.size();
+		for (int idx = 0; idx < viewport_count; idx++) {
+			active_viewports[idx]->occlusion_buffer_dirty = true;
+		}
+	}
+}
+
+void RendererViewport::viewport_set_occlusion_culling_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality) {
+	RendererSceneOcclusionCull::get_singleton()->set_build_quality(p_quality); // Not per viewport: forwarded as-is to the occlusion culler singleton.
+}
+
 void RendererViewport::viewport_set_lod_threshold(RID p_viewport, float p_pixels) {
 	Viewport *viewport = viewport_owner.getornull(p_viewport);
 	ERR_FAIL_COND(!viewport);
@@ -985,6 +1051,10 @@ bool RendererViewport::free(RID p_rid) {
 		viewport_set_scenario(p_rid, RID());
 		active_viewports.erase(viewport);
 
+		if (viewport->use_occlusion_culling) {
+			RendererSceneOcclusionCull::get_singleton()->remove_buffer(p_rid);
+		}
+
 		viewport_owner.free(p_rid);
 		memdelete(viewport);
 
@@ -1026,4 +1096,5 @@ void RendererViewport::call_set_use_vsync(bool p_enable) {
 }
 
 RendererViewport::RendererViewport() {
+	occlusion_rays_per_thread = GLOBAL_GET("rendering/occlusion_culling/occlusion_rays_per_thread");
 }
diff --git a/servers/rendering/renderer_viewport.h b/servers/rendering/renderer_viewport.h
index f5ed543e8d..5c372e8c9a 100644
--- a/servers/rendering/renderer_viewport.h
+++ b/servers/rendering/renderer_viewport.h
@@ -31,9 +31,9 @@
 #ifndef VISUALSERVERVIEWPORT_H
 #define VISUALSERVERVIEWPORT_H
 
+#include "core/templates/local_vector.h"
 #include "core/templates/rid_owner.h"
 #include "core/templates/self_list.h"
-#include "renderer_compositor.h"
 #include "servers/rendering_server.h"
 #include "servers/xr/xr_interface.h"
 
@@ -61,6 +61,9 @@ public:
 		RS::ViewportScreenSpaceAA screen_space_aa;
 		bool use_debanding;
 
+		bool use_occlusion_culling;
+		bool occlusion_buffer_dirty;
+
 		DisplayServer::WindowID viewport_to_screen;
 		Rect2 viewport_to_screen_rect;
 		bool viewport_render_direct_to_screen;
@@ -143,6 +146,8 @@ public:
 			msaa = RS::VIEWPORT_MSAA_DISABLED;
 			screen_space_aa = RS::VIEWPORT_SCREEN_SPACE_AA_DISABLED;
 			use_debanding = false;
+			use_occlusion_culling = false;
+			occlusion_buffer_dirty = true;
 
 			snap_2d_transforms_to_pixel = false;
 			snap_2d_vertices_to_pixel = false;
@@ -185,6 +190,10 @@ private:
 	void _draw_3d(Viewport *p_viewport, XRInterface::Eyes p_eye);
 	void _draw_viewport(Viewport *p_viewport, XRInterface::Eyes p_eye = XRInterface::EYE_MONO);
 
+	int occlusion_rays_per_thread = 512;
+
+	void _resize_occlusion_culling_buffer(const Size2i &p_size);
+
 public:
 	RID viewport_allocate();
 	void viewport_initialize(RID p_rid);
@@ -204,6 +213,7 @@ public:
 	void viewport_set_clear_mode(RID p_viewport, RS::ViewportClearMode p_clear_mode);
 
 	RID viewport_get_texture(RID p_viewport) const;
+	RID viewport_get_occluder_debug_texture(RID p_viewport) const;
 
 	void viewport_set_hide_scenario(RID p_viewport, bool p_hide);
 	void viewport_set_hide_canvas(RID p_viewport, bool p_hide);
@@ -225,7 +235,9 @@ public:
 	void viewport_set_msaa(RID p_viewport, RS::ViewportMSAA p_msaa);
 	void viewport_set_screen_space_aa(RID p_viewport, RS::ViewportScreenSpaceAA p_mode);
 	void viewport_set_use_debanding(RID p_viewport, bool p_use_debanding);
-
+	void viewport_set_use_occlusion_culling(RID p_viewport, bool p_use_occlusion_culling);
+	void viewport_set_occlusion_rays_per_thread(int p_rays_per_thread);
+	void viewport_set_occlusion_culling_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality);
 	void viewport_set_lod_threshold(RID p_viewport, float p_pixels);
 
 	virtual int viewport_get_render_info(RID p_viewport, RS::ViewportRenderInfo p_info);
diff --git a/servers/rendering/rendering_server_default.h b/servers/rendering/rendering_server_default.h
index 683a22fd9a..324c002d6f 100644
--- a/servers/rendering/rendering_server_default.h
+++ b/servers/rendering/rendering_server_default.h
@@ -540,6 +540,10 @@ public:
 	FUNC2(camera_set_camera_effects, RID, RID)
 	FUNC2(camera_set_use_vertical_aspect, RID, bool)
 
+	/* OCCLUDER */
+	FUNCRIDSPLIT(occluder)
+	FUNC3(occluder_set_mesh, RID, const PackedVector3Array &, const PackedInt32Array &);
+
 #undef server_name
 #undef ServerName
 //from now on, calls forwarded to this singleton
@@ -590,6 +594,9 @@ public:
 	FUNC2(viewport_set_msaa, RID, ViewportMSAA)
 	FUNC2(viewport_set_screen_space_aa, RID, ViewportScreenSpaceAA)
 	FUNC2(viewport_set_use_debanding, RID, bool)
+	FUNC2(viewport_set_use_occlusion_culling, RID, bool)
+	FUNC1(viewport_set_occlusion_rays_per_thread, int)
+	FUNC1(viewport_set_occlusion_culling_build_quality, ViewportOcclusionCullingBuildQuality)
 	FUNC2(viewport_set_lod_threshold, RID, float)
 
 	FUNC2R(int, viewport_get_render_info, RID, ViewportRenderInfo)
diff --git a/servers/rendering_server.cpp b/servers/rendering_server.cpp
index f8644b5ecb..1ecb471360 100644
--- a/servers/rendering_server.cpp
+++ b/servers/rendering_server.cpp
@@ -349,7 +349,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 						for (int i = 0; i < p_vertex_array_len; i++) {
 							float vector[2] = { src[i].x, src[i].y };
 
-							copymem(&vw[p_offsets[ai] + i * p_vertex_stride], vector, sizeof(float) * 2);
+							memcpy(&vw[p_offsets[ai] + i * p_vertex_stride], vector, sizeof(float) * 2);
 
 							if (i == 0) {
 								aabb = Rect2(src[i], SMALL_VEC2); //must have a bit of size
@@ -374,7 +374,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 						for (int i = 0; i < p_vertex_array_len; i++) {
 							float vector[3] = { src[i].x, src[i].y, src[i].z };
 
-							copymem(&vw[p_offsets[ai] + i * p_vertex_stride], vector, sizeof(float) * 3);
+							memcpy(&vw[p_offsets[ai] + i * p_vertex_stride], vector, sizeof(float) * 3);
 
 							if (i == 0) {
 								aabb = AABB(src[i], SMALL_VEC3);
@@ -403,7 +403,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 					value |= CLAMP(int(n.y * 1023.0), 0, 1023) << 10;
 					value |= CLAMP(int(n.z * 1023.0), 0, 1023) << 20;
 
-					copymem(&vw[p_offsets[ai] + i * p_vertex_stride], &value, 4);
+					memcpy(&vw[p_offsets[ai] + i * p_vertex_stride], &value, 4);
 				}
 
 			} break;
@@ -424,7 +424,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 					value |= CLAMP(int((src[i * 4 + 2] * 0.5 + 0.5) * 1023.0), 0, 1023) << 20;
 					value |= CLAMP(int((src[i * 4 + 3] * 0.5 + 0.5) * 3.0), 0, 3) << 30;
 
-					copymem(&vw[p_offsets[ai] + i * p_vertex_stride], &value, 4);
+					memcpy(&vw[p_offsets[ai] + i * p_vertex_stride], &value, 4);
 				}
 
 			} break;
@@ -442,7 +442,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 					color16[1] = Math::make_half_float(src[i].g);
 					color16[2] = Math::make_half_float(src[i].b);
 					color16[3] = Math::make_half_float(src[i].a);
-					copymem(&aw[p_offsets[ai] + i * p_attrib_stride], color16, 8);
+					memcpy(&aw[p_offsets[ai] + i * p_attrib_stride], color16, 8);
 				}
 			} break;
 			case RS::ARRAY_TEX_UV: {
@@ -457,7 +457,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 				for (int i = 0; i < p_vertex_array_len; i++) {
 					float uv[2] = { src[i].x, src[i].y };
 
-					copymem(&aw[p_offsets[ai] + i * p_attrib_stride], uv, 2 * 4);
+					memcpy(&aw[p_offsets[ai] + i * p_attrib_stride], uv, 2 * 4);
 				}
 
 			} break;
@@ -473,7 +473,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 
 				for (int i = 0; i < p_vertex_array_len; i++) {
 					uint16_t uv[2] = { Math::make_half_float(src[i].x), Math::make_half_float(src[i].y) };
-					copymem(&aw[p_offsets[ai] + i * p_attrib_stride], uv, 2 * 2);
+					memcpy(&aw[p_offsets[ai] + i * p_attrib_stride], uv, 2 * 2);
 				}
 			} break;
 			case RS::ARRAY_CUSTOM0:
@@ -495,7 +495,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 						const uint8_t *src = array.ptr();
 
 						for (int i = 0; i < p_vertex_array_len; i++) {
-							copymem(&aw[p_offsets[ai] + i * p_attrib_stride], &src[i * 4], 4);
+							memcpy(&aw[p_offsets[ai] + i * p_attrib_stride], &src[i * 4], 4);
 						}
 
 					} break;
@@ -510,7 +510,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 						const uint8_t *src = array.ptr();
 
 						for (int i = 0; i < p_vertex_array_len; i++) {
-							copymem(&aw[p_offsets[ai] + i * p_attrib_stride], &src[i * 8], 8);
+							memcpy(&aw[p_offsets[ai] + i * p_attrib_stride], &src[i * 8], 8);
 						}
 					} break;
 					case ARRAY_CUSTOM_R_FLOAT:
@@ -528,7 +528,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 						const float *src = array.ptr();
 
 						for (int i = 0; i < p_vertex_array_len; i++) {
-							copymem(&aw[p_offsets[ai] + i * p_attrib_stride], &src[i * s], 4 * s);
+							memcpy(&aw[p_offsets[ai] + i * p_attrib_stride], &src[i * s], 4 * s);
 						}
 					} break;
 					default: {
@@ -554,7 +554,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 							data[j] = CLAMP(src[i * bone_count + j] * 65535, 0, 65535);
 						}
 
-						copymem(&sw[p_offsets[ai] + i * p_skin_stride], data, 2 * bone_count);
+						memcpy(&sw[p_offsets[ai] + i * p_skin_stride], data, 2 * bone_count);
 					}
 				}
 
@@ -578,7 +578,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 						max_bone = MAX(data[j], max_bone);
 					}
 
-					copymem(&sw[p_offsets[ai] + i * p_skin_stride], data, 2 * bone_count);
+					memcpy(&sw[p_offsets[ai] + i * p_skin_stride], data, 2 * bone_count);
 				}
 
 			} break;
@@ -600,11 +600,11 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 					if (p_vertex_array_len < (1 << 16)) {
 						uint16_t v = src[i];
 
-						copymem(&iw[i * 2], &v, 2);
+						memcpy(&iw[i * 2], &v, 2);
 					} else {
 						uint32_t v = src[i];
 
-						copymem(&iw[i * 4], &v, 4);
+						memcpy(&iw[i * 4], &v, 4);
 					}
 				}
 			} break;
@@ -1172,7 +1172,7 @@ Array RenderingServer::_get_array_from_surface(uint32_t p_format, Vector<uint8_t
 
 						for (int j = 0; j < p_vertex_len; j++) {
 							const uint8_t *v = (const uint8_t *)&ar[j * attrib_elem_size + offsets[i]];
-							copymem(&w[j * s], v, s);
+							memcpy(&w[j * s], v, s);
 						}
 
 						ret[i] = arr;
@@ -1189,7 +1189,7 @@ Array RenderingServer::_get_array_from_surface(uint32_t p_format, Vector<uint8_t
 
 						for (int j = 0; j < p_vertex_len; j++) {
 							const float *v = (const float *)&ar[j * attrib_elem_size + offsets[i]];
-							copymem(&w[j * s], v, s * sizeof(float));
+							memcpy(&w[j * s], v, s * sizeof(float));
 						}
 						ret[i] = arr;
 
@@ -1594,7 +1594,7 @@ void RenderingServer::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("gi_probe_set_compress", "probe", "enable"), &RenderingServer::gi_probe_set_compress);
 	ClassDB::bind_method(D_METHOD("gi_probe_is_compressed", "probe"), &RenderingServer::gi_probe_is_compressed);
 #endif
-/*
+	/*
 	ClassDB::bind_method(D_METHOD("lightmap_create()"), &RenderingServer::lightmap_capture_create);
 	ClassDB::bind_method(D_METHOD("lightmap_capture_set_bounds", "capture", "bounds"), &RenderingServer::lightmap_capture_set_bounds);
 	ClassDB::bind_method(D_METHOD("lightmap_capture_get_bounds", "capture"), &RenderingServer::lightmap_capture_get_bounds);
@@ -1607,6 +1607,10 @@ void RenderingServer::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("lightmap_capture_set_energy", "capture", "energy"), &RenderingServer::lightmap_capture_set_energy);
 	ClassDB::bind_method(D_METHOD("lightmap_capture_get_energy", "capture"), &RenderingServer::lightmap_capture_get_energy);
 */
+
+	ClassDB::bind_method(D_METHOD("occluder_create"), &RenderingServer::occluder_create);
+	ClassDB::bind_method(D_METHOD("occluder_set_mesh", "occluder", "vertices", "indices"), &RenderingServer::occluder_set_mesh);
+
 #endif
 	ClassDB::bind_method(D_METHOD("particles_create"), &RenderingServer::particles_create);
 	ClassDB::bind_method(D_METHOD("particles_set_emitting", "particles", "emitting"), &RenderingServer::particles_set_emitting);
@@ -1667,6 +1671,9 @@ void RenderingServer::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("viewport_set_shadow_atlas_quadrant_subdivision", "viewport", "quadrant", "subdivision"), &RenderingServer::viewport_set_shadow_atlas_quadrant_subdivision);
 	ClassDB::bind_method(D_METHOD("viewport_set_msaa", "viewport", "msaa"), &RenderingServer::viewport_set_msaa);
 	ClassDB::bind_method(D_METHOD("viewport_set_use_debanding", "viewport", "enable"), &RenderingServer::viewport_set_use_debanding);
+	ClassDB::bind_method(D_METHOD("viewport_set_use_occlusion_culling", "viewport", "enable"), &RenderingServer::viewport_set_use_occlusion_culling);
+	ClassDB::bind_method(D_METHOD("viewport_set_occlusion_rays_per_thread", "rays_per_thread"), &RenderingServer::viewport_set_occlusion_rays_per_thread);
+	ClassDB::bind_method(D_METHOD("viewport_set_occlusion_culling_build_quality", "quality"), &RenderingServer::viewport_set_occlusion_culling_build_quality);
 
 	ClassDB::bind_method(D_METHOD("viewport_get_render_info", "viewport", "info"), &RenderingServer::viewport_get_render_info);
 	ClassDB::bind_method(D_METHOD("viewport_set_debug_draw", "viewport", "draw"), &RenderingServer::viewport_set_debug_draw);
@@ -1694,6 +1701,7 @@ void RenderingServer::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("scenario_create"), &RenderingServer::scenario_create);
 	ClassDB::bind_method(D_METHOD("scenario_set_debug", "scenario", "debug_mode"), &RenderingServer::scenario_set_debug);
 	ClassDB::bind_method(D_METHOD("scenario_set_environment", "scenario", "environment"), &RenderingServer::scenario_set_environment);
+	ClassDB::bind_method(D_METHOD("scenario_set_camera_effects", "scenario", "effects"), &RenderingServer::scenario_set_camera_effects);
 	ClassDB::bind_method(D_METHOD("scenario_set_fallback_environment", "scenario", "environment"), &RenderingServer::scenario_set_fallback_environment);
 
 #ifndef _3D_DISABLED
@@ -2024,6 +2032,7 @@ void RenderingServer::_bind_methods() {
 	BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_SDFGI);
 	BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_SDFGI_PROBES);
 	BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_GI_BUFFER);
+	BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_OCCLUDERS);
 
 	BIND_ENUM_CONSTANT(SKY_MODE_QUALITY);
 	BIND_ENUM_CONSTANT(SKY_MODE_REALTIME);
@@ -2093,6 +2102,10 @@ void RenderingServer::_bind_methods() {
 	BIND_ENUM_CONSTANT(SCENARIO_DEBUG_OVERDRAW);
 	BIND_ENUM_CONSTANT(SCENARIO_DEBUG_SHADELESS);
 
+	BIND_ENUM_CONSTANT(VIEWPORT_OCCLUSION_BUILD_QUALITY_LOW);
+	BIND_ENUM_CONSTANT(VIEWPORT_OCCLUSION_BUILD_QUALITY_MEDIUM);
+	BIND_ENUM_CONSTANT(VIEWPORT_OCCLUSION_BUILD_QUALITY_HIGH);
+
 	BIND_ENUM_CONSTANT(INSTANCE_NONE);
 	BIND_ENUM_CONSTANT(INSTANCE_MESH);
 	BIND_ENUM_CONSTANT(INSTANCE_MULTIMESH);
@@ -2104,12 +2117,14 @@ void RenderingServer::_bind_methods() {
 	BIND_ENUM_CONSTANT(INSTANCE_DECAL);
 	BIND_ENUM_CONSTANT(INSTANCE_GI_PROBE);
 	BIND_ENUM_CONSTANT(INSTANCE_LIGHTMAP);
+	BIND_ENUM_CONSTANT(INSTANCE_OCCLUDER);
 	BIND_ENUM_CONSTANT(INSTANCE_MAX);
 	BIND_ENUM_CONSTANT(INSTANCE_GEOMETRY_MASK);
 
 	BIND_ENUM_CONSTANT(INSTANCE_FLAG_USE_BAKED_LIGHT);
 	BIND_ENUM_CONSTANT(INSTANCE_FLAG_USE_DYNAMIC_GI);
 	BIND_ENUM_CONSTANT(INSTANCE_FLAG_DRAW_NEXT_FRAME_IF_VISIBLE);
+	BIND_ENUM_CONSTANT(INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING);
 	BIND_ENUM_CONSTANT(INSTANCE_FLAG_MAX);
 
 	BIND_ENUM_CONSTANT(SHADOW_CASTING_SETTING_OFF);
@@ -2340,6 +2355,10 @@ RenderingServer::RenderingServer() {
 	ProjectSettings::get_singleton()->set_custom_property_info("rendering/anti_aliasing/screen_space_roughness_limiter/amount", PropertyInfo(Variant::FLOAT, "rendering/anti_aliasing/screen_space_roughness_limiter/amount", PROPERTY_HINT_RANGE, "0.01,4.0,0.01"));
 	ProjectSettings::get_singleton()->set_custom_property_info("rendering/anti_aliasing/screen_space_roughness_limiter/limit", PropertyInfo(Variant::FLOAT, "rendering/anti_aliasing/screen_space_roughness_limiter/limit", PROPERTY_HINT_RANGE, "0.01,1.0,0.01"));
 
+	GLOBAL_DEF_RST("rendering/occlusion_culling/occlusion_rays_per_thread", 512);
+	GLOBAL_DEF_RST("rendering/occlusion_culling/bvh_build_quality", 2);
+	ProjectSettings::get_singleton()->set_custom_property_info("rendering/occlusion_culling/bvh_build_quality", PropertyInfo(Variant::INT, "rendering/occlusion_culling/bvh_build_quality", PROPERTY_HINT_ENUM, "Low,Medium,High"));
+
 	GLOBAL_DEF("rendering/environment/glow/upscale_mode", 1);
 	ProjectSettings::get_singleton()->set_custom_property_info("rendering/environment/glow/upscale_mode", PropertyInfo(Variant::INT, "rendering/environment/glow/upscale_mode", PROPERTY_HINT_ENUM, "Linear (Fast),Bicubic (Slow)"));
 	GLOBAL_DEF("rendering/environment/glow/upscale_mode.mobile", 0);
diff --git a/servers/rendering_server.h b/servers/rendering_server.h
index 694fae7fde..c74bb22aad 100644
--- a/servers/rendering_server.h
+++ b/servers/rendering_server.h
@@ -713,6 +713,11 @@ public:
 	virtual void camera_set_camera_effects(RID p_camera, RID p_camera_effects) = 0;
 	virtual void camera_set_use_vertical_aspect(RID p_camera, bool p_enable) = 0;
 
+	/* OCCLUDER API */
+
+	virtual RID occluder_create() = 0;
+	virtual void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) = 0;
+
 	/* VIEWPORT TARGET API */
 
 	enum CanvasItemTextureFilter {
@@ -826,6 +831,17 @@ public:
 
 	virtual void viewport_set_lod_threshold(RID p_viewport, float p_pixels) = 0;
 
+	virtual void viewport_set_use_occlusion_culling(RID p_viewport, bool p_enable) = 0;
+	virtual void viewport_set_occlusion_rays_per_thread(int p_rays_per_thread) = 0;
+
+	enum ViewportOcclusionCullingBuildQuality {
+		VIEWPORT_OCCLUSION_BUILD_QUALITY_LOW = 0,
+		VIEWPORT_OCCLUSION_BUILD_QUALITY_MEDIUM = 1,
+		VIEWPORT_OCCLUSION_BUILD_QUALITY_HIGH = 2,
+	};
+
+	virtual void viewport_set_occlusion_culling_build_quality(ViewportOcclusionCullingBuildQuality p_quality) = 0;
+
 	enum ViewportRenderInfo {
 		VIEWPORT_RENDER_INFO_OBJECTS_IN_FRAME,
 		VIEWPORT_RENDER_INFO_VERTICES_IN_FRAME,
@@ -862,6 +878,7 @@ public:
 		VIEWPORT_DEBUG_DRAW_CLUSTER_SPOT_LIGHTS,
 		VIEWPORT_DEBUG_DRAW_CLUSTER_DECALS,
 		VIEWPORT_DEBUG_DRAW_CLUSTER_REFLECTION_PROBES,
+		VIEWPORT_DEBUG_DRAW_OCCLUDERS,
 	};
 
 	virtual void viewport_set_debug_draw(RID p_viewport, ViewportDebugDraw p_draw) = 0;
@@ -1109,6 +1126,7 @@ public:
 		INSTANCE_DECAL,
 		INSTANCE_GI_PROBE,
 		INSTANCE_LIGHTMAP,
+		INSTANCE_OCCLUDER,
 		INSTANCE_MAX,
 
 		INSTANCE_GEOMETRY_MASK = (1 << INSTANCE_MESH) | (1 << INSTANCE_MULTIMESH) | (1 << INSTANCE_IMMEDIATE) | (1 << INSTANCE_PARTICLES)
@@ -1147,6 +1165,7 @@ public:
 		INSTANCE_FLAG_USE_BAKED_LIGHT,
 		INSTANCE_FLAG_USE_DYNAMIC_GI,
 		INSTANCE_FLAG_DRAW_NEXT_FRAME_IF_VISIBLE,
+		INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING,
 		INSTANCE_FLAG_MAX
 	};
 
@@ -1505,6 +1524,7 @@ VARIANT_ENUM_CAST(RenderingServer::ViewportMSAA);
 VARIANT_ENUM_CAST(RenderingServer::ViewportScreenSpaceAA);
 VARIANT_ENUM_CAST(RenderingServer::ViewportRenderInfo);
 VARIANT_ENUM_CAST(RenderingServer::ViewportDebugDraw);
+VARIANT_ENUM_CAST(RenderingServer::ViewportOcclusionCullingBuildQuality);
 VARIANT_ENUM_CAST(RenderingServer::SkyMode);
 VARIANT_ENUM_CAST(RenderingServer::EnvironmentBG);
 VARIANT_ENUM_CAST(RenderingServer::EnvironmentAmbientSource);
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 97f21f8539..605b298ac1 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -62,6 +62,26 @@ Files extracted from upstream source:
 Extracted from .zip provided. Extracted license and header only.
 
 
+## embree-aarch64
+
+- Upstream: https://github.com/lighttransport/embree-aarch64
+- Version: 3.12.1 (6ef362f99af80c9dfe8dd2bfc582d9067897edc6, 2020)
+- License: Apache 2.0
+
+Files extracted from upstream:
+
+- All cpp files listed in `modules/raycast/godot_update_embree.py`
+- All header files in the directories listed in `modules/raycast/godot_update_embree.py`
+
+The `modules/raycast/godot_update_embree.py` script can be used to pull the
+relevant files from the latest Embree-aarch64 release and apply some automatic changes.
+
+Some changes have been made in order to remove exceptions and fix minor build errors.
+They are marked with `// -- GODOT start --` and `// -- GODOT end --`
+comments. Apply the patches in the `patches/` folder when syncing on newer upstream
+commits.
+
+
 ## enet
 
 - Upstream: http://enet.bespin.org
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h b/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h
new file mode 100644
index 0000000000..01f1f80f6c
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h
@@ -0,0 +1,55 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <functional>
+#include "parallel_reduce.h"
+
+namespace embree
+{
+  
+  template<typename Index, class UnaryPredicate>
+    __forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred)
+  {
+    bool ret = false;
+    
+#if defined(TASKING_TBB)
+#if TBB_INTERFACE_VERSION >= 12002
+    tbb::task_group_context context;
+    tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred,&context](const tbb::blocked_range<size_t>& r) {
+        if (context.is_group_execution_cancelled()) return;
+        for (size_t i = r.begin(); i != r.end(); ++i) {
+          if (pred(i)) {
+            ret = true;
+            context.cancel_group_execution();
+          }
+        }
+      });
+#else
+    tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred](const tbb::blocked_range<size_t>& r) {
+        if (tbb::task::self().is_cancelled()) return;
+        for (size_t i = r.begin(); i != r.end(); ++i) {
+          if (pred(i)) {
+            ret = true;
+            tbb::task::self().cancel_group_execution();
+          }
+        }
+      });
+#endif
+#else
+    ret = parallel_reduce (first, last, false, [pred](const range<size_t>& r)->bool {
+        bool localret = false;
+        for (auto i=r.begin(); i<r.end(); ++i) {
+          localret |= pred(i);
+        }
+        return localret;
+      },
+      std::bit_or<bool>()
+      );
+#endif
+    
+    return ret;
+  }
+  
+} // end namespace
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp
new file mode 100644
index 0000000000..acddc0ff81
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp
@@ -0,0 +1,56 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_filter.h"
+#include "../sys/regression.h"
+#include <map>
+
+namespace embree
+{
+  struct parallel_filter_regression_test : public RegressionTest
+  {
+    parallel_filter_regression_test(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+    
+    bool run ()
+    {
+      bool passed = true;
+      auto pred = [&]( uint32_t v ) { return (v & 0x3) == 0; };
+      
+      for (size_t N=10; N<1000000; N=size_t(2.1*N))
+      {
+        size_t N0 = rand() % N;
+        
+	/* initialize array with random numbers */
+	std::vector<uint32_t> src(N);
+        std::map<uint32_t,int> m;
+	for (size_t i=0; i<N; i++) src[i] = rand();
+
+        /* count elements up */
+	for (size_t i=N0; i<N; i++)
+          if (pred(src[i]))
+            m[src[i]] = 0;
+        for (size_t i=N0; i<N; i++)
+          if (pred(src[i]))
+            m[src[i]]++;
+
+        /* filter array */
+        //size_t M = sequential_filter(src.data(),N0,N,pred);
+        size_t M = parallel_filter(src.data(),N0,N,size_t(1024),pred);
+        
+	/* check if filtered data is correct */
+	for (size_t i=N0; i<M; i++) {
+          passed &= pred(src[i]);
+          m[src[i]]--;
+        }
+	for (size_t i=N0; i<M; i++)
+          passed &= (m[src[i]] == 0);
+      }
+
+      return passed;
+    }
+  };
+
+  parallel_filter_regression_test parallel_filter_regression("parallel_filter_regression");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.h b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.h
new file mode 100644
index 0000000000..5823fc631f
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.h
@@ -0,0 +1,93 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+  template<typename Ty, typename Index, typename Predicate>
+    inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate)
+  {
+    Index j = first;
+    for (Index i=first; i<last; i++)
+      if (predicate(data[i]))
+        data[j++] = data[i];
+
+    return j;
+  }
+
+  template<typename Ty, typename Index, typename Predicate>
+    inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate)
+  {
+    /* sequential fallback */
+    if (end-begin <= minStepSize)
+      return sequential_filter(data,begin,end,predicate);
+
+    /* calculate number of tasks to use */
+    enum { MAX_TASKS = 64 };
+    const Index numThreads = TaskScheduler::threadCount();
+    const Index numBlocks  = (end-begin+minStepSize-1)/minStepSize;
+    const Index taskCount  = min(numThreads,numBlocks,(Index)MAX_TASKS);
+
+    /* filter blocks */
+    Index nused[MAX_TASKS];
+    Index nfree[MAX_TASKS];
+    parallel_for(taskCount, [&](const Index taskIndex)
+    {
+      const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount;
+      const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount;
+      const Index i2 = sequential_filter(data,i0,i1,predicate);
+      nused[taskIndex] = i2-i0;
+      nfree[taskIndex] = i1-i2;
+    });
+
+    /* calculate offsets */
+    Index sused=0;
+    Index sfree=0;
+    Index pfree[MAX_TASKS];
+    for (Index i=0; i<taskCount; i++) 
+    {
+      sused+=nused[i];
+      Index cfree = nfree[i]; pfree[i] = sfree; sfree+=cfree;
+    }
+
+    /* return if we did not filter out any element */
+    assert(sfree <= end-begin);
+    assert(sused <= end-begin);
+    if (sused == end-begin)
+      return end;
+
+    /* otherwise we have to copy misplaced elements around */
+    parallel_for(taskCount, [&](const Index taskIndex)
+    {
+      /* destination to write elements to */
+      Index dst = begin+(taskIndex+0)*(end-begin)/taskCount+nused[taskIndex];
+      Index dst_end = min(dst+nfree[taskIndex],begin+sused);
+      if (dst_end <= dst) return;
+
+      /* range of misplaced elements to copy to destination */
+      Index r0 = pfree[taskIndex];
+      Index r1 = r0+dst_end-dst;
+
+      /* find range in misplaced elements in back to front order */
+      Index k0=0;
+      for (Index i=taskCount-1; i>0; i--)
+      {
+        if (k0 > r1) break;
+        Index k1 = k0+nused[i];
+        Index src = begin+(i+0)*(end-begin)/taskCount+nused[i];
+        for (Index i=max(r0,k0); i<min(r1,k1); i++) {
+          Index isrc = src-i+k0-1;
+          assert(dst >= begin && dst < end);
+          assert(isrc >= begin && isrc < end);
+          data[dst++] = data[isrc];
+        }
+        k0 = k1;
+      }
+    });
+
+    return begin+sused;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp
new file mode 100644
index 0000000000..ef070ebc4d
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp
@@ -0,0 +1,48 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_for.h"
+#include "../sys/regression.h"
+
+namespace embree
+{
+  struct parallel_for_regression_test : public RegressionTest
+  {
+    parallel_for_regression_test(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+    
+    bool run () /* returns true only if every parallel sum matched its sequential reference */
+    {
+      bool passed = true;
+
+      const size_t M = 10; /* repetitions of the parallel run per problem size N */
+      for (size_t N=10; N<10000000; N=size_t(2.1*N))
+      {
+        /* sequentially calculate sum of squares */
+        size_t sum0 = 0;
+        for (size_t i=0; i<N; i++) {
+          sum0 += i*i;
+        }
+
+        /* parallel calculation of sum of squares */
+        for (size_t m=0; m<M; m++)
+        {
+          std::atomic<size_t> sum1(0);
+          parallel_for( size_t(0), size_t(N), size_t(1024), [&](const range<size_t>& r) 
+          {
+            size_t s = 0;
+            for (size_t i=r.begin(); i<r.end(); i++) 
+              s += i*i;
+            sum1 += s;
+          });
+          passed &= (sum0 == sum1); /* GODOT: was 'passed = ...'; accumulate so a later success cannot mask an earlier failure */
+        }
+      }
+      
+      return passed;
+    }
+  };
+
+  parallel_for_regression_test parallel_for_regression("parallel_for_regression_test");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for.h
new file mode 100644
index 0000000000..51d296fb16
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for.h
@@ -0,0 +1,229 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../tasking/taskscheduler.h"
+#include "../sys/array.h"
+#include "../math/math.h"
+#include "../math/range.h"
+
+#if defined(TASKING_GCD) && defined(BUILD_IOS)
+#include <dispatch/dispatch.h>
+#include <algorithm>
+#include <type_traits>
+#endif
+
+namespace embree
+{
+  /* parallel_for without range */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for( const Index N, const Func& func)
+  {
+#if defined(TASKING_INTERNAL)
+    if (N) {
+      TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range<Index>& r) {
+          assert(r.size() == 1);
+          func(r.begin());
+        });
+      if (!TaskScheduler::wait())
+        // -- GODOT start --
+        // throw std::runtime_error("task cancelled");
+        abort(); 
+        // -- GODOT end --
+    }
+#elif defined(TASKING_GCD) && defined(BUILD_IOS)
+      
+    const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? TaskScheduler::threadCount() : 1;
+    const size_t length = N;
+    const size_t blockSize = (length + baselineNumBlocks-1) / baselineNumBlocks;
+    const size_t numBlocks = (length + blockSize-1) / blockSize;
+      
+    dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) {
+          
+        const size_t start = (currentBlock * blockSize);
+        const size_t blockLength = std::min(length - start, blockSize);
+        const size_t end = start + blockLength;
+          
+        for(size_t i=start; i < end; i++)
+        {
+            func(i);
+        }
+    });
+      
+#elif defined(TASKING_TBB)
+  #if TBB_INTERFACE_VERSION >= 12002
+    tbb::task_group_context context;
+    tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+        func(i);
+      },context);
+    if (context.is_group_execution_cancelled())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort(); 
+      // -- GODOT end --
+  #else
+    tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+        func(i);
+      });
+    if (tbb::task::self().is_cancelled())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort(); 
+      // -- GODOT end --
+  #endif
+
+#elif defined(TASKING_PPL)
+    concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) { 
+        func(i);
+      });
+#else
+#  error "no tasking system enabled"
+#endif
+  }
+  
+  /* parallel for with range and granulatity */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func)
+  {
+    assert(first <= last);
+#if defined(TASKING_INTERNAL)
+    TaskScheduler::spawn(first,last,minStepSize,func);
+    if (!TaskScheduler::wait())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort(); 
+      // -- GODOT end --
+
+#elif defined(TASKING_GCD) && defined(BUILD_IOS)
+      
+    const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? 4*TaskScheduler::threadCount() : 1;
+    const size_t length = last - first;
+    const size_t blockSizeByThreads = (length + baselineNumBlocks-1) / baselineNumBlocks;
+    size_t blockSize = std::max<size_t>(minStepSize,blockSizeByThreads);
+    blockSize += blockSize % 4;
+      
+    const size_t numBlocks = (length + blockSize-1) / blockSize;
+      
+    dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) {
+          
+        const size_t start = first + (currentBlock * blockSize);
+        const size_t end = std::min<size_t>(last, start + blockSize);
+          
+        func( embree::range<Index>(start,end) );
+    });
+      
+
+#elif defined(TASKING_TBB)
+  #if TBB_INTERFACE_VERSION >= 12002
+    tbb::task_group_context context;
+    tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
+        func(range<Index>(r.begin(),r.end()));
+      },context);
+    if (context.is_group_execution_cancelled())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort(); 
+      // -- GODOT end --
+  #else
+    tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
+        func(range<Index>(r.begin(),r.end()));
+      });
+    if (tbb::task::self().is_cancelled())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort(); 
+      // -- GODOT end --
+  #endif
+
+#elif defined(TASKING_PPL)
+    concurrency::parallel_for(first, last, Index(1) /*minStepSize*/, [&](Index i) { 
+        func(range<Index>(i,i+1)); 
+      });
+
+#else
+#  error "no tasking system enabled"
+#endif
+  }
+  
+  /* parallel for with range */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for( const Index first, const Index last, const Func& func)
+  {
+    assert(first <= last);
+    parallel_for(first,last,(Index)1,func);
+  }
+
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001)
+
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_static( const Index N, const Func& func)
+  {
+    #if TBB_INTERFACE_VERSION >= 12002
+      tbb::task_group_context context;
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },tbb::simple_partitioner(),context);
+      if (context.is_group_execution_cancelled())
+        // -- GODOT start --
+        // throw std::runtime_error("task cancelled");
+        abort(); 
+        // -- GODOT end --
+    #else
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },tbb::simple_partitioner());
+      if (tbb::task::self().is_cancelled())
+        // -- GODOT start --
+        // throw std::runtime_error("task cancelled");
+        abort(); 
+        // -- GODOT end --
+    #endif
+  }
+
+  typedef tbb::affinity_partitioner affinity_partitioner;
+
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap)
+  {
+    #if TBB_INTERFACE_VERSION >= 12002
+      tbb::task_group_context context;
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },ap,context);
+      if (context.is_group_execution_cancelled())
+       // -- GODOT start --
+       // throw std::runtime_error("task cancelled");
+       abort(); 
+       // -- GODOT end --
+    #else
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },ap);
+      if (tbb::task::self().is_cancelled())
+        // -- GODOT start --
+        // throw std::runtime_error("task cancelled");
+        abort(); 
+        // -- GODOT end --
+    #endif
+  }
+
+#else
+
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_static( const Index N, const Func& func) 
+  {
+    parallel_for(N,func);
+  }
+
+  struct affinity_partitioner {
+  };
+
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap) 
+  {
+    parallel_for(N,func);
+  }
+
+#endif
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp
new file mode 100644
index 0000000000..0337611b35
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp
@@ -0,0 +1,63 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_for_for.h"
+#include "../sys/regression.h"
+
+namespace embree
+{
+  struct parallel_for_for_regression_test : public RegressionTest
+  {
+    parallel_for_for_regression_test(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+    
+    bool run ()
+    {
+      bool passed = true;
+
+      /* create vector with random numbers */
+      size_t sum0 = 0;
+      size_t K = 0;
+      const size_t M = 1000;
+      std::vector<std::vector<size_t>* > array2(M);
+      for (size_t i=0; i<M; i++) {
+        const size_t N = rand() % 1024;
+        K+=N;
+        array2[i] = new std::vector<size_t>(N);
+        for (size_t j=0; j<N; j++) 
+          sum0 += (*array2[i])[j] = rand();
+      }
+
+      /* array to test global index */
+      std::vector<atomic<size_t>> verify_k(K);
+      for (size_t i=0; i<K; i++) verify_k[i].store(0);
+
+      /* add all numbers using parallel_for_for */
+      std::atomic<size_t> sum1(0);
+      parallel_for_for( array2, size_t(1), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k) -> size_t
+      {
+        size_t s = 0;
+	for (size_t i=r.begin(); i<r.end(); i++) {
+	  s += (*v)[i];
+          verify_k[k++]++;
+        }
+        sum1 += s;
+	return sum1;
+      });
+      passed &= (sum0 == sum1);
+
+      /* check global index */
+      for (size_t i=0; i<K; i++) 
+        passed &= (verify_k[i] == 1);
+
+      /* delete vectors again */
+      for (size_t i=0; i<array2.size(); i++)
+	delete array2[i];
+      
+      return passed;
+    }
+  };
+
+  parallel_for_for_regression_test parallel_for_for_regression("parallel_for_for_regression_test");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h
new file mode 100644
index 0000000000..852b8a0900
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h
@@ -0,0 +1,149 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+  /*! serial counterpart of parallel_for_for: calls func(array2[i],range(0,N),k) for every non-empty array, k being the global index of its first element; minStepSize is unused */
+  template<typename ArrayArray, typename Func>
+    __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )
+  {
+    for (size_t i=0, k=0; i!=array2.size(); ++i) {
+      const size_t N = array2[i] ? array2[i]->size() : 0; // null entries count as empty, as in ParallelForForState::init
+      if (N) func(array2[i],range<size_t>(0,N),k);
+      k+=N;
+    }
+  }
+
+  /*! splits all elements of an array of arrays into at most MAX_TASKS contiguous chunks of the flattened index space */
+  class ParallelForForState
+  {
+  public:
+    enum { MAX_TASKS = 64 };
+
+    /*! creates an empty state with zero tasks; N is zeroed as well so that size() never reads an indeterminate value */
+    __forceinline ParallelForForState ()
+      : taskCount(0), N(0) {}
+
+    /*! creates the state for array2, see init */
+    template<typename ArrayArray>
+      __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) { init(array2,minStepSize); }
+
+    /*! computes N, taskCount and the start position (i0,j0) of each task; minStepSize is used as a divisor and has to be non-zero */
+    template<typename ArrayArray>
+      __forceinline void init ( ArrayArray& array2, const size_t minStepSize )
+    {
+      /* first calculate total number of elements, null entries count as empty */
+      size_t N = 0;
+      for (size_t i=0; i<array2.size(); i++) {
+        N += array2[i] ? array2[i]->size() : 0;
+      }
+      this->N = N;
+
+      /* calculate number of tasks to use, this is always at least one */
+      const size_t numThreads = TaskScheduler::threadCount();
+      const size_t numBlocks  = (N+minStepSize-1)/minStepSize;
+      taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS)));
+
+      /* calculate start (i,j) for each task, task t starts at global index t*N/taskCount */
+      size_t taskIndex = 0;
+      i0[taskIndex] = 0;
+      j0[taskIndex] = 0;
+      size_t k0 = (++taskIndex)*N/taskCount; // global start index of the next task to place
+      for (size_t i=0, k=0; taskIndex < taskCount; i++) // k is the global index of element (i,j)
+      {
+        assert(i<array2.size());
+        size_t j=0, M = array2[i] ? array2[i]->size() : 0;
+        while (j<M && k+M-j >= k0 && taskIndex < taskCount) { // the next task starts inside or at the end of array i
+          assert(taskIndex<taskCount);
+          i0[taskIndex] = i;
+          j0[taskIndex] = j += k0-k;
+          k=k0;
+          k0 = (++taskIndex)*N/taskCount;
+        }
+        k+=M-j; // skip the remaining elements of array i
+      }
+    }
+
+    /*! returns the total number of elements counted by init */
+    __forceinline size_t size() const { return N; }
+
+  public:
+    size_t i0[MAX_TASKS]; //!< index of the array in which each task starts
+    size_t j0[MAX_TASKS]; //!< index inside that array at which each task starts
+    size_t taskCount;     //!< number of tasks
+    size_t N;             //!< total number of elements
+  };
+
+  /*! calls func(array,range,k) in parallel for chunks of all arrays of array2, k being the global index of the first element of the range */
+  template<typename ArrayArray, typename Func>
+    __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )
+  {
+    ParallelForForState state(array2,minStepSize);
+    parallel_for(state.taskCount, [&](const size_t taskIndex)
+    {
+      /* global index range [globalBegin,globalEnd) handled by this task */
+      const size_t total       = state.size();
+      const size_t globalBegin = (taskIndex+0)*total/state.taskCount;
+      const size_t globalEnd   = (taskIndex+1)*total/state.taskCount;
+
+      /* walk over the arrays, the first one is entered at element j0, all others at element 0 */
+      size_t first = state.j0[taskIndex];
+      size_t i     = state.i0[taskIndex];
+      for (size_t k=globalBegin; k<globalEnd; i++, first=0) {
+        const size_t N    = array2[i] ? array2[i]->size() : 0;
+        const size_t last = min(N,first+globalEnd-k);
+        if (last > first) func(array2[i],range<size_t>(first,last),k);
+        k += last-first;
+      }
+    });
+  }
+
+  /*! convenience overload of parallel_for_for that uses a minimal step size of 1 */
+  template<typename ArrayArray, typename Func>
+    __forceinline void parallel_for_for( ArrayArray& array2, const Func& func ) {
+    parallel_for_for(array2,size_t(1),func);
+  }
+
+  /*! like parallel_for_for, but combines the values returned by func with reduction, starting from identity */
+  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+  {
+    ParallelForForState state(array2,minStepSize);
+
+    /* one partial result per task */
+    Value temp[ParallelForForState::MAX_TASKS];
+    for (size_t t=0; t<state.taskCount; t++) temp[t] = identity;
+
+    parallel_for(state.taskCount, [&](const size_t taskIndex)
+    {
+      /* global index range [globalBegin,globalEnd) handled by this task */
+      const size_t globalBegin = (taskIndex+0)*state.size()/state.taskCount;
+      const size_t globalEnd   = (taskIndex+1)*state.size()/state.taskCount;
+
+      /* walk over the arrays, the first one is entered at element j0, all others at element 0 */
+      size_t first = state.j0[taskIndex];
+      size_t i     = state.i0[taskIndex];
+      for (size_t k=globalBegin; k<globalEnd; i++, first=0) {
+        const size_t N    = array2[i] ? array2[i]->size() : 0;
+        const size_t last = min(N,first+globalEnd-k);
+        if (last > first) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range<size_t>(first,last),k));
+        k += last-first;
+      }
+    });
+
+    /* combine the partial results in task order */
+    Value ret = identity;
+    for (size_t t=0; t<state.taskCount; t++) ret = reduction(ret,temp[t]);
+    return ret;
+  }
+
+  /*! convenience overload of parallel_for_for_reduce that uses a minimal step size of 1 */
+  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction) {
+    return parallel_for_for_reduce(array2,size_t(1),identity,func,reduction);
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp
new file mode 100644
index 0000000000..0169d8e481
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp
@@ -0,0 +1,85 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_for_for_prefix_sum.h"
+#include "../sys/regression.h"
+
+namespace embree
+{
+  /*! regression test that flattens a vector of vectors with the two passes parallel_for_for_prefix_sum0 and parallel_for_for_prefix_sum1 */
+  struct parallel_for_for_prefix_sum_regression_test : public RegressionTest
+  {
+    parallel_for_for_prefix_sum_regression_test(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+
+    bool run ()
+    {
+      typedef std::vector<std::vector<size_t>* > ArrayArray;
+
+      /* create M vectors of random length that hold random counts */
+      const size_t M = 10;
+      ArrayArray array2(M);
+      size_t K = 0; // total number of elements over all vectors
+      for (size_t i=0; i<M; i++) {
+        const size_t N = rand() % 10;
+        array2[i] = new std::vector<size_t>(N);
+        for (size_t& count : *array2[i]) count = rand() % 10;
+        K += N;
+      }
+
+      /* one visit counter per global element index */
+      std::vector<atomic<size_t>> verify_k(K);
+      for (auto& counter : verify_k) counter.store(0);
+
+      ParallelForForPrefixSumState<size_t> state(array2,size_t(1));
+      const auto add = [](size_t v0, size_t v1) { return v0+v1; };
+
+      /* dry run that only counts how many output slots are needed */
+      const size_t S = parallel_for_for_prefix_sum0( state, array2, size_t(0), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k, size_t i) -> size_t
+      {
+        size_t total = 0;
+        for (size_t idx=r.begin(); idx<r.end(); idx++, k++) {
+          total += (*v)[idx];
+          verify_k[k]++;
+        }
+        return total;
+      }, add);
+
+      /* create properly sized and zeroed output array */
+      std::vector<atomic<size_t>> flattened(S);
+      for (auto& slot : flattened) slot.store(0);
+
+      /* now each element marks its (*v)[idx] output slots, starting at base plus what the call has written so far */
+      parallel_for_for_prefix_sum1( state, array2, size_t(0), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k, size_t i, const size_t base) -> size_t
+      {
+        size_t written = 0;
+        for (size_t idx=r.begin(); idx<r.end(); idx++, k++) {
+          const size_t count = (*v)[idx];
+          for (size_t slot=base+written; slot<base+written+count; slot++)
+            flattened[slot]++;
+          written += count;
+          verify_k[k]++;
+        }
+        return written;
+      }, add);
+
+      /* each of the two passes has to visit every global index exactly once */
+      bool passed = true;
+      for (auto& counter : verify_k)
+        passed &= (counter == 2);
+
+      /* check if each output slot was assigned exactly once */
+      for (auto& slot : flattened)
+        passed &= (slot == 1);
+
+      /* delete arrays again */
+      for (std::vector<size_t>* inner : array2)
+        delete inner;
+
+      return passed;
+    }
+  };
+
+  parallel_for_for_prefix_sum_regression_test parallel_for_for_prefix_sum_regression("parallel_for_for_prefix_sum_regression_test");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h
new file mode 100644
index 0000000000..d2671d8a6a
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h
@@ -0,0 +1,112 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for_for.h"
+#include "parallel_prefix_sum.h"
+
+namespace embree
+{
+  template<typename Value>
+    struct ParallelForForPrefixSumState : public ParallelForForState // task decomposition plus per task prefix sum storage
+  {
+    __forceinline ParallelForForPrefixSumState () {} // base class and prefix_state are default constructed
+    /* forwards array2 and minStepSize to the ParallelForForState constructor */
+    template<typename ArrayArray>
+      __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize)
+      : ParallelForForState(array2,minStepSize) {}
+
+    ParallelPrefixSumState<Value> prefix_state; //!< counts and sums arrays written by parallel_for_for_prefix_sum0 and parallel_for_for_prefix_sum1
+  };
+  
+  /*! first pass: stores the reduced results of func per task in prefix_state.counts, their exclusive prefix sums in prefix_state.sums and returns the total; minStepSize is unused */
+  template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize,
+                                                      const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    const size_t taskCount = state.taskCount;
+
+    /* reduce the results of func for the elements of each task */
+    parallel_for(taskCount, [&](const size_t taskIndex)
+    {
+      /* global index range [globalBegin,globalEnd) handled by this task */
+      const size_t globalBegin = (taskIndex+0)*state.size()/taskCount;
+      const size_t globalEnd   = (taskIndex+1)*state.size()/taskCount;
+
+      /* walk over the arrays, the first one is entered at element j0, all others at element 0 */
+      size_t first = state.j0[taskIndex];
+      size_t i     = state.i0[taskIndex];
+      Value N=identity;
+      for (size_t k=globalBegin; k<globalEnd; i++, first=0) {
+        const size_t size = array2[i] ? array2[i]->size() : 0;
+        const size_t last = min(size,first+globalEnd-k);
+        if (last > first) N = reduction(N, func(array2[i],range<Index>((Index)first,(Index)last),(Index)k,(Index)i));
+        k += last-first;
+      }
+      state.prefix_state.counts[taskIndex] = N;
+    });
+
+    /* turn the per task counts into exclusive prefix sums */
+    Value sum=identity;
+    for (size_t t=0; t<taskCount; t++)
+    {
+      const Value c = state.prefix_state.counts[t];
+      state.prefix_state.sums[t] = sum;
+      sum=reduction(sum,c);
+    }
+    return sum;
+  }
+
+  /*! second pass: like parallel_for_for_prefix_sum0, but func also receives prefix_state.sums of its task combined with what the task has reduced so far; minStepSize is unused */
+  template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize,
+                                                      const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    const size_t taskCount = state.taskCount;
+
+    /* reduce the results of func for the elements of each task */
+    parallel_for(taskCount, [&](const size_t taskIndex)
+    {
+      /* global index range [globalBegin,globalEnd) handled by this task */
+      const size_t globalBegin = (taskIndex+0)*state.size()/taskCount;
+      const size_t globalEnd   = (taskIndex+1)*state.size()/taskCount;
+
+      /* walk over the arrays, the first one is entered at element j0, all others at element 0 */
+      size_t first = state.j0[taskIndex];
+      size_t i     = state.i0[taskIndex];
+      Value N=identity;
+      for (size_t k=globalBegin; k<globalEnd; i++, first=0) {
+        const size_t size = array2[i] ? array2[i]->size() : 0;
+        const size_t last = min(size,first+globalEnd-k);
+        if (last > first) N = reduction(N, func(array2[i],range<Index>((Index)first,(Index)last),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N)));
+        k += last-first;
+      }
+      state.prefix_state.counts[taskIndex] = N;
+    });
+
+    /* turn the per task counts into exclusive prefix sums */
+    Value sum=identity;
+    for (size_t t=0; t<taskCount; t++)
+    {
+      const Value c = state.prefix_state.counts[t];
+      state.prefix_state.sums[t] = sum;
+      sum=reduction(sum,c);
+    }
+    return sum;
+  }
+
+  /*! convenience overload of parallel_for_for_prefix_sum0 that passes a minimal step size of 1 */
+  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2,
+                                                      const Value& identity, const Func& func, const Reduction& reduction) {
+    return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction);
+  }
+
+  /*! convenience overload of parallel_for_for_prefix_sum1 that passes a minimal step size of 1 */
+  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2,
+                                                      const Value& identity, const Func& func, const Reduction& reduction) {
+    return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction);
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp
new file mode 100644
index 0000000000..09dc303f81
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp
@@ -0,0 +1,47 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_map.h"
+#include "../sys/regression.h"
+
+namespace embree
+{
+  /*! regression test that builds a parallel_map from shuffled keys and checks lookups of contained and of missing keys */
+  struct parallel_map_regression_test : public RegressionTest
+  {
+    parallel_map_regression_test(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+
+    bool run ()
+    {
+      bool passed = true;
+
+      /* create key/value vectors with random numbers, all keys are even */
+      const size_t N = 10000;
+      std::vector<uint32_t> keys(N);
+      std::vector<uint32_t> vals(N);
+      for (size_t i=0; i<N; i++) keys[i] = 2*unsigned(i)*647382649;
+      for (size_t i=0; i<N; i++) std::swap(keys[i],keys[rand()%N]);
+      for (size_t i=0; i<N; i++) vals[i] = 2u*unsigned(rand()); // unsigned math, 2*rand() overflows int if rand() > INT_MAX/2
+
+      /* create map */
+      parallel_map<uint32_t,uint32_t> map;
+      map.init(keys,vals);
+
+      /* check that all keys are properly mapped */
+      for (size_t i=0; i<N; i++) {
+        const uint32_t* val = map.lookup(keys[i]);
+        passed &= val && (*val == vals[i]);
+      }
+
+      /* check that these keys are not in the map, keys[i]+1 is odd while all keys are even */
+      for (size_t i=0; i<N; i++)
+        passed &= !map.lookup(keys[i]+1);
+
+      return passed;
+    }
+  };
+
+  parallel_map_regression_test parallel_map_regression("parallel_map_regression_test");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_map.h b/thirdparty/embree-aarch64/common/algorithms/parallel_map.h
new file mode 100644
index 0000000000..02e1a8f8d0
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_map.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_sort.h"
+
+namespace embree
+{
+  /*! key/value map that is constructed in parallel and queried by binary search */
+  template<typename Key, typename Val>
+  class parallel_map
+  {
+    /* key/value pair stored in the map, converts implicitly to its key */
+    struct KeyValue
+    {
+      __forceinline KeyValue () {}
+      __forceinline KeyValue (const Key key, const Val val) : key(key), val(val) {}
+      __forceinline operator Key() const { return key; }
+
+    public:
+      Key key;
+      Val val;
+    };
+
+  public:
+
+    /*! creates an empty map */
+    parallel_map () {}
+
+    /*! creates the map from a vector of keys and a vector of values */
+    template<typename KeyVector, typename ValVector>
+      parallel_map (const KeyVector& keys, const ValVector& values) {
+      init(keys,values);
+    }
+
+    /*! initializes the parallel map from a vector of keys and a vector of values of the same size */
+    template<typename KeyVector, typename ValVector>
+      void init(const KeyVector& keys, const ValVector& values)
+    {
+      /* reserve sufficient space for one pair per key */
+      const size_t numPairs = keys.size();
+      assert(numPairs == values.size());
+      vec.resize(numPairs);
+
+      /* generate the key/value pairs in parallel */
+      parallel_for( size_t(0), numPairs, size_t(4*4096), [&](const range<size_t>& r)
+      {
+        for (size_t i=r.begin(); i<r.end(); i++) {
+          vec[i] = KeyValue((Key)keys[i],values[i]);
+        }
+      });
+
+      /* perform parallel radix sort of the key/value pairs */
+      std::vector<KeyValue> temp(numPairs);
+      radix_sort<KeyValue,Key>(vec.data(),temp.data(),numPairs);
+    }
+
+    /*! Returns a pointer to the value associated with the specified key. The pointer is nullptr if the key is not contained in the map. */
+    __forceinline const Val* lookup(const Key& key) const
+    {
+      const auto entry = std::lower_bound(vec.begin(), vec.end(), key);
+      if (entry == vec.end() || entry->key != key)
+        return nullptr;
+      return &entry->val;
+    }
+
+    /*! Returns the value associated with the key if the key is in the map, otherwise the default value def. */
+    __forceinline Val lookup(const Key& key, const Val& def) const
+    {
+      const auto entry = std::lower_bound(vec.begin(), vec.end(), key);
+      if (entry == vec.end() || entry->key != key)
+        return def;
+      return entry->val;
+    }
+
+    /*! removes all elements from the map */
+    void clear() {
+      vec.clear();
+    }
+
+  private:
+    std::vector<KeyValue> vec;    //!< vector containing sorted elements
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp
new file mode 100644
index 0000000000..eb20c4465d
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp
@@ -0,0 +1,53 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_partition.h"
+#include "../sys/regression.h"
+
+namespace embree
+{
+  /*! regression test that partitions random permutations with parallel_partitioning and verifies split position, sums and element placement */
+  struct parallel_partition_regression_test : public RegressionTest
+  {
+    parallel_partition_regression_test(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+
+    bool run ()
+    {
+      bool passed = true;
+      for (size_t iteration=0; iteration<100; iteration++)
+      {
+        /* create random permutation of the numbers 0..N-1 */
+        const size_t N = std::rand() % 1000000;
+        std::vector<unsigned> array(N);
+        for (unsigned value=0; value<N; value++) array[value] = value;
+        for (auto& v : array) std::swap(v,array[std::rand()%array.size()]);
+        const size_t split = std::rand() % (N+1);
+
+        /* move all values smaller than split to the left and sum up both sides */
+        size_t left_sum = 0, right_sum = 0;
+        const size_t mid = parallel_partitioning(array.data(),0,array.size(),0,left_sum,right_sum,
+                                                 [&] ( size_t i ) { return i < split; },
+                                                 []  ( size_t& sum, unsigned v) { sum += v; },
+                                                 []  ( size_t& sum, size_t v) { sum += v; },
+                                                 128);
+
+        /*serial_partitioning(array.data(),0,array.size(),left_sum,right_sum,
+                            [&] ( size_t i ) { return i < split; },
+                            []  ( size_t& left_sum, int v) { left_sum += v; });*/
+        /* verify result: the left side has to hold exactly the values 0..split-1 */
+        if (mid != split) passed = false;
+        if (left_sum != split*(split-1)/2) passed = false;
+        if (right_sum != N*(N-1)/2-left_sum) passed = false;
+        for (size_t i=0; i<split; i++)
+          if (array[i] >= split) passed = false;
+        for (size_t i=split; i<N; i++)
+          if (array[i] < split) passed = false;
+      }
+      return passed;
+    }
+  };
+
+  parallel_partition_regression_test parallel_partition_regression("parallel_partition_regression_test");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h
new file mode 100644
index 0000000000..3b3ad7c854
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h
@@ -0,0 +1,283 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+#include "../math/range.h"
+
+namespace embree
+{
+  /* serial partitioning: reorders array[begin,end) in place such that all elements with is_left(x) come first and returns the index (relative to array) of the first element of the right part; reduction_t is called once per element with leftReduction or rightReduction, depending on the side the element ends up on */
+  template<typename T, typename V, typename IsLeft, typename Reduction_T>
+    __forceinline size_t serial_partitioning(T* array, 
+                                             const size_t begin,
+                                             const size_t end, 
+                                             V& leftReduction,
+                                             V& rightReduction,
+                                             const IsLeft& is_left, 
+                                             const Reduction_T& reduction_t)
+  {
+    T* l = array + begin;   // next unprocessed element from the left
+    T* r = array + end - 1; // next unprocessed element from the right
+    
+    while(1)
+    {
+      /* skip elements that already are on the left side */
+      while (likely(l <= r && is_left(*l) )) 
+      {
+        //prefetchw(l+4); // FIXME: enable?
+        reduction_t(leftReduction,*l);
+        ++l;
+      }
+      /* skip elements that already are on the right side */
+      while (likely(l <= r && !is_left(*r)))
+      {
+        //prefetchw(r-4); FIXME: enable?
+        reduction_t(rightReduction,*r);
+        --r;
+      }
+      if (r<l) break; // both cursors crossed, every element has been processed
+      
+      reduction_t(leftReduction ,*r); // *r is a left element and is moved to the left side by the swap
+      reduction_t(rightReduction,*l); // *l is a right element and is moved to the right side by the swap
+      xchg(*l,*r);
+      l++; r--;
+    }
+    
+    return l - array;        
+  }
+
+  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+    class __aligned(64) parallel_partition_task // state and algorithm of one parallel partitioning of array[0,N)
+  {
+    ALIGNED_CLASS_(64);
+  private:
+    /* upper bound on numTasks, sizes the per task arrays below */
+    static const size_t MAX_TASKS = 64;
+    /* inputs; the functors and identity are stored by reference and therefore have to outlive this object */
+    T* array;
+    size_t N;
+    const IsLeft& is_left;
+    const Reduction_T& reduction_t;
+    const Reduction_V& reduction_v;
+    const Vi& identity;
+    /* number of tasks and the per task data filled in by partition() */
+    size_t numTasks; 
+    __aligned(64) size_t counter_start[MAX_TASKS+1]; // first index of the sub-range of each task, entry numTasks is N
+    __aligned(64) size_t counter_left[MAX_TASKS+1];  // number of left items in the sub-range of each task
+    __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS];  // ranges of right items that are located inside the global left part
+    __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS]; // ranges of left items that are located inside the global right part
+    __aligned(64) V leftReductions[MAX_TASKS];           // left reduction of each task
+    __aligned(64) V rightReductions[MAX_TASKS];    // right reduction of each task
+
+  public:
+    /* numTasks is limited by the number of BLOCK_SIZE blocks in N, by the thread count and by MAX_TASKS */
+    __forceinline parallel_partition_task(T* array, 
+                                          const size_t N, 
+                                          const Vi& identity, 
+                                          const IsLeft& is_left, 
+                                          const Reduction_T& reduction_t, 
+                                          const Reduction_V& reduction_v,
+                                          const size_t BLOCK_SIZE) 
+
+      : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity),
+      numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {}
+    /* returns the range of r that contains item number index of the concatenated ranges and turns index into the offset inside that range; numRanges is only used by the assertion */
+    __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges)
+    {
+      size_t i = 0;
+      while(index >= (size_t)r[i].size())
+      {
+        assert(i < numRanges);
+        index -= (size_t)r[i].size();
+        i++;
+      }	    
+      return &r[i];
+    }
+    /* swaps the items number startID..endID-1 of the concatenated left misplaced ranges with the same items of the concatenated right misplaced ranges */
+    __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges,
+                                                  const size_t numRightMisplacedRanges,
+                                                  const size_t startID,
+                                                  const size_t endID)
+    {
+      size_t leftLocalIndex  = startID;
+      size_t rightLocalIndex = startID;
+      const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges);
+      const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges);
+      /* l_left and r_left count the items that remain in the current left and right range */
+      size_t l_left = l_range->size() - leftLocalIndex;
+      size_t r_left = r_range->size() - rightLocalIndex;
+      T *__restrict__ l = &array[l_range->begin() + leftLocalIndex];
+      T *__restrict__ r = &array[r_range->begin() + rightLocalIndex];
+      size_t size = endID - startID; // number of items that still have to be swapped
+      size_t items = min(size,min(l_left,r_left)); // items that can be swapped before a range or the work is exhausted
+     
+      while (size)
+      {
+        if (unlikely(l_left == 0)) // current left range is exhausted, continue with the next one
+        {
+          l_range++;
+          l_left = l_range->size();
+          l = &array[l_range->begin()];
+          items = min(size,min(l_left,r_left));
+        }
+
+        if (unlikely(r_left == 0)) // current right range is exhausted, continue with the next one
+        {		
+          r_range++;
+          r_left = r_range->size();
+          r = &array[r_range->begin()];          
+          items = min(size,min(l_left,r_left));
+        }
+
+        size   -= items;
+        l_left -= items;
+        r_left -= items;
+
+        while(items) {
+          items--;
+          xchg(*l++,*r++);
+        }
+      }
+    }
+    /* partitions array[0,N) and returns the number of left items; the per task reductions are merged into leftReduction and rightReduction using reduction_v */
+    __forceinline size_t partition(V& leftReduction, V& rightReduction)
+    {
+      /* partition the individual ranges for each task */
+      parallel_for(numTasks,[&] (const size_t taskID) {
+          const size_t startID = (taskID+0)*N/numTasks;
+          const size_t endID   = (taskID+1)*N/numTasks;
+          V local_left(identity);
+          V local_right(identity);
+          const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t);
+          counter_start[taskID] = startID;
+          counter_left [taskID] = mid-startID;
+          leftReductions[taskID]  = local_left;
+          rightReductions[taskID] = local_right;
+        });
+      counter_start[numTasks] = N; // sentinel that ends the sub-range of the last task
+      counter_left[numTasks]  = 0;
+      
+      /* finalize the reductions */
+      for (size_t i=0; i<numTasks; i++) {
+        reduction_v(leftReduction,leftReductions[i]);
+        reduction_v(rightReduction,rightReductions[i]);
+      }
+
+      /* calculate mid point for partitioning */
+      size_t mid = counter_left[0];
+      for (size_t i=1; i<numTasks; i++)
+        mid += counter_left[i];
+      const range<ssize_t> globalLeft (0,mid);
+      const range<ssize_t> globalRight(mid,N);
+
+      /* calculate all left and right ranges that are on the wrong global side */
+      size_t numMisplacedRangesLeft  = 0;
+      size_t numMisplacedRangesRight = 0;
+      size_t numMisplacedItemsLeft   = 0;
+      size_t numMisplacedItemsRight  = 0;
+
+      for (size_t i=0; i<numTasks; i++)
+      {	    
+        const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]);
+        const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]);
+        const range<ssize_t> left_misplaced  = globalLeft. intersect(right_range);
+        const range<ssize_t> right_misplaced = globalRight.intersect(left_range);
+
+        if (!left_misplaced.empty())  
+        {
+          numMisplacedItemsLeft += left_misplaced.size();
+          leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced;
+        }
+
+        if (!right_misplaced.empty()) 
+        {
+          numMisplacedItemsRight += right_misplaced.size();
+          rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced;
+        }
+      }
+      assert( numMisplacedItemsLeft == numMisplacedItemsRight );
+
+      /* if no items are misplaced we are done */
+      if (numMisplacedItemsLeft == 0)
+        return mid;
+
+      /* otherwise we copy the items to the right place in parallel */
+      parallel_for(numTasks,[&] (const size_t taskID) {
+          const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks;
+          const size_t endID   = (taskID+1)*numMisplacedItemsLeft/numTasks;
+          swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID);	                             
+        });
+
+      return mid;
+    }
+  };
+
+  /*! partitions array[begin,end) by is_left and returns the index of the first right element; inputs shorter than BLOCK_SIZE are partitioned serially */
+  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+    __noinline size_t parallel_partitioning(T* array,
+                                            const size_t begin,
+                                            const size_t end,
+                                            const Vi &identity,
+                                            V &leftReduction,
+                                            V &rightReduction,
+                                            const IsLeft& is_left,
+                                            const Reduction_T& reduction_t,
+                                            const Reduction_V& reduction_v,
+                                            size_t BLOCK_SIZE = 128)
+  {
+    const size_t numItems = end-begin;
+    /* fall back to single threaded partitioning for small inputs */
+    if (unlikely(numItems < BLOCK_SIZE))
+      return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
+
+    /* otherwise run the parallel task on the sub-array, its result is relative to begin */
+    typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
+    std::unique_ptr<partition_task> task(new partition_task(array+begin,numItems,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
+    return begin+task->partition(leftReduction,rightReduction);
+  }
+
+  /*! partitions array[begin,end) by is_left and returns the index of the first right element; inputs shorter than PARALLEL_THRESHOLD are partitioned serially */
+  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+    __noinline size_t parallel_partitioning(T* array,
+                                            const size_t begin,
+                                            const size_t end,
+                                            const Vi &identity,
+                                            V &leftReduction,
+                                            V &rightReduction,
+                                            const IsLeft& is_left,
+                                            const Reduction_T& reduction_t,
+                                            const Reduction_V& reduction_v,
+                                            size_t BLOCK_SIZE,
+                                            size_t PARALLEL_THRESHOLD)
+  {
+    const size_t numItems = end-begin;
+    /* fall back to single threaded partitioning for small inputs */
+    if (unlikely(numItems < PARALLEL_THRESHOLD))
+      return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
+
+    /* otherwise run the parallel task on the sub-array, its result is relative to begin */
+    typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
+    std::unique_ptr<partition_task> task(new partition_task(array+begin,numItems,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
+    return begin+task->partition(leftReduction,rightReduction);
+  }
+
+
+  /*! partitions array[begin,end) by is_left without computing any reduction and returns the index of the first right element */
+  template<typename T, typename IsLeft>
+    inline size_t parallel_partitioning(T* array,
+                                        const size_t begin,
+                                        const size_t end,
+                                        const IsLeft& is_left,
+                                        size_t BLOCK_SIZE = 128)
+  {
+    size_t unusedLeft = 0;  // dummy reduction targets, the functors below never modify them
+    size_t unusedRight = 0;
+    const auto ignoreItem   = [] (size_t& t,const T& ref) {  };
+    const auto ignoreResult = [] (size_t& t0,size_t& t1) { };
+    return parallel_partitioning(array,begin,end,0,unusedLeft,unusedRight,is_left,
+                                 ignoreItem,ignoreResult,BLOCK_SIZE);
+  }
+
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp
new file mode 100644
index 0000000000..685952c3dc
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp
@@ -0,0 +1,48 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_prefix_sum.h"
+#include "../sys/regression.h"
+
+namespace embree
+{
+  /*! regression test that compares parallel_prefix_sum against a serial prefix sum for arrays of growing size */
+  struct parallel_prefix_sum_regression_test : public RegressionTest
+  {
+    parallel_prefix_sum_regression_test(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+
+    bool run ()
+    {
+      bool passed = true;
+      const size_t M = 10; // number of times the prefix sum is repeated per array size
+
+      for (size_t N=10; N<10000000; N=size_t(2.1*N))
+      {
+        /* initialize array with random numbers and sum them up serially */
+        uint32_t sum0 = 0;
+        std::vector<uint32_t> src(N);
+        for (uint32_t& value : src) {
+          value = rand();
+          sum0 += value;
+        }
+
+        /* calculate parallel prefix sum M times into a zeroed destination */
+        std::vector<uint32_t> dst(N,0);
+        for (size_t repeat=0; repeat<M; repeat++) {
+          const uint32_t sum1 = parallel_prefix_sum(src,dst,N,0,std::plus<uint32_t>());
+          if (sum0 != sum1) passed = false;
+        }
+
+        /* check if prefix sum is correct: dst[i] has to be the sum of all src elements before index i */
+        size_t expected = 0;
+        for (size_t i=0; i<N; expected+=src[i++])
+          if ((uint32_t)expected != dst[i]) passed = false;
+      }
+      return passed;
+    }
+  };
+
+  parallel_prefix_sum_regression_test parallel_prefix_sum_regression("parallel_prefix_sum_regression");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h
new file mode 100644
index 0000000000..117c7a79b0
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+  template<typename Value>
+    struct ParallelPrefixSumState 
+  {
+    enum { MAX_TASKS = 64 };   // upper bound on the number of tasks one prefix sum pass is split into
+    Value counts[MAX_TASKS];   // per task value returned by func in the most recent pass
+    Value sums  [MAX_TASKS];   // per task start value: reduction of the counts of all preceding tasks
+  };
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    /* one task per block of minStepSize elements, limited by the thread count and MAX_TASKS */
+    const size_t threadCount = TaskScheduler::threadCount();
+    const size_t blockCount  = (last-first+minStepSize-1)/minStepSize;
+    const size_t taskCount   = min(threadCount,blockCount,size_t(ParallelPrefixSumState<Value>::MAX_TASKS));
+
+    /* every task runs func on its slice, passing in the start value stored in state.sums */
+    parallel_for(taskCount, [&](const size_t taskIndex)
+    {
+      const size_t sliceBegin = first+(taskIndex+0)*(last-first)/taskCount;
+      const size_t sliceEnd   = first+(taskIndex+1)*(last-first)/taskCount;
+      state.counts[taskIndex] = func(range<size_t>(sliceBegin,sliceEnd),state.sums[taskIndex]);
+    });
+
+    /* exclusive scan over the task results: sums[i] receives the reduction of tasks 0..i-1 */
+    Value total=identity;
+    for (size_t task=0; task<taskCount; task++)
+    {
+      const Value taskResult = state.counts[task];
+      state.sums[task] = total;
+      total=reduction(total,taskResult);
+    }
+
+    return total;
+  }
+
+  /*! computes the exclusive prefix sum of src[0..N) into dst and returns the sum of all N elements */
+  template<typename SrcArray, typename DstArray, typename Value, typename Add>
+    __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096) 
+  {
+    /* inputs below the threshold are processed sequentially */
+    if (N < SINGLE_THREAD_THRESHOLD) 
+    {
+      Value running=identity;
+      for (size_t i=0; i<N; i++) {
+        dst[i] = running;
+        running = add(running,src[i]);
+      }
+      return running;
+    }
+
+    /* larger inputs are processed by two parallel passes that share this state */
+    ParallelPrefixSumState<Value> state;
+
+    /* pass 1 only accumulates the elements of each task's range; its start value argument is ignored */
+    auto accumulate = [&](const range<size_t>& r, const Value& sum) -> Value {
+      Value partial = identity;
+      for (size_t i=r.begin(); i<r.end(); i++) partial = add(partial,src[i]);
+      return partial;
+    };
+
+    parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, accumulate, add);
+
+    /* pass 2 writes the prefix values, each offset by the start value passed in as sum */
+    auto scatter = [&](const range<size_t>& r, const Value& sum) -> Value {
+      Value partial = identity;
+      for (size_t i=r.begin(); i<r.end(); i++) {
+        dst[i] = add(sum,partial);
+        partial = add(partial,src[i]);
+      }
+      return partial;
+    };
+
+    return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, scatter, add);
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp
new file mode 100644
index 0000000000..331fe4288e
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp
@@ -0,0 +1,49 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_reduce.h"
+#include "../sys/regression.h"
+
+namespace embree
+{
+  struct parallel_reduce_regression_test : public RegressionTest
+  {
+    parallel_reduce_regression_test(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+    /* compares parallel_reduce against a sequential sum of squares; returns true only if every run matched */
+    bool run ()
+    {
+      bool passed = true;
+
+      const size_t M = 10; // number of times the parallel reduction is repeated per size
+      for (size_t N=10; N<10000000; N=size_t(2.1*N))
+      {
+        /* sequentially calculate sum of squares */
+        size_t sum0 = 0;
+        for (size_t i=0; i<N; i++) {
+          sum0 += i*i;
+        }
+
+        /* parallel calculation of sum of squares */
+        for (size_t m=0; m<M; m++)
+        {
+          size_t sum1 = parallel_reduce( size_t(0), size_t(N), size_t(1024), size_t(0), [&](const range<size_t>& r) -> size_t
+          {
+            size_t s = 0;
+            for (size_t i=r.begin(); i<r.end(); i++) 
+              s += i*i;
+            return s;
+          }, 
+          [](const size_t v0, const size_t v1) {
+            return v0+v1;
+          });
+          passed &= (sum0 == sum1); // accumulate, so a mismatch in an earlier run is not overwritten
+        }
+      }
+      return passed;
+    }
+  };
+
+  parallel_reduce_regression_test parallel_reduce_regression("parallel_reduce_regression_test");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h
new file mode 100644
index 0000000000..0daf94e50e
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h
@@ -0,0 +1,150 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) 
+  { // sequential variant: func is applied once to the whole range; identity and reduction are not used
+    return func(range<Index>(first,last));
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+  { // sequential variant: func is applied once to the whole range; minStepSize, identity and reduction are not used
+    return func(range<Index>(first,last));
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+  {
+    /* clamp the requested task count to the thread count and to an upper limit of 512 */
+    const Index taskLimit = 512;
+    const Index numThreads = (Index) TaskScheduler::threadCount();
+    taskCount = min(taskCount,numThreads,taskLimit);
+
+    /* parallel invocation of all tasks: each one stores func of its slice in values */
+    dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack
+    parallel_for(taskCount, [&](const Index taskIndex) {
+        const Index sliceBegin = first+(taskIndex+0)*(last-first)/taskCount;
+        const Index sliceEnd   = first+(taskIndex+1)*(last-first)/taskCount;
+        values[taskIndex] = func(range<Index>(sliceBegin,sliceEnd));
+      });
+    /* fold the per task results in task order, starting from identity */
+    Value result = identity;
+    for (Index task=0; task<taskCount; task++) result = reduction(result,values[task]);
+    return result;
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+  { // reduces the per range results of func with reduction; the implementation is selected by the TASKING_* macros
+#if defined(TASKING_INTERNAL) || (defined(TASKING_GCD) && defined(BUILD_IOS))
+
+    /* fast path: a range that fits into a single step of minStepSize is evaluated by one direct call of func */
+    Index taskCount = (last-first+minStepSize-1)/minStepSize;
+    if (likely(taskCount == 1)) {
+      return func(range<Index>(first,last));
+    }
+    return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction); // clamps taskCount and splits the range
+
+#elif defined(TASKING_TBB)
+  #if TBB_INTERFACE_VERSION >= 12002
+    tbb::task_group_context context;
+    const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
+      [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
+      reduction,context);
+    // -- GODOT start --
+    // if (context.is_group_execution_cancelled())
+    //   throw std::runtime_error("task cancelled");
+    // -- GODOT end --
+    return v;
+  #else
+    const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
+      [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
+      reduction);
+    // -- GODOT start --
+    // if (tbb::task::self().is_cancelled())
+    //   throw std::runtime_error("task cancelled");
+    // -- GODOT end --
+    return v;
+  #endif
+#else // TASKING_PPL
+    struct AlignedValue // holds a Value inside over-sized char storage at a manually aligned address
+    {
+      char storage[__alignof(Value)+sizeof(Value)];
+      static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); }; // rounds p up to a multiple of a (assumes a is a power of two, as alignments are)
+      Value* getValuePtr() { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
+      const Value* getValuePtr() const { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
+      AlignedValue(const Value& v) { new(getValuePtr()) Value(v); }
+      AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); }
+      AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); };
+      AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
+      AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
+      operator Value() const { return *getValuePtr(); }
+    };
+    
+    struct Iterator_Index // iterator-like wrapper around a plain index, passed to concurrency::parallel_reduce below
+    {
+      Index v;
+      typedef std::forward_iterator_tag iterator_category;
+      typedef AlignedValue value_type;
+      typedef Index difference_type;
+      typedef Index distance_type;
+      typedef AlignedValue* pointer;
+      typedef AlignedValue& reference;
+      __forceinline Iterator_Index() {}
+      __forceinline Iterator_Index(Index v) : v(v) {}
+      __forceinline bool operator== (Iterator_Index other) { return v == other.v; }
+      __forceinline bool operator!= (Iterator_Index other) { return v != other.v; }
+      __forceinline Iterator_Index operator++() { return Iterator_Index(++v); }
+      __forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); }
+    };
+    
+    auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) { // folds func of one sub range into start
+      assert(begin.v < end.v);
+      return reduction(start, func(range<Index>(begin.v, end.v)));
+    };
+    const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction);
+    return v;
+#endif
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
+  {
+    /* ranges shorter than parallel_threshold are handled by a single direct call of func */
+    if (likely(last-first < parallel_threshold))
+      return func(range<Index>(first,last));
+    /* everything else is forwarded to the parallel overload without a threshold */
+    return parallel_reduce(first,last,minStepSize,identity,func,reduction);
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const range<Index> range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) 
+  { // convenience overload: unpacks the range into begin/end and forwards all other arguments unchanged
+    return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction);
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
+  {
+    /* adapts the per index func to a per range functor that folds all indices of the range */
+    const auto foldRange = [&] ( const range<Index> r ) {
+      Value acc = identity;
+      for (Index idx=r.begin(); idx<r.end(); idx++) acc = reduction(acc,func(idx));
+      return acc;
+    };
+    return parallel_reduce(first,last,Index(1),identity,foldRange,reduction);
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const range<Index> range, const Value& identity, const Func& func, const Reduction& reduction )
+  { // convenience overload: unpacks the range into begin/end and forwards with a minimum step size of Index(1)
+    return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction);
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp
new file mode 100644
index 0000000000..20b639c1c9
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp
@@ -0,0 +1,43 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_set.h"
+#include "../sys/regression.h"
+
+namespace embree
+{
+  struct parallel_set_regression_test : public RegressionTest
+  {
+    parallel_set_regression_test(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+    /* builds a parallel_set from even random numbers and checks lookups of present and absent values */
+    bool run ()
+    {
+      bool passed = true;
+
+      /* create vector with even random numbers; the doubling is done in uint32_t because
+         2*rand() overflows int (undefined behavior) whenever rand() exceeds INT_MAX/2 */
+      const size_t N = 10000;
+      std::vector<uint32_t> unsorted(N);
+      for (size_t i=0; i<N; i++) unsorted[i] = 2*uint32_t(rand());
+      
+      /* create set from numbers */
+      parallel_set<uint32_t> sorted;
+      sorted.init(unsorted);
+
+      /* check that all elements are in the set */
+      for (size_t i=0; i<N; i++) {
+        passed &= sorted.lookup(unsorted[i]);
+      }
+
+      /* the successor of an even number is odd and therefore must not be in the set */
+      for (size_t i=0; i<N; i++) {
+        passed &= !sorted.lookup(unsorted[i]+1);
+      }
+      return passed;
+    }
+  };
+
+  parallel_set_regression_test parallel_set_regression("parallel_set_regression_test");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_set.h b/thirdparty/embree-aarch64/common/algorithms/parallel_set.h
new file mode 100644
index 0000000000..640beba7ec
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_set.h
@@ -0,0 +1,52 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_sort.h"
+
+namespace embree
+{
+  /* set of values backed by a sorted vector that is filled and sorted in parallel */
+  template<typename T>
+  class parallel_set
+  {
+  public:
+
+    /*! creates an empty set */
+    parallel_set () {}
+
+    /*! creates a set holding the elements of the passed vector */
+    template<typename Vector>
+      parallel_set (const Vector& in) { init(in); }
+
+    /*! replaces the content of the set with the elements of the passed vector */
+    template<typename Vector>
+      void init(const Vector& in) 
+    {
+      const size_t count = in.size();
+      /* parallel element wise copy into the internal vector */
+      vec.resize(count);
+      parallel_for( size_t(0), count, size_t(4*4096), [&](const range<size_t>& block) {
+        for (size_t idx=block.begin(); idx<block.end(); idx++) vec[idx] = in[idx];
+      });
+
+      /* radix sort the internal vector, using a scratch buffer of equal size */
+      std::vector<T> scratch(count);
+      radix_sort<T>(vec.data(),scratch.data(),vec.size());
+    }
+
+    /*! returns true if elt is contained in the set (binary search over the sorted vector) */
+    __forceinline bool lookup(const T& elt) const {
+      return std::binary_search(vec.begin(), vec.end(), elt);
+    }
+
+    /*! removes all elements */
+    void clear() {
+      vec.clear();
+    }
+
+  private:
+    std::vector<T> vec;   //!< sorted storage of the set elements
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp
new file mode 100644
index 0000000000..5e7ec79ac1
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp
@@ -0,0 +1,50 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_sort.h"
+#include "../sys/regression.h"
+
+namespace embree
+{
+  template<typename Key>
+  struct RadixSortRegressionTest : public RegressionTest
+  {
+    RadixSortRegressionTest(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+
+    /* sorts random arrays of growing size and verifies both the checksum and the ordering */
+    bool run ()
+    {
+      const size_t numRepetitions = 10;
+      bool ok = true;
+      size_t count = 10;
+      while (count < 1000000)
+      {
+        std::vector<Key> keys(count); memset(keys.data(),0,count*sizeof(Key));
+        std::vector<Key> scratch(count); memset(scratch.data(),0,count*sizeof(Key));
+        /* fill the keys with products of two random numbers and remember their sum */
+        Key checksumBefore = 0;
+        for (auto& key : keys) {
+          key = uint64_t(rand())*uint64_t(rand());
+          checksumBefore += key;
+        }
+        /* sort the keys numRepetitions times, using scratch as temporary buffer */
+        for (size_t rep=0; rep<numRepetitions; rep++)
+          radix_sort<Key>(keys.data(),scratch.data(),count);
+        /* sorting must preserve the sum of all keys */
+        Key checksumAfter = 0;
+        for (const Key& key : keys) checksumAfter += key;
+        if (checksumBefore != checksumAfter) ok = false;
+        /* neighbouring keys must be in non-decreasing order */
+        for (size_t i=1; i<count; i++)
+          ok &= keys[i-1] <= keys[i];
+        count = size_t(2.1*count);
+      }
+      return ok;
+    }
+  };
+
+  RadixSortRegressionTest<uint32_t> test_u32("RadixSortRegressionTestU32");
+  RadixSortRegressionTest<uint64_t> test_u64("RadixSortRegressionTestU64");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h
new file mode 100644
index 0000000000..a758227c1b
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h
@@ -0,0 +1,457 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../simd/simd.h"
+#include "parallel_for.h"
+#if defined(TASKING_GCD) && defined(BUILD_IOS)
+#include "../sys/alloc.h"
+#endif
+#include <algorithm>
+
+namespace embree
+{
+  template<class T>
+    __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length)
+  {
+    /* array[0..sortedEnd) is sorted; every iteration inserts array[sortedEnd] into it */
+    for (size_t sortedEnd = 1; sortedEnd < length; ++sortedEnd)
+    {
+      T pending = array[sortedEnd];
+      size_t slot = sortedEnd;
+
+      /* move larger elements one position up to open the slot for the pending element */
+      for (; slot > 0 && pending < array[slot-1]; --slot)
+        array[slot] = array[slot-1];
+      array[slot] = pending;
+    }
+  }
+  
+  template<class T>
+    __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length)
+  {
+    /* array[0..sortedEnd) is sorted descending; every iteration inserts array[sortedEnd] into it */
+    for (size_t sortedEnd = 1; sortedEnd < length; ++sortedEnd)
+    {
+      T pending = array[sortedEnd];
+      size_t slot = sortedEnd;
+
+      /* move smaller elements one position up to open the slot for the pending element */
+      for (; slot > 0 && pending > array[slot-1]; --slot)
+        array[slot] = array[slot-1];
+      array[slot] = pending;
+    }
+  }
+  
+  /*! sorts t[begin..end] (both bounds inclusive) into ascending order; the first element serves as pivot value */
+  template<class T> 
+    void quicksort_ascending(T *__restrict__ t, 
+                             const ssize_t begin, 
+                             const ssize_t end)
+  {
+    if (likely(begin < end)) 
+    {      
+      const T pivotvalue = t[begin];
+      ssize_t left  = begin - 1;
+      ssize_t right = end   + 1;
+      while(1) 
+      {
+        while (t[--right] > pivotvalue);
+        while (t[++left] < pivotvalue);
+        
+        if (left >= right) break;
+        
+        const T temp = t[right];
+        t[right] = t[left];
+        t[left] = temp;
+      }
+      /* keep the split index in ssize_t: an int would truncate indices beyond INT_MAX */
+      const ssize_t pivot = right;
+      quicksort_ascending(t, begin, pivot);
+      quicksort_ascending(t, pivot + 1, end);
+    }
+  }
+  
+  /*! sorts t[begin..end] (both bounds inclusive) into descending order; the first element serves as pivot value */
+  template<class T> 
+    void quicksort_decending(T *__restrict__ t, 
+                             const ssize_t begin, 
+                             const ssize_t end)
+  {
+    if (likely(begin < end)) 
+    {
+      const T pivotvalue = t[begin];
+      ssize_t left  = begin - 1;
+      ssize_t right = end   + 1;
+      while(1) 
+      {
+        while (t[--right] < pivotvalue);
+        while (t[++left] > pivotvalue);
+        
+        if (left >= right) break;
+        
+        const T temp = t[right];
+        t[right] = t[left];
+        t[left] = temp;
+      }
+      /* keep the split index in ssize_t: an int would truncate indices beyond INT_MAX */
+      const ssize_t pivot = right;
+      quicksort_decending(t, begin, pivot);
+      quicksort_decending(t, pivot + 1, end);
+    }
+  }
+  
+  
+  template<class T, ssize_t THRESHOLD> 
+    void quicksort_insertionsort_ascending(T *__restrict__ t, 
+                                           const ssize_t begin, 
+                                           const ssize_t end)
+  {
+    /* nothing to do for ranges with fewer than two elements */
+    if (!likely(begin < end)) return;
+
+    /* ranges of at most THRESHOLD elements are finished with insertion sort */
+    const ssize_t count = end-begin+1;
+    if (likely(count <= THRESHOLD)) {
+      insertionsort_ascending<T>(&t[begin],count);
+      return;
+    }
+
+    /* partition around the value of the first element, scanning inwards from both ends */
+    const T pivotvalue = t[begin];
+    ssize_t lo = begin - 1;
+    ssize_t hi = end   + 1;
+    for (;;)
+    {
+      do { --hi; } while (t[hi] > pivotvalue);
+      do { ++lo; } while (t[lo] < pivotvalue);
+      if (lo >= hi) break;
+
+      /* exchange the two elements that are on the wrong side */
+      const T held = t[lo];
+      t[lo] = t[hi];
+      t[hi] = held;
+    }
+
+    /* recurse into [begin,split] and [split+1,end] */
+    const ssize_t split = hi;
+    quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, split);
+    quicksort_insertionsort_ascending<T,THRESHOLD>(t, split + 1, end);
+  }
+  
+  
+  template<class T, ssize_t THRESHOLD> 
+    void quicksort_insertionsort_decending(T *__restrict__ t, 
+                                           const ssize_t begin, 
+                                           const ssize_t end)
+  {
+    /* nothing to do for ranges with fewer than two elements */
+    if (!likely(begin < end)) return;
+
+    /* ranges of at most THRESHOLD elements are finished with insertion sort */
+    const ssize_t count = end-begin+1;
+    if (likely(count <= THRESHOLD)) {
+      insertionsort_decending<T>(&t[begin],count);
+      return;
+    }
+
+    /* partition around the value of the first element, scanning inwards from both ends */
+    const T pivotvalue = t[begin];
+    ssize_t lo = begin - 1;
+    ssize_t hi = end   + 1;
+    for (;;)
+    {
+      do { --hi; } while (t[hi] < pivotvalue);
+      do { ++lo; } while (t[lo] > pivotvalue);
+
+      if (lo >= hi) break;
+
+      /* exchange the two elements that are on the wrong side */
+      const T held = t[lo];
+      t[lo] = t[hi];
+      t[hi] = held;
+    }
+
+    /* recurse into [begin,split] and [split+1,end] */
+    const ssize_t split = hi;
+    quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, split);
+    quicksort_insertionsort_decending<T,THRESHOLD>(t, split + 1, end);
+  }
+  
+  template<typename T>
+    static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8)
+  { // in-place radix sort on the unsigned() conversion of each element: 8 bits per level, starting at bit `shift`
+    static const unsigned int BITS = 8;
+    static const unsigned int BUCKETS = (1 << BITS);
+    static const unsigned int CMP_SORT_THRESHOLD = 16;
+    
+    __aligned(64) unsigned int count[BUCKETS];
+    
+    /* clear buckets */
+    for (size_t i=0;i<BUCKETS;i++) count[i] = 0;
+    
+    /* count how many elements fall into each bucket for the current 8 bit digit */
+#if defined(__INTEL_COMPILER)
+#pragma nounroll
+#endif
+    for (size_t i=0;i<num;i++)
+      count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++;
+    
+    /* prefix sums: head[i] is the next unplaced slot of bucket i, tail[i] is one past the end of bucket i */
+    __aligned(64) unsigned int head[BUCKETS];
+    __aligned(64) unsigned int tail[BUCKETS];
+    
+    head[0] = 0;
+    for (size_t i=1; i<BUCKETS; i++)    
+      head[i] = head[i-1] + count[i-1];
+    
+    for (size_t i=0; i<BUCKETS-1; i++)    
+      tail[i] = head[i+1];
+    
+    tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1];
+    
+    assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]);      
+    assert(tail[BUCKETS-1] == num);      
+    
+    /* in-place swap: elements are exchanged between buckets until every bucket holds only its own elements */      
+    for (size_t i=0;i<BUCKETS;i++)
+    {
+      /* process bucket */
+      while(head[i] < tail[i])
+      {
+        T v = morton[head[i]];
+        while(1)
+        {
+          const size_t b = (unsigned(v) >> shift) & (BUCKETS-1);
+          if (b == i) break;
+          std::swap(v,morton[head[b]++]);
+        }
+        assert((unsigned(v) >> shift & (BUCKETS-1)) == i);
+        morton[head[i]++] = v;
+      }
+    }
+    if (shift == 0) return; // lowest digit has been processed, no further refinement
+    
+    size_t offset = 0;
+    for (size_t i=0;i<BUCKETS;i++)
+      if (count[i])
+      {
+        /* debug check of the bucket digit; NOTE(review): the loop bound skips the last element of the bucket */
+        for (size_t j=offset;j<offset+count[i]-1;j++)
+          assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i);
+        /* small buckets are finished with insertion sort, larger ones recurse on the next lower digit */
+        if (unlikely(count[i] < CMP_SORT_THRESHOLD))
+          insertionsort_ascending(morton + offset, count[i]);
+        else
+          radixsort32(morton + offset, count[i], shift-BITS);
+        /* debug check that the bucket is sorted now */
+        for (size_t j=offset;j<offset+count[i]-1;j++)
+          assert(morton[j] <= morton[j+1]);
+        
+        offset += count[i];
+      }      
+  }    
+
+  template<typename Ty, typename Key>
+    class ParallelRadixSort
+  { // radix sort of Ty elements by their Key conversion: 8 bits per pass, lowest byte first, alternating between src and tmp
+    static const size_t MAX_TASKS = 64;         // upper bound on parallel tasks; also the number of count rows allocated
+    static const size_t BITS = 8;               // bits consumed per pass
+    static const size_t BUCKETS = (1 << BITS);
+    typedef unsigned int TyRadixCount[BUCKETS]; // one counter per bucket
+    
+    template<typename T>
+      static bool compare(const T& v0, const T& v1) {
+      return (Key)v0 < (Key)v1;
+    }
+
+  private:
+    ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement
+    ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement
+
+    
+  public:
+    ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N)
+      : radixCount(nullptr), src(src), tmp(tmp), N(N) {}
+
+    void sort(const size_t blockSize)
+    {
+      assert(blockSize > 0);
+      
+      /* perform single threaded sort for small N */
+      if (N<=blockSize) // handles also special case of 0!
+      {	  
+        /* do inplace sort inside destination array */
+        std::sort(src,src+N,compare<Ty>);
+      }
+      
+      /* perform parallel sort for large N */
+      else 
+      {
+        const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS));
+        tbbRadixSort(numThreads);
+      }
+    }
+
+    ~ParallelRadixSort()
+    {
+      alignedFree(radixCount); 
+      radixCount = nullptr;
+    }
+    
+  private:
+    /* first half of a pass: the task counts the bucket sizes of its slice of src; dst is not used here */
+    void tbbRadixIteration0(const Key shift, 
+                            const Ty* __restrict const src, 
+                            Ty* __restrict const dst, 
+                            const size_t threadIndex, const size_t threadCount)
+    {
+      const size_t startID = (threadIndex+0)*N/threadCount;
+      const size_t endID   = (threadIndex+1)*N/threadCount;
+      
+      /* mask to extract some number of bits */
+      const Key mask = BUCKETS-1;
+      
+      /* count how many items go into the buckets */
+      for (size_t i=0; i<BUCKETS; i++)
+        radixCount[threadIndex][i] = 0;
+
+      /* iterate over src array and count buckets */
+      unsigned int * __restrict const count = radixCount[threadIndex];
+#if defined(__INTEL_COMPILER)
+#pragma nounroll      
+#endif
+      for (size_t i=startID; i<endID; i++) {
+#if defined(__X86_64__) || defined(__aarch64__)
+        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
+#else
+        const Key index = ((Key)src[i] >> shift) & mask;
+#endif
+        count[index]++;
+      }
+    }
+    /* second half of a pass: the task scatters its slice of src into dst using the counts of all tasks */
+    void tbbRadixIteration1(const Key shift, 
+                            const Ty* __restrict const src, 
+                            Ty* __restrict const dst, 
+                            const size_t threadIndex, const size_t threadCount)
+    {
+      const size_t startID = (threadIndex+0)*N/threadCount;
+      const size_t endID   = (threadIndex+1)*N/threadCount;
+      
+      /* mask to extract some number of bits */
+      const Key mask = BUCKETS-1;
+      
+      /* calculate total number of items for each bucket */
+      __aligned(64) unsigned int total[BUCKETS];
+      /*
+      for (size_t i=0; i<BUCKETS; i++)
+        total[i] = 0;
+      */
+      for (size_t i=0; i<BUCKETS; i+=VSIZEX)
+        vintx::store(&total[i], zero);
+      
+      for (size_t i=0; i<threadCount; i++)
+      {
+        /*
+        for (size_t j=0; j<BUCKETS; j++)
+          total[j] += radixCount[i][j];
+        */
+        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
+          vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j]));
+      }
+      
+      /* calculate start offset of each bucket */
+      __aligned(64) unsigned int offset[BUCKETS];
+      offset[0] = 0;
+      for (size_t i=1; i<BUCKETS; i++)    
+        offset[i] = offset[i-1] + total[i-1];
+      
+      /* calculate start offset of each bucket for this thread */
+      for (size_t i=0; i<threadIndex; i++)
+      {
+        /*
+        for (size_t j=0; j<BUCKETS; j++)
+          offset[j] += radixCount[i][j];
+        */
+        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
+          vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j]));
+      }
+      
+      /* copy items into their buckets */
+#if defined(__INTEL_COMPILER)
+#pragma nounroll
+#endif
+      for (size_t i=startID; i<endID; i++) {
+        const Ty elt = src[i];
+#if defined(__X86_64__) || defined(__aarch64__)
+        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
+#else
+        const size_t index = ((Key)src[i] >> shift) & mask;
+#endif
+        dst[offset[index]++] = elt;
+      }
+    }
+    /* runs the counting and the scatter half for one 8 bit digit; NOTE(review): the 'last' flag is not used in this body */
+    void tbbRadixIteration(const Key shift, const bool last,
+                           const Ty* __restrict src, Ty* __restrict dst,
+                           const size_t numTasks)
+    {
+      affinity_partitioner ap;
+      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap);
+      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap);
+    }
+    /* allocates the count table and runs one pass per byte of Key; only 4 and 8 byte keys are handled */
+    void tbbRadixSort(const size_t numTasks)
+    {
+      radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64); // NOTE(review): a previous allocation would be overwritten if sort() ran twice on one object
+      
+      if (sizeof(Key) == sizeof(uint32_t)) {
+        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
+        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(3*BITS,1,tmp,src,numTasks);
+      }
+      else if (sizeof(Key) == sizeof(uint64_t))
+      {
+        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
+        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(3*BITS,0,tmp,src,numTasks);
+        tbbRadixIteration(4*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(5*BITS,0,tmp,src,numTasks);
+        tbbRadixIteration(6*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(7*BITS,1,tmp,src,numTasks);
+      }
+    }
+    
+  private:
+    TyRadixCount* radixCount; // count table with one row per task, allocated by tbbRadixSort and freed by the destructor
+    Ty* const src;            // array to sort; the final pass of tbbRadixSort writes into it
+    Ty* const tmp;            // temporary array used as the other side of each pass
+    const size_t N;           // number of elements
+  };
+
+  template<typename Ty>
+    void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
+    ParallelRadixSort<Ty,Ty> sorter(src,tmp,N); // the element type doubles as the sort key
+    sorter.sort(blockSize);
+  }
+  
+  template<typename Ty, typename Key>
+    void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
+    ParallelRadixSort<Ty,Key> sorter(src,tmp,N); // elements are ordered by their conversion to Key
+    sorter.sort(blockSize);
+  }
+  
+  template<typename Ty>
+    void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
+    radix_sort<Ty,uint32_t>(src,tmp,N,blockSize); // forwards with uint32_t as the Key type
+  }
+  
+  template<typename Ty>
+    void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
+    radix_sort<Ty,uint64_t>(src,tmp,N,blockSize); // forwards with uint64_t as the Key type
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/lexers/parsestream.h b/thirdparty/embree-aarch64/common/lexers/parsestream.h
new file mode 100644
index 0000000000..db46dc114f
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/lexers/parsestream.h
@@ -0,0 +1,101 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stringstream.h"
+#include "../sys/filename.h"
+#include "../math/vec2.h"
+#include "../math/vec3.h"
+#include "../math/col3.h"
+#include "../math/color.h"
+
+namespace embree
+{
+  /*! reads tokens from a string stream and converts them on request: every getXXX() consumes the token(s) it returns */
+  class ParseStream : public Stream<std::string>
+  {
+  public:
+    /*! wraps a stream that already delivers tokens */
+    ParseStream (const Ref<Stream<std::string> >& cin) : cin(cin) {}
+
+    /*! wraps a character stream; a StringStream built from seps/endl/multiLine splits it into tokens */
+    ParseStream (const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ",
+                 const std::string& endl = "", bool multiLine = false)
+      : cin(new StringStream(cin,seps,endl,multiLine)) {}
+
+  public:
+    /* Stream interface: both calls are forwarded to the wrapped stream */
+    ParseLocation location() { return cin->loc(); }
+    std::string next() { return cin->get(); }
+
+    /*! consumes one token and raises a runtime error unless it equals next */
+    void force(const std::string& next) {
+      const std::string found = getString();
+      if (found == next) return;
+      THROW_RUNTIME_ERROR("token \""+next+"\" expected but token \""+found+"\" found");
+    }
+
+    std::string getString() { return get(); }
+
+    FileName getFileName() { return FileName(getString()); }
+
+    /*! next token converted with atoi */
+    int getInt() { return atoi(get().c_str()); }
+
+    /*! next token converted with atof and narrowed to float */
+    float getFloat() { return (float)atof(get().c_str()); }
+
+    /* the getters below read one token per component, in the order of the constructor arguments */
+    Vec2i getVec2i() {
+      const int x = getInt();
+      const int y = getInt();
+      return Vec2i(x,y);
+    }
+
+    Vec3ia getVec3ia() {
+      const int x = getInt();
+      const int y = getInt();
+      const int z = getInt();
+      return Vec3ia(x,y,z);
+    }
+
+    Vec2f getVec2f() {
+      const float x = getFloat();
+      const float y = getFloat();
+      return Vec2f(x,y);
+    }
+
+    Vec3f getVec3f() {
+      const float x = getFloat();
+      const float y = getFloat();
+      const float z = getFloat();
+      return Vec3f(x,y,z);
+    }
+
+    Vec3fa getVec3fa() {
+      const float x = getFloat();
+      const float y = getFloat();
+      const float z = getFloat();
+      return Vec3fa(x,y,z);
+    }
+
+    Col3f getCol3f() {
+      const float x = getFloat();
+      const float y = getFloat();
+      const float z = getFloat();
+      return Col3f(x,y,z);
+    }
+
+    Color getColor() {
+      const float r = getFloat();
+      const float g = getFloat();
+      const float b = getFloat();
+      return Color(r,g,b);
+    }
+
+  private:
+    /* the wrapped token stream */
+    Ref<Stream<std::string> > cin;
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/lexers/stream.h b/thirdparty/embree-aarch64/common/lexers/stream.h
new file mode 100644
index 0000000000..3f75677e68
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/lexers/stream.h
@@ -0,0 +1,215 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/ref.h"
+#include "../sys/filename.h"
+#include "../sys/string.h"
+
+#include <vector>
+#include <iostream>
+#include <cstdio>
+#include <string.h>
+
+namespace embree
+{
+  /*! source position (file or stream name, line, column) attached to a stream element */
+  class ParseLocation
+  {
+  public:
+    ParseLocation () : lineNumber(-1), colNumber(-1) {}  // unknown location: no name, negative line and column
+    ParseLocation (std::shared_ptr<std::string> fileName, ssize_t lineNumber, ssize_t colNumber, ssize_t /*charNumber*/)
+      : fileName(fileName), lineNumber(lineNumber), colNumber(colNumber) {}
+    /*! formats as "<name> line <l> character <c>"; the column is only printed together with a line */
+    std::string str() const
+    {
+      std::string text = fileName ? *fileName : std::string("unknown");
+      if (lineNumber < 0) return text;
+      text += " line " + toString(lineNumber);
+      if (colNumber >= 0) text += " character " + toString(colNumber);
+      return text;
+    }
+
+  private:
+    std::shared_ptr<std::string> fileName;  /// name of the file (or stream) the element is from, may be null
+    ssize_t lineNumber;                     /// line number, negative if unknown
+    ssize_t colNumber;                      /// character position within the line, negative if unknown
+  };
+
+  /*! a stream class templated over the stream elements; up to BUF_SIZE elements are kept in a ring buffer so that consumed elements can be ungot */
+  template<typename T> class Stream : public RefCount
+  {
+    enum { BUF_SIZE = 1024 };
+    /* next() and location() are supplied by the concrete stream; the helpers below manage the ring buffer */
+  private:
+    virtual T next() = 0;
+    virtual ParseLocation location() = 0;
+    __forceinline std::pair<T,ParseLocation> nextHelper() {
+      ParseLocation l = location(); // queried before next() is called
+      T v = next();
+      return std::pair<T,ParseLocation>(v,l);
+    }
+    __forceinline void push_back(const std::pair<T,ParseLocation>& v) {
+      if (past+future == BUF_SIZE) pop_front(); // buffer full: give up the oldest consumed element
+      size_t end = (start+past+future++)%BUF_SIZE;
+      buffer[end] = v;
+    }
+    __forceinline void pop_front() {
+      if (past == 0) THROW_RUNTIME_ERROR("stream buffer empty");
+      start = (start+1)%BUF_SIZE; past--;
+    }
+  public:
+    Stream () : start(0), past(0), future(0), buffer(BUF_SIZE) {}
+    virtual ~Stream() {}
+    /* consumer interface: each accessor first fetches one element if none is pending */
+  public:
+    
+    const ParseLocation& loc() { // location of the next pending element
+      if (future == 0) push_back(nextHelper());
+      return buffer[(start+past)%BUF_SIZE].second;
+    }
+    T get() { // returns the next pending element and consumes it
+      if (future == 0) push_back(nextHelper());
+      T t = buffer[(start+past)%BUF_SIZE].first;
+      past++; future--;
+      return t;
+    }
+    const T& peek() { // returns the next pending element without consuming it
+      if (future == 0) push_back(nextHelper());
+      return buffer[(start+past)%BUF_SIZE].first;
+    }
+    const T& unget(size_t n = 1) { // makes the last n consumed elements pending again and returns the new next element
+      if (past < n) THROW_RUNTIME_ERROR ("cannot unget that many items");
+      past -= n; future += n;
+      return peek();
+    }
+    void drop() { // consumes the next pending element without returning it
+      if (future == 0) push_back(nextHelper());
+      past++; future--;
+    }
+  private:
+    size_t start,past,future; // index of the oldest buffered element, number of consumed elements kept, number of pending elements
+    std::vector<std::pair<T,ParseLocation> > buffer;
+  };
+  
+  /*! wraps a std::istream as a character stream and tracks line/column/character position while reading */
+  class StdStream : public Stream<int>
+  {
+  public:
+    StdStream (std::istream& cin, const std::string& name = "std::stream")
+      : cin(cin), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {}
+    ~StdStream() {}
+    ParseLocation location() { return ParseLocation(name,lineNumber,colNumber,charNumber); }
+    /*! reads one character; '\n' starts a new line, '\r' leaves the column untouched */
+    int next() {
+      const int c = cin.get();
+      if (c == '\n') { ++lineNumber; colNumber = 0; }
+      else if (c != '\r') ++colNumber;
+      ++charNumber;
+      return c;
+    }
+  private:
+    std::istream& cin;
+    ssize_t lineNumber;           /// the line number the token is from
+    ssize_t colNumber;            /// the character number in the current line
+    ssize_t charNumber;           /// the character in the file
+    std::shared_ptr<std::string> name;             /// name of buffer
+  };
+
+  /*! character stream over a C FILE handle; the handle is closed when the stream is destroyed */
+  class FileStream : public Stream<int>
+  {
+  public:
+    /*! uses an already opened handle (the destructor will fclose it) */
+    FileStream (FILE* file, const std::string& name = "file")
+      : file(file), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {}
+
+    /*! opens fileName with mode "r"; raises a runtime error if that fails */
+    FileStream (const FileName& fileName)
+      : lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(fileName.str())))
+    {
+      file = fopen(fileName.c_str(),"r");
+      if (!file) THROW_RUNTIME_ERROR("cannot open file " + fileName.str());
+    }
+    ~FileStream() { if (file) fclose(file); }
+
+  public:
+    ParseLocation location() { return ParseLocation(name,lineNumber,colNumber,charNumber); }
+    /*! reads one character with fgetc; '\n' starts a new line, '\r' leaves the column untouched */
+    int next() {
+      const int c = fgetc(file);
+      if (c == '\n') { ++lineNumber; colNumber = 0; }
+      else if (c != '\r') ++colNumber;
+      ++charNumber;
+      return c;
+    }
+
+  private:
+    FILE* file;
+    ssize_t lineNumber;           /// the line number the token is from
+    ssize_t colNumber;            /// the character number in the current line
+    ssize_t charNumber;           /// the character in the file
+    std::shared_ptr<std::string> name;             /// name of buffer
+  };
+
+  /*! character stream over a zero terminated C string; only the pointer is stored, the string is not copied */
+  class StrStream : public Stream<int>
+  {
+  public:
+    /*! \param str zero terminated string to read from */
+    StrStream (const char* str)
+      : str(str), lineNumber(1), colNumber(0), charNumber(0) {}
+
+  public:
+    ParseLocation location() {
+      return ParseLocation(std::shared_ptr<std::string>(),lineNumber,colNumber,charNumber);
+    }
+    /*! returns the next character as a value in 1..255, or EOF once the terminating zero is reached */
+    int next() {
+      /* convert through unsigned char like fgetc does: a signed char of 0xFF would otherwise become -1 and be taken for EOF */
+      int c = (unsigned char)str[charNumber];
+      if (c == 0) return EOF;
+      if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++;
+      charNumber++;
+      return c;
+    }
+  private:
+    const char* str;
+    ssize_t lineNumber;           /// the line number the token is from
+    ssize_t colNumber;            /// the character number in the current line
+    ssize_t charNumber;           /// the character in the file
+  };
+
+  /*! character stream over the arguments argv[1..argc-1]; a single ' ' is emitted after each argument */
+  class CommandLineStream : public Stream<int>
+  {
+  public:
+    CommandLineStream (int argc, char** argv, const std::string& name = "command line")
+      : i(0), j(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name)))
+    {
+      /* start the character count behind the program name (at most 1024 characters are scanned) plus one separator */
+      if (argc > 0) {
+        for (size_t n=0; argv[0][n] && n<1024; n++) charNumber++;
+        charNumber++;
+      }
+      for (ssize_t k=1; k<argc; k++) args.push_back(argv[k]);
+    }
+    ~CommandLineStream() {}
+  public:
+    ParseLocation location() { return ParseLocation(name,0,charNumber,charNumber); }
+    /*! returns the next argument character as a value in 0..255, ' ' at the end of each argument, EOF after the last one */
+    int next() {
+      if (i == args.size()) return EOF;
+      if (j == args[i].size()) { i++; j=0; charNumber++; return ' '; }
+      charNumber++;
+      return (unsigned char)args[i][j++];  // no sign extension: bytes >= 0x80 must stay distinct from EOF
+    }
+  private:
+    size_t i,j;
+    std::vector<std::string> args;
+    ssize_t charNumber;           /// the character in the file
+    std::shared_ptr<std::string> name;             /// name of buffer
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/lexers/streamfilters.h b/thirdparty/embree-aarch64/common/lexers/streamfilters.h
new file mode 100644
index 0000000000..25580a77b8
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/lexers/streamfilters.h
@@ -0,0 +1,39 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stream.h"
+
+namespace embree
+{
+  /* filters a character stream: the line comment marker and everything up to (not including) the end of line is removed */
+  class LineCommentFilter : public Stream<int>
+  {
+  public:
+    LineCommentFilter (const FileName& fileName, const std::string& lineComment)
+      : cin(new FileStream(fileName)), lineComment(lineComment) {}
+    LineCommentFilter (Ref<Stream<int> > cin, const std::string& lineComment)
+      : cin(cin), lineComment(lineComment) {}
+
+    ParseLocation location() { return cin->loc(); }
+
+    /* returns the next character of the wrapped stream after skipping a comment that starts at the current position */
+    int next()
+    {
+      /* consume as much of the comment marker as matches here */
+      size_t matched = 0;
+      while (matched < lineComment.size() && cin->peek() == lineComment[matched]) { cin->get(); matched++; }
+
+      if (matched < lineComment.size())
+        cin->unget(matched);  /* not a comment: put the partial match back */
+      else
+        while (cin->peek() != '\n' && cin->peek() != EOF) cin->get();  /* comment: skip to the end of the line (or file) */
+      return cin->get();
+    }
+
+  private:
+    Ref<Stream<int> > cin;
+    std::string lineComment;
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/lexers/stringstream.cpp b/thirdparty/embree-aarch64/common/lexers/stringstream.cpp
new file mode 100644
index 0000000000..98dc80ad59
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/lexers/stringstream.cpp
@@ -0,0 +1,51 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "stringstream.h"
+
+namespace embree
+{
+  static const std::string stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\"; // characters StringStream accepts inside a token
+  
+  /* fills map so that map[c] is true exactly for the byte values c that occur in chrs */
+  static void createCharMap(bool map[256], const std::string& chrs) {
+    for (bool* entry = map; entry != map+256; ++entry) *entry = false;
+    for (std::string::const_iterator it = chrs.begin(); it != chrs.end(); ++it) map[uint8_t(*it)] = true;
+  }
+
+  /* sets up the tokenizer: builds the lookup tables for the valid token characters and for the separators in seps */
+  StringStream::StringStream(const Ref<Stream<int> >& cin, const std::string& seps, const std::string& endl, bool multiLine)
+    : cin(cin), endl(endl), multiLine(multiLine)
+  {
+    createCharMap(isValidCharMap,stringChars);
+    createCharMap(isSepMap,seps);
+  }
+
+  /* returns the next token: endl for a newline met while skipping (if endl is non-empty), else the characters up to the next separator (empty at end of input) */
+  std::string StringStream::next()
+  {
+    /* skip separators; with multiLine a backslash directly followed by a newline is skipped as well */
+    for (int c = cin->peek(); c != EOF; c = cin->peek()) {
+      if (c == '\n' && endl != "") { cin->drop(); return endl; }
+      if (c == '\\' && multiLine) {
+        cin->drop();
+        if (cin->peek() == '\n') { cin->drop(); continue; }
+        cin->unget();
+      }
+      if (!isSeparator(c)) break;
+      cin->drop();
+    }
+
+    /* collect everything until the next separator */
+    std::string token; token.reserve(64);
+    while (cin->peek() != EOF && !isSeparator(cin->peek())) {
+      const int c = cin->get();
+      // -- GODOT start --
+      // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
+      if (!isValidChar(c)) abort();
+      // -- GODOT end --
+      token += (char)c;
+    }
+    return token;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/lexers/stringstream.h b/thirdparty/embree-aarch64/common/lexers/stringstream.h
new file mode 100644
index 0000000000..e6dbd4aecc
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/lexers/stringstream.h
@@ -0,0 +1,29 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stream.h"
+
+namespace embree
+{
+  /*! simple tokenizer: turns a character stream into a stream of separator delimited strings */
+  class StringStream : public Stream<std::string>
+  {
+  public:
+    StringStream(const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ",
+                 const std::string& endl = "", bool multiLine = false);
+  public:
+    ParseLocation location() { return cin->loc(); }
+    std::string next();
+  private:
+    /* table lookups; values outside 0..255 (such as EOF converted to unsigned) are never a separator or a valid character */
+    __forceinline bool isSeparator(unsigned int c) const { return (c < 256) ? isSepMap[c] : false; }
+    __forceinline bool isValidChar(unsigned int c) const { return (c < 256) ? isValidCharMap[c] : false; }
+    Ref<Stream<int> > cin;     /*! source character stream */
+    bool isSepMap[256];        /*! map for fast classification of separators */
+    bool isValidCharMap[256];  /*! map for valid characters */
+    std::string endl;          /*! the token of the end of line */
+    bool multiLine;            /*! whether to parse lines wrapped with \ */
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/lexers/tokenstream.cpp b/thirdparty/embree-aarch64/common/lexers/tokenstream.cpp
new file mode 100644
index 0000000000..d05be65862
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/lexers/tokenstream.cpp
@@ -0,0 +1,181 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tokenstream.h"
+#include "../math/math.h"
+
+namespace embree
+{
+  /* definitions of the static character sets declared in TokenStream; stringChars is the set accepted inside quoted strings */
+  const std::string TokenStream::alpha = "abcdefghijklmnopqrstuvwxyz";
+  const std::string TokenStream::ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+  const std::string TokenStream::numbers = "0123456789";
+  const std::string TokenStream::separators = "\n\t\r ";
+  const std::string TokenStream::stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\";
+
+  /* fills map so that map[c] is true exactly for the byte values c that occur in chrs */
+  static void createCharMap(bool map[256], const std::string& chrs) {
+    for (bool* entry = map; entry != map+256; ++entry) *entry = false;
+    for (std::string::const_iterator it = chrs.begin(); it != chrs.end(); ++it) map[uint8_t(*it)] = true;
+  }
+
+  /* sets up the tokenizer: stores the symbol list and builds the lookup tables for identifier, separator and string characters */
+  TokenStream::TokenStream(const Ref<Stream<int> >& cin,            //< stream to read from
+                                   const std::string& alpha,                //< valid characters for identifiers
+                                   const std::string& seps,                 //< characters that act as separators
+                                   const std::vector<std::string>& symbols) //< symbols
+    : cin(cin), symbols(symbols)
+  {
+    createCharMap(isStringCharMap,stringChars);
+    createCharMap(isSepMap,seps);
+    createCharMap(isAlphaMap,alpha);
+  }
+
+  /* reads an optionally signed run of decimal digits and appends it to str_o; without a digit nothing is consumed or appended */
+  bool TokenStream::decDigits(std::string& str_o)
+  {
+    std::string text;
+    if (cin->peek() == '+' || cin->peek() == '-') text += (char)cin->get();
+    const size_t signLen = text.size();
+    while (isDigit(cin->peek())) text += (char)cin->get();
+    if (text.size() == signLen) { cin->unget(signLen); return false; } /* no digit: put the sign back */
+    str_o += text; return true;
+  }
+
+  /* reads a non-empty run of decimal digits (no sign) and appends it to str_o; returns false if there is no digit */
+  bool TokenStream::decDigits1(std::string& str_o)
+  {
+    std::string digits;
+    while (isDigit(cin->peek())) digits += (char)cin->get();
+    if (digits.empty()) { cin->unget(0); return false; }
+    str_o += digits; return true;
+  }
+
+  /* consumes symbol if the input starts with it; otherwise puts the matched prefix back and returns false */
+  bool TokenStream::trySymbol(const std::string& symbol)
+  {
+    for (size_t matched = 0; matched < symbol.size(); ++matched) {
+      if (cin->peek() == symbol[matched]) { cin->drop(); continue; }
+      cin->unget(matched); return false;
+    }
+    return true;
+  }
+
+  /* tries the registered symbols in order; the first one that matches becomes a TY_SYMBOL token */
+  bool TokenStream::trySymbols(Token& token, const ParseLocation& loc)
+  {
+    typedef std::vector<std::string>::const_iterator Iter;
+    for (Iter s = symbols.begin(); s != symbols.end(); ++s) {
+      if (trySymbol(*s)) { token = Token(*s,Token::TY_SYMBOL,loc); return true; }
+    }
+    return false;
+  }
+
+  /* tries to read a float token. Accepted forms: nan, +inf, -inf, and decimal numbers that have a
+     fraction and/or an exponent (1.  1.2  1.2E3  1E3  .3  .3E2); the integer part and the exponent
+     may carry a sign, the fraction may not. Plain integers are rejected so that tryInt can take
+     them. Whenever false is returned, every character consumed here has been put back. */
+  bool TokenStream::tryFloat(Token& token, const ParseLocation& loc)
+  {
+    bool ok = false;
+    std::string str;
+    /* special values; they carry the source location like every other token */
+    if (trySymbol("nan"))  { token = Token(float(nan),loc);     return true; }
+    if (trySymbol("+inf")) { token = Token(float(pos_inf),loc); return true; }
+    if (trySymbol("-inf")) { token = Token(float(neg_inf),loc); return true; }
+
+    /* number that starts with (optionally signed) digits */
+    if (decDigits(str))
+    {
+      if (cin->peek() == '.') {
+        str += (char)cin->get();
+        decDigits1(str); // optional, unsigned fraction: a sign here belongs to the next token
+        if (cin->peek() == 'e' || cin->peek() == 'E') {
+          str += (char)cin->get();
+          if (decDigits(str)) ok = true; // 1.[2]E2
+        }
+        else ok = true; // 1.[2]
+      }
+      else if (cin->peek() == 'e' || cin->peek() == 'E') {
+        str += (char)cin->get();
+        if (decDigits(str)) ok = true; // 1E2
+      }
+    }
+    else
+    {
+      /* number that starts with the decimal point: at least one unsigned fraction digit is required */
+      if (cin->peek() == '.') {
+        str += (char)cin->get();
+        if (decDigits1(str)) {
+          if (cin->peek() == 'e' || cin->peek() == 'E') {
+            str += (char)cin->get();
+            if (decDigits(str)) ok = true; // .3E2
+          }
+          else ok = true; // .3
+        }
+      }
+    }
+
+    /* str holds exactly the characters consumed so far, so its length is the amount to put back on failure */
+    if (ok) {
+      token = Token((float)atof(str.c_str()),loc);
+    }
+    else cin->unget(str.size());
+    return ok;
+  }
+
+  /* reads an optionally signed decimal integer and converts it with atoi into an integer token */
+  bool TokenStream::tryInt(Token& token, const ParseLocation& loc)
+  {
+    std::string digits;
+    if (!decDigits(digits)) return false;
+    token = Token(atoi(digits.c_str()),loc);
+    return true;
+  }
+
+  bool TokenStream::tryString(Token& token, const ParseLocation& loc)
+  {
+    std::string str;
+    if (cin->peek() != '\"') return false;
+    cin->drop();
+    while (cin->peek() != '\"') {
+      const int c = cin->get();
+      if (!isStringChar(c)) THROW_RUNTIME_ERROR("invalid string character "+std::string(1,c)+" at "+loc.str());
+      str += (char)c;
+    }
+    cin->drop();
+    token = Token(str,Token::TY_STRING,loc);
+    return true;
+  }
+
+  /* reads an identifier: one alpha character followed by any number of alpha characters or digits */
+  bool TokenStream::tryIdentifier(Token& token, const ParseLocation& loc)
+  {
+    if (!isAlpha(cin->peek())) return false;
+    std::string name(1,(char)cin->get());
+    while (isAlphaNum(cin->peek())) name.push_back((char)cin->get());
+    token = Token(name,Token::TY_IDENTIFIER,loc);
+    return true;
+  }
+
+  /* drops characters until the next one is not a separator; EOF is never dropped */
+  void TokenStream::skipSeparators()
+  {
+    for (int c = cin->peek(); c != EOF && isSeparator(c); c = cin->peek())
+      cin->drop();
+  }
+
+  /* produces the next token; the alternatives are tried in a fixed order (symbol, float, integer,
+     string, identifier), and the first one that succeeds determines the token */
+  Token TokenStream::next()
+  {
+    skipSeparators();
+    const ParseLocation loc = cin->loc();
+    Token token;
+    if (trySymbols(token,loc) || tryFloat(token,loc) || tryInt(token,loc) ||
+        tryString(token,loc) || tryIdentifier(token,loc))
+      return token;
+    if (cin->peek() == EOF) return Token(loc);  /**< return EOF token */
+    return Token((char)cin->get(),loc);         /**< return invalid character token */
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/lexers/tokenstream.h b/thirdparty/embree-aarch64/common/lexers/tokenstream.h
new file mode 100644
index 0000000000..72a7b4f2f3
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/lexers/tokenstream.h
@@ -0,0 +1,164 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stream.h"
+#include <string>
+#include <vector>
+
+namespace embree
+{
+  /*! a lexer token: a type tag, the payload belonging to that type, and the location it was read from */
+  class Token
+  {
+  public:
+    enum Type { TY_EOF, TY_CHAR, TY_INT, TY_FLOAT, TY_IDENTIFIER, TY_STRING, TY_SYMBOL };
+
+    /* one constructor per payload kind; the string form serves identifiers, strings and symbols */
+    Token (        const ParseLocation& loc = ParseLocation()) : ty(TY_EOF  ),       loc(loc) {}
+    Token (char c, const ParseLocation& loc = ParseLocation()) : ty(TY_CHAR ), c(c), loc(loc) {}
+    Token (int i,  const ParseLocation& loc = ParseLocation()) : ty(TY_INT  ), i(i), loc(loc) {}
+    Token (float f,const ParseLocation& loc = ParseLocation()) : ty(TY_FLOAT), f(f), loc(loc) {}
+    Token (std::string str, Type ty, const ParseLocation& loc = ParseLocation()) : ty(ty),   str(str), loc(loc) {}
+
+    /* shorthands for tokens with a default constructed location */
+    static Token Eof()                { return Token(); }
+    static Token Sym(std::string str) { return Token(str,TY_SYMBOL); }
+    static Token Str(std::string str) { return Token(str,TY_STRING); }
+    static Token Id (std::string str) { return Token(str,TY_IDENTIFIER); }
+
+    /* typed accessors: return the payload if the type matches, raise a runtime error otherwise */
+    char Char() const {
+      if (ty != TY_CHAR) THROW_RUNTIME_ERROR(loc.str()+": character expected");
+      return c;
+    }
+
+    int Int() const {
+      if (ty != TY_INT) THROW_RUNTIME_ERROR(loc.str()+": integer expected");
+      return i;
+    }
+
+    /*! an integer token is accepted as well and converted to float, unless cast is false */
+    float Float(bool cast = true)  const {
+      if (ty == TY_INT && cast) return (float)i;
+      if (ty != TY_FLOAT) THROW_RUNTIME_ERROR(loc.str()+": float expected");
+      return f;
+    }
+
+    std::string Identifier() const {
+      if (ty != TY_IDENTIFIER) THROW_RUNTIME_ERROR(loc.str()+": identifier expected");
+      return str;
+    }
+
+    std::string String() const {
+      if (ty != TY_STRING) THROW_RUNTIME_ERROR(loc.str()+": string expected");
+      return str;
+    }
+    std::string Symbol() const {
+      if (ty != TY_SYMBOL) THROW_RUNTIME_ERROR(loc.str()+": symbol expected");
+      return str;
+    }
+
+    const ParseLocation& Location() const { return loc; }
+
+    /* equality looks at type and payload only, the location is ignored; tokens without payload (EOF) are all equal */
+    friend bool operator==(const Token& a, const Token& b)
+    {
+      if (a.ty != b.ty) return false;
+      switch (a.ty) {
+      case TY_CHAR:  return a.c == b.c;
+      case TY_INT:   return a.i == b.i;
+      case TY_FLOAT: return a.f == b.f;
+      case TY_IDENTIFIER: case TY_STRING: case TY_SYMBOL: return a.str == b.str;
+      default: return true;
+      }
+    }
+
+    friend bool operator!=(const Token& a, const Token& b) { return !(a == b); }
+
+    /* ordering: by type first, then by payload */
+    friend bool operator <( const Token& a, const Token& b ) {
+      if (a.ty != b.ty) return (int)a.ty < (int)b.ty;
+      switch (a.ty) {
+      case TY_CHAR:  return a.c < b.c;
+      case TY_INT:   return a.i < b.i;
+      case TY_FLOAT: return a.f < b.f;
+      case TY_IDENTIFIER: case TY_STRING: case TY_SYMBOL: return a.str < b.str;
+      default: return false;
+      }
+    }
+
+    friend std::ostream& operator<<(std::ostream& cout, const Token& t)
+    {
+      switch (t.ty) {
+      case TY_EOF:        return cout << "eof";
+      case TY_CHAR:       return cout << "Char(" << t.c << ")";
+      case TY_INT:        return cout << "Int(" << t.i << ")";
+      case TY_FLOAT:      return cout << "Float(" << t.f << ")";
+      case TY_IDENTIFIER: return cout << "Id(" << t.str << ")";
+      case TY_STRING:     return cout << "String(" << t.str << ")";
+      case TY_SYMBOL:     return cout << "Symbol(" << t.str << ")";
+      default:            return cout << "unknown";
+      }
+    }
+
+  private:
+    Type ty;            //< the type of the token
+    union { char c; int i; float f; };  //< data for char, int and float tokens
+    std::string str;    //< data for string and identifier tokens
+    ParseLocation loc;  //< the location the token is from
+  };
+
+  /*! tokenizer over a character stream, configured with the identifier characters, the separators and a symbol list */
+  class TokenStream : public Stream<Token>
+  {
+  public:
+
+    /*! predefined character sets */
+    static const std::string alpha;
+    static const std::string ALPHA;
+    static const std::string numbers;
+    static const std::string separators;
+    static const std::string stringChars;
+
+  public:
+    TokenStream(const Ref<Stream<int> >& cin,
+                const std::string& alpha, //< valid characters for identifiers
+                const std::string& seps,  //< characters that act as separators
+                const std::vector<std::string>& symbols = std::vector<std::string>()); //< symbols
+  public:
+    ParseLocation location() { return cin->loc(); }
+    Token next();
+    bool trySymbol(const std::string& symbol);
+
+  private:
+    void skipSeparators();
+    bool decDigits(std::string& str);
+    bool decDigits1(std::string& str);
+    bool trySymbols(Token& token, const ParseLocation& loc);
+    bool tryFloat(Token& token, const ParseLocation& loc);
+    bool tryInt(Token& token, const ParseLocation& loc);
+    bool tryString(Token& token, const ParseLocation& loc);
+    bool tryIdentifier(Token& token, const ParseLocation& loc);
+
+    Ref<Stream<int> > cin;
+    bool isSepMap[256];
+    bool isAlphaMap[256];
+    bool isStringCharMap[256];
+    std::vector<std::string> symbols;
+
+    /*! true if c is one of the separator characters (never for values above 255) */
+    __forceinline bool isSeparator(unsigned int c) const { return (c < 256) ? isSepMap[c] : false; }
+
+    /*! true if c is one of '0'..'9' */
+    __forceinline bool isDigit(unsigned int c) const { return (c - '0') < 10u; }
+
+    /*! true if c may appear inside a quoted string (never for values above 255) */
+    __forceinline bool isStringChar(unsigned int c) const { return (c < 256) ? isStringCharMap[c] : false; }
+
+    /*! true if c may start an identifier; isAlphaNum additionally accepts digits */
+    __forceinline bool isAlpha(unsigned int c) const { return (c < 256) ? isAlphaMap[c] : false; }
+    __forceinline bool isAlphaNum(unsigned int c) const { return isDigit(c) || isAlpha(c); }
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/math/AVX2NEON.h b/thirdparty/embree-aarch64/common/math/AVX2NEON.h
new file mode 100644
index 0000000000..e8698ac56d
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/AVX2NEON.h
@@ -0,0 +1,986 @@
+#pragma once
+
+#include "SSE2NEON.h"
+
+/* linkage and inlining attributes put in front of every emulated intrinsic in this header */
+#define AVX2NEON_ABI static inline  __attribute__((always_inline))
+
+
+struct __m256d;
+/* 256 bit float vector represented as two __m128 halves */
+struct __m256 {
+    __m128 lo,hi;
+    __m256() {} // leaves both halves uninitialized
+};
+
+
+
+/* 256 bit integer vector represented as two __m128i halves; converts from/to __m256 half by half with a cast */
+struct __m256i {
+    __m128i lo,hi;
+    explicit __m256i(const __m256 a) : lo(__m128i(a.lo)),hi(__m128i(a.hi)) {}
+    operator __m256() const {__m256 res; res.lo = __m128(lo);res.hi = __m128(hi); return res;}
+    __m256i() {} // leaves both halves uninitialized
+};
+ 
+
+
+/* 256 bit double vector represented as two float64x2_t halves; implicitly constructible from __m256 and __m256i */
+struct __m256d {
+    float64x2_t lo,hi;
+    __m256d() {} // leaves both halves uninitialized
+    __m256d(const __m256& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {}
+    __m256d(const __m256i& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {}
+};
+/* defines func(a) on a 256 bit type by applying the 128 bit basic_func to the lo and hi halves */
+#define UNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a) {type res;res.lo=basic_func(a.lo);res.hi=basic_func(a.hi);return res;}
+
+/* same for two operands; the _CAST form converts the inputs with bsrc and the result with bdst around basic_func */
+#define BINARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=basic_func(a.lo,b.lo);res.hi=basic_func(a.hi,b.hi);return res;}
+#define BINARY_AVX_OP_CAST(type,func,basic_func,bdst,bsrc) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=bdst(basic_func(bsrc(a.lo),bsrc(b.lo)));res.hi=bdst(basic_func(bsrc(a.hi),bsrc(b.hi)));return res;}
+/* same for three operands */
+#define TERNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b,const type& c) {type res;res.lo=basic_func(a.lo,b.lo,c.lo);res.hi=basic_func(a.hi,b.hi,c.hi);return res;}
+
+/* defines a conversion `name` from type `from` to type `to` that casts each half with basic_dst(...) */
+#define CAST_SIMD_TYPE(to,name,from,basic_dst) AVX2NEON_ABI to name(const from& a) { to res; res.lo = basic_dst(a.lo); res.hi=basic_dst(a.hi); return res;}
+
+
+/* the streaming load intrinsics are mapped to the ordinary loads */
+#define _mm_stream_load_si128 _mm_load_si128
+#define _mm256_stream_load_si256 _mm256_load_si256
+
+
+/* blends two float vectors: bit i of imm8 selects the source of lane i
+   (bit set: lane taken from b, bit clear: lane taken from a); only bits 0..3 are looked at */
+AVX2NEON_ABI
+__m128 _mm_blend_ps (__m128 a, __m128 b, const int imm8)
+{
+    __m128 res;
+    for (int lane=0; lane<4; lane++)
+    {
+        if (imm8 & (1<<lane))
+        {
+            res[lane] = b[lane];
+            continue;
+        }
+        res[lane] = a[lane];
+    }
+    return res;
+}
+
+/* integer blend: lane i is taken from b if bit i of imm8 is set, from a otherwise (bits 0..3 only) */
+AVX2NEON_ABI
+__m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8)
+{
+    __m128i res;
+    for (int lane=0; lane<4; lane++)
+    {
+        if (imm8 & (1<<lane))
+        {
+            res[lane] = b[lane];
+            continue;
+        }
+        res[lane] = a[lane];
+    }
+    return res;
+}
+
+AVX2NEON_ABI __m128 _mm_cmpngt_ps (__m128 a, __m128 b)  // not-greater-than: bitwise NOT of the _mm_cmpgt_ps mask
+{
+    const __m128i greater = __m128i(_mm_cmpgt_ps(a,b));
+    return __m128(vmvnq_s32(greater));
+}
+
+
+/* reads one 64 bit integer from mem_addr into element 0 of the result; element 1 is set to zero */
+AVX2NEON_ABI __m128i _mm_loadl_epi64 (__m128i const* mem_addr)
+{
+    int64x2_t y;
+    y[1] = 0;
+    y[0] = *(int64_t *)mem_addr;
+    return __m128i(y);
+}
+
+AVX2NEON_ABI int _mm_movemask_popcnt(__m128 a)  // number of bits set in the _mm_movemask_ps result
+{
+    const int mask = _mm_movemask_ps(a);
+    return __builtin_popcount(mask);
+}
+
+/* masked load: lane i is read from mem_addr[i] only if the top bit of mask[i] is set, otherwise it is 0 */
+AVX2NEON_ABI __m128 _mm_maskload_ps (float const * mem_addr, __m128i mask)
+{
+    __m128 res;
+    for (int lane=0; lane<4; lane++) {
+        res[lane] = (mask[lane] & 0x80000000) ? mem_addr[lane] : 0;
+    }
+    return res;
+}
+
+/* masked store: a[i] is written to mem_addr[i] only where the top bit of mask[i] is set */
+AVX2NEON_ABI void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a)
+{
+    for (int lane=0; lane<4; lane++) {
+        if ((mask[lane] & 0x80000000) != 0) mem_addr[lane] = a[lane];
+    }
+}
+
+/* masked store: a[i] is written to mem_addr[i] only where the top bit of mask[i] is set */
+AVX2NEON_ABI void _mm_maskstore_epi32 (int * mem_addr, __m128i mask, __m128i a)
+{
+    for (int lane=0; lane<4; lane++) {
+        if ((mask[lane] & 0x80000000) != 0) mem_addr[lane] = a[lane];
+    }
+}
+
+AVX2NEON_ABI __m128 _mm_fnmsub_ps (__m128 a, __m128 b, __m128 c)  // -(a*b) - c, computed as -(c + a*b)
+{
+    const __m128 sum = vfmaq_f32(c,a,b);
+    return vnegq_f32(sum);
+}
+
+#define _mm_fnmsub_ss _mm_fnmsub_ps // scalar form aliased to the packed form: all four lanes are computed
+
+AVX2NEON_ABI __m128 _mm_fnmadd_ps (__m128 a, __m128 b, __m128 c)  // -(a*b) + c
+{
+    const __m128 res = vfmsq_f32(c,a,b);  // c - a*b
+    return res;
+}
+
+#define _mm_fnmadd_ss _mm_fnmadd_ps // scalar form aliased to the packed form: all four lanes are computed
+
+
+AVX2NEON_ABI __m128 _mm_broadcast_ss (float const * mem_addr)  // replicates *mem_addr into all four lanes
+{
+    const float value = *mem_addr;
+    return vdupq_n_f32(value);
+}
+
+
+AVX2NEON_ABI __m128 _mm_fmsub_ps (__m128 a, __m128 b, __m128 c)  // a*b - c, computed as (-c) + a*b
+{
+    const __m128 minusC = vnegq_f32(c);
+    return vfmaq_f32(minusC,a,b);
+}
+
+#define _mm_fmsub_ss _mm_fmsub_ps // scalar form aliased to the packed form: all four lanes are computed
+#define _mm_fmadd_ps _mm_madd_ps  // _mm_madd_ps is not defined in this part of the file (presumably provided by SSE2NEON.h)
+#define _mm_fmadd_ss _mm_madd_ps  // scalar form aliased to the same packed function
+
+
+
+/* generic conditional dot product behind _mm_dp_ps: bits 4..7 of code select which lane products
+   a[i]*b[i] are added up (in lane order), bits 0..3 select which result lanes receive the sum;
+   all other result lanes are 0 */
+template<int code>
+AVX2NEON_ABI float32x4_t dpps_neon(const float32x4_t& a,const float32x4_t& b)
+{
+    float sum = 0;
+    for (int lane=0; lane<4; lane++)
+        sum += (code & (0x10 << lane)) ? a[lane]*b[lane] : 0;
+
+    float32x4_t res;
+    for (int lane=0; lane<4; lane++)
+        res[lane] = (code & (0x1 << lane)) ? sum : 0;
+
+    return res;
+}
+
+/* specialization for code 0x7f: dot product of lanes 0..2 (lane 3 is zeroed before the sum), written to all lanes */
+template<>
+inline float32x4_t dpps_neon<0x7f>(const float32x4_t& a,const float32x4_t& b)
+{
+    float32x4_t prod = _mm_mul_ps(a,b);
+    prod[3] = 0;
+    const float dot = vaddvq_f32(prod);
+    return _mm_set1_ps(dot);
+}
+
+/* specialization for code 0xff: dot product over all four lanes, written to all lanes */
+template<>
+inline float32x4_t dpps_neon<0xff>(const float32x4_t& a,const float32x4_t& b)
+{
+    const float32x4_t prod = _mm_mul_ps(a,b);
+    const float dot = vaddvq_f32(prod);
+    return _mm_set1_ps(dot);
+}
+
+#define _mm_dp_ps(a,b,c) dpps_neon<c>((a),(b))
+
+
+
+AVX2NEON_ABI __m128 _mm_cmpnge_ps (__m128 a, __m128 b) // bitwise complement of the _mm_cmpge_ps lane mask
+{
+    const __m128i ge_mask = __m128i(_mm_cmpge_ps(a,b));
+    return __m128(vmvnq_s32(ge_mask));
+}
+
+
+// Shuffles the lanes of `a`: result lane i is a[k], where k is the low two bits of b[i].
+// Only bits 1:0 of each control element are used, so every control value stays in range.
+AVX2NEON_ABI __m128 _mm_permutevar_ps (__m128 a, __m128i b)
+{
+    __m128 x;
+    for (int i=0;i<4;i++) {
+        x[i] = a[b[i] & 3]; // mask the control value, not the loop index
+    }
+    return x;
+}
+
+AVX2NEON_ABI __m256i _mm256_setzero_si256() // 256-bit integer vector with both 128-bit halves zeroed
+{
+    __m256i zeroed;
+    zeroed.hi = vdupq_n_s32(0);
+    zeroed.lo = zeroed.hi;
+    return zeroed;
+}
+
+AVX2NEON_ABI __m256 _mm256_setzero_ps() // 256-bit float vector with all eight lanes 0.0f
+{
+    __m256 zeroed;
+    zeroed.hi = vdupq_n_f32(0.0f);
+    zeroed.lo = zeroed.hi;
+    return zeroed;
+}
+
+AVX2NEON_ABI
+__m256i _mm256_undefined_si256() // "undefined" contents are implemented as all zeros here
+{
+    return _mm256_setzero_si256();
+}
+
+AVX2NEON_ABI
+__m256 _mm256_undefined_ps() // "undefined" contents are implemented as all zeros here
+{
+    return _mm256_setzero_ps();
+}
+
+CAST_SIMD_TYPE(__m256d,_mm256_castps_pd,__m256,float64x2_t) // NOTE(review): CAST_SIMD_TYPE is defined outside this chunk; arguments look like (result type, function name, source type, per-half cast type) -- confirm
+CAST_SIMD_TYPE(__m256i,_mm256_castps_si256,__m256,__m128i)
+CAST_SIMD_TYPE(__m256, _mm256_castsi256_ps, __m256i,__m128)
+CAST_SIMD_TYPE(__m256, _mm256_castpd_ps ,__m256d,__m128)
+CAST_SIMD_TYPE(__m256d, _mm256_castsi256_pd, __m256i,float64x2_t)
+CAST_SIMD_TYPE(__m256i, _mm256_castpd_si256, __m256d,__m128i)
+
+
+
+
+AVX2NEON_ABI
+__m128 _mm256_castps256_ps128 (__m256 a) // returns the low 128-bit half of a
+{
+    return a.lo;
+}
+
+// Widens a 128-bit integer vector to 256 bits; the upper half is zero-filled here.
+AVX2NEON_ABI __m256i _mm256_castsi128_si256 (__m128i a)
+{
+    __m256i widened;
+    widened.hi = vdupq_n_s32(0);
+    widened.lo = a;
+    return widened;
+}
+
+AVX2NEON_ABI
+__m128i _mm256_castsi256_si128 (__m256i a) // returns the low 128-bit half of a
+{
+    return a.lo;
+}
+
+// Widens a 128-bit float vector to 256 bits; the upper half is zero-filled here.
+AVX2NEON_ABI __m256 _mm256_castps128_ps256 (__m128 a)
+{
+    __m256 widened;
+    widened.hi = vdupq_n_f32(0);
+    widened.lo = a;
+    return widened;
+}
+
+
+AVX2NEON_ABI __m256 _mm256_broadcast_ss (float const * mem_addr) // replicate *mem_addr into all eight lanes
+{
+    __m256 filled;
+    filled.hi = vdupq_n_f32(*mem_addr);
+    filled.lo = filled.hi;
+    return filled;
+}
+
+
+
+AVX2NEON_ABI __m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) // e0 = lowest lane, e7 = highest
+{
+    const __m128i low_half = {e0,e1,e2,e3};
+    const __m128i high_half = {e4,e5,e6,e7};
+    __m256i packed;
+    packed.lo = low_half;
+    packed.hi = high_half;
+    return packed;
+}
+
+AVX2NEON_ABI __m256i _mm256_set1_epi32 (int a) // broadcast a to all eight 32-bit lanes
+{
+    __m256i filled;
+    filled.hi = vdupq_n_s32(a);
+    filled.lo = filled.hi;
+    return filled;
+}
+
+
+
+
+AVX2NEON_ABI int _mm256_movemask_ps(const __m256& v) // bits 0-3 come from v.lo, bits 4-7 from v.hi
+{
+    const int high_bits = _mm_movemask_ps(v.hi);
+    return (high_bits << 4) | _mm_movemask_ps(v.lo);
+}
+
+// Applies the same compile-time shuffle pattern imm8 to each 128-bit half of `a`.
+template<int imm8>
+AVX2NEON_ABI
+__m256 __mm256_permute_ps (const __m256& a)
+{
+    __m256 shuffled;
+    shuffled.hi = _mm_shuffle_ps(a.hi,a.hi,imm8);
+    shuffled.lo = _mm_shuffle_ps(a.lo,a.lo,imm8);
+    return shuffled;
+}
+
+#define _mm256_permute_ps(a,c) __mm256_permute_ps<c>(a)
+
+
+// Applies the compile-time shuffle pattern imm8 to (a.lo, b.lo) and to (a.hi, b.hi).
+template<int imm8>
+AVX2NEON_ABI
+__m256 __mm256_shuffle_ps (const __m256 a,const __m256& b)
+{
+    __m256 shuffled;
+    shuffled.hi = _mm_shuffle_ps(a.hi,b.hi,imm8);
+    shuffled.lo = _mm_shuffle_ps(a.lo,b.lo,imm8);
+    return shuffled;
+}
+
+#define _mm256_shuffle_ps(a,b,c) __mm256_shuffle_ps<c>(a,b)
+
+AVX2NEON_ABI __m256i _mm256_set1_epi64x (long long a) // broadcast a to all four 64-bit lanes
+{
+    const int64x2_t pair = vdupq_n_s64(a);
+    __m256i filled;
+    filled.hi = __m128i(pair);
+    filled.lo = filled.hi;
+    return filled;
+}
+
+
+// Builds a 256-bit vector from two independently selected 128-bit halves of a and b.
+// For each 4-bit control field of imm8 (bits 3:0 -> result.lo, bits 7:4 -> result.hi):
+//   bits 1:0 select the source half (0 = a.lo, 1 = a.hi, 2 = b.lo, 3 = b.hi),
+//   bit 3 forces the half to zero, and bit 2 is ignored.
+AVX2NEON_ABI __m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8)
+{
+    __m256 res;
+    __m128 tmp;
+    // Mask with 0x3, not 0x7: a set (ignored) bit 2 must not skip every case and leave tmp uninitialized.
+    switch (imm8 & 0x3)
+    {
+        case 0: tmp = a.lo; break;
+        case 1: tmp = a.hi; break;
+        case 2: tmp = b.lo; break;
+        default: tmp = b.hi; break;
+    }
+    if (imm8 & 0x8)
+        tmp = _mm_setzero_ps();
+    res.lo = tmp;
+
+    imm8 >>= 4; // move the high-half control field into bits 3:0
+    switch (imm8 & 0x3)
+    {
+        case 0: tmp = a.lo; break;
+        case 1: tmp = a.hi; break;
+        case 2: tmp = b.lo; break;
+        default: tmp = b.hi; break;
+    }
+    if (imm8 & 0x8)
+        tmp = _mm_setzero_ps();
+    res.hi = tmp;
+
+    return res;
+}
+
+// Duplicates the even-indexed floats of each 128-bit half:
+// every (even, odd) lane pair of the result holds (even, even).
+AVX2NEON_ABI __m256 _mm256_moveldup_ps (__m256 a)
+{
+    __m256 dup;
+    for (int even = 0; even < 4; even += 2) {
+        dup.lo[even] = dup.lo[even + 1] = a.lo[even];
+        dup.hi[even] = dup.hi[even + 1] = a.hi[even];
+    }
+    return dup;
+}
+
+// Duplicates the odd-indexed floats: every (even, odd) lane pair of the result holds (odd, odd).
+AVX2NEON_ABI __m256 _mm256_movehdup_ps (__m256 a)
+{
+    __m256 dup;
+    for (int even = 0; even < 4; even += 2) {
+        dup.lo[even] = dup.lo[even + 1] = a.lo[even + 1];
+        dup.hi[even] = dup.hi[even + 1] = a.hi[even + 1];
+    }
+    return dup;
+}
+
+// Returns a copy of `a` with one 128-bit half replaced by `b`: the high half if bit 0 of imm8 is set, else the low half.
+AVX2NEON_ABI __m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8)
+{
+    __m256 patched = a;
+    if ((imm8 & 1) == 0) patched.lo = b;
+    else patched.hi = b;
+    return patched;
+}
+
+
+// Returns the high 128-bit half of `a` when bit 0 of imm8 is set, otherwise the low half.
+AVX2NEON_ABI __m128 _mm256_extractf128_ps (__m256 a, const int imm8)
+{
+    const bool want_high = (imm8 & 1) != 0;
+    return want_high ? a.hi : a.lo;
+}
+
+
+// Duplicates the even-indexed doubles: element 0 of each 128-bit half is copied into both elements of that half.
+AVX2NEON_ABI __m256d _mm256_movedup_pd (__m256d a)
+{
+    __m256d res;
+    res.lo[0] = res.lo[1] = a.lo[0];
+    res.hi[0] = res.hi[1] = a.hi[0]; // the high half must be duplicated too, not copied through unchanged
+    return res;
+}
+
+// Per-lane absolute value of eight signed 32-bit integers.
+AVX2NEON_ABI __m256i _mm256_abs_epi32(__m256i a)
+{
+    __m256i magnitude;
+    magnitude.hi = vabsq_s32(a.hi);
+    magnitude.lo = vabsq_s32(a.lo);
+    return magnitude;
+}
+
+UNARY_AVX_OP(__m256,_mm256_sqrt_ps,_mm_sqrt_ps) // NOTE(review): UNARY_AVX_OP is defined outside this chunk; presumably it applies the named 128-bit op to .lo and .hi -- confirm
+UNARY_AVX_OP(__m256,_mm256_rsqrt_ps,_mm_rsqrt_ps)
+UNARY_AVX_OP(__m256,_mm256_rcp_ps,_mm_rcp_ps)
+UNARY_AVX_OP(__m256,_mm256_floor_ps,vrndmq_f32) // vrndmq_f32: NEON round toward minus infinity
+UNARY_AVX_OP(__m256,_mm256_ceil_ps,vrndpq_f32) // vrndpq_f32: NEON round toward plus infinity
+
+
+BINARY_AVX_OP(__m256i,_mm256_add_epi32,_mm_add_epi32) // NOTE(review): BINARY_AVX_OP is defined outside this chunk; presumably (type, 256-bit name, 128-bit op applied per half) -- confirm
+BINARY_AVX_OP(__m256i,_mm256_sub_epi32,_mm_sub_epi32)
+BINARY_AVX_OP(__m256i,_mm256_mullo_epi32,_mm_mullo_epi32)
+
+BINARY_AVX_OP(__m256i,_mm256_min_epi32,_mm_min_epi32)
+BINARY_AVX_OP(__m256i,_mm256_max_epi32,_mm_max_epi32)
+BINARY_AVX_OP_CAST(__m256i,_mm256_min_epu32,vminq_u32,__m128i,uint32x4_t) // NOTE(review): the _CAST variant presumably casts each half to the last type before calling the NEON op -- confirm
+BINARY_AVX_OP_CAST(__m256i,_mm256_max_epu32,vmaxq_u32,__m128i,uint32x4_t)
+
+BINARY_AVX_OP(__m256,_mm256_min_ps,_mm_min_ps)
+BINARY_AVX_OP(__m256,_mm256_max_ps,_mm_max_ps)
+
+BINARY_AVX_OP(__m256,_mm256_add_ps,_mm_add_ps)
+BINARY_AVX_OP(__m256,_mm256_mul_ps,_mm_mul_ps)
+BINARY_AVX_OP(__m256,_mm256_sub_ps,_mm_sub_ps)
+BINARY_AVX_OP(__m256,_mm256_div_ps,_mm_div_ps)
+
+BINARY_AVX_OP(__m256,_mm256_and_ps,_mm_and_ps)
+BINARY_AVX_OP(__m256,_mm256_andnot_ps,_mm_andnot_ps)
+BINARY_AVX_OP(__m256,_mm256_or_ps,_mm_or_ps)
+BINARY_AVX_OP(__m256,_mm256_xor_ps,_mm_xor_ps)
+
+BINARY_AVX_OP_CAST(__m256d,_mm256_and_pd,vandq_s64,float64x2_t,int64x2_t) // double-precision bitwise ops are routed through 64-bit integer NEON ops
+BINARY_AVX_OP_CAST(__m256d,_mm256_or_pd,vorrq_s64,float64x2_t,int64x2_t)
+BINARY_AVX_OP_CAST(__m256d,_mm256_xor_pd,veorq_s64,float64x2_t,int64x2_t)
+
+
+
+BINARY_AVX_OP(__m256i,_mm256_and_si256,_mm_and_si128)
+BINARY_AVX_OP(__m256i,_mm256_or_si256,_mm_or_si128)
+BINARY_AVX_OP(__m256i,_mm256_xor_si256,_mm_xor_si128)
+
+
+BINARY_AVX_OP(__m256,_mm256_unpackhi_ps,_mm_unpackhi_ps) // NOTE(review): BINARY_AVX_OP/TERNARY_AVX_OP are defined outside this chunk; presumably they apply the 128-bit op to each half -- confirm
+BINARY_AVX_OP(__m256,_mm256_unpacklo_ps,_mm_unpacklo_ps)
+TERNARY_AVX_OP(__m256,_mm256_blendv_ps,_mm_blendv_ps)
+
+
+TERNARY_AVX_OP(__m256,_mm256_fmadd_ps,_mm_fmadd_ps) // fused multiply-add family, built on the 128-bit wrappers of the same names
+TERNARY_AVX_OP(__m256,_mm256_fnmadd_ps,_mm_fnmadd_ps)
+TERNARY_AVX_OP(__m256,_mm256_fmsub_ps,_mm_fmsub_ps)
+TERNARY_AVX_OP(__m256,_mm256_fnmsub_ps,_mm_fnmsub_ps)
+
+
+BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi32,_mm_unpackhi_epi32)
+BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi32,_mm_unpacklo_epi32)
+
+
+BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi32,_mm_cmpeq_epi32) // comparison family: integer compares first, then float compares
+BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi32,_mm_cmpgt_epi32)
+BINARY_AVX_OP(__m256,_mm256_cmpeq_ps,_mm_cmpeq_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpneq_ps,_mm_cmpneq_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpnlt_ps,_mm_cmpnlt_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpngt_ps,_mm_cmpngt_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpge_ps,_mm_cmpge_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpnge_ps,_mm_cmpnge_ps)
+BINARY_AVX_OP(__m256,_mm256_cmplt_ps,_mm_cmplt_ps)
+BINARY_AVX_OP(__m256,_mm256_cmple_ps,_mm_cmple_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpgt_ps,_mm_cmpgt_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpnle_ps,_mm_cmpnle_ps)
+
+
+// Converts eight floats to 32-bit integers by applying _mm_cvtps_epi32 to each 128-bit half.
+AVX2NEON_ABI
+__m256i _mm256_cvtps_epi32 (__m256 a)
+{
+    __m256i converted;
+    converted.hi = _mm_cvtps_epi32(a.hi);
+    converted.lo = _mm_cvtps_epi32(a.lo);
+    return converted;
+}
+
+// Converts eight floats to 32-bit integers by applying _mm_cvttps_epi32 to each 128-bit half.
+AVX2NEON_ABI
+__m256i _mm256_cvttps_epi32 (__m256 a)
+{
+    __m256i converted;
+    converted.hi = _mm_cvttps_epi32(a.hi);
+    converted.lo = _mm_cvttps_epi32(a.lo);
+    return converted;
+}
+
+AVX2NEON_ABI
+__m256 _mm256_loadu_ps (float const * mem_addr) // loads eight consecutive floats starting at mem_addr
+{
+    __m256 res;
+    res.lo = *(__m128 *)(mem_addr + 0); // floats 0..3
+    res.hi = *(__m128 *)(mem_addr + 4); // floats 4..7
+    return res;
+}
+#define _mm256_load_ps _mm256_loadu_ps // the aligned load is an alias of the unaligned one
+
+
+// Returns 1 when no lane of (a AND b) has its sign bit set, 0 otherwise.
+AVX2NEON_ABI int _mm256_testz_ps (const __m256& a, const __m256& b)
+{
+    // When both arguments are the same object, a AND a == a, so the AND is skipped.
+    __m256 masked = a;
+    if (&a != &b) masked = _mm256_and_ps(a,b);
+    // Arithmetic shift turns each lane into 0 or -1 depending on its sign bit.
+    const __m128i lo_signs = vshrq_n_s32(__m128i(masked.lo),31);
+    const __m128i hi_signs = vshrq_n_s32(__m128i(masked.hi),31);
+    return vaddvq_s32(vaddq_s32(lo_signs,hi_signs)) == 0;
+}
+
+
+// Builds a 256-bit integer vector from four 64-bit values; e0 is the lowest lane and e3 the highest.
+AVX2NEON_ABI __m256i _mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0)
+{
+    const int64x2_t low_pair = {e0,e1};
+    const int64x2_t high_pair = {e2,e3};
+    __m256i packed;
+    packed.lo = __m128i(low_pair);
+    packed.hi = __m128i(high_pair);
+    return packed;
+}
+
+AVX2NEON_ABI __m256d _mm256_setzero_pd () // 256-bit double vector with all four elements 0.0
+{
+    __m256d zeroed;
+    zeroed.hi = vdupq_n_f64(0);
+    zeroed.lo = zeroed.hi;
+    return zeroed;
+}
+
+// Returns a 4-bit mask made of the sign bits of the four doubles in `a`
+// (bit 0 = a.lo[0], bit 1 = a.lo[1], bit 2 = a.hi[0], bit 3 = a.hi[1]).
+AVX2NEON_ABI int _mm256_movemask_pd (__m256d a)
+{
+    int res = 0;
+    uint64x2_t x = uint64x2_t(a.lo);
+    res |= (x[0] >> 63) ? 1 : 0;
+    res |= (x[1] >> 63) ? 2 : 0; // odd element: index 1, not 0
+    x = uint64x2_t(a.hi);
+    res |= (x[0] >> 63) ? 4 : 0;
+    res |= (x[1] >> 63) ? 8 : 0; // odd element: index 1, not 0
+    return res;
+}
+
+// 64-bit lane equality compare of a and b, done per 128-bit half with vceqq_s64.
+AVX2NEON_ABI __m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b)
+{
+    __m256i eq;
+    eq.hi = __m128i(vceqq_s64(int64x2_t(a.hi),int64x2_t(b.hi)));
+    eq.lo = __m128i(vceqq_s64(int64x2_t(a.lo),int64x2_t(b.lo)));
+    return eq;
+}
+
+// Double-precision equality compare of a and b, done per 128-bit half with vceqq_f64; the mask is returned as __m256i.
+AVX2NEON_ABI __m256i _mm256_cmpeq_pd (__m256d a, __m256d b)
+{
+    __m256i eq;
+    eq.hi = __m128i(vceqq_f64(a.hi,b.hi));
+    eq.lo = __m128i(vceqq_f64(a.lo,b.lo));
+    return eq;
+}
+
+
+// Returns 1 when _mm256_movemask_pd of (a AND b) is zero, 0 otherwise.
+AVX2NEON_ABI int _mm256_testz_pd (const __m256d& a, const __m256d& b)
+{
+    // When both arguments are the same object the AND is skipped, since a AND a == a.
+    const bool same_object = (&a == &b);
+    __m256d masked = a;
+    if (!same_object)
+        masked = _mm256_and_pd(a,b);
+    return _mm256_movemask_pd(masked) == 0;
+}
+
+// Per-element select: picks b where the mask element's sign bit is set, a otherwise.
+AVX2NEON_ABI __m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask)
+{
+    const uint64x2_t lo_bits = uint64x2_t(mask.lo);
+    const uint64x2_t hi_bits = uint64x2_t(mask.hi);
+    __m256d blended;
+    for (int i = 0; i < 2; ++i) {
+        blended.lo[i] = (lo_bits[i] >> 63) ? b.lo[i] : a.lo[i];
+        blended.hi[i] = (hi_bits[i] >> 63) ? b.hi[i] : a.hi[i];
+    }
+    return blended;
+}
+
+// Applies the dot-product pattern imm8 independently to each 128-bit half.
+template<int imm8>
+__m256 __mm256_dp_ps (__m256 a, __m256 b)
+{
+    __m256 dots;
+    dots.hi = _mm_dp_ps(a.hi,b.hi,imm8);
+    dots.lo = _mm_dp_ps(a.lo,b.lo,imm8);
+    return dots;
+}
+#define _mm256_dp_ps(a,b,c) __mm256_dp_ps<c>(a,b)
+
+// Returns one of the four doubles of `a`, chosen by the low two bits of imm8
+// (0 = a.lo[0], 1 = a.lo[1], 2 = a.hi[0], 3 = a.hi[1]).
+AVX2NEON_ABI
+double _mm256_permute4x64_pd_select(__m256d a, const int imm8)
+{
+    const int index = imm8 & 3;
+
+    if (index == 0)
+        return a.lo[0];
+    if (index == 1)
+        return a.lo[1];
+    if (index == 2)
+        return a.hi[0];
+    // index == 3 is the only remaining possibility after masking.
+    return a.hi[1];
+}
+
+// Rearranges the four doubles of `a`: result element i is the source element named by bits (2i+1):(2i) of imm8.
+AVX2NEON_ABI
+__m256d _mm256_permute4x64_pd (__m256d a, const int imm8)
+{
+    __m256d permuted;
+    for (int i = 0; i < 2; ++i) {
+        permuted.lo[i] = _mm256_permute4x64_pd_select(a, imm8 >> (2 * i));
+        permuted.hi[i] = _mm256_permute4x64_pd_select(a, imm8 >> (2 * i + 4));
+    }
+    return permuted;
+}
+
+AVX2NEON_ABI
+__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8) // integer variant: forwards to _mm256_insertf128_ps through casts
+{
+    return __m256i(_mm256_insertf128_ps((__m256)a,(__m128)b,imm8));
+}
+
+
+AVX2NEON_ABI
+__m256i _mm256_loadu_si256 (__m256i const * mem_addr) // loads 32 bytes starting at mem_addr
+{
+    __m256i res;
+    res.lo = *(__m128i *)((int32_t *)mem_addr + 0); // first 16 bytes
+    res.hi = *(__m128i *)((int32_t *)mem_addr + 4); // next 16 bytes (4 x int32_t further on)
+    return res;
+}
+
+#define _mm256_load_si256 _mm256_loadu_si256 // the aligned load is an alias of the unaligned one
+
+AVX2NEON_ABI
+void _mm256_storeu_ps (float * mem_addr, __m256 a) // stores the eight floats of a to mem_addr[0..7]
+{
+    *(__m128 *)(mem_addr + 0) = a.lo; // floats 0..3
+    *(__m128 *)(mem_addr + 4) = a.hi; // floats 4..7
+
+}
+
+#define _mm256_store_ps _mm256_storeu_ps // aligned store is an alias of the unaligned one
+#define _mm256_stream_ps _mm256_storeu_ps // streaming store is an alias of the plain store
+
+
+AVX2NEON_ABI
+void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a) // stores the 32 bytes of a starting at mem_addr
+{
+    *(__m128i *)((int *)mem_addr + 0) = a.lo; // first 16 bytes
+    *(__m128i *)((int *)mem_addr + 4) = a.hi; // next 16 bytes (4 x int further on)
+
+}
+
+#define _mm256_store_si256 _mm256_storeu_si256 // aligned store is an alias of the unaligned one
+
+
+
+// Masked 8-float load built from two 4-lane masked loads (mem_addr[0..3] and mem_addr[4..7]).
+AVX2NEON_ABI
+__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask)
+{
+    __m256 loaded;
+    loaded.lo = _mm_maskload_ps(mem_addr,mask.lo);
+    loaded.hi = _mm_maskload_ps(mem_addr + 4,mask.hi);
+    return loaded;
+}
+
+
+// Widens the lowest eight unsigned 8-bit elements of `a` to eight 32-bit integers.
+AVX2NEON_ABI
+__m256i _mm256_cvtepu8_epi32 (__m128i a)
+{
+    const uint8x16_t bytes = uint8x16_t(a);
+    __m256i widened;
+    for (int lane = 0; lane < 4; ++lane) {
+        widened.lo[lane] = bytes[lane];
+        widened.hi[lane] = bytes[lane + 4];
+    }
+    return widened;
+}
+
+
+// Widens the lowest eight signed 8-bit elements of `a` to eight 32-bit integers.
+AVX2NEON_ABI
+__m256i _mm256_cvtepi8_epi32 (__m128i a)
+{
+    const int8x16_t bytes = int8x16_t(a);
+    __m256i widened;
+    for (int lane = 0; lane < 4; ++lane) {
+        widened.lo[lane] = bytes[lane];
+        widened.hi[lane] = bytes[lane + 4];
+    }
+    return widened;
+}
+
+
+// Widens the eight unsigned 16-bit elements of `a` to eight 32-bit integers.
+AVX2NEON_ABI
+__m256i _mm256_cvtepu16_epi32 (__m128i a)
+{
+    const uint16x8_t halfwords = uint16x8_t(a);
+    __m256i widened;
+    for (int lane = 0; lane < 4; ++lane) {
+        widened.lo[lane] = halfwords[lane];
+        widened.hi[lane] = halfwords[lane + 4];
+    }
+    return widened;
+}
+
+// Widens the eight signed 16-bit elements of `a` to eight 32-bit integers.
+AVX2NEON_ABI
+__m256i _mm256_cvtepi16_epi32 (__m128i a)
+{
+    const int16x8_t halfwords = int16x8_t(a);
+    __m256i widened;
+    for (int lane = 0; lane < 4; ++lane) {
+        widened.lo[lane] = halfwords[lane];
+        widened.hi[lane] = halfwords[lane + 4];
+    }
+    return widened;
+}
+
+
+
+AVX2NEON_ABI
+void _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a) // masked 8-int store built from two 4-lane masked stores
+{
+    _mm_maskstore_epi32(mem_addr,mask.lo,a.lo); // ints 0..3
+    _mm_maskstore_epi32(mem_addr + 4,mask.hi,a.hi); // ints 4..7
+}
+
+// Applies _mm_slli_epi32 with shift count imm8 to each 128-bit half of `a`.
+AVX2NEON_ABI __m256i _mm256_slli_epi32 (__m256i a, int imm8)
+{
+    __m256i shifted;
+    shifted.hi = _mm_slli_epi32(a.hi,imm8);
+    shifted.lo = _mm_slli_epi32(a.lo,imm8);
+    return shifted;
+}
+
+
+// Applies _mm_srli_epi32 with shift count imm8 to each 128-bit half of `a`.
+AVX2NEON_ABI __m256i _mm256_srli_epi32 (__m256i a, int imm8)
+{
+    __m256i shifted;
+    shifted.hi = _mm_srli_epi32(a.hi,imm8);
+    shifted.lo = _mm_srli_epi32(a.lo,imm8);
+    return shifted;
+}
+
+// Applies _mm_srai_epi32 with shift count imm8 to each 128-bit half of `a`.
+AVX2NEON_ABI __m256i _mm256_srai_epi32 (__m256i a, int imm8)
+{
+    __m256i shifted;
+    shifted.hi = _mm_srai_epi32(a.hi,imm8);
+    shifted.lo = _mm_srai_epi32(a.lo,imm8);
+    return shifted;
+}
+
+
+// Per-lane variable left shift: lane i of `a` is shifted by the count in lane i of `count` (vshlq_s32).
+AVX2NEON_ABI
+__m256i _mm256_sllv_epi32 (__m256i a, __m256i count)
+{
+    __m256i shifted;
+    shifted.hi = vshlq_s32(a.hi,count.hi);
+    shifted.lo = vshlq_s32(a.lo,count.lo);
+    return shifted;
+}
+
+
+// Per-lane variable right shift on signed lanes, expressed as a NEON left shift by the negated count.
+AVX2NEON_ABI
+__m256i _mm256_srav_epi32 (__m256i a, __m256i count)
+{
+    __m256i shifted;
+    shifted.hi = vshlq_s32(a.hi,vnegq_s32(count.hi));
+    shifted.lo = vshlq_s32(a.lo,vnegq_s32(count.lo));
+    return shifted;
+}
+
+// Per-lane variable right shift on unsigned lanes, expressed as a NEON left shift by the negated count.
+AVX2NEON_ABI
+__m256i _mm256_srlv_epi32 (__m256i a, __m256i count)
+{
+    __m256i shifted;
+    shifted.hi = __m128i(vshlq_u32(uint32x4_t(a.hi),vnegq_s32(count.hi)));
+    shifted.lo = __m128i(vshlq_u32(uint32x4_t(a.lo),vnegq_s32(count.lo)));
+    return shifted;
+}
+
+
+AVX2NEON_ABI
+__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8) // integer variant: forwards to _mm256_permute2f128_ps through casts
+{
+    return __m256i(_mm256_permute2f128_ps(__m256(a),__m256(b),imm8));
+}
+
+
+// Returns the high 128-bit half of `a` when bit 0 of imm8 is set, otherwise the low half.
+AVX2NEON_ABI __m128i _mm256_extractf128_si256 (__m256i a, const int imm8)
+{
+    const bool want_high = (imm8 & 1) != 0;
+    return want_high ? a.hi : a.lo;
+}
+
+AVX2NEON_ABI __m256 _mm256_set1_ps(float x) // broadcast x to all eight float lanes
+{
+    __m256 filled;
+    filled.hi = vdupq_n_f32(x);
+    filled.lo = filled.hi;
+    return filled;
+}
+
+// Builds a 256-bit float vector: e3..e0 go to the low half and e7..e4 to the high half, each via _mm_set_ps.
+AVX2NEON_ABI __m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
+{
+    __m256 packed;
+    packed.hi = _mm_set_ps(e7,e6,e5,e4);
+    packed.lo = _mm_set_ps(e3,e2,e1,e0);
+    return packed;
+}
+
+AVX2NEON_ABI __m256 _mm256_broadcast_ps (__m128 const * mem_addr) // copy the 128-bit vector at mem_addr into both halves
+{
+    __m256 filled;
+    filled.hi = *mem_addr;
+    filled.lo = filled.hi;
+    return filled;
+}
+
+// Converts eight 32-bit integers to floats by applying _mm_cvtepi32_ps to each 128-bit half.
+AVX2NEON_ABI __m256 _mm256_cvtepi32_ps (__m256i a)
+{
+    __m256 converted;
+    converted.hi = _mm_cvtepi32_ps(a.hi);
+    converted.lo = _mm_cvtepi32_ps(a.lo);
+    return converted;
+}
+// Masked 8-float store: element i is written only when bit 31 of the matching mask element is set.
+AVX2NEON_ABI void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a)
+{
+    for (int lane = 0; lane < 4; ++lane) {
+        if ((mask.lo[lane] & 0x80000000) != 0) mem_addr[lane] = a.lo[lane];
+        if ((mask.hi[lane] & 0x80000000) != 0) mem_addr[lane + 4] = a.hi[lane];
+    }
+}
+
+// Bitwise and-not of four doubles: each half is reinterpreted as floats and forwarded to _mm_andnot_ps.
+AVX2NEON_ABI __m256d _mm256_andnot_pd (__m256d a, __m256d b)
+{
+    __m256d combined;
+    combined.hi = float64x2_t(_mm_andnot_ps(__m128(a.hi),__m128(b.hi)));
+    combined.lo = float64x2_t(_mm_andnot_ps(__m128(a.lo),__m128(b.lo)));
+    return combined;
+}
+
+// Per-lane blend of a and b: imm8 & 0xf is passed for the low half, imm8 >> 4 for the high half.
+AVX2NEON_ABI
+__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8)
+{
+    __m256 blended;
+    blended.hi = _mm_blend_ps(a.hi,b.hi,imm8 >> 4);
+    blended.lo = _mm_blend_ps(a.lo,b.lo,imm8 & 0xf);
+    return blended;
+}
+
+
+// Per-lane blend of a and b: imm8 & 0xf is passed for the low half, imm8 >> 4 for the high half.
+AVX2NEON_ABI
+__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8)
+{
+    __m256i blended;
+    blended.hi = _mm_blend_epi32(a.hi,b.hi,imm8 >> 4);
+    blended.lo = _mm_blend_epi32(a.lo,b.lo,imm8 & 0xf);
+    return blended;
+}
+
+// Gathers eight 32-bit integers: element i is read from byte address base_addr + vindex[i]*scale.
+AVX2NEON_ABI __m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
+{
+    const char * bytes = (const char *) base_addr;
+    __m256i gathered;
+    for (int lane = 0; lane < 4; ++lane) {
+        gathered.lo[lane] = *(const int *)(bytes + (vindex.lo[lane]*scale));
+        gathered.hi[lane] = *(const int *)(bytes + (vindex.hi[lane]*scale));
+    }
+    return gathered;
+}
+
+
+// Masked gather of eight 32-bit integers: for each element whose mask sign bit is set, loads the int at
+// byte address base_addr + vindex[i]*scale; elements whose mask bit is clear are copied from `src`.
+AVX2NEON_ABI
+__m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale)
+{
+    __m256i res = src; // unselected elements keep the caller-provided value instead of being zeroed
+    for (int i=0;i<4;i++)
+    {
+        if (mask.lo[i] >> 31) res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale));
+        if (mask.hi[i] >> 31) res.hi[i] = *(int *)((char *) base_addr + (vindex.hi[i]*scale));
+    }
+    return res;
+}
+
+
diff --git a/thirdparty/embree-aarch64/common/math/SSE2NEON.h b/thirdparty/embree-aarch64/common/math/SSE2NEON.h
new file mode 100644
index 0000000000..2013151d31
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/SSE2NEON.h
@@ -0,0 +1,1753 @@
+#ifndef SSE2NEON_H
+#define SSE2NEON_H
+
+// This header file provides a simple API translation layer
+// from SSE intrinsics to their corresponding ARM NEON versions
+//
+// This header file does not (yet) translate *all* of the SSE intrinsics.
+// Since this is in support of a specific porting effort, I have only
+// included the intrinsics I needed to get my port to work.
+//
+// Questions/Comments/Feedback send to: jratcliffscarab@gmail.com
+//
+// If you want to improve or add to this project, send me an
+// email and I will probably approve your access to the depot.
+//
+// Project is located here:
+//
+//	https://github.com/jratcliff63367/sse2neon
+//
+// Show your appreciation for open source by sending me a bitcoin tip to the following
+// address.
+//
+// TipJar: 1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p :
+// https://blockchain.info/address/1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p
+//
+//
+// Contributors to this project are:
+//
+// John W. Ratcliff : jratcliffscarab@gmail.com
+// Brandon Rowlett  : browlett@nvidia.com
+// Ken Fast         : kfast@gdeb.com
+// Eric van Beurden : evanbeurden@nvidia.com
+//
+//
+// *********************************************************************************************************************
+// Release notes for January 20, 2017 version:
+//
+// The unit tests have been refactored.  They no longer assert on an error, instead they return a pass/fail condition
+// The unit-tests now test 10,000 random float and int values against each intrinsic.
+//
+// SSE2NEON now supports 95 SSE intrinsics.  39 of them have formal unit tests which have been implemented and
+// fully tested on NEON/ARM.  The remaining 56 still need unit tests implemented.
+//
+// A struct is now defined in this header file called 'SIMDVec' which can be used by applications which
+// attempt to access the contents of an __m128 struct directly.  It is important to note that accessing the __m128
+// struct directly is bad coding practice by Microsoft: @see: https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
+//
+// However, some legacy source code may try to access the contents of an __m128 struct directly so the developer
+// can use the SIMDVec as an alias for it.  Any casting must be done manually by the developer, as you cannot
+// cast or otherwise alias the base NEON data type for intrinsic operations.
+//
+// A bug was found with the _mm_shuffle_ps intrinsic.  If the shuffle permutation was not one of the ones with
+// a custom/unique implementation causing it to fall through to the default shuffle implementation it was failing
+// to return the correct value.  This is now fixed.
+//
+// A bug was found with the _mm_cvtps_epi32 intrinsic.  This converts floating point values to integers.
+// It was not honoring the correct rounding mode.  In SSE the default rounding mode when converting from float to int
+// is to use 'round to even' otherwise known as 'bankers rounding'.  ARMv7 did not support this feature but ARMv8 does.
+// As it stands today, this header file assumes ARMv8.  If you are trying to target really old ARM devices, you may get
+// a build error.
+//
+// Support for a number of new intrinsics was added, however, none of them yet have unit-tests to 100% confirm they are
+// producing the correct results on NEON.  These unit tests will be added as soon as possible.
+//
+// Here is the list of new intrinsics which have been added:
+//
+// _mm_cvtss_f32     :  extracts the lower order floating point value from the parameter
+// _mm_add_ss        : adds the scalar single - precision floating point values of a and b
+// _mm_div_ps        : Divides the four single - precision, floating - point values of a and b.
+// _mm_div_ss        : Divides the scalar single - precision floating point value of a by b.
+// _mm_sqrt_ss       : Computes the approximation of the square root of the scalar single - precision floating point value of in.
+// _mm_rsqrt_ps      : Computes the approximations of the reciprocal square roots of the four single - precision floating point values of in.
+// _mm_comilt_ss     : Compares the lower single - precision floating point scalar values of a and b using a less than operation
+// _mm_comigt_ss     : Compares the lower single - precision floating point scalar values of a and b using a greater than operation.
+// _mm_comile_ss     :  Compares the lower single - precision floating point scalar values of a and b using a less than or equal operation.
+// _mm_comige_ss     : Compares the lower single - precision floating point scalar values of a and b using a greater than or equal operation.
+// _mm_comieq_ss     :  Compares the lower single - precision floating point scalar values of a and b using an equality operation.
+// _mm_comineq_ss    :  Compares the lower single - precision floating point scalar values of a and b using an inequality operation
+// _mm_unpackhi_epi8 : Interleaves the upper 8 signed or unsigned 8 - bit integers in a with the upper 8 signed or unsigned 8 - bit integers in b.
+// _mm_unpackhi_epi16:  Interleaves the upper 4 signed or unsigned 16 - bit integers in a with the upper 4 signed or unsigned 16 - bit integers in b.
+//
+// *********************************************************************************************************************
+/*
+** The MIT license:
+**
+** Permission is hereby granted, free of charge, to any person obtaining a copy
+** of this software and associated documentation files (the "Software"), to deal
+** in the Software without restriction, including without limitation the rights
+** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+** copies of the Software, and to permit persons to whom the Software is furnished
+** to do so, subject to the following conditions:
+**
+** The above copyright notice and this permission notice shall be included in all
+** copies or substantial portions of the Software.
+
+** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+** WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#pragma once
+
+#define GCC 1
+#define ENABLE_CPP_VERSION 0
+
+// enable precise emulation of _mm_min_ps and _mm_max_ps?
+// This would slow down the computation a bit, but gives consistent result with x86 SSE2.
+// (e.g. would solve a hole or NaN pixel in the rendering result)
+#define USE_PRECISE_MINMAX_IMPLEMENTATION (1)
+
+#if GCC
+#define FORCE_INLINE					inline __attribute__((always_inline))
+#define ALIGN_STRUCT(x)					__attribute__((aligned(x)))
+#else
+#define FORCE_INLINE					inline
+#define ALIGN_STRUCT(x)					__declspec(align(x))
+#endif
+
+#include <stdint.h>
+#include "arm_neon.h"
+#if defined(__aarch64__)
+#include "constants.h"
+#endif
+
+
+#if !defined(__has_builtin)
+#define __has_builtin(x) (0)
+#endif
+
+/*******************************************************/
+/* MACRO for shuffle parameter for _mm_shuffle_ps().   */
+/* Argument fp3 is a digit[0123] that represents the fp*/
+/* from argument "b" of mm_shuffle_ps that will be     */
+/* placed in fp3 of result. fp2 is the same for fp2 in */
+/* result. fp1 is a digit[0123] that represents the fp */
+/* from argument "a" of mm_shuffle_ps that will be     */
+/* places in fp1 of result. fp0 is the same for fp0 of */
+/* result                                              */
+/*******************************************************/
+#if defined(__aarch64__)
+#define _MN_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3),  (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3),  (((fp1)*4)+0), (((fp1)*4)+1), (((fp1)*4)+2), (((fp1)*4)+3),  (((fp0)*4)+0), (((fp0)*4)+1), (((fp0)*4)+2), (((fp0)*4)+3) } )
+#define _MF_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3),  (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3),  (((fp1)*4)+16+0), (((fp1)*4)+16+1), (((fp1)*4)+16+2), (((fp1)*4)+16+3),  (((fp0)*4)+16+0), (((fp0)*4)+16+1), (((fp0)*4)+16+2), (((fp0)*4)+16+3) } )
+#endif
+
+#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) (((fp3) << 6) | ((fp2) << 4) | \
+  ((fp1) << 2) | ((fp0)))
+
+typedef float32x4_t __m128;
+typedef int32x4_t __m128i;
+
+// union intended to allow direct access to an __m128 variable using the names that the MSVC
+// compiler provides.  This union should really only be used when trying to access the members
+// of the vector as integer values.  GCC/clang allow native access to the float members through
+// a simple array access operator (in C since 4.6, in C++ since 4.8).
+//
+// Ideally direct accesses to SIMD vectors should not be used since it can cause a performance
+// hit.  If it really is needed however, the original __m128 variable can be aliased with a
+// pointer to this union and used to access individual components.  The use of this union should
+// be hidden behind a macro that is used throughout the codebase to access the members instead
+// of always declaring this type of variable.
+typedef union ALIGN_STRUCT(16) SIMDVec
+{
+  float       m128_f32[4];    // as floats - do not use this.  Added for convenience.
+  int8_t      m128_i8[16];    // as signed 8-bit integers.
+  int16_t     m128_i16[8];    // as signed 16-bit integers.
+  int32_t     m128_i32[4];    // as signed 32-bit integers.
+  int64_t     m128_i64[2];    // as signed 64-bit integers.
+  uint8_t     m128_u8[16];    // as unsigned 8-bit integers.
+  uint16_t    m128_u16[8];    // as unsigned 16-bit integers.
+  uint32_t    m128_u32[4];    // as unsigned 32-bit integers.
+  uint64_t    m128_u64[2];    // as unsigned 64-bit integers.
+  double	    m128_f64[2];    // as double-precision floats.
+} SIMDVec;
+
+// ******************************************
+// CPU stuff
+// ******************************************
+
+typedef SIMDVec __m128d;
+
+#include <stdlib.h>
+
+#ifndef _MM_MASK_MASK
+#define _MM_MASK_MASK 0x1f80
+#define _MM_MASK_DIV_ZERO 0x200
+#define _MM_FLUSH_ZERO_ON 0x8000
+#define _MM_DENORMALS_ZERO_ON 0x40
+#define _MM_MASK_DENORM 0x100
+#endif
+#define _MM_SET_EXCEPTION_MASK(x)
+#define _MM_SET_FLUSH_ZERO_MODE(x)
+#define _MM_SET_DENORMALS_ZERO_MODE(x)
+
+FORCE_INLINE void _mm_pause() // no-op here: the spin-wait hint is not emulated
+{
+}
+
+FORCE_INLINE void _mm_mfence() // memory fence implemented with the compiler's __sync_synchronize builtin
+{
+    __sync_synchronize();
+}
+
+#define _MM_HINT_T0 3
+#define _MM_HINT_T1 2
+#define _MM_HINT_T2 1
+#define _MM_HINT_NTA 0
+
+FORCE_INLINE void _mm_prefetch(const void* ptr, unsigned int level)
+{
+    // The locality hint `level` is not forwarded; a default prefetch is always issued.
+    __builtin_prefetch(ptr);
+}
+
+// Allocates `size` bytes with the requested alignment; returns 0 when posix_memalign fails.
+FORCE_INLINE void* _mm_malloc(int size, int align)
+{
+    // posix_memalign needs an alignment that is a multiple of sizeof(void *),
+    // so smaller requests are raised to that minimum first.
+    if (align < sizeof(void *)) {
+        align = sizeof(void *);
+    }
+    // Alignments that are still not a multiple fall back to plain malloc.
+    if ((align % sizeof(void *)) != 0) {
+        return malloc(size);
+    }
+
+    void *block;
+    if (posix_memalign(&block, align, size)) {
+        return 0;
+    }
+    return block;
+}
+
+FORCE_INLINE void _mm_free(void* ptr) // releases memory obtained from _mm_malloc (both of its paths are free()-compatible)
+{
+        free(ptr);
+}
+
+FORCE_INLINE int _mm_getcsr() // stub: no control/status register is emulated, always returns 0
+{
+        return 0;
+}
+
+FORCE_INLINE void _mm_setcsr(int val) // stub: val is ignored, nothing is written
+{
+        return;
+}
+
+// ******************************************
+// Set/get methods
+// ******************************************
+
+// extracts the lower order floating point value from the parameter : https://msdn.microsoft.com/en-us/library/bb514059%28v=vs.120%29.aspx?f=255&MSPPError=-2147217396
+#if defined(__aarch64__)
+FORCE_INLINE float _mm_cvtss_f32(const __m128& x)
+{
+    return x[0]; // lane 0 via vector subscripting
+}
+#else
+FORCE_INLINE float _mm_cvtss_f32(__m128 a)
+{
+    return vgetq_lane_f32(a, 0); // lane 0 via the NEON lane-extract intrinsic
+}
+#endif
+
+// Sets the 128-bit value to zero https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_setzero_si128() // all four 32-bit lanes are 0
+{
+  return vdupq_n_s32(0);
+}
+
+// Clears the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_setzero_ps(void) // all four float lanes are 0
+{
+  return vdupq_n_f32(0);
+}
+
+// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_set1_ps(float _w) // broadcast _w to all four lanes
+{
+  return vdupq_n_f32(_w);
+}
+
+// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_set_ps1(float _w) // same body as _mm_set1_ps: broadcast _w to all four lanes
+{
+  return vdupq_n_f32(_w);
+}
+
+// Sets the four single-precision, floating-point values to the four inputs. https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
+#if defined(__aarch64__)
+FORCE_INLINE __m128 _mm_set_ps(const float w, const float z, const float y, const float x)
+{
+    const float32x4_t lanes = { x, y, z, w }; // last argument lands in lane 0
+    return lanes;
+}
+
+// Sets the four single-precision, floating-point values to the four inputs in reverse order (first argument lands in lane 0). https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_setr_ps(const float w, const float z , const float y , const float x )
+{
+    const float32x4_t lanes = { w, z, y, x };
+    return lanes;
+}
+#else
+FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
+{
+    const float __attribute__((aligned(16))) lanes[4] = { x, y, z, w }; // last argument lands in lane 0
+    return vld1q_f32(lanes);
+}
+
+// Sets the four single-precision, floating-point values to the four inputs in reverse order (first argument lands in lane 0). https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_setr_ps(float w, float z , float y , float x )
+{
+    const float __attribute__ ((aligned (16))) lanes[4] = { w, z, y, x };
+    return vld1q_f32(lanes);
+}
+#endif
+
+// Sets the 4 signed 32-bit integer values to i. https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set1_epi32(int _i) // broadcast _i to all four 32-bit lanes
+{
+  return vdupq_n_s32(_i);
+}
+
+// Sets the lowest lane to the single-precision, floating-point value w and clears the other three lanes.
+#if defined(__aarch64__)
+FORCE_INLINE __m128 _mm_set_ss(float _w)
+{
+    const float32x4_t lanes = {_w, 0, 0, 0};
+    return lanes;
+}
+
+// Sets the 4 signed 32-bit integer values (i0 lands in lane 0, i3 in lane 3). https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+    const int32x4_t lanes = {i0,i1,i2,i3};
+    return lanes;
+}
+#else
+FORCE_INLINE __m128 _mm_set_ss(float _w)
+{
+    const __m128 zeros = _mm_setzero_ps();
+    return vsetq_lane_f32(_w, zeros, 0);
+}
+
+// Sets the 4 signed 32-bit integer values (i0 lands in lane 0, i3 in lane 3). https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+    const int32_t __attribute__((aligned(16))) lanes[4] = { i0, i1, i2, i3 };
+    return vld1q_s32(lanes);
+}
+#endif
+
+// Writes the four single-precision lanes of a to p[0..3]. https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
+FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
+{
+  float *const dst = p;
+  vst1q_f32(dst, a);
+}
+
+// Writes the four single-precision lanes of a to p[0..3]; implemented with the same vst1q_f32 store as _mm_store_ps. https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
+FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
+{
+  float *const dst = p;
+  vst1q_f32(dst, a);
+}
+
+// Writes the 128-bit integer vector a to the address p as four 32-bit lanes.
+FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
+{
+  int32_t *const dst = (int32_t*) p;
+  vst1q_s32(dst, a);
+}
+
+// Writes the four 32-bit integer lanes of a (a __m128i value) to the address p. https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
+FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a )
+{
+  int32_t *const dst = (int32_t*) p;
+  vst1q_s32(dst, a);
+}
+
+// Writes lane 0 (the lowest single-precision value) of a to *p. https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
+FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
+{
+  *p = vgetq_lane_f32(a, 0);
+}
+
+// Reads the lower 64 bits of b and stores them into the 8 bytes at a.  https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
+// A 64-bit store (vst1_s32 of the low half) is used so that only those 8 bytes
+// of the destination are accessed; the upper 8 bytes behind a are neither read
+// nor written, so a may point at a buffer that is only 8 bytes long.
+FORCE_INLINE void _mm_storel_epi64(__m128i* a, __m128i b)
+{
+  vst1_s32((int32_t*) a, vget_low_s32(b));
+}
+
+// Reads one single-precision value from p and replicates it into all four lanes. https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_load1_ps(const float * p)
+{
+  const float scalar = *p;
+  return vdupq_n_f32(scalar);
+}
+
+// Reads p[0..3] into the four single-precision lanes of the result. https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_load_ps(const float * p)
+{
+  const __m128 loaded = vld1q_f32(p);
+  return loaded;
+}
+
+// Reads p[0..3] into the four single-precision lanes of the result.  https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_loadu_ps(const float * p)
+{
+  // On NEON alignment does not matter, so this is the same load as _mm_load_ps.
+  const __m128 loaded = vld1q_f32(p);
+  return loaded;
+}
+
+// Loads a single-precision value from p into lane 0 and clears the upper three lanes.  https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_load_ss(const float * p)
+{
+  const float low = *p;
+  return vsetq_lane_f32(low, vdupq_n_f32(0), 0);
+}
+
+// Loads the 128-bit integer vector stored at p as four 32-bit lanes.
+// The pointer is taken as const because the function only reads through it;
+// callers holding a non-const pointer still convert implicitly.
+FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
+{
+  return (__m128i)vld1q_s32((const int32_t*) p);
+}
+
+
+// ******************************************
+// Logic/Binary operations
+// ******************************************
+
+// Lane-wise inequality test: computes the equality mask and inverts every bit of it.  https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
+{
+  const uint32x4_t equal = vceqq_f32(a, b);
+  const uint32x4_t differ = vmvnq_u32(equal);
+  return vreinterpretq_f32_u32(differ);
+}
+
+// Computes the bitwise AND-NOT of the four single-precision values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
+{
+  const __m128i inverted = (__m128i)a;
+  const __m128i kept = (__m128i)b;
+  // vbicq_s32 takes its operands in the opposite order, hence (kept, inverted).
+  return (__m128)vbicq_s32(kept, inverted);
+}
+
+// Computes (NOT a) AND b over the full 128 bits. https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
+{
+  // vbicq_s32 takes its operands in the opposite order, hence (b, a).
+  const __m128i cleared = (__m128i)vbicq_s32(b, a);
+  return cleared;
+}
+
+// Bitwise AND of the 128-bit values a and b. https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
+{
+  const __m128i both = (__m128i)vandq_s32(a, b);
+  return both;
+}
+
+// Bitwise AND of the raw bits of the four single-precision values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
+{
+  const __m128i lhs = (__m128i)a;
+  const __m128i rhs = (__m128i)b;
+  return (__m128)vandq_s32(lhs, rhs);
+}
+
+// Bitwise OR of the raw bits of the four single-precision values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
+{
+  const __m128i lhs = (__m128i)a;
+  const __m128i rhs = (__m128i)b;
+  return (__m128)vorrq_s32(lhs, rhs);
+}
+
+// Bitwise exclusive-OR of the raw bits of the four single-precision values of a and b. https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
+{
+  const __m128i lhs = (__m128i)a;
+  const __m128i rhs = (__m128i)b;
+  return (__m128)veorq_s32(lhs, rhs);
+}
+
+// Bitwise OR of the 128-bit values a and b. https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
+{
+  const __m128i either = (__m128i)vorrq_s32(a, b);
+  return either;
+}
+
+// Bitwise exclusive-OR of the 128-bit values a and b.  https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
+{
+  const __m128i toggled = veorq_s32(a, b);
+  return toggled;
+}
+
+// NEON does not provide this method
+// Creates a 4-bit mask from the most significant bits of the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
+// Bit k of the returned int corresponds to lane k of a. Three implementations are selected at compile time:
+//   - ENABLE_CPP_VERSION: scalar extraction of each lane's bit 31.
+//   - AArch64: AND with embree::movemask_mask, then a horizontal add.
+//   - 32-bit ARM: test bit 31 per lane, keep one weight (1, 2, 4, 8) per lane, OR the lanes together.
+FORCE_INLINE int _mm_movemask_ps(__m128 a)
+{
+#if ENABLE_CPP_VERSION // I am not yet convinced that the NEON version is faster than the C version of this
+  uint32x4_t &ia = *(uint32x4_t *)&a;
+  // Move each lane's sign bit (bit 31) down to bit position k for lane k.
+  return (ia[0] >> 31) | ((ia[1] >> 30) & 2) | ((ia[2] >> 29) & 4) | ((ia[3] >> 28) & 8);
+#else
+    
+#if defined(__aarch64__)
+    // NOTE(review): embree::movemask_mask is defined outside this chunk. If it is { 1, 2, 4, 8 }
+    // this path only yields the sign-bit mask when each lane of a is all-ones or all-zeros
+    // (i.e. a comparison result) — confirm against its definition and the callers.
+    uint32x4_t t2 = vandq_u32(vreinterpretq_u32_f32(a), embree::movemask_mask);
+    return vaddvq_u32(t2);
+#else
+  static const uint32x4_t movemask = { 1, 2, 4, 8 };
+  static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
+  uint32x4_t t0 = vreinterpretq_u32_f32(a);
+  // t1 lane is all-ones when the lane's bit 31 is set, zero otherwise.
+  uint32x4_t t1 = vtstq_u32(t0, highbit);
+  // Keep only the per-lane weight, then OR the four lanes down to one scalar.
+  uint32x4_t t2 = vandq_u32(t1, movemask);
+  uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
+  return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
+#endif
+    
+#endif
+}
+
+#if defined(__aarch64__)
+// AArch64-only helper: masks a with embree::movemask_mask, counts the set bits of every
+// byte of the masked value, and returns the horizontal sum of the four 32-bit lanes.
+// NOTE(review): presumably this is the number of active lanes of a comparison mask; that
+// depends on embree::movemask_mask (defined outside this chunk) and on the lanes of a being
+// all-ones or all-zeros — confirm against the definition and the callers.
+FORCE_INLINE int _mm_movemask_popcnt_ps(__m128 a)
+{
+    uint32x4_t t2 = vandq_u32(vreinterpretq_u32_f32(a), embree::movemask_mask);
+    // Per-byte population count, reinterpreted back as 32-bit lanes for the final add.
+    t2 = vreinterpretq_u32_u8(vcntq_u8(vreinterpretq_u8_u32(t2)));
+    return vaddvq_u32(t2);
+    
+}
+#endif
+
+// Result lanes 0-1 are the upper 64 bits of a;
+// result lanes 2-3 are the lower 64 bits of b.
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
+{
+  const float32x2_t a32 = vget_high_f32(a);
+  const float32x2_t b10 = vget_low_f32(b);
+  return vcombine_f32(a32, b10);
+}
+
+// Swaps the lower two 32-bit values of a and places them in the low end of the result;
+// swaps the upper two 32-bit values of b and places them in the high end of the result.
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
+{
+  const float32x2_t a01 = vrev64_f32(vget_low_f32(a));
+  const float32x2_t b23 = vrev64_f32(vget_high_f32(b));
+  return vcombine_f32(a01, b23);
+}
+
+// Keeps the low 64 bits of a in the low half and puts the high 64 bits of b in the high half.
+FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
+{
+  const float32x2_t a10 = vget_low_f32(a);
+  const float32x2_t b32 = vget_high_f32(b);
+  return vcombine_f32(a10, b32);
+}
+
+// Result lanes: { a[1], a[1], b[0], b[0] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
+{
+  const float32x2_t lo = vdup_n_f32(vgetq_lane_f32(a, 1));
+  const float32x2_t hi = vdup_n_f32(vgetq_lane_f32(b, 0));
+  return vcombine_f32(lo, hi);
+}
+
+// Result lanes: { a[2], a[2], b[0], b[0] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
+{
+  const float32x2_t lo = vdup_n_f32(vgetq_lane_f32(a, 2));
+  const float32x2_t hi = vdup_n_f32(vgetq_lane_f32(b, 0));
+  return vcombine_f32(lo, hi);
+}
+
+// Result lanes: { a[0], a[0], b[2], b[2] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
+{
+  const float32x2_t lo = vdup_n_f32(vgetq_lane_f32(a, 0));
+  const float32x2_t hi = vdup_n_f32(vgetq_lane_f32(b, 2));
+  return vcombine_f32(lo, hi);
+}
+
+// Result lanes: { a[2], a[0], b[2], b[3] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
+{
+  // Start with { a[0], a[0] } and overwrite lane 0 with a[2] to get { a[2], a[0] }.
+  float32x2_t lo = vdup_n_f32(vgetq_lane_f32(a, 0));
+  lo = vset_lane_f32(vgetq_lane_f32(a, 2), lo, 0);
+  const float32x2_t hi = vget_high_f32(b);
+  return vcombine_f32(lo, hi);
+}
+
+// Result lanes: { a[3], a[3], b[1], b[1] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
+{
+  const float32x2_t lo = vdup_n_f32(vgetq_lane_f32(a, 3));
+  const float32x2_t hi = vdup_n_f32(vgetq_lane_f32(b, 1));
+  return vcombine_f32(lo, hi);
+}
+
+// Result lanes: { a[0], a[1], b[0], b[2] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
+{
+  // Start with { b[2], b[2] } and overwrite lane 0 with b[0] to get { b[0], b[2] }.
+  float32x2_t hi = vdup_n_f32(vgetq_lane_f32(b, 2));
+  hi = vset_lane_f32(vgetq_lane_f32(b, 0), hi, 0);
+  return vcombine_f32(vget_low_f32(a), hi);
+}
+
+// Result lanes: { a[1], a[0], b[0], b[2] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
+{
+  const float32x2_t lo = vrev64_f32(vget_low_f32(a));
+  // Start with { b[2], b[2] } and overwrite lane 0 with b[0] to get { b[0], b[2] }.
+  float32x2_t hi = vdup_n_f32(vgetq_lane_f32(b, 2));
+  hi = vset_lane_f32(vgetq_lane_f32(b, 0), hi, 0);
+  return vcombine_f32(lo, hi);
+}
+
+// Result lanes: { a[2], a[3], b[0], b[2] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
+{
+  const float32x2_t lo = vget_high_f32(a);
+  // Start with { b[2], b[2] } and overwrite lane 0 with b[0] to get { b[0], b[2] }.
+  float32x2_t hi = vdup_n_f32(vgetq_lane_f32(b, 2));
+  hi = vset_lane_f32(vgetq_lane_f32(b, 0), hi, 0);
+  return vcombine_f32(lo, hi);
+}
+
+// Result lanes: { a[1], a[2], b[3], b[0] }, taken from each input rotated by three lanes.
+FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
+{
+  const float32x4_t a_rot = vextq_f32(a, a, 3);
+  const float32x4_t b_rot = vextq_f32(b, b, 3);
+  return vcombine_f32(vget_high_f32(a_rot), vget_low_f32(b_rot));
+}
+
+// Result lanes: { a[3], a[0], b[1], b[2] }, taken from each input rotated by three lanes.
+FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
+{
+  const float32x4_t a_rot = vextq_f32(a, a, 3);
+  const float32x4_t b_rot = vextq_f32(b, b, 3);
+  return vcombine_f32(vget_low_f32(a_rot), vget_high_f32(b_rot));
+}
+
+// Result lanes: { a[0], a[1], b[0], b[1] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
+{
+  return vcombine_f32(vget_low_f32(a), vget_low_f32(b));
+}
+
+// Result lanes: { a[1], a[0], b[0], b[1] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
+{
+  return vcombine_f32(vrev64_f32(vget_low_f32(a)), vget_low_f32(b));
+}
+
+// Result lanes: { a[1], a[0], b[1], b[0] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
+{
+  return vcombine_f32(vrev64_f32(vget_low_f32(a)), vrev64_f32(vget_low_f32(b)));
+}
+
+// NEON does not support a general purpose permute intrinsic
+// Currently I am not sure whether the C implementation is faster or slower than the NEON version.
+// Note, this has to be expanded as a template because the shuffle value must be an immediate value.
+// The same is true on SSE as well.
+// Selects four specific single-precision, floating-point values from a and b, based on the mask i. https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
+// Mask layout: bits 0-1 and 2-3 pick the lanes of a that become result lanes 0 and 1;
+// bits 4-5 and 6-7 pick the lanes of b that become result lanes 2 and 3.
+// NOTE(review): __has_builtin is used without a fallback definition in this chunk — confirm
+// it is defined for every supported compiler before this point in the file.
+template <int i>
+FORCE_INLINE __m128 _mm_shuffle_ps_default(const __m128& a, const __m128& b)
+{
+#if ENABLE_CPP_VERSION // I am not convinced that the NEON version is faster than the C version yet.
+  __m128 ret;
+  ret[0] = a[i & 0x3];
+  ret[1] = a[(i >> 2) & 0x3];
+  ret[2] = b[(i >> 4) & 0x03];
+  ret[3] = b[(i >> 6) & 0x03];
+  return ret;
+#else
+# if __has_builtin(__builtin_shufflevector)
+    // Indices 0-3 address a and 4-7 address b, hence the "+ 4" on the two upper selectors.
+    return __builtin_shufflevector(             \
+        a, b, (i) & (0x3), ((i) >> 2) & 0x3,
+        (((i) >> 4) & 0x3) + 4, (((i) >> 6) & 0x3) + 4);
+# else
+    // Decode the four 2-bit lane selectors from the mask.
+    const int i0 = (i >> 0)&0x3;
+    const int i1 = (i >> 2)&0x3;
+    const int i2 = (i >> 4)&0x3;
+    const int i3 = (i >> 6)&0x3;
+
+    // NOTE(review): the table-lookup fallback below uses vdupq_laneq_f32 / vqtbl1q_s8 / vqtbl2q_s8,
+    // which look AArch64-specific — confirm 32-bit ARM builds never reach this path.
+    if (&a == &b)
+     {
+         // Both operands are the same object: a single-register byte table lookup suffices.
+         if (i0 == i1 && i0 == i2 && i0 == i3)
+         {
+             // All four selectors agree: broadcast that one lane.
+             return (float32x4_t)vdupq_laneq_f32(a,i0);
+         }
+         // Byte indices (4 per 32-bit lane) into the single 16-byte source.
+         static const uint8_t tbl[16] = {
+             (i0*4) + 0,(i0*4) + 1,(i0*4) + 2,(i0*4) + 3,
+             (i1*4) + 0,(i1*4) + 1,(i1*4) + 2,(i1*4) + 3,
+             (i2*4) + 0,(i2*4) + 1,(i2*4) + 2,(i2*4) + 3,
+             (i3*4) + 0,(i3*4) + 1,(i3*4) + 2,(i3*4) + 3
+         };
+         
+         return (float32x4_t)vqtbl1q_s8(int8x16_t(b),*(uint8x16_t *)tbl);
+         
+     }
+     else
+     {
+         
+         // Byte indices into the 32-byte pair {a, b}: offsets 0-15 address a, 16-31 address b,
+         // hence the "+ 16" on the selectors for result lanes 2 and 3.
+         static const uint8_t tbl[16] = {
+             (i0*4) + 0,(i0*4) + 1,(i0*4) + 2,(i0*4) + 3,
+             (i1*4) + 0,(i1*4) + 1,(i1*4) + 2,(i1*4) + 3,
+             (i2*4) + 0 + 16,(i2*4) + 1 + 16,(i2*4) + 2 + 16,(i2*4) + 3 + 16,
+             (i3*4) + 0 + 16,(i3*4) + 1 + 16,(i3*4) + 2 + 16,(i3*4) + 3 + 16
+         };
+         
+         return float32x4_t(vqtbl2q_s8((int8x16x2_t){int8x16_t(a),int8x16_t(b)},*(uint8x16_t *)tbl));
+     }
+# endif //builtin(shufflevector)
+#endif
+}
+
+// Dispatches the compile-time shuffle mask i to a hand-written helper when one
+// exists; every other mask is handled by _mm_shuffle_ps_default<i>.
+template <int i >
+FORCE_INLINE __m128 _mm_shuffle_ps_function(const __m128& a, const __m128& b)
+{
+  switch (i)
+  {
+    case _MM_SHUFFLE(1, 0, 3, 2): return _mm_shuffle_ps_1032(a, b);
+    case _MM_SHUFFLE(2, 3, 0, 1): return _mm_shuffle_ps_2301(a, b);
+    case _MM_SHUFFLE(3, 2, 1, 0): return _mm_shuffle_ps_3210(a, b);
+    case _MM_SHUFFLE(0, 0, 1, 1): return _mm_shuffle_ps_0011(a, b);
+    case _MM_SHUFFLE(0, 0, 2, 2): return _mm_shuffle_ps_0022(a, b);
+    case _MM_SHUFFLE(2, 2, 0, 0): return _mm_shuffle_ps_2200(a, b);
+    case _MM_SHUFFLE(3, 2, 0, 2): return _mm_shuffle_ps_3202(a, b);
+    case _MM_SHUFFLE(1, 1, 3, 3): return _mm_shuffle_ps_1133(a, b);
+    case _MM_SHUFFLE(2, 0, 1, 0): return _mm_shuffle_ps_2010(a, b);
+    case _MM_SHUFFLE(2, 0, 0, 1): return _mm_shuffle_ps_2001(a, b);
+    case _MM_SHUFFLE(2, 0, 3, 2): return _mm_shuffle_ps_2032(a, b);
+    case _MM_SHUFFLE(0, 3, 2, 1): return _mm_shuffle_ps_0321(a, b);
+    case _MM_SHUFFLE(2, 1, 0, 3): return _mm_shuffle_ps_2103(a, b);
+    case _MM_SHUFFLE(1, 0, 1, 0): return _mm_shuffle_ps_1010(a, b);
+    case _MM_SHUFFLE(1, 0, 0, 1): return _mm_shuffle_ps_1001(a, b);
+    case _MM_SHUFFLE(0, 1, 0, 1): return _mm_shuffle_ps_0101(a, b);
+    default: break;
+  }
+  // No specialised helper for this mask.
+  return _mm_shuffle_ps_default<i>(a, b);
+}
+
+// Public entry point for the two-operand float shuffle; i must be a compile-time constant
+// because it is passed as a template argument. When the compiler offers
+// __builtin_shufflevector the generic implementation is used directly, otherwise the
+// dispatcher with hand-written special cases is used.
+# if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_ps(a,b,i) _mm_shuffle_ps_default<i>(a,b)
+# else
+#define _mm_shuffle_ps(a,b,i) _mm_shuffle_ps_function<i>(a,b)
+#endif
+
+// Result lanes 0-1 are the upper 64 bits of a;
+// result lanes 2-3 are the lower 64 bits of b.
+FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a, __m128i b)
+{
+  const int32x2_t a32 = vget_high_s32(a);
+  const int32x2_t b10 = vget_low_s32(b);
+  return vcombine_s32(a32, b10);
+}
+
+// Swaps the lower two 32-bit values of a and places them in the low end of the result;
+// swaps the upper two 32-bit values of b and places them in the high end of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a, __m128i b)
+{
+  const int32x2_t a01 = vrev64_s32(vget_low_s32(a));
+  const int32x2_t b23 = vrev64_s32(vget_high_s32(b));
+  return vcombine_s32(a01, b23);
+}
+
+// Drops the lowest 32-bit lane of a, moves the remaining three lanes down by one, and
+// fills the top lane with the lowest lane of b.
+// When a and b are the same, this rotates the least significant 32 bits into the most significant position.
+FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a, __m128i b)
+{
+  const __m128i rotated = vextq_s32(a, b, 1);
+  return rotated;
+}
+
+// Keeps only the highest 32-bit lane of a as the lowest result lane and fills the upper
+// three lanes with the three lowest lanes of b.
+// When a and b are the same, this rotates the most significant 32 bits into the least significant position.
+FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a, __m128i b)
+{
+  const __m128i rotated = vextq_s32(a, b, 3);
+  return rotated;
+}
+
+// Result lanes: { a[0], a[1], b[0], b[1] }.
+// gets the lower 64 bits of a and places it in the lower 64 bits
+// gets the lower 64 bits of b and places it in the upper 64 bits
+// The upper half is taken from b, matching _mm_shuffle_ps_1010; callers that pass the
+// same vector for both operands get the same result as before.
+FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a, __m128i b)
+{
+  return vcombine_s32(vget_low_s32(a), vget_low_s32(b));
+}
+
+// Result lanes: { a[1], a[0], b[0], b[1] }.
+// The lower 64 bits of a, with elements 0 and 1 swapped, form the lower 64 bits of the result;
+// the lower 64 bits of b form the upper 64 bits of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a, __m128i b)
+{
+  const int32x2_t a01 = vrev64_s32(vget_low_s32(a));
+  const int32x2_t b10 = vget_low_s32(b);
+  return vcombine_s32(a01, b10);
+}
+
+// Result lanes: { a[1], a[0], b[1], b[0] }.
+// The lower 64 bits of a, with elements 0 and 1 swapped, form the lower 64 bits of the result;
+// the lower 64 bits of b, with elements 0 and 1 swapped, form the upper 64 bits of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a, __m128i b)
+{
+  const int32x2_t a01 = vrev64_s32(vget_low_s32(a));
+  const int32x2_t b01 = vrev64_s32(vget_low_s32(b));
+  return vcombine_s32(a01, b01);
+}
+
+// Result lanes: { a[1], a[1], b[2], b[2] }.
+FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a, __m128i b)
+{
+  const int32x2_t lo = vdup_n_s32(vgetq_lane_s32(a, 1));
+  const int32x2_t hi = vdup_n_s32(vgetq_lane_s32(b, 2));
+  return vcombine_s32(lo, hi);
+}
+
+// Result lanes: { a[2], a[2], b[1], b[0] }.
+FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a, __m128i b)
+{
+  const int32x2_t lo = vdup_n_s32(vgetq_lane_s32(a, 2));
+  const int32x2_t hi = vrev64_s32(vget_low_s32(b));
+  return vcombine_s32(lo, hi);
+}
+
+// Result lanes: { a[2], a[3], b[3], b[3] }.
+FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a, __m128i b)
+{
+  const int32x2_t lo = vget_high_s32(a);
+  const int32x2_t hi = vdup_n_s32(vgetq_lane_s32(b, 3));
+  return vcombine_s32(lo, hi);
+}
+
+// Generic 32-bit integer shuffle for an arbitrary compile-time mask i.
+// Bits 0-1 and 2-3 of i pick the lanes of a that become result lanes 0 and 1;
+// bits 4-5 and 6-7 pick the lanes of b that become result lanes 2 and 3.
+template <int i >
+FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a, __m128i b)
+{
+#if ENABLE_CPP_VERSION
+  __m128i out;
+  out[0] = a[i & 0x3];
+  out[1] = a[(i >> 2) & 0x3];
+  out[2] = b[(i >> 4) & 0x03];
+  out[3] = b[(i >> 6) & 0x03];
+  return out;
+#else
+  // Pull out the four selected scalars first, then assemble the result lane by lane.
+  const int32_t lane0 = vgetq_lane_s32(a, i & 0x3);
+  const int32_t lane1 = vgetq_lane_s32(a, (i >> 2) & 0x3);
+  const int32_t lane2 = vgetq_lane_s32(b, (i >> 4) & 0x3);
+  const int32_t lane3 = vgetq_lane_s32(b, (i >> 6) & 0x3);
+
+  __m128i out = vmovq_n_s32(lane0);
+  out = vsetq_lane_s32(lane1, out, 1);
+  out = vsetq_lane_s32(lane2, out, 2);
+  out = vsetq_lane_s32(lane3, out, 3);
+  return out;
+#endif
+}
+
+// Dispatches the compile-time shuffle mask i to a hand-written integer helper when one
+// exists; every other mask is handled by _mm_shuffle_epi32_default<i>.
+template <int i >
+FORCE_INLINE __m128i _mm_shuffle_epi32_function(__m128i a, __m128i b)
+{
+  switch (i)
+  {
+    case _MM_SHUFFLE(1, 0, 3, 2):
+      return _mm_shuffle_epi_1032(a, b);
+    case _MM_SHUFFLE(2, 3, 0, 1):
+      return _mm_shuffle_epi_2301(a, b);
+    case _MM_SHUFFLE(0, 3, 2, 1):
+      return _mm_shuffle_epi_0321(a, b);
+    case _MM_SHUFFLE(2, 1, 0, 3):
+      return _mm_shuffle_epi_2103(a, b);
+    case _MM_SHUFFLE(1, 0, 1, 0):
+      return _mm_shuffle_epi_1010(a, b);
+    case _MM_SHUFFLE(1, 0, 0, 1):
+      return _mm_shuffle_epi_1001(a, b);
+    case _MM_SHUFFLE(0, 1, 0, 1):
+      return _mm_shuffle_epi_0101(a, b);
+    case _MM_SHUFFLE(2, 2, 1, 1):
+      return _mm_shuffle_epi_2211(a, b);
+    case _MM_SHUFFLE(0, 1, 2, 2):
+      return _mm_shuffle_epi_0122(a, b);
+    case _MM_SHUFFLE(3, 3, 3, 2):
+      return _mm_shuffle_epi_3332(a, b);
+    default:
+      break;
+  }
+  // No specialised helper for this mask.
+  return _mm_shuffle_epi32_default<i>(a, b);
+}
+
+// Replicates lane i of a into all four 32-bit lanes of the result.
+template <int i >
+FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a)
+{
+  const int32_t picked = vgetq_lane_s32(a, i);
+  return vdupq_n_s32(picked);
+}
+
+// Single-operand 32-bit shuffle: masks that select the same lane four times become a
+// splat; every other mask goes through the two-operand dispatcher with a as both inputs.
+template <int i>
+FORCE_INLINE __m128i _mm_shuffle_epi32_single(__m128i a)
+{
+  switch (i)
+  {
+    case _MM_SHUFFLE(0, 0, 0, 0):
+      return _mm_shuffle_epi32_splat<0>(a);
+    case _MM_SHUFFLE(1, 1, 1, 1):
+      return _mm_shuffle_epi32_splat<1>(a);
+    case _MM_SHUFFLE(2, 2, 2, 2):
+      return _mm_shuffle_epi32_splat<2>(a);
+    case _MM_SHUFFLE(3, 3, 3, 3):
+      return _mm_shuffle_epi32_splat<3>(a);
+    default:
+      break;
+  }
+  return _mm_shuffle_epi32_function<i>(a, a);
+}
+
+// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.	https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
+// i is forwarded as a template argument and therefore must be a compile-time constant.
+#define _mm_shuffle_epi32(a,i) _mm_shuffle_epi32_single<i>(a)
+
+// Permutes the upper four 16-bit lanes of a according to the compile-time mask i
+// (two bits per destination lane); the lower four lanes pass through unchanged.
+template <int i>
+FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a)
+{
+  int16x8_t out = (int16x8_t)a;
+  // All picks read from the untouched upper half, so earlier writes cannot affect later reads.
+  const int16x4_t upper = vget_high_s16(out);
+  const int16_t pick4 = vget_lane_s16(upper, i & 0x3);
+  const int16_t pick5 = vget_lane_s16(upper, (i >> 2) & 0x3);
+  const int16_t pick6 = vget_lane_s16(upper, (i >> 4) & 0x3);
+  const int16_t pick7 = vget_lane_s16(upper, (i >> 6) & 0x3);
+  out = vsetq_lane_s16(pick4, out, 4);
+  out = vsetq_lane_s16(pick5, out, 5);
+  out = vsetq_lane_s16(pick6, out, 6);
+  out = vsetq_lane_s16(pick7, out, 7);
+  return (__m128i)out;
+}
+
+// Shuffles the upper 4 signed or unsigned 16 - bit integers in a as specified by imm.  https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
+// i is forwarded as a template argument and therefore must be a compile-time constant.
+#define _mm_shufflehi_epi16(a,i) _mm_shufflehi_epi16_function<i>(a)
+
+// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while shifting in zeros. : https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
+//#define _mm_slli_epi32(a, imm) (__m128i)vshlq_n_s32(a,imm)
+
+// Based on SIMDe
+// a:    vector of four 32-bit lanes.
+// imm8: shift count in bits, applied to every lane.
+// The 32-bit ARM path returns an all-zero vector for counts above 31 and shifts the
+// lanes as unsigned values, so no signed-overflow undefined behaviour is involved.
+// NOTE(review): the AArch64 path passes imm8 straight to vshlq_s32, which presumably
+// also yields zero for counts of 32 and above — confirm against the NEON reference.
+FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, const int imm8)
+{
+#if defined(__aarch64__)
+    const int32x4_t s = vdupq_n_s32(imm8);
+    return vshlq_s32(a, s);
+#else
+  if (imm8 > 31)
+    return vdupq_n_s32(0);
+
+  uint32_t __attribute__((aligned(16))) data[4];
+  vst1q_u32(data, vreinterpretq_u32_s32(a));
+  for (int lane = 0; lane < 4; lane++)
+    data[lane] <<= imm8;
+
+  return vreinterpretq_s32_u32(vld1q_u32(data));
+#endif
+}
+
+
+//Shifts the 4 signed or unsigned 32-bit integers in a right by count bits while shifting in zeros.  https://msdn.microsoft.com/en-us/library/w486zcfa(v=vs.100).aspx
+//#define _mm_srli_epi32( a, imm ) (__m128i)vshrq_n_u32((uint32x4_t)a, imm)
+
+// Based on SIMDe
+// a:    vector of four 32-bit lanes, treated as unsigned for the shift.
+// imm8: shift count in bits, applied to every lane. Counts above 31 are treated as 0
+//       on both code paths (a is returned unchanged); embree relies on this.
+// The 32-bit ARM path shifts unsigned lanes so that zeros, not copies of the sign bit,
+// are shifted in, matching the AArch64 path.
+FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, const int imm8)
+{
+#if defined(__aarch64__)
+    const int shift = (imm8 > 31) ? 0 : imm8;  // Unfortunately, we need to check for this case for embree.
+    const int32x4_t s = vdupq_n_s32(-shift);
+    return vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(a), s));
+#else
+  uint32_t __attribute__((aligned(16))) data[4];
+  vst1q_u32(data, vreinterpretq_u32_s32(a));
+
+  const int s = (imm8 > 31) ? 0 : imm8;
+
+  for (int lane = 0; lane < 4; lane++)
+    data[lane] >>= s;
+
+  return vreinterpretq_s32_u32(vld1q_u32(data));
+#endif
+}
+
+
+// Shifts the 4 signed 32 - bit integers in a right by count bits while shifting in the sign bit.  https://msdn.microsoft.com/en-us/library/z1939387(v=vs.100).aspx
+//#define _mm_srai_epi32( a, imm ) vshrq_n_s32(a, imm)
+
+// Based on SIMDe
+// a:    vector of four signed 32-bit lanes.
+// imm8: shift count in bits, applied to every lane.
+// The 32-bit ARM path clamps the count to 31, so a count of 0 returns a unchanged and
+// counts above 31 fill each lane with its sign bit; no shift by 32 or more bits is
+// ever evaluated.
+FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, const int imm8)
+{
+#if defined(__aarch64__)
+    const int32x4_t s = vdupq_n_s32(-imm8);
+    return vshlq_s32(a, s);
+#else
+  int32_t __attribute__((aligned(16))) data[4];
+  vst1q_s32(data, a);
+  const int s = (imm8 > 31) ? 31 : imm8;
+
+  for (int i = 0; i < 4; i++) {
+    // For negative lanes shift the (non-negative) complement and complement back, which
+    // shifts in ones without depending on how >> treats negative values.
+    data[i] = (data[i] < 0) ? ~((~data[i]) >> s) : (data[i] >> s);
+  }
+
+  return vld1q_s32(data);
+#endif
+}
+
+// Shifts the 128 - bit value in a right by imm bytes while shifting in zeros.imm must be an immediate. https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
+// The macro argument a and the whole expansion are parenthesized so that expression
+// arguments and surrounding operators bind as expected.
+// NOTE(review): vextq_s8 presumably accepts byte offsets 0..15 only, so imm == 16 would not compile — confirm.
+//#define _mm_srli_si128( a, imm ) (__m128i)vmaxq_s8((int8x16_t)a, vextq_s8((int8x16_t)a, vdupq_n_s8(0), imm))
+#define _mm_srli_si128( a, imm ) ((__m128i)vextq_s8((int8x16_t)(a), vdupq_n_s8(0), (imm)))
+
+// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm must be an immediate.  https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
+// The macro argument a and the whole expansion are parenthesized so that expression
+// arguments and surrounding operators bind as expected.
+// NOTE(review): imm == 0 expands to a vextq_s8 byte offset of 16, which is presumably out of range — confirm.
+#define _mm_slli_si128( a, imm ) ((__m128i)vextq_s8(vdupq_n_s8(0), (int8x16_t)(a), 16 - (imm)))
+
+// NEON does not provide a version of this function, here is an article about some ways to repro the results.
+// http://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
+// Creates a 16-bit mask from the most significant bits of the 16 signed or unsigned 8-bit integers in a and zero extends the upper bits. https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
+// Each 8-byte half is reduced separately: keep only bit 7 of every byte, move it to bit
+// position k for byte k, then sum the eight bytes with three pairwise adds.
+FORCE_INLINE int _mm_movemask_epi8(__m128i _a)
+{
+  const int8_t __attribute__((aligned(16))) shifts[8] = { -7, -6, -5, -4, -3, -2, -1, 0 };
+  const uint8x8_t msb = vdup_n_u8(0x80);
+  const int8x8_t per_byte_shift = vld1_s8(shifts);
+  const uint8x16_t bytes = (uint8x16_t)_a;
+
+  uint8x8_t lo = vshl_u8(vand_u8(vget_low_u8(bytes), msb), per_byte_shift);
+  uint8x8_t hi = vshl_u8(vand_u8(vget_high_u8(bytes), msb), per_byte_shift);
+
+  // Three pairwise-add passes leave the sum of all eight bytes in element 0.
+  for (int pass = 0; pass < 3; ++pass) {
+    lo = vpadd_u8(lo, lo);
+    hi = vpadd_u8(hi, hi);
+  }
+
+  return ((hi[0] << 8) | (lo[0] & 0xFF));
+}
+
+
+// ******************************************
+// Math operations
+// ******************************************
+
+// Lane-wise a - b over the four single-precision values. https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
+{
+  const __m128 difference = vsubq_f32(a, b);
+  return difference;
+}
+
+// Subtracts the lowest single-precision value of b from the lowest value of a.
+// Only lane 0 of the result holds the difference; lanes 1-3 are copied from a, the same
+// scalar convention the other _ss helpers in this file (e.g. _mm_div_ss) follow.
+FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
+{
+  const float32x4_t difference = vsubq_f32(a, b);
+  return vsetq_lane_f32(vgetq_lane_f32(difference, 0), a, 0);
+}
+
+// Lane-wise a - b over the four 32-bit integers. https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
+{
+  const __m128i difference = vsubq_s32(a, b);
+  return difference;
+}
+
+// Lane-wise a + b over the four single-precision values. https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
+{
+  const __m128 sum = vaddq_f32(a, b);
+  return sum;
+}
+
+// adds the scalar single-precision floating point values of a and b.  https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
+// Only lane 0 of the result holds the sum; lanes 1-3 are copied from a bit-for-bit.
+// Copying the lanes (rather than adding 0.0f to them) keeps values such as -0.0f intact,
+// since -0.0f + 0.0f would produce +0.0f.
+FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
+{
+  const float32x4_t sum = vaddq_f32(a, b);
+
+  //the upper values in the result must be the remnants of <a>.
+  return vsetq_lane_f32(vgetq_lane_f32(sum, 0), a, 0);
+}
+
+// Lane-wise a + b over the four 32-bit integers. https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
+{
+  const __m128i sum = vaddq_s32(a, b);
+  return sum;
+}
+
+// Lane-wise a + b with both inputs viewed as eight 16-bit integers. https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
+{
+  const int16x8_t lhs = (int16x8_t)a;
+  const int16x8_t rhs = (int16x8_t)b;
+  return (__m128i)vaddq_s16(lhs, rhs);
+}
+
+// Lane-wise product of a and b viewed as eight 16-bit integers, computed with vmulq_s16. https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
+{
+  const int16x8_t lhs = (int16x8_t)a;
+  const int16x8_t rhs = (int16x8_t)b;
+  return (__m128i)vmulq_s16(lhs, rhs);
+}
+
+// Lane-wise product of the four 32-bit integers of a and b, computed with vmulq_s32. https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mullo_epi32 (__m128i a, __m128i b)
+{
+  const int32x4_t lhs = (int32x4_t)a;
+  const int32x4_t rhs = (int32x4_t)b;
+  return (__m128i)vmulq_s32(lhs, rhs);
+}
+
+// Lane-wise a * b over the four single-precision values. https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
+{
+  const __m128 product = vmulq_f32(a, b);
+  return product;
+}
+
+// Multiplies the lowest single-precision values of a and b.
+// Only lane 0 of the result holds the product; lanes 1-3 are copied from a, the same
+// scalar convention the other _ss helpers in this file (e.g. _mm_div_ss) follow.
+FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
+{
+  const float32x4_t product = vmulq_f32(a, b);
+  return vsetq_lane_f32(vgetq_lane_f32(product, 0), a, 0);
+}
+
+// Computes the approximations of reciprocals of the four single-precision, floating-point values of a. https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
+// BUILD_IOS builds use a vector divide (1.0f / in). All other builds start from the
+// vrecpeq_f32 estimate and refine it with two vrecpsq_f32 steps before returning.
+FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
+{
+#if defined(BUILD_IOS)
+  return vdivq_f32(vdupq_n_f32(1.0f),in);
+#else
+  // Get an initial estimate of 1/in.
+  float32x4_t reciprocal = vrecpeq_f32(in);
+
+  // Two refinement steps; each multiplies the estimate by the correction factor
+  // that vrecpsq_f32 derives from the input and the current estimate.
+  reciprocal = vmulq_f32(vrecpsq_f32(in, reciprocal), reciprocal);
+  reciprocal = vmulq_f32(vrecpsq_f32(in, reciprocal), reciprocal);
+  return reciprocal;
+#endif
+}
+
+// Scalar reciprocal: lane 0 of the result is lane 0 of _mm_rcp_ps(in); lanes 1-3 are copied from in.
+FORCE_INLINE __m128 _mm_rcp_ss(__m128 in)
+{
+  const float32x4_t all_lanes = _mm_rcp_ps(in);
+  return vsetq_lane_f32(vgetq_lane_f32(all_lanes, 0), in, 0);
+}
+
+// Lane-wise a / b over the four single-precision values. https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
+// BUILD_IOS builds use a vector divide. All other builds multiply a by a refined
+// reciprocal of b.
+FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
+{
+#if defined(BUILD_IOS) 
+  return vdivq_f32(a,b);
+#else
+  float32x4_t inv_b = _mm_rcp_ps(b);
+
+  // Four further refinement rounds on top of _mm_rcp_ps: two regular ones, one more
+  // because NEON's reciprocal estimation has less accuracy compared to SSE2's rcp,
+  // and another round for safety.
+  for (int round = 0; round < 4; ++round) {
+    inv_b = vmulq_f32(vrecpsq_f32(b, inv_b), inv_b);
+  }
+
+  return vmulq_f32(a, inv_b);
+#endif
+}
+
+// Scalar divide: lane 0 of the result is lane 0 of _mm_div_ps(a, b); lanes 1-3 are copied from a.  https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
+{
+  const float32x4_t all_lanes = _mm_div_ps(a, b);
+  return vsetq_lane_f32(vgetq_lane_f32(all_lanes, 0), a, 0);
+}
+
+// Computes the approximations of the reciprocal square roots of the four single-precision floating point values of in.  https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
+// Starts from the vrsqrteq_f32 estimate and applies four vrsqrtsq_f32 refinement rounds.
+FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
+{
+  float32x4_t estimate = vrsqrteq_f32(in);
+
+  // TODO: We must debug and ensure that rsqrt(0) and rsqrt(-0) yield proper values.
+  // Related code snippets can be found here: https://cpp.hotexamples.com/examples/-/-/vrsqrteq_f32/cpp-vrsqrteq_f32-function-examples.html
+  // If we adapt this function, we might be able to avoid special zero treatment in _mm_sqrt_ps
+
+  // Two regular rounds, one more to get better precision, and another round for safety.
+  for (int round = 0; round < 4; ++round) {
+    estimate = vmulq_f32(estimate, vrsqrtsq_f32(vmulq_f32(in, estimate), estimate));
+  }
+
+  return estimate;
+}
+
+// Scalar reciprocal square root: lane 0 of the result is lane 0 of _mm_rsqrt_ps(in); lanes 1-3 are copied from in.
+FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
+{
+  const __m128 all_lanes = _mm_rsqrt_ps(in);
+  return vsetq_lane_f32(vgetq_lane_f32(all_lanes, 0), in, 0);
+}
+
+
+// Computes the approximations of square roots of the four single-precision, floating-point values of a. https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
+// BUILD_IOS builds use vsqrtq_f32. All other builds compute in * (1 / sqrt(in)) using
+// _mm_rsqrt_ps, with lanes equal to zero handled separately.
+FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
+{
+#if defined(BUILD_IOS)
+  return vsqrtq_f32(in);
+#else
+  const float32x4_t zero = vdupq_n_f32(0.0f);
+  const uint32x4_t is_zero = vceqq_f32(in, zero);
+
+  // For in == 0 the reciprocal square root is garbage (vrsqrteq_f32(0) returns +inf),
+  // so those lanes get 0 instead and the final product is 0.
+  const __m128 inv_root = vbslq_f32(is_zero, zero, _mm_rsqrt_ps(in));
+
+  // sqrt(x) = x * (1 / sqrt(x))
+  return vmulq_f32(in, inv_root);
+#endif
+}
+
+// Scalar square root: lane 0 of the result is lane 0 of _mm_sqrt_ps(in); lanes 1-3 are copied from in.  https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
+{
+  const float32x4_t all_lanes = _mm_sqrt_ps(in);
+  return vsetq_lane_f32(vgetq_lane_f32(all_lanes, 0), in, 0);
+}
+
+
+// Lane-wise maximum of the four single-precision values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
+{
+#if USE_PRECISE_MINMAX_IMPLEMENTATION
+  // Select a where b < a, otherwise b.
+  const uint32x4_t a_is_larger = vcltq_f32(b,a);
+  return vbslq_f32(a_is_larger,a,b);
+#else
+  // Faster, but would give inconsistent rendering (e.g. holes, NaN pixels)
+  const __m128 larger = vmaxq_f32(a, b);
+  return larger;
+#endif
+}
+
+// Computes the minima of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
+{
+#if USE_PRECISE_MINMAX_IMPLEMENTATION
+  return vbslq_f32(vcltq_f32(a,b),a,b); // per lane: a where a < b, otherwise b
+#else
+  // Faster, but would give inconsistent rendering (e.g. holes, NaN pixels)
+  return vminq_f32(a, b);
+#endif
+}
+
+// Computes the maximum of the lowest single-precision values of a and b; the upper three lanes of the result come from a.  https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
+{
+  // Run the packed maximum, then insert only its lane 0 into a copy
+  // of a.
+  const float32x4_t packed_max = _mm_max_ps(a, b);
+  const float low = vgetq_lane_f32(packed_max, 0);
+  return vsetq_lane_f32(low, a, 0);
+}
+
+// Computes the minimum of the lowest single-precision values of a and b; the upper three lanes of the result come from a.  https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
+{
+  // Run the packed minimum on all four lanes first.
+  // Only lane 0 of that result is kept; it is inserted into a copy
+  // of a so that lanes 1-3 stay untouched.
+  const float32x4_t packed_min = _mm_min_ps(a, b);
+  const float low = vgetq_lane_f32(packed_min, 0);
+  return vsetq_lane_f32(low, a, 0);
+}
+
+// Computes the lane-wise minima of the 8 signed 16-bit integers from a and the 8 signed 16-bit integers from b (both operands are reinterpreted as int16x8_t). https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
+{
+  return (__m128i)vminq_s16((int16x8_t)a, (int16x8_t)b);
+}
+
+// epi versions of min/max
+// Computes the pairwise maximums of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b )
+{
+  return vmaxq_s32(a,b);
+}
+
+// Computes the pairwise minima of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b )
+{
+  return vminq_s32(a,b);
+}
+
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit integers from b and returns the upper 16 bits of each 32-bit product. The products are formed exactly, so -32768 * -32768 yields 0x4000 (a saturating doubling multiply would give 0x3FFF). https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
+{
+  const int32x4_t lo = vmull_s16(vget_low_s16((int16x8_t)a), vget_low_s16((int16x8_t)b)); // exact 32-bit products of lanes 0-3
+  const int32x4_t hi = vmull_s16(vget_high_s16((int16x8_t)a), vget_high_s16((int16x8_t)b)); // exact 32-bit products of lanes 4-7
+  return (__m128i)vcombine_s16(vshrn_n_s32(lo, 16), vshrn_n_s32(hi, 16)); // keep bits 31:16 of every product
+}
+
+// Horizontally adds adjacent pairs of single-precision, floating-point values: the result is { a0+a1, a2+a3, b0+b1, b2+b3 }.
+//https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
+FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b )
+{
+#if defined(__aarch64__)
+    return vpaddq_f32(a,b);
+#else
+// The 128-bit pairwise add (vpaddq_f32) is not available on this path,
+// so the result is assembled from two 64-bit pairwise adds:
+        //
+        // get two f32x2_t values from a
+        // do vpadd -> { a0+a1, a2+a3 }
+        // put result in low half of f32x4 result
+        //
+        // get two f32x2_t values from b
+        // do vpadd -> { b0+b1, b2+b3 }
+        // put result in high half of f32x4 result
+        //
+        // combine
+        return vcombine_f32( vpadd_f32( vget_low_f32(a), vget_high_f32(a) ), vpadd_f32( vget_low_f32(b), vget_high_f32(b) ) );
+#endif
+}
+
+// ******************************************
+// Compare operations
+// ******************************************
+
+// Compares for less than (a < b) per lane; the integer mask produced by the NEON compare is returned cast to __m128. https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
+{
+  return (__m128)vcltq_f32(a, b);
+}
+
+FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
+{
+  return (__m128) vmvnq_s32((__m128i)_mm_cmplt_ps(a,b)); // "not less than": bitwise NOT of the a < b mask
+}
+
+// Compares for greater than (a > b) per lane; the integer mask produced by the NEON compare is returned cast to __m128. https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
+{
+  return (__m128)vcgtq_f32(a, b);
+}
+
+FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
+{
+  return (__m128) _mm_cmpgt_ps(a,b); // NOTE(review): "not less or equal" is implemented as a > b rather than NOT(a <= b); presumably the two differ when a lane holds NaN - confirm this is intended
+}
+
+
+// Compares for greater than or equal (a >= b) per lane; the integer mask produced by the NEON compare is returned cast to __m128. https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
+{
+  return (__m128)vcgeq_f32(a, b);
+}
+
+// Compares for less than or equal (a <= b) per lane; the integer mask produced by the NEON compare is returned cast to __m128. https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
+{
+  return (__m128)vcleq_f32(a, b);
+}
+
+// Compares for equality (a == b) per lane; the integer mask produced by the NEON compare is returned cast to __m128. https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
+{
+  return (__m128)vceqq_f32(a, b);
+}
+
+// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for less than; the per-lane mask is returned as __m128i. https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
+{
+  return (__m128i)vcltq_s32(a, b);
+}
+
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
+{
+  return (__m128i) vceqq_s32(a,b); // per-lane equality mask of the four signed 32-bit integers
+}
+
+// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for greater than; the per-lane mask is returned as __m128i. https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
+{
+  return (__m128i)vcgtq_s32(a, b);
+}
+
+// Compares the four 32-bit floats in a and b to check if any values are NaN. Ordered compare between each value returns true for "orderable" and false for "not orderable" (NaN). https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx
+// see also:
+// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
+// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
+FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b )
+{
+  // Note: NEON does not have a builtin ordered compare.
+  // A value compares equal to itself only when it is not NaN, so test a == a and b == b
+  // and AND the two masks: a lane is set only when both inputs are orderable.
+  return (__m128) vreinterpretq_f32_u32( vandq_u32( vceqq_f32(a,a), vceqq_f32(b,b) ) );
+}
+
+// Tests a < b on the lowest single-precision values of a and b; the result is nonzero when the comparison holds and 0 otherwise. : https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx
+FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
+{
+  // The compare runs on all four lanes, but only the lane-0 mask
+  // is handed back to the caller.
+  const uint32x4_t lt_mask = vcltq_f32(a, b);
+  return vgetq_lane_u32(lt_mask, 0);
+}
+
+// Tests a > b on the lowest single-precision values of a and b; the result is nonzero when the comparison holds and 0 otherwise. : https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
+FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
+{
+  // The compare runs on all four lanes, but only the lane-0 mask
+  // is handed back to the caller.
+  const uint32x4_t gt_mask = vcgtq_f32(a, b);
+  return vgetq_lane_u32(gt_mask, 0);
+}
+
+// Tests a <= b on the lowest single-precision values of a and b; the result is nonzero when the comparison holds and 0 otherwise. : https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
+FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
+{
+  // The compare runs on all four lanes, but only the lane-0 mask
+  // is handed back to the caller.
+  const uint32x4_t le_mask = vcleq_f32(a, b);
+  return vgetq_lane_u32(le_mask, 0);
+}
+
+// Tests a >= b on the lowest single-precision values of a and b; the result is nonzero when the comparison holds and 0 otherwise. : https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
+FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
+{
+  // The compare runs on all four lanes, but only the lane-0 mask
+  // is handed back to the caller.
+  const uint32x4_t ge_mask = vcgeq_f32(a, b);
+  return vgetq_lane_u32(ge_mask, 0);
+}
+
+// Tests a == b on the lowest single-precision values of a and b; the result is nonzero when the comparison holds and 0 otherwise. : https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
+FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
+{
+  // The compare runs on all four lanes, but only the lane-0 mask
+  // is handed back to the caller.
+  const uint32x4_t eq_mask = vceqq_f32(a, b);
+  return vgetq_lane_u32(eq_mask, 0);
+}
+
+// Tests a != b on the lowest single-precision values of a and b; returns 1 when they differ and 0 when they compare equal. : https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
+FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
+{
+  // There is no direct "not equal" compare here: build the equality
+  // mask and report whether its lane 0 came back empty.
+  const uint32x4_t eq_mask = vceqq_f32(a, b);
+  return vgetq_lane_u32(eq_mask, 0) == 0;
+}
+
+// According to the documentation, the 'u' (unordered) compare intrinsics behave the same as the non-'u' versions, so each one is simply aliased to its _mm_comi* counterpart here.
+#define _mm_ucomilt_ss      _mm_comilt_ss
+#define _mm_ucomile_ss      _mm_comile_ss
+#define _mm_ucomigt_ss      _mm_comigt_ss
+#define _mm_ucomige_ss      _mm_comige_ss
+#define _mm_ucomieq_ss      _mm_comieq_ss
+#define _mm_ucomineq_ss     _mm_comineq_ss
+
+// ******************************************
+// Conversions
+// ******************************************
+
+// Converts the four single-precision, floating-point values of a to signed 32-bit integer values using truncation (rounding toward zero). https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
+{
+  return vcvtq_s32_f32(a);
+}
+
+// Converts the four signed 32-bit integer values of a to single-precision, floating-point values (a numeric conversion, not a bit cast). https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
+{
+  return vcvtq_f32_s32(a);
+}
+
+// Converts the four single-precision, floating-point values of a to signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
+// *NOTE*. The default rounding mode on SSE is 'round to even', which ArmV7 does not support!
+// It is supported on ARMv8 however.
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
+{
+#if 1 // always taken: convert with round-to-nearest, ties to even
+  return vcvtnq_s32_f32(a);
+#else // disabled fallback: add 0.5, subtract the sign bit (0 or 1), then truncate
+  __m128 half = vdupq_n_f32(0.5f);
+  const __m128 sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(a), 31)));
+  const __m128 aPlusHalf = vaddq_f32(a, half);
+  const __m128 aRound = vsubq_f32(aPlusHalf, sign);
+  return vcvtq_s32_f32(aRound);
+#endif
+}
+
+// Moves the least significant 32 bits of a (lane 0) to a 32-bit integer. https://msdn.microsoft.com/en-us/library/5z7a9642%28v=vs.90%29.aspx
+FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
+{
+  return vgetq_lane_s32(a, 0);
+}
+
+// Places the 32-bit integer a in the least significant 32 bits of an __m128i and zeroes the remaining bits. https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
+{
+  const __m128i zeros = vdupq_n_s32(0); // lanes 1-3 of the result stay zero
+  return vsetq_lane_s32(a, zeros, 0);
+}
+
+
+// Applies a type cast to reinterpret four 32-bit floating point values passed in as a 128-bit parameter as packed 32-bit integers. No value conversion is intended; only the type changes. https://msdn.microsoft.com/en-us/library/bb514099.aspx
+FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
+{
+#if defined(__aarch64__)
+    return (__m128i)a;
+#else
+  return *(const __m128i *)&a; // reinterpret through a pointer cast on the non-aarch64 path
+#endif
+}
+
+// Applies a type cast to reinterpret four 32-bit integers passed in as a 128-bit parameter as packed 32-bit floating point values. No value conversion is intended; only the type changes. https://msdn.microsoft.com/en-us/library/bb514029.aspx
+FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
+{
+#if defined(__aarch64__)
+    return (__m128)a;
+#else
+  return *(const __m128 *)&a; // reinterpret through a pointer cast on the non-aarch64 path
+#endif
+}
+
+// Loads a 128-bit value from p as four 32-bit integers. : https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
+FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
+{
+  return vld1q_s32((int32_t *)p);
+}
+
+FORCE_INLINE __m128d _mm_castps_pd(const __m128 a)
+{
+  return *(const __m128d *)&a; // reinterprets the storage of a as __m128d through a pointer cast
+}
+
+FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
+{
+  return *(const __m128d *)&a; // reinterprets the storage of a as __m128d through a pointer cast
+}
+// ******************************************
+// Miscellaneous Operations
+// ******************************************
+
+// Packs the 16 signed 16-bit integers from a and b into signed 8-bit integers with saturation; the values from a fill the low half of the result and those from b the high half. https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
+{
+  return (__m128i)vcombine_s8(vqmovn_s16((int16x8_t)a), vqmovn_s16((int16x8_t)b));
+}
+
+// Packs the 16 signed 16-bit integers from a and b into unsigned 8-bit integers with saturation; the values from a fill the low half of the result and those from b the high half. https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
+{
+  return (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)a), vqmovun_s16((int16x8_t)b));
+}
+
+// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers with saturation; the values from a fill the low half of the result and those from b the high half. https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
+{
+  return (__m128i)vcombine_s16(vqmovn_s32(a), vqmovn_s32(b));
+}
+
+// Interleaves the lower 8 bytes of a with the lower 8 bytes of b (signedness is irrelevant), starting with a's.  https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
+{
+  // Only the low 64 bits of each operand take part in the interleave.
+  const int8x8_t lo_a = (int8x8_t)vget_low_s16((int16x8_t)a);
+  const int8x8_t lo_b = (int8x8_t)vget_low_s16((int16x8_t)b);
+  // The zip produces the interleaved data as two 64-bit halves.
+  const int8x8x2_t zipped = vzip_s8(lo_a, lo_b);
+  return (__m128i)vcombine_s8(zipped.val[0], zipped.val[1]);
+}
+
+// Interleaves the lower 4 16-bit integers of a with the lower 4 16-bit integers of b (signedness is irrelevant), starting with a's.  https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
+{
+  // Only the low 64 bits of each operand take part in the interleave.
+  const int16x4_t lo_a = vget_low_s16((int16x8_t)a);
+  const int16x4_t lo_b = vget_low_s16((int16x8_t)b);
+  // The zip produces the interleaved data as two 64-bit halves.
+  const int16x4x2_t zipped = vzip_s16(lo_a, lo_b);
+  return (__m128i)vcombine_s16(zipped.val[0], zipped.val[1]);
+}
+
+// Interleaves the lower 2 32-bit integers of a with the lower 2 32-bit integers of b (signedness is irrelevant), starting with a's.  https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
+{
+  // Only the low 64 bits of each operand take part in the interleave.
+  const int32x2_t lo_a = vget_low_s32(a);
+  const int32x2_t lo_b = vget_low_s32(b);
+  // The zip produces the interleaved data as two 64-bit halves.
+  const int32x2x2_t zipped = vzip_s32(lo_a, lo_b);
+  return vcombine_s32(zipped.val[0], zipped.val[1]);
+}
+
+// Interleaves the lower two single-precision, floating-point values of a and b, starting with a's. https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
+{
+  const float32x2x2_t zipped = vzip_f32(vget_low_f32(a), vget_low_f32(b)); // interleaved data as two 64-bit halves
+  return vcombine_f32(zipped.val[0], zipped.val[1]);
+}
+
+// Interleaves the upper two single-precision, floating-point values of a and b, starting with a's. https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
+{
+  const float32x2x2_t zipped = vzip_f32(vget_high_f32(a), vget_high_f32(b)); // interleaved data as two 64-bit halves
+  return vcombine_f32(zipped.val[0], zipped.val[1]);
+}
+
+// Interleaves the upper 8 bytes of a with the upper 8 bytes of b (signedness is irrelevant), starting with a's.  https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
+{
+  // Only the high 64 bits of each operand take part in the interleave.
+  const int8x8_t hi_a = (int8x8_t)vget_high_s16((int16x8_t)a);
+  const int8x8_t hi_b = (int8x8_t)vget_high_s16((int16x8_t)b);
+  // The zip produces the interleaved data as two 64-bit halves.
+  const int8x8x2_t zipped = vzip_s8(hi_a, hi_b);
+  return (__m128i)vcombine_s8(zipped.val[0], zipped.val[1]);
+}
+
+// Interleaves the upper 4 16-bit integers of a with the upper 4 16-bit integers of b (signedness is irrelevant), starting with a's.  https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
+{
+  // Only the high 64 bits of each operand take part in the interleave.
+  const int16x4_t hi_a = vget_high_s16((int16x8_t)a);
+  const int16x4_t hi_b = vget_high_s16((int16x8_t)b);
+  // The zip produces the interleaved data as two 64-bit halves.
+  const int16x4x2_t zipped = vzip_s16(hi_a, hi_b);
+  return (__m128i)vcombine_s16(zipped.val[0], zipped.val[1]);
+}
+
+// Interleaves the upper 2 32-bit integers of a with the upper 2 32-bit integers of b (signedness is irrelevant), starting with a's.  https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
+{
+  // Only the high 64 bits of each operand take part in the interleave.
+  const int32x2_t hi_a = vget_high_s32(a);
+  const int32x2_t hi_b = vget_high_s32(b);
+  // The zip produces the interleaved data as two 64-bit halves.
+  const int32x2x2_t zipped = vzip_s32(hi_a, hi_b);
+  return vcombine_s32(zipped.val[0], zipped.val[1]);
+}
+
+// Extracts the selected signed or unsigned 16-bit integer from a and zero extends (the lane is read as unsigned so that no sign extension takes place).  https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
+#define _mm_extract_epi16( a, imm ) vgetq_lane_u16((uint16x8_t)(a), (imm))
+
+// ******************************************
+// Streaming Extensions
+// ******************************************
+
+// Guarantees that every preceding store is globally visible before any subsequent store. Implemented here with the compiler's __sync_synchronize() barrier builtin.  https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_sfence(void)
+{
+  __sync_synchronize();
+}
+
+// On x86 this stores the data in a to the address p without polluting the caches (address p must be 16-byte aligned). This implementation performs an ordinary store through p; no non-temporal hint is issued.  https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
+{
+  *p = a;
+}
+
+// On x86 the cache line containing p is flushed and invalidated from all caches in the coherency domain. This implementation is an empty stub: p is ignored and nothing is flushed. : https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
+FORCE_INLINE void _mm_clflush(void const*p)
+{
+  // no equivalent operation is provided for NEON here
+}
+
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t a, int64_t b)
+{
+  // x86 lists the most significant element first, so b goes to lane 0 and a to lane 1.
+  int64_t __attribute__((aligned(16))) lanes[2] = { b, a };
+  return (__m128i)vld1q_s64(lanes);
+}
+
+FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
+{
+  return (__m128i)vmovq_n_s64(_i); // broadcasts _i into both 64-bit lanes
+}
+
+#if defined(__aarch64__)
+FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 c)
+{
+    const uint32x4_t sign_mask = uint32x4_t(vshrq_n_s32(__m128i(c),31)); // arithmetic shift smears each lane's sign bit over the whole lane
+    return vbslq_f32(sign_mask, b, a); // sign bit of c set -> take b, otherwise take a
+}
+
+FORCE_INLINE __m128i _mm_load4epu8_epi32(__m128i *ptr)
+{
+    // Eight bytes are read from ptr, but only the first four survive the zero-extension to 32 bits.
+    const uint16x8_t wide16 = vmovl_u8(vld1_u8((uint8_t*)ptr));
+    const uint32x4_t wide32 = vmovl_u16(vget_low_u16(wide16));
+    return vreinterpretq_s32_u32(wide32);
+}
+
+FORCE_INLINE __m128i _mm_load4epu16_epi32(__m128i *ptr)
+{
+    // Eight 16-bit values are read from ptr, but only the first four are zero-extended and returned.
+    const uint16x4_t first4 = vget_low_u16(vld1q_u16((uint16_t*)ptr));
+    return vreinterpretq_s32_u32(vmovl_u16(first4));
+}
+
+FORCE_INLINE __m128i _mm_load4epi8_f32(__m128i *ptr)
+{
+    // Sign-extend the first four bytes at ptr to 32 bits, convert them to float,
+    // and hand the float bit patterns back in an integer-typed vector.
+    const int16x8_t wide16 = vmovl_s8(vld1_s8((int8_t*)ptr));
+    const int32x4_t wide32 = vmovl_s16(vget_low_s16(wide16));
+    return vreinterpretq_s32_f32(vcvtq_f32_s32(wide32));
+}
+
+FORCE_INLINE __m128i _mm_load4epu8_f32(__m128i *ptr)
+{
+    // Zero-extend the first four bytes at ptr to 32 bits, convert them to float, and return the float bit patterns in an integer-typed vector (the conversion step was missing, unlike in the other *_f32 loaders).
+    uint16x8_t  t1 = vmovl_u8(vld1_u8((uint8_t*)ptr));
+    uint32x4_t  t2 = vmovl_u16(vget_low_u16(t1));
+    return vreinterpretq_s32_f32(vcvtq_f32_u32(t2));
+}
+
+FORCE_INLINE __m128i _mm_load4epi16_f32(__m128i *ptr)
+{
+    // Eight 16-bit values are read from ptr; the first four are sign-extended, converted to float,
+    // and their float bit patterns are returned in an integer-typed vector.
+    const int32x4_t wide32 = vmovl_s16(vget_low_s16(vld1q_s16((int16_t*)ptr)));
+    return vreinterpretq_s32_f32(vcvtq_f32_s32(wide32));
+}
+
+FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
+{
+    return (__m128i)vcombine_u16(vqmovun_s32(a), vqmovun_s32(b)); // saturate each signed 32-bit lane to unsigned 16 bits; a fills the low half, b the high half
+}
+
+FORCE_INLINE __m128i _mm_stream_load_si128(__m128i* ptr)
+{
+    // No non-temporal load on a single register on ARM, so this is an ordinary 16-byte load.
+    return vreinterpretq_s32_u8(vld1q_u8((uint8_t*)ptr));
+}
+
+FORCE_INLINE void _mm_stream_ps(float* ptr, __m128i a)
+{
+    // No non-temporal store on a single register on ARM, so this is an ordinary store. Note that a arrives as __m128i and is reinterpreted as four floats.
+    vst1q_f32((float*)ptr, vreinterpretq_f32_s32(a));
+}
+
+FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_s32_u32(vminq_u32(vreinterpretq_u32_s32(a), vreinterpretq_u32_s32(b))); // lane-wise minimum with the lanes treated as unsigned 32-bit integers
+}
+
+FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_s32_u32(vmaxq_u32(vreinterpretq_u32_s32(a), vreinterpretq_u32_s32(b))); // lane-wise maximum with the lanes treated as unsigned 32-bit integers
+}
+
+FORCE_INLINE __m128 _mm_abs_ps(__m128 a)
+{
+    return vabsq_f32(a); // lane-wise absolute value of the four floats
+}
+
+FORCE_INLINE __m128 _mm_madd_ps(__m128 a, __m128 b, __m128 c)
+{
+    return vmlaq_f32(c, a, b); // multiply-accumulate: c + a * b per lane
+}
+
+FORCE_INLINE __m128 _mm_msub_ps(__m128 a, __m128 b, __m128 c)
+{
+    return vmlsq_f32(c, a, b); // multiply-subtract: c - a * b per lane (note the operand order: this is not a * b - c)
+}
+
+FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
+{
+  return vabsq_s32(a); // lane-wise absolute value of the four signed 32-bit integers
+}
+#endif  //defined(__aarch64__)
+
+// Counts the bits set to 1 in the unsigned 32-bit integer a and returns that count.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
+FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
+{
+  const uint8x8_t bits_per_byte = vcnt_u8(vcreate_u8((uint64_t)a)); // a is zero-extended into a 64-bit vector first
+  return (int)vaddlv_u8(bits_per_byte); // sum of the eight per-byte counts
+}
+
+// Counts the bits set to 1 in the unsigned 64-bit integer a and returns that count.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
+FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
+{
+  const uint8x8_t bits_per_byte = vcnt_u8(vcreate_u8(a)); // one bit count per byte of a
+  return (int64_t)vaddlv_u8(bits_per_byte); // sum of the eight per-byte counts
+}
+
+#endif
diff --git a/thirdparty/embree-aarch64/common/math/affinespace.h b/thirdparty/embree-aarch64/common/math/affinespace.h
new file mode 100644
index 0000000000..32452fbe72
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/affinespace.h
@@ -0,0 +1,361 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "linearspace2.h"
+#include "linearspace3.h"
+#include "quaternion.h"
+#include "bbox.h"
+#include "vec4.h"
+
+namespace embree
+{
+  #define VectorT typename L::Vector /* vector type of the linear space L; only meaningful inside templates that have a type parameter named L */
+  #define ScalarT typename L::Vector::Scalar /* scalar type of that vector type */
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Affine Space
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L>
+    struct AffineSpaceT /* affine map: a linear part l of type L plus an offset p */
+    {
+      L l;           /*< linear part of affine space */
+      VectorT p;     /*< affine part of affine space (the offset added after the linear part, see xfmPoint) */
+
+      ////////////////////////////////////////////////////////////////////////////////
+      // Constructors, Assignment, Cast, Copy Operations
+      ////////////////////////////////////////////////////////////////////////////////
+
+      __forceinline AffineSpaceT           ( )                           { } // l and p are default-constructed
+      __forceinline AffineSpaceT           ( const AffineSpaceT& other ) { l = other.l; p = other.p; }
+      __forceinline AffineSpaceT           ( const L           & other ) { l = other  ; p = VectorT(zero); } // linear map with a zero offset
+      __forceinline AffineSpaceT& operator=( const AffineSpaceT& other ) { l = other.l; p = other.p; return *this; }
+
+      __forceinline AffineSpaceT( const VectorT& vx, const VectorT& vy, const VectorT& vz, const VectorT& p ) : l(vx,vy,vz), p(p) {} // three vectors for l, then the offset
+      __forceinline AffineSpaceT( const L& l, const VectorT& p ) : l(l), p(p) {}
+
+      template<typename L1> __forceinline AffineSpaceT( const AffineSpaceT<L1>& s ) : l(s.l), p(s.p) {} // converts from an affine space over another linear-space type
+
+      ////////////////////////////////////////////////////////////////////////////////
+      // Constants
+      ////////////////////////////////////////////////////////////////////////////////
+
+      __forceinline AffineSpaceT( ZeroTy ) : l(zero), p(zero) {}
+      __forceinline AffineSpaceT( OneTy )  : l(one),  p(zero) {}
+
+      /*! return matrix for scaling (built from L::scale, so the offset is zero) */
+      static __forceinline AffineSpaceT scale(const VectorT& s) { return L::scale(s); }
+
+      /*! return matrix for translation (linear part is one, offset is p) */
+      static __forceinline AffineSpaceT translate(const VectorT& p) { return AffineSpaceT(one,p); }
+
+      /*! return matrix for rotation, only in 2D (built from L::rotate, so the offset is zero) */
+      static __forceinline AffineSpaceT rotate(const ScalarT& r) { return L::rotate(r); }
+
+      /*! return matrix for rotation around arbitrary point (2D) or axis (3D) */
+      static __forceinline AffineSpaceT rotate(const VectorT& u, const ScalarT& r) { return L::rotate(u,r); }
+
+      /*! return matrix for rotation around arbitrary axis and point, only in 3D: translate by -p, rotate about u, translate back by +p */
+      static __forceinline AffineSpaceT rotate(const VectorT& p, const VectorT& u, const ScalarT& r) { return translate(+p) * rotate(u,r) * translate(-p);  }
+
+      /*! return matrix for looking at given point, only in 3D: Z points from eye towards point, U and V are derived from up by cross products, and eye becomes the offset */
+      static __forceinline AffineSpaceT lookat(const VectorT& eye, const VectorT& point, const VectorT& up) {
+        VectorT Z = normalize(point-eye);
+        VectorT U = normalize(cross(up,Z));
+        VectorT V = normalize(cross(Z,U));
+        return AffineSpaceT(L(U,V,Z),eye);
+      }
+
+    };
+  
+  // template specialization to get correct identity matrix for type AffineSpace3ff (the fourth component of p is initialized to 1 instead of 0)
+  template<>
+    __forceinline AffineSpaceT<LinearSpace3ff>::AffineSpaceT( OneTy )  : l(one),  p(0.f, 0.f, 0.f, 1.f) {}
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Unary Operators: component-wise negation, identity, and rcp (the inverse map, with linear part rcp(l) and offset -(rcp(l)*p))
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> __forceinline AffineSpaceT<L> operator -( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(-a.l,-a.p); }
+  template<typename L> __forceinline AffineSpaceT<L> operator +( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(+a.l,+a.p); }
+  template<typename L> __forceinline AffineSpaceT<L>        rcp( const AffineSpaceT<L>& a ) { L il = rcp(a.l); return AffineSpaceT<L>(il,-(il*a.p)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Binary Operators: + and - act component-wise on (l, p); a * b composes the maps (b is applied first); a / b is a * rcp(b)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> __forceinline const AffineSpaceT<L> operator +( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l+b.l,a.p+b.p); }
+  template<typename L> __forceinline const AffineSpaceT<L> operator -( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l-b.l,a.p-b.p); }
+
+  template<typename L> __forceinline const AffineSpaceT<L> operator *( const ScalarT        & a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a*b.l,a*b.p); }
+  template<typename L> __forceinline const AffineSpaceT<L> operator *( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l*b.l,a.l*b.p+a.p); }
+  template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a * rcp(b); }
+  template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const ScalarT        & b ) { return rcp(b) * a; } // scalar goes on the left: only scalar * AffineSpaceT is defined
+
+  template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a * b; }
+  template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const ScalarT        & b ) { return a = b * a; } // scalar goes on the left: only scalar * AffineSpaceT is defined
+  template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a / b; }
+  template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const ScalarT        & b ) { return a = a / b; }
+
+  template<typename L> __forceinline VectorT xfmPoint (const AffineSpaceT<L>& m, const VectorT& p) { return madd(VectorT(p.x),m.l.vx,madd(VectorT(p.y),m.l.vy,madd(VectorT(p.z),m.l.vz,m.p))); } // p.x*vx + p.y*vy + p.z*vz + m.p: the only one of the three that applies the offset
+  template<typename L> __forceinline VectorT xfmVector(const AffineSpaceT<L>& m, const VectorT& v) { return xfmVector(m.l,v); } // uses the linear part only
+  template<typename L> __forceinline VectorT xfmNormal(const AffineSpaceT<L>& m, const VectorT& n) { return xfmNormal(m.l,n); } // uses the linear part only
+
+  __forceinline const BBox<Vec3fa> xfmBounds(const AffineSpaceT<LinearSpace3<Vec3fa> >& m, const BBox<Vec3fa>& b)
+  {
+    // Transform all eight corners of b and accumulate the box that contains them.
+    BBox3fa dst = empty;
+    for (int corner = 0; corner < 8; corner++) {
+      // bit 2 selects x, bit 1 selects y, bit 0 selects z (0 = lower, 1 = upper)
+      const float x = (corner & 4) ? b.upper.x : b.lower.x;
+      const float y = (corner & 2) ? b.upper.y : b.lower.y;
+      const float z = (corner & 1) ? b.upper.z : b.lower.z;
+      dst.extend(xfmPoint(m,Vec3fa(x,y,z)));
+    }
+    return dst;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators: two affine spaces are equal when both their linear parts and their offsets compare equal
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> __forceinline bool operator ==( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l == b.l && a.p == b.p; }
+  template<typename L> __forceinline bool operator !=( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l != b.l || a.p != b.p; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select: forwards the mask s to select() on the linear parts and on the offsets of t and f
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> __forceinline AffineSpaceT<L> select ( const typename L::Vector::Scalar::Bool& s, const AffineSpaceT<L>& t, const AffineSpaceT<L>& f ) {
+    return AffineSpaceT<L>(select(s,t.l,f.l),select(s,t.p,f.p));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Output Operators: writes an affine space as "{ l = <linear part>, p = <offset> }"
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> static embree_ostream operator<<(embree_ostream cout, const AffineSpaceT<L>& m) {
+    return cout << "{ l = " << m.l << ", p = " << m.p << " }";
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Template Instantiations: short names for the affine spaces over the common linear-space types
+  ////////////////////////////////////////////////////////////////////////////////
+
+  typedef AffineSpaceT<LinearSpace2f> AffineSpace2f;
+  typedef AffineSpaceT<LinearSpace3f> AffineSpace3f;
+  typedef AffineSpaceT<LinearSpace3fa> AffineSpace3fa;
+  typedef AffineSpaceT<LinearSpace3fx> AffineSpace3fx;
+  typedef AffineSpaceT<LinearSpace3ff> AffineSpace3ff;
+  typedef AffineSpaceT<Quaternion3f > OrthonormalSpace3f; // the "linear part" is a quaternion here rather than a LinearSpace type
+
+  template<int N> using AffineSpace3vf = AffineSpaceT<LinearSpace3<Vec3<vfloat<N>>>>; // Vec3 of N-wide vfloat
+  typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<4>>>>  AffineSpace3vf4;
+  typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<8>>>>  AffineSpace3vf8;
+  typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<16>>>> AffineSpace3vf16;
+
+  template<int N> using AffineSpace3vff = AffineSpaceT<LinearSpace3<Vec4<vfloat<N>>>>; // Vec4 of N-wide vfloat
+  typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<4>>>>  AffineSpace3vfa4;
+  typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<8>>>>  AffineSpace3vfa8;
+  typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<16>>>> AffineSpace3vfa16;
+
+  //////////////////////////////////////////////////////////////////////////////
+  /// Interpolation: lerp applies lerp() separately to the linear parts and to the offsets of M0 and M1
+  //////////////////////////////////////////////////////////////////////////////
+  template<typename T, typename R>
+  __forceinline AffineSpaceT<T> lerp(const AffineSpaceT<T>& M0,
+                                     const AffineSpaceT<T>& M1,
+                                     const R& t)
+  {
+    return AffineSpaceT<T>(lerp(M0.l,M1.l,t),lerp(M0.p,M1.p,t));
+  }
+
+  // slerp interprets the 16 floats of the matrix M = D * R * S as components of
+  // three matrices (D, R, S) that are interpolated individually.
+  template<typename T> __forceinline AffineSpaceT<LinearSpace3<Vec3<T>>>
+  slerp(const AffineSpaceT<LinearSpace3<Vec4<T>>>& M0,
+        const AffineSpaceT<LinearSpace3<Vec4<T>>>& M1,
+        const T& t)
+  {
+    QuaternionT<T> q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w); // the quaternion is stored in the four w components
+    QuaternionT<T> q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w);
+    QuaternionT<T> q = slerp(q0, q1, t); // quaternion slerp for the rotation
+
+    AffineSpaceT<LinearSpace3<Vec3<T>>> S = lerp(M0, M1, t); // everything else is interpolated linearly
+    AffineSpaceT<LinearSpace3<Vec3<T>>> D(one);
+    D.p.x = S.l.vx.y; // three entries of the interpolated linear part become the offset of D ...
+    D.p.y = S.l.vx.z;
+    D.p.z = S.l.vy.z;
+    S.l.vx.y = 0; // ... and are then cleared in S
+    S.l.vx.z = 0;
+    S.l.vy.z = 0;
+
+    AffineSpaceT<LinearSpace3<Vec3<T>>> R = LinearSpace3<Vec3<T>>(q); // rotation built from the interpolated quaternion
+    return D * R * S;
+  }
+
+  // Vec3fa flavour of slerp, written out separately because Vec3fa does
+  // not play along nicely with the other templated Vec3/Vec4 types
+  __forceinline AffineSpace3fa slerp(const AffineSpace3ff& M0,
+                                     const AffineSpace3ff& M1,
+                                     const float& t)
+  {
+    // rotation: quaternion slerp on the quaternions held in the w components
+    Quaternion3f rot0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w);
+    Quaternion3f rot1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w);
+    Quaternion3f rot = slerp(rot0, rot1, t);
+    // everything else: linear interpolation
+    AffineSpace3fa interp = lerp(M0, M1, t);
+    AffineSpace3fa offset(one);
+    // three entries of the interpolated linear part are moved into the
+    // translation of offset and cleared at their original position
+    offset.p.x = interp.l.vx.y;  interp.l.vx.y = 0;
+    offset.p.y = interp.l.vx.z;  interp.l.vx.z = 0;
+    offset.p.z = interp.l.vy.z;  interp.l.vy.z = 0;
+    // recombine the three factors
+    AffineSpace3fa rotation = LinearSpace3fa(rot);
+    return offset * rotation * interp;
+  }
+  
+  __forceinline AffineSpace3fa quaternionDecompositionToAffineSpace(const AffineSpace3ff& qd)
+  {
+    /* Rebuilds the affine transform D * R * M from a quaternion decomposition:
+     * R is the rotation of the quaternion held in the w lanes, D starts as
+     * AffineSpace3fa(one) and receives vx.y, vx.z and vy.z as its offset, and
+     * M is the converted copy of qd with those three entries cleared. */
+    Quaternion3f rot(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w);
+    AffineSpace3fa M = qd;
+    AffineSpace3fa D(one);
+    D.p.x = M.l.vx.y; M.l.vx.y = 0;
+    D.p.y = M.l.vx.z; M.l.vx.z = 0;
+    D.p.z = M.l.vy.z; M.l.vy.z = 0;
+    AffineSpace3fa R = LinearSpace3fa(rot);
+    return D * R * M;
+  }
+  
+  __forceinline void quaternionDecomposition(const AffineSpace3ff& qd, Vec3fa& T, Quaternion3f& q, AffineSpace3fa& S)
+  {
+    q = Quaternion3f(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w); // quaternion is read from the w lanes of p, vx, vy, vz
+    S = qd;                                                     // S starts as a converted copy of qd
+    T.x = qd.l.vx.y; // T is read from three entries of the linear part (only x, y, z of T are written) ...
+    T.y = qd.l.vx.z;
+    T.z = qd.l.vy.z;
+    S.l.vx.y = 0;    // ... and exactly those entries are cleared in S
+    S.l.vx.z = 0;
+    S.l.vy.z = 0;
+  }
+
+  __forceinline AffineSpace3fx quaternionDecomposition(Vec3fa const& T, Quaternion3f const& q, AffineSpace3fa const& S)
+  {
+    AffineSpace3ff M = S;
+    /* the quaternion is packed into the w lanes of p, vx, vy and vz */
+    M.p.w    = q.r; M.l.vx.w = q.i;
+    M.l.vy.w = q.j; M.l.vz.w = q.k;
+    /* T is packed into the entries vx.y, vx.z and vy.z of the linear part */
+    M.l.vx.y = T.x;
+    M.l.vx.z = T.y;
+    M.l.vy.z = T.z;
+    return M;
+  }
+
+  struct __aligned(16) QuaternionDecomposition // 16 named floats of a quaternion decomposition (presumably 16-byte aligned via __aligned - confirm macro)
+  {
+    float scale_x = 1.f;       // scale defaults to 1 on every axis
+    float scale_y = 1.f;
+    float scale_z = 1.f;
+    float skew_xy = 0.f;       // skew defaults to 0
+    float skew_xz = 0.f;
+    float skew_yz = 0.f;
+    float shift_x = 0.f;       // shift defaults to 0
+    float shift_y = 0.f;
+    float shift_z = 0.f;
+    float quaternion_r = 1.f;  // quaternion defaults to (r,i,j,k) = (1,0,0,0)
+    float quaternion_i = 0.f;
+    float quaternion_j = 0.f;
+    float quaternion_k = 0.f;
+    float translation_x = 0.f; // translation defaults to 0
+    float translation_y = 0.f;
+    float translation_z = 0.f;
+  };
+
+  /*! copies the 16 entries of a quaternion decomposition matrix into named fields */
+  __forceinline QuaternionDecomposition quaternionDecomposition(AffineSpace3ff const& M)
+  {
+    QuaternionDecomposition out;
+    /* diagonal of the linear part */
+    out.scale_x = M.l.vx.x; out.scale_y = M.l.vy.y; out.scale_z = M.l.vz.z;
+
+    /* entries vy.x, vz.x and vz.y */
+    out.skew_xy = M.l.vy.x; out.skew_xz = M.l.vz.x; out.skew_yz = M.l.vz.y;
+
+    /* x, y, z of the offset p */
+    out.shift_x = M.p.x; out.shift_y = M.p.y; out.shift_z = M.p.z;
+
+    /* entries vx.y, vx.z and vy.z */
+    out.translation_x = M.l.vx.y; out.translation_y = M.l.vx.z; out.translation_z = M.l.vy.z;
+
+    /* w lanes of p, vx, vy and vz */
+    out.quaternion_r = M.p.w;    out.quaternion_i = M.l.vx.w;
+    out.quaternion_j = M.l.vy.w; out.quaternion_k = M.l.vz.w;
+    return out;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /*
+   * ! Template Specialization for 2D: return matrix for rotation around point
+   * (rotation around arbitrary vector is not meaningful in 2D)
+   */
+  template<> __forceinline AffineSpace2f AffineSpace2f::rotate(const Vec2f& p, const float& r)
+  {
+    const AffineSpace2f spin = AffineSpace2f(LinearSpace2f::rotate(r)); // the plain rotation by r, without any offset
+    return translate(+p)*spin*translate(-p);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Similarity Transform
+  //
+  // checks, if M is a similarity transformation, i.e if there exists a factor D
+  // such that for all x,y: distance(Mx, My) = D * distance(x, y)
+  ////////////////////////////////////////////////////////////////////////////////
+  __forceinline bool similarityTransform(const AffineSpace3fa& M, float* D)
+  {
+    const float tolerance = 1e-5f;
+    if (D) *D = 0.f; // the reported factor stays 0 unless all checks pass
+    /* the three axes of the linear part have to be pairwise orthogonal ... */
+    if (abs(dot(M.l.vx, M.l.vy)) > tolerance ||
+        abs(dot(M.l.vx, M.l.vz)) > tolerance ||
+        abs(dot(M.l.vy, M.l.vz)) > tolerance) return false;
+    /* ... and their squared lengths have to agree */
+    const float len2_x = dot(M.l.vx, M.l.vx);
+    const float len2_y = dot(M.l.vy, M.l.vy);
+    const float len2_z = dot(M.l.vz, M.l.vz);
+    if (abs(len2_x - len2_y) > tolerance ||
+        abs(len2_x - len2_z) > tolerance ||
+        abs(len2_y - len2_z) > tolerance) return false;
+
+    if (D) *D = sqrtf(len2_x); // the factor is the length of the first axis
+    return true;
+  }
+
+  __forceinline void AffineSpace3fa_store_unaligned(const AffineSpace3fa &source, AffineSpace3fa* ptr)
+  {
+    Vec3fa::storeu(&ptr->l.vx, source.l.vx); // writes source to *ptr one Vec3fa at a time through Vec3fa::storeu:
+    Vec3fa::storeu(&ptr->l.vy, source.l.vy); // the three vectors of the linear part first ...
+    Vec3fa::storeu(&ptr->l.vz, source.l.vz);
+    Vec3fa::storeu(&ptr->p, source.p);       // ... then the offset; ptr presumably need not be 16-byte aligned (per the name) - confirm against storeu
+  }
+
+  __forceinline AffineSpace3fa AffineSpace3fa_load_unaligned(AffineSpace3fa* ptr)
+  {
+    AffineSpace3fa loaded; // filled one Vec3fa at a time through Vec3fa::loadu
+    loaded.l.vx = Vec3fa::loadu(&ptr->l.vx);
+    loaded.l.vy = Vec3fa::loadu(&ptr->l.vy);
+    loaded.l.vz = Vec3fa::loadu(&ptr->l.vz);
+    loaded.p    = Vec3fa::loadu(&ptr->p);
+    return loaded;
+  }
+
+  #undef VectorT
+  #undef ScalarT
+}
diff --git a/thirdparty/embree-aarch64/common/math/bbox.h b/thirdparty/embree-aarch64/common/math/bbox.h
new file mode 100644
index 0000000000..29bb13912b
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/bbox.h
@@ -0,0 +1,331 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec2.h"
+#include "vec3.h"
+
+namespace embree
+{
+  namespace internal {
+    /* halving helper used by the center() functions: generic types divide by T(2), float and double multiply by one half */
+    template <typename T> __forceinline T divideByTwo(const T& v) { return v / T(2); }
+    template <> __forceinline float divideByTwo<float>(const float& v) { return v * 0.5f; }
+    template <> __forceinline double divideByTwo<double>(const double& v) { return v * 0.5; }
+
+  } // namespace internal
+  template<typename T>
+  struct BBox // box described by a lower and an upper bound of type T
+  {
+    T lower, upper;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline BBox           ( )                   { } // lower/upper are not assigned here
+    template<typename T1>
+    __forceinline BBox           ( const BBox<T1>& other ) : lower(other.lower), upper(other.upper) {} // converts from a box over another bound type
+    __forceinline BBox& operator=( const BBox& other ) { lower = other.lower; upper = other.upper; return *this; }
+
+    __forceinline BBox ( const T& v                     ) : lower(v), upper(v) {} // box whose lower and upper bound are both v
+    __forceinline BBox ( const T& lower, const T& upper ) : lower(lower), upper(upper) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Extending Bounds (in place; both overloads return *this)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const BBox& extend(const BBox& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; }
+    __forceinline const BBox& extend(const T   & other) { lower = min(lower,other      ); upper = max(upper,other      ); return *this; }
+
+    /*! tests if box is empty, i.e. lower[i] > upper[i] for some component i < T::N */
+    __forceinline bool empty() const { for (int i=0; i<T::N; i++) if (lower[i] > upper[i]) return true; return false; }
+
+    /*! computes the size of the box (upper - lower) */
+    __forceinline T size() const { return upper - lower; }
+
+    /*! computes the center of the box as half of lower+upper */
+    __forceinline T center() const { return internal::divideByTwo<T>(lower+upper); }
+
+    /*! computes twice the center of the box */
+    __forceinline T center2() const { return lower+upper; }
+
+    /*! merges two boxes into a new box spanning both */
+    __forceinline static const BBox merge (const BBox& a, const BBox& b) {
+      return BBox(min(a.lower, b.lower), max(a.upper, b.upper));
+    }
+
+     /*! enlarge box by some scaling factor: each bound moves outwards by a times its own absolute value */
+    __forceinline BBox enlarge_by(const float a) const {
+      return BBox(lower - T(a)*abs(lower), upper + T(a)*abs(upper));
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline BBox( EmptyTy ) : lower(pos_inf), upper(neg_inf) {} // inverted bounds: lower = +inf, upper = -inf
+    __forceinline BBox( FullTy  ) : lower(neg_inf), upper(pos_inf) {} // widest bounds: lower = -inf, upper = +inf
+    __forceinline BBox( FalseTy ) : lower(pos_inf), upper(neg_inf) {} // same bounds as EmptyTy
+    __forceinline BBox( TrueTy  ) : lower(neg_inf), upper(pos_inf) {} // same bounds as FullTy
+    __forceinline BBox( NegInfTy ): lower(pos_inf), upper(neg_inf) {} // same bounds as EmptyTy
+    __forceinline BBox( PosInfTy ): lower(neg_inf), upper(pos_inf) {} // same bounds as FullTy
+  };
+
+  /*! scalar boxes are empty when the upper bound lies below the lower bound */
+  template<> __forceinline bool BBox<float>::empty() const
+  { return upper < lower; }
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+  template<> __forceinline bool BBox<Vec3fa>::empty() const { // mask-based test used in place of the generic per-component loop
+    return !all(le_mask(lower,upper)); // empty unless lower <= upper holds for everything le_mask/all test
+  }
+  template<> __forceinline bool BBox<Vec3fx>::empty() const { // same test for the Vec3fx bound type
+    return !all(le_mask(lower,upper));
+  }
+#endif
+
+  /*! tests if box is bounded: lower above -FLT_LARGE and upper below +FLT_LARGE */
+  __forceinline bool isvalid( const BBox<Vec3fa>& v ) {
+    return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)));
+  }
+
+  /*! same bound test as isvalid, and additionally requires lower <= upper (non-empty) */
+  __forceinline bool isvalid_non_empty( const BBox<Vec3fa>& v ) {
+    return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)) & le_mask(v.lower,v.upper));
+  }
+  
+  /*! tests if box has finite entries: is_finite must hold for both the lower and the upper bound */
+  __forceinline bool is_finite( const BBox<Vec3fa>& b) {
+    return is_finite(b.lower) && is_finite(b.upper);
+  }
+
+  /*! test if point contained in box; the comparison is inclusive (p >= lower and p <= upper) */
+  __forceinline bool inside ( const BBox<Vec3fa>& b, const Vec3fa& p ) { return all(ge_mask(p,b.lower) & le_mask(p,b.upper)); }
+
+  /*! center2 yields twice the center (lower+upper), center halves that sum */
+  template<typename T> __forceinline const T center2(const BBox<T>& box) { const T sum = box.lower + box.upper; return sum; }
+  template<typename T> __forceinline const T center (const BBox<T>& box) { const T doubled = center2(box); return internal::divideByTwo<T>(doubled); }
+
+  /*! computes the volume of a bounding box as the product of its extents; safeVolume yields 0 for empty boxes */
+  __forceinline float volume    ( const BBox<Vec3fa>& b ) { const Vec3fa extent = b.size(); return reduce_mul(extent); }
+  __forceinline float safeVolume( const BBox<Vec3fa>& b ) { return b.empty() ? 0.0f : volume(b); }
+
+  /*! computes the volume of a bounding box over Vec3f */
+  __forceinline float volume( const BBox<Vec3f>& b )  { const Vec3f extent = b.size(); return reduce_mul(extent); }
+
+  /*! computes the area of a 2D box (x extent times y extent) and, via halfArea of the extent, of a 3D box */
+  template<typename T> __forceinline const T area( const BBox<Vec2<T> >& b ) { const Vec2<T> extent = b.size(); return extent.x*extent.y; }
+
+  template<typename T> __forceinline const T halfArea( const BBox<Vec3<T> >& b ) { const Vec3<T> extent = b.size(); return halfArea(extent); }
+  template<typename T> __forceinline const T     area( const BBox<Vec3<T> >& b ) { const T half = halfArea(b); return T(2)*half; }
+
+  __forceinline float halfArea( const BBox<Vec3fa>& b ) { const Vec3fa extent = b.size(); return halfArea(extent); }
+  __forceinline float     area( const BBox<Vec3fa>& b ) { const float half = halfArea(b); return 2.0f*half; }
+
+  __forceinline float halfArea( const BBox<Vec3fx>& b ) { const Vec3fx extent = b.size(); return halfArea(extent); }
+  __forceinline float     area( const BBox<Vec3fx>& b ) { const float half = halfArea(b); return 2.0f*half; }
+
+  template<typename Vec> __forceinline float safeArea( const BBox<Vec>& b ) { if (b.empty()) return 0.0f; return area(b); }
+
+  /*! for a plain box the expected approximate half area is simply halfArea(box) */
+  template<typename T> __forceinline float expectedApproxHalfArea(const BBox<T>& box)
+  { return halfArea(box); }
+
+  /*! merges bounding boxes and points: component-wise min of the lower and max of the upper bounds */
+  template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const       T& b ) { const T lo = min(a.lower, b    ); const T hi = max(a.upper, b    ); return BBox<T>(lo, hi); }
+  template<typename T> __forceinline const BBox<T> merge( const       T& a, const BBox<T>& b ) { const T lo = min(a    , b.lower); const T hi = max(a    , b.upper); return BBox<T>(lo, hi); }
+  template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b ) { const T lo = min(a.lower, b.lower); const T hi = max(a.upper, b.upper); return BBox<T>(lo, hi); }
+
+  /*! Merges three boxes: b and c first, then a. */
+  template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { const BBox<T> bc = merge(b,c); return merge(a,bc); }
+
+  /*! Merges four boxes: the pairs (a,b) and (c,d) first, then both results. */
+  template<typename T> __forceinline BBox<T> merge(const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d) {
+    const BBox<T> ab = merge(a,b); const BBox<T> cd = merge(c,d); return merge(ab,cd);
+  }
+
+  /*! Comparison Operators: boxes are equal when both bounds are equal; != is built from != of the bounds, not by negating == */
+  template<typename T> __forceinline bool operator==( const BBox<T>& a, const BBox<T>& b ) { return a.lower == b.lower && a.upper == b.upper; }
+  template<typename T> __forceinline bool operator!=( const BBox<T>& a, const BBox<T>& b ) { return a.lower != b.lower || a.upper != b.upper; }
+
+  /*! scaling: multiplies both bounds by a; the bounds are not re-ordered afterwards */
+  template<typename T> __forceinline BBox<T> operator *( const float& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); }
+  template<typename T> __forceinline BBox<T> operator *( const     T& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); }
+
+  /*! translations: adds/subtracts another box bound-by-bound, or a single value to/from both bounds */
+  template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower+b.lower,a.upper+b.upper); }
+  template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower-b.lower,a.upper-b.upper); }
+  template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const      T & b ) { return BBox<T>(a.lower+b      ,a.upper+b      ); }
+  template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const      T & b ) { return BBox<T>(a.lower-b      ,a.upper-b      ); }
+
+  /*! extension: moves the lower bound down and the upper bound up by b */
+  template<typename T> __forceinline BBox<T> enlarge(const BBox<T>& a, const T& b) { return BBox<T>(a.lower-b, a.upper+b); }
+
+  /*! intersect bounding boxes: max of the lower and min of the upper bounds; the result can be empty (see disjoint) */
+  template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(max(a.lower, b.lower), min(a.upper, b.upper)); }
+  template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return intersect(a,intersect(b,c)); }
+  template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d ) { return intersect(intersect(a,b),intersect(c,d)); }
+
+  /*! subtract bounds from each other: splits a around b into a lower part c and an upper part d */
+  template<typename T> __forceinline void subtract(const BBox<T>& a, const BBox<T>& b, BBox<T>& c, BBox<T>& d)
+  {
+    c.lower = a.lower;              // c starts where a starts ...
+    c.upper = min(a.upper,b.lower); // ... and ends at the start of b (or at the end of a, whichever is lower)
+    d.lower = max(a.lower,b.upper); // d starts at the end of b (or at the start of a, whichever is higher) ...
+    d.upper = a.upper;              // ... and ends where a ends
+  }
+
+  /*! tests if bounding boxes (and points) are disjoint (empty intersection) */
+  template<typename T> __inline bool disjoint( const BBox<T>& a, const BBox<T>& b ) { const BBox<T> overlap = intersect(a,b); return overlap.empty(); }
+  template<typename T> __inline bool disjoint( const BBox<T>& a, const       T& b ) { const BBox<T> point(b); return disjoint(a,point); }
+  template<typename T> __inline bool disjoint( const       T& a, const BBox<T>& b ) { const BBox<T> point(a); return disjoint(point,b); }
+
+  /*! tests if bounding boxes (and points) are conjoint (non-empty intersection) */
+  template<typename T> __inline bool conjoint( const BBox<T>& a, const BBox<T>& b ) { const BBox<T> overlap = intersect(a,b); return !overlap.empty(); }
+  template<typename T> __inline bool conjoint( const BBox<T>& a, const       T& b ) { const BBox<T> point(b); return conjoint(a,point); }
+  template<typename T> __inline bool conjoint( const       T& a, const BBox<T>& b ) { const BBox<T> point(a); return conjoint(point,b); }
+
+  /*! subset relation: a is a subset of b when no bound of a lies outside the matching bound of b */
+  template<typename T> __inline bool subset( const BBox<T>& a, const BBox<T>& b )
+  {
+    for ( size_t dim = 0; dim < T::N; dim++ )
+      if ( a.lower[dim] < b.lower[dim] || a.upper[dim] > b.upper[dim] ) return false;
+    return true;
+  }
+
+  template<> __inline bool subset( const BBox<Vec3fa>& a, const BBox<Vec3fa>& b ) {
+    return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); // mask-based form of the same test
+  }
+
+  template<> __inline bool subset( const BBox<Vec3fx>& a, const BBox<Vec3fx>& b ) {
+    return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); // mask-based form of the same test
+  }
+  
+  /*! blending: interpolates the lower and the upper bounds separately by t */
+  template<typename T> __forceinline BBox<T> lerp(const BBox<T>& b0, const BBox<T>& b1, const float t) {
+    const T lo = lerp(b0.lower,b1.lower,t); const T hi = lerp(b0.upper,b1.upper,t);
+    return BBox<T>(lo,hi);
+  }
+
+  /*! output operator: writes the box as "[lower; upper]" and returns the stream */
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const BBox<T>& box) {
+    return cout << "[" << box.lower << "; " << box.upper << "]";
+  }
+
+  /*! default template instantiations */
+  typedef BBox<float> BBox1f;
+  typedef BBox<Vec2f> BBox2f;
+  typedef BBox<Vec2fa> BBox2fa;
+  typedef BBox<Vec3f> BBox3f;
+  typedef BBox<Vec3fa> BBox3fa;
+  typedef BBox<Vec3fx> BBox3fx;
+  typedef BBox<Vec3ff> BBox3ff;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// SSE / AVX / MIC specializations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined (__SSE__) || defined(__ARM_NEON)
+#include "../simd/sse.h"
+#endif
+
+#if defined (__AVX__)
+#include "../simd/avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "../simd/avx512.h"
+#endif
+
+namespace embree
+{
+  template<int N>
+    __forceinline BBox<Vec3<vfloat<N>>> transpose(const BBox3fa* bounds);
+  /*! 4-wide variant: reads bounds[0..3] and returns one box whose x, y and z are vfloat4 values */
+  template<>
+    __forceinline BBox<Vec3<vfloat4>> transpose<4>(const BBox3fa* bounds)
+  {
+    BBox<Vec3<vfloat4>> dest;
+    /* each lower bound is reinterpreted as one vfloat4 and passed to the 4-input/3-output transpose */
+    transpose((vfloat4&)bounds[0].lower,
+              (vfloat4&)bounds[1].lower,
+              (vfloat4&)bounds[2].lower,
+              (vfloat4&)bounds[3].lower,
+              dest.lower.x, // presumably lane i receives bounds[i].lower.x - confirm against the vfloat4 transpose
+              dest.lower.y,
+              dest.lower.z);
+    /* same for the upper bounds */
+    transpose((vfloat4&)bounds[0].upper,
+              (vfloat4&)bounds[1].upper,
+              (vfloat4&)bounds[2].upper,
+              (vfloat4&)bounds[3].upper,
+              dest.upper.x,
+              dest.upper.y,
+              dest.upper.z);
+    
+    return dest;
+  }
+  
+#if defined(__AVX__)
+  template<>
+    __forceinline BBox<Vec3<vfloat8>> transpose<8>(const BBox3fa* bounds) // 8-wide variant, only compiled with AVX; reads bounds[0..7]
+  {
+    BBox<Vec3<vfloat8>> dest;
+    /* each lower bound is reinterpreted as one vfloat4 and passed to the 8-input/3-output transpose */
+    transpose((vfloat4&)bounds[0].lower,
+              (vfloat4&)bounds[1].lower,
+              (vfloat4&)bounds[2].lower,
+              (vfloat4&)bounds[3].lower,
+              (vfloat4&)bounds[4].lower,
+              (vfloat4&)bounds[5].lower,
+              (vfloat4&)bounds[6].lower,
+              (vfloat4&)bounds[7].lower,
+              dest.lower.x, // presumably lane i receives bounds[i].lower.x - confirm against the vfloat8 transpose
+              dest.lower.y,
+              dest.lower.z);
+    /* same for the upper bounds */
+    transpose((vfloat4&)bounds[0].upper,
+              (vfloat4&)bounds[1].upper,
+              (vfloat4&)bounds[2].upper,
+              (vfloat4&)bounds[3].upper,
+              (vfloat4&)bounds[4].upper,
+              (vfloat4&)bounds[5].upper,
+              (vfloat4&)bounds[6].upper,
+              (vfloat4&)bounds[7].upper,
+              dest.upper.x,
+              dest.upper.y,
+              dest.upper.z);
+    
+    return dest;
+  }
+#endif
+  
+  template<int N>
+    __forceinline BBox3fa merge(const BBox3fa* bounds);
+  
+  /*! merges the four boxes bounds[0..3]: pairs (0,1) and (2,3) are reduced first, then combined */
+  template<> __forceinline BBox3fa merge<4>(const BBox3fa* bounds)
+  {
+    const Vec3fa lower01 = min(bounds[0].lower,bounds[1].lower);
+    const Vec3fa lower23 = min(bounds[2].lower,bounds[3].lower);
+    const Vec3fa upper01 = max(bounds[0].upper,bounds[1].upper);
+    const Vec3fa upper23 = max(bounds[2].upper,bounds[3].upper);
+    return BBox3fa(min(lower01,lower23),max(upper01,upper23));
+  }
+  
+#if defined(__AVX__)
+  template<> __forceinline BBox3fa merge<8>(const BBox3fa* bounds)
+  {
+    /* reduce boxes 0-3 and boxes 4-7 separately, then combine the two halves */
+    const Vec3fa lowerA = min(min(bounds[0].lower,bounds[1].lower),min(bounds[2].lower,bounds[3].lower));
+    const Vec3fa lowerB = min(min(bounds[4].lower,bounds[5].lower),min(bounds[6].lower,bounds[7].lower));
+    const Vec3fa upperA = max(max(bounds[0].upper,bounds[1].upper),max(bounds[2].upper,bounds[3].upper));
+    const Vec3fa upperB = max(max(bounds[4].upper,bounds[5].upper),max(bounds[6].upper,bounds[7].upper));
+    return BBox3fa(min(lowerA,lowerB),max(upperA,upperB));
+  }
+#endif
+}
+
diff --git a/thirdparty/embree-aarch64/common/math/col3.h b/thirdparty/embree-aarch64/common/math/col3.h
new file mode 100644
index 0000000000..f52015fb88
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/col3.h
@@ -0,0 +1,47 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// RGB Color Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct Col3 // three channels r, g, b of type T
+  {
+    T r, g, b;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Col3           ( )                   { } // channels are not assigned here
+    __forceinline Col3           ( const Col3& other ) { r = other.r; g = other.g; b = other.b; }
+    __forceinline Col3& operator=( const Col3& other ) { r = other.r; g = other.g; b = other.b; return *this; }
+
+    __forceinline explicit Col3 (const T& v)                         : r(v), g(v), b(v) {} // same value in all three channels
+    __forceinline          Col3 (const T& r, const T& g, const T& b) : r(r), g(g), b(b) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants (every channel is set to the named constant)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Col3 (ZeroTy)   : r(zero)   , g(zero)   , b(zero)    {}
+    __forceinline Col3 (OneTy)    : r(one)    , g(one)    , b(one)     {}
+    __forceinline Col3 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf) {}
+    __forceinline Col3 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf) {}
+  };
+
+  /*! output operator: writes the color as "(r, g, b)" and returns the stream */
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col3<T>& a) {
+    return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")";
+  }
+
+  /*! default template instantiations */
+  typedef Col3<uint8_t      > Col3uc;
+  typedef Col3<float        > Col3f;
+}
diff --git a/thirdparty/embree-aarch64/common/math/col4.h b/thirdparty/embree-aarch64/common/math/col4.h
new file mode 100644
index 0000000000..90df293f8e
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/col4.h
@@ -0,0 +1,47 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// RGBA Color Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct Col4 // four channels r, g, b, a of type T
+  {
+    T r, g, b, a;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Col4           ( )                   { } // channels are not assigned here
+    __forceinline Col4           ( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; }
+    __forceinline Col4& operator=( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; return *this; }
+
+    __forceinline explicit Col4 (const T& v) : r(v), g(v), b(v), a(v) {} // same value in all four channels
+    __forceinline          Col4 (const T& r, const T& g, const T& b, const T& a) : r(r), g(g), b(b), a(a) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants (every channel, including a, is set to the named constant)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Col4 (ZeroTy)   : r(zero)   , g(zero)   , b(zero)   , a(zero) {}
+    __forceinline Col4 (OneTy)    : r(one)    , g(one)    , b(one)    , a(one) {}
+    __forceinline Col4 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf), a(pos_inf) {}
+    __forceinline Col4 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf), a(neg_inf) {}
+  };
+
+  /*! output operator: writes the color as "(r, g, b, a)" and returns the stream */
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col4<T>& a) {
+    return cout << "(" << a.r << ", " << a.g << ", " << a.b << ", " << a.a << ")";
+  }
+
+  /*! default template instantiations */
+  typedef Col4<uint8_t      > Col4uc;
+  typedef Col4<float        > Col4f;
+}
diff --git a/thirdparty/embree-aarch64/common/math/color.h b/thirdparty/embree-aarch64/common/math/color.h
new file mode 100644
index 0000000000..c3083e4fc0
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/color.h
@@ -0,0 +1,257 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "constants.h"
+#include "col3.h"
+#include "col4.h"
+
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE RGBA Color Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct Color4 // RGBA color held in one __m128; r, g, b, a alias the four float lanes
+  {
+    union {
+      __m128 m128;
+      struct { float r,g,b,a; };
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color4 () {} // lanes are not assigned here
+    __forceinline Color4 ( const __m128 a ) : m128(a) {}
+
+    __forceinline explicit Color4 (const float v) : m128(_mm_set1_ps(v)) {} // same value in all four lanes
+    __forceinline          Color4 (const float r, const float g, const float b, const float a) : m128(_mm_set_ps(a,b,g,r)) {} // _mm_set_ps takes the highest lane first
+
+    __forceinline explicit Color4 ( const Col3uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(255.0f,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); } // bytes scaled by one_over_255; alpha is 255*one_over_255
+    __forceinline explicit Color4 ( const Col3f&  other ) { m128 = _mm_set_ps(1.0f,other.b,other.g,other.r); } // alpha is set to 1
+    __forceinline explicit Color4 ( const Col4uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(other.a,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); } // all four bytes scaled by one_over_255
+    __forceinline explicit Color4 ( const Col4f&  other ) { m128 = _mm_set_ps(other.a,other.b,other.g,other.r); }
+
+    __forceinline Color4           ( const Color4& other ) : m128(other.m128) {}
+    __forceinline Color4& operator=( const Color4& other ) { m128 = other.m128; return *this; }
+
+    __forceinline operator const __m128&() const { return m128; }
+    __forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Set (writes this color into the destination d)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; } // alpha is dropped
+    __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = a; }
+    __forceinline void set(Col3uc& d) const 
+    {
+      vfloat4 s = clamp(vfloat4(m128))*255.0f; // presumably clamps to [0,1] before scaling - confirm against the one-argument clamp
+      d.r = (uint8_t)(s[0]); // the cast truncates, it does not round
+      d.g = (uint8_t)(s[1]); 
+      d.b = (uint8_t)(s[2]); 
+    }
+    __forceinline void set(Col4uc& d) const 
+    {
+      vfloat4 s = clamp(vfloat4(m128))*255.0f; // presumably clamps to [0,1] before scaling - confirm against the one-argument clamp
+      d.r = (uint8_t)(s[0]); // the cast truncates, it does not round
+      d.g = (uint8_t)(s[1]); 
+      d.b = (uint8_t)(s[2]); 
+      d.a = (uint8_t)(s[3]); 
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants (all four lanes are set to the named constant)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color4( ZeroTy   ) : m128(_mm_set1_ps(0.0f)) {}
+    __forceinline Color4( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
+    __forceinline Color4( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    __forceinline Color4( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE RGB Color Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct Color // RGB color held in one __m128; r, g, b alias the first three float lanes, the fourth lane has no name
+  {
+    union {
+      __m128 m128;
+      struct { float r,g,b; };
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color () {} // lanes are not assigned here
+    __forceinline Color ( const __m128 a ) : m128(a) {}
+
+    __forceinline explicit Color  (const float v)                               : m128(_mm_set1_ps(v)) {} // v goes into all four lanes, including the unnamed one
+    __forceinline          Color  (const float r, const float g, const float b) : m128(_mm_set_ps(0.0f,b,g,r)) {} // the unnamed fourth lane is set to 0
+
+    __forceinline Color           ( const Color& other ) : m128(other.m128) {}
+    __forceinline Color& operator=( const Color& other ) { m128 = other.m128; return *this; }
+
+    __forceinline Color           ( const Color4& other ) : m128(other.m128) {} // copies all four lanes, so the alpha value lands in the unnamed lane
+    __forceinline Color& operator=( const Color4& other ) { m128 = other.m128; return *this; }
+
+    __forceinline operator const __m128&() const { return m128; }
+    __forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Set (writes this color into the destination d)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; }
+    __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = 1.0f; } // alpha is always written as 1
+    __forceinline void set(Col3uc& d) const 
+    { 
+      vfloat4 s = clamp(vfloat4(m128))*255.0f; // presumably clamps to [0,1] before scaling - confirm against the one-argument clamp
+      d.r = (uint8_t)(s[0]); // the cast truncates, it does not round
+      d.g = (uint8_t)(s[1]); 
+      d.b = (uint8_t)(s[2]); 
+    }
+    __forceinline void set(Col4uc& d) const 
+    { 
+      vfloat4 s = clamp(vfloat4(m128))*255.0f; // presumably clamps to [0,1] before scaling - confirm against the one-argument clamp
+      d.r = (uint8_t)(s[0]); // the cast truncates, it does not round
+      d.g = (uint8_t)(s[1]); 
+      d.b = (uint8_t)(s[2]); 
+      d.a = 255; // alpha is always written as 255
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants (all four lanes are set to the named constant)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color( ZeroTy   ) : m128(_mm_set1_ps(0.0f)) {}
+    __forceinline Color( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
+    __forceinline Color( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    __forceinline Color( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color operator +( const Color& a ) { return a; }
+  __forceinline const Color operator -( const Color& a ) {
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); // only the sign bit of every lane is set
+    return _mm_xor_ps(a.m128, mask); // xor flips the sign bit, i.e. negates each lane
+  }
+  __forceinline const Color abs  ( const Color& a ) {
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); // every bit except the sign bit is set
+    return _mm_and_ps(a.m128, mask); // and clears the sign bit of each lane
+  }
+  __forceinline const Color rcp  ( const Color& a ) // per-lane reciprocal: hardware estimate followed by refinement
+  {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    __m128 reciprocal = _mm_rcp_ps(a.m128); // initial estimate
+    reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); // two refinement steps built on the NEON vrecpsq_f32 intrinsic
+    reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
+    return (const Color)reciprocal;
+#else
+#if defined(__AVX512VL__)
+    const Color r = _mm_rcp14_ps(a.m128);
+#else
+    const Color r = _mm_rcp_ps(a.m128);
+#endif
+    return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); // one Newton-Raphson step: 2*r - r*r*a
+#endif  //defined(__aarch64__) && defined(BUILD_IOS)
+  }
+  __forceinline const Color rsqrt( const Color& a ) // per-lane reciprocal square root: hardware estimate followed by refinement
+  {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    __m128 r = _mm_rsqrt_ps(a.m128); // initial estimate
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); // two refinement steps built on the NEON vrsqrtsq_f32 intrinsic
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+    return r;
+#else
+      
+#if defined(__AVX512VL__)
+    __m128 r = _mm_rsqrt14_ps(a.m128);
+#else
+    __m128 r = _mm_rsqrt_ps(a.m128);
+#endif
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); // one Newton-Raphson step: 1.5*r - 0.5*a*r*r*r
+      
+#endif  //defined(__aarch64__) && defined(BUILD_IOS)
+  }
+  __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); } // per-lane square root straight from _mm_sqrt_ps, no estimate involved
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! per-lane sum of two colors */
+  __forceinline const Color operator +( const Color& a, const Color& b ) {
+    const __m128 sum = _mm_add_ps(a.m128, b.m128);
+    return sum;
+  }
+  /*! per-lane difference of two colors */
+  __forceinline const Color operator -( const Color& a, const Color& b ) {
+    const __m128 difference = _mm_sub_ps(a.m128, b.m128);
+    return difference;
+  }
+  /*! per-lane product of two colors */
+  __forceinline const Color operator *( const Color& a, const Color& b ) {
+    const __m128 product = _mm_mul_ps(a.m128, b.m128);
+    return product;
+  }
+  /*! color scaled by a scalar (scalar on the right) */
+  __forceinline const Color operator *( const Color& a, const float  b ) {
+    const Color factor(b);
+    return a * factor;
+  }
+  /*! color scaled by a scalar (scalar on the left) */
+  __forceinline const Color operator *( const float  a, const Color& b ) {
+    const Color factor(a);
+    return factor * b;
+  }
+  /*! per-lane division, implemented as multiplication by rcp(b) */
+  __forceinline const Color operator /( const Color& a, const Color& b ) {
+    const Color inverse = rcp(b);
+    return a * inverse;
+  }
+  /*! division by a scalar, implemented as multiplication by rcp(b) */
+  __forceinline const Color operator /( const Color& a, const float  b ) {
+    return a * rcp(b);
+  }
+
+  /*! per-lane minimum of two colors */
+  __forceinline const Color min( const Color& a, const Color& b ) {
+    const __m128 smaller = _mm_min_ps(a.m128,b.m128);
+    return smaller;
+  }
+  /*! per-lane maximum of two colors */
+  __forceinline const Color max( const Color& a, const Color& b ) {
+    const __m128 larger = _mm_max_ps(a.m128,b.m128);
+    return larger;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Compound assignments: each updates `a` in place and returns a copy of the new value. */
+  __forceinline const Color operator+=(Color& a, const Color& b) { a = a + b; return a; }
+  __forceinline const Color operator-=(Color& a, const Color& b) { a = a - b; return a; }
+  __forceinline const Color operator*=(Color& a, const Color& b) { a = a * b; return a; }
+  __forceinline const Color operator/=(Color& a, const Color& b) { a = a / b; return a; }
+  __forceinline const Color operator*=(Color& a, const float b      ) { a = a * b; return a; }
+  __forceinline const Color operator/=(Color& a, const float b      ) { a = a / b; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! sum of the r, g and b channels */
+  __forceinline float reduce_add(const Color& v) {
+    const float total = v.r+v.g+v.b;
+    return total;
+  }
+  /*! product of the r, g and b channels */
+  __forceinline float reduce_mul(const Color& v) {
+    const float product = v.r*v.g*v.b;
+    return product;
+  }
+  /*! smallest of the r, g and b channels */
+  __forceinline float reduce_min(const Color& v) {
+    const float smallest = min(v.r,v.g,v.b);
+    return smallest;
+  }
+  /*! largest of the r, g and b channels */
+  __forceinline float reduce_max(const Color& v) {
+    const float largest = max(v.r,v.g,v.b);
+    return largest;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! equality: the three low lanes (mask 7) must all compare equal */
+  __forceinline bool operator ==( const Color& a, const Color& b ) {
+    const int equal_lanes = _mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128));
+    return (equal_lanes & 7) == 7;
+  }
+  /*! inequality: at least one of the three low lanes (mask 7) differs */
+  __forceinline bool operator !=( const Color& a, const Color& b ) {
+    const int differing_lanes = _mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128));
+    return (differing_lanes & 7) != 0;
+  }
+  /*! lexicographic ordering on (r, g, b) */
+  __forceinline bool operator < ( const Color& a, const Color& b ) {
+    return (a.r != b.r) ? (a.r < b.r)
+         : (a.g != b.g) ? (a.g < b.g)
+         : (a.b != b.b) ? (a.b < b.b)
+         : false;
+  }
+
+   ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! returns t when s is true and f otherwise, via an all-ones or all-zeros blend mask */
+  __forceinline const Color select( bool s, const Color& t, const Color& f ) {
+    __m128 lane_mask;
+    if (s) {
+      /* comparing a zero register with itself sets every bit */
+      lane_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
+    } else {
+      lane_mask = _mm_setzero_ps();
+    }
+    return blendv_ps(f, t, lane_mask);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Special Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! computes luminance of a color */
+  /*! weighted sum of the channels: 0.212671*r + 0.715160*g + 0.072169*b */
+  __forceinline float luminance (const Color& a) {
+    const float blue_term = 0.072169f*a.b;
+    const float green_blue = madd(0.715160f,a.g,blue_term);
+    return madd(0.212671f,a.r,green_blue);
+  }
+
+  /*! output operator */
+  /*! writes the color as "(r, g, b)" */
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Color& a)
+  {
+    return cout << "("
+                << a.r << ", "
+                << a.g << ", "
+                << a.b
+                << ")";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/math/constants.cpp b/thirdparty/embree-aarch64/common/math/constants.cpp
new file mode 100644
index 0000000000..eeff131664
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/constants.cpp
@@ -0,0 +1,61 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#if defined(__aarch64__)
+#include <arm_neon.h>
+#endif
+
+#include "constants.h"
+
+namespace embree
+{
+  /* Definitions of the tag constants declared `extern` in constants.h.
+   * Every extern declaration in the header needs exactly one definition here. */
+  TrueTy True;
+  FalseTy False;
+  ZeroTy zero;
+  OneTy one;
+  NegInfTy neg_inf;
+  PosInfTy inf;
+  PosInfTy pos_inf;
+  NaNTy nan;
+  UlpTy ulp;
+  PiTy pi;
+  OneOverPiTy one_over_pi;
+  TwoPiTy two_pi;
+  OneOverTwoPiTy one_over_two_pi;
+  FourPiTy four_pi;
+  OneOverFourPiTy one_over_four_pi;
+  StepTy step;
+  ReverseStepTy reverse_step;
+  EmptyTy empty;
+  /* `full` is declared extern in constants.h; without this definition any
+   * odr-use of it fails at link time. */
+  FullTy full;
+  UndefinedTy undefined;
+
+#if defined(__aarch64__)
+/* NEON vector constants declared extern in constants.h. In the vXXXX lane
+ * masks each character is one 32-bit lane in initializer order:
+ * 'F' = 0xFFFFFFFF, '0' = 0x00000000. */
+const uint32x4_t movemask_mask = { 1, 2, 4, 8 };
+const uint32x4_t vzero = { 0, 0, 0, 0 };
+const uint32x4_t v0x80000000 = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
+const uint32x4_t v0x7fffffff = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
+const uint32x4_t v000F = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
+const uint32x4_t v00F0 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 };
+const uint32x4_t v00FF = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
+const uint32x4_t v0F00 = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 };
+const uint32x4_t v0F0F = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF };
+const uint32x4_t v0FF0 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
+const uint32x4_t v0FFF = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
+const uint32x4_t vF000 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
+const uint32x4_t vF00F = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF };
+const uint32x4_t vF0F0 = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 };
+const uint32x4_t vF0FF = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
+const uint32x4_t vFF00 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
+const uint32x4_t vFF0F = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF };
+const uint32x4_t vFFF0 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
+const uint32x4_t vFFFF = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
+/* byte-index tables; NOTE(review): presumably consumed by NEON table-lookup shuffles -- confirm at the use sites */
+const uint8x16_t v0022 = {0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11};
+const uint8x16_t v1133 = {4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15};
+const uint8x16_t v0101 = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
+/* broadcast float constants */
+const float32x4_t vOne = { 1.0f, 1.0f, 1.0f, 1.0f };
+const float32x4_t vmOne = { -1.0f, -1.0f, -1.0f, -1.0f };
+const float32x4_t vInf = { INFINITY, INFINITY, INFINITY, INFINITY };
+const float32x4_t vmInf = { -INFINITY, -INFINITY, -INFINITY, -INFINITY };
+#endif
+
+}
diff --git a/thirdparty/embree-aarch64/common/math/constants.h b/thirdparty/embree-aarch64/common/math/constants.h
new file mode 100644
index 0000000000..e80abec80f
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/constants.h
@@ -0,0 +1,239 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+
+#include <limits>
+
+#define _USE_MATH_DEFINES
+#include <math.h> // using cmath causes issues under Windows
+#include <cfloat>
+#include <climits>
+
+// Math constants may not be defined in libcxx + mingw + strict C++ standard
+// Fallback definitions are provided only when <math.h> did not supply them.
+#if defined(__MINGW32__)
+
+// TODO(LTE): use constexpr
+#ifndef M_PI
+// pi
+#define M_PI 3.14159265358979323846
+#endif
+#ifndef M_1_PI
+// 1/pi
+#define M_1_PI 0.31830988618379067154
+#endif
+
+#endif // __MINGW32__
+
+namespace embree
+{
+  /* reciprocal of 255 */
+  static MAYBE_UNUSED const float one_over_255 = 1.0f/255.0f;
+  static MAYBE_UNUSED const float min_rcp_input = 1E-18f;  // for abs(x) >= min_rcp_input the newton raphson rcp calculation does not fail
+
+  /* we consider floating point numbers in that range as valid input numbers */
+  /* NOTE(review): unlike the two constants above this one is not declared const -- confirm whether that is intentional */
+  static MAYBE_UNUSED float FLT_LARGE = 1.844E18f;
+
+  /*! tag type whose instances convert implicitly to the boolean value true */
+  struct TrueTy {
+    __forceinline operator bool( ) const { return true; }
+  };
+
+  /*! shared instance of TrueTy */
+  extern MAYBE_UNUSED TrueTy True;
+
+  /*! tag type whose instances convert implicitly to the boolean value false */
+  struct FalseTy {
+    __forceinline operator bool( ) const { return false; }
+  };
+
+  /*! shared instance of FalseTy */
+  extern MAYBE_UNUSED FalseTy False;
+  
+  /*! tag type whose instances convert implicitly to zero in each of the
+   *  built-in arithmetic types listed below */
+  struct ZeroTy
+  {
+    __forceinline operator          double   ( ) const { return 0.0; }
+    __forceinline operator          float    ( ) const { return 0.0f; }
+    __forceinline operator          long long( ) const { return 0LL; }
+    __forceinline operator unsigned long long( ) const { return 0ULL; }
+    __forceinline operator          long     ( ) const { return 0L; }
+    __forceinline operator unsigned long     ( ) const { return 0UL; }
+    __forceinline operator          int      ( ) const { return 0; }
+    __forceinline operator unsigned int      ( ) const { return 0U; }
+    __forceinline operator          short    ( ) const { return static_cast<short>(0); }
+    __forceinline operator unsigned short    ( ) const { return static_cast<unsigned short>(0); }
+    __forceinline operator          int8_t     ( ) const { return static_cast<int8_t>(0); }
+    __forceinline operator uint8_t     ( ) const { return static_cast<uint8_t>(0); }
+  }; 
+
+  /*! shared instance of ZeroTy */
+  extern MAYBE_UNUSED ZeroTy zero;
+
+  /*! tag type whose instances convert implicitly to one in each of the
+   *  built-in arithmetic types listed below */
+  struct OneTy
+  {
+    __forceinline operator          double   ( ) const { return 1.0; }
+    __forceinline operator          float    ( ) const { return 1.0f; }
+    __forceinline operator          long long( ) const { return 1LL; }
+    __forceinline operator unsigned long long( ) const { return 1ULL; }
+    __forceinline operator          long     ( ) const { return 1L; }
+    __forceinline operator unsigned long     ( ) const { return 1UL; }
+    __forceinline operator          int      ( ) const { return 1; }
+    __forceinline operator unsigned int      ( ) const { return 1U; }
+    __forceinline operator          short    ( ) const { return static_cast<short>(1); }
+    __forceinline operator unsigned short    ( ) const { return static_cast<unsigned short>(1); }
+    __forceinline operator          int8_t     ( ) const { return static_cast<int8_t>(1); }
+    __forceinline operator uint8_t     ( ) const { return static_cast<uint8_t>(1); }
+  };
+
+  /*! shared instance of OneTy */
+  extern MAYBE_UNUSED OneTy one;
+
+  /*! tag type converting to the "most negative" value of the target type:
+   *  negative infinity for floating-point types and numeric_limits::min()
+   *  for integer types (which is 0 for the unsigned ones) */
+  struct NegInfTy
+  {
+    __forceinline operator          double   ( ) const { return -std::numeric_limits<double>::infinity(); }
+    __forceinline operator          float    ( ) const { return -std::numeric_limits<float>::infinity(); }
+    __forceinline operator          long long( ) const { return std::numeric_limits<long long>::min(); }
+    __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::min(); }
+    __forceinline operator          long     ( ) const { return std::numeric_limits<long>::min(); }
+    __forceinline operator unsigned long     ( ) const { return std::numeric_limits<unsigned long>::min(); }
+    __forceinline operator          int      ( ) const { return std::numeric_limits<int>::min(); }
+    __forceinline operator unsigned int      ( ) const { return std::numeric_limits<unsigned int>::min(); }
+    __forceinline operator          short    ( ) const { return std::numeric_limits<short>::min(); }
+    __forceinline operator unsigned short    ( ) const { return std::numeric_limits<unsigned short>::min(); }
+    __forceinline operator          int8_t     ( ) const { return std::numeric_limits<int8_t>::min(); }
+    __forceinline operator uint8_t     ( ) const { return std::numeric_limits<uint8_t>::min(); }
+
+  };
+
+  /*! shared instance of NegInfTy */
+  extern MAYBE_UNUSED NegInfTy neg_inf;
+
+  /*! tag type converting to the "most positive" value of the target type:
+   *  positive infinity for floating-point types and numeric_limits::max()
+   *  for integer types */
+  struct PosInfTy
+  {
+    __forceinline operator          double   ( ) const { return std::numeric_limits<double>::infinity(); }
+    __forceinline operator          float    ( ) const { return std::numeric_limits<float>::infinity(); }
+    __forceinline operator          long long( ) const { return std::numeric_limits<long long>::max(); }
+    __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::max(); }
+    __forceinline operator          long     ( ) const { return std::numeric_limits<long>::max(); }
+    __forceinline operator unsigned long     ( ) const { return std::numeric_limits<unsigned long>::max(); }
+    __forceinline operator          int      ( ) const { return std::numeric_limits<int>::max(); }
+    __forceinline operator unsigned int      ( ) const { return std::numeric_limits<unsigned int>::max(); }
+    __forceinline operator          short    ( ) const { return std::numeric_limits<short>::max(); }
+    __forceinline operator unsigned short    ( ) const { return std::numeric_limits<unsigned short>::max(); }
+    __forceinline operator          int8_t     ( ) const { return std::numeric_limits<int8_t>::max(); }
+    __forceinline operator uint8_t     ( ) const { return std::numeric_limits<uint8_t>::max(); }
+  };
+
+  /*! two shared instances of PosInfTy under different names */
+  extern MAYBE_UNUSED PosInfTy inf;
+  extern MAYBE_UNUSED PosInfTy pos_inf;
+
+  /*! tag type converting to a quiet NaN of the requested floating-point type */
+  struct NaNTy
+  {
+    __forceinline operator double( ) const { return std::numeric_limits<double>::quiet_NaN(); }
+    __forceinline operator float ( ) const { return std::numeric_limits<float>::quiet_NaN(); }
+  };
+
+  /*! shared instance of NaNTy */
+  extern MAYBE_UNUSED NaNTy nan;
+
+  /*! tag type converting to std::numeric_limits<T>::epsilon() of the
+   *  requested floating-point type */
+  struct UlpTy
+  {
+    __forceinline operator double( ) const { return std::numeric_limits<double>::epsilon(); }
+    __forceinline operator float ( ) const { return std::numeric_limits<float>::epsilon(); }
+  };
+
+  /*! shared instance of UlpTy */
+  extern MAYBE_UNUSED UlpTy ulp;
+
+  /*! tag type converting to pi (M_PI) as double or float */
+  struct PiTy
+  {
+    __forceinline operator double( ) const { return static_cast<double>(M_PI); }
+    __forceinline operator float ( ) const { return static_cast<float>(M_PI); }
+  };
+
+  /*! shared instance of PiTy */
+  extern MAYBE_UNUSED PiTy pi;
+
+  /*! tag type converting to 1/pi (M_1_PI) as double or float */
+  struct OneOverPiTy
+  {
+    __forceinline operator double( ) const { return static_cast<double>(M_1_PI); }
+    __forceinline operator float ( ) const { return static_cast<float>(M_1_PI); }
+  };
+
+  /*! shared instance of OneOverPiTy */
+  extern MAYBE_UNUSED OneOverPiTy one_over_pi;
+
+  /*! tag type converting to 2*pi as double or float */
+  struct TwoPiTy
+  {
+    __forceinline operator double( ) const { return static_cast<double>(2.0*M_PI); }
+    __forceinline operator float ( ) const { return static_cast<float>(2.0*M_PI); }
+  };
+
+  /*! shared instance of TwoPiTy */
+  extern MAYBE_UNUSED TwoPiTy two_pi;
+
+  /*! tag type converting to 1/(2*pi), computed as 0.5*M_1_PI, as double or float */
+  struct OneOverTwoPiTy
+  {
+    __forceinline operator double( ) const { return static_cast<double>(0.5*M_1_PI); }
+    __forceinline operator float ( ) const { return static_cast<float>(0.5*M_1_PI); }
+  };
+
+  /*! shared instance of OneOverTwoPiTy */
+  extern MAYBE_UNUSED OneOverTwoPiTy one_over_two_pi;
+
+  /*! tag type converting to 4*pi as double or float */
+  struct FourPiTy
+  {
+    __forceinline operator double( ) const { return static_cast<double>(4.0*M_PI); }
+    __forceinline operator float ( ) const { return static_cast<float>(4.0*M_PI); }
+  };
+
+  /*! shared instance of FourPiTy */
+  extern MAYBE_UNUSED FourPiTy four_pi;
+
+  /*! tag type converting to 1/(4*pi), computed as 0.25*M_1_PI, as double or float */
+  struct OneOverFourPiTy
+  {
+    __forceinline operator double( ) const { return static_cast<double>(0.25*M_1_PI); }
+    __forceinline operator float ( ) const { return static_cast<float>(0.25*M_1_PI); }
+  };
+
+  /*! shared instance of OneOverFourPiTy */
+  extern MAYBE_UNUSED OneOverFourPiTy one_over_four_pi;
+
+  /* The following structs carry no data. They are tag types: passing one of
+   * the shared instances selects a constructor or overload by type. */
+
+  /*! tag type; NOTE(review): presumably selects an ascending "step" initialisation -- confirm at the use sites */
+  struct StepTy {
+  };
+
+  extern MAYBE_UNUSED StepTy step;
+
+  /*! tag type; NOTE(review): presumably the reversed counterpart of StepTy -- confirm at the use sites */
+  struct ReverseStepTy {
+  };
+
+  extern MAYBE_UNUSED ReverseStepTy reverse_step;
+
+  /*! tag type requesting an empty object */
+  struct EmptyTy {
+  };
+
+  extern MAYBE_UNUSED EmptyTy empty;
+
+  /*! tag type requesting a full (all-encompassing) object */
+  struct FullTy {
+  };
+
+  extern MAYBE_UNUSED FullTy full;
+
+  /*! tag type; NOTE(review): presumably requests an uninitialised object -- confirm at the use sites */
+  struct UndefinedTy {
+  };
+
+  extern MAYBE_UNUSED UndefinedTy undefined;
+    
+#if defined(__aarch64__)
+  /* NEON vector constants, declared here and defined once in a source file.
+   * The vXXXX names spell a per-lane pattern of the 32-bit lanes. */
+  extern const uint32x4_t movemask_mask;
+  extern const uint32x4_t vzero;
+  extern const uint32x4_t v0x80000000;
+  extern const uint32x4_t v0x7fffffff;
+  extern const uint32x4_t v000F;
+  extern const uint32x4_t v00F0;
+  extern const uint32x4_t v00FF;
+  extern const uint32x4_t v0F00;
+  extern const uint32x4_t v0F0F;
+  extern const uint32x4_t v0FF0;
+  extern const uint32x4_t v0FFF;
+  extern const uint32x4_t vF000;
+  extern const uint32x4_t vF00F;
+  extern const uint32x4_t vF0F0;
+  extern const uint32x4_t vF0FF;
+  extern const uint32x4_t vFF00;
+  extern const uint32x4_t vFF0F;
+  extern const uint32x4_t vFFF0;
+  extern const uint32x4_t vFFFF;
+  /* byte-index tables */
+  extern const uint8x16_t v0022;
+  extern const uint8x16_t v1133;
+  extern const uint8x16_t v0101;
+  /* broadcast float constants */
+  extern const float32x4_t vOne;
+  extern const float32x4_t vmOne;
+  extern const float32x4_t vInf;
+  extern const float32x4_t vmInf;
+#endif
+}
diff --git a/thirdparty/embree-aarch64/common/math/interval.h b/thirdparty/embree-aarch64/common/math/interval.h
new file mode 100644
index 0000000000..f06478e881
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/interval.h
@@ -0,0 +1,161 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec2.h"
+#include "vec3.h"
+#include "bbox.h"
+
+namespace embree
+{
+  /*! Interval [lower,upper] over a value type V, with interval arithmetic
+   *  (sum, difference, product), merging and intersection. An interval with
+   *  lower > upper is treated as empty (see the EmptyTy constructor). */
+  template<typename V>
+    struct Interval
+    {
+      V lower, upper;
+      
+      /*! default constructor leaves both bounds uninitialised */
+      __forceinline Interval() {}
+      __forceinline Interval           ( const Interval& other ) { lower = other.lower; upper = other.upper; }
+      __forceinline Interval& operator=( const Interval& other ) { lower = other.lower; upper = other.upper; return *this; }
+
+      /*! degenerate interval containing the single value a */
+      __forceinline Interval(const V& a) : lower(a), upper(a) {}
+      __forceinline Interval(const V& lower, const V& upper) : lower(lower), upper(upper) {}
+      /*! conversion from a bounding box with the same bounds */
+      __forceinline Interval(const BBox<V>& a) : lower(a.lower), upper(a.upper) {}
+          
+      /*! tests if box is empty */
+      //__forceinline bool empty() const { return lower > upper; }
+      
+      /*! computes the size of the interval */
+      __forceinline V size() const { return upper - lower; }
+      
+      /*! midpoint of the interval */
+      __forceinline V center() const { return 0.5f*(lower+upper); }
+      
+      /*! grows this interval in place so that it also contains `other` */
+      __forceinline const Interval& extend(const Interval& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; }
+      __forceinline const Interval& extend(const V   & other) { lower = min(lower,other      ); upper = max(upper,other      ); return *this; }
+      
+      /*! interval sum: bounds are added pairwise */
+      __forceinline friend Interval operator +( const Interval& a, const Interval& b ) {
+        return Interval(a.lower+b.lower,a.upper+b.upper);
+      }
+      
+      /*! interval difference: the lower bound subtracts b's upper bound and vice versa */
+      __forceinline friend Interval operator -( const Interval& a, const Interval& b ) {
+        return Interval(a.lower-b.upper,a.upper-b.lower);
+      }
+      
+      /*! shifts both bounds down by the value b */
+      __forceinline friend Interval operator -( const Interval& a, const V& b ) {
+        return Interval(a.lower-b,a.upper-b);
+      }
+      
+      /*! interval product: min and max over the four products of the bounds */
+      __forceinline friend Interval operator *( const Interval& a, const Interval& b )
+      {
+        const V ll = a.lower*b.lower;
+        const V lu = a.lower*b.upper;
+        const V ul = a.upper*b.lower;
+        const V uu = a.upper*b.upper;
+        return Interval(min(ll,lu,ul,uu),max(ll,lu,ul,uu));
+      }
+      
+      /*! smallest interval containing both a and b */
+      __forceinline friend Interval merge( const Interval& a, const Interval& b) {
+        return Interval(min(a.lower,b.lower),max(a.upper,b.upper));
+      }
+      
+      __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c) {
+        return merge(merge(a,b),c);
+      }
+      
+      __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c, const Interval& d) {
+        return merge(merge(a,b),merge(c,d));
+      }
+      
+      /*! intersect intervals; the result has lower > upper when the inputs do not overlap */
+      __forceinline friend const Interval intersect( const Interval& a, const Interval& b ) { return Interval(max(a.lower, b.lower), min(a.upper, b.upper)); }
+      __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c ) { return intersect(a,intersect(b,c)); }
+      __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c, const Interval& d ) { return intersect(intersect(a,b),intersect(c,d)); }       
+      
+      /*! output operator, prints "[lower, upper]" */
+      friend embree_ostream operator<<(embree_ostream cout, const Interval& a) {
+        return cout << "[" << a.lower << ", " << a.upper << "]";
+      }
+      
+      ////////////////////////////////////////////////////////////////////////////////
+      /// Constants
+      ////////////////////////////////////////////////////////////////////////////////
+      
+      /*! empty interval: lower = +inf, upper = -inf */
+      __forceinline Interval( EmptyTy ) : lower(pos_inf), upper(neg_inf) {}
+      /*! full interval: lower = -inf, upper = +inf */
+      __forceinline Interval( FullTy  ) : lower(neg_inf), upper(pos_inf) {}
+    };
+
+  /*! an interval is empty when its upper bound lies below its lower bound */
+  __forceinline bool isEmpty(const Interval<float>& v) { 
+    return v.upper < v.lower;
+  }
+
+  /*! per-lane emptiness test: a lane is set where lower > upper */
+  __forceinline vboolx isEmpty(const Interval<vfloatx>& v) {
+    const vboolx empty_lanes = v.lower > v.upper;
+    return empty_lanes;
+  }
+  
+  /*! strict subset relation: both bounds of a must lie strictly inside b */
+  template<typename T> __forceinline bool subset( const Interval<T>& a, const Interval<T>& b ) { 
+    if (a.lower > b.lower) {
+      if (a.upper < b.upper) return true;
+    }
+    return false;
+  }
+
+  /*! component-wise subset test for 2D intervals: both x and y must be subsets */
+  template<typename T> __forceinline bool subset( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) { 
+    if (!subset(a.x,b.x)) return false;
+    return subset(a.y,b.y);
+  }
+
+  /*! component-wise intersection of two 2D intervals */
+  template<typename T> __forceinline const Vec2<Interval<T>> intersect( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) {
+    const Interval<T> x_overlap = intersect(a.x,b.x);
+    const Interval<T> y_overlap = intersect(a.y,b.y);
+    return Vec2<Interval<T>>(x_overlap,y_overlap);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! picks interval t when s is true and f otherwise, bound by bound */
+  template<typename T> __forceinline Interval<T> select ( bool s, const Interval<T>& t, const Interval<T>& f ) {
+    const T chosen_lower = select(s,t.lower,f.lower);
+    const T chosen_upper = select(s,t.upper,f.upper);
+    return Interval<T>(chosen_lower,chosen_upper);
+  }
+
+  /*! mask variant: selects between t and f bound by bound using the mask s of type T::Bool */
+  template<typename T> __forceinline Interval<T> select ( const typename T::Bool& s, const Interval<T>& t, const Interval<T>& f ) {
+    const T chosen_lower = select(s,t.lower,f.lower);
+    const T chosen_upper = select(s,t.upper,f.upper);
+    return Interval<T>(chosen_lower,chosen_upper);
+  }
+
+  /*! Sign test on two intervals with a tolerance of 1E-4. An interval "may be
+   *  negative" when its lower bound is below +tolerance and "may be positive"
+   *  when its upper bound is above -tolerance. Returns 1 when any of the
+   *  combinations tested below holds and 0 otherwise (never more than 1). */
+  __forceinline int numRoots(const Interval<float>& p0, const Interval<float>& p1)
+  {
+    const float tolerance = 1E-4f;
+    const bool p0_may_be_negative = p0.lower < tolerance;
+    const bool p0_may_be_positive = p0.upper > -tolerance;
+    const bool p1_may_be_negative = p1.lower < tolerance;
+    const bool p1_may_be_positive = p1.upper > -tolerance;
+
+    if (p0_may_be_negative && p1_may_be_positive) return 1;
+    if (p0_may_be_positive && p1_may_be_negative) return 1;
+    if (p0_may_be_negative && p0_may_be_positive) return 1;
+    if (p1_may_be_negative && p1_may_be_positive) return 1;
+    return 0;
+  }
+  
+  /*! shorthand names for float intervals in one, two and three dimensions */
+  typedef Interval<float> Interval1f;
+  typedef Vec2<Interval<float>> Interval2f;
+  typedef Vec3<Interval<float>> Interval3f;
+
+/*! exchanges the values of two floats */
+inline void swap(float& a, float& b) {
+  const float previous_a = a;
+  a = b;
+  b = previous_a;
+}
+
+/*! translates both bounds of the interval by the same offset */
+inline Interval1f shift(const Interval1f& v, float shift) {
+  const float moved_lower = v.lower + shift;
+  const float moved_upper = v.upper + shift;
+  return Interval1f(moved_lower, moved_upper);
+}
+
+#define TWO_PI (2.0*M_PI)
+/*! Conservative bounds of sin over an interval.
+ *
+ *  Intervals at least pi wide return [-1,1]. Narrower ones are shifted by
+ *  multiples of 2*pi, the sine is evaluated at both end points, and the bound
+ *  is widened to +1 / -1 when a maximum / minimum of the sine lies strictly
+ *  inside the interval. */
+inline Interval1f sin(Interval1f interval)
+{
+  if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); }
+  if (interval.upper > TWO_PI)                 { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); }
+  if (interval.lower < 0)                      { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); }
+  /* Now lower is in [0,2*pi) (up to rounding) and upper < lower+pi, so upper
+   * may exceed 2*pi. The maximum at 5*pi/2 must therefore be tested as well as
+   * the one at pi/2; the next minimum (7*pi/2) is out of reach. */
+  float sinLower = sin(interval.lower);
+  float sinUpper = sin(interval.upper);
+  if (sinLower > sinUpper) swap(sinLower, sinUpper);
+  if (interval.lower <       M_PI / 2.0 && interval.upper >       M_PI / 2.0) sinUpper =  1.0;
+  if (interval.lower < 5.0 * M_PI / 2.0 && interval.upper > 5.0 * M_PI / 2.0) sinUpper =  1.0;
+  if (interval.lower < 3.0 * M_PI / 2.0 && interval.upper > 3.0 * M_PI / 2.0) sinLower = -1.0;
+  return Interval1f(sinLower, sinUpper);
+}
+
+/*! Conservative bounds of cos over an interval.
+ *
+ *  Intervals at least pi wide return [-1,1]. Narrower ones are shifted by
+ *  multiples of 2*pi, the cosine is evaluated at both end points, and the
+ *  bound is widened to -1 / +1 when a minimum / maximum of the cosine lies
+ *  strictly inside the interval. */
+inline Interval1f cos(Interval1f interval)
+{
+  if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); }
+  if (interval.upper > TWO_PI)                 { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); }
+  if (interval.lower < 0)                      { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); }
+  /* Now lower is in [0,2*pi) (up to rounding) and upper < lower+pi, so the
+   * interval may straddle 2*pi, where the cosine reaches its maximum. The
+   * next minimum (3*pi) is out of reach. */
+  float cosLower = cos(interval.lower);
+  float cosUpper = cos(interval.upper);
+  if (cosLower > cosUpper) swap(cosLower, cosUpper);
+  if (interval.lower < M_PI && interval.upper > M_PI) cosLower = -1.0;
+  if (interval.lower < TWO_PI && interval.upper > TWO_PI) cosUpper = 1.0;
+  return Interval1f(cosLower, cosUpper);
+}
+#undef TWO_PI
+}
diff --git a/thirdparty/embree-aarch64/common/math/lbbox.h b/thirdparty/embree-aarch64/common/math/lbbox.h
new file mode 100644
index 0000000000..95df4a918d
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/lbbox.h
@@ -0,0 +1,289 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bbox.h"
+#include "range.h"
+
+namespace embree
+{
+  /*! Given the values v.first / v.second of a linear function at the ends of
+   *  the time range dt, extrapolates that function to times 0 and 1 and
+   *  returns the two resulting values. */
+  template<typename T>
+    __forceinline std::pair<T,T> globalLinear(const std::pair<T,T>& v, const BBox1f& dt)
+  {
+    const float inv_range = float(1.0f)/dt.size();
+    const T at_time0 = lerp(v.first,v.second,-dt.lower*inv_range);
+    const T at_time1 = lerp(v.first,v.second,(1.0f-dt.lower)*inv_range);
+    return std::make_pair(at_time0,at_time1);
+  }
+
+  /*! Linear bounding box: a pair of boxes (bounds0, bounds1) that are linearly
+   *  interpolated over a parameter t, with bounds0 at t=0 and bounds1 at t=1
+   *  (see interpolate()). The constructors taking a bounds function build a
+   *  pair whose interpolation encloses the boxes of all visited time steps. */
+  template<typename T>
+  struct LBBox
+  {
+  public:
+    /*! default constructor leaves both boxes in their default-constructed state */
+    __forceinline LBBox () {}
+
+    /*! conversion from a linear box over another value type */
+    template<typename T1>
+    __forceinline LBBox ( const LBBox<T1>& other )
+    : bounds0(other.bounds0), bounds1(other.bounds1) {} 
+
+    __forceinline LBBox& operator= ( const LBBox& other ) { 
+      bounds0 = other.bounds0; bounds1 = other.bounds1; return *this; 
+    }
+
+    /*! both boxes empty */
+    __forceinline LBBox (EmptyTy) 
+      : bounds0(EmptyTy()), bounds1(EmptyTy()) {}
+    
+    /*! constant bounds: the same box at both ends */
+    __forceinline explicit LBBox ( const BBox<T>& bounds) 
+      : bounds0(bounds), bounds1(bounds) { }
+    
+    __forceinline LBBox ( const BBox<T>& bounds0, const BBox<T>& bounds1) 
+      : bounds0(bounds0), bounds1(bounds1) { }
+
+    /*! Builds linear bounds from a sequence of boxes, treating entry i as the
+     *  box at parameter i/(size-1). Starts from the first and last box and
+     *  enlarges both until the interpolation contains every intermediate box.
+     *  The sequence must not be empty. */
+    LBBox ( const avector<BBox<T>>& bounds ) 
+    {
+      assert(bounds.size());
+      BBox<T> b0 = bounds.front();
+      BBox<T> b1 = bounds.back();
+      for (size_t i=1; i<bounds.size()-1; i++) {
+        const float f = float(i)/float(bounds.size()-1);
+        const BBox<T> bt = lerp(b0,b1,f);
+        /* dlower <= 0 and dupper >= 0: how far box i sticks out of the interpolated box */
+        const T dlower = min(bounds[i].lower-bt.lower,T(zero));
+        const T dupper = max(bounds[i].upper-bt.upper,T(zero));
+        /* apply the same correction at both ends */
+        b0.lower += dlower; b1.lower += dlower;
+        b0.upper += dupper; b1.upper += dupper;
+      }
+      bounds0 = b0;
+      bounds1 = b1;
+    }
+
+    /*! calculates the linear bounds of a primitive for the specified time range */
+    template<typename BoundsFunc>
+    __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range, float numTimeSegments)
+    {
+      /* time range expressed in units of time segments, plus the enclosing integer time steps */
+      const float lower = time_range.lower*numTimeSegments;
+      const float upper = time_range.upper*numTimeSegments;
+      const float ilowerf = floor(lower);
+      const float iupperf = ceil(upper);
+      const int ilower = (int)ilowerf;
+      const int iupper = (int)iupperf;
+
+      const BBox<T> blower0 = bounds(ilower);
+      const BBox<T> bupper1 = bounds(iupper);
+
+      /* the range lies inside a single segment: interpolate its two end boxes */
+      if (iupper-ilower == 1) {
+        bounds0 = lerp(blower0, bupper1, lower-ilowerf);
+        bounds1 = lerp(bupper1, blower0, iupperf-upper);
+        return;
+      }
+
+      /* boxes at the exact range ends, interpolated within the first and last segment */
+      const BBox<T> blower1 = bounds(ilower+1);
+      const BBox<T> bupper0 = bounds(iupper-1);
+      BBox<T> b0 = lerp(blower0, blower1, lower-ilowerf);
+      BBox<T> b1 = lerp(bupper1, bupper0, iupperf-upper);
+
+      /* enlarge both end boxes until the interpolation contains the box of every interior time step */
+      for (int i = ilower+1; i < iupper; i++)
+      {
+        const float f = (float(i)/numTimeSegments - time_range.lower) / time_range.size();
+        const BBox<T> bt = lerp(b0, b1, f);
+        const BBox<T> bi = bounds(i);
+        const T dlower = min(bi.lower-bt.lower, T(zero));
+        const T dupper = max(bi.upper-bt.upper, T(zero));
+        b0.lower += dlower; b1.lower += dlower;
+        b0.upper += dupper; b1.upper += dupper;
+      }
+
+      bounds0 = b0;
+      bounds1 = b1;
+    }
+
+    /*! calculates the linear bounds of a primitive for the specified time range */
+    template<typename BoundsFunc>
+    __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range_in, const BBox1f& geom_time_range, float geom_time_segments)
+    {
+      /* normalize global time_range_in to local geom_time_range */
+      const BBox1f time_range((time_range_in.lower-geom_time_range.lower)/geom_time_range.size(),
+                              (time_range_in.upper-geom_time_range.lower)/geom_time_range.size());
+        
+      const float lower = time_range.lower*geom_time_segments;
+      const float upper = time_range.upper*geom_time_segments;
+      const float ilowerf = floor(lower);
+      const float iupperf = ceil(upper);
+      /* time step indices clamped to the range [0,geom_time_segments] */
+      const float ilowerfc = max(0.0f,ilowerf);
+      const float iupperfc = min(iupperf,geom_time_segments);
+      const int   ilowerc = (int)ilowerfc;
+      const int   iupperc = (int)iupperfc;
+      assert(iupperc-ilowerc > 0);
+
+      /* this larger iteration range guarantees that we process borders of geom_time_range is (partially) inside time_range_in */
+      const int ilower_iter = max(-1,(int)ilowerf);
+      const int iupper_iter = min((int)iupperf,(int)geom_time_segments+1);
+        
+      const BBox<T> blower0 = bounds(ilowerc);
+      const BBox<T> bupper1 = bounds(iupperc);
+      /* only one segment to consider: interpolate its two end boxes (weights clamped at 0) */
+      if (iupper_iter-ilower_iter == 1) {
+        bounds0 = lerp(blower0, bupper1, max(0.0f,lower-ilowerfc));
+        bounds1 = lerp(bupper1, blower0, max(0.0f,iupperfc-upper));
+        return;
+      }
+
+      const BBox<T> blower1 = bounds(ilowerc+1);
+      const BBox<T> bupper0 = bounds(iupperc-1);
+      BBox<T> b0 = lerp(blower0, blower1, max(0.0f,lower-ilowerfc));
+      BBox<T> b1 = lerp(bupper1, bupper0, max(0.0f,iupperfc-upper));
+
+      /* enlarge both end boxes until the interpolation contains the box of every visited time step */
+      for (int i = ilower_iter+1; i < iupper_iter; i++)
+      {
+        const float f = (float(i)/geom_time_segments - time_range.lower) / time_range.size();
+        const BBox<T> bt = lerp(b0, b1, f);
+        const BBox<T> bi = bounds(i);
+        const T dlower = min(bi.lower-bt.lower, T(zero));
+        const T dupper = max(bi.upper-bt.upper, T(zero));
+        b0.lower += dlower; b1.lower += dlower;
+        b0.upper += dupper; b1.upper += dupper;
+      }
+
+      bounds0 = b0;
+      bounds1 = b1;
+    }
+
+    /*! calculates the linear bounds of a primitive for the specified time range */
+    /* integer variant: time_range holds time step indices; numTimeSegments is not used in the body */
+    template<typename BoundsFunc>
+    __forceinline LBBox(const BoundsFunc& bounds, const range<int>& time_range, int numTimeSegments)
+    {
+      const int ilower = time_range.begin();
+      const int iupper = time_range.end();
+
+      BBox<T> b0 = bounds(ilower);
+      BBox<T> b1 = bounds(iupper);
+
+      /* single segment: its two end boxes are the result */
+      if (iupper-ilower == 1)
+      {
+        bounds0 = b0;
+        bounds1 = b1;
+        return;
+      }
+  
+      /* enlarge both end boxes until the interpolation contains the box of every interior time step */
+      for (int i = ilower+1; i<iupper; i++)
+      {
+        const float f = float(i - time_range.begin()) / float(time_range.size());
+        const BBox<T> bt = lerp(b0, b1, f);
+        const BBox<T> bi = bounds(i);
+        const T dlower = min(bi.lower-bt.lower, T(zero));
+        const T dupper = max(bi.upper-bt.upper, T(zero));
+        b0.lower += dlower; b1.lower += dlower;
+        b0.upper += dupper; b1.upper += dupper;
+      }
+
+      bounds0 = b0;
+      bounds1 = b1;
+    }
+
+  public:
+
+    /*! true when the merged bounds of both ends are empty */
+    __forceinline bool empty() const {
+      return bounds().empty();
+    }
+
+    /*! box enclosing both end boxes */
+    __forceinline BBox<T> bounds () const {
+      return merge(bounds0,bounds1);
+    }
+
+    /*! box at parameter t, linearly interpolated between bounds0 and bounds1 */
+    __forceinline BBox<T> interpolate( const float t ) const {
+      return lerp(bounds0,bounds1,t);
+    }
+
+    /*! linear bounds restricted to the sub-range dt */
+    __forceinline LBBox<T> interpolate( const BBox1f& dt ) const {
+      return LBBox<T>(interpolate(dt.lower),interpolate(dt.upper));
+    }
+
+    /*! grows both end boxes so that they also contain the matching boxes of `other` */
+    __forceinline void extend( const LBBox& other ) {
+      bounds0.extend(other.bounds0);
+      bounds1.extend(other.bounds1);
+    }
+
+    /*! declared here; a definition is provided per value type by explicit specialisation */
+    __forceinline float expectedHalfArea() const;
+
+    __forceinline float expectedHalfArea(const BBox1f& dt) const {
+      return interpolate(dt).expectedHalfArea();
+    }
+
+    /*! mean of the half areas of the two end boxes */
+    __forceinline float expectedApproxHalfArea() const {
+      return 0.5f*(halfArea(bounds0) + halfArea(bounds1));
+    }
+
+    /* calculates bounds for [0,1] time range from bounds in dt time range */
+    __forceinline LBBox global(const BBox1f& dt) const 
+    {
+      const float rcp_dt_size = 1.0f/dt.size();
+      const BBox<T> b0 = interpolate(-dt.lower*rcp_dt_size);
+      const BBox<T> b1 = interpolate((1.0f-dt.lower)*rcp_dt_size);
+      return LBBox(b0,b1);
+    }
+
+    /*! Comparison Operators */
+    //template<typename TT> friend __forceinline bool operator==( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; }
+    //template<typename TT> friend __forceinline bool operator!=( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; }
+    friend __forceinline bool operator==( const LBBox& a, const LBBox& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; }
+    friend __forceinline bool operator!=( const LBBox& a, const LBBox& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; }
+    
+    /*! output operator */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const LBBox& box) {
+      return cout << "LBBox { " << box.bounds0 << "; " << box.bounds1 << " }";
+    }
+
+  public:
+    /*! boxes at the start (bounds0) and the end (bounds1) of the interpolation */
+    BBox<T> bounds0, bounds1;
+  };
+
+  /*! a linear box is valid when isvalid() holds for both of its end boxes */
+  template<typename T>
+    __forceinline bool isvalid( const LBBox<T>& v ) {
+    if (isvalid(v.bounds0)) {
+      if (isvalid(v.bounds1)) return true;
+    }
+    return false;
+  }
+
+  /*! true when isvalid_non_empty() holds for both end boxes */
+  template<typename T>
+    __forceinline bool isvalid_non_empty( const LBBox<T>& v ) {
+    if (isvalid_non_empty(v.bounds0)) {
+      if (isvalid_non_empty(v.bounds1)) return true;
+    }
+    return false;
+  }
+  
+  /*! Evaluates a0*b0 + (a0*db + da*b0)/2 + da*db/3 with da = a1-a0 and
+   *  db = b1-b0. This equals the integral over t in [0,1] of
+   *  (a0 + t*da)*(b0 + t*db), i.e. the mean product of two quantities that
+   *  vary linearly from (a0,b0) to (a1,b1). */
+  template<typename T>
+    __forceinline T expectedArea(const T& a0, const T& a1, const T& b0, const T& b1)
+  {
+    const T da = a1-a0;
+    const T db = b1-b0;
+    return a0*b0+(a0*db+da*b0)*T(0.5f) + da*db*T(1.0f/3.0f);
+  }
+  
+  /*! Vec3fa specialisation: sums expectedArea() over the three edge pairings
+   *  (x,y), (y,z) and (z,x) of the two end boxes' sizes. */
+  template<> __forceinline float LBBox<Vec3fa>::expectedHalfArea() const 
+  {
+    const Vec3fa size0 = bounds0.size();
+    const Vec3fa size1 = bounds1.size();
+    const Vec3fa edges0(size0.x,size0.y,size0.z);
+    const Vec3fa edges1(size1.x,size1.y,size1.z);
+    /* the same edges rotated by one component, pairing x with y, y with z, z with x */
+    const Vec3fa rotated0(size0.y,size0.z,size0.x);
+    const Vec3fa rotated1(size1.y,size1.z,size1.x);
+    return reduce_add(expectedArea(edges0,edges1,rotated0,rotated1));
+  }
+
+  /*! free-function form of LBBox::expectedApproxHalfArea() */
+  template<typename T>
+  __forceinline float expectedApproxHalfArea(const LBBox<T>& box) {
+    const float approx_half_area = box.expectedApproxHalfArea();
+    return approx_half_area;
+  }
+
+  /*! merges two linear boxes end by end: start boxes together, end boxes together */
+  template<typename T>
+  __forceinline LBBox<T> merge(const LBBox<T>& a, const LBBox<T>& b) {
+    const BBox<T> merged_start = merge(a.bounds0, b.bounds0);
+    const BBox<T> merged_end = merge(a.bounds1, b.bounds1);
+    return LBBox<T>(merged_start, merged_end);
+  }
+
+  /*! subset relation: holds when both end boxes of a are subsets of the matching end boxes of b */
+  template<typename T> __inline bool subset( const LBBox<T>& a, const LBBox<T>& b ) {
+    if (!subset(a.bounds0,b.bounds0)) return false;
+    return subset(a.bounds1,b.bounds1);
+  }
+
+  /*! default template instantiations */
+  /* shorthand names for linear boxes over float and the vector types below */
+  typedef LBBox<float> LBBox1f;
+  typedef LBBox<Vec2f> LBBox2f;
+  typedef LBBox<Vec3f> LBBox3f;
+  typedef LBBox<Vec3fa> LBBox3fa;
+  typedef LBBox<Vec3fx> LBBox3fx;
+}
diff --git a/thirdparty/embree-aarch64/common/math/linearspace2.h b/thirdparty/embree-aarch64/common/math/linearspace2.h
new file mode 100644
index 0000000000..b9a382962c
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/linearspace2.h
@@ -0,0 +1,148 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec2.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// 2D Linear Transform (2x2 Matrix)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct LinearSpace2
+  {
+    typedef T Vector;
+    typedef typename T::Scalar Scalar;
+
+    /*! default matrix constructor (vx and vy are left to Vector's own default construction) */
+    __forceinline LinearSpace2           ( ) {}
+    __forceinline LinearSpace2           ( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; }
+    __forceinline LinearSpace2& operator=( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; return *this; }
+
+    template<typename L1> __forceinline LinearSpace2( const LinearSpace2<L1>& s ) : vx(s.vx), vy(s.vy) {}
+
+    /*! matrix construction from column vectors */
+    __forceinline LinearSpace2(const Vector& vx, const Vector& vy)
+      : vx(vx), vy(vy) {}
+
+    /*! matrix construction from row major data (mRC = row R, column C) */
+    __forceinline LinearSpace2(const Scalar& m00, const Scalar& m01, 
+                               const Scalar& m10, const Scalar& m11)
+      : vx(m00,m10), vy(m01,m11) {}
+
+    /*! compute the determinant of the matrix */
+    __forceinline const Scalar det() const { return vx.x*vy.y - vx.y*vy.x; }
+
+    /*! compute adjoint matrix */
+    __forceinline const LinearSpace2 adjoint() const { return LinearSpace2(vy.y,-vy.x,-vx.y,vx.x); }
+
+    /*! compute inverse matrix as adjoint divided by determinant (no check for a zero determinant) */
+    __forceinline const LinearSpace2 inverse() const { return adjoint()/det(); }
+
+    /*! compute transposed matrix (column entries are fed to the row major constructor) */
+    __forceinline const LinearSpace2 transposed() const { return LinearSpace2(vx.x,vx.y,vy.x,vy.y); }
+
+    /*! returns first row of matrix */
+    __forceinline Vector row0() const { return Vector(vx.x,vy.x); }
+
+    /*! returns second row of matrix */
+    __forceinline Vector row1() const { return Vector(vx.y,vy.y); }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline LinearSpace2( ZeroTy ) : vx(zero), vy(zero) {}
+    __forceinline LinearSpace2( OneTy ) : vx(one, zero), vy(zero, one) {}
+
+    /*! return matrix for scaling (s.x and s.y on the diagonal, zeros elsewhere) */
+    static __forceinline LinearSpace2 scale(const Vector& s) {
+      return LinearSpace2(s.x,   0,
+                          0  , s.y);
+    }
+
+    /*! return matrix for rotation by angle r (passed to sin and cos) */
+    static __forceinline LinearSpace2 rotate(const Scalar& r) {
+      Scalar s = sin(r), c = cos(r);
+      return LinearSpace2(c, -s,
+                          s,  c);
+    }
+
+    /*! return closest orthogonal matrix (i.e. a general rotation including reflection) */
+    LinearSpace2 orthogonal() const 
+    {
+      LinearSpace2 m = *this;
+
+      // mirrored? a negative determinant is handled by flipping vx and remembering the sign
+      Scalar mirror(one);
+      if (m.det() < Scalar(zero)) {
+        m.vx = -m.vx;
+        mirror = -mirror;
+      }
+
+      // rotation: repeatedly average m with the inverse of its transpose, at most 99 times
+      for (int i = 0; i < 99; i++) {
+        const LinearSpace2 m_next = 0.5 * (m + m.transposed().inverse());
+        const LinearSpace2 d = m_next - m;
+        m = m_next;
+        // norm^2 of difference small enough? (larger of the two squared column lengths below 1e-8)
+        if (max(dot(d.vx, d.vx), dot(d.vy, d.vy)) < 1e-8)
+          break;
+      }
+
+      // rotation * mirror_x (re-applies the sign flip recorded above to vx)
+      return LinearSpace2(mirror*m.vx, m.vy);
+    }
+
+  public:
+
+    /*! the column vectors of the matrix */
+    Vector vx,vy;
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a ) { return LinearSpace2<T>(-a.vx,-a.vy); } // negates both columns
+  template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a ) { return LinearSpace2<T>(+a.vx,+a.vy); } // unary plus applied per column
+  template<typename T> __forceinline LinearSpace2<T> rcp       ( const LinearSpace2<T>& a ) { return a.inverse(); } // reciprocal of a matrix is its inverse
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx+b.vx,a.vy+b.vy); } // column-wise sum
+  template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx-b.vx,a.vy-b.vy); } // column-wise difference
+
+  template<typename T> __forceinline LinearSpace2<T> operator*(const typename T::Scalar & a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); } // scalar * matrix
+  template<typename T> __forceinline T               operator*(const LinearSpace2<T>& a, const T              & b) { return b.x*a.vx + b.y*a.vy; } // matrix * vector: linear combination of the columns
+  template<typename T> __forceinline LinearSpace2<T> operator*(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); } // matrix * matrix: a applied to each column of b
+
+  template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const typename T::Scalar & b) { return LinearSpace2<T>(a.vx/b, a.vy/b); } // matrix / scalar
+  template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return a * rcp(b); } // matrix / matrix: multiply by the inverse of b
+
+  template<typename T> __forceinline LinearSpace2<T>& operator *=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a * b; }
+  template<typename T> __forceinline LinearSpace2<T>& operator /=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx == b.vx && a.vy == b.vy; } // equal when both columns compare equal
+  template<typename T> __forceinline bool operator !=( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx != b.vx || a.vy != b.vy; } // unequal when either column differs
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace2<T>& m) { // prints both columns and returns the stream
+    return cout << "{ vx = " << m.vx << ", vy = " << m.vy << "}";
+  }
+
+  /*! Shortcuts for common 2D linear spaces over Vec2f and Vec2fa. */
+  typedef LinearSpace2<Vec2f> LinearSpace2f;
+  typedef LinearSpace2<Vec2fa> LinearSpace2fa;
+}
diff --git a/thirdparty/embree-aarch64/common/math/linearspace3.h b/thirdparty/embree-aarch64/common/math/linearspace3.h
new file mode 100644
index 0000000000..12b5bb776b
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/linearspace3.h
@@ -0,0 +1,213 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec3.h"
+#include "quaternion.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// 3D Linear Transform (3x3 Matrix)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct LinearSpace3
+  {
+    typedef T Vector;
+    typedef typename T::Scalar Scalar;
+
+    /*! default matrix constructor (vx, vy and vz are left to Vector's own default construction) */
+    __forceinline LinearSpace3           ( ) {}
+    __forceinline LinearSpace3           ( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; }
+    __forceinline LinearSpace3& operator=( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; return *this; }
+
+    template<typename L1> __forceinline LinearSpace3( const LinearSpace3<L1>& s ) : vx(s.vx), vy(s.vy), vz(s.vz) {}
+
+    /*! matrix construction from column vectors */
+    __forceinline LinearSpace3(const Vector& vx, const Vector& vy, const Vector& vz)
+      : vx(vx), vy(vy), vz(vz) {}
+
+    /*! construction from quaternion with components r (real), i, j, k */
+    __forceinline LinearSpace3( const QuaternionT<Scalar>& q )
+      : vx((q.r*q.r + q.i*q.i - q.j*q.j - q.k*q.k), 2.0f*(q.i*q.j + q.r*q.k), 2.0f*(q.i*q.k - q.r*q.j))
+      , vy(2.0f*(q.i*q.j - q.r*q.k), (q.r*q.r - q.i*q.i + q.j*q.j - q.k*q.k), 2.0f*(q.j*q.k + q.r*q.i))
+      , vz(2.0f*(q.i*q.k + q.r*q.j), 2.0f*(q.j*q.k - q.r*q.i), (q.r*q.r - q.i*q.i - q.j*q.j + q.k*q.k)) {}
+
+    /*! matrix construction from row major data (mRC = row R, column C) */
+    __forceinline LinearSpace3(const Scalar& m00, const Scalar& m01, const Scalar& m02,
+                               const Scalar& m10, const Scalar& m11, const Scalar& m12,
+                               const Scalar& m20, const Scalar& m21, const Scalar& m22)
+      : vx(m00,m10,m20), vy(m01,m11,m21), vz(m02,m12,m22) {}
+
+    /*! compute the determinant of the matrix (triple product of the columns) */
+    __forceinline const Scalar det() const { return dot(vx,cross(vy,vz)); }
+
+    /*! compute adjoint matrix */
+    __forceinline const LinearSpace3 adjoint() const { return LinearSpace3(cross(vy,vz),cross(vz,vx),cross(vx,vy)).transposed(); }
+
+    /*! compute inverse matrix as adjoint divided by determinant (no check for a zero determinant) */
+    __forceinline const LinearSpace3 inverse() const { return adjoint()/det(); }
+
+    /*! compute transposed matrix (column entries are fed to the row major constructor) */
+    __forceinline const LinearSpace3 transposed() const { return LinearSpace3(vx.x,vx.y,vx.z,vy.x,vy.y,vy.z,vz.x,vz.y,vz.z); }
+
+    /*! returns first row of matrix */
+    __forceinline Vector row0() const { return Vector(vx.x,vy.x,vz.x); }
+
+    /*! returns second row of matrix */
+    __forceinline Vector row1() const { return Vector(vx.y,vy.y,vz.y); }
+
+    /*! returns third row of matrix */
+    __forceinline Vector row2() const { return Vector(vx.z,vy.z,vz.z); }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline LinearSpace3( ZeroTy ) : vx(zero), vy(zero), vz(zero) {}
+    __forceinline LinearSpace3( OneTy ) : vx(one, zero, zero), vy(zero, one, zero), vz(zero, zero, one) {}
+
+    /*! return matrix for scaling (s.x, s.y, s.z on the diagonal, zeros elsewhere) */
+    static __forceinline LinearSpace3 scale(const Vector& s) {
+      return LinearSpace3(s.x,   0,   0,
+                          0  , s.y,   0,
+                          0  ,   0, s.z);
+    }
+
+    /*! return matrix for rotation around arbitrary axis _u (normalized here) by angle r */
+    static __forceinline LinearSpace3 rotate(const Vector& _u, const Scalar& r) {
+      Vector u = normalize(_u);
+      Scalar s = sin(r), c = cos(r);
+      return LinearSpace3(u.x*u.x+(1-u.x*u.x)*c,  u.x*u.y*(1-c)-u.z*s,    u.x*u.z*(1-c)+u.y*s,
+                          u.x*u.y*(1-c)+u.z*s,    u.y*u.y+(1-u.y*u.y)*c,  u.y*u.z*(1-c)-u.x*s,
+                          u.x*u.z*(1-c)-u.y*s,    u.y*u.z*(1-c)+u.x*s,    u.z*u.z+(1-u.z*u.z)*c);
+    }
+
+  public:
+
+    /*! the column vectors of the matrix */
+    Vector vx,vy,vz;
+  };
+
+  /*! compute transposed matrix; Vec3fa specialization that reinterprets the columns as vfloat4 and passes a zero vector as the fourth input to transpose */
+  template<> __forceinline const LinearSpace3<Vec3fa> LinearSpace3<Vec3fa>::transposed() const { 
+    vfloat4 rx,ry,rz; transpose((vfloat4&)vx,(vfloat4&)vy,(vfloat4&)vz,vfloat4(zero),rx,ry,rz);
+    return LinearSpace3<Vec3fa>(Vec3fa(rx),Vec3fa(ry),Vec3fa(rz)); 
+  }
+
+  template<typename T> // free-function form that forwards to the member transposed()
+    __forceinline const LinearSpace3<T> transposed(const LinearSpace3<T>& xfm) { 
+    return xfm.transposed();
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a ) { return LinearSpace3<T>(-a.vx,-a.vy,-a.vz); } // negates all three columns
+  template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a ) { return LinearSpace3<T>(+a.vx,+a.vy,+a.vz); } // unary plus applied per column
+  template<typename T> __forceinline LinearSpace3<T> rcp       ( const LinearSpace3<T>& a ) { return a.inverse(); } // reciprocal of a matrix is its inverse
+
+  /* constructs a coordinate frame from a normalized normal; N becomes the third column */
+  template<typename T> __forceinline LinearSpace3<T> frame(const T& N) 
+  {
+    const T dx0(0,N.z,-N.y); // candidate x-axis; its dot product with N is zero
+    const T dx1(-N.z,0,N.x); // second candidate; its dot product with N is also zero
+    const T dx = normalize(select(dot(dx0,dx0) > dot(dx1,dx1),dx0,dx1)); // keep the longer candidate
+    const T dy = normalize(cross(N,dx));
+    return LinearSpace3<T>(dx,dy,N);
+  }
+
+  /* builds a coordinate frame whose third column is N, using dxi as a hint for the x-direction */
+  template<typename T> __forceinline LinearSpace3<T> frame(const T& N, const T& dxi)
+  {
+    if (abs(dot(dxi,N)) > 0.99f) return frame(N); // hint nearly parallel to N: ignore it and use the single-argument frame
+    const T xAxis = normalize(cross(dxi,N));
+    const T yAxis = normalize(cross(N,xAxis));
+    return LinearSpace3<T>(xAxis,yAxis,N);
+  }
+  
+  /* clamps each column of the linear space to the range -1 to +1 */
+  template<typename T> __forceinline LinearSpace3<T> clamp(const LinearSpace3<T>& space) {
+    return LinearSpace3<T>(clamp(space.vx,T(-1.0f),T(1.0f)),
+                           clamp(space.vy,T(-1.0f),T(1.0f)),
+                           clamp(space.vz,T(-1.0f),T(1.0f)));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx+b.vx,a.vy+b.vy,a.vz+b.vz); } // column-wise sum
+  template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx-b.vx,a.vy-b.vy,a.vz-b.vz); } // column-wise difference
+
+  template<typename T> __forceinline LinearSpace3<T> operator*(const typename T::Scalar & a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); } // scalar * matrix
+  template<typename T> __forceinline T               operator*(const LinearSpace3<T>& a, const T              & b) { return madd(T(b.x),a.vx,madd(T(b.y),a.vy,T(b.z)*a.vz)); } // matrix * vector: linear combination of the columns
+  template<typename T> __forceinline LinearSpace3<T> operator*(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); } // matrix * matrix: a applied to each column of b
+
+  template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const typename T::Scalar & b) { return LinearSpace3<T>(a.vx/b, a.vy/b, a.vz/b); } // matrix / scalar
+  template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return a * rcp(b); } // matrix / matrix: multiply by the inverse of b
+
+  template<typename T> __forceinline LinearSpace3<T>& operator *=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a * b; }
+  template<typename T> __forceinline LinearSpace3<T>& operator /=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a / b; }
+
+  template<typename T> __forceinline T       xfmPoint (const LinearSpace3<T>& s, const T      & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } // same expression as xfmVector
+  template<typename T> __forceinline T       xfmVector(const LinearSpace3<T>& s, const T      & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } // a.x*vx + a.y*vy + a.z*vz
+  template<typename T> __forceinline T       xfmNormal(const LinearSpace3<T>& s, const T      & a) { return xfmVector(s.inverse().transposed(),a); } // transforms by the inverse transpose of s
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; } // equal when all three columns compare equal
+  template<typename T> __forceinline bool operator !=( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; } // unequal when any column differs
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace3<T> select ( const typename T::Scalar::Bool& s, const LinearSpace3<T>& t, const LinearSpace3<T>& f ) { // applies select(s, ., .) to each column pair of t and f
+    return LinearSpace3<T>(select(s,t.vx,f.vx),select(s,t.vy,f.vy),select(s,t.vz,f.vz));
+  }
+
+  /*! blending: applies lerp with a scalar float parameter t to each column pair of l0 and l1 */
+  template<typename T>
+    __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0, const LinearSpace3<T>& l1, const float t) 
+  {
+    return LinearSpace3<T>(lerp(l0.vx,l1.vx,t),
+                           lerp(l0.vy,l1.vy,t),
+                           lerp(l0.vz,l1.vz,t));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace3<T>& m) { // prints the three columns and returns the stream
+    return cout << "{ vx = " << m.vx << ", vy = " << m.vy << ", vz = " << m.vz << "}";
+  }
+
+  /*! Shortcuts for common 3D linear spaces: scalar vector types first, then vfloat<N>-based ones. */
+  typedef LinearSpace3<Vec3f> LinearSpace3f;
+  typedef LinearSpace3<Vec3fa> LinearSpace3fa;
+  typedef LinearSpace3<Vec3fx> LinearSpace3fx;
+  typedef LinearSpace3<Vec3ff> LinearSpace3ff;
+
+  template<int N> using LinearSpace3vf = LinearSpace3<Vec3<vfloat<N>>>; // width-generic alias; fixed widths 4, 8 and 16 follow
+  typedef LinearSpace3<Vec3<vfloat<4>>>  LinearSpace3vf4;
+  typedef LinearSpace3<Vec3<vfloat<8>>>  LinearSpace3vf8;
+  typedef LinearSpace3<Vec3<vfloat<16>>> LinearSpace3vf16;
+
+  /*! blending with a parameter t of arbitrary type S, applied to each column pair of l0 and l1 */
+  template<typename T, typename S>
+    __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0,
+                                       const LinearSpace3<T>& l1,
+                                       const S& t)
+  {
+    return LinearSpace3<T>(lerp(l0.vx,l1.vx,t),
+                           lerp(l0.vy,l1.vy,t),
+                           lerp(l0.vz,l1.vz,t));
+  }
+
+}
diff --git a/thirdparty/embree-aarch64/common/math/math.h b/thirdparty/embree-aarch64/common/math/math.h
new file mode 100644
index 0000000000..6d54abd44d
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/math.h
@@ -0,0 +1,451 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/intrinsics.h"
+#include "constants.h"
+#include <cmath>
+
+#if defined(__ARM_NEON)
+#include "SSE2NEON.h"
+#if defined(NEON_AVX2_EMULATION)
+#include "AVX2NEON.h"
+#endif
+#else
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#include <immintrin.h>
+#endif
+
+#if defined(__WIN32__) && !defined(__MINGW32__)
+#if defined(_MSC_VER) && (_MSC_VER <= 1700) // shims only for MSVC up to 1700; newer toolchains are expected to provide these in <cmath>
+namespace std
+{
+  __forceinline bool isinf ( const float x ) { return _finite(x) == 0 && _isnan(x) == 0; } // not finite and not NaN
+  __forceinline bool isnan ( const float x ) { return _isnan(x) != 0; }
+  __forceinline bool isfinite (const float x) { return _finite(x) != 0; }
+}
+#endif
+#endif
+
+namespace embree
+{
+  __forceinline bool isvalid ( const float& v ) { // true when v lies strictly between -FLT_LARGE and +FLT_LARGE; NaN fails both comparisons
+    return (v > -FLT_LARGE) & (v < +FLT_LARGE);
+  }
+
+  __forceinline int cast_f2i(float f) { // returns the bit pattern of f as an int (written and read through a union)
+    union { float f; int i; } v; v.f = f; return v.i;
+  }
+
+  __forceinline float cast_i2f(int i) { // returns the float whose bit pattern is i (written and read through a union)
+    union { float f; int i; } v; v.i = i; return v.f;
+  }
+
+  __forceinline int   toInt  (const float& a) { return int(a); } // value conversion (truncating), not a bit cast
+  __forceinline float toFloat(const int&   a) { return float(a); } // value conversion, not a bit cast
+
+#if defined(__WIN32__) && !defined(__MINGW32__)
+  __forceinline bool finite ( const float x ) { return _finite(x) != 0; } // Windows-only wrapper around _finite
+#endif
+
+  __forceinline float sign ( const float x ) { return x<0?-1.0f:1.0f; } // -1 for negative x, +1 otherwise (including zero)
+  __forceinline float sqr  ( const float x ) { return x*x; }
+
+  __forceinline float rcp  ( const float x ) // reciprocal of x: hardware estimate followed by refinement
+  {
+#if defined(__aarch64__)
+      // Move scalar to lane 0 of a vector register (the other lanes are never assigned) and do rcp.
+      __m128 a;
+      a[0] = x;
+      float32x4_t reciprocal = vrecpeq_f32(a); // initial estimate
+      reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); // first refinement step
+      reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); // second refinement step
+      return reciprocal[0];
+#else
+
+    const __m128 a = _mm_set_ss(x);
+
+#if defined(__AVX512VL__)
+    const __m128 r = _mm_rcp14_ss(_mm_set_ss(0.0f),a);
+#else
+    const __m128 r = _mm_rcp_ss(a);
+#endif
+
+#if defined(__AVX2__)
+    return _mm_cvtss_f32(_mm_mul_ss(r,_mm_fnmadd_ss(r, a, _mm_set_ss(2.0f)))); // r*(2 - r*a) with a fused negate-multiply-add
+#else
+    return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a)))); // r*(2 - r*a)
+#endif
+
+#endif  //defined(__aarch64__)
+  }
+
+  __forceinline float signmsk ( const float x ) { // bitwise AND of x with 0x80000000, i.e. keeps only the top bit
+#if defined(__aarch64__)
+      // FP and Neon shares same vector register in arm64; only lane 0 is assigned and read
+      __m128 a;
+      __m128i b;
+      a[0] = x;
+      b[0] = 0x80000000;
+      a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
+      return a[0];
+#else
+    return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+#endif
+  }
+  __forceinline float xorf( const float x, const float y ) { // bitwise XOR of the two float bit patterns
+#if defined(__aarch64__)
+      // FP and Neon shares same vector register in arm64; only lane 0 is assigned and read
+      __m128 a;
+      __m128 b;
+      a[0] = x;
+      b[0] = y;
+      a = _mm_xor_ps(a, b);
+      return a[0];
+#else
+    return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
+#endif
+  }
+  __forceinline float andf( const float x, const unsigned y ) { // bitwise AND of the bit pattern of x with the integer mask y
+#if defined(__aarch64__) 
+      // FP and Neon shares same vector register in arm64; only lane 0 is assigned and read
+      __m128 a;
+      __m128i b;
+      a[0] = x;
+      b[0] = y;
+      a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
+      return a[0];
+#else
+    return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
+#endif
+  }
+  __forceinline float rsqrt( const float x ) // reciprocal square root of x: hardware estimate followed by refinement
+  {
+#if defined(__aarch64__)
+      // FP and Neon shares same vector register in arm64; only lane 0 is assigned and read
+      __m128 a;
+      a[0] = x;
+      __m128 value = _mm_rsqrt_ps(a); // initial estimate
+      value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); // first refinement step
+      value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); // second refinement step
+      return value[0];
+#else
+
+    const __m128 a = _mm_set_ss(x);
+#if defined(__AVX512VL__)
+    const __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a);
+#else
+    const __m128 r = _mm_rsqrt_ss(a);
+#endif
+    const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), // 1.5*r + (-0.5*a*r)*(r*r)
+                                _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
+    return _mm_cvtss_f32(c);
+#endif
+  }
+
+#if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700) && !defined(__MINGW32__) // fallbacks for MSVC up to 1700 only
+  __forceinline float nextafter(float x, float y) { if ((x<y) == (x>0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); } // coarse stand-in: scales x rather than stepping to the adjacent float
+  __forceinline double nextafter(double x, double y) { return _nextafter(x, y); }
+  __forceinline int roundf(float f) { return (int)(f + 0.5f); } // adds 0.5 and truncates toward zero
+#else
+  __forceinline float nextafter(float x, float y) { return ::nextafterf(x, y); } // forwards to the C library
+  __forceinline double nextafter(double x, double y) { return ::nextafter(x, y); }
+#endif
+
+  __forceinline float abs  ( const float x ) { return ::fabsf(x); } // this run: float overloads forwarding to the global single-precision C functions
+  __forceinline float acos ( const float x ) { return ::acosf (x); }
+  __forceinline float asin ( const float x ) { return ::asinf (x); }
+  __forceinline float atan ( const float x ) { return ::atanf (x); }
+  __forceinline float atan2( const float y, const float x ) { return ::atan2f(y, x); }
+  __forceinline float cos  ( const float x ) { return ::cosf  (x); }
+  __forceinline float cosh ( const float x ) { return ::coshf (x); }
+  __forceinline float exp  ( const float x ) { return ::expf  (x); }
+  __forceinline float fmod ( const float x, const float y ) { return ::fmodf (x, y); }
+  __forceinline float log  ( const float x ) { return ::logf  (x); }
+  __forceinline float log10( const float x ) { return ::log10f(x); }
+  __forceinline float pow  ( const float x, const float y ) { return ::powf  (x, y); }
+  __forceinline float sin  ( const float x ) { return ::sinf  (x); }
+  __forceinline float sinh ( const float x ) { return ::sinhf (x); }
+  __forceinline float sqrt ( const float x ) { return ::sqrtf (x); }
+  __forceinline float tan  ( const float x ) { return ::tanf  (x); }
+  __forceinline float tanh ( const float x ) { return ::tanhf (x); }
+  __forceinline float floor( const float x ) { return ::floorf (x); }
+  __forceinline float ceil ( const float x ) { return ::ceilf (x); }
+  __forceinline float frac ( const float x ) { return x-floor(x); } // x minus floor(x)
+
+  __forceinline double abs  ( const double x ) { return ::fabs(x); } // this run: double overloads forwarding to the global C functions
+  __forceinline double sign ( const double x ) { return x<0?-1.0:1.0; } // -1 for negative x, +1 otherwise (including zero)
+  __forceinline double acos ( const double x ) { return ::acos (x); }
+  __forceinline double asin ( const double x ) { return ::asin (x); }
+  __forceinline double atan ( const double x ) { return ::atan (x); }
+  __forceinline double atan2( const double y, const double x ) { return ::atan2(y, x); }
+  __forceinline double cos  ( const double x ) { return ::cos  (x); }
+  __forceinline double cosh ( const double x ) { return ::cosh (x); }
+  __forceinline double exp  ( const double x ) { return ::exp  (x); }
+  __forceinline double fmod ( const double x, const double y ) { return ::fmod (x, y); }
+  __forceinline double log  ( const double x ) { return ::log  (x); }
+  __forceinline double log10( const double x ) { return ::log10(x); }
+  __forceinline double pow  ( const double x, const double y ) { return ::pow  (x, y); }
+  __forceinline double rcp  ( const double x ) { return 1.0/x; } // plain division, unlike the float overload's estimate-and-refine path
+  __forceinline double rsqrt( const double x ) { return 1.0/::sqrt(x); } // plain division by sqrt
+  __forceinline double sin  ( const double x ) { return ::sin  (x); }
+  __forceinline double sinh ( const double x ) { return ::sinh (x); }
+  __forceinline double sqr  ( const double x ) { return x*x; }
+  __forceinline double sqrt ( const double x ) { return ::sqrt (x); }
+  __forceinline double tan  ( const double x ) { return ::tan  (x); }
+  __forceinline double tanh ( const double x ) { return ::tanh (x); }
+  __forceinline double floor( const double x ) { return ::floor (x); }
+  __forceinline double ceil ( const double x ) { return ::ceil (x); }
+
+#if defined(__aarch64__) 
+    __forceinline float mini(float a, float b) {
+        // FP and Neon shares same vector register in arm64; this branch uses a float minimum on lane 0
+        __m128 x;
+        __m128 y;
+        x[0] = a;
+        y[0] = b;
+        x = _mm_min_ps(x, y);
+        return x[0];
+    }
+#elif defined(__SSE4_1__)
+  __forceinline float mini(float a, float b) { // minimum taken on the raw bit patterns as signed 32-bit integers
+    const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
+    const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
+    const __m128i ci = _mm_min_epi32(ai,bi); // NOTE(review): integer order matches float order only for non-negative inputs - confirm callers
+    return _mm_cvtss_f32(_mm_castsi128_ps(ci));
+  }
+#endif
+
+#if defined(__aarch64__) 
+    __forceinline float maxi(float a, float b) {
+        // FP and Neon shares same vector register in arm64; this branch uses a float maximum on lane 0
+        __m128 x;
+        __m128 y;
+        x[0] = a;
+        y[0] = b;
+        x = _mm_max_ps(x, y);
+        return x[0];
+    }
+#elif defined(__SSE4_1__)
+  __forceinline float maxi(float a, float b) { // maximum taken on the raw bit patterns as signed 32-bit integers
+    const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
+    const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
+    const __m128i ci = _mm_max_epi32(ai,bi); // NOTE(review): integer order matches float order only for non-negative inputs - confirm callers
+    return _mm_cvtss_f32(_mm_castsi128_ps(ci));
+  }
+#endif
+
+  template<typename T> // doubles a by adding it to itself, so T only needs operator+
+    __forceinline T twice(const T& a) { return a+a; }
+
+  __forceinline      int min(int      a, int      b) { return a<b ? a:b; } // two-argument minimum; returns b when a<b is false
+  __forceinline unsigned min(unsigned a, unsigned b) { return a<b ? a:b; }
+  __forceinline  int64_t min(int64_t  a, int64_t  b) { return a<b ? a:b; }
+  __forceinline    float min(float    a, float    b) { return a<b ? a:b; }
+  __forceinline   double min(double   a, double   b) { return a<b ? a:b; }
+#if defined(__X86_64__) || defined(__aarch64__)
+  __forceinline   size_t min(size_t   a, size_t   b) { return a<b ? a:b; } // only declared on the 64-bit targets named in the guard
+#endif
+
+  template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); } // 3-, 4- and 5-argument forms built from the two-argument min
+  template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); }
+  template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d, const T& e) { return min(min(min(a,b),min(c,d)),e); }
+
+  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c) { return mini(mini(a,b),c); } // same reductions built from the two-argument mini
+  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d) { return mini(mini(a,b),mini(c,d)); }
+  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d, const T& e) { return mini(mini(mini(a,b),mini(c,d)),e); }
+
+  __forceinline      int max(int      a, int      b) { return a<b ? b:a; } // two-argument maximum; returns a when a<b is false
+  __forceinline unsigned max(unsigned a, unsigned b) { return a<b ? b:a; }
+  __forceinline  int64_t max(int64_t  a, int64_t  b) { return a<b ? b:a; }
+  __forceinline    float max(float    a, float    b) { return a<b ? b:a; }
+  __forceinline   double max(double   a, double   b) { return a<b ? b:a; }
+#if defined(__X86_64__) || defined(__aarch64__)
+  __forceinline   size_t max(size_t   a, size_t   b) { return a<b ? b:a; } // only declared on the 64-bit targets named in the guard
+#endif
+
+  template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); } // 3-, 4- and 5-argument forms built from the two-argument max
+  template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); }
+  template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d, const T& e) { return max(max(max(a,b),max(c,d)),e); }
+
+  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c) { return maxi(maxi(a,b),c); } // same reductions built from the two-argument maxi
+  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d) { return maxi(maxi(a,b),maxi(c,d)); }
+  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d, const T& e) { return maxi(maxi(maxi(a,b),maxi(c,d)),e); }
+
+#if defined(__MACOSX__) // extra ssize_t overloads, declared on macOS only
+  __forceinline ssize_t min(ssize_t a, ssize_t b) { return a<b ? a:b; }
+  __forceinline ssize_t max(ssize_t a, ssize_t b) { return a<b ? b:a; }
+#endif
+
+#if defined(__MACOSX__) && !defined(__INTEL_COMPILER) // macOS (non-Intel compiler): forward to __sincosf
+  __forceinline void sincosf(float x, float *sin, float *cos) {
+    __sincosf(x,sin,cos);
+  }
+#endif
+
+#if defined(__WIN32__) || defined(__FreeBSD__) // Windows and FreeBSD: compute sine and cosine with two separate calls
+  __forceinline void sincosf(float x, float *s, float *c) {
+    *s = sinf(x); *c = cosf(x);
+  }
+#endif
+
+  template<typename T> __forceinline T clamp(const T& x, const T& lower = T(zero), const T& upper = T(one)) { return max(min(x,upper),lower); } // defaults clamp to [0,1]; lower wins if lower > upper
+  template<typename T> __forceinline T clampz(const T& x, const T& upper) { return max(T(zero), min(x,upper)); } // clamp to [0,upper]
+
+  template<typename T> __forceinline T  deg2rad ( const T& x )  { return x * T(1.74532925199432957692e-2f); } // multiplies by pi/180
+  template<typename T> __forceinline T  rad2deg ( const T& x )  { return x * T(5.72957795130823208768e1f); } // multiplies by 180/pi
+  template<typename T> __forceinline T  sin2cos ( const T& x )  { return sqrt(max(T(zero),T(one)-x*x)); } // sqrt(1 - x*x), with the radicand clamped at zero
+  template<typename T> __forceinline T  cos2sin ( const T& x )  { return sin2cos(x); } // same formula as sin2cos
+
+#if defined(__AVX2__) // scalar fused multiply-add variants via FMA intrinsics
+  __forceinline float madd  ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
+  __forceinline float msub  ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
+  __forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
+  __forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
+#elif defined (__aarch64__) && defined(__clang__)
+#pragma clang fp contract(fast)
+
+
+__forceinline float madd  ( const float a, const float b, const float c) { return a*b + c; } //  a*b + c
+__forceinline float msub  ( const float a, const float b, const float c) { return a*b - c; } //  a*b - c
+__forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; } // -a*b + c
+__forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); } // -a*b - c
+
+#pragma clang fp contract(on)
+#else // generic fallback: plain multiply and add with the same sign conventions
+  __forceinline float madd  ( const float a, const float b, const float c) { return a*b+c; }
+  __forceinline float msub  ( const float a, const float b, const float c) { return a*b-c; }
+  __forceinline float nmadd ( const float a, const float b, const float c) { return -a*b+c;}
+  __forceinline float nmsub ( const float a, const float b, const float c) { return -a*b-c; }
+#endif
+
+  /*! random functions built on rand(); the unspecialized template just returns T(0) */
+  template<typename T> T random() { return T(0); }
+#if defined(_WIN32) // Windows branch: XORs three rand() results shifted by 0, 8 and 16 bits
+  template<> __forceinline int      random() { return int(rand()) ^ (int(rand()) << 8) ^ (int(rand()) << 16); }
+  template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 8) ^ (uint32_t(rand()) << 16); }
+#else // other platforms: one rand() for int, two rand() results (shift 0 and 16) for uint32_t
+  template<> __forceinline int      random() { return int(rand()); }
+  template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 16); }
+#endif
+  template<> __forceinline float  random() { return rand()/float(RAND_MAX); } // rand() scaled by 1/RAND_MAX, so 1.0 is a possible result
+  template<> __forceinline double random() { return rand()/double(RAND_MAX); } // rand() scaled by 1/RAND_MAX, so 1.0 is a possible result
+
+#if defined(_WIN32) // Windows has no drand48/srand48; provide replacements built on rand()/srand()
+  __forceinline double drand48() { // returns a pseudo-random double in [0,1)
+    return double(rand())/(double(RAND_MAX)+1.0); // divide by RAND_MAX+1 so that 1.0 is never returned
+  }
+
+  __forceinline void srand48(long seed) { // seeds the rand() generator used by drand48 above
+    srand((unsigned int)seed); // srand takes unsigned int; make the narrowing explicit
+  }
+#endif
+
+  /*! scalar selects: return t when s is true, otherwise f */
+  __forceinline bool  select(bool s, bool  t , bool f) { return s ? t : f; }
+  __forceinline int   select(bool s, int   t,   int f) { return s ? t : f; }
+  __forceinline float select(bool s, float t, float f) { return s ? t : f; }
+
+  __forceinline bool all(bool s) { return s; } // scalar overload: a single bool is its own reduction
+
+  __forceinline float lerp(const float v0, const float v1, const float t) { // (1-t)*v0 + t*v1
+    return madd(1.0f-t,v0,t*v1);
+  }
+
+  template<typename T> // bilinear blend: (1-u)*((1-v)*x0 + v*x2) + u*((1-v)*x1 + v*x3)
+    __forceinline T lerp2(const float x0, const float x1, const float x2, const float x3, const T& u, const T& v) {
+    return madd((1.0f-u),madd((1.0f-v),T(x0),v*T(x2)),u*madd((1.0f-v),T(x1),v*T(x3)));
+  }
+
+  /*! exchange: swaps a and b through a temporary copy */
+  template<typename T> __forceinline void xchg ( T& a, T& b ) { const T tmp = a; a = b; b = tmp; }
+
+  // prod_diff: difference of products a*b - c*d
+  template<typename T> __forceinline T prod_diff(const T& a,const T& b,const T& c,const T& d) {
+#if 1//!defined(__aarch64__)
+      return msub(a,b,c*d); // active branch: a*b - (c*d)
+#else
+      return nmadd(c,d,a*b); // disabled by the "#if 1" above
+#endif
+  }
+
+  /*! bit reverse operation; the masks and the final 16-bit swap are written for 32-bit values */
+  template<class T>
+    __forceinline T bitReverse(const T& vin)
+  {
+    T v = vin;
+    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); // swap adjacent bits
+    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); // swap bit pairs
+    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); // swap nibbles
+    v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); // swap bytes
+    v = ( v >> 16             ) | ( v               << 16); // swap 16-bit halves
+    return v;
+  }
+
+  /*! bit interleave operation: x bits land on positions 3k, y on 3k+1, z on 3k+2 */
+  template<class T>
+    __forceinline T bitInterleave(const T& xin, const T& yin, const T& zin)
+  {
+	  T x = xin, y = yin, z = zin;
+    x = (x | (x << 16)) & 0x030000FF; // first mask keeps 10 bits (0-7 and 24-25)
+    x = (x | (x <<  8)) & 0x0300F00F;
+    x = (x | (x <<  4)) & 0x030C30C3;
+    x = (x | (x <<  2)) & 0x09249249; // bits now sit on every third position
+
+    y = (y | (y << 16)) & 0x030000FF; // same spreading sequence for y
+    y = (y | (y <<  8)) & 0x0300F00F;
+    y = (y | (y <<  4)) & 0x030C30C3;
+    y = (y | (y <<  2)) & 0x09249249;
+
+    z = (z | (z << 16)) & 0x030000FF; // same spreading sequence for z
+    z = (z | (z <<  8)) & 0x0300F00F;
+    z = (z | (z <<  4)) & 0x030C30C3;
+    z = (z | (z <<  2)) & 0x09249249;
+
+    return x | (y << 1) | (z << 2);
+  }
+
+#if defined(__AVX2__) && !defined(__aarch64__)
+// unsigned int specialization: one pdep call per input with a mask selecting every third bit
+  template<>
+    __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi)
+  {
+    const unsigned int xx = pdep(xi,0x49249249 /* 0b01001001001001001001001001001001 */ );
+    const unsigned int yy = pdep(yi,0x92492492 /* 0b10010010010010010010010010010010 */);
+    const unsigned int zz = pdep(zi,0x24924924 /* 0b00100100100100100100100100100100 */);
+    return xx | yy | zz; // masks are already offset by 0, 1 and 2 bits, so no shifts are needed
+  }
+
+#endif
+
+  /*! bit interleave operation for 64bit data types; only the low 21 bits of each input are used */
+  template<class T>
+    __forceinline T bitInterleave64(const T& xin, const T& yin, const T& zin){
+    T x = xin & 0x1fffff;
+    T y = yin & 0x1fffff;
+    T z = zin & 0x1fffff;
+
+    x = (x | x << 32) & 0x1f00000000ffff; // spread x so its bits end up on every third position
+    x = (x | x << 16) & 0x1f0000ff0000ff;
+    x = (x | x << 8) & 0x100f00f00f00f00f;
+    x = (x | x << 4) & 0x10c30c30c30c30c3;
+    x = (x | x << 2) & 0x1249249249249249;
+
+    y = (y | y << 32) & 0x1f00000000ffff; // same spreading sequence for y
+    y = (y | y << 16) & 0x1f0000ff0000ff;
+    y = (y | y << 8) & 0x100f00f00f00f00f;
+    y = (y | y << 4) & 0x10c30c30c30c30c3;
+    y = (y | y << 2) & 0x1249249249249249;
+
+    z = (z | z << 32) & 0x1f00000000ffff; // same spreading sequence for z
+    z = (z | z << 16) & 0x1f0000ff0000ff;
+    z = (z | z << 8) & 0x100f00f00f00f00f;
+    z = (z | z << 4) & 0x10c30c30c30c30c3;
+    z = (z | z << 2) & 0x1249249249249249;
+
+    return x | (y << 1) | (z << 2); // x on bits 3k, y on 3k+1, z on 3k+2
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/math/obbox.h b/thirdparty/embree-aarch64/common/math/obbox.h
new file mode 100644
index 0000000000..032b56904e
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/obbox.h
@@ -0,0 +1,39 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bbox.h"
+#include "linearspace3.h"
+
+namespace embree
+{
+  /*! Oriented bounding box: a linear space plus bounds expressed in that space */
+  template<typename T>
+    struct OBBox 
+  {
+  public:
+    
+    __forceinline OBBox () {} // leaves space and bounds default-constructed
+    
+    __forceinline OBBox (EmptyTy) // space initialized from one, bounds from empty
+      : space(one), bounds(empty) {}
+    
+    __forceinline OBBox (const BBox<T>& bounds) // given bounds with space initialized from one
+      : space(one), bounds(bounds) {}
+      
+    __forceinline OBBox (const LinearSpace3<T>& space, const BBox<T>& bounds) // explicit space and bounds
+      : space(space), bounds(bounds) {}
+    
+    friend embree_ostream operator<<(embree_ostream cout, const OBBox& p) {
+      return cout << "{ space = " << p.space << ", bounds = " << p.bounds << "}";
+    }
+    
+  public:
+    LinearSpace3<T> space; //!< orthonormal transformation
+    BBox<T> bounds;        //!< bounds in transformed space
+  };
+
+  typedef OBBox<Vec3f> OBBox3f;
+  typedef OBBox<Vec3fa> OBBox3fa;
+}
diff --git a/thirdparty/embree-aarch64/common/math/quaternion.h b/thirdparty/embree-aarch64/common/math/quaternion.h
new file mode 100644
index 0000000000..20c69bc62f
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/quaternion.h
@@ -0,0 +1,254 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec3.h"
+#include "vec4.h"
+
+#include "transcendental.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////
+  // Quaternion Struct
+  ////////////////////////////////////////////////////////////////
+
+  template<typename T>
+  struct QuaternionT
+  {
+    typedef Vec3<T> Vector;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline QuaternionT           ()                     { }
+    __forceinline QuaternionT           ( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; }
+    __forceinline QuaternionT& operator=( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; return *this; }
+
+    __forceinline          QuaternionT( const T& r       ) : r(r), i(zero), j(zero), k(zero) {} // real value, zero imaginary part (implicit conversion)
+    __forceinline explicit QuaternionT( const Vec3<T>& v ) : r(zero), i(v.x), j(v.y), k(v.z) {} // pure quaternion: zero real part
+    __forceinline explicit QuaternionT( const Vec4<T>& v ) : r(v.x), i(v.y), j(v.z), k(v.w) {} // component order is (r,i,j,k) = (x,y,z,w)
+    __forceinline          QuaternionT( const T& r, const T& i, const T& j, const T& k ) : r(r), i(i), j(j), k(k) {}
+    __forceinline          QuaternionT( const T& r, const Vec3<T>& v ) : r(r), i(v.x), j(v.y), k(v.z) {}
+
+    __inline QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz ); // from three basis vectors; defined out of line
+    __inline QuaternionT( const T& yaw, const T& pitch, const T& roll ); // from yaw/pitch/roll angles; defined out of line
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline QuaternionT( ZeroTy ) : r(zero), i(zero), j(zero), k(zero) {}
+    __forceinline QuaternionT( OneTy  ) : r( one), i(zero), j(zero), k(zero) {}
+
+    /*! return quaternion for rotation by angle r around axis u (u is normalized here) */
+    static __forceinline QuaternionT rotate(const Vec3<T>& u, const T& r) {
+      return QuaternionT<T>(cos(T(0.5)*r),sin(T(0.5)*r)*normalize(u));
+    }
+
+    /*! returns the imaginary part (i, j, k) as a vector; this is not normalized to a unit axis */
+    __forceinline Vec3<T> v( ) const { return Vec3<T>(i, j, k); }
+
+  public:
+    T r, i, j, k;
+  };
+
+  template<typename T> __forceinline QuaternionT<T> operator *( const T             & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a * b.r, a * b.i, a * b.j, a * b.k); }
+  template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const T             & b ) { return QuaternionT<T>(a.r * b, a.i * b, a.j * b, a.k * b); }
+
+  ////////////////////////////////////////////////////////////////
+  // Unary Operators
+  ////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a ) { return QuaternionT<T>(+a.r, +a.i, +a.j, +a.k); }
+  template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a ) { return QuaternionT<T>(-a.r, -a.i, -a.j, -a.k); }
+  template<typename T> __forceinline QuaternionT<T> conj      ( const QuaternionT<T>& a ) { return QuaternionT<T>(a.r, -a.i, -a.j, -a.k); } // negates only the imaginary part
+  template<typename T> __forceinline T              abs       ( const QuaternionT<T>& a ) { return sqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } // Euclidean norm of the four components
+  template<typename T> __forceinline QuaternionT<T> rcp       ( const QuaternionT<T>& a ) { return conj(a)*rcp(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } // conjugate divided by squared norm
+  template<typename T> __forceinline QuaternionT<T> normalize ( const QuaternionT<T>& a ) { return a*rsqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } // scales by the reciprocal norm; no zero check
+
+  // evaluates a*q-p component-wise
+  template<typename T> __forceinline QuaternionT<T>
+  msub(const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p)
+  {
+    return QuaternionT<T>(msub(a, q.r, p.r),
+                          msub(a, q.i, p.i),
+                          msub(a, q.j, p.j),
+                          msub(a, q.k, p.k));
+  }
+  // evaluates a*q+p component-wise
+  template<typename T> __forceinline QuaternionT<T>
+  madd (const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p)
+  {
+    return QuaternionT<T>(madd(a, q.r, p.r),
+                          madd(a, q.i, p.i),
+                          madd(a, q.j, p.j),
+                          madd(a, q.k, p.k));
+  }
+
+  ////////////////////////////////////////////////////////////////
+  // Binary Operators
+  ////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline QuaternionT<T> operator +( const T             & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a + b.r,  b.i,  b.j,  b.k); } // scalar affects only the real part
+  template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const T             & b ) { return QuaternionT<T>(a.r + b, a.i, a.j, a.k); }
+  template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r + b.r, a.i + b.i, a.j + b.j, a.k + b.k); }
+  template<typename T> __forceinline QuaternionT<T> operator -( const T             & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a - b.r, -b.i, -b.j, -b.k); }
+  template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const T             & b ) { return QuaternionT<T>(a.r - b, a.i, a.j, a.k); }
+  template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r - b.r, a.i - b.i, a.j - b.j, a.k - b.k); }
+
+  template<typename T> __forceinline Vec3<T>       operator *( const QuaternionT<T>& a, const Vec3<T>      & b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } // a * b * conj(a), with b promoted to a pure quaternion
+  template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const QuaternionT<T>& b ) { // quaternion product (non-commutative)
+    return QuaternionT<T>(a.r*b.r - a.i*b.i - a.j*b.j - a.k*b.k,
+                          a.r*b.i + a.i*b.r + a.j*b.k - a.k*b.j,
+                          a.r*b.j - a.i*b.k + a.j*b.r + a.k*b.i,
+                          a.r*b.k + a.i*b.j - a.j*b.i + a.k*b.r);
+  }
+  template<typename T> __forceinline QuaternionT<T> operator /( const T             & a, const QuaternionT<T>& b ) { return a*rcp(b); } // division is multiplication by the reciprocal
+  template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const T             & b ) { return a*rcp(b); }
+  template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a*rcp(b); }
+
+  template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const T             & b ) { return a = a+b; } // compound forms delegate to the binary operators
+  template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a+b; }
+  template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const T             & b ) { return a = a-b; }
+  template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a-b; }
+  template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const T             & b ) { return a = a*b; }
+  template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*b; }
+  template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const T             & b ) { return a = a*rcp(b); }
+  template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*rcp(b); }
+
+  template<typename T, typename M> __forceinline QuaternionT<T> // component-wise select: q where m holds, otherwise p
+  select(const M& m, const QuaternionT<T>& q, const QuaternionT<T>& p)
+  {
+    return QuaternionT<T>(select(m, q.r, p.r),
+                          select(m, q.i, p.i),
+                          select(m, q.j, p.j),
+                          select(m, q.k, p.k));
+  }
+
+
+  template<typename T> __forceinline Vec3<T> xfmPoint ( const QuaternionT<T>& a, const Vec3<T>&       b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } // point, vector and normal all use a*b*conj(a)
+  template<typename T> __forceinline Vec3<T> xfmVector( const QuaternionT<T>& a, const Vec3<T>&       b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
+  template<typename T> __forceinline Vec3<T> xfmNormal( const QuaternionT<T>& a, const Vec3<T>&       b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
+
+  template<typename T> __forceinline T dot(const QuaternionT<T>& a, const QuaternionT<T>& b) { return a.r*b.r + a.i*b.i + a.j*b.j + a.k*b.k; } // 4-component dot product
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators (exact component-wise comparison, no tolerance)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r == b.r && a.i == b.i && a.j == b.j && a.k == b.k; }
+  template<typename T> __forceinline bool operator !=( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r != b.r || a.i != b.i || a.j != b.j || a.k != b.k; }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Orientation Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> QuaternionT<T>::QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz ) // builds (r,i,j,k) from three basis vectors
+  {
+    if ( vx.x + vy.y + vz.z >= T(zero) ) // non-negative trace: r is the dominant component
+    {
+      const T t = T(one) + (vx.x + vy.y + vz.z);
+      const T s = rsqrt(t)*T(0.5f);
+      r = t*s;
+      i = (vy.z - vz.y)*s;
+      j = (vz.x - vx.z)*s;
+      k = (vx.y - vy.x)*s;
+    }
+    else if ( vx.x >= max(vy.y, vz.z) ) // vx.x is the largest diagonal entry: i is dominant
+    {
+      const T t = (T(one) + vx.x) - (vy.y + vz.z);
+      const T s = rsqrt(t)*T(0.5f);
+      r = (vy.z - vz.y)*s;
+      i = t*s;
+      j = (vx.y + vy.x)*s;
+      k = (vz.x + vx.z)*s;
+    }
+    else if ( vy.y >= vz.z ) // if ( vy.y >= max(vz.z, vx.x) ): j is dominant
+    {
+      const T t = (T(one) + vy.y) - (vz.z + vx.x);
+      const T s = rsqrt(t)*T(0.5f);
+      r = (vz.x - vx.z)*s;
+      i = (vx.y + vy.x)*s;
+      j = t*s;
+      k = (vy.z + vz.y)*s;
+    }
+    else //if ( vz.z >= max(vy.y, vx.x) ): k is dominant
+    {
+      const T t = (T(one) + vz.z) - (vx.x + vy.y);
+      const T s = rsqrt(t)*T(0.5f);
+      r = (vx.y - vy.x)*s;
+      i = (vz.x + vx.z)*s;
+      j = (vy.z + vz.y)*s;
+      k = t*s;
+    }
+  }
+
+  template<typename T> QuaternionT<T>::QuaternionT( const T& yaw, const T& pitch, const T& roll ) // builds (r,i,j,k) from half-angle sines/cosines
+  {
+    const T cya = cos(yaw  *T(0.5f)); // cosines of the three half angles
+    const T cpi = cos(pitch*T(0.5f));
+    const T cro = cos(roll *T(0.5f));
+    const T sya = sin(yaw  *T(0.5f)); // sines of the three half angles
+    const T spi = sin(pitch*T(0.5f));
+    const T sro = sin(roll *T(0.5f));
+    r = cro*cya*cpi + sro*sya*spi;
+    i = cro*cya*spi + sro*sya*cpi;
+    j = cro*sya*cpi - sro*cya*spi;
+    k = sro*cya*cpi - cro*sya*spi;
+  }
+
+  //////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  //////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> static embree_ostream operator<<(embree_ostream cout, const QuaternionT<T>& q) { // prints the four components by name
+    return cout << "{ r = " << q.r << ", i = " << q.i << ", j = " << q.j << ", k = " << q.k << " }";
+  }
+
+  /*! default template instantiations (scalar float/double, then SIMD vfloat<N> variants) */
+  typedef QuaternionT<float>  Quaternion3f;
+  typedef QuaternionT<double> Quaternion3d;
+
+  template<int N> using Quaternion3vf = QuaternionT<vfloat<N>>;
+  typedef QuaternionT<vfloat<4>>  Quaternion3vf4;
+  typedef QuaternionT<vfloat<8>>  Quaternion3vf8;
+  typedef QuaternionT<vfloat<16>> Quaternion3vf16;
+
+  //////////////////////////////////////////////////////////////////////////////
+  /// Interpolation
+  //////////////////////////////////////////////////////////////////////////////
+  template<typename T> // component-wise linear interpolation; the result is not normalized
+  __forceinline QuaternionT<T>lerp(const QuaternionT<T>& q0,
+                                   const QuaternionT<T>& q1,
+                                   const T& factor)
+  {
+    QuaternionT<T> q;
+    q.r = lerp(q0.r, q1.r, factor);
+    q.i = lerp(q0.i, q1.i, factor);
+    q.j = lerp(q0.j, q1.j, factor);
+    q.k = lerp(q0.k, q1.k, factor);
+    return q;
+  }
+
+  template<typename T> // spherical interpolation between q0 and q1_ using the fastapprox acos/sincos
+  __forceinline QuaternionT<T> slerp(const QuaternionT<T>& q0,
+                                     const QuaternionT<T>& q1_,
+                                     const T& t)
+  {
+    T cosTheta = dot(q0, q1_);
+    QuaternionT<T> q1 = select(cosTheta < 0.f, -q1_, q1_); // negate q1_ when the dot product is negative ...
+    cosTheta          = select(cosTheta < 0.f, -cosTheta, cosTheta); // ... so cosTheta is non-negative from here on
+    if (unlikely(all(cosTheta > 0.9995f))) { // nearly parallel inputs: fall back to normalized lerp
+      return normalize(lerp(q0, q1, t));
+    }
+    const T phi = t * fastapprox::acos(cosTheta);
+    T sinPhi, cosPhi;
+    fastapprox::sincos(phi, sinPhi, cosPhi);
+    QuaternionT<T> qperp = sinPhi * normalize(msub(cosTheta, q0, q1)); // sinPhi * normalize(cosTheta*q0 - q1)
+    return msub(cosPhi, q0, qperp); // cosPhi*q0 - qperp
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/math/range.h b/thirdparty/embree-aarch64/common/math/range.h
new file mode 100644
index 0000000000..762d9cd9ea
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/range.h
@@ -0,0 +1,137 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../math/math.h"
+
+namespace embree
+{
+  template<typename Ty> // interval described by _begin and _end; size() and empty() treat _end as exclusive
+    struct range 
+    {
+      __forceinline range() {}
+
+      __forceinline range(const Ty& begin) // single-element range [begin, begin+1)
+        : _begin(begin), _end(begin+1) {}
+      
+      __forceinline range(const Ty& begin, const Ty& end)
+        : _begin(begin), _end(end) {}
+ 
+      __forceinline range(const range& other)
+        : _begin(other._begin), _end(other._end) {}
+
+      template<typename T1> // converting copy from a range over another element type
+      __forceinline range(const range<T1>& other)
+        : _begin(Ty(other._begin)), _end(Ty(other._end)) {}
+
+      template<typename T1> // converting assignment; relies on implicit T1 -> Ty conversion
+      __forceinline range& operator =(const range<T1>& other) {
+        _begin = other._begin;
+        _end = other._end;
+        return *this;
+      }
+      
+      __forceinline Ty begin() const {
+        return _begin;
+      }
+      
+      __forceinline Ty end() const {
+	return _end;
+      }
+
+      __forceinline range intersect(const range& r) const { // result may be empty (end <= begin) when the ranges do not overlap
+        return range (max(_begin,r._begin),min(_end,r._end));
+      }
+
+      __forceinline Ty size() const {
+        return _end - _begin;
+      }
+
+      __forceinline bool empty() const { 
+        return _end <= _begin; 
+      }
+
+      __forceinline Ty center() const { // midpoint, computed as (_begin + _end)/2
+        return (_begin + _end)/2;
+      }
+
+      __forceinline std::pair<range,range> split() const // splits at center() into [begin,center) and [center,end)
+      {
+        const Ty _center = center();
+        return std::make_pair(range(_begin,_center),range(_center,_end));
+      }
+
+      __forceinline void split(range& left_o, range& right_o) const // same split, returned through output references
+      {
+        const Ty _center = center();
+        left_o = range(_begin,_center);
+        right_o = range(_center,_end);
+      }
+
+      __forceinline friend bool operator< (const range& r0, const range& r1) { // orders ranges by size, not by position
+        return r0.size() < r1.size();
+      }
+	
+      friend embree_ostream operator<<(embree_ostream cout, const range& r) {
+        return cout << "range [" << r.begin() << ", " << r.end() << "]";
+      }
+      
+      Ty _begin, _end;
+    };
+
+  template<typename Ty> // helper that deduces Ty from its arguments
+    range<Ty> make_range(const Ty& begin, const Ty& end) {
+    return range<Ty>(begin,end);
+  }
+
+  template<typename Ty> // range with an additional end marker _ext_end (asserted to be >= _end where checked)
+    struct extended_range : public range<Ty>
+    {
+      __forceinline extended_range () {}
+
+      __forceinline extended_range (const Ty& begin) // single-element range with no extension
+        : range<Ty>(begin), _ext_end(begin+1) {}
+      
+      __forceinline extended_range (const Ty& begin, const Ty& end) // no extension: _ext_end equals end
+        : range<Ty>(begin,end), _ext_end(end) {}
+
+      __forceinline extended_range (const Ty& begin, const Ty& end, const Ty& ext_end)
+        : range<Ty>(begin,end), _ext_end(ext_end) {}
+      
+      __forceinline Ty ext_end() const {
+	return _ext_end;
+      }
+
+      __forceinline Ty ext_size() const { // size including the extension: _ext_end - _begin
+        return _ext_end - range<Ty>::_begin;
+      }
+
+      __forceinline Ty ext_range_size() const { // size of the extension only: _ext_end - _end
+        return _ext_end - range<Ty>::_end;
+      }
+
+      __forceinline bool has_ext_range() const {
+        assert(_ext_end >= range<Ty>::_end);
+        return (_ext_end - range<Ty>::_end) > 0;
+      }
+
+      __forceinline void set_ext_range(const size_t ext_end){ // takes size_t regardless of Ty
+        assert(ext_end >= range<Ty>::_end);
+        _ext_end = ext_end;
+      }
+
+      __forceinline void move_right(const size_t plus){ // shifts begin, end and ext_end by the same amount
+        range<Ty>::_begin   += plus;
+        range<Ty>::_end     += plus;
+        _ext_end += plus;
+      }
+
+      friend embree_ostream operator<<(embree_ostream cout, const extended_range& r) {
+        return cout << "extended_range [" << r.begin() << ", " << r.end() <<  " (" << r.ext_end() << ")]";
+      }
+      
+      Ty _ext_end;
+    };
+}
diff --git a/thirdparty/embree-aarch64/common/math/transcendental.h b/thirdparty/embree-aarch64/common/math/transcendental.h
new file mode 100644
index 0000000000..6855d82b53
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/transcendental.h
@@ -0,0 +1,525 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+// Transcendental functions from "ispc": https://github.com/ispc/ispc/
+// Most of the transcendental implementations in ispc code come from
+// Solomon Boulos's "syrah": https://github.com/boulos/syrah/
+
+#include "../simd/simd.h"
+
+namespace embree
+{
+
+namespace fastapprox
+{
+
+template <typename T> // polynomial sine approximation with range reduction to multiples of pi/2
+__forceinline T sin(const T &v)
+{
+  static const float piOverTwoVec = 1.57079637050628662109375;
+  static const float twoOverPiVec = 0.636619746685028076171875;
+  auto scaled = v * twoOverPiVec;
+  auto kReal = floor(scaled); // index of the pi/2 interval that contains v
+  auto k = toInt(kReal);
+
+  // Reduced range version of x
+  auto x = v - kReal * piOverTwoVec;
+  auto kMod4 = k & 3;
+  auto sinUseCos = (kMod4 == 1 | kMod4 == 3); // odd quadrants evaluate the cosine polynomial
+  auto flipSign = (kMod4 > 1); // quadrants 2 and 3 negate the result
+
+  // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2,
+  // 4, 6, 8, 10|], [|single...|], [0;Pi/2]);
+  static const float sinC2  = -0.16666667163372039794921875;
+  static const float sinC4  = +8.333347737789154052734375e-3;
+  static const float sinC6  = -1.9842604524455964565277099609375e-4;
+  static const float sinC8  = +2.760012648650445044040679931640625e-6;
+  static const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
+
+  static const float cosC2  = -0.5;
+  static const float cosC4  = +4.166664183139801025390625e-2;
+  static const float cosC6  = -1.388833043165504932403564453125e-3;
+  static const float cosC8  = +2.47562347794882953166961669921875e-5;
+  static const float cosC10 = -2.59630184018533327616751194000244140625e-7;
+
+  auto outside = select(sinUseCos, 1., x); // sine polynomial is sin(x)/x, so it is multiplied by x afterwards
+  auto c2  = select(sinUseCos, T(cosC2),  T(sinC2));
+  auto c4  = select(sinUseCos, T(cosC4),  T(sinC4));
+  auto c6  = select(sinUseCos, T(cosC6),  T(sinC6));
+  auto c8  = select(sinUseCos, T(cosC8),  T(sinC8));
+  auto c10 = select(sinUseCos, T(cosC10), T(sinC10));
+
+  auto x2 = x * x;
+  auto formula = x2 * c10 + c8; // Horner evaluation in x2 with the selected coefficients
+  formula = x2 * formula + c6;
+  formula = x2 * formula + c4;
+  formula = x2 * formula + c2;
+  formula = x2 * formula + 1.;
+  formula *= outside;
+
+  formula = select(flipSign, -formula, formula);
+  return formula;
+}
+
+template <typename T> // polynomial cosine approximation with range reduction to multiples of pi/2
+__forceinline T cos(const T &v)
+{
+  static const float piOverTwoVec = 1.57079637050628662109375;
+  static const float twoOverPiVec = 0.636619746685028076171875;
+  auto scaled = v * twoOverPiVec;
+  auto kReal = floor(scaled); // index of the pi/2 interval that contains v
+  auto k = toInt(kReal);
+
+  // Reduced range version of x
+  auto x = v - kReal * piOverTwoVec;
+
+  auto kMod4 = k & 3;
+  auto cosUseCos = (kMod4 == 0 | kMod4 == 2); // even quadrants evaluate the cosine polynomial
+  auto flipSign = (kMod4 == 1 | kMod4 == 2); // quadrants 1 and 2 negate the result
+
+  const float sinC2  = -0.16666667163372039794921875;
+  const float sinC4  = +8.333347737789154052734375e-3;
+  const float sinC6  = -1.9842604524455964565277099609375e-4;
+  const float sinC8  = +2.760012648650445044040679931640625e-6;
+  const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
+
+  const float cosC2  = -0.5;
+  const float cosC4  = +4.166664183139801025390625e-2;
+  const float cosC6  = -1.388833043165504932403564453125e-3;
+  const float cosC8  = +2.47562347794882953166961669921875e-5;
+  const float cosC10 = -2.59630184018533327616751194000244140625e-7;
+
+  auto outside = select(cosUseCos, 1., x); // the sine path is multiplied by x after the polynomial
+  auto c2  = select(cosUseCos, T(cosC2),  T(sinC2));
+  auto c4  = select(cosUseCos, T(cosC4),  T(sinC4));
+  auto c6  = select(cosUseCos, T(cosC6),  T(sinC6));
+  auto c8  = select(cosUseCos, T(cosC8),  T(sinC8));
+  auto c10 = select(cosUseCos, T(cosC10), T(sinC10));
+
+  auto x2 = x * x;
+  auto formula = x2 * c10 + c8; // Horner evaluation in x2 with the selected coefficients
+  formula = x2 * formula + c6;
+  formula = x2 * formula + c4;
+  formula = x2 * formula + c2;
+  formula = x2 * formula + 1.;
+  formula *= outside;
+
+  formula = select(flipSign, -formula, formula);
+  return formula;
+}
+
+template <typename T> // computes sine and cosine together, sharing one range reduction
+__forceinline void sincos(const T &v, T &sinResult, T &cosResult)
+{
+  const float piOverTwoVec = 1.57079637050628662109375;
+  const float twoOverPiVec = 0.636619746685028076171875;
+  auto scaled = v * twoOverPiVec;
+  auto kReal = floor(scaled); // index of the pi/2 interval that contains v
+  auto k = toInt(kReal);
+
+  // Reduced range version of x
+  auto x = v - kReal * piOverTwoVec;
+  auto kMod4 = k & 3;
+  auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2)); // which polynomial feeds each output, per quadrant
+  auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3));
+  auto sinFlipSign = (kMod4 > 1);
+  auto cosFlipSign = ((kMod4 == 1) | (kMod4 == 2));
+
+  const float oneVec = +1.;
+  const float sinC2  = -0.16666667163372039794921875;
+  const float sinC4  = +8.333347737789154052734375e-3;
+  const float sinC6  = -1.9842604524455964565277099609375e-4;
+  const float sinC8  = +2.760012648650445044040679931640625e-6;
+  const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
+
+  const float cosC2  = -0.5;
+  const float cosC4  = +4.166664183139801025390625e-2;
+  const float cosC6  = -1.388833043165504932403564453125e-3;
+  const float cosC8  = +2.47562347794882953166961669921875e-5;
+  const float cosC10 = -2.59630184018533327616751194000244140625e-7;
+
+  auto x2 = x * x;
+
+  auto sinFormula = x2 * sinC10 + sinC8; // both polynomials are evaluated by Horner's scheme in x2, interleaved
+  auto cosFormula = x2 * cosC10 + cosC8;
+  sinFormula = x2 * sinFormula + sinC6;
+  cosFormula = x2 * cosFormula + cosC6;
+
+  sinFormula = x2 * sinFormula + sinC4;
+  cosFormula = x2 * cosFormula + cosC4;
+
+  sinFormula = x2 * sinFormula + sinC2;
+  cosFormula = x2 * cosFormula + cosC2;
+
+  sinFormula = x2 * sinFormula + oneVec;
+  cosFormula = x2 * cosFormula + oneVec;
+
+  sinFormula *= x; // only the sine polynomial needs the final multiplication by x
+
+  sinResult = select(sinUseCos, cosFormula, sinFormula);
+  cosResult = select(cosUseCos, cosFormula, sinFormula);
+
+  sinResult = select(sinFlipSign, -sinResult, sinResult);
+  cosResult = select(cosFlipSign, -cosResult, cosResult);
+}
+
+template <typename T> // polynomial tangent approximation; works on |v| and restores the sign at the end
+__forceinline T tan(const T &v)
+{
+  const float piOverFourVec = 0.785398185253143310546875;
+  const float fourOverPiVec = 1.27323949337005615234375;
+
+  auto xLt0 = v < 0.;
+  auto y = select(xLt0, -v, v);
+  auto scaled = y * fourOverPiVec;
+
+  auto kReal = floor(scaled); // index of the pi/4 interval that contains y
+  auto k = toInt(kReal);
+
+  auto x = y - kReal * piOverFourVec;
+
+  // If k & 1, x -= Pi/4
+  auto needOffset = (k & 1) != 0;
+  x = select(needOffset, x - piOverFourVec, x);
+
+  // If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To...
+  auto kMod4 = k & 3;
+  auto useCotan = (kMod4 == 1) | (kMod4 == 2);
+
+  const float oneVec = 1.0;
+
+  const float tanC2  = +0.33333075046539306640625;
+  const float tanC4  = +0.13339905440807342529296875;
+  const float tanC6  = +5.3348250687122344970703125e-2;
+  const float tanC8  = +2.46033705770969390869140625e-2;
+  const float tanC10 = +2.892402000725269317626953125e-3;
+  const float tanC12 = +9.500005282461643218994140625e-3;
+
+  const float cotC2  = -0.3333333432674407958984375;
+  const float cotC4  = -2.222204394638538360595703125e-2;
+  const float cotC6  = -2.11752182804048061370849609375e-3;
+  const float cotC8  = -2.0846328698098659515380859375e-4;
+  const float cotC10 = -2.548247357481159269809722900390625e-5;
+  const float cotC12 = -3.5257363606433500535786151885986328125e-7;
+
+  auto x2 = x * x;
+  T z = T(0.f); // initialized: select() below reads z even when the cotangent branch was skipped
+  if (any(useCotan))
+  {
+    auto cotVal = x2 * cotC12 + cotC10;
+    cotVal = x2 * cotVal + cotC8;
+    cotVal = x2 * cotVal + cotC6;
+    cotVal = x2 * cotVal + cotC4;
+    cotVal = x2 * cotVal + cotC2;
+    cotVal = x2 * cotVal + oneVec;
+    // The equation is for x * cot(x) but we need -x * cot(x) for the tan part.
+    cotVal /= -x;
+    z = cotVal;
+  }
+  auto useTan = !useCotan;
+  if (any(useTan))
+  {
+    auto tanVal = x2 * tanC12 + tanC10;
+    tanVal = x2 * tanVal + tanC8;
+    tanVal = x2 * tanVal + tanC6;
+    tanVal = x2 * tanVal + tanC4;
+    tanVal = x2 * tanVal + tanC2;
+    tanVal = x2 * tanVal + oneVec;
+    // Equation was for tan(x)/x
+    tanVal *= x;
+    z = select(useTan, tanVal, z); // keep the cotangent result where useTan is false
+  }
+  return select(xLt0, -z, z);
+}
+
+template <typename T> // polynomial arcsine approximation evaluated on |x0|, sign restored afterwards
+__forceinline T asin(const T &x0)
+{
+  auto isneg = (x0 < 0.f);
+  auto x = abs(x0);
+  auto isnan = (x > 1.f); // inputs with |x0| > 1 get the NaN bit pattern below
+
+  // sollya
+  // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|],
+  //           [1e-20;.9999999999999999]);
+  // avg error: 1.1105439e-06, max error 1.3187528e-06
+  auto v = 1.57079517841339111328125f +
+           x * (-0.21450997889041900634765625f +
+                x * (8.78556668758392333984375e-2f +
+                     x * (-4.489909112453460693359375e-2f +
+                          x * (1.928029954433441162109375e-2f +
+                               x * (-4.3095736764371395111083984375e-3f)))));
+
+  v *= -sqrt(1.f - x);
+  v = v + 1.57079637050628662109375f;
+
+  v = select(v < 0.f, T(0.f), v); // clamp negative results to zero
+  v = select(isneg, -v, v);
+  v = select(isnan, T(cast_i2f(0x7fc00000)), v); // 0x7fc00000 reinterpreted as a float
+
+  return v;
+}
+
+template <typename T>
+__forceinline T acos(const T &v) // derived from asin() in this file, so it shares its accuracy and NaN behaviour
+{
+  return 1.57079637050628662109375f - asin(v); // acos(v) = pi/2 - asin(v)
+}
+
+template <typename T>
+__forceinline T atan(const T &v) // polynomial approximation of atan(v) with range reduction to [0, 1]
+{
+  const float piOverTwoVec = 1.57079637050628662109375;
+  // atan(-x) = -atan(x) (so flip from negative to positive first)
+  // If x > 1 -> atan(x) = Pi/2 - atan(1/x)
+  auto xNeg = v < 0.f;
+  auto xFlipped = select(xNeg, -v, v); // |v|
+
+  auto xGt1 = xFlipped > 1.;
+  auto x = select(xGt1, rcpSafe(xFlipped), xFlipped); // reduced argument: 1/|v| where |v| > 1, otherwise |v|
+
+  // These coefficients approximate atan(x)/x
+  const float atanC0  = +0.99999988079071044921875;
+  const float atanC2  = -0.3333191573619842529296875;
+  const float atanC4  = +0.199689209461212158203125;
+  const float atanC6  = -0.14015688002109527587890625;
+  const float atanC8  = +9.905083477497100830078125e-2;
+  const float atanC10 = -5.93664981424808502197265625e-2;
+  const float atanC12 = +2.417283318936824798583984375e-2;
+  const float atanC14 = -4.6721356920897960662841796875e-3;
+
+  auto x2 = x * x;
+  auto result = x2 * atanC14 + atanC12; // Horner evaluation in x^2, highest coefficient first
+  result = x2 * result + atanC10;
+  result = x2 * result + atanC8;
+  result = x2 * result + atanC6;
+  result = x2 * result + atanC4;
+  result = x2 * result + atanC2;
+  result = x2 * result + atanC0;
+  result *= x; // the polynomial was for atan(x)/x
+
+  result = select(xGt1, piOverTwoVec - result, result); // undo the reciprocal reduction
+  result = select(xNeg, -result, result); // undo the sign flip
+  return result;
+}
+
+template <typename T>
+__forceinline T atan2(const T &y, const T &x) // atan(y/x) plus a +-Pi offset when x < 0
+{
+  const float piVec = 3.1415926536;
+  // Reference behaviour of atan2(y, x):
+  //
+  // atan2(y > 0, x = +-0) ->  Pi/2
+  // atan2(y < 0, x = +-0) -> -Pi/2
+  // atan2(y = +-0, x < +0) -> +-Pi
+  // atan2(y = +-0, x >= +0) -> +-0
+  //
+  // atan2(y >= 0, x < 0) ->  Pi + atan(y/x)
+  // atan2(y <  0, x < 0) -> -Pi + atan(y/x)
+  // atan2(y, x > 0) -> atan(y/x)
+  //
+  // The code below only applies the last three rules; it has no explicit handling of infinities.
+  auto yOverX = y * rcpSafe(x); // division via rcpSafe (defined outside this excerpt; presumably guards x == 0 -- confirm)
+  auto atanArg = atan(yOverX);
+  auto xLt0 = x < 0.f;
+  auto yLt0 = y < 0.f;
+  auto offset = select(xLt0,
+                select(yLt0, T(-piVec), T(piVec)), 0.f); // x < 0: -Pi for y < 0, +Pi otherwise; x >= 0: no offset
+  return offset + atanArg;
+}
+
+template <typename T>
+__forceinline T exp(const T &v) // e^v computed as 2^k * e^x with k = floor(v / ln 2) and x = v - k * ln 2
+{
+  const float ln2Part1 = 0.6931457519; // ln(2) is split into two constants (Part1 + Part2) ...
+  const float ln2Part2 = 1.4286067653e-6; // ... which are subtracted in two separate steps below
+  const float oneOverLn2 = 1.44269502162933349609375;
+
+  auto scaled = v * oneOverLn2;
+  auto kReal = floor(scaled);
+  auto k = toInt(kReal);
+
+  // Reduced range version of x
+  auto x = v - kReal * ln2Part1;
+  x -= kReal * ln2Part2;
+
+  // These coefficients are for e^x in [0, ln(2)]
+  const float one = 1.;
+  const float c2 = 0.4999999105930328369140625;
+  const float c3 = 0.166668415069580078125;
+  const float c4 = 4.16539050638675689697265625e-2;
+  const float c5 = 8.378830738365650177001953125e-3;
+  const float c6 = 1.304379315115511417388916015625e-3;
+  const float c7 = 2.7555381529964506626129150390625e-4;
+
+  auto result = x * c7 + c6; // Horner evaluation, highest coefficient first
+  result = x * result + c5;
+  result = x * result + c4;
+  result = x * result + c3;
+  result = x * result + c2;
+  result = x * result + one; // coefficient of x^1
+  result = x * result + one; // constant term
+
+  // Compute 2^k (should differ for float and double, but I'll avoid
+  // it for now and just do floats)
+  const int fpbias = 127;
+  auto biasedN = k + fpbias;
+  auto overflow = kReal > fpbias; // k > 127 has no finite single-precision 2^k
+  // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
+  // we've got underflow. -127 * ln(2) -> -88.02. So the most
+  // negative float input that doesn't result in zero is like -88.
+  auto underflow = kReal <= -fpbias;
+  const int infBits = 0x7f800000; // bit pattern of +infinity
+  biasedN <<= 23; // move the biased exponent into the exponent field (mantissa bits stay zero)
+  // Reinterpret this thing as float
+  auto twoToTheN = asFloat(biasedN);
+  // Handle both doubles and floats (hopefully eliding the copy for float)
+  auto elemtype2n = twoToTheN;
+  result *= elemtype2n;
+  result = select(overflow, cast_i2f(infBits), result); // overflow -> +inf
+  result = select(underflow, 0., result); // underflow -> 0 (no denormal results are produced)
+  return result;
+}
+
+// Range reduction for logarithms takes log(x) -> log(2^n * y) -> n
+// * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)).
+template <typename T, typename R>
+__forceinline void __rangeReduceLog(const T &input, // in: value to split (sign bit is expected to be 0, see NOTE below)
+                                    T &reduced,     // out: input's sign/mantissa bits with the exponent field forced to 126
+                                    R &exponent)    // out: (input's biased exponent + 1) - 127
+{
+  auto intVersion = asInt(input);
+  // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
+  // exponent mask    = 0111 1111 1000 0000 0000 0000 0000 0000
+  //                    0x7  0xF  0x8  0x0  0x0  0x0  0x0  0x0
+  // non-exponent     = 1000 0000 0111 1111 1111 1111 1111 1111
+  //                  = 0x8  0x0  0x7  0xF  0xF  0xF  0xF  0xF
+
+  //const int exponentMask(0x7F800000)
+  static const int nonexponentMask = 0x807FFFFF;
+
+  // We want the reduced version to have an exponent of -1 which is
+  // -1 + 127 after biasing or 126
+  static const int exponentNeg1 = (126l << 23);
+  // NOTE(boulos): We don't need to mask anything out since we know
+  // the sign bit has to be 0. If it's 1, we need to return infinity/nan
+  // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN).
+  auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128]
+
+  auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2
+  exponent = offsetExponent - 127;          // get the real value
+
+  // Blend the offset_exponent with the original input (do this in
+  // int for now, until I decide if float can have & and &not)
+  auto blended = (intVersion & nonexponentMask) | (exponentNeg1); // keep sign + mantissa, replace the exponent field
+  reduced = asFloat(blended);
+}
+
+template <typename T> struct ExponentType            { }; // maps a float type to the integer type that holds its exponent (used by log)
+template <int N>      struct ExponentType<vfloat<N>> { typedef vint<N> Ty; }; // N-wide float vector -> N-wide int vector
+template <>           struct ExponentType<float>     { typedef int     Ty; }; // scalar float -> int
+
+template <typename T>
+__forceinline T log(const T &v) // natural log: NaN for v < 0, -inf for v == 0, otherwise exponent * ln2 + polynomial
+{
+  T reduced;
+  typename ExponentType<T>::Ty exponent;
+
+  const int nanBits = 0x7fc00000;
+  const int negInfBits = 0xFF800000;
+  const float nan = cast_i2f(nanBits);
+  const float negInf = cast_i2f(negInfBits);
+  auto useNan = v < 0.;
+  auto useInf = v == 0.;
+  auto exceptional = useNan | useInf;
+  const float one = 1.0;
+
+  auto patched = select(exceptional, one, v); // feed 1.0 through the math for exceptional inputs; their result is overwritten at the end
+  __rangeReduceLog(patched, reduced, exponent);
+
+  const float ln2 = 0.693147182464599609375;
+
+  auto x1 = one - reduced; // the polynomial below is expressed in (1 - reduced)
+  const float c1 = +0.50000095367431640625;
+  const float c2 = +0.33326041698455810546875;
+  const float c3 = +0.2519190013408660888671875;
+  const float c4 = +0.17541764676570892333984375;
+  const float c5 = +0.3424419462680816650390625;
+  const float c6 = -0.599632322788238525390625;
+  const float c7 = +1.98442304134368896484375;
+  const float c8 = -2.4899270534515380859375;
+  const float c9 = +1.7491014003753662109375;
+
+  auto result = x1 * c9 + c8; // Horner evaluation, highest coefficient first
+  result = x1 * result + c7;
+  result = x1 * result + c6;
+  result = x1 * result + c5;
+  result = x1 * result + c4;
+  result = x1 * result + c3;
+  result = x1 * result + c2;
+  result = x1 * result + c1;
+  result = x1 * result + one;
+
+  // Equation was for -(ln(red)/(1-red))
+  result *= -x1;
+  result += toFloat(exponent) * ln2; // add back exponent * ln(2) removed by the range reduction
+
+  return select(exceptional,
+                select(useNan, T(nan), T(negInf)), // v < 0 -> NaN, v == 0 -> -inf
+                result);
+}
+
+template <typename T>
+__forceinline T pow(const T &x, const T &y) // exp(y * log|x|) followed by fix-ups for zero, negative and non-finite inputs
+{
+  auto x1 = abs(x);
+  auto z = exp(y * log(x1)); // general case, computed on |x|
+
+  // Handle special cases
+  const float twoOver23 = 8388608.0f; // NOTE: despite the name this is 2^23, not 2/23
+  auto yInt = y == round(y); // y has no fractional part
+  auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit
+
+  // x == 0
+  z = select(x == 0.0f,
+      select(y < 0.0f, T(inf) | signmsk(x), // y < 0: infinity carrying x's sign bit
+      select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z); // y == 0: 1; otherwise only the bits of x selected by yOddInt
+
+  // x < 0
+  auto xNegative = x < 0.0f;
+  if (any(xNegative))
+  {
+    auto z1 = z | asFloat(yOddInt); // OR in the sign bit computed in yOddInt
+    z1 = select(yInt, z1, std::numeric_limits<float>::quiet_NaN()); // negative base with non-integral exponent -> NaN
+    z = select(xNegative, z1, z);
+  }
+
+  auto xFinite = isfinite(x);
+  auto yFinite = isfinite(y);
+  if (all(xFinite & yFinite)) // fast exit: nothing below applies when every input is finite
+    return z;
+
+  // x finite and y infinite
+  z = select(andn(xFinite, yFinite),
+      select(x1 == 1.0f, 1.0f,
+      select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z); // |x| == 1 -> 1; otherwise inf or 0 depending on |x| > 1 and the sign of y
+
+  // x infinite
+  z = select(xFinite, z,
+      select(y == 0.0f, 1.0f,
+      select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x))); // y == 0 -> 1; else 0 (y < 0) or inf, OR-ed with (yOddInt & x)
+
+  return z;
+}
+
+template <typename T>
+__forceinline T pow(const T &x, float y) // convenience overload taking a scalar float exponent
+{
+  return pow(x, T(y)); // convert y to T and forward to pow(const T&, const T&)
+}
+
+} // namespace fastapprox
+
+} // namespace embree
diff --git a/thirdparty/embree-aarch64/common/math/vec2.h b/thirdparty/embree-aarch64/common/math/vec2.h
new file mode 100644
index 0000000000..a619459e9c
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/vec2.h
@@ -0,0 +1,235 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+
+namespace embree
+{
+  struct Vec2fa;
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Generic 2D vector Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct Vec2
+  {
+    enum { N = 2 }; // number of components
+    union {
+      struct { T x, y; }; // named access
+#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
+      T components[N]; // indexed view of the same storage, used by operator[] below
+#endif
+    };
+
+    typedef T Scalar;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2( ) {} // empty body: x and y are not assigned here
+    __forceinline explicit Vec2( const T& a             ) : x(a), y(a) {} // same value for both components
+    __forceinline          Vec2( const T& x, const T& y ) : x(x), y(y) {}
+
+    __forceinline Vec2( const Vec2& other ) { x = other.x; y = other.y; }
+    __forceinline Vec2( const Vec2fa& other ); // declared only; specialized for selected T after the includes at the end of this header
+
+    template<typename T1> __forceinline Vec2( const Vec2<T1>& a ) : x(T(a.x)), y(T(a.y)) {} // converts each component to T
+    template<typename T1> __forceinline Vec2& operator =( const Vec2<T1>& other ) { x = other.x; y = other.y; return *this; }
+
+    __forceinline Vec2& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants (tag-type constructors: both components receive the named constant)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2( ZeroTy   ) : x(zero), y(zero) {}
+    __forceinline Vec2( OneTy    ) : x(one),  y(one) {}
+    __forceinline Vec2( PosInfTy ) : x(pos_inf), y(pos_inf) {}
+    __forceinline Vec2( NegInfTy ) : x(neg_inf), y(neg_inf) {}
+
+#if defined(__WIN32__) && _MSC_VER == 1800 // workaround for older VS 2013 compiler
+	__forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return (&x)[axis]; }
+	__forceinline       T& operator [](const size_t axis)       { assert(axis < 2); return (&x)[axis]; }
+#else // regular path: index through the components[] union member (0 -> x, 1 -> y)
+	__forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return components[axis]; }
+	__forceinline       T& operator [](const size_t axis )      { assert(axis < 2); return components[axis]; }
+#endif
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators (each applies the same-named operation of T to x and to y)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a ) { return Vec2<T>(+a.x, +a.y); }
+  template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a ) { return Vec2<T>(-a.x, -a.y); }
+  template<typename T> __forceinline Vec2<T> abs       ( const Vec2<T>& a ) { return Vec2<T>(abs  (a.x), abs  (a.y)); }
+  template<typename T> __forceinline Vec2<T> rcp       ( const Vec2<T>& a ) { return Vec2<T>(rcp  (a.x), rcp  (a.y)); }
+  template<typename T> __forceinline Vec2<T> rsqrt     ( const Vec2<T>& a ) { return Vec2<T>(rsqrt(a.x), rsqrt(a.y)); }
+  template<typename T> __forceinline Vec2<T> sqrt      ( const Vec2<T>& a ) { return Vec2<T>(sqrt (a.x), sqrt (a.y)); }
+  template<typename T> __forceinline Vec2<T> frac      ( const Vec2<T>& a ) { return Vec2<T>(frac (a.x), frac (a.y)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators (component-wise; a bare T operand is combined with both x and y)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x + b.x, a.y + b.y); }
+  template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a, const       T& b ) { return Vec2<T>(a.x + b  , a.y + b  ); }
+  template<typename T> __forceinline Vec2<T> operator +( const       T& a, const Vec2<T>& b ) { return Vec2<T>(a   + b.x, a   + b.y); }
+  template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x - b.x, a.y - b.y); }
+  template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a, const       T& b ) { return Vec2<T>(a.x - b  , a.y - b  ); }
+  template<typename T> __forceinline Vec2<T> operator -( const       T& a, const Vec2<T>& b ) { return Vec2<T>(a   - b.x, a   - b.y); }
+  template<typename T> __forceinline Vec2<T> operator *( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x * b.x, a.y * b.y); }
+  template<typename T> __forceinline Vec2<T> operator *( const       T& a, const Vec2<T>& b ) { return Vec2<T>(a   * b.x, a   * b.y); }
+  template<typename T> __forceinline Vec2<T> operator *( const Vec2<T>& a, const       T& b ) { return Vec2<T>(a.x * b  , a.y * b  ); }
+  template<typename T> __forceinline Vec2<T> operator /( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x / b.x, a.y / b.y); }
+  template<typename T> __forceinline Vec2<T> operator /( const Vec2<T>& a, const       T& b ) { return Vec2<T>(a.x / b  , a.y / b  ); }
+  template<typename T> __forceinline Vec2<T> operator /( const       T& a, const Vec2<T>& b ) { return Vec2<T>(a   / b.x, a   / b.y); }
+
+  template<typename T> __forceinline Vec2<T> min(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(min(a.x, b.x), min(a.y, b.y)); }
+  template<typename T> __forceinline Vec2<T> max(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(max(a.x, b.x), max(a.y, b.y)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators (forward per component to T's madd/msub/nmadd/nmsub; second group takes a scalar first operand)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T> madd  ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y) ); }
+  template<typename T> __forceinline Vec2<T> msub  ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y) ); }
+  template<typename T> __forceinline Vec2<T> nmadd ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y) ); }
+  template<typename T> __forceinline Vec2<T> nmsub ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y) ); }
+
+  template<typename T> __forceinline Vec2<T> madd  ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( madd(a,b.x,c.x), madd(a,b.y,c.y) ); }
+  template<typename T> __forceinline Vec2<T> msub  ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( msub(a,b.x,c.x), msub(a,b.y,c.y) ); }
+  template<typename T> __forceinline Vec2<T> nmadd ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y) ); }
+  template<typename T> __forceinline Vec2<T> nmsub ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y) ); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators (update a in place and return it; += and -= take a vector, *= and /= take a scalar)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T>& operator +=( Vec2<T>& a, const Vec2<T>& b ) { a.x += b.x; a.y += b.y; return a; }
+  template<typename T> __forceinline Vec2<T>& operator -=( Vec2<T>& a, const Vec2<T>& b ) { a.x -= b.x; a.y -= b.y; return a; }
+  template<typename T> __forceinline Vec2<T>& operator *=( Vec2<T>& a, const       T& b ) { a.x *= b  ; a.y *= b  ; return a; }
+  template<typename T> __forceinline Vec2<T>& operator /=( Vec2<T>& a, const       T& b ) { a.x /= b  ; a.y /= b  ; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operators (combine x and y into a single value of type T)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline T reduce_add( const Vec2<T>& a ) { return a.x + a.y; }
+  template<typename T> __forceinline T reduce_mul( const Vec2<T>& a ) { return a.x * a.y; }
+  template<typename T> __forceinline T reduce_min( const Vec2<T>& a ) { return min(a.x, a.y); }
+  template<typename T> __forceinline T reduce_max( const Vec2<T>& a ) { return max(a.x, a.y); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators (== and != compare both components; < is a lexicographic order)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const Vec2<T>& a, const Vec2<T>& b ) { return a.x == b.x && a.y == b.y; }
+  template<typename T> __forceinline bool operator !=( const Vec2<T>& a, const Vec2<T>& b ) { return a.x != b.x || a.y != b.y; }
+  template<typename T> __forceinline bool operator < ( const Vec2<T>& a, const Vec2<T>& b ) {
+    if (a.x != b.x) return a.x < b.x; // x decides whenever it differs
+    if (a.y != b.y) return a.y < b.y; // otherwise y decides
+    return false; // equal vectors are not less than each other
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Shift Operators (applies T's shift_right_1 to x and to y)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T> shift_right_1( const Vec2<T>& a ) {
+    return Vec2<T>(shift_right_1(a.x),shift_right_1(a.y));
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline T       dot      ( const Vec2<T>& a, const Vec2<T>& b ) { return madd(a.x,b.x,a.y*b.y); }
+  template<typename T> __forceinline Vec2<T> cross    ( const Vec2<T>& a )                   { return Vec2<T>(-a.y,a.x); } 
+  template<typename T> __forceinline T       length   ( const Vec2<T>& a )                   { return sqrt(dot(a,a)); }
+  template<typename T> __forceinline Vec2<T> normalize( const Vec2<T>& a )                   { return a*rsqrt(dot(a,a)); }
+  template<typename T> __forceinline T       distance ( const Vec2<T>& a, const Vec2<T>& b ) { return length(a-b); }
+  template<typename T> __forceinline T       det      ( const Vec2<T>& a, const Vec2<T>& b ) { return a.x*b.y - a.y*b.x; }
+
+  template<typename T> __forceinline Vec2<T> normalize_safe( const Vec2<T>& a ) { // like normalize(), but returns a unchanged when dot(a,a) == 0
+    const T d = dot(a,a); return select(d == T( zero ),a, a*rsqrt(d) );
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select (per-component forward to T's select; s may be one bool, a Vec2<bool>, or a T::Bool mask)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T> select ( bool s, const Vec2<T>& t, const Vec2<T>& f ) {
+    return Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y)); // the same condition is used for x and y
+  }
+
+  template<typename T> __forceinline Vec2<T> select ( const Vec2<bool>& s, const Vec2<T>& t, const Vec2<T>& f ) {
+    return Vec2<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y)); // s.x chooses x, s.y chooses y
+  }
+
+  template<typename T> __forceinline Vec2<T> select ( const typename T::Bool& s, const Vec2<T>& t, const Vec2<T>& f ) {
+    return Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y)); // the same mask is used for x and y
+  }
+
+  template<typename T>
+    __forceinline Vec2<T> lerp(const Vec2<T>& v0, const Vec2<T>& v1, const T& t) { // blends v0 (at t = 0) towards v1 (at t = 1)
+    return madd(Vec2<T>(T(1.0f)-t),v0,t*v1); // i.e. (1-t)*v0 + t*v1, assuming madd(a,b,c) computes a*b + c
+  }
+
+  template<typename T> __forceinline int maxDim ( const Vec2<T>& a ) // index (0 = x, 1 = y) of the component with the larger absolute value
+  {
+    const Vec2<T> b = abs(a);
+    if (b.x > b.y) return 0;
+    else return 1; // also taken when |x| == |y|
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators (writes the vector as "(x, y)")
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2<T>& a) {
+    return cout << "(" << a.x << ", " << a.y << ")";
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Default template instantiations (short aliases for the bool, int and float variants)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  typedef Vec2<bool > Vec2b;
+  typedef Vec2<int  > Vec2i;
+  typedef Vec2<float> Vec2f;
+}
+
+#include "vec2fa.h"
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+#include "../simd/sse.h"
+#endif
+
+#if defined(__AVX__)
+#include "../simd/avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "../simd/avx512.h"
+#endif
+
+namespace embree // definitions of the Vec2(const Vec2fa&) constructor declared in struct Vec2
+{
+  template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} // scalar float variant
+
+#if defined(__SSE__) || defined(__ARM_NEON) // same condition that guards the sse.h include above
+  template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
+#endif
+
+#if defined(__AVX__) // same condition that guards the avx.h include above
+  template<> __forceinline Vec2<vfloat8>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
+#endif
+
+#if defined(__AVX512F__) // same condition that guards the avx512.h include above
+  template<> __forceinline Vec2<vfloat16>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
+#endif
+}
diff --git a/thirdparty/embree-aarch64/common/math/vec2fa.h b/thirdparty/embree-aarch64/common/math/vec2fa.h
new file mode 100644
index 0000000000..451ecd556c
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/vec2fa.h
@@ -0,0 +1,317 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "math.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec2fa Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec2fa
+  {
+    ALIGNED_STRUCT_(16);
+
+    typedef float Scalar;
+    enum { N = 2 }; // logical component count; the storage below is four floats wide
+    union {
+      __m128 m128; // whole 4-lane register
+      struct { float x,y,az,aw; }; // x, y are the vector; az, aw name the two remaining lanes
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2fa( ) {} // empty body: no lane is assigned here
+    __forceinline Vec2fa( const __m128 a ) : m128(a) {}
+
+    __forceinline Vec2fa            ( const Vec2<float>& other  ) { x = other.x; y = other.y; } // assigns x and y only; az/aw are not written
+    __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; }
+
+    __forceinline Vec2fa            ( const Vec2fa& other ) { m128 = other.m128; } // copies all four lanes
+    __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; }
+
+    __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {} // a in all four lanes
+    __forceinline          Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {} // _mm_set_ps lists lanes high to low: lanes are (x, y, y, y)
+
+    __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} // converts four 32-bit integers to float
+
+    __forceinline operator const __m128&() const { return m128; }
+    __forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores (each call reads or writes a full 128-bit register, i.e. four floats)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline Vec2fa load( const void* const a ) { // aligned load (_mm_load_ps); keeps lanes 0-1, zeroes lanes 2-3
+      return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
+    }
+
+    static __forceinline Vec2fa loadu( const void* const a ) { // unaligned variant of load(); same masking
+      return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
+    }
+
+    static __forceinline void storeu ( void* ptr, const Vec2fa& v ) { // unaligned store of all four lanes, not just x and y
+      _mm_storeu_ps((float*)ptr,v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants (tag-type constructors: the named constant is written to all four lanes)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2fa( ZeroTy   ) : m128(_mm_setzero_ps()) {}
+    __forceinline Vec2fa( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
+    __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access (index 0 -> x, 1 -> y; the assert rejects the two extra lanes)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; }
+    __forceinline       float& operator []( const size_t index )       { assert(index < 2); return (&x)[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators (operate on the whole 4-lane register)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; }
+  __forceinline Vec2fa operator -( const Vec2fa& a ) {
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); // sign bit of every lane
+    return _mm_xor_ps(a.m128, mask); // flipping the sign bit negates
+  }
+  __forceinline Vec2fa abs  ( const Vec2fa& a ) {
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); // everything except the sign bit
+    return _mm_and_ps(a.m128, mask); // clearing the sign bit gives the absolute value
+  }
+  __forceinline Vec2fa sign ( const Vec2fa& a ) {
+    return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero))); // -1 where a < 0, +1 elsewhere (assuming blendv_ps picks its 2nd argument where the mask is set)
+  }
+
+  __forceinline Vec2fa rcp  ( const Vec2fa& a ) // reciprocal: hardware estimate refined by Newton-Raphson steps
+  {
+#if defined(__aarch64__)
+        __m128 reciprocal = _mm_rcp_ps(a.m128); // initial estimate
+        reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); // refinement step 1
+        reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); // refinement step 2
+        return (const Vec2fa)reciprocal;
+#else
+#if defined(__AVX512VL__)
+    const Vec2fa r = _mm_rcp14_ps(a.m128); // initial estimate (rcp14 variant when AVX512VL is available)
+#else
+    const Vec2fa r = _mm_rcp_ps(a.m128); // initial estimate
+#endif
+
+#if defined(__AVX2__)
+    const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); // one step r * (2 - r*a), using a fused negate-multiply-add
+#else
+    const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); // one step r * (2 - r*a)
+    //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#endif
+
+    return res;
+#endif  //defined(__aarch64__) 
+  }
+
+  __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); } // square root of every lane
+  __forceinline Vec2fa sqr  ( const Vec2fa& a ) { return _mm_mul_ps(a,a); } // square (a*a) of every lane
+
+  __forceinline Vec2fa rsqrt( const Vec2fa& a ) // reciprocal square root: hardware estimate refined by Newton-Raphson steps
+  {
+#if defined(__aarch64__)
+        __m128 r = _mm_rsqrt_ps(a.m128); // initial estimate
+        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); // refinement step 1
+        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); // refinement step 2
+        return r;
+#else
+        
+#if defined(__AVX512VL__)
+    __m128 r = _mm_rsqrt14_ps(a.m128); // initial estimate (rsqrt14 variant when AVX512VL is available)
+#else
+    __m128 r = _mm_rsqrt_ps(a.m128); // initial estimate
+#endif
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); // one step: 1.5*r - 0.5*a*r^3
+        
+#endif
+  }
+
+  __forceinline Vec2fa zero_fix(const Vec2fa& a) { // replaces lanes with |a| < min_rcp_input by +min_rcp_input (assuming blendv_ps picks its 2nd argument where the mask is set)
+    return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
+  }
+  __forceinline Vec2fa rcp_safe(const Vec2fa& a) { // rcp() applied after zero_fix(), so tiny inputs are replaced first
+    return rcp(zero_fix(a));
+  }
+  __forceinline Vec2fa log ( const Vec2fa& a ) { // scalar logf on x and y
+    return Vec2fa(logf(a.x),logf(a.y));
+  }
+
+  __forceinline Vec2fa exp ( const Vec2fa& a ) { // scalar expf on x and y
+    return Vec2fa(expf(a.x),expf(a.y));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators (lane-wise on the full register; float operands are broadcast to all lanes)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); }
+  __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
+  __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
+  __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); }
+  __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; }
+  __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); }
+  __forceinline Vec2fa operator /( const Vec2fa& a, const float b        ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
+  __forceinline Vec2fa operator /( const        float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
+
+  __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); }
+  __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); }
+
+#if defined(__aarch64__) || defined(__SSE4_1__)
+    __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) { // minimum taken on the raw bit patterns as signed 32-bit integers
+      const vint4 ai = _mm_castps_si128(a); // reinterpret, no numeric conversion
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_min_epi32(ai,bi);
+      return _mm_castsi128_ps(ci); // NOTE(review): integer order equals float order only for non-negative floats -- confirm callers rely on that
+    }
+#endif
+
+#if defined(__aarch64__) || defined(__SSE4_1__)
+    __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) { // maximum taken on the raw bit patterns as signed 32-bit integers
+      const vint4 ai = _mm_castps_si128(a); // reinterpret, no numeric conversion
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_max_epi32(ai,bi);
+      return _mm_castsi128_ps(ci); // NOTE(review): integer order equals float order only for non-negative floats -- confirm callers rely on that
+    }
+#endif
+
+    __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) { // scalar powf on x and y with the same exponent b
+      return Vec2fa(powf(a.x,b),powf(a.y,b));
+    }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators (madd = a*b+c, msub = a*b-c, nmadd = -a*b+c, nmsub = -a*b-c)
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__) // fused multiply-add intrinsics
+  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); }
+  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); }
+  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); }
+  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); }
+#else // fallback: separate multiply and add/subtract using the operators defined above
+  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; }
+  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; }
+  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;}
+  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; }
+#endif
+
+  __forceinline Vec2fa madd  ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); } // scalar first operand is broadcast
+  __forceinline Vec2fa msub  ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); }
+  __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); }
+  __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { a = a + b; return a; } // compound forms are defined via the binary operators
+  __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { a = a - b; return a; }
+  __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { a = a * b; return a; }
+  __forceinline Vec2fa& operator *=( Vec2fa& a, const float   b ) { a = a * b; return a; }
+  __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { a = a / b; return a; }
+  __forceinline Vec2fa& operator /=( Vec2fa& a, const float   b ) { a = a / b; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float reduce_add(const Vec2fa& v) { const float total   = v.x+v.y;       return total;   } // x + y
+  __forceinline float reduce_mul(const Vec2fa& v) { const float product = v.x*v.y;       return product; } // x * y
+  __forceinline float reduce_min(const Vec2fa& v) { const float lowest  = min(v.x,v.y);  return lowest;  } // smaller of x and y
+  __forceinline float reduce_max(const Vec2fa& v) { const float highest = max(v.x,v.y);  return highest; } // larger of x and y
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { const int eqBits = _mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)); return (eqBits & 3) == 3; } // both of lanes 0 and 1 must compare equal
+  __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { const int neBits = _mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)); return (neBits & 3) != 0; } // any of lanes 0 and 1 differs
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE4_1__)
+  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { // mask 0x3F: multiply lanes 0 and 1 only, write the sum to every lane
+    const __m128 summed = _mm_dp_ps(a,b,0x3F); return _mm_cvtss_f32(summed);
+  }
+#else
+  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { // component-wise product, then x + y
+    const Vec2fa products = a*b; return reduce_add(products);
+  }
+#endif
+
+  __forceinline Vec2fa cross ( const Vec2fa& a ) { // maps (x, y) to (-y, x)
+    const float outX = -a.y, outY = a.x; return Vec2fa(outX,outY);
+  }
+
+  __forceinline float  sqr_length ( const Vec2fa& a )                { return dot(a,a); }                                    // squared length
+  __forceinline float  rcp_length ( const Vec2fa& a )                { return rsqrt(sqr_length(a)); }                        // rsqrt of the squared length
+  __forceinline float  rcp_length2( const Vec2fa& a )                { return rcp(sqr_length(a)); }                          // rcp of the squared length
+  __forceinline float  length   ( const Vec2fa& a )                  { return sqrt(sqr_length(a)); }                         // sqrt of the squared length
+  __forceinline Vec2fa normalize( const Vec2fa& a )                  { return a*rcp_length(a); }                             // a scaled by rcp_length(a)
+  __forceinline float  distance ( const Vec2fa& a, const Vec2fa& b ) { const Vec2fa delta = a-b; return length(delta); }     // length of a - b
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) { // one bool chooses between t and f for all lanes
+    const __m128 allSet = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); // 0 == 0 sets every bit
+    return blendv_ps(f, t, s ? allSet : _mm_setzero_ps());
+  }
+
+  __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) { // (1-t)*v0 + t*v1, evaluated through madd
+    const float weight0 = 1.0f-t; return madd(weight0,v0,t*v1);
+  }
+
+  __forceinline int maxDim ( const Vec2fa& a )
+  {
+    // Index of the component with the larger magnitude; anything other than
+    // |x| > |y| (including a tie) selects axis 1.
+    const Vec2fa mag = abs(a);
+    return (mag.x > mag.y) ? 0 : 1;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__) // NEON rounding intrinsics
+__forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); }
+__forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); }
+//__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); }
+#elif defined (__SSE4_1__)
+  //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO); } // disabled; truncation needs TO_ZERO, not TO_NEAREST_INT
+  __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF    ); }
+  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF    ); }
+#else // scalar fallback through the C math library
+  //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y)); } // disabled; only x and y are used by this 2D type
+  __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); }
+  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) { // prints "(x, y)"
+    return cout << "(" << a.x << ", " << a.y << ")";
+  }
+  // Alias for Vec2fa under the name Vec2fa_t.
+  typedef Vec2fa Vec2fa_t;
+}
diff --git a/thirdparty/embree-aarch64/common/math/vec3.h b/thirdparty/embree-aarch64/common/math/vec3.h
new file mode 100644
index 0000000000..1870321715
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/vec3.h
@@ -0,0 +1,349 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+
+namespace embree
+{
+  struct Vec3fa;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Generic 3D vector Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct Vec3 // three components of type T; this file instantiates it with bool, int, float and SIMD types
+  {
+    enum { N  = 3 }; // number of components
+    // Storage: the named members x/y/z share memory with the components[] array view.
+    union {
+      struct {
+	T x, y, z;
+      };
+#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
+      T components[N]; // indexed by operator[] on every compiler except VS 2013
+#endif
+    };
+
+    typedef T Scalar;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3( ) {} // performs no explicit initialization of the components
+    __forceinline explicit Vec3( const T& a                         ) : x(a), y(a), z(a) {}
+    __forceinline          Vec3( const T& x, const T& y, const T& z ) : x(x), y(y), z(z) {}
+    // Copy construction; the Vec3fa overload is only declared here and defined as per-type specializations later in this file.
+    __forceinline Vec3( const Vec3& other ) { x = other.x; y = other.y; z = other.z; }
+    __forceinline Vec3( const Vec3fa& other );
+    // Converting forms: the constructor casts each component with T(...), the assignment relies on implicit conversion.
+    template<typename T1> __forceinline Vec3( const Vec3<T1>& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)) {}
+    template<typename T1> __forceinline Vec3& operator =(const Vec3<T1>& other) { x = other.x; y = other.y; z = other.z; return *this; }
+
+    __forceinline Vec3& operator =(const Vec3& other) { x = other.x; y = other.y; z = other.z; return *this; }
+	
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3( ZeroTy   ) : x(zero), y(zero), z(zero) {}
+    __forceinline Vec3( OneTy    ) : x(one),  y(one),  z(one) {}
+    __forceinline Vec3( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf) {}
+    __forceinline Vec3( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf) {}
+    // Indexed access: axis must be 0, 1 or 2; this is checked by assert only.
+#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler
+    __forceinline const T& operator []( const size_t axis ) const { assert(axis < 3); return (&x)[axis]; }
+    __forceinline       T& operator []( const size_t axis )       { assert(axis < 3); return (&x)[axis]; }
+#else
+	__forceinline const T& operator [](const size_t axis) const { assert(axis < 3); return components[axis]; }
+	__forceinline       T& operator [](const size_t axis)       { assert(axis < 3); return components[axis]; }
+#endif
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a ) { return Vec3<T>(+a.x, +a.y, +a.z); }
+  template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a ) { return Vec3<T>(-a.x, -a.y, -a.z); }
+  template<typename T> __forceinline Vec3<T> abs       ( const Vec3<T>& a ) { return Vec3<T>(abs  (a.x), abs  (a.y), abs  (a.z)); } // each helper below is applied per component
+  template<typename T> __forceinline Vec3<T> rcp       ( const Vec3<T>& a ) { return Vec3<T>(rcp  (a.x), rcp  (a.y), rcp  (a.z)); }
+  template<typename T> __forceinline Vec3<T> rsqrt     ( const Vec3<T>& a ) { return Vec3<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z)); }
+  template<typename T> __forceinline Vec3<T> sqrt      ( const Vec3<T>& a ) { return Vec3<T>(sqrt (a.x), sqrt (a.y), sqrt (a.z)); }
+  // zero_fix: per component, select(|v| < min_rcp_input, min_rcp_input, v); select(c,t,f) is assumed to yield t where c holds.
+  template<typename T> __forceinline Vec3<T> zero_fix( const Vec3<T>& a )
+  {
+    return Vec3<T>(select(abs(a.x)<min_rcp_input,T(min_rcp_input),a.x),
+                   select(abs(a.y)<min_rcp_input,T(min_rcp_input),a.y),
+                   select(abs(a.z)<min_rcp_input,T(min_rcp_input),a.z));
+  }
+  template<typename T> __forceinline Vec3<T> rcp_safe(const Vec3<T>& a) { return rcp(zero_fix(a)); } // reciprocal of the zero_fix-ed input
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x + b.x, a.y + b.y, a.z + b.z); }
+  template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x - b.x, a.y - b.y, a.z - b.z); }
+  template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x * b.x, a.y * b.y, a.z * b.z); }
+  template<typename T> __forceinline Vec3<T> operator *( const       T& a, const Vec3<T>& b ) { return Vec3<T>(a   * b.x, a   * b.y, a   * b.z); }
+  template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const       T& b ) { return Vec3<T>(a.x * b  , a.y * b  , a.z * b  ); }
+  template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const       T& b ) { return Vec3<T>(a.x / b  , a.y / b  , a.z / b  ); }
+  template<typename T> __forceinline Vec3<T> operator /( const       T& a, const Vec3<T>& b ) { return Vec3<T>(a   / b.x, a   / b.y, a   / b.z); }
+  template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x / b.x, a.y / b.y, a.z / b.z); }
+  // Component-wise minimum / maximum, using the two-argument min/max of the component type.
+  template<typename T> __forceinline Vec3<T> min(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); }
+  template<typename T> __forceinline Vec3<T> max(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); }
+  // Shifts: every component is shifted by the same count b.
+  template<typename T> __forceinline Vec3<T> operator >>( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x >> b, a.y >> b, a.z >> b); }
+  template<typename T> __forceinline Vec3<T> operator <<( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x << b, a.y << b, a.z << b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<T> madd  ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z)); } // per-component madd of the component type
+  template<typename T> __forceinline Vec3<T> msub  ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z)); }
+  template<typename T> __forceinline Vec3<T> nmadd ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z));}
+  template<typename T> __forceinline Vec3<T> nmsub ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z)); }
+  // Overloads whose first operand is a single T shared by all three components.
+  template<typename T> __forceinline Vec3<T> madd  ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z)); }
+  template<typename T> __forceinline Vec3<T> msub  ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z)); }
+  template<typename T> __forceinline Vec3<T> nmadd ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z));}
+  template<typename T> __forceinline Vec3<T> nmsub ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const T        b ) { a.x += b;   a.y += b;   a.z += b;   return a; } // adds the single value b to every component
+  template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const Vec3<T>& b ) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } // component-wise
+  template<typename T> __forceinline Vec3<T>& operator -=( Vec3<T>& a, const Vec3<T>& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } // component-wise
+  template<typename T> __forceinline Vec3<T>& operator *=( Vec3<T>& a, const       T& b ) { a.x *= b  ; a.y *= b  ; a.z *= b  ; return a; } // scales every component by b
+  template<typename T> __forceinline Vec3<T>& operator /=( Vec3<T>& a, const       T& b ) { a.x /= b  ; a.y /= b  ; a.z /= b  ; return a; } // divides every component by b
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline T reduce_add( const Vec3<T>& a ) { return a.x + a.y + a.z; } // sum of the three components
+  template<typename T> __forceinline T reduce_mul( const Vec3<T>& a ) { return a.x * a.y * a.z; } // product of the three components
+  template<typename T> __forceinline T reduce_min( const Vec3<T>& a ) { return min(a.x, a.y, a.z); } // uses the three-argument min
+  template<typename T> __forceinline T reduce_max( const Vec3<T>& a ) { return max(a.x, a.y, a.z); } // uses the three-argument max
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const Vec3<T>& a, const Vec3<T>& b ) { return a.x == b.x && a.y == b.y && a.z == b.z; } // all components equal
+  template<typename T> __forceinline bool operator !=( const Vec3<T>& a, const Vec3<T>& b ) { return a.x != b.x || a.y != b.y || a.z != b.z; } // any component differs
+  template<typename T> __forceinline bool operator < ( const Vec3<T>& a, const Vec3<T>& b ) { // lexicographic order on (x, y, z)
+    if (a.x != b.x) return a.x < b.x;
+    if (a.y != b.y) return a.y < b.y;
+    if (a.z != b.z) return a.z < b.z;
+    return false; // no component differed
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Shift Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<T> shift_right_1( const Vec3<T>& a ) { // applies the component type's shift_right_1 to x, y and z
+    return Vec3<T>(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<T> select ( bool s, const Vec3<T>& t, const Vec3<T>& f ) { // one bool applied to all three components
+    return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z));
+  }
+  // Per-axis selection: s.x, s.y and s.z each drive their own component.
+  template<typename T> __forceinline Vec3<T> select ( const Vec3<bool>& s, const Vec3<T>& t, const Vec3<T>& f ) {
+    return Vec3<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z));
+  }
+  // Overload for component types that expose a nested Bool type; the same mask s is passed for x, y and z.
+  template<typename T> __forceinline Vec3<T> select ( const typename T::Bool& s, const Vec3<T>& t, const Vec3<T>& f ) {
+    return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z));
+  }
+  // lerp: (1-t)*v0 + t*v1, evaluated as madd(1-t, v0, t*v1).
+  template<typename T>
+    __forceinline Vec3<T> lerp(const Vec3<T>& v0, const Vec3<T>& v1, const T& t) {
+    return madd(Vec3<T>(T(1.0f)-t),v0,t*v1);
+  }
+
+  template<typename T> __forceinline int maxDim ( const Vec3<T>& a )
+  {
+    // Index (0 = x, 1 = y, 2 = z) of the component with the largest magnitude.
+    // All comparisons are strict, so a tie resolves to the later axis.
+    const Vec3<T> mag = abs(a);
+    if (mag.x > mag.y)
+      return (mag.x > mag.z) ? 0 : 2;
+    return (mag.y > mag.z) ? 1 : 2;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<bool> eq_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x==b.x,a.y==b.y,a.z==b.z); } // each *_mask returns one comparison result per axis
+  template<typename T> __forceinline Vec3<bool> neq_mask(const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x!=b.x,a.y!=b.y,a.z!=b.z); }
+  template<typename T> __forceinline Vec3<bool> lt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x< b.x,a.y< b.y,a.z< b.z); }
+  template<typename T> __forceinline Vec3<bool> le_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x<=b.x,a.y<=b.y,a.z<=b.z); }
+  template<typename T> __forceinline Vec3<bool> gt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x> b.x,a.y> b.y,a.z> b.z); }
+  template<typename T> __forceinline Vec3<bool> ge_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x>=b.x,a.y>=b.y,a.z>=b.z); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline T       sqr      ( const Vec3<T>& a )                   { return dot(a,a); } // squared length
+  template<typename T> __forceinline T       dot      ( const Vec3<T>& a, const Vec3<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,a.z*b.z)); } // x*x' + (y*y' + z*z'), nested through madd
+  template<typename T> __forceinline T       length   ( const Vec3<T>& a )                   { return sqrt(sqr(a)); }
+  template<typename T> __forceinline T       rcp_length( const Vec3<T>& a )                  { return rsqrt(sqr(a)); }
+  template<typename T> __forceinline Vec3<T> normalize( const Vec3<T>& a )                   { return a*rsqrt(sqr(a)); } // no special handling of a zero-length input
+  template<typename T> __forceinline T       distance ( const Vec3<T>& a, const Vec3<T>& b ) { return length(a-b); }
+  template<typename T> __forceinline Vec3<T> cross    ( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(prod_diff(a.y,b.z,a.z,b.y), prod_diff(a.z,b.x,a.x,b.z), prod_diff(a.x,b.y,a.y,b.x)); } // prod_diff(p,q,r,s) presumably computes p*q - r*s; confirm
+  template<typename T> __forceinline Vec3<T> stable_triangle_normal( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c ) // per component, picks between a x b and b x c
+  {
+    const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x; // the products subtracted in each component of a x b
+    const T bc_x = b.z*c.y, bc_y = b.x*c.z, bc_z = b.y*c.x; // the products subtracted in each component of b x c
+    const Vec3<T> cross_ab(msub(a.y,b.z,ab_x), msub(a.z,b.x,ab_y), msub(a.x,b.y,ab_z)); // a x b
+    const Vec3<T> cross_bc(msub(b.y,c.z,bc_x), msub(b.z,c.x,bc_y), msub(b.x,c.y,bc_z)); // b x c
+    const auto sx = abs(ab_x) < abs(bc_x); // true where the a x b subtrahend is smaller in magnitude
+    const auto sy = abs(ab_y) < abs(bc_y);
+    const auto sz = abs(ab_z) < abs(bc_z);
+    return Vec3<T>(select(sx,cross_ab.x,cross_bc.x), // NOTE(review): presumably chosen to limit cancellation error; confirm
+                   select(sy,cross_ab.y,cross_bc.y),
+                   select(sz,cross_ab.z,cross_bc.z));
+  }
+
+  template<typename T> __forceinline T       sum      ( const Vec3<T>& a )                   { return reduce_add(a); } // x + y + z
+  // halfArea evaluates x*(y+z) + y*z through madd; area doubles that value.
+  template<typename T> __forceinline      T  halfArea ( const Vec3<T>& d )                  { return madd(d.x,(d.y+d.z),d.y*d.z); }
+  template<typename T> __forceinline      T  area     ( const Vec3<T>& d )                  { const T half = halfArea(d); return 2.0f*half; }
+
+  template<typename T> __forceinline Vec3<T> normalize_safe( const Vec3<T>& a ) { // returns a unchanged where its squared length equals zero
+    const T lenSqr = dot(a,a); const Vec3<T> scaled = a*rsqrt(lenSqr); return select(lenSqr == T( zero ), a, scaled);
+  }
+
+  template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& P, const Vec3<T>& Q0, const Vec3<T>& Q1)
+  { // |(P-Q0) x (Q1-Q0)|^2 multiplied by rcp(|Q1-Q0|^2)
+    const Vec3<T> dir = Q1-Q0;
+    const Vec3<T> nrm = cross(P-Q0,dir);
+    return dot(nrm,nrm)*rcp(dot(dir,dir));
+  }
+
+  template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& PmQ0, const Vec3<T>& Q1mQ0)
+  { // |PmQ0 x Q1mQ0|^2 * rcp(|Q1mQ0|^2); the names suggest the inputs are the differences P-Q0 and Q1-Q0
+    const Vec3<T> nrm = cross(PmQ0,Q1mQ0);
+    const T invDirSqr = rcp(dot(Q1mQ0,Q1mQ0));
+    return dot(nrm,nrm)*invDirSqr;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3<T>& a) { // prints "(x, y, z)"
+    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
+  }
+  // Short names for the bool, int and float instantiations.
+  typedef Vec3<bool > Vec3b;
+  typedef Vec3<int  > Vec3i;
+  typedef Vec3<float> Vec3f;
+}
+
+#include "vec3ba.h"
+#include "vec3ia.h"
+#include "vec3fa.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// SSE / AVX / MIC specializations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+#include "../simd/sse.h"
+#endif
+
+#if defined(__AVX__)
+#include "../simd/avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "../simd/avx512.h"
+#endif
+
+namespace embree
+{
+  template<typename Out, typename In>
+  __forceinline Vec3<Out> broadcast(const Vec3<In>& a, const size_t k) { // generic form: element k of each component, converted with Out(...)
+    return Vec3<Out>(Out(a.x[k]), Out(a.y[k]), Out(a.z[k]));
+  }
+  // Definitions of the Vec3(const Vec3fa&) constructor that the class template only declares.
+  template<> __forceinline Vec3<float>::Vec3(const Vec3fa& a) { x = a.x; y = a.y; z = a.z; }
+
+#if defined(__AVX__) // assigns the scalar members of a directly to the vfloat4 components
+  template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
+    x = a.x; y = a.y; z = a.z;
+  }
+#elif defined(__SSE__) || defined(__ARM_NEON) // loads a.m128 once and shuffles lane 0/1/2 across x/y/z
+  template<>
+  __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
+    const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v);
+  }
+#endif
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+  __forceinline Vec3<vfloat4> broadcast4f(const Vec3<vfloat4>& a, const size_t k) { // element k of x, y and z, each passed through vfloat4::broadcast
+    const vfloat4 bx = vfloat4::broadcast(&a.x[k]); const vfloat4 by = vfloat4::broadcast(&a.y[k]); const vfloat4 bz = vfloat4::broadcast(&a.z[k]); return Vec3<vfloat4>(bx, by, bz);
+  }
+  // Specialization of the generic broadcast<Out,In>; performs the same operation as broadcast4f.
+  template<>
+  __forceinline Vec3<vfloat4> broadcast<vfloat4,vfloat4>(const Vec3<vfloat4>& a, const size_t k) {
+    return broadcast4f(a, k);
+  }
+  // Applies the same lane permutation <i0,i1,i2,i3> to each of the three components.
+  template<int i0, int i1, int i2, int i3>
+  __forceinline Vec3<vfloat4> shuffle(const Vec3<vfloat4>& b) {
+    const vfloat4 sx = shuffle<i0,i1,i2,i3>(b.x); const vfloat4 sy = shuffle<i0,i1,i2,i3>(b.y); const vfloat4 sz = shuffle<i0,i1,i2,i3>(b.z); return Vec3<vfloat4>(sx, sy, sz);
+  }
+#endif
+
+#if defined(__AVX__)
+  template<>
+  __forceinline Vec3<vfloat8>::Vec3(const Vec3fa& a) { // assigns the scalar members of a to the vfloat8 components
+    x = a.x; y = a.y; z = a.z;
+  }
+  __forceinline Vec3<vfloat4> broadcast4f(const Vec3<vfloat8>& a, const size_t k) { // element k of an 8-wide source, broadcast into 4-wide components
+    const vfloat4 bx = vfloat4::broadcast(&a.x[k]); const vfloat4 by = vfloat4::broadcast(&a.y[k]); const vfloat4 bz = vfloat4::broadcast(&a.z[k]); return Vec3<vfloat4>(bx, by, bz);
+  }
+  __forceinline Vec3<vfloat8> broadcast8f(const Vec3<vfloat4>& a, const size_t k) { // element k of a 4-wide source, broadcast into 8-wide components
+    const vfloat8 bx = vfloat8::broadcast(&a.x[k]); const vfloat8 by = vfloat8::broadcast(&a.y[k]); const vfloat8 bz = vfloat8::broadcast(&a.z[k]); return Vec3<vfloat8>(bx, by, bz);
+  }
+  __forceinline Vec3<vfloat8> broadcast8f(const Vec3<vfloat8>& a, const size_t k) { // element k of an 8-wide source, broadcast into 8-wide components
+    const vfloat8 bx = vfloat8::broadcast(&a.x[k]); const vfloat8 by = vfloat8::broadcast(&a.y[k]); const vfloat8 bz = vfloat8::broadcast(&a.z[k]); return Vec3<vfloat8>(bx, by, bz);
+  }
+  // Specializations of the generic broadcast<Out,In>; each performs the same operation as the matching broadcast8f overload.
+  template<>
+  __forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat4>(const Vec3<vfloat4>& a, const size_t k) {
+    return broadcast8f(a, k);
+  }
+  template<>
+  __forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat8>(const Vec3<vfloat8>& a, const size_t k) {
+    return broadcast8f(a, k);
+  }
+  // Applies the same lane permutation <i0,i1,i2,i3> to each of the three components.
+  template<int i0, int i1, int i2, int i3>
+  __forceinline Vec3<vfloat8> shuffle(const Vec3<vfloat8>& b) {
+    const vfloat8 sx = shuffle<i0,i1,i2,i3>(b.x); const vfloat8 sy = shuffle<i0,i1,i2,i3>(b.y); const vfloat8 sz = shuffle<i0,i1,i2,i3>(b.z); return Vec3<vfloat8>(sx, sy, sz);
+  }
+#endif
+
+#if defined(__AVX512F__) // 16-wide variant: components are initialized directly from the scalar members of a
+  template<> __forceinline Vec3<vfloat16>::Vec3(const Vec3fa& a) : x(a.x), y(a.y), z(a.z) {}
+#endif
+}
diff --git a/thirdparty/embree-aarch64/common/math/vec3ba.h b/thirdparty/embree-aarch64/common/math/vec3ba.h
new file mode 100644
index 0000000000..90f31739c2
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/vec3ba.h
@@ -0,0 +1,120 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "math.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec3ba Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec3ba // three-component boolean mask stored in one __m128
+  {
+    ALIGNED_STRUCT_(16);
+    
+    union {
+      __m128 m128;             // whole-register view
+      struct { int x,y,z; };   // integer view of the first three lanes
+    };
+
+    typedef int Scalar;
+    enum { N = 3 };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3ba( ) {} // performs no initialization
+    __forceinline Vec3ba( const __m128  input ) : m128(input) {}
+    __forceinline Vec3ba( const Vec3ba& other ) : m128(other.m128) {}
+    __forceinline Vec3ba& operator =(const Vec3ba& other) { m128 = other.m128; return *this; }
+    // The bool constructors index mm_lookupmask_ps with a 4-bit value (bit 0 = x, bit 1 = y, bit 2 = z); the table presumably expands each bit to a full lane mask.
+    __forceinline explicit Vec3ba( bool a )
+      : m128(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
+    __forceinline Vec3ba( bool a, bool b, bool c)
+      : m128(mm_lookupmask_ps[(size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+
+    __forceinline operator const __m128&() const { return m128; }
+    __forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3ba( FalseTy ) : m128(_mm_setzero_ps()) {} // all bits clear
+    __forceinline Vec3ba( TrueTy  ) : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {} // 0 == 0 in every lane
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } // index checked by assert only
+    __forceinline       int& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
+  };
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ba operator !( const Vec3ba& a ) { const Vec3ba allTrue(embree::True); return Vec3ba(_mm_xor_ps(a.m128, allTrue.m128)); } // XOR with the all-true mask flips every bit
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ba operator &( const Vec3ba& a, const Vec3ba& b ) { return Vec3ba(_mm_and_ps(a.m128, b.m128)); }
+  __forceinline Vec3ba operator |( const Vec3ba& a, const Vec3ba& b ) { return Vec3ba(_mm_or_ps (a.m128, b.m128)); }
+  __forceinline Vec3ba operator ^( const Vec3ba& a, const Vec3ba& b ) { return Vec3ba(_mm_xor_ps(a.m128, b.m128)); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline Vec3ba& operator &=( Vec3ba& a, const Vec3ba& b ) { a = a & b; return a; }
+  __forceinline Vec3ba& operator |=( Vec3ba& a, const Vec3ba& b ) { a = a | b; return a; }
+  __forceinline Vec3ba& operator ^=( Vec3ba& a, const Vec3ba& b ) { a = a ^ b; return a; }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline bool operator ==( const Vec3ba& a, const Vec3ba& b ) { // lanes x, y and z must hold identical bit patterns; lane 3 is ignored
+    const __m128i same = _mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)); return (_mm_movemask_ps(_mm_castsi128_ps(same)) & 7) == 7;
+  }
+  __forceinline bool operator !=( const Vec3ba& a, const Vec3ba& b ) { // exact negation of operator ==
+    return !(a == b);
+  }
+  __forceinline bool operator < ( const Vec3ba& a, const Vec3ba& b ) { // lexicographic order on the integer lanes (x, y, z)
+    if (a.x != b.x) return a.x < b.x;
+    if (a.y != b.y) return a.y < b.y;
+    // last key: a.z < b.z is already false when the z lanes are equal
+    return a.z < b.z;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+    
+  __forceinline size_t movemask(const Vec3ba& a) { return _mm_movemask_ps(a) & 0x7; } // movemask bits of lanes x, y, z in bits 0..2; lane 3 is masked off
+
+  __forceinline bool reduce_and( const Vec3ba& a ) { return movemask(a) == 0x7; } // all three lanes set
+  __forceinline bool reduce_or ( const Vec3ba& a ) { return movemask(a) != 0x0; } // at least one lane set
+
+  __forceinline bool all       ( const Vec3ba& b ) { return reduce_and(b); }
+  __forceinline bool any       ( const Vec3ba& b ) { return reduce_or(b); }
+  __forceinline bool none      ( const Vec3ba& b ) { return !reduce_or(b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ba& a) { // prints "(b, b, b)" with "1" for a non-zero lane and "0" otherwise
+    return cout << "(" << (a.x ? "1" : "0") << ", " << (a.y ? "1" : "0") << ", " << (a.z ? "1" : "0") << ")";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/math/vec3fa.h b/thirdparty/embree-aarch64/common/math/vec3fa.h
new file mode 100644
index 0000000000..6163cfb596
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/vec3fa.h
@@ -0,0 +1,810 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "math.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec3fa Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec3fa // three floats x,y,z stored in one four-lane SIMD register; lane 3 has no named member
+  {
+    ALIGNED_STRUCT_(16);
+
+    typedef float Scalar;
+    enum { N = 3 }; // number of named components
+    union {
+      __m128 m128; // the full four-lane register
+      struct { float x,y,z; }; // named view of lanes 0..2
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3fa( ) {} // no member is initialized
+    __forceinline Vec3fa( const __m128 a ) : m128(a) {} // takes all four lanes verbatim
+
+    __forceinline Vec3fa            ( const Vec3<float>& other ) { m128  = _mm_set_ps(0, other.z, other.y, other.x); } // lane 3 is set to 0
+    //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { m128  = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
+
+    __forceinline Vec3fa            ( const Vec3fa& other ) { m128 = other.m128; }
+    __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; }
+
+    __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {} // broadcasts a to all four lanes
+    __forceinline          Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} // lane 3 is set to 0
+
+    __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} // converts four int32 lanes to float
+
+    __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
+    __forceinline explicit operator const   vint4() const { return vint4(_mm_cvtps_epi32(m128)); } // float -> int32 conversion of every lane
+    __forceinline explicit operator const  Vec2fa() const { return Vec2fa(m128); }
+    __forceinline explicit operator const  Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } // float -> int32 conversion of every lane
+    
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline Vec3fa load( const void* const a ) { // aligned 16-byte load; lane 3 of the result is forced to 0
+#if defined(__aarch64__)
+        __m128 t = _mm_load_ps((float*)a);
+        t[3] = 0.0f;
+        return Vec3fa(t);
+#else
+      return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); // AND-mask keeps lanes 0..2 only
+#endif
+    }
+
+    static __forceinline Vec3fa loadu( const void* const a ) { // unaligned 16-byte load; lane 3 is kept as read (not cleared)
+      return Vec3fa(_mm_loadu_ps((float*)a));
+    }
+
+    static __forceinline void storeu ( void* ptr, const Vec3fa& v ) { // unaligned store of all four lanes (16 bytes)
+      _mm_storeu_ps((float*)ptr,v.m128);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3fa( ZeroTy   ) : m128(_mm_setzero_ps()) {} // tag constructors: the constant fills all four lanes
+    __forceinline Vec3fa( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
+    __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } // index 0..2 only (asserted)
+    __forceinline       float& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; } // index 0..2 only (asserted)
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; } // unary plus: returns a copy
+  __forceinline Vec3fa operator -( const Vec3fa& a ) {
+#if defined(__aarch64__)
+    return vnegq_f32(a.m128);
+#else
+    // Negate by toggling the top (sign) bit of every 32-bit lane.
+    const __m128 sign_bits = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+    return _mm_xor_ps(a.m128, sign_bits);
+#endif
+  }
+  __forceinline Vec3fa abs  ( const Vec3fa& a ) { // lane-wise absolute value
+#if defined(__aarch64__)
+    return _mm_abs_ps(a.m128);
+#else
+    const __m128 keep_magnitude = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); // every bit except the top one
+    return _mm_and_ps(a.m128, keep_magnitude);
+#endif
+  }
+  __forceinline Vec3fa sign ( const Vec3fa& a ) {
+#if defined(__aarch64__)
+    // Blend vOne / vmOne per lane using the (a < 0) comparison mask.
+    return blendv_ps(vOne, vmOne, _mm_cmplt_ps (a.m128,vdupq_n_f32(0.0f)));
+#else
+    return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128)); // blend +1 / -1 by the (a < 0) mask
+#endif
+  }
+
+  __forceinline Vec3fa rcp  ( const Vec3fa& a ) // lane-wise reciprocal; the code path depends on the target
+  {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+  return vdivq_f32(vdupq_n_f32(1.0f),a.m128); // plain 1/a division
+#elif defined(__aarch64__)
+  __m128 reciprocal = _mm_rcp_ps(a.m128); // starting estimate
+  reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); // refinement step 1
+  reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); // refinement step 2
+  return (const Vec3fa)reciprocal;
+#else
+        
+#if defined(__AVX512VL__)
+    const Vec3fa r = _mm_rcp14_ps(a.m128); // starting estimate
+#else
+    const Vec3fa r = _mm_rcp_ps(a.m128); // starting estimate
+#endif
+
+#if defined(__AVX2__)
+    const Vec3fa res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); // one refinement step: r * (2 - r*a), fused
+#else
+    const Vec3fa res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); // one refinement step: r * (2 - r*a)
+    //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#endif
+
+    return res;
+#endif  //defined(__aarch64__)
+  }
+
+  __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); } // lane-wise square root
+  __forceinline Vec3fa sqr  ( const Vec3fa& a ) { return _mm_mul_ps(a.m128,a.m128); } // lane-wise a*a
+
+  __forceinline Vec3fa rsqrt( const Vec3fa& a ) // lane-wise reciprocal square root: estimate plus refinement
+  {
+#if defined(__aarch64__)
+        __m128 r = _mm_rsqrt_ps(a.m128); // starting estimate
+        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); // refinement step 1
+        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); // refinement step 2
+        return r;
+#else
+        
+#if defined(__AVX512VL__)
+    __m128 r = _mm_rsqrt14_ps(a.m128); // starting estimate
+#else
+    __m128 r = _mm_rsqrt_ps(a.m128); // starting estimate
+#endif
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); // one refinement step: 1.5*r - 0.5*a*r^3
+#endif
+  }
+
+  __forceinline Vec3fa zero_fix(const Vec3fa& a) { // blends a with +min_rcp_input using the mask |a| < min_rcp_input
+    return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); // NOTE(review): presumably masked lanes take min_rcp_input - confirm against blendv_ps
+  }
+  __forceinline Vec3fa rcp_safe(const Vec3fa& a) { // rcp applied after zero_fix has adjusted the input
+    return rcp(zero_fix(a));
+  }
+  __forceinline Vec3fa log ( const Vec3fa& a ) { // logf on x, y, z individually; result built with the 3-float constructor
+    return Vec3fa(logf(a.x),logf(a.y),logf(a.z));
+  }
+
+  __forceinline Vec3fa exp ( const Vec3fa& a ) { // expf on x, y, z individually; result built with the 3-float constructor
+    return Vec3fa(expf(a.x),expf(a.y),expf(a.z));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); } // lane-wise on all four lanes
+  __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); } // lane-wise on all four lanes
+  __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); } // lane-wise on all four lanes
+  __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); } // scalar is broadcast first
+  __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; } // scalar is broadcast first
+  __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); } // lane 3 is divided as well
+  __forceinline Vec3fa operator /( const Vec3fa& a, const float b        ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } // divides every lane by b
+  __forceinline Vec3fa operator /( const        float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } // a divided by every lane of b
+
+  __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); } // lane-wise float minimum
+  __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); } // lane-wise float maximum
+
+#if defined(__aarch64__) || defined(__SSE4_1__)
+    __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) {
+      // Minimum taken on the float bit patterns reinterpreted as signed 32-bit integers.
+      const vint4 lhs_bits = _mm_castps_si128(a.m128);
+      const vint4 rhs_bits = _mm_castps_si128(b.m128);
+      return _mm_castsi128_ps(_mm_min_epi32(lhs_bits,rhs_bits));
+    }
+#endif
+
+#if defined(__aarch64__) || defined(__SSE4_1__)
+    __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) {
+      // Maximum taken on the float bit patterns reinterpreted as signed 32-bit integers.
+      const vint4 lhs_bits = _mm_castps_si128(a.m128);
+      const vint4 rhs_bits = _mm_castps_si128(b.m128);
+      return _mm_castsi128_ps(_mm_max_epi32(lhs_bits,rhs_bits));
+    }
+#endif
+
+    __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) { // powf(component, b) for x, y and z; result built with the 3-float constructor
+      return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b));
+    }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__)
+  __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } // a*b+c, fused
+  __forceinline Vec3fa msub  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } // a*b-c, fused
+  __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } // -a*b+c, fused
+  __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } // -a*b-c, fused
+#else
+                                                                                
+#if defined(__aarch64__)
+  __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
+        return _mm_madd_ps(a.m128, b.m128, c.m128);  //a*b+c;
+    }
+  __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
+        return _mm_msub_ps(a.m128, b.m128, c.m128);  //-a*b+c;
+    }
+  __forceinline Vec3fa nmsub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
+        Vec3fa t = _mm_madd_ps(a.m128, b.m128, c.m128); // a*b+c first ...
+        return -t; // ... then negated, giving -a*b-c
+    }
+  __forceinline Vec3fa msub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
+        return _mm_madd_ps(a.m128,b.m128,vnegq_f32(c.m128)); //a*b-c
+    }
+
+#else
+  __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; } // separate multiply and add
+  __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;}
+  __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; }
+  __forceinline Vec3fa msub  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
+#endif
+
+#endif
+
+  __forceinline Vec3fa madd  ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); } // scalar a is broadcast, then the vector overload runs
+  __forceinline Vec3fa msub  ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); } // scalar a is broadcast, then the vector overload runs
+  __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); } // scalar a is broadcast, then the vector overload runs
+  __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); } // scalar a is broadcast, then the vector overload runs
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; } // compound forms reuse the binary operators and return a
+  __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; }
+  __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; }
+  __forceinline Vec3fa& operator *=( Vec3fa& a, const float   b ) { return a = a * b; } // scalar form
+  __forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { return a = a / b; }
+  __forceinline Vec3fa& operator /=( Vec3fa& a, const float   b ) { return a = a / b; } // scalar form
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+#if defined(__aarch64__) && defined(BUILD_IOS)
+  __forceinline float reduce_add(const Vec3fa& v) {
+    float32x4_t lanes = v.m128;
+    lanes[3] = 0.0f; // lane 3 must not contribute to the horizontal sum
+    return vaddvq_f32(lanes);
+  }
+
+  __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } // product of the three named components
+  __forceinline float reduce_min(const Vec3fa& v) {
+    float32x4_t lanes = v.m128;
+    lanes[3] = lanes[2]; // copy z into lane 3 so lane 3 adds no new candidate
+    return vminvq_f32(lanes);
+  }
+  __forceinline float reduce_max(const Vec3fa& v) {
+    float32x4_t lanes = v.m128;
+    lanes[3] = lanes[2]; // copy z into lane 3 so lane 3 adds no new candidate
+    return vmaxvq_f32(lanes);
+  }
+#else
+  __forceinline float reduce_add(const Vec3fa& v) {
+    const vfloat4 s0(v.m128);
+    const vfloat4 s1 = shuffle<1>(s0);
+    const vfloat4 s2 = shuffle<2>(s0);
+    return _mm_cvtss_f32(s0+s1+s2); // lane 0 of the three-way sum
+  }
+
+  __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } // product of the three named components
+  __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); } // scalar three-way min
+  __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); } // scalar three-way max
+#endif
+                                                                                
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } // x, y and z all compare equal; lane 3 ignored
+  __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } // any of x, y, z compares unequal; lane 3 ignored
+
+  __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); } // per-lane a == b
+  __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } // per-lane a != b
+  __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); } // per-lane a <  b
+  __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); } // per-lane a <= b
+ #if defined(__aarch64__)
+  __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); } // per-lane a >  b
+  __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); } // per-lane a >= b
+#else
+  __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); } // per-lane !(a <= b); NOTE(review): also set for NaN lanes, may differ from the aarch64 branch - confirm
+  __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); } // per-lane !(a <  b); NOTE(review): also set for NaN lanes, may differ from the aarch64 branch - confirm
+#endif
+
+  __forceinline bool isvalid ( const Vec3fa& v ) { // x, y, z each strictly between -FLT_LARGE and +FLT_LARGE
+    return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE)));
+  }
+
+  __forceinline bool is_finite ( const Vec3fa& a ) { // x, y, z each within [-FLT_MAX, +FLT_MAX]
+    return all(ge_mask(a,Vec3fa(-FLT_MAX)) & le_mask(a,Vec3fa(+FLT_MAX)));
+  }
+
+  __forceinline bool isvalid4 ( const Vec3fa& v ) { // same bounds as isvalid, evaluated through vfloat4 (presumably including lane 3 - confirm)
+    return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));
+  }
+
+  __forceinline bool is_finite4 ( const Vec3fa& a ) { // same bounds as is_finite, evaluated through vfloat4 (presumably including lane 3 - confirm)
+    return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE4_1__)
+  __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) { // x*x' + y*y' + z*z'; lane 3 is excluded
+    return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); // 0x7F: multiply lanes 0..2 only, write the sum to every output lane
+  }
+#else
+  __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) { // lane-wise product followed by the 3-component reduce_add
+    return reduce_add(a*b);
+  }
+#endif
+
+  __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b )
+  { // cross product: rotated and unrotated copies feed prod_diff, and the result is rotated once more
+    vfloat4 a_rot  = shuffle<1,2,0,3>(vfloat4(a.m128));
+    vfloat4 b_rot  = shuffle<1,2,0,3>(vfloat4(b.m128));
+    vfloat4 a_orig = vfloat4(a.m128);
+    vfloat4 b_orig = vfloat4(b.m128);
+    return Vec3fa(shuffle<1,2,0,3>(prod_diff(a_orig,b_rot,a_rot,b_orig)));
+  }
+
+  __forceinline float  sqr_length ( const Vec3fa& a )                { return dot(a,a); } // squared length
+  __forceinline float  rcp_length ( const Vec3fa& a )                { return rsqrt(dot(a,a)); } // scalar rsqrt of the squared length
+  __forceinline float  rcp_length2( const Vec3fa& a )                { return rcp(dot(a,a)); } // scalar rcp of the squared length
+  __forceinline float  length   ( const Vec3fa& a )                  { return sqrt(dot(a,a)); } // scalar sqrt of the squared length
+  __forceinline Vec3fa normalize( const Vec3fa& a )                  { return a*rsqrt(dot(a,a)); } // no guard for a zero-length input; see normalize_safe
+  __forceinline float  distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a-b); }
+  __forceinline float  halfArea ( const Vec3fa& d )                  { return madd(d.x,(d.y+d.z),d.y*d.z); } // presumably d.x*(d.y+d.z) + d.y*d.z via the scalar madd - confirm
+  __forceinline float  area     ( const Vec3fa& d )                  { return 2.0f*halfArea(d); }
+
+  __forceinline Vec3fa normalize_safe( const Vec3fa& a ) { // like normalize, but an input whose dot(a,a) is exactly 0 is returned unchanged
+    const float len2 = dot(a,a); return unlikely(len2 == 0.0f) ? a : a*rsqrt(len2);
+  }
+
+  /*! differentiated normalization: (dot(p,p)*dp - dot(p,dp)*p) * rcp(dot(p,p)) * rsqrt(dot(p,p)) */
+  __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp)
+  {
+    const float p_sq     = dot(p,p);
+    const float p_dot_dp = dot(p,dp);
+    return (p_sq*dp-p_dot_dp*p)*rcp(p_sq)*rsqrt(p_sq);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) {
+    const __m128 all_set = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); // 0 == 0 in every lane gives an all-ones mask
+    return blendv_ps(f.m128, t.m128, s ? all_set : _mm_setzero_ps()); // the scalar condition becomes an all-or-nothing lane mask
+  }
+
+  __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) { // per-lane blend of f and t controlled by the mask s
+    return blendv_ps(f.m128, t.m128, s);
+  }
+
+  __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) { // (1-t)*v0 + t*v1
+    return madd(1.0f-t,v0,t*v1);
+  }
+
+  __forceinline int maxDim ( const Vec3fa& a )
+  {
+    // Index (0=x, 1=y, 2=z) of the component with the largest magnitude; on ties the later axis wins.
+    const Vec3fa mag = abs(a);
+    if (mag.x > mag.y)
+      return (mag.x > mag.z) ? 0 : 2;
+    else
+      return (mag.y > mag.z) ? 1 : 2;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__)
+  __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); } // round toward -infinity
+  __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); } // round toward +infinity
+  __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); } // round toward zero
+#elif defined (__SSE4_1__)
+  __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_ZERO       ); } // toward zero, matching vrndq_f32 / truncf in the other branches (was round-to-nearest)
+  __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF    ); } // round toward -infinity
+  __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF    ); } // round toward +infinity
+#else
+  __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); } // scalar fallback on x, y, z
+  __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); } // scalar fallback on x, y, z
+  __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); } // scalar fallback on x, y, z
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) { // writes "(x, y, z)"; lane 3 is not printed
+    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
+  }
+
+  typedef Vec3fa Vec3fa_t; // alias for Vec3fa
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec3fx Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec3fx // like Vec3fa, but lane 3 is exposed as an extra int / unsigned / float payload
+  {
+    ALIGNED_STRUCT_(16);
+
+    typedef float Scalar;
+    enum { N = 3 }; // number of vector components (the lane-3 payload is not counted)
+    union {
+      __m128 m128; // the full four-lane register
+      struct { float x,y,z; union { int a; unsigned u; float w; }; }; // lanes 0..2 plus three views of lane 3
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3fx( ) {} // no member is initialized
+    __forceinline Vec3fx( const __m128 a ) : m128(a) {} // takes all four lanes verbatim
+
+    __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {} // copies all four lanes of v
+    __forceinline operator Vec3fa () const { return Vec3fa(m128); } // implicit conversion; lane 3 is carried over as-is
+        
+    __forceinline explicit Vec3fx            ( const Vec3<float>& other ) { m128  = _mm_set_ps(0, other.z, other.y, other.x); } // lane 3 is set to 0
+    //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { m128  = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
+
+    __forceinline Vec3fx            ( const Vec3fx& other ) { m128 = other.m128; }
+
+    __forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; }
+
+    __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {} // broadcasts a to all four lanes
+    __forceinline          Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} // lane 3 is set to 0
+
+    __forceinline Vec3fx( const Vec3fa& other, const int      a1) { m128 = other.m128; a = a1; } // copy other, then overwrite lane 3 with the int payload
+    __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; } // copy other, then overwrite lane 3 with the unsigned payload
+    __forceinline Vec3fx( const Vec3fa& other, const float    w1) { // lanes 0..2 from other, lane 3 = w1
+#if defined (__aarch64__)
+      m128 = other.m128; m128[3] = w1;
+#elif defined (__SSE4_1__)
+      m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4); // 3 << 4 selects destination lane 3
+#else
+      const vint4 mask(-1,-1,-1,0); // set for lanes 0..2, clear for lane 3
+      m128 = select(vboolf4(_mm_castsi128_ps(mask)),vfloat4(other.m128),vfloat4(w1));
+#endif
+    }
+    //__forceinline Vec3fx( const float x, const float y, const float z, const int      a) : x(x), y(y), z(z), a(a) {} // not working properly!
+    //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly!
+    __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {} // all four lanes given explicitly
+    
+    //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
+
+    __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
+    __forceinline explicit operator const   vint4() const { return vint4(_mm_cvtps_epi32(m128)); } // float -> int32 conversion of every lane
+    __forceinline explicit operator const  Vec2fa() const { return Vec2fa(m128); }
+    __forceinline explicit operator const  Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } // float -> int32 conversion of every lane
+    
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline Vec3fx load( const void* const a ) { // aligned 16-byte load; the AND-mask clears lane 3
+      return Vec3fx(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
+    }
+
+    static __forceinline Vec3fx loadu( const void* const a ) { // unaligned 16-byte load; lane 3 is kept as read
+      return Vec3fx(_mm_loadu_ps((float*)a));
+    }
+
+    static __forceinline void storeu ( void* ptr, const Vec3fx& v ) { // unaligned store of all four lanes (16 bytes)
+      _mm_storeu_ps((float*)ptr,v.m128);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3fx( ZeroTy   ) : m128(_mm_setzero_ps()) {} // tag constructors: the constant fills all four lanes
+    __forceinline Vec3fx( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
+    __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    __forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } // index 0..2 only (asserted)
+    __forceinline       float& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; } // index 0..2 only (asserted)
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; } // unary plus: returns a copy
+  __forceinline Vec3fx operator -( const Vec3fx& a ) { // negates all four lanes, the lane-3 payload included
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); // top (sign) bit of every lane
+    return _mm_xor_ps(a.m128, mask);
+  }
+  __forceinline Vec3fx abs  ( const Vec3fx& a ) { // clears the top bit of all four lanes
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); // every bit except the top one
+    return _mm_and_ps(a.m128, mask);
+  }
+  __forceinline Vec3fx sign ( const Vec3fx& a ) { // blends +1 / -1 per lane using the (a < 0) mask
+    return blendv_ps(Vec3fx(one).m128, (-Vec3fx(one)).m128, _mm_cmplt_ps (a.m128,Vec3fx(zero).m128));
+  }
+
+  __forceinline Vec3fx rcp  ( const Vec3fx& a ) // lane-wise reciprocal: hardware estimate plus one refinement step
+  {
+#if defined(__AVX512VL__)
+    const Vec3fx r = _mm_rcp14_ps(a.m128); // starting estimate
+#else
+    const Vec3fx r = _mm_rcp_ps(a.m128); // starting estimate
+#endif
+
+#if defined(__AVX2__)
+    const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); // r * (2 - r*a), fused
+#else
+    const Vec3fx res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); // r * (2 - r*a)
+    //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#endif
+
+    return res;
+  }
+
+  __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.m128); } // lane-wise square root
+  __forceinline Vec3fx sqr  ( const Vec3fx& a ) { return _mm_mul_ps(a.m128,a.m128); } // lane-wise a*a
+
+  __forceinline Vec3fx rsqrt( const Vec3fx& a ) // lane-wise reciprocal square root: hardware estimate plus one refinement step
+  {
+#if defined(__AVX512VL__)
+    __m128 r = _mm_rsqrt14_ps(a.m128); // starting estimate
+#else
+    __m128 r = _mm_rsqrt_ps(a.m128); // starting estimate
+#endif
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); // 1.5*r - 0.5*a*r^3
+  }
+
+  __forceinline Vec3fx zero_fix(const Vec3fx& a) { // blends a with +min_rcp_input using the mask |a| < min_rcp_input
+    return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); // NOTE(review): presumably masked lanes take min_rcp_input - confirm against blendv_ps
+  }
+  __forceinline Vec3fx rcp_safe(const Vec3fx& a) { // rcp applied after zero_fix has adjusted the input
+    return rcp(zero_fix(a));
+  }
+  __forceinline Vec3fx log ( const Vec3fx& a ) { // logf on x, y, z individually; the 3-float constructor sets lane 3 to 0
+    return Vec3fx(logf(a.x),logf(a.y),logf(a.z));
+  }
+
+  __forceinline Vec3fx exp ( const Vec3fx& a ) { // expf on x, y, z individually; the 3-float constructor sets lane 3 to 0
+    return Vec3fx(expf(a.x),expf(a.y),expf(a.z));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.m128, b.m128); } // lane-wise on all four lanes, payload lane included
+  __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.m128, b.m128); } // lane-wise on all four lanes
+  __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.m128, b.m128); } // lane-wise on all four lanes
+  __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); } // scalar is broadcast first
+  __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; } // scalar is broadcast first
+  __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.m128,b.m128); } // lane 3 is divided as well
+  __forceinline Vec3fx operator /( const Vec3fx& a, const float b        ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } // divides every lane by b
+  __forceinline Vec3fx operator /( const        float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } // a divided by every lane of b
+
+  __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); } // lane-wise float minimum
+  __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); } // lane-wise float maximum
+
+#if defined(__SSE4_1__) || defined(__aarch64__)
+    __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) {
+      // Minimum taken on the float bit patterns reinterpreted as signed 32-bit integers.
+      const vint4 lhs_bits = _mm_castps_si128(a.m128);
+      const vint4 rhs_bits = _mm_castps_si128(b.m128);
+      return _mm_castsi128_ps(_mm_min_epi32(lhs_bits,rhs_bits));
+    }
+#endif
+
+#if defined(__SSE4_1__) || defined(__aarch64__)
+    __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) {
+      // Maximum taken on the float bit patterns reinterpreted as signed 32-bit integers.
+      const vint4 lhs_bits = _mm_castps_si128(a.m128);
+      const vint4 rhs_bits = _mm_castps_si128(b.m128);
+      return _mm_castsi128_ps(_mm_max_epi32(lhs_bits,rhs_bits));
+    }
+#endif
+
+    __forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) { // powf(component, b) for x, y and z; the 3-float constructor sets lane 3 to 0
+      return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b));
+    }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__)
+  __forceinline Vec3fx madd  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } // a*b+c, fused
+  __forceinline Vec3fx msub  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } // a*b-c, fused
+  __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } // -a*b+c, fused
+  __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } // -a*b-c, fused
+#else
+  __forceinline Vec3fx madd  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; } // separate multiply and add
+  __forceinline Vec3fx msub  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; }
+  __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b+c;}
+  __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b-c; }
+#endif
+
+  __forceinline Vec3fx madd  ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); } // scalar a is broadcast, then the vector overload runs
+  __forceinline Vec3fx msub  ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(Vec3fx(a),b,c); } // scalar a is broadcast, then the vector overload runs
+  __forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); } // scalar a is broadcast, then the vector overload runs
+  __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); } // scalar a is broadcast, then the vector overload runs
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; } // compound forms reuse the binary operators and return a
+  __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; }
+  __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; }
+  __forceinline Vec3fx& operator *=( Vec3fx& a, const float   b ) { return a = a * b; } // scalar form
+  __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; }
+  __forceinline Vec3fx& operator /=( Vec3fx& a, const float   b ) { return a = a / b; } // scalar form
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float reduce_add(const Vec3fx& v) {
+    const vfloat4 s0(v.m128);
+    const vfloat4 s1 = shuffle<1>(s0);
+    const vfloat4 s2 = shuffle<2>(s0);
+    return _mm_cvtss_f32(s0+s1+s2); // lane 0 of the three-way sum
+  }
+
+  __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; } // product of x, y, z; lane 3 is not used
+  __forceinline float reduce_min(const Vec3fx& v) { return min(v.x,v.y,v.z); } // scalar three-way min; lane 3 is not used
+  __forceinline float reduce_max(const Vec3fx& v) { return max(v.x,v.y,v.z); } // scalar three-way max; lane 3 is not used
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } // x, y and z all compare equal; the lane-3 payload is ignored
+  __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } // any of x, y, z compares unequal; the lane-3 payload is ignored
+
+  __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a.m128, b.m128); } // per-lane a == b
+  __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } // per-lane a != b
+  __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.m128, b.m128); } // per-lane a <  b
+  __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.m128, b.m128); } // per-lane a <= b
+  __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } // per-lane !(a <= b); NOTE(review): the negated predicate is also set for NaN lanes - confirm callers expect that
+  __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } // per-lane !(a <  b); same NaN caveat as gt_mask
+
+  /*! Vec3ba-mask test: v strictly between -FLT_LARGE and +FLT_LARGE, reduced with all() */
+  __forceinline bool isvalid ( const Vec3fx& v )
+  { const Vec3ba above = gt_mask(v,Vec3fx(-FLT_LARGE)); const Vec3ba below = lt_mask(v,Vec3fx(+FLT_LARGE)); return all(above & below); }
+
+  /*! Vec3ba-mask test: a within [-FLT_MAX, +FLT_MAX], reduced with all() */
+  __forceinline bool is_finite ( const Vec3fx& a )
+  { const Vec3ba above = ge_mask(a,Vec3fx(-FLT_MAX)); const Vec3ba below = le_mask(a,Vec3fx(+FLT_MAX)); return all(above & below); }
+
+  /*! like isvalid, but the comparison is done on the whole vfloat4 register */
+  __forceinline bool isvalid4 ( const Vec3fx& v )
+  { const vfloat4 r(v.m128); return all((r > vfloat4(-FLT_LARGE)) & (r < vfloat4(+FLT_LARGE))); }
+
+  /*! like is_finite, but the comparison is done on the whole vfloat4 register */
+  __forceinline bool is_finite4 ( const Vec3fx& a )
+  { const vfloat4 r(a.m128); return all((r >= vfloat4(-FLT_MAX)) & (r <= vfloat4(+FLT_MAX))); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! dot product over the x, y and z components */
+  __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
+#if defined(__SSE4_1__)
+    // imm8 0x7F: multiply lanes 0-2 only (high nibble 7), write the sum to every lane (low nibble F)
+    return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F));
+#else
+    return reduce_add(a*b);
+#endif
+  }
+
+  __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b )
+  {
+    // both inputs and the result are rotated with shuffle<1,2,0,3>; msub(p,q,r) presumably evaluates p*q-r
+    const vfloat4 lhs(a.m128), rhs(b.m128);
+    const vfloat4 lhs_yzx = shuffle<1,2,0,3>(lhs);
+    const vfloat4 rhs_yzx = shuffle<1,2,0,3>(rhs);
+    return Vec3fx(shuffle<1,2,0,3>(msub(lhs,rhs_yzx,lhs_yzx*rhs)));
+  }
+
+  __forceinline float  sqr_length ( const Vec3fx& a )                { return dot(a,a); }                 // squared length
+  __forceinline float  rcp_length ( const Vec3fx& a )                { return rsqrt(sqr_length(a)); }     // 1 / length
+  __forceinline float  rcp_length2( const Vec3fx& a )                { return rcp(sqr_length(a)); }       // 1 / squared length
+  __forceinline float  length   ( const Vec3fx& a )                  { return sqrt(sqr_length(a)); }
+  __forceinline Vec3fx normalize( const Vec3fx& a )                  { return a*rcp_length(a); }          // no zero-length guard, see normalize_safe
+  __forceinline float  distance ( const Vec3fx& a, const Vec3fx& b ) { const Vec3fx d = a-b; return length(d); }
+  __forceinline float  halfArea ( const Vec3fx& d )                  { const float yz = d.y*d.z; return madd(d.x,(d.y+d.z),yz); } // x*(y+z) + y*z
+  __forceinline float  area     ( const Vec3fx& d )                  { return halfArea(d)*2.0f; }
+
+  __forceinline Vec3fx normalize_safe( const Vec3fx& a ) {
+    const float len2 = dot(a,a); return unlikely(len2 == 0.0f) ? a : a*rsqrt(len2); // a zero-length input is returned unchanged
+  }
+
+  /*! differentiated normalization: (dot(p,p)*dp - dot(p,dp)*p) scaled by rcp and rsqrt of dot(p,p) */
+  __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp) {
+    const float len2 = dot(p,p);
+    const float proj = dot(p,dp);
+    const Vec3fx numer = len2*dp-proj*p;
+    return numer*rcp(len2)*rsqrt(len2);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select (blendv_ps(f, t, mask); the bool overload uses an all-ones or all-zero mask)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) {
+    const __m128 all_set = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
+    return blendv_ps(f.m128, t.m128, s ? all_set : _mm_setzero_ps());
+  }
+
+  __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) {
+    const __m128 picked = blendv_ps(f.m128, t.m128, s); return picked;
+  }
+
+  __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) {
+    const float w0 = 1.0f-t; return madd(w0,v0,t*v1); // weight (1-t) on v0, weight t on v1
+  }
+
+  /*! index (0 = x, 1 = y, 2 = z) of the component of a with the largest absolute value */
+  __forceinline int maxDim ( const Vec3fx& a )
+  {
+    const Vec3fx b = abs(a);
+    // strict '>' comparisons: when two candidates tie, the later axis is reported
+    if (b.x > b.y)
+      return (b.x > b.z) ? 0 : 2;
+    return (b.y > b.z) ? 1 : 2;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions (trunc: toward zero, floor: toward -inf, ceil: toward +inf)
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined (__SSE4_1__) && !defined(__aarch64__)
+  __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_ZERO       ); } // was _MM_FROUND_TO_NEAREST_INT, which rounds (1.7 -> 2) and disagreed with the truncf fallback below
+  __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF    ); }
+  __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF    ); }
+#else
+  __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); }
+  __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); }
+  __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators: writes "(x, y, z)"; the w component is not printed
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) {
+    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
+  }
+
+  
+  typedef Vec3fx Vec3ff;
+}
diff --git a/thirdparty/embree-aarch64/common/math/vec3ia.h b/thirdparty/embree-aarch64/common/math/vec3ia.h
new file mode 100644
index 0000000000..737f67fd72
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/vec3ia.h
@@ -0,0 +1,210 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "math.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec3ia Type: three ints (x, y, z) overlaid on one 16-byte aligned __m128i
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec3ia
+  {
+    ALIGNED_STRUCT_(16);
+
+    union {
+      __m128i m128;            // the full 128-bit register
+      struct { int x,y,z; };   // named view of the first three 32-bit ints; the fourth has no name
+    };
+
+    typedef int Scalar;
+    enum { N = 3 };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3ia( ) {} // leaves m128 uninitialized
+    __forceinline Vec3ia( const __m128i a ) : m128(a) {}
+    __forceinline Vec3ia( const Vec3ia& other ) : m128(other.m128) {}
+    __forceinline Vec3ia& operator =(const Vec3ia& other) { m128 = other.m128; return *this; }
+
+    __forceinline explicit Vec3ia( const int a ) : m128(_mm_set1_epi32(a)) {} // same value in every lane
+    __forceinline          Vec3ia( const int x, const int y, const int z) : m128(_mm_set_epi32(z, z, y, x)) {} // the unnamed fourth lane duplicates z
+    __forceinline explicit Vec3ia( const __m128 a ) : m128(_mm_cvtps_epi32(a)) {} // float -> int conversion (not a bit cast)
+
+    __forceinline operator const __m128i&() const { return m128; }
+    __forceinline operator       __m128i&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3ia( ZeroTy   ) : m128(_mm_setzero_si128()) {}
+    __forceinline Vec3ia( OneTy    ) : m128(_mm_set1_epi32(1)) {}
+    __forceinline Vec3ia( PosInfTy ) : m128(_mm_set1_epi32(pos_inf)) {} // pos_inf is defined elsewhere; converted to int here
+    __forceinline Vec3ia( NegInfTy ) : m128(_mm_set1_epi32(neg_inf)) {} // neg_inf is defined elsewhere; converted to int here
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access (index must be 0, 1 or 2; checked with assert only)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+    __forceinline       int& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
+  };
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators (abs exists only on aarch64 or when SSSE3 is available)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ia operator +( const Vec3ia& a ) { return Vec3ia(a); }
+  __forceinline Vec3ia operator -( const Vec3ia& a ) { const __m128i zero = _mm_setzero_si128(); return _mm_sub_epi32(zero, a.m128); }
+#if defined(__aarch64__)
+  __forceinline Vec3ia abs       ( const Vec3ia& a ) { return vabsq_s32(a.m128); }
+#elif defined(__SSSE3__)
+  __forceinline Vec3ia abs       ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators (an int operand is replicated with _mm_set1_epi32 first)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ia operator +( const Vec3ia& a, const Vec3ia& b ) { return _mm_add_epi32(a.m128, b.m128); }
+  __forceinline Vec3ia operator +( const Vec3ia& a, const int     b ) { return _mm_add_epi32(a.m128, _mm_set1_epi32(b)); }
+  __forceinline Vec3ia operator +( const int     a, const Vec3ia& b ) { return _mm_add_epi32(_mm_set1_epi32(a), b.m128); }
+
+  __forceinline Vec3ia operator -( const Vec3ia& a, const Vec3ia& b ) { return _mm_sub_epi32(a.m128, b.m128); }
+  __forceinline Vec3ia operator -( const Vec3ia& a, const int     b ) { return _mm_sub_epi32(a.m128, _mm_set1_epi32(b)); }
+  __forceinline Vec3ia operator -( const int     a, const Vec3ia& b ) { return _mm_sub_epi32(_mm_set1_epi32(a), b.m128); }
+
+#if defined(__aarch64__) || defined(__SSE4_1__)
+  __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); }
+  __forceinline Vec3ia operator *( const Vec3ia& a, const int     b ) { return _mm_mullo_epi32(a.m128, _mm_set1_epi32(b)); }
+  __forceinline Vec3ia operator *( const int     a, const Vec3ia& b ) { return _mm_mullo_epi32(_mm_set1_epi32(a), b.m128); }
+#endif
+
+  __forceinline Vec3ia operator &( const Vec3ia& a, const Vec3ia& b ) { return _mm_and_si128(a.m128, b.m128); }
+  __forceinline Vec3ia operator &( const Vec3ia& a, const int     b ) { return _mm_and_si128(a.m128, _mm_set1_epi32(b)); }
+  __forceinline Vec3ia operator &( const int     a, const Vec3ia& b ) { return _mm_and_si128(_mm_set1_epi32(a), b.m128); }
+
+  __forceinline Vec3ia operator |( const Vec3ia& a, const Vec3ia& b ) { return _mm_or_si128(a.m128, b.m128); }
+  __forceinline Vec3ia operator |( const Vec3ia& a, const int     b ) { return _mm_or_si128(a.m128, _mm_set1_epi32(b)); }
+  __forceinline Vec3ia operator |( const int     a, const Vec3ia& b ) { return _mm_or_si128(_mm_set1_epi32(a), b.m128); }
+
+  __forceinline Vec3ia operator ^( const Vec3ia& a, const Vec3ia& b ) { return _mm_xor_si128(a.m128, b.m128); }
+  __forceinline Vec3ia operator ^( const Vec3ia& a, const int     b ) { return _mm_xor_si128(a.m128, _mm_set1_epi32(b)); }
+  __forceinline Vec3ia operator ^( const int     a, const Vec3ia& b ) { return _mm_xor_si128(_mm_set1_epi32(a), b.m128); }
+
+#if !defined(__ARM_NEON)
+  __forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { return _mm_slli_epi32(a.m128, b); } // shift left
+  __forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { return _mm_srai_epi32(a.m128, b); } // arithmetic shift right
+  __forceinline Vec3ia srl ( const Vec3ia& a, const int b ) { return _mm_srli_epi32(a.m128, b); } // logical shift right
+
+  __forceinline Vec3ia operator <<( const Vec3ia& a, const int n ) { return sll(a, n); }
+  __forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) { return sra(a, n); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators (store the binary-operator result in a, then return a)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ia& operator +=( Vec3ia& a, const Vec3ia& b ) { a = a + b; return a; }
+  __forceinline Vec3ia& operator +=( Vec3ia& a, const int&   b ) { a = a + b; return a; }
+
+  __forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { a = a - b; return a; }
+  __forceinline Vec3ia& operator -=( Vec3ia& a, const int&   b ) { a = a - b; return a; }
+
+#if defined(__aarch64__) || defined(__SSE4_1__)
+  __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { a = a * b; return a; }
+  __forceinline Vec3ia& operator *=( Vec3ia& a, const int&    b ) { a = a * b; return a; }
+#endif
+
+  __forceinline Vec3ia& operator &=( Vec3ia& a, const Vec3ia& b ) { a = a & b; return a; }
+  __forceinline Vec3ia& operator &=( Vec3ia& a, const int&    b ) { a = a & b; return a; }
+
+  __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { a = a | b; return a; }
+  __forceinline Vec3ia& operator |=( Vec3ia& a, const int&    b ) { a = a | b; return a; }
+
+#if !defined(__ARM_NEON)
+  __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { a = a << b; return a; }
+  __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { a = a >> b; return a; }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions (NEON horizontal instructions on aarch64, scalar x/y/z code otherwise)
+  ////////////////////////////////////////////////////////////////////////////////
+#if defined(__aarch64__)
+  __forceinline int reduce_add(const Vec3ia& v) {
+    int32x4_t t = v.m128;
+    t[3] = 0; // clear the fourth lane so the four-lane sum below equals x+y+z
+    return vaddvq_s32(t);
+        
+  }
+  __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; }
+  __forceinline int reduce_min(const Vec3ia& v) {
+    int32x4_t t = (__m128i)blendv_ps((__m128)v0x7fffffff, (__m128)v.m128, (__m128)vFFF0); // NOTE(review): presumably swaps lane 3 for INT_MAX so it cannot win the minimum - v0x7fffffff/vFFF0 are defined elsewhere, confirm
+    return vminvq_s32(t);
+        
+  }
+  __forceinline int reduce_max(const Vec3ia& v) {
+    int32x4_t t = (__m128i)blendv_ps((__m128)v0x80000000, (__m128)v.m128, (__m128)vFFF0); // NOTE(review): presumably swaps lane 3 for INT_MIN so it cannot win the maximum - confirm
+    return vmaxvq_s32(t);
+        
+  }
+#else
+  __forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; }
+  __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; }
+  __forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); }
+  __forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); }
+#endif
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators (==, != and < look at x, y, z only; *_mask return per-lane masks)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Vec3ia& a, const Vec3ia& b ) { const int eq = _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))); return (eq & 7) == 7; }
+  __forceinline bool operator !=( const Vec3ia& a, const Vec3ia& b ) { const int eq = _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))); return (eq & 7) != 7; }
+  __forceinline bool operator < ( const Vec3ia& a, const Vec3ia& b ) {
+    // lexicographic order: the first differing component (x, then y, then z) decides
+    for (size_t i = 0; i < 3; i++)
+      if (a[i] != b[i]) return a[i] < b[i];
+    return false;
+  }
+
+  __forceinline Vec3ba eq_mask( const Vec3ia& a, const Vec3ia& b ) { const __m128i m = _mm_cmpeq_epi32 (a.m128, b.m128); return _mm_castsi128_ps(m); }
+  __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { const __m128i m = _mm_cmplt_epi32 (a.m128, b.m128); return _mm_castsi128_ps(m); }
+  __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { const __m128i m = _mm_cmpgt_epi32 (a.m128, b.m128); return _mm_castsi128_ps(m); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select, min and max (blend instruction when available, and/andnot/or otherwise)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) {
+#if defined(__aarch64__) || defined(__SSE4_1__)
+    const __m128 picked = _mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m); return _mm_castps_si128(picked);
+#else
+    const __m128i bits = _mm_castps_si128(m); return _mm_or_si128(_mm_and_si128(bits, t), _mm_andnot_si128(bits, f));
+#endif
+  }
+
+#if defined(__aarch64__) || defined(__SSE4_1__)
+  __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return Vec3ia(_mm_min_epi32(a.m128,b.m128)); }
+  __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return Vec3ia(_mm_max_epi32(a.m128,b.m128)); }
+#else
+  __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { const Vec3ba a_less    = lt_mask(a,b); return select(a_less,a,b); }
+  __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { const Vec3ba a_greater = gt_mask(a,b); return select(a_greater,a,b); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators: writes the three components as "(x, y, z)"
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ia& a) {
+    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/math/vec4.h b/thirdparty/embree-aarch64/common/math/vec4.h
new file mode 100644
index 0000000000..d16542f507
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/vec4.h
@@ -0,0 +1,258 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+#include "vec3.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Generic 4D vector Class: four components of type T, named x, y, z, w
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct Vec4
+  {
+    enum { N = 4 };    
+    union {
+      struct { T x, y, z, w; };
+#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
+      T components[N]; // array view of the same storage, used by operator[] below
+#endif
+    };
+
+    typedef T Scalar;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec4( ) {} // members are left as T's default construction leaves them
+    __forceinline explicit Vec4( const T& a                                     ) : x(a), y(a), z(a), w(a) {}
+    __forceinline          Vec4( const T& x, const T& y, const T& z, const T& w ) : x(x), y(y), z(z), w(w) {}
+    __forceinline          Vec4( const Vec3<T>& xyz, const T& w ) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {}
+
+    __forceinline Vec4( const Vec4& other ) { x = other.x; y = other.y; z = other.z; w = other.w; }
+    __forceinline Vec4( const Vec3fx& other ); // declared only; defined per T as explicit specializations outside the struct
+
+    template<typename T1> __forceinline Vec4( const Vec4<T1>& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)), w(T(a.w)) {} // explicit T(...) conversion per component
+    template<typename T1> __forceinline Vec4& operator =(const Vec4<T1>& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; } // NOTE(review): unlike the converting constructor, no explicit T(...) cast here
+
+    __forceinline Vec4& operator =(const Vec4& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; }
+
+    __forceinline operator Vec3<T> () const { return Vec3<T>(x,y,z); } // implicit conversion that drops w
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec4( ZeroTy   ) : x(zero), y(zero), z(zero), w(zero) {}
+    __forceinline Vec4( OneTy    ) : x(one),  y(one),  z(one),  w(one) {}
+    __forceinline Vec4( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf), w(pos_inf) {}
+    __forceinline Vec4( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf), w(neg_inf) {}
+
+#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler
+	__forceinline const T& operator [](const size_t axis) const { assert(axis < 4); return (&x)[axis]; }
+	__forceinline       T& operator [](const size_t axis)       { assert(axis < 4); return (&x)[axis]; }
+#else
+	__forceinline const T& operator [](const size_t axis ) const { assert(axis < 4); return components[axis]; }
+	__forceinline       T& operator [](const size_t axis)        { assert(axis < 4); return components[axis]; }
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Swizzles
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3<T> xyz() const { return Vec3<T>(x, y, z); }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators (each applies the same-named operation on T to x, y, z and w)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a ) { return Vec4<T>(+a.x, +a.y, +a.z, +a.w); }
+  template<typename T> __forceinline Vec4<T> operator -( const Vec4<T>& a ) { return Vec4<T>(-a.x, -a.y, -a.z, -a.w); }
+  template<typename T> __forceinline Vec4<T> abs       ( const Vec4<T>& a ) { return Vec4<T>(abs  (a.x), abs  (a.y), abs  (a.z), abs  (a.w)); }
+  template<typename T> __forceinline Vec4<T> rcp       ( const Vec4<T>& a ) { return Vec4<T>(rcp  (a.x), rcp  (a.y), rcp  (a.z), rcp  (a.w)); }
+  template<typename T> __forceinline Vec4<T> rsqrt     ( const Vec4<T>& a ) { return Vec4<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z), rsqrt(a.w)); }
+  template<typename T> __forceinline Vec4<T> sqrt      ( const Vec4<T>& a ) { return Vec4<T>(sqrt (a.x), sqrt (a.y), sqrt (a.z), sqrt (a.w)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators (component-wise; a scalar operand is applied to every component)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+  template<typename T> __forceinline Vec4<T> operator -( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
+  template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
+  template<typename T> __forceinline Vec4<T> operator *( const       T& a, const Vec4<T>& b ) { return Vec4<T>(a   * b.x, a   * b.y, a   * b.z, a   * b.w); }
+  template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const       T& b ) { return Vec4<T>(a.x * b  , a.y * b  , a.z * b  , a.w * b  ); }
+  template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
+  template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const       T& b ) { return Vec4<T>(a.x / b  , a.y / b  , a.z / b  , a.w / b  ); }
+  template<typename T> __forceinline Vec4<T> operator /( const       T& a, const Vec4<T>& b ) { return Vec4<T>(a   / b.x, a   / b.y, a   / b.z, a   / b.w); }
+
+  template<typename T> __forceinline Vec4<T> min(const Vec4<T>& a, const Vec4<T>& b) { return Vec4<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); }
+  template<typename T> __forceinline Vec4<T> max(const Vec4<T>& a, const Vec4<T>& b) { return Vec4<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators (forward component-wise to madd/msub/nmadd/nmsub on T)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec4<T> madd  ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z), madd(a.w,b.w,c.w)); }
+  template<typename T> __forceinline Vec4<T> msub  ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z), msub(a.w,b.w,c.w)); }
+  template<typename T> __forceinline Vec4<T> nmadd ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z),nmadd(a.w,b.w,c.w)); }
+  template<typename T> __forceinline Vec4<T> nmsub ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z),nmsub(a.w,b.w,c.w)); }
+  // overloads below: the first operand is a single T shared by all four components
+  template<typename T> __forceinline Vec4<T> madd  ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z), madd(a,b.w,c.w)); }
+  template<typename T> __forceinline Vec4<T> msub  ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z), msub(a,b.w,c.w)); }
+  template<typename T> __forceinline Vec4<T> nmadd ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z),nmadd(a,b.w,c.w)); }
+  template<typename T> __forceinline Vec4<T> nmsub ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z),nmsub(a,b.w,c.w)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators (in place; += and -= take a Vec4, *= and /= take a single T)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec4<T>& operator +=( Vec4<T>& a, const Vec4<T>& b ) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
+  template<typename T> __forceinline Vec4<T>& operator -=( Vec4<T>& a, const Vec4<T>& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
+  template<typename T> __forceinline Vec4<T>& operator *=( Vec4<T>& a, const       T& b ) { a.x *= b  ; a.y *= b  ; a.z *= b  ; a.w *= b  ; return a; }
+  template<typename T> __forceinline Vec4<T>& operator /=( Vec4<T>& a, const       T& b ) { a.x /= b  ; a.y /= b  ; a.z /= b  ; a.w /= b  ; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operators (fold all four components x, y, z, w into one T)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline T reduce_add( const Vec4<T>& a ) { return a.x + a.y + a.z + a.w; }
+  template<typename T> __forceinline T reduce_mul( const Vec4<T>& a ) { return a.x * a.y * a.z * a.w; }
+  template<typename T> __forceinline T reduce_min( const Vec4<T>& a ) { return min(a.x, a.y, a.z, a.w); }
+  template<typename T> __forceinline T reduce_max( const Vec4<T>& a ) { return max(a.x, a.y, a.z, a.w); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators (== needs all four components equal; < is lexicographic)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const Vec4<T>& a, const Vec4<T>& b ) { return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; }
+  template<typename T> __forceinline bool operator !=( const Vec4<T>& a, const Vec4<T>& b ) { return a.x != b.x || a.y != b.y || a.z != b.z || a.w != b.w; }
+  template<typename T> __forceinline bool operator < ( const Vec4<T>& a, const Vec4<T>& b ) {
+    // the first differing component, scanned in x, y, z, w order, decides the result
+    for (size_t i = 0; i < 4; i++) {
+      if (a[i] != b[i]) return a[i] < b[i];
+    }
+    return false;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Shift Operators (applies shift_right_1 on T to each of the four components)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec4<T> shift_right_1( const Vec4<T>& a ) {
+    return Vec4<T>(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z),shift_right_1(a.w));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators (all four components take part)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline T       dot      ( const Vec4<T>& a, const Vec4<T>& b ) { const T zw = madd(a.z,b.z,a.w*b.w); const T yzw = madd(a.y,b.y,zw); return madd(a.x,b.x,yzw); }
+
+  template<typename T> __forceinline T       length   ( const Vec4<T>& a )                   { const T len2 = dot(a,a); return sqrt(len2); }
+  template<typename T> __forceinline Vec4<T> normalize( const Vec4<T>& a )                   { const T inv_len = rsqrt(dot(a,a)); return a*inv_len; }
+  template<typename T> __forceinline T       distance ( const Vec4<T>& a, const Vec4<T>& b ) { const Vec4<T> d = a-b; return length(d); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select (forwards component-wise to select on T; arguments are selector, t, f)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec4<T> select ( bool s, const Vec4<T>& t, const Vec4<T>& f ) { // one bool shared by all four components
+    return Vec4<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w));
+  }
+
+  template<typename T> __forceinline Vec4<T> select ( const Vec4<bool>& s, const Vec4<T>& t, const Vec4<T>& f ) { // one bool per component
+    return Vec4<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z),select(s.w,t.w,f.w));
+  }
+
+  template<typename T> __forceinline Vec4<T> select ( const typename T::Bool& s, const Vec4<T>& t, const Vec4<T>& f ) { // T's own mask type, shared by all four components
+    return Vec4<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w));
+  }
+
+  template<typename T>
+    __forceinline Vec4<T> lerp(const Vec4<T>& v0, const Vec4<T>& v1, const T& t) {
+    const Vec4<T> w0(T(1.0f)-t); return madd(w0,v0,t*v1); // weight (1-t) on v0, weight t on v1
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators: writes the four components as "(x, y, z, w)"
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec4<T>& a) {
+    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ", " << a.w << ")";
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Default template instantiations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  typedef Vec4<bool         > Vec4b;
+  typedef Vec4<uint8_t      > Vec4uc;
+  typedef Vec4<int          > Vec4i;
+  typedef Vec4<float        > Vec4f;
+}
+
+#include "vec3ba.h"
+#include "vec3ia.h"
+#include "vec3fa.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// SSE / AVX / MIC specializations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+#include "../simd/sse.h"
+#endif
+
+#if defined __AVX__
+#include "../simd/avx.h"
+#endif
+
+#if defined __AVX512F__
+#include "../simd/avx512.h"
+#endif
+
+namespace embree
+{
+  template<> __forceinline Vec4<float>::Vec4( const Vec3fx& a ) { x = a.x; y = a.y; z = a.z; w = a.w; } // plain copy of the four floats
+  // Vec4<vfloat4> from Vec3fx: each member receives one component of a
+#if defined(__AVX__)
+  template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
+    x = a.x; y = a.y; z = a.z; w = a.w; // assigns a scalar component to a vfloat4 member - presumably a broadcast, confirm in vfloat4
+  }
+#elif defined(__SSE__) || defined(__ARM_NEON)
+  template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
+    const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v);
+  }
+#endif
+  // broadcast helpers: pass the address of element k of each member to the target type's broadcast()
+#if defined(__SSE__) || defined(__ARM_NEON)
+  __forceinline Vec4<vfloat4> broadcast4f( const Vec4<vfloat4>& a, const size_t k ) {
+    return Vec4<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]), vfloat4::broadcast(&a.w[k]));
+  }
+#endif
+  // AVX: the same constructor and helpers for the vfloat8 type
+#if defined(__AVX__)
+  template<> __forceinline Vec4<vfloat8>::Vec4( const Vec3fx& a ) {
+    x = a.x; y = a.y; z = a.z; w = a.w;
+  }
+  __forceinline Vec4<vfloat4> broadcast4f( const Vec4<vfloat8>& a, const size_t k ) {
+    return Vec4<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]), vfloat4::broadcast(&a.w[k]));
+  }
+  __forceinline Vec4<vfloat8> broadcast8f( const Vec4<vfloat4>& a, const size_t k ) {
+    return Vec4<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]), vfloat8::broadcast(&a.w[k]));
+  }
+  __forceinline Vec4<vfloat8> broadcast8f( const Vec4<vfloat8>& a, const size_t k ) {
+    return Vec4<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]), vfloat8::broadcast(&a.w[k]));
+  }
+#endif
+  // AVX-512: constructor for the vfloat16 type, written with a member-initializer list
+#if defined(__AVX512F__)
+  template<> __forceinline Vec4<vfloat16>::Vec4( const Vec3fx& a ) : x(a.x), y(a.y), z(a.z), w(a.w) {}
+#endif
+}
diff --git a/thirdparty/embree-aarch64/common/simd/avx.h b/thirdparty/embree-aarch64/common/simd/avx.h
new file mode 100644
index 0000000000..c840e41805
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/avx.h
@@ -0,0 +1,34 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "sse.h"
+
+#if defined(__AVX512VL__)
+#include "vboolf8_avx512.h"
+#include "vboold4_avx512.h"
+#else
+#include "vboolf8_avx.h"
+#include "vboold4_avx.h"
+#endif
+
+#if defined(__AVX2__)
+#include "vint8_avx2.h"
+#include "vuint8_avx2.h"
+#if defined(__X86_64__)
+#include "vllong4_avx2.h"
+#endif
+#else
+#include "vint8_avx.h"
+#include "vuint8_avx.h"
+#endif
+#include "vfloat8_avx.h"
+#if defined(__X86_64__)
+#include "vdouble4_avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "avx512.h"
+#endif
+
diff --git a/thirdparty/embree-aarch64/common/simd/avx512.h b/thirdparty/embree-aarch64/common/simd/avx512.h
new file mode 100644
index 0000000000..25414ab5b1
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/avx512.h
@@ -0,0 +1,41 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/intrinsics.h"
+#include "../math/constants.h"
+#include "../sys/alloc.h"
+#include "varying.h"
+
+#include "vboolf16_avx512.h"
+#include "vint16_avx512.h"
+#include "vuint16_avx512.h"
+#include "vfloat16_avx512.h"
+
+#include "vboold8_avx512.h"
+#include "vllong8_avx512.h"
+#include "vdouble8_avx512.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Prefetching
+  ////////////////////////////////////////////////////////////////////////////////
+
+#define PFHINT_L1   0
+#define PFHINT_L2   1
+#define PFHINT_NT   2
+
+  template<const unsigned int mode> // mode is one of the PFHINT_* values above; any other value issues no prefetch
+    __forceinline void prefetch(const void * __restrict__ const m)
+  {
+    switch (mode) {
+    case PFHINT_L1: _mm_prefetch((const char*)m,_MM_HINT_T0);  break;
+    case PFHINT_L2: _mm_prefetch((const char*)m,_MM_HINT_T1);  break;
+    case PFHINT_NT: _mm_prefetch((const char*)m,_MM_HINT_NTA); break;
+    default: break;
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/simd.h b/thirdparty/embree-aarch64/common/simd/simd.h
new file mode 100644
index 0000000000..647851110b
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/simd.h
@@ -0,0 +1,110 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../math/math.h"
+
+/* include SSE wrapper classes */
+#if defined(__SSE__) || defined(__ARM_NEON)
+#  include "sse.h"
+#endif
+
+/* include AVX wrapper classes */
+#if defined(__AVX__)
+#  include "avx.h"
+#endif
+
+/* include AVX512 wrapper classes */
+#if defined (__AVX512F__)
+#  include "avx512.h"
+#endif
+
+namespace embree
+{
+  template <int N> // per-lane range test: set where v lies within [-max, +max] of float
+  __forceinline vbool<N> isfinite(const vfloat<N>& v)
+  {
+    const float big = std::numeric_limits<float>::max(); // largest finite float
+    return (v <= vfloat<N>(big)) & (v >= vfloat<N>(-big));
+  }
+  
+  /* calls closure(mask, value) once per distinct value held by the active lanes of vi */
+  template<typename vbool, typename vint, typename Closure>
+  __forceinline void foreach_unique(const vbool& valid0, const vint& vi, const Closure& closure)
+  {
+    vbool pending = valid0; // lanes not yet handed to the closure
+    while (any(pending)) {
+      const int lane = int(bsf(movemask(pending))); // lane picked by bsf over the pending mask
+      const int value = vi[lane];
+      const vbool same = pending & (value == vi); // pending lanes that hold this value
+      pending = andn(pending, same);
+      closure(same, value);
+    }
+  }
+
+  /* pops one distinct value from the active lanes: returns it, stores its lanes in valid_i, clears them in valid */
+  template<typename vbool, typename vint>
+  __forceinline int next_unique(vbool& valid, const vint& vi, /*out*/ vbool& valid_i)
+  {
+    assert(any(valid)); // at least one lane must still be active
+    const int lane = int(bsf(movemask(valid)));
+    const int value = vi[lane];
+    valid_i = valid & (value == vi);
+    valid = andn(valid, valid_i);
+    return value;
+  }
+
+  /* like foreach_unique, but the closure also receives the lane the value was read from */
+  template<typename vbool, typename vint, typename Closure>
+  __forceinline void foreach_unique_index(const vbool& valid0, const vint& vi, const Closure& closure)
+  {
+    vbool pending = valid0; // lanes not yet handed to the closure
+    while (any(pending)) {
+      const int lane = int(bsf(movemask(pending))); // lane picked by bsf over the pending mask
+      const int value = vi[lane];
+      const vbool same = pending & (value == vi); // pending lanes that hold this value
+      pending = andn(pending, same);
+      closure(same, value, lane);
+    }
+  }
+
+  /* pops one distinct value from the active lanes: returns the lane it was read from, stores its lanes in valid_i, clears them in valid */
+  template<typename vbool, typename vint>
+  __forceinline int next_unique_index(vbool& valid, const vint& vi, /*out*/ vbool& valid_i)
+  {
+    assert(any(valid)); // at least one lane must still be active
+    const int lane = int(bsf(movemask(valid)));
+    const int value = vi[lane];
+    valid_i = valid & (value == vi);
+    valid = andn(valid, valid_i);
+    return lane;
+  }
+
+  template<typename Closure>
+  __forceinline void foreach2(int x0, int x1, int y0, int y1, const Closure& closure) // walks [x0,x1) x [y0,y1) row by row and calls closure(valid, xs, ys) with batches of up to VSIZEX points
+  {
+    __aligned(64) int U[2*VSIZEX]; // staged x coordinates; 2*VSIZEX so a VSIZEX-wide store at any index < VSIZEX stays in bounds
+    __aligned(64) int V[2*VSIZEX]; // staged y coordinates, same layout as U
+    int index = 0; // number of staged points not yet passed to the closure
+    for (int y=y0; y<y1; y++) {
+      const bool lasty = y+1>=y1;
+      const vintx vy = y;
+      for (int x=x0; x<x1; ) { //x+=VSIZEX) {
+        const bool lastx = x+VSIZEX >= x1;
+        vintx vx = x+vintx(step); // presumably x, x+1, ... (assumes step yields the lane indices - TODO confirm)
+        vintx::storeu(&U[index], vx);
+        vintx::storeu(&V[index], vy);
+        const int dx = min(x1-x,VSIZEX); // how many of the stored entries are inside the row
+        index += dx;
+        x += dx;
+        if (index >= VSIZEX || (lastx && lasty)) { // flush when VSIZEX points are staged or the final row is exhausted
+          const vboolx valid = vintx(step) < vintx(index);
+          closure(valid, vintx::load(U), vintx::load(V)); // only the first VSIZEX staged entries are loaded
+          x-= max(0, index-VSIZEX); // entries staged beyond VSIZEX were not delivered: step x back so they are staged again
+          index = 0;
+        }
+      }
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/sse.cpp b/thirdparty/embree-aarch64/common/simd/sse.cpp
new file mode 100644
index 0000000000..1732cfa421
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/sse.cpp
@@ -0,0 +1,34 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "sse.h"
+
+namespace embree 
+{
+  const __m128 mm_lookupmask_ps[16] = { // entry m: 32-bit lane k is all-ones iff bit k of m is set (_mm_set_epi32 lists lanes from high to low)
+    _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0, 0)),
+    _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0,-1)),
+    _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1, 0)),
+    _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1,-1)),
+    _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0, 0)),
+    _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0,-1)),
+    _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1, 0)),
+    _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1,-1)),
+    _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)),
+    _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0,-1)),
+    _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1, 0)),
+    _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1,-1)),
+    _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0, 0)),
+    _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0,-1)),
+    _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1, 0)),
+    _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1,-1))
+  };
+
+  const __m128d mm_lookupmask_pd[4] = { // entry m: 64-bit lane k is all-ones iff bit k of m is set; valid indices are 0..3 only
+    _mm_castsi128_pd(_mm_set_epi32( 0, 0, 0, 0)),
+    _mm_castsi128_pd(_mm_set_epi32( 0, 0,-1,-1)),
+    _mm_castsi128_pd(_mm_set_epi32(-1,-1, 0, 0)),
+    _mm_castsi128_pd(_mm_set_epi32(-1,-1,-1,-1))
+  };
+
+}
diff --git a/thirdparty/embree-aarch64/common/simd/sse.h b/thirdparty/embree-aarch64/common/simd/sse.h
new file mode 100644
index 0000000000..6bc818b55b
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/sse.h
@@ -0,0 +1,35 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/intrinsics.h"
+#include "../sys/alloc.h"
+#include "../math/constants.h"
+#include "varying.h"
+
+namespace embree 
+{
+#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__)
+  __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { // native blend: picks t over f per lane according to mask
+    return _mm_blendv_ps(f,t,mask);
+  }
+#else
+  __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { // bitwise fallback: (~mask & f) | (mask & t)
+    return _mm_or_ps(_mm_andnot_ps(mask, f), _mm_and_ps(mask, t));
+  }
+#endif
+
+  extern const __m128  mm_lookupmask_ps[16]; // 4-bit integer -> 4 x 32-bit lane mask, defined in sse.cpp
+  extern const __m128d mm_lookupmask_pd[4];  // 2-bit integer -> 2 x 64-bit lane mask, defined in sse.cpp
+}
+
+#if defined(__AVX512VL__)
+#include "vboolf4_avx512.h"
+#else
+#include "vboolf4_sse2.h"
+#endif
+#include "vint4_sse2.h"
+#include "vuint4_sse2.h"
+#include "vfloat4_sse2.h"
diff --git a/thirdparty/embree-aarch64/common/simd/varying.h b/thirdparty/embree-aarch64/common/simd/varying.h
new file mode 100644
index 0000000000..9a46817da9
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/varying.h
@@ -0,0 +1,132 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+
+namespace embree
+{
+  /* Varying numeric types */
+  template<int N> // generic N-element float vector backed by plain arrays
+  struct vfloat
+  {
+    union { float f[N]; int i[N]; }; // the same storage viewed as floats or as ints
+    __forceinline const float& operator [](size_t index) const { assert(index < N); return f[index]; }
+    __forceinline       float& operator [](size_t index)       { assert(index < N); return f[index]; }
+  };
+
+  template<int N> // generic N-element double vector backed by plain arrays
+  struct vdouble
+  {
+    union { double f[N]; long long i[N]; }; // the same storage viewed as doubles or as long longs
+    __forceinline const double& operator [](size_t index) const { assert(index < N); return f[index]; }
+    __forceinline       double& operator [](size_t index)       { assert(index < N); return f[index]; }
+  };
+
+  template<int N> // generic N-element int vector backed by a plain array
+  struct vint
+  {
+    int i[N]; // element storage
+    __forceinline const int& operator [](size_t index) const { assert(index < N); return i[index]; }
+    __forceinline       int& operator [](size_t index)       { assert(index < N); return i[index]; }
+  };
+
+  template<int N> // generic N-element unsigned int vector backed by a plain array
+  struct vuint
+  {
+    unsigned int i[N]; // element storage
+    __forceinline const unsigned int& operator [](size_t index) const { assert(index < N); return i[index]; }
+    __forceinline       unsigned int& operator [](size_t index)       { assert(index < N); return i[index]; }
+  };
+
+  template<int N> // generic N-element long long vector backed by a plain array
+  struct vllong
+  {
+    long long i[N]; // element storage
+    __forceinline const long long& operator [](size_t index) const { assert(index < N); return i[index]; }
+    __forceinline       long long& operator [](size_t index)       { assert(index < N); return i[index]; }
+  };
+
+  /* Varying bool types (generic form: one integer per element) */
+  template<int N> struct vboolf { int       i[N]; }; // mask type paired with float/int vectors
+  template<int N> struct vboold { long long i[N]; }; // mask type paired with double/long long vectors
+
+  /* Aliases to default types: "real" means float, "bool" means the float/int mask */
+  template<int N> using vreal = vfloat<N>;
+  template<int N> using vbool = vboolf<N>;
+
+  /* Varying size constants, chosen at compile time from the enabled instruction set */
+#if defined(__AVX512VL__) // SKX
+  const int VSIZEX = 8;  // default size
+  const int VSIZEL = 16; // large size
+#elif defined(__AVX512F__) // KNL
+  const int VSIZEX = 16; // default size
+  const int VSIZEL = 16; // large size
+#elif defined(__AVX__) // AVX without AVX-512
+  const int VSIZEX = 8;  // default size
+  const int VSIZEL = 8;  // large size
+#else // everything else: 4 elements
+  const int VSIZEX = 4;  // default size
+  const int VSIZEL = 4;  // large size
+#endif
+
+  /* Extends varying size N to optimal or up to max(N, N2); the result is exposed as vextend<N,N2>::size */
+  template<int N, int N2 = VSIZEX>
+  struct vextend
+  {
+#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+    /* use 16-wide SIMD calculations on KNL even for 4 and 8 wide SIMD */
+    static const int size = (N2 == VSIZEX) ? VSIZEX : N; // VSIZEX when N2 is left at its default, otherwise N
+    #define SIMD_MODE(N) N, 16 // expands to two comma-separated values: N and a fixed 16 (macro, so not scoped to this struct)
+#else
+    /* calculate with same SIMD width otherwise */
+    static const int size = N;
+    #define SIMD_MODE(N) N, N // expands to two comma-separated values: N twice (macro, so not scoped to this struct)
+#endif
+  };
+
+  /* Short names for the 4-element instantiations */
+  using vfloat4  = vfloat<4>;
+  using vdouble4 = vdouble<4>;
+  using vreal4   = vreal<4>;
+  using vint4    = vint<4>;
+  using vuint4   = vuint<4>;
+  using vllong4  = vllong<4>;
+  using vbool4   = vbool<4>;
+  using vboolf4  = vboolf<4>;
+  using vboold4  = vboold<4>;
+
+  /* Short names for the 8-element instantiations */
+  using vfloat8  = vfloat<8>;
+  using vdouble8 = vdouble<8>;
+  using vreal8   = vreal<8>;
+  using vint8    = vint<8>;
+  using vuint8   = vuint<8>;
+  using vllong8  = vllong<8>;
+  using vbool8   = vbool<8>;
+  using vboolf8  = vboolf<8>;
+  using vboold8  = vboold<8>;
+
+  /* Short names for the 16-element instantiations */
+  using vfloat16  = vfloat<16>;
+  using vdouble16 = vdouble<16>;
+  using vreal16   = vreal<16>;
+  using vint16    = vint<16>;
+  using vuint16   = vuint<16>;
+  using vllong16  = vllong<16>;
+  using vbool16   = vbool<16>;
+  using vboolf16  = vboolf<16>;
+  using vboold16  = vboold<16>;
+
+  /* Short names for the default width (VSIZEX elements) */
+  using vfloatx  = vfloat<VSIZEX>;
+  using vdoublex = vdouble<VSIZEX>;
+  using vrealx   = vreal<VSIZEX>;
+  using vintx    = vint<VSIZEX>;
+  using vuintx   = vuint<VSIZEX>;
+  using vllongx  = vllong<VSIZEX>;
+  using vboolx   = vbool<VSIZEX>;
+  using vboolfx  = vboolf<VSIZEX>;
+  using vbooldx  = vboold<VSIZEX>;
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vboold4_avx.h b/thirdparty/embree-aarch64/common/simd/vboold4_avx.h
new file mode 100644
index 0000000000..6505ee56f3
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vboold4_avx.h
@@ -0,0 +1,160 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 4-wide AVX bool type for 64-bit data types: one 64-bit lane per element, held in a 256-bit register */
+  template<>
+  struct vboold<4>
+  {
+    ALIGNED_STRUCT_(32);
+    
+    typedef vboold4 Bool;
+
+    enum  { size = 4 };       // number of SIMD elements
+    union {                   // data
+      __m256d v;                 // all four lanes
+      struct { __m128d vl,vh; }; // lanes 0-1 and lanes 2-3
+      long long i[4];            // per-lane integer view
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold() {}
+    __forceinline vboold(const vboold4& a) { v = a.v; }
+    __forceinline vboold4& operator =(const vboold4& a) { v = a.v; return *this; }
+
+    __forceinline vboold(__m256d a) : v(a) {}
+    __forceinline vboold(__m256i a) : v(_mm256_castsi256_pd(a)) {}
+
+    __forceinline operator const __m256() const { return _mm256_castpd_ps(v); }
+    __forceinline operator const __m256i() const { return _mm256_castpd_si256(v); }
+    __forceinline operator const __m256d() const { return v; }
+    /* Builds the mask from an integer: lane k is set iff bit k of a is set; bits above bit 3 are ignored on both paths. */
+    __forceinline vboold(int a)
+    {
+      assert(a >= 0 && a <= 255);
+#if defined (__AVX2__)
+      const __m256i mask = _mm256_set_epi64x(0x8, 0x4, 0x2, 0x1); // lane k holds 1<<k
+      const __m256i b = _mm256_set1_epi64x(a);
+      const __m256i c = _mm256_and_si256(b,mask);                 // isolates bit k of a in lane k
+      v = _mm256_castsi256_pd(_mm256_cmpeq_epi64(c,mask));        // all-ones where that bit was set
+#else
+      vl = mm_lookupmask_pd[a & 0x3];        // bits 0-1 select the low half
+      vh = mm_lookupmask_pd[(a >> 2) & 0x3]; // bits 2-3 select the high half; masked because the assert admits a > 15, which would index past the 4-entry table
+#endif
+    }
+    
+    __forceinline vboold(__m128d a, __m128d b) : vl(a), vh(b) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold(FalseTy) : v(_mm256_setzero_pd()) {}
+#if !defined(__aarch64__)
+    __forceinline vboold(TrueTy)  : v(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ)) {}
+#else
+    __forceinline vboold(TrueTy)  : v(_mm256_cmpeq_pd(_mm256_setzero_pd(), _mm256_setzero_pd())) {}
+#endif
+      
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool       operator [](size_t index) const { assert(index < 4); return (_mm256_movemask_pd(v) >> index) & 1; }
+    __forceinline long long& operator [](size_t index)       { assert(index < 4); return i[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator !(const vboold4& a) { return _mm256_xor_pd(a, vboold4(embree::True)); } // xor with the all-true mask
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm256_and_pd(a, b); }
+  __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm256_or_pd (a, b); }
+  __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); }
+
+  __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm256_andnot_pd(b, a); } // a & ~b: operands are swapped because the intrinsic negates its first argument
+
+  __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; }
+  __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; }
+  __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); } // lanes differ <=> xor
+  __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(_mm256_xor_pd(a,b),vboold4(embree::True)); } // xor, then inverted against the all-true mask
+
+  __forceinline vboold4 select(const vboold4& mask, const vboold4& t, const vboold4& f) { // per lane: t where mask is set, f elsewhere
+    return _mm256_blendv_pd(f, t, mask); 
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if !defined(__aarch64__) // the unpack wrappers are compiled out on aarch64
+  __forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a, b); }
+  __forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a, b); }
+#endif
+
+#if defined(__AVX2__)
+  template<int i0, int i1, int i2, int i3> // result lane k takes source lane ik
+  __forceinline vboold4 shuffle(const vboold4& v) {
+    return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i3, i2, i1, i0)); // _MM_SHUFFLE takes its selectors highest lane first
+  }
+
+  template<int i> // every result lane takes source lane i
+  __forceinline vboold4 shuffle(const vboold4& v) {
+    return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i, i, i, i));
+  }
+#endif
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool reduce_and(const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; } // all four lane bits set
+  __forceinline bool reduce_or (const vboold4& a) { return !_mm256_testz_pd(a,a); } // at least one lane set
+
+  __forceinline bool all (const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; } // same test as reduce_and
+  __forceinline bool any (const vboold4& a) { return !_mm256_testz_pd(a,a); } // same test as reduce_or
+  __forceinline bool none(const vboold4& a) { return _mm256_testz_pd(a,a) != 0; }
+
+  __forceinline bool all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); } // b holds in every lane selected by valid
+  __forceinline bool any (const vboold4& valid, const vboold4& b) { return any(valid & b); } // b holds in some lane selected by valid
+  __forceinline bool none(const vboold4& valid, const vboold4& b) { return none(valid & b); } // b holds in no lane selected by valid
+
+  __forceinline unsigned int movemask(const vboold4& a) { return _mm256_movemask_pd(a); } // one bit per lane, lane 0 in bit 0
+  __forceinline size_t       popcnt  (const vboold4& a) { return popcnt((size_t)_mm256_movemask_pd(a)); } // number of set lanes
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboold4& a, size_t index) { return a[index]; } // reads one lane as a bool
+  __forceinline void set  (vboold4& a, size_t index)     { a[index] = -1; }  // writes all-ones into the lane's 64-bit slot
+  __forceinline void clear(vboold4& a, size_t index)     { a[index] =  0; }  // writes zero into the lane's 64-bit slot
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a) {
+    // vboold4 has exactly four lanes; indices 4..7 would violate the index < 4 assert in operator[] and print lanes that do not exist.
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vboold4_avx512.h b/thirdparty/embree-aarch64/common/simd/vboold4_avx512.h
new file mode 100644
index 0000000000..4fe730d713
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vboold4_avx512.h
@@ -0,0 +1,140 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 4-wide AVX-512 bool type: the lanes are the low four bits of an 8-bit mask */
+  template<>
+  struct vboold<4>
+  {
+    typedef vboold4 Bool;
+    typedef vint4   Int;
+
+    enum { size = 4 }; // number of SIMD elements
+    __mmask8 v;        // data
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold() {}
+    __forceinline vboold(const vboold4& t) { v = t.v; }
+    __forceinline vboold4& operator =(const vboold4& f) { v = f.v; return *this; }
+
+    __forceinline vboold(const __mmask8 &t) { v = t; }
+    __forceinline operator __mmask8() const { return v; }
+
+    __forceinline vboold(bool b) { v = b ? 0xf : 0x0; } // all four lanes set or all clear
+    __forceinline vboold(int t)  { v = (__mmask8)t; } // raw bit pattern; not reduced to the low four bits
+    __forceinline vboold(unsigned int t) { v = (__mmask8)t; } // raw bit pattern; not reduced to the low four bits
+
+    /* return int8 mask */
+    __forceinline __m128i mask8() const {
+      return _mm_movm_epi8(v);
+    }
+
+    /* return int32 mask */
+    __forceinline __m128i mask32() const {
+      return _mm_movm_epi32(v);
+    }
+
+    /* return int64 mask */
+    __forceinline __m256i mask64() const {
+      return _mm256_movm_epi64(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold(FalseTy) : v(0x0) {}
+    __forceinline vboold(TrueTy)  : v(0xf) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const { // read-only: tests bit `index` of the mask
+      assert(index < 4); return (mm512_mask2int(v) >> index) & 1;
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator !(const vboold4& a) { return _mm512_kandn(a, 0xf); } // ~a restricted to the four lane bits
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm512_kand(a, b); }
+  __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm512_kor(a, b); }
+  __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); }
+
+  __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm512_kandn(b, a); } // a & ~b: operands are swapped because the intrinsic negates its first argument
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; }
+  __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; }
+  __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); } // lanes differ <=> xor
+  __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } // xnor, then restricted to the four lane bits
+
+  __forceinline vboold4 select(const vboold4& s, const vboold4& a, const vboold4& b) { // per lane: a where s is set, b elsewhere
+    return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline int all (const vboold4& a) { return a.v == 0xf; } // exact compare: any bit above the low four makes this false
+  __forceinline int any (const vboold4& a) { return _mm512_kortestz(a, a) == 0; } // some mask bit is set
+  __forceinline int none(const vboold4& a) { return _mm512_kortestz(a, a) != 0; } // no mask bit is set
+
+  __forceinline int all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); } // b holds in every lane selected by valid
+  __forceinline int any (const vboold4& valid, const vboold4& b) { return any(valid & b); } // b holds in some lane selected by valid
+  __forceinline int none(const vboold4& valid, const vboold4& b) { return none(valid & b); } // b holds in no lane selected by valid
+
+  __forceinline size_t movemask(const vboold4& a) { return _mm512_kmov(a); } // the mask bits as an integer
+  __forceinline size_t popcnt  (const vboold4& a) { return popcnt(a.v); } // number of set mask bits
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conversion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt(const vboold4& a) { return mm512_mask2int(a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboold4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; } // tests bit `index`
+  __forceinline void set(vboold4& a, size_t index)       { assert(index < 4); a |= 1 << index; } // sets bit `index`
+  __forceinline void clear(vboold4& a, size_t index)     { assert(index < 4); a = andn(a, 1 << index); } // clears bit `index`
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a)
+  {
+    // writes the four lane bits between angle brackets, bit 0 first, with no separators
+    cout << "<";
+    for (size_t lane=0; lane<4; lane++)
+      cout << (((a.v >> lane) & 1) ? "1" : "0");
+    return cout << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vboold8_avx512.h b/thirdparty/embree-aarch64/common/simd/vboold8_avx512.h
new file mode 100644
index 0000000000..fdf3f00de5
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vboold8_avx512.h
@@ -0,0 +1,148 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 8-wide AVX-512 bool type: one bit of an 8-bit mask per lane */
+  template<>
+  struct vboold<8>
+  {
+    typedef vboold8 Bool;
+    typedef vint8   Int;
+
+    enum { size = 8 }; // number of SIMD elements
+    __mmask8 v;        // data
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vboold() {}
+    __forceinline vboold(const vboold8& t) { v = t.v; }
+    __forceinline vboold8& operator =(const vboold8& f) { v = f.v; return *this; }
+
+    __forceinline vboold(const __mmask8& t) { v = t; }
+    __forceinline operator __mmask8() const { return v; }
+    
+    __forceinline vboold(bool b) { v = b ? 0xff : 0x00; } // all eight lanes set or all clear
+    __forceinline vboold(int t)  { v = (__mmask8)t; } // raw bit pattern, truncated to 8 bits by the cast
+    __forceinline vboold(unsigned int t) { v = (__mmask8)t; } // raw bit pattern, truncated to 8 bits by the cast
+
+    /* return int8 mask */
+    __forceinline __m128i mask8() const {
+#if defined(__AVX512BW__)
+      return _mm_movm_epi8(v);
+#else
+      const __m512i f = _mm512_set1_epi64(0);
+      const __m512i t = _mm512_set1_epi64(-1);
+      const __m512i m =  _mm512_mask_or_epi64(f,v,t,t); // -1 in lanes whose mask bit is set, 0 elsewhere
+      return _mm512_cvtepi64_epi8(m); // narrow each 64-bit lane to one byte
+#endif
+    }
+
+    /* return int64 mask */
+    __forceinline __m512i mask64() const { 
+#if defined(__AVX512DQ__)
+      return _mm512_movm_epi64(v);
+#else
+      const __m512i f = _mm512_set1_epi64(0);
+      const __m512i t = _mm512_set1_epi64(-1);
+      return _mm512_mask_or_epi64(f,v,t,t); // -1 in lanes whose mask bit is set, 0 elsewhere
+#endif
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold(FalseTy) : v(0x00) {}
+    __forceinline vboold(TrueTy)  : v(0xff) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const { // read-only: tests bit `index` of the mask
+      assert(index < 8); return (mm512_mask2int(v) >> index) & 1;
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboold8 operator !(const vboold8& a) { return _mm512_knot(a); } // inverts the mask bits
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboold8 operator &(const vboold8& a, const vboold8& b) { return _mm512_kand(a, b); }
+  __forceinline vboold8 operator |(const vboold8& a, const vboold8& b) { return _mm512_kor(a, b); }
+  __forceinline vboold8 operator ^(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); }
+
+  __forceinline vboold8 andn(const vboold8& a, const vboold8& b) { return _mm512_kandn(b, a); } // a & ~b: operands are swapped because the intrinsic negates its first argument
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboold8& operator &=(vboold8& a, const vboold8& b) { return a = a & b; }
+  __forceinline vboold8& operator |=(vboold8& a, const vboold8& b) { return a = a | b; }
+  __forceinline vboold8& operator ^=(vboold8& a, const vboold8& b) { return a = a ^ b; }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboold8 operator !=(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); } // lanes differ <=> xor
+  __forceinline vboold8 operator ==(const vboold8& a, const vboold8& b) { return _mm512_kxnor(a, b); } // lanes agree <=> xnor
+  
+  __forceinline vboold8 select(const vboold8& s, const vboold8& a, const vboold8& b) { // per lane: a where s is set, b elsewhere
+    return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline int all (const vboold8& a) { return a.v == 0xff; } // all eight mask bits set
+  __forceinline int any (const vboold8& a) { return _mm512_kortestz(a, a) == 0; } // some mask bit is set
+  __forceinline int none(const vboold8& a) { return _mm512_kortestz(a, a) != 0; } // no mask bit is set
+
+  __forceinline int all (const vboold8& valid, const vboold8& b) { return all((!valid) | b); } // b holds in every lane selected by valid
+  __forceinline int any (const vboold8& valid, const vboold8& b) { return any(valid & b); } // b holds in some lane selected by valid
+  __forceinline int none(const vboold8& valid, const vboold8& b) { return none(valid & b); } // b holds in no lane selected by valid
+  
+  __forceinline size_t movemask(const vboold8& a) { return _mm512_kmov(a); } // the mask bits as an integer
+  __forceinline size_t popcnt  (const vboold8& a) { return popcnt(a.v); } // number of set mask bits
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conversion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt(const vboold8& a) { return mm512_mask2int(a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboold8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; } // tests bit `index`
+  __forceinline void set(vboold8& a, size_t index)       { assert(index < 8); a |= 1 << index; } // sets bit `index`
+  __forceinline void clear(vboold8& a, size_t index)     { assert(index < 8); a = andn(a, 1 << index); } // clears bit `index`
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboold8& a)
+  {
+    // writes the eight lane bits between angle brackets, bit 0 first, with no separators
+    cout << "<";
+    for (size_t lane=0; lane<8; lane++)
+      cout << (((a.v >> lane) & 1) ? "1" : "0");
+    return cout << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h b/thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h
new file mode 100644
index 0000000000..238cdc8eb9
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h
@@ -0,0 +1,150 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 16-wide AVX-512 bool type: one bit of a __mmask16 per SIMD lane */
+  template<>
+  struct vboolf<16>
+  {
+    typedef vboolf16 Bool;
+    typedef vint16   Int;
+    typedef vfloat16 Float;
+
+    enum { size = 16 }; // number of SIMD elements
+    __mmask16 v;        // data: bit i is the boolean of lane i (see operator[])
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vboolf() {} // v is left uninitialized
+    __forceinline vboolf(const vboolf16& t) { v = t.v; }
+    __forceinline vboolf16& operator =(const vboolf16& f) { v = f.v; return *this; }
+
+    __forceinline vboolf(const __mmask16& t) { v = t; }    // wrap a raw mask
+    __forceinline operator __mmask16() const { return v; } // lets the object be passed straight to mask intrinsics
+    
+    __forceinline vboolf(bool b) { v = b ? 0xFFFF : 0x0000; }  // broadcast b to all 16 lanes
+    __forceinline vboolf(int t) { v = (__mmask16)t; }          // t is truncated to the 16-bit mask type
+    __forceinline vboolf(unsigned int t) { v = (__mmask16)t; } // t is truncated to the 16-bit mask type
+
+    /* return int8 mask: 16 bytes, each all-ones where the lane bit is set and zero otherwise */
+    __forceinline __m128i mask8() const {
+#if defined(__AVX512BW__)
+      return _mm_movm_epi8(v);
+#else
+      const __m512i f = _mm512_set1_epi32(0);
+      const __m512i t = _mm512_set1_epi32(-1);
+      const __m512i m =  _mm512_mask_or_epi32(f,v,t,t); // lanes selected by v become -1, the others keep 0
+      return _mm512_cvtepi32_epi8(m);                   // narrow each 32-bit lane to 8 bits
+#endif
+    }
+
+    /* return int32 mask: 16 dwords, each all-ones where the lane bit is set and zero otherwise */
+    __forceinline __m512i mask32() const {
+#if defined(__AVX512DQ__)
+      return _mm512_movm_epi32(v);
+#else
+      const __m512i f = _mm512_set1_epi32(0);
+      const __m512i t = _mm512_set1_epi32(-1);
+      return _mm512_mask_or_epi32(f,v,t,t); // lanes selected by v become -1, the others keep 0
+#endif
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(0x0000) {} // all lanes false
+    __forceinline vboolf(TrueTy)  : v(0xffff) {} // all lanes true
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+  
+    __forceinline bool operator [](size_t index) const {
+      assert(index < 16); return (mm512_mask2int(v) >> index) & 1; // read-only: bit 'index' of the mask
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf16 operator !(const vboolf16& a) { return _mm512_knot(a); } // complement of all 16 mask bits
+  
+   ////////////////////////////////////////////////////////////////////////////////
+   /// Binary Operators
+   ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf16 operator &(const vboolf16& a, const vboolf16& b) { return _mm512_kand(a,b); } // lane-wise AND
+  __forceinline vboolf16 operator |(const vboolf16& a, const vboolf16& b) { return _mm512_kor(a,b); }  // lane-wise OR
+  __forceinline vboolf16 operator ^(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a,b); } // lane-wise XOR
+
+  __forceinline vboolf16 andn(const vboolf16& a, const vboolf16& b) { return _mm512_kandn(b,a); } // a & ~b: kandn complements its FIRST operand, hence the swapped arguments
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf16& operator &=(vboolf16& a, const vboolf16& b) { a = a & b; return a; } // in-place lane-wise AND
+  __forceinline vboolf16& operator |=(vboolf16& a, const vboolf16& b) { a = a | b; return a; } // in-place lane-wise OR
+  __forceinline vboolf16& operator ^=(vboolf16& a, const vboolf16& b) { a = a ^ b; return a; } // in-place lane-wise XOR
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf16 operator !=(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a, b); }  // bit set where a and b disagree
+  __forceinline vboolf16 operator ==(const vboolf16& a, const vboolf16& b) { return _mm512_kxnor(a, b); } // bit set where a and b agree
+  
+  __forceinline vboolf16 select(const vboolf16& s, const vboolf16& a, const vboolf16& b) {
+    return _mm512_kor(_mm512_kand(s,a),_mm512_kandn(s,b)); // (s & a) | (~s & b): a where s is set, b elsewhere
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline int all (const vboolf16& a) { return  _mm512_kortestc(a,a) != 0; } // kortestc yields 1 only when (a | a) is all ones
+  __forceinline int any (const vboolf16& a) { return  _mm512_kortestz(a,a) == 0; } // kortestz yields 1 only when (a | a) is zero
+  __forceinline int none(const vboolf16& a) { return  _mm512_kortestz(a,a) != 0; } // kortestz yields 1 only when (a | a) is zero
+
+  __forceinline int all (const vboolf16& valid, const vboolf16& b) { return all((!valid) | b); } // lanes outside 'valid' are treated as true
+  __forceinline int any (const vboolf16& valid, const vboolf16& b) { return any(valid & b); }    // only lanes inside 'valid' are considered
+  __forceinline int none(const vboolf16& valid, const vboolf16& b) { return none(valid & b); }   // only lanes inside 'valid' are considered
+  
+  __forceinline size_t movemask(const vboolf16& a) { return _mm512_kmov(a); } // mask value widened to size_t
+  __forceinline size_t popcnt  (const vboolf16& a) { return popcnt(a.v); }    // forwards to the popcnt overload taking the raw mask value
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conversion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt (const vboolf16& a) { return mm512_mask2int(a); } // mask -> unsigned integer (mm512_mask2int is defined elsewhere)
+  __forceinline vboolf16     toMask(const int& a)      { return mm512_int2mask(a); } // integer -> mask (mm512_int2mask is defined elsewhere)
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf16& a, size_t index) { assert(index < 16); return (toInt(a) >> index) & 1; } // read bit 'index'
+  __forceinline void set(vboolf16& a, size_t index)       { assert(index < 16); a |= 1 << index; }                 // raise bit 'index' (the int converts to vboolf16 implicitly)
+  __forceinline void clear(vboolf16& a, size_t index)     { assert(index < 16); a = andn(a, 1 << index); }         // a & ~(1 << index)
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  /* Prints the mask as '<' + one "0"/"1" per lane (lane 0 first, 16 lanes) + '>'. */
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf16& a)
+  {
+    cout << "<";
+    for (size_t lane = 0; lane < 16; ++lane)
+      cout << (((a.v >> lane) & 1) ? "1" : "0");
+    return cout << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h b/thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h
new file mode 100644
index 0000000000..2ae4c4470e
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h
@@ -0,0 +1,143 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 4-wide AVX-512 bool type: the low 4 bits of a __mmask8, one bit per SIMD lane */
+  template<>
+  struct vboolf<4>
+  {
+    typedef vboolf4 Bool;
+    typedef vint4   Int;
+
+    enum { size = 4 }; // number of SIMD elements
+    __mmask8 v;        // data: bit i is the boolean of lane i (see operator[])
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf() {} // v is left uninitialized
+    __forceinline vboolf(const vboolf4& t) { v = t.v; }
+    __forceinline vboolf4& operator =(const vboolf4& f) { v = f.v; return *this; }
+
+    __forceinline vboolf(const __mmask8 &t) { v = t; }    // wrap a raw mask
+    __forceinline operator __mmask8() const { return v; } // lets the object be passed straight to mask intrinsics
+
+    __forceinline vboolf(bool b) { v = b ? 0xf : 0x0; }       // broadcast b to the 4 lanes
+    __forceinline vboolf(int t)  { v = (__mmask8)t; }         // NOTE(review): bits 4..7 of t are not masked off here
+    __forceinline vboolf(unsigned int t) { v = (__mmask8)t; } // NOTE(review): bits 4..7 of t are not masked off here
+
+    __forceinline vboolf(bool a, bool b, bool c, bool d) // a -> lane 0, b -> lane 1, c -> lane 2, d -> lane 3
+      : v((__mmask8)((int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {}
+
+    /* return int8 mask: one byte per mask bit, all-ones where the bit is set */
+    __forceinline __m128i mask8() const {
+      return _mm_movm_epi8(v);
+    }
+
+    /* return int32 mask: one dword per lane, all-ones where the lane bit is set */
+    __forceinline __m128i mask32() const {
+      return _mm_movm_epi32(v);
+    }
+
+    /* return int64 mask: one qword per lane, all-ones where the lane bit is set */
+    __forceinline __m256i mask64() const {
+      return _mm256_movm_epi64(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(0x0) {} // all lanes false
+    __forceinline vboolf(TrueTy)  : v(0xf) {} // all 4 lanes true
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const {
+      assert(index < 4); return (mm512_mask2int(v) >> index) & 1; // read-only: bit 'index' of the mask
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf4 operator !(const vboolf4& a) { return _mm512_kandn(a, 0xf); } // ~a & 0xf: flips only the 4 lane bits, upper bits stay clear
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm512_kand(a, b); } // lane-wise AND
+  __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm512_kor(a, b); }  // lane-wise OR
+  __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); } // lane-wise XOR
+
+  __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm512_kandn(b, a); } // a & ~b: kandn complements its FIRST operand, hence the swapped arguments
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { a = a & b; return a; } // in-place lane-wise AND
+  __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { a = a | b; return a; } // in-place lane-wise OR
+  __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { a = a ^ b; return a; } // in-place lane-wise XOR
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); } // bit set where a and b disagree
+  __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } // xnor also sets bits above the 4 lanes, so mask them off
+
+  __forceinline vboolf4 select(const vboolf4& s, const vboolf4& a, const vboolf4& b) {
+    return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); // (s & a) | (~s & b): a where s is set, b elsewhere
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline int all (const vboolf4& a) { return a.v == 0xf; }                 // exact compare: true only when v is exactly 0xf
+  __forceinline int any (const vboolf4& a) { return _mm512_kortestz(a, a) == 0; } // kortestz yields 1 only when (a | a) is zero
+  __forceinline int none(const vboolf4& a) { return _mm512_kortestz(a, a) != 0; } // kortestz yields 1 only when (a | a) is zero
+
+  __forceinline int all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); } // lanes outside 'valid' are treated as true
+  __forceinline int any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); }    // only lanes inside 'valid' are considered
+  __forceinline int none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); }   // only lanes inside 'valid' are considered
+
+  __forceinline size_t movemask(const vboolf4& a) { return _mm512_kmov(a); } // mask value widened to size_t
+  __forceinline size_t popcnt  (const vboolf4& a) { return popcnt(a.v); }    // forwards to the popcnt overload taking the raw mask value
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conversion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt(const vboolf4& a) { return mm512_mask2int(a); } // mask as an unsigned integer (mm512_mask2int is defined elsewhere)
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; } // read bit 'index'
+  __forceinline void set(vboolf4& a, size_t index)       { assert(index < 4); a |= 1 << index; }                 // raise bit 'index' (the int converts to vboolf4 implicitly)
+  __forceinline void clear(vboolf4& a, size_t index)     { assert(index < 4); a = andn(a, 1 << index); }         // a & ~(1 << index)
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Prints the mask as '<' + one "0"/"1" per lane (lane 0 first, 4 lanes) + '>'. */
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a)
+  {
+    cout << "<";
+    for (size_t lane = 0; lane < 4; ++lane)
+      cout << (((a.v >> lane) & 1) ? "1" : "0");
+    return cout << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h b/thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h
new file mode 100644
index 0000000000..ed53b3c783
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h
@@ -0,0 +1,198 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 4-wide SSE bool type: one 32-bit element of an __m128 per SIMD lane */
+  template<>
+  struct vboolf<4>
+  {
+    ALIGNED_STRUCT_(16);
+    
+    typedef vboolf4 Bool;
+    typedef vint4   Int;
+    typedef vfloat4 Float;
+
+    enum  { size = 4 };            // number of SIMD elements
+    union { __m128 v; int i[4]; }; // data: the same 128 bits viewed as a vector or as four ints
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vboolf() {} // contents are left uninitialized
+    __forceinline vboolf(const vboolf4& other) { v = other.v; }
+    __forceinline vboolf4& operator =(const vboolf4& other) { v = other.v; return *this; }
+
+    __forceinline vboolf(__m128 input) : v(input) {}
+    __forceinline operator const __m128&() const { return v; }
+    __forceinline operator const __m128i() const { return _mm_castps_si128(v); } // bit-pattern reinterpretation, no value conversion
+    __forceinline operator const __m128d() const { return _mm_castps_pd(v); }    // bit-pattern reinterpretation, no value conversion
+    
+    __forceinline vboolf(bool a) // table index has 'a' in all four bits -> lanes (a, a, a, a)
+      : v(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
+    __forceinline vboolf(bool a, bool b) // table index bits 0..3 are a, b, a, b -> lanes (a, b, a, b)
+      : v(mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+    __forceinline vboolf(bool a, bool b, bool c, bool d) // table index bits 0..3 are a, b, c, d -> lanes (a, b, c, d)
+      : v(mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+#if defined(__aarch64__) && defined(BUILD_IOS)
+      __forceinline vboolf(int mask) { v = mm_lookupmask_ps[mask]; }          // iOS/arm64 build: same lookup, without the range assert
+      __forceinline vboolf(unsigned int mask) { v = mm_lookupmask_ps[mask]; } // iOS/arm64 build: same lookup, without the range assert
+#else
+    __forceinline vboolf(int mask) { assert(mask >= 0 && mask < 16); v = mm_lookupmask_ps[mask]; } // mask indexes the 16-entry lookup table
+    __forceinline vboolf(unsigned int mask) { assert(mask < 16); v = mm_lookupmask_ps[mask]; }     // mask indexes the 16-entry lookup table
+#endif
+    /* return int32 mask: the same 128 bits reinterpreted as an integer vector */
+    __forceinline __m128i mask32() const { 
+      return _mm_castps_si128(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(_mm_setzero_ps()) {} // all bits clear
+    __forceinline vboolf(TrueTy)  : v(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {} // 0 == 0 in every lane -> all bits set
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__) && defined(BUILD_IOS)
+      __forceinline bool operator [](size_t index) const { return (_mm_movemask_ps(v) >> index) & 1; } // sign bit of lane 'index'
+      __forceinline int& operator [](size_t index)       { return i[index]; }                          // writable 32-bit view of lane 'index'
+#else
+    __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm_movemask_ps(v) >> index) & 1; } // sign bit of lane 'index'
+    __forceinline int& operator [](size_t index)       { assert(index < 4); return i[index]; }                          // writable 32-bit view of lane 'index'
+#endif
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4 operator !(const vboolf4& a) { return _mm_xor_ps(a, vboolf4(embree::True)); } // XOR with all-ones flips every bit
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm_and_ps(a, b); } // bitwise AND
+  __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm_or_ps (a, b); } // bitwise OR
+  __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); } // bitwise XOR
+
+  __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm_andnot_ps(b, a); } // a & ~b: andnot complements its FIRST operand, hence the swapped arguments
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { a = a & b; return a; } // in-place AND
+  __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { a = a | b; return a; } // in-place OR
+  __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { a = a ^ b; return a; } // in-place XOR
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); } // bits set where a and b differ
+  __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } // per-lane 32-bit integer equality
+  
+  __forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) {
+#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__)
+    return _mm_blendv_ps(f, t, m); // per lane: t where the mask lane's sign bit is set, otherwise f
+#else
+    return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); // bitwise (m & t) | (~m & f)
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a, b); } // interleave the low halves:  (a0, b0, a1, b1)
+  __forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a, b); } // interleave the high halves: (a2, b2, a3, b3)
+
+#if defined(__aarch64__)
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vboolf4 shuffle(const vboolf4& v) { // NEON path: byte-table lookup; _MN_SHUFFLE (defined elsewhere) presumably builds the byte indices
+    return vreinterpretq_f32_u8(vqtbl1q_u8( vreinterpretq_u8_s32(v), _MN_SHUFFLE(i0, i1, i2, i3)));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) { // NEON path: lookup over the 32-byte table {a, b}; _MF_SHUFFLE is defined elsewhere
+    return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
+  }
+#else
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vboolf4 shuffle(const vboolf4& v) { // result = (v[i0], v[i1], v[i2], v[i3])
+    return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0))); // _MM_SHUFFLE lists selectors from the highest lane down
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) { // result = (a[i0], a[i1], b[i2], b[i3])
+    return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+#endif
+                                                                
+  template<int i0>
+  __forceinline vboolf4 shuffle(const vboolf4& v) { // broadcast lane i0 to all four lanes
+    return shuffle<i0,i0,i0,i0>(v);
+  }
+
+#if defined(__SSE3__)
+  template<> __forceinline vboolf4 shuffle<0, 0, 2, 2>(const vboolf4& v) { return _mm_moveldup_ps(v); } // duplicate the even lanes
+  template<> __forceinline vboolf4 shuffle<1, 1, 3, 3>(const vboolf4& v) { return _mm_movehdup_ps(v); } // duplicate the odd lanes
+  template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v)); } // duplicate the low 64 bits
+#endif
+
+#if defined(__SSE4_1__) && !defined(__aarch64__)
+  template<int dst, int src, int clr> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } // imm: bits 7:6 = source lane of b, bits 5:4 = destination lane in a, bits 3:0 = lanes to zero
+  template<int dst, int src> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return insert<dst, src, 0>(a, b); } // same, with no lanes zeroed
+  template<int dst> __forceinline vboolf4 insert(const vboolf4& a, const bool b) { return insert<dst, 0>(a, vboolf4(b)); } // write the boolean b into lane dst
+#endif
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+    
+  __forceinline bool reduce_and(const vboolf4& a) { return _mm_movemask_ps(a) == 0xf; } // all four sign bits set
+  __forceinline bool reduce_or (const vboolf4& a) { return _mm_movemask_ps(a) != 0x0; } // at least one sign bit set
+
+  __forceinline bool all (const vboolf4& b) { return _mm_movemask_ps(b) == 0xf; } // all four sign bits set
+  __forceinline bool any (const vboolf4& b) { return _mm_movemask_ps(b) != 0x0; } // at least one sign bit set
+  __forceinline bool none(const vboolf4& b) { return _mm_movemask_ps(b) == 0x0; } // no sign bit set
+
+  __forceinline bool all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); } // lanes outside 'valid' are treated as true
+  __forceinline bool any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); }    // only lanes inside 'valid' are considered
+  __forceinline bool none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); }   // only lanes inside 'valid' are considered
+  
+  __forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a); } // 4-bit integer built from the lanes' sign bits
+#if defined(__aarch64__) && defined(BUILD_IOS)
+__forceinline size_t popcnt(const vboolf4& a) { return _mm_movemask_popcnt_ps(a); } // iOS/arm64 helper defined elsewhere; presumably counts the set lanes directly
+#else
+#if defined(__SSE4_2__)
+  __forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a)); } // count the bits of the 4-bit sign mask
+#else
+  __forceinline size_t popcnt(const vboolf4& a) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); } // fallback: add up the four lane booleans
+#endif
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf4& a, size_t index) { return a[index]; } // const operator[]: sign bit of the lane
+  __forceinline void set(vboolf4& a, size_t index)       { a[index] = -1; }   // all 32 bits of the lane set
+  __forceinline void clear(vboolf4& a, size_t index)     { a[index] =  0; }   // all 32 bits of the lane cleared
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) { // streams the four lane booleans as "<a0, a1, a2, a3>"
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vboolf8_avx.h b/thirdparty/embree-aarch64/common/simd/vboolf8_avx.h
new file mode 100644
index 0000000000..4f64741b55
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vboolf8_avx.h
@@ -0,0 +1,189 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 8-wide AVX bool type: one 32-bit element of an __m256 per SIMD lane */
+  template<>
+  struct vboolf<8>
+  {
+    ALIGNED_STRUCT_(32);
+    
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 };       // number of SIMD elements
+    union {                   // data: the same 256 bits under three views
+      __m256 v;
+      struct { __m128 vl,vh; };
+      int i[8];
+    };  
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf() {} // contents are left uninitialized
+    __forceinline vboolf(const vboolf8& a) { v = a.v; }
+    __forceinline vboolf8& operator =(const vboolf8& a) { v = a.v; return *this; }
+
+    __forceinline vboolf(__m256 a) : v(a) {}
+    __forceinline operator const __m256&() const { return v; }
+    __forceinline operator const __m256i() const { return _mm256_castps_si256(v); } // bit-pattern reinterpretation, no value conversion
+    __forceinline operator const __m256d() const { return _mm256_castps_pd(v); }    // bit-pattern reinterpretation, no value conversion
+
+    __forceinline vboolf(int a) // build from an 8-bit mask: bit k of 'a' selects lane k
+    {
+      assert(a >= 0 && a <= 255);
+#if defined (__AVX2__)
+      const __m256i mask = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1); // lane k holds 1 << k (set_epi32 lists the highest lane first)
+      const __m256i b = _mm256_set1_epi32(a);
+      const __m256i c = _mm256_and_si256(b,mask);           // isolate bit k of 'a' in lane k
+      v = _mm256_castsi256_ps(_mm256_cmpeq_epi32(c,mask)); // lane becomes all-ones where that bit was set
+#else
+      vl = mm_lookupmask_ps[a & 0xF]; // low nibble  -> lanes 0..3
+      vh = mm_lookupmask_ps[a >> 4];  // high nibble -> lanes 4..7
+#endif
+    }
+
+    __forceinline vboolf(const vboolf4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {}                   // a in both 128-bit halves
+    __forceinline vboolf(const vboolf4& a, const vboolf4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} // a = low half, b = high half
+    __forceinline vboolf(__m128 a, __m128 b) : vl(a), vh(b) {}                                                           // a = low half, b = high half
+
+    __forceinline vboolf(bool a) : v(vboolf8(vboolf4(a), vboolf4(a))) {}                         // a in all 8 lanes
+    __forceinline vboolf(bool a, bool b) : v(vboolf8(vboolf4(a), vboolf4(b))) {}                 // low half = vboolf4(a), high half = vboolf4(b)
+    __forceinline vboolf(bool a, bool b, bool c, bool d) : v(vboolf8(vboolf4(a,b), vboolf4(c,d))) {} // low half = vboolf4(a,b), high half = vboolf4(c,d)
+    __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) : v(vboolf8(vboolf4(a,b,c,d), vboolf4(e,f,g,h))) {} // one boolean per lane, a = lane 0
+
+    /* return int32 mask: the same 256 bits reinterpreted as an integer vector */
+    __forceinline __m256i mask32() const { 
+      return _mm256_castps_si256(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(_mm256_setzero_ps()) {} // all bits clear
+#if !defined(__aarch64__)
+    __forceinline vboolf(TrueTy)  : v(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_OQ)) {} // 0 == 0 in every lane -> all bits set
+#else
+    __forceinline vboolf(TrueTy)  : v(_mm256_cmpeq_ps(_mm256_setzero_ps(), _mm256_setzero_ps())) {} // aarch64 build uses a different compare helper for the same 0 == 0 test
+#endif
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const { assert(index < 8); return (_mm256_movemask_ps(v) >> index) & 1; } // sign bit of lane 'index'
+    __forceinline int& operator [](size_t index)       { assert(index < 8); return i[index]; }                             // writable 32-bit view of lane 'index'
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator !(const vboolf8& a) { return _mm256_xor_ps(a, vboolf8(embree::True)); } // XOR with all-ones flips every bit
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm256_and_ps(a, b); } // bitwise AND
+  __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm256_or_ps (a, b); } // bitwise OR
+  __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); } // bitwise XOR
+
+  __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm256_andnot_ps(b, a); } // a & ~b: andnot complements its FIRST operand, hence the swapped arguments
+
+  __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { a = a & b; return a; } // in-place AND
+  __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { a = a | b; return a; } // in-place OR
+  __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { a = a ^ b; return a; } // in-place XOR
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); } // bits set where a and b differ
+  __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(_mm256_xor_ps(a,b),vboolf8(embree::True)); } // complement of (a ^ b): bits set where a and b agree
+
+  __forceinline vboolf8 select(const vboolf8& mask, const vboolf8& t, const vboolf8& f) {
+    return _mm256_blendv_ps(f, t, mask); // per lane: t where the mask lane's sign bit is set, otherwise f
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 unpacklo(const vboolf8& a, const vboolf8& b) { return _mm256_unpacklo_ps(a, b); } // interleaves the low two lanes of a and b within each 128-bit half
+  __forceinline vboolf8 unpackhi(const vboolf8& a, const vboolf8& b) { return _mm256_unpackhi_ps(a, b); } // interleaves the high two lanes of a and b within each 128-bit half
+
+  template<int i>
+  __forceinline vboolf8 shuffle(const vboolf8& v) { // broadcast lane i within each 128-bit half
+    return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i));
+  }
+
+  template<int i0, int i1>
+  __forceinline vboolf8 shuffle4(const vboolf8& v) { // pick 128-bit halves of v: low result half = half i0, high result half = half i1
+    return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1>
+  __forceinline vboolf8 shuffle4(const vboolf8& a, const vboolf8& b) { // pick 128-bit halves from the pair (a, b); selectors 0/1 address a, 2/3 address b
+    return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vboolf8 shuffle(const vboolf8& v) { // same 4-lane permutation (v[i0], v[i1], v[i2], v[i3]) applied inside each 128-bit half
+    return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vboolf8 shuffle(const vboolf8& a, const vboolf8& b) { // per 128-bit half: (a[i0], a[i1], b[i2], b[i3])
+    return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<> __forceinline vboolf8 shuffle<0, 0, 2, 2>(const vboolf8& v) { return _mm256_moveldup_ps(v); } // duplicate the even lanes
+  template<> __forceinline vboolf8 shuffle<1, 1, 3, 3>(const vboolf8& v) { return _mm256_movehdup_ps(v); } // duplicate the odd lanes
+  template<> __forceinline vboolf8 shuffle<0, 1, 0, 1>(const vboolf8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } // duplicate the low 64 bits of each 128-bit half
+
+  template<int i> __forceinline vboolf8 insert4(const vboolf8& a, const vboolf4& b) { return _mm256_insertf128_ps(a, b, i); } // replace 128-bit half i of a with b
+  template<int i> __forceinline vboolf4 extract4   (const vboolf8& a) { return _mm256_extractf128_ps(a, i); }                 // return 128-bit half i of a
+  template<>      __forceinline vboolf4 extract4<0>(const vboolf8& a) { return _mm256_castps256_ps128(a);   }                 // low half via a cast instead of an extract
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool reduce_and(const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; } // all eight sign bits set
+  __forceinline bool reduce_or (const vboolf8& a) { return !_mm256_testz_ps(a,a); }                       // testz is 1 only when no sign bit of a is set
+
+  __forceinline bool all (const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; } // all eight sign bits set
+  __forceinline bool any (const vboolf8& a) { return !_mm256_testz_ps(a,a); }                       // at least one sign bit set
+  __forceinline bool none(const vboolf8& a) { return _mm256_testz_ps(a,a) != 0; }                   // no sign bit set
+
+  __forceinline bool all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); } // lanes outside 'valid' are treated as true
+  __forceinline bool any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); }    // only lanes inside 'valid' are considered
+  __forceinline bool none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); }   // only lanes inside 'valid' are considered
+
+  __forceinline unsigned int movemask(const vboolf8& a) { return _mm256_movemask_ps(a); }                 // 8-bit integer built from the lanes' sign bits
+  __forceinline size_t       popcnt  (const vboolf8& a) { return popcnt((size_t)_mm256_movemask_ps(a)); } // count the bits of that sign mask
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf8& a, size_t index) { return a[index]; } // const operator[]: sign bit of the lane
+  __forceinline void set(vboolf8& a, size_t index)       { a[index] = -1; }   // all 32 bits of the lane set
+  __forceinline void clear(vboolf8& a, size_t index)     { a[index] =  0; }   // all 32 bits of the lane cleared
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) { // streams the eight lane booleans as "<a0, a1, ..., a7>"
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", "
+                       << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h b/thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h
new file mode 100644
index 0000000000..2a52b554c7
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h
@@ -0,0 +1,143 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 8-wide AVX-512 bool type */
+  template<>
+  struct vboolf<8>
+  {
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+
+    enum { size = 8 }; // number of SIMD elements
+    __mmask8 v;        // data: one mask bit per lane (bit i <-> lane i)
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf() {} // leaves v uninitialized
+    __forceinline vboolf(const vboolf8& t) { v = t.v; }
+    __forceinline vboolf8& operator =(const vboolf8& f) { v = f.v; return *this; }
+
+    __forceinline vboolf(const __mmask8 &t) { v = t; }
+    __forceinline operator __mmask8() const { return v; }
+
+    __forceinline vboolf(bool b) { v = b ? 0xff : 0x00; } // broadcasts b to all 8 lanes
+    __forceinline vboolf(int t)  { v = (__mmask8)t; }     // the cast keeps only the low 8 bits of t
+    __forceinline vboolf(unsigned int t) { v = (__mmask8)t; } // the cast keeps only the low 8 bits of t
+
+    __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) // a -> bit 0 (lane 0) ... h -> bit 7 (lane 7)
+      : v((__mmask8)((int(h) << 7) | (int(g) << 6) | (int(f) << 5) | (int(e) << 4) | (int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {}
+
+    /* return int8 mask: each mask bit expanded to an all-ones / all-zeros 8-bit element */
+    __forceinline __m128i mask8() const {
+      return _mm_movm_epi8(v);
+    }
+
+    /* return int32 mask: each mask bit expanded to an all-ones / all-zeros 32-bit element */
+    __forceinline __m256i mask32() const {
+      return _mm256_movm_epi32(v);
+    }
+
+    /* return int64 mask: each mask bit expanded to an all-ones / all-zeros 64-bit element */
+    __forceinline __m512i mask64() const {
+      return _mm512_movm_epi64(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(0x00) {}
+    __forceinline vboolf(TrueTy)  : v(0xff) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const { // read-only: returns the value of mask bit 'index'
+      assert(index < 8); return (mm512_mask2int(v) >> index) & 1; // NOTE(review): mm512_mask2int is not defined in this file; presumably a mask-to-int helper declared elsewhere — confirm
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator !(const vboolf8& a) { return _mm512_knot(a); } // lane-wise NOT of the mask
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm512_kand(a, b); } // lane-wise AND
+  __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm512_kor(a, b); } // lane-wise OR
+  __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); } // lane-wise XOR
+
+  __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm512_kandn(b, a); } // a AND (NOT b): _mm512_kandn negates its first operand, hence the swapped arguments
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { a = a & b; return a; } // in-place lane-wise AND
+  __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { a = a | b; return a; } // in-place lane-wise OR
+  __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { a = a ^ b; return a; } // in-place lane-wise XOR
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); } // lanes where a and b differ (XOR)
+  __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm512_kxnor(a, b); } // lanes where a and b agree (XNOR)
+
+  __forceinline vboolf8 select(const vboolf8& s, const vboolf8& a, const vboolf8& b) { // per lane: s ? a : b
+    return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); // (s & a) | (~s & b)
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline int all (const vboolf8& a) { return a.v == 0xff; } // 1 iff all 8 mask bits are set
+  __forceinline int any (const vboolf8& a) { return _mm512_kortestz(a, a) == 0; } // 1 iff at least one mask bit is set
+  __forceinline int none(const vboolf8& a) { return _mm512_kortestz(a, a) != 0; } // 1 iff no mask bit is set
+
+  __forceinline int all (const vboolf8& valid, const vboolf8& b) { const vboolf8 m = (!valid) | b; return all(m); }  // lanes not in 'valid' count as true
+  __forceinline int any (const vboolf8& valid, const vboolf8& b) { const vboolf8 m = valid & b; return any(m); }     // only lanes in 'valid' are considered
+  __forceinline int none(const vboolf8& valid, const vboolf8& b) { const vboolf8 m = valid & b; return none(m); }    // only lanes in 'valid' are considered
+
+  __forceinline size_t movemask(const vboolf8& a) { return _mm512_kmov(a); } // the lane mask as an integer (bit i <-> lane i)
+  __forceinline size_t popcnt  (const vboolf8& a) { return popcnt(a.v); } // passes the raw mask to a scalar popcnt overload defined elsewhere
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conversion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt(const vboolf8& a) { return mm512_mask2int(a); } // NOTE(review): mm512_mask2int is declared outside this file; presumably returns the mask bits as an integer — confirm
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf8& a, size_t index) { assert(index < 8); const unsigned int bits = toInt(a); return (bits >> index) & 1; } // value of mask bit 'index'
+  __forceinline void set(vboolf8& a, size_t index)       { assert(index < 8); const vboolf8 bit(1 << index); a |= bit; }                          // turns mask bit 'index' on
+  __forceinline void clear(vboolf8& a, size_t index)     { assert(index < 8); const vboolf8 bit(1 << index); a = andn(a, bit); }                  // turns mask bit 'index' off
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a)
+  { // prints "<b0b1...b7>": one '0'/'1' digit per lane, lane 0 first, no separators
+    cout << "<";
+    for (size_t lane = 0; lane < 8; lane++)
+      cout << (((a.v >> lane) & 1) ? "1" : "0");
+    cout << ">";
+    return cout;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vdouble4_avx.h b/thirdparty/embree-aarch64/common/simd/vdouble4_avx.h
new file mode 100644
index 0000000000..1f65b45d7e
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vdouble4_avx.h
@@ -0,0 +1,324 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{ 
+  /* 4-wide AVX 64-bit double type */
+  template<>
+  struct vdouble<4>
+  {
+    ALIGNED_STRUCT_(32);
+
+    typedef vboold4 Bool;
+
+    enum  { size = 4 }; // number of SIMD elements
+    union {             // data: one 256-bit register, also addressable as 4 doubles
+      __m256d v;
+      double i[4];
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vdouble() {} // lanes are left uninitialized
+    __forceinline vdouble(const vdouble4& t) { v = t.v; }
+    __forceinline vdouble4& operator =(const vdouble4& f) { v = f.v; return *this; }
+
+    __forceinline vdouble(const __m256d& t) { v = t; }
+    __forceinline operator __m256d() const { return v; }
+
+    __forceinline vdouble(double i) { // broadcasts i to all 4 lanes
+      v = _mm256_set1_pd(i);
+    }
+
+    __forceinline vdouble(double a, double b, double c, double d) { // a -> lane 0 ... d -> lane 3
+      v = _mm256_set_pd(d,c,b,a); // _mm256_set_pd takes the highest lane first
+    }
+
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vdouble(ZeroTy) : v(_mm256_setzero_pd()) {}
+    __forceinline vdouble(OneTy)  : v(_mm256_set1_pd(1)) {}
+    __forceinline vdouble(StepTy) : v(_mm256_set_pd(3.0,2.0,1.0,0.0)) {}          // lanes 0..3 hold 0,1,2,3
+    __forceinline vdouble(ReverseStepTy) : v(_mm256_setr_pd(3.0,2.0,1.0,0.0)) {}  // lanes 0..3 hold 3,2,1,0
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline void store_nt(double *__restrict__ ptr, const vdouble4& a) { // non-temporal store; ptr must be 32-byte aligned
+      _mm256_stream_pd(ptr, a);
+    }
+
+    static __forceinline vdouble4 loadu(const double* addr) { // load 4 doubles, no alignment requirement
+      return _mm256_loadu_pd(addr);
+    }
+
+    static __forceinline vdouble4 load(const vdouble4* addr) { // aligned load from another vdouble4
+      return _mm256_load_pd((const double*)addr); // cast keeps the const qualifier of addr
+    }
+
+    static __forceinline vdouble4 load(const double* addr) { // load 4 doubles; addr must be 32-byte aligned
+      return _mm256_load_pd(addr);
+    }
+
+    static __forceinline void store(double* ptr, const vdouble4& v) { // store 4 doubles; ptr must be 32-byte aligned
+      _mm256_store_pd(ptr, v);
+    }
+
+    static __forceinline void storeu(double* ptr, const vdouble4& v) { // store 4 doubles, no alignment requirement
+      _mm256_storeu_pd(ptr, v);
+    }
+
+    static __forceinline vdouble4 broadcast(const void* a) { return _mm256_set1_pd(*(const double*)a); } // reads one double at a (const preserved) and replicates it to all lanes
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline       double& operator [](size_t index)       { assert(index < 4); return i[index]; } // lane access through the array view of the union
+    __forceinline const double& operator [](size_t index) const { assert(index < 4); return i[index]; }
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__)
+  __forceinline vdouble4 asDouble(const vllong4&  a) { return _mm256_castsi256_pd(a); } // reinterprets the 256 bits as doubles; no numeric conversion
+  __forceinline vllong4  asLLong (const vdouble4& a) { return _mm256_castpd_si256(a); } // reinterprets the 256 bits as integers; no numeric conversion
+#endif
+
+  __forceinline vdouble4 operator +(const vdouble4& a) { return a; } // unary plus: identity
+  __forceinline vdouble4 operator -(const vdouble4& a) { return _mm256_sub_pd(_mm256_setzero_pd(), a); } // negation computed as 0 - a in every lane
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble4 operator +(const vdouble4& a, const vdouble4& b) { const __m256d sum = _mm256_add_pd(a, b); return sum; } // lane-wise a + b
+  __forceinline vdouble4 operator +(const vdouble4& a, double          b) { const vdouble4 rhs(b); return a + rhs; }               // scalar is broadcast first
+  __forceinline vdouble4 operator +(double          a, const vdouble4& b) { const vdouble4 lhs(a); return lhs + b; }
+
+  __forceinline vdouble4 operator -(const vdouble4& a, const vdouble4& b) { const __m256d diff = _mm256_sub_pd(a, b); return diff; } // lane-wise a - b
+  __forceinline vdouble4 operator -(const vdouble4& a, double          b) { const vdouble4 rhs(b); return a - rhs; }
+  __forceinline vdouble4 operator -(double          a, const vdouble4& b) { const vdouble4 lhs(a); return lhs - b; }
+
+  __forceinline vdouble4 operator *(const vdouble4& a, const vdouble4& b) { const __m256d prod = _mm256_mul_pd(a, b); return prod; } // lane-wise a * b
+  __forceinline vdouble4 operator *(const vdouble4& a, double          b) { const vdouble4 rhs(b); return a * rhs; }
+  __forceinline vdouble4 operator *(double          a, const vdouble4& b) { const vdouble4 lhs(a); return lhs * b; }
+
+  __forceinline vdouble4 operator &(const vdouble4& a, const vdouble4& b) { const __m256d bits = _mm256_and_pd(a, b); return bits; } // bitwise AND of the raw lane bits
+  __forceinline vdouble4 operator &(const vdouble4& a, double          b) { const vdouble4 rhs(b); return a & rhs; }                 // scalar is broadcast first
+  __forceinline vdouble4 operator &(double          a, const vdouble4& b) { const vdouble4 lhs(a); return lhs & b; }
+
+  __forceinline vdouble4 operator |(const vdouble4& a, const vdouble4& b) { const __m256d bits = _mm256_or_pd(a, b); return bits; } // bitwise OR of the raw lane bits
+  __forceinline vdouble4 operator |(const vdouble4& a, double          b) { const vdouble4 rhs(b); return a | rhs; }
+  __forceinline vdouble4 operator |(double          a, const vdouble4& b) { const vdouble4 lhs(a); return lhs | b; }
+
+  __forceinline vdouble4 operator ^(const vdouble4& a, const vdouble4& b) { const __m256d bits = _mm256_xor_pd(a, b); return bits; } // bitwise XOR of the raw lane bits
+  __forceinline vdouble4 operator ^(const vdouble4& a, double          b) { const vdouble4 rhs(b); return a ^ rhs; }
+  __forceinline vdouble4 operator ^(double          a, const vdouble4& b) { const vdouble4 lhs(a); return lhs ^ b; }
+  
+  __forceinline vdouble4 min(const vdouble4& a, const vdouble4& b) { const __m256d lo = _mm256_min_pd(a, b); return lo; } // lane-wise minimum
+  __forceinline vdouble4 min(const vdouble4& a, double          b) { const vdouble4 rhs(b); return min(a, rhs); }         // scalar is broadcast first
+  __forceinline vdouble4 min(double          a, const vdouble4& b) { const vdouble4 lhs(a); return min(lhs, b); }
+
+  __forceinline vdouble4 max(const vdouble4& a, const vdouble4& b) { const __m256d hi = _mm256_max_pd(a, b); return hi; } // lane-wise maximum
+  __forceinline vdouble4 max(const vdouble4& a, double          b) { const vdouble4 rhs(b); return max(a, rhs); }
+  __forceinline vdouble4 max(double          a, const vdouble4& b) { const vdouble4 lhs(a); return max(lhs, b); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__FMA__) // fused multiply-add intrinsics available
+  __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmadd_pd(a,b,c); }  //  (a*b) + c
+  __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmsub_pd(a,b,c); }  //  (a*b) - c
+  __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmadd_pd(a,b,c); } // -(a*b) + c
+  __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmsub_pd(a,b,c); } // -(a*b) - c
+#else // same formulas built from separate multiply and add/subtract operators
+  __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b+c; }
+  __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b-c; }
+  __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b+c;}
+  __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b-c; }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble4& operator +=(vdouble4& a, const vdouble4& b) { a = a + b; return a; } // in-place lane-wise add
+  __forceinline vdouble4& operator +=(vdouble4& a, double          b) { a = a + b; return a; } // scalar is applied to every lane
+
+  __forceinline vdouble4& operator -=(vdouble4& a, const vdouble4& b) { a = a - b; return a; } // in-place lane-wise subtract
+  __forceinline vdouble4& operator -=(vdouble4& a, double          b) { a = a - b; return a; }
+
+  __forceinline vdouble4& operator *=(vdouble4& a, const vdouble4& b) { a = a * b; return a; } // in-place lane-wise multiply
+  __forceinline vdouble4& operator *=(vdouble4& a, double          b) { a = a * b; return a; }
+
+  __forceinline vdouble4& operator &=(vdouble4& a, const vdouble4& b) { a = a & b; return a; } // in-place bitwise AND
+  __forceinline vdouble4& operator &=(vdouble4& a, double          b) { a = a & b; return a; }
+
+  __forceinline vdouble4& operator |=(vdouble4& a, const vdouble4& b) { a = a | b; return a; } // in-place bitwise OR
+  __forceinline vdouble4& operator |=(vdouble4& a, double          b) { a = a | b; return a; }
+  
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__) // compare directly into a mask register
+  __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_EQ); }
+  __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_NE); }
+  __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LT); }
+  __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GE); }
+  __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GT); }
+  __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LE); }
+#elif !defined(__aarch64__) // plain AVX: the compare yields a full-width lane mask in a vector register
+  __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ);  } // ordered: false if either operand is NaN
+  __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); } // unordered: true if either operand is NaN
+  __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS);  } // ordered: false if either operand is NaN
+  __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); } // "not less than": also true if either operand is NaN
+  __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); } // "not less or equal": also true if either operand is NaN
+  __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS);  } // ordered: false if either operand is NaN
+#else // aarch64 build: uses per-predicate _mm256_cmp*_pd names, presumably supplied by the NEON emulation headers — confirm
+  __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmpeq_pd(a, b);  }
+  __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpneq_pd(a, b); }
+  __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmplt_pd(a, b);  }
+  __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpnlt_pd(a, b); }
+  __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmpnle_pd(a, b); }
+  __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmple_pd(a, b);  }
+#endif
+
+  __forceinline vboold4 operator ==(const vdouble4& a, double          b) { const vdouble4 rhs(b); return a == rhs; } // scalar is broadcast, then compared lane-wise
+  __forceinline vboold4 operator ==(double          a, const vdouble4& b) { const vdouble4 lhs(a); return lhs == b; }
+
+  __forceinline vboold4 operator !=(const vdouble4& a, double          b) { const vdouble4 rhs(b); return a != rhs; }
+  __forceinline vboold4 operator !=(double          a, const vdouble4& b) { const vdouble4 lhs(a); return lhs != b; }
+
+  __forceinline vboold4 operator < (const vdouble4& a, double          b) { const vdouble4 rhs(b); return a <  rhs; }
+  __forceinline vboold4 operator < (double          a, const vdouble4& b) { const vdouble4 lhs(a); return lhs <  b; }
+
+  __forceinline vboold4 operator >=(const vdouble4& a, double          b) { const vdouble4 rhs(b); return a >= rhs; }
+  __forceinline vboold4 operator >=(double          a, const vdouble4& b) { const vdouble4 lhs(a); return lhs >= b; }
+
+  __forceinline vboold4 operator > (const vdouble4& a, double          b) { const vdouble4 rhs(b); return a >  rhs; }
+  __forceinline vboold4 operator > (double          a, const vdouble4& b) { const vdouble4 lhs(a); return lhs >  b; }
+
+  __forceinline vboold4 operator <=(const vdouble4& a, double          b) { const vdouble4 rhs(b); return a <= rhs; }
+  __forceinline vboold4 operator <=(double          a, const vdouble4& b) { const vdouble4 lhs(a); return lhs <= b; }
+
+  __forceinline vboold4 eq(const vdouble4& a, const vdouble4& b) { return a == b; } // named form of operator ==
+  __forceinline vboold4 ne(const vdouble4& a, const vdouble4& b) { return a != b; } // named form of operator !=
+  __forceinline vboold4 lt(const vdouble4& a, const vdouble4& b) { return a <  b; } // named form of operator <
+  __forceinline vboold4 ge(const vdouble4& a, const vdouble4& b) { return a >= b; } // named form of operator >=
+  __forceinline vboold4 gt(const vdouble4& a, const vdouble4& b) { return a >  b; } // named form of operator >
+  __forceinline vboold4 le(const vdouble4& a, const vdouble4& b) { return a <= b; } // named form of operator <=
+
+#if defined(__AVX512VL__) // masked compare: result bits are cleared for lanes whose bit in 'mask' is not set
+  __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_NE); }
+  __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LT); }
+  __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GE); }
+  __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GT); }
+  __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LE); }
+#else // without AVX512VL: full compare, then AND with 'mask'
+  __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a == b); }
+  __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a != b); }
+  __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a <  b); }
+  __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a >= b); }
+  __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a >  b); }
+  __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a <= b); }
+#endif
+ 
+  __forceinline vdouble4 select(const vboold4& m, const vdouble4& t, const vdouble4& f) { // per lane: m ? t : f
+#if defined(__AVX512VL__)
+    return _mm256_mask_blend_pd(m, f, t); // mask bit set -> take t, otherwise f
+#else
+    return _mm256_blendv_pd(f, t, m); // sign bit of m's lane set -> take t, otherwise f
+#endif
+  }
+
+  __forceinline void xchg(const vboold4& m, vdouble4& a, vdouble4& b) { // swaps a and b in the lanes selected by m
+    const vdouble4 old_a = a, old_b = b; a = select(m,old_b,old_a); b = select(m,old_a,old_b);
+  }
+
+  __forceinline vboold4 test(const vdouble4& a, const vdouble4& b) { // bit test of a against b on the raw 64-bit lane patterns
+#if defined(__AVX512VL__)
+    return _mm256_test_epi64_mask(_mm256_castpd_si256(a),_mm256_castpd_si256(b)); // per lane: (a & b) != 0
+#else
+    return _mm256_testz_si256(_mm256_castpd_si256(a),_mm256_castpd_si256(b)); // NOTE(review): yields one scalar int (1 iff a & b is zero over all 256 bits), not a per-lane mask — differs from the AVX512VL branch; confirm intent
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int i0, int i1>
+  __forceinline vdouble4 shuffle(const vdouble4& v) { // within each 128-bit half: result = { half[i0], half[i1] }
+    return _mm256_permute_pd(v, (i1 << 3) | (i0 << 2) | (i1 << 1) | i0); // one selector bit per output lane; i0 and i1 are expected to be 0 or 1
+  }
+
+  template<int i>
+  __forceinline vdouble4 shuffle(const vdouble4& v) { // duplicates element i of each 128-bit half into both lanes of that half
+    return shuffle<i, i>(v);
+  }
+
+  template<int i0, int i1>
+  __forceinline vdouble4 shuffle2(const vdouble4& v) { // permutes whole 128-bit halves: low half = half i0 of v, high half = half i1 of v
+    return _mm256_permute2f128_pd(v, v, (i1 << 4) | i0); // both sources are v, so only the halves of v can be selected
+  }
+
+  __forceinline double toScalar(const vdouble4& v) { // returns lane 0
+    return _mm_cvtsd_f64(_mm256_castpd256_pd128(v)); // take the low 128 bits, then their low double
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble4 vreduce_min2(const vdouble4& x) { const vdouble4 swapped = shuffle<1,0>(x); return min(x, swapped); } // minimum within each 128-bit half
+  __forceinline vdouble4 vreduce_min (const vdouble4& y) { const vdouble4 pairs = vreduce_min2(y); const vdouble4 halves = shuffle2<1,0>(pairs); return min(pairs, halves); } // minimum of all lanes, in every lane
+
+  __forceinline vdouble4 vreduce_max2(const vdouble4& x) { const vdouble4 swapped = shuffle<1,0>(x); return max(x, swapped); } // maximum within each 128-bit half
+  __forceinline vdouble4 vreduce_max (const vdouble4& y) { const vdouble4 pairs = vreduce_max2(y); const vdouble4 halves = shuffle2<1,0>(pairs); return max(pairs, halves); } // maximum of all lanes, in every lane
+
+  __forceinline vdouble4 vreduce_and2(const vdouble4& x) { const vdouble4 swapped = shuffle<1,0>(x); return x & swapped; } // bitwise AND within each 128-bit half
+  __forceinline vdouble4 vreduce_and (const vdouble4& y) { const vdouble4 pairs = vreduce_and2(y); const vdouble4 halves = shuffle2<1,0>(pairs); return pairs & halves; } // bitwise AND of all lanes, in every lane
+
+  __forceinline vdouble4 vreduce_or2(const vdouble4& x) { const vdouble4 swapped = shuffle<1,0>(x); return x | swapped; } // bitwise OR within each 128-bit half
+  __forceinline vdouble4 vreduce_or (const vdouble4& y) { const vdouble4 pairs = vreduce_or2(y); const vdouble4 halves = shuffle2<1,0>(pairs); return pairs | halves; } // bitwise OR of all lanes, in every lane
+
+  __forceinline vdouble4 vreduce_add2(const vdouble4& x) { const vdouble4 swapped = shuffle<1,0>(x); return x + swapped; } // sum within each 128-bit half
+  __forceinline vdouble4 vreduce_add (const vdouble4& y) { const vdouble4 pairs = vreduce_add2(y); const vdouble4 halves = shuffle2<1,0>(pairs); return pairs + halves; } // sum of all lanes, in every lane
+
+  __forceinline double reduce_add(const vdouble4& a) { const vdouble4 sums = vreduce_add(a); return toScalar(sums); } // sum of the 4 lanes
+  __forceinline double reduce_min(const vdouble4& a) { const vdouble4 mins = vreduce_min(a); return toScalar(mins); } // smallest of the 4 lanes
+  __forceinline double reduce_max(const vdouble4& a) { const vdouble4 maxs = vreduce_max(a); return toScalar(maxs); } // largest of the 4 lanes
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Memory load and store operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble4& v)
+  { // prints "<v0, v1, v2, v3>"
+    for (size_t lane = 0; lane < 4; lane++) {
+      cout << (lane == 0 ? "<" : ", ") << v[lane];
+    }
+    return cout << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h b/thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h
new file mode 100644
index 0000000000..4eec7d2f6a
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h
@@ -0,0 +1,356 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 8-wide AVX-512 64-bit double type */
+  template<>
+  struct vdouble<8>
+  {
+    ALIGNED_STRUCT_(64);
+
+    typedef vboold8 Bool;
+
+    enum  { size = 8 }; // number of SIMD elements
+    union {              // data: one 512-bit register, also addressable as 8 doubles
+      __m512d v;
+      double i[8];
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vdouble() {} // lanes are left uninitialized
+    __forceinline vdouble(const vdouble8& t) { v = t.v; }
+    __forceinline vdouble8& operator =(const vdouble8& f) { v = f.v; return *this; }
+
+    __forceinline vdouble(const __m512d& t) { v = t; }
+    __forceinline operator __m512d() const { return v; }
+    __forceinline operator __m256d() const { return _mm512_castpd512_pd256(v); } // lower 4 lanes only
+
+    __forceinline vdouble(double i) { // broadcasts i to all 8 lanes
+      v = _mm512_set1_pd(i);
+    }
+
+    __forceinline vdouble(double a, double b, double c, double d) { // lanes hold a,b,c,d,a,b,c,d
+      v = _mm512_set4_pd(d,c,b,a); // _mm512_set4_pd repeats the 4-element sequence, highest element first
+    }
+
+    __forceinline vdouble(double a0, double a1, double a2, double a3,
+                          double a4, double a5, double a6, double a7)
+    {
+      v = _mm512_set_pd(a7,a6,a5,a4,a3,a2,a1,a0); // _mm512_set_pd takes the highest lane first, so a0 lands in lane 0
+    }
+
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vdouble(ZeroTy) : v(_mm512_setzero_pd()) {}
+    __forceinline vdouble(OneTy)  : v(_mm512_set1_pd(1)) {}
+    __forceinline vdouble(StepTy) : v(_mm512_set_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {}          // lanes 0..7 hold 0..7
+    __forceinline vdouble(ReverseStepTy) : v(_mm512_setr_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {}  // lanes 0..7 hold 7..0
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline void store_nt(void *__restrict__ ptr, const vdouble8& a) { // non-temporal store; ptr must be 64-byte aligned
+      _mm512_stream_pd((double*)ptr, a);
+    }
+
+    static __forceinline vdouble8 loadu(const void* addr) { // load 8 doubles, no alignment requirement
+      return _mm512_loadu_pd((const double*)addr); // cast keeps the const qualifier of addr
+    }
+
+    static __forceinline vdouble8 load(const vdouble8* addr) { // aligned load from another vdouble8
+      return _mm512_load_pd((const double*)addr); // cast keeps the const qualifier of addr
+    }
+
+    static __forceinline vdouble8 load(const double* addr) { // load 8 doubles; addr must be 64-byte aligned
+      return _mm512_load_pd(addr);
+    }
+
+    static __forceinline void store(void* ptr, const vdouble8& v) { // store 8 doubles; ptr must be 64-byte aligned
+      _mm512_store_pd(ptr, v);
+    }
+
+    static __forceinline void storeu(void* ptr, const vdouble8& v) { // store 8 doubles, no alignment requirement
+      _mm512_storeu_pd(ptr, v);
+    }
+
+    static __forceinline void storeu(const vboold8& mask, double* ptr, const vdouble8& f) { // unaligned store of the lanes enabled in mask only
+      _mm512_mask_storeu_pd(ptr, mask, f);
+    }
+
+    static __forceinline void store(const vboold8& mask, void* addr, const vdouble8& v2) { // aligned store of the lanes enabled in mask only
+      _mm512_mask_store_pd(addr, mask, v2);
+    }
+
+    /* pass by value to avoid compiler generating inefficient code */
+    static __forceinline void storeu_compact(const vboold8 mask,void * addr, const vdouble8& reg) { // writes the enabled lanes of reg contiguously to addr
+      _mm512_mask_compressstoreu_pd(addr, mask, reg);
+    }
+
+    static __forceinline vdouble8 compact64bit(const vboold8& mask, vdouble8& v) { // same operation as compact(mask, v)
+      return _mm512_mask_compress_pd(v, mask, v);
+    }
+
+    static __forceinline vdouble8 compact(const vboold8& mask, vdouble8& v) { // packs the enabled lanes of v to the front; remaining lanes keep v's values
+      return _mm512_mask_compress_pd(v, mask, v);
+    }
+
+    static __forceinline vdouble8 compact(const vboold8& mask, const vdouble8& a, vdouble8& b) { // packs the enabled lanes of b to the front; remaining lanes come from a
+      return _mm512_mask_compress_pd(a, mask, b);
+    }
+
+    static __forceinline vdouble8 broadcast(const void* a) { return _mm512_set1_pd(*(const double*)a); } // reads one double at a (const preserved) and replicates it to all lanes
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline       double& operator [](size_t index)       { assert(index < 8); return i[index]; } // lane access through the array view of the union
+    __forceinline const double& operator [](size_t index) const { assert(index < 8); return i[index]; }
+
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble8 asDouble(const vllong8&  a) { return _mm512_castsi512_pd(a); } // reinterprets the 512 bits as doubles; no numeric conversion
+  __forceinline vllong8  asLLong (const vdouble8& a) { return _mm512_castpd_si512(a); } // reinterprets the 512 bits as integers; no numeric conversion
+
+  __forceinline vdouble8 operator +(const vdouble8& a) { return a; } // unary plus: identity
+  __forceinline vdouble8 operator -(const vdouble8& a) { return _mm512_sub_pd(_mm512_setzero_pd(), a); } // negation computed as 0 - a in every lane
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble8 operator +(const vdouble8& a, const vdouble8& b) { const __m512d sum = _mm512_add_pd(a, b); return sum; } // lane-wise a + b
+  __forceinline vdouble8 operator +(const vdouble8& a, double          b) { const vdouble8 rhs(b); return a + rhs; }               // scalar is broadcast first
+  __forceinline vdouble8 operator +(double          a, const vdouble8& b) { const vdouble8 lhs(a); return lhs + b; }
+
+  __forceinline vdouble8 operator -(const vdouble8& a, const vdouble8& b) { const __m512d diff = _mm512_sub_pd(a, b); return diff; } // lane-wise a - b
+  __forceinline vdouble8 operator -(const vdouble8& a, double          b) { const vdouble8 rhs(b); return a - rhs; }
+  __forceinline vdouble8 operator -(double          a, const vdouble8& b) { const vdouble8 lhs(a); return lhs - b; }
+
+  __forceinline vdouble8 operator *(const vdouble8& a, const vdouble8& b) { const __m512d prod = _mm512_mul_pd(a, b); return prod; } // lane-wise a * b
+  __forceinline vdouble8 operator *(const vdouble8& a, double          b) { const vdouble8 rhs(b); return a * rhs; }
+  __forceinline vdouble8 operator *(double          a, const vdouble8& b) { const vdouble8 lhs(a); return lhs * b; }
+
+  __forceinline vdouble8 operator &(const vdouble8& a, const vdouble8& b) { const __m512d bits = _mm512_and_pd(a, b); return bits; } // bitwise AND of the raw lane bits
+  __forceinline vdouble8 operator &(const vdouble8& a, double          b) { const vdouble8 rhs(b); return a & rhs; }                 // scalar is broadcast first
+  __forceinline vdouble8 operator &(double          a, const vdouble8& b) { const vdouble8 lhs(a); return lhs & b; }
+
+  __forceinline vdouble8 operator |(const vdouble8& a, const vdouble8& b) { const __m512d bits = _mm512_or_pd(a, b); return bits; } // bitwise OR of the raw lane bits
+  __forceinline vdouble8 operator |(const vdouble8& a, double          b) { const vdouble8 rhs(b); return a | rhs; }
+  __forceinline vdouble8 operator |(double          a, const vdouble8& b) { const vdouble8 lhs(a); return lhs | b; }
+
+  __forceinline vdouble8 operator ^(const vdouble8& a, const vdouble8& b) { const __m512d bits = _mm512_xor_pd(a, b); return bits; } // bitwise XOR of the raw lane bits
+  __forceinline vdouble8 operator ^(const vdouble8& a, double          b) { const vdouble8 rhs(b); return a ^ rhs; }
+  __forceinline vdouble8 operator ^(double          a, const vdouble8& b) { const vdouble8 lhs(a); return lhs ^ b; }
+
+  __forceinline vdouble8 operator <<(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), n)); } // shifts the raw 64-bit lane patterns left by n
+  __forceinline vdouble8 operator >>(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), n)); } // arithmetic (sign-extending) right shift of the raw 64-bit lane patterns
+
+  __forceinline vdouble8 operator <<(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_sllv_epi64(_mm512_castpd_si512(a), n)); } // per-lane shift counts taken from n
+  __forceinline vdouble8 operator >>(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_srav_epi64(_mm512_castpd_si512(a), n)); } // per-lane arithmetic right shift, counts taken from n
+
+  __forceinline vdouble8 sll (const vdouble8& a, const unsigned int b) { return  _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), b)); } // shift left logical (same as operator << with a scalar count)
+  __forceinline vdouble8 sra (const vdouble8& a, const unsigned int b) { return  _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), b)); } // shift right arithmetic (same as operator >> with a scalar count)
+  __forceinline vdouble8 srl (const vdouble8& a, const unsigned int b) { return  _mm512_castsi512_pd(_mm512_srli_epi64(_mm512_castpd_si512(a), b)); } // shift right logical (zero-filling)
+
+  __forceinline vdouble8 min(const vdouble8& a, const vdouble8& b) { return _mm512_min_pd(a, b); }
+  __forceinline vdouble8 min(const vdouble8& a, double          b) { return _mm512_min_pd(a, vdouble8(b)); }
+  __forceinline vdouble8 min(double          a, const vdouble8& b) { return _mm512_min_pd(vdouble8(a), b); }
+  /* min/max are lane-wise; scalar operands are converted with the vdouble8(double) constructor. */
+  __forceinline vdouble8 max(const vdouble8& a, const vdouble8& b) { return _mm512_max_pd(a, b); }
+  __forceinline vdouble8 max(const vdouble8& a, double          b) { return _mm512_max_pd(a, vdouble8(b)); }
+  __forceinline vdouble8 max(double          a, const vdouble8& b) { return _mm512_max_pd(vdouble8(a), b); }
+
+  __forceinline vdouble8 mask_add(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_add_pd(c,mask,a,b); }
+  __forceinline vdouble8 mask_sub(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_sub_pd(c,mask,a,b); }
+  /* mask_* helpers: lanes whose mask bit is set receive op(a,b), all other lanes are copied from c; c itself is not written. */
+  __forceinline vdouble8 mask_and(const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_and_pd(c,m,a,b); }
+  __forceinline vdouble8 mask_or (const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_or_pd(c,m,a,b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble8 madd (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmadd_pd(a,b,c); } //  a*b + c (fused)
+  __forceinline vdouble8 msub (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmsub_pd(a,b,c); } //  a*b - c (fused)
+  __forceinline vdouble8 nmadd(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmadd_pd(a,b,c); } // -(a*b) + c (fused)
+  __forceinline vdouble8 nmsub(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmsub_pd(a,b,c); } // -(a*b) - c (fused)
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble8& operator +=(vdouble8& a, const vdouble8& b) { a = a + b; return a; }
+  __forceinline vdouble8& operator +=(vdouble8& a, double          b) { a = a + b; return a; }
+  /* Compound assignments: evaluate the matching binary operator, store the result in a and return a. */
+  __forceinline vdouble8& operator -=(vdouble8& a, const vdouble8& b) { a = a - b; return a; }
+  __forceinline vdouble8& operator -=(vdouble8& a, double          b) { a = a - b; return a; }
+
+  __forceinline vdouble8& operator *=(vdouble8& a, const vdouble8& b) { a = a * b; return a; }
+  __forceinline vdouble8& operator *=(vdouble8& a, double          b) { a = a * b; return a; }
+
+  __forceinline vdouble8& operator &=(vdouble8& a, const vdouble8& b) { a = a & b; return a; }
+  __forceinline vdouble8& operator &=(vdouble8& a, double          b) { a = a & b; return a; }
+
+  __forceinline vdouble8& operator |=(vdouble8& a, const vdouble8& b) { a = a | b; return a; }
+  __forceinline vdouble8& operator |=(vdouble8& a, double          b) { a = a | b; return a; }
+  /* NOTE(review): the shift count is declared double; it presumably converts to the unsigned int shift overload - confirm. */
+  __forceinline vdouble8& operator <<=(vdouble8& a, const double b) { a = a << b; return a; }
+  __forceinline vdouble8& operator >>=(vdouble8& a, const double b) { a = a >> b; return a; }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold8 operator ==(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboold8 operator ==(const vdouble8& a, double          b) { return _mm512_cmp_pd_mask(a,vdouble8(b),_MM_CMPINT_EQ); }
+  __forceinline vboold8 operator ==(double          a, const vdouble8& b) { return _mm512_cmp_pd_mask(vdouble8(a),b,_MM_CMPINT_EQ); }
+  /* All comparisons yield a vboold8 lane mask; scalar operands are converted with the vdouble8(double) constructor. */
+  __forceinline vboold8 operator !=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboold8 operator !=(const vdouble8& a, double          b) { return _mm512_cmp_pd_mask(a,vdouble8(b),_MM_CMPINT_NE); }
+  __forceinline vboold8 operator !=(double          a, const vdouble8& b) { return _mm512_cmp_pd_mask(vdouble8(a),b,_MM_CMPINT_NE); }
+
+  __forceinline vboold8 operator < (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboold8 operator < (const vdouble8& a, double          b) { return _mm512_cmp_pd_mask(a,vdouble8(b),_MM_CMPINT_LT); }
+  __forceinline vboold8 operator < (double          a, const vdouble8& b) { return _mm512_cmp_pd_mask(vdouble8(a),b,_MM_CMPINT_LT); }
+  /* NOTE(review): _MM_CMPINT_* is the integer predicate enum; for GE/GT it appears to select the unordered NLT/NLE float predicates (true for NaN lanes) - confirm. */
+  __forceinline vboold8 operator >=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboold8 operator >=(const vdouble8& a, double          b) { return _mm512_cmp_pd_mask(a,vdouble8(b),_MM_CMPINT_GE); }
+  __forceinline vboold8 operator >=(double          a, const vdouble8& b) { return _mm512_cmp_pd_mask(vdouble8(a),b,_MM_CMPINT_GE); }
+
+  __forceinline vboold8 operator > (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboold8 operator > (const vdouble8& a, double          b) { return _mm512_cmp_pd_mask(a,vdouble8(b),_MM_CMPINT_GT); }
+  __forceinline vboold8 operator > (double          a, const vdouble8& b) { return _mm512_cmp_pd_mask(vdouble8(a),b,_MM_CMPINT_GT); }
+
+  __forceinline vboold8 operator <=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboold8 operator <=(const vdouble8& a, double          b) { return _mm512_cmp_pd_mask(a,vdouble8(b),_MM_CMPINT_LE); }
+  __forceinline vboold8 operator <=(double          a, const vdouble8& b) { return _mm512_cmp_pd_mask(vdouble8(a),b,_MM_CMPINT_LE); }
+
+  __forceinline vboold8 eq(const vdouble8& a, const vdouble8& b) { return a == b; }
+  __forceinline vboold8 ne(const vdouble8& a, const vdouble8& b) { return a != b; }
+  __forceinline vboold8 lt(const vdouble8& a, const vdouble8& b) { return a <  b; }
+  __forceinline vboold8 ge(const vdouble8& a, const vdouble8& b) { return a >= b; }
+  __forceinline vboold8 gt(const vdouble8& a, const vdouble8& b) { return a >  b; }
+  __forceinline vboold8 le(const vdouble8& a, const vdouble8& b) { return a <= b; }
+  /* Masked forms: a result bit can only be set for lanes whose bit is set in mask; all other result bits are zero. */
+  __forceinline vboold8 eq(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_EQ); }
+  __forceinline vboold8 ne(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_NE); }
+  __forceinline vboold8 lt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LT); }
+  __forceinline vboold8 ge(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GE); }
+  __forceinline vboold8 gt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GT); }
+  __forceinline vboold8 le(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LE); }
+
+  /* Per-lane blend: t where the bit of m is set, f elsewhere (t|t written under mask m, f passed through). */
+  __forceinline vdouble8 select(const vboold8& m, const vdouble8& t, const vdouble8& f) { return _mm512_mask_or_pd(f,m,t,t); }
+
+  /* Swaps a and b in the lanes selected by m; the other lanes keep their values. */
+  __forceinline void xchg(const vboold8& m, vdouble8& a, vdouble8& b) {
+    const vdouble8 old_a = a, old_b = b;
+    a = select(m,old_b,old_a); b = select(m,old_a,old_b);
+  }
+
+  /* Bit test on the raw lane patterns: a lane's result bit is set when (a & b) is non-zero; the
+   * masked overload additionally requires the lane's bit in m. */
+  __forceinline vboold8 test(const vboold8& m, const vdouble8& a, const vdouble8& b) {
+    const __m512i lhs = _mm512_castpd_si512(a), rhs = _mm512_castpd_si512(b); return _mm512_mask_test_epi64_mask(m,lhs,rhs);
+  }
+  __forceinline vboold8 test(const vdouble8& a, const vdouble8& b) { return _mm512_test_epi64_mask(_mm512_castpd_si512(a),_mm512_castpd_si512(b)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int i0, int i1> // shuffle inside each 128-bit pair: result pair = (pair[i0], pair[i1]); i0/i1 must be 0 or 1
+  __forceinline vdouble8 shuffle(const vdouble8& v) {
+    return _mm512_permute_pd(v, (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0);
+  }
+  /* Duplicates element i of every 128-bit pair into both slots of that pair. */
+  template<int i>
+  __forceinline vdouble8 shuffle(const vdouble8& v) {
+    return shuffle<i, i>(v);
+  }
+  /* Shuffle inside each 256-bit group of four lanes: result lanes 0..3 = group[i0], group[i1], group[i2], group[i3]. */
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vdouble8 shuffle(const vdouble8& v) {
+    return _mm512_permutex_pd(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+  /* Shuffle of whole 256-bit halves: the low half of the result is half i0 of v, the high half is half i1. */
+  template<int i0, int i1>
+  __forceinline vdouble8 shuffle4(const vdouble8& v) {
+    return _mm512_shuffle_f64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2));
+  }
+  /* Copies 256-bit half i of v into both halves of the result. */
+  template<int i>
+  __forceinline vdouble8 shuffle4(const vdouble8& v) {
+    return shuffle4<i, i>(v);
+  }
+  /* Concatenates a (upper) and b (lower), shifts right by i 64-bit lanes and returns the low eight lanes. */
+  template<int i>
+  __forceinline vdouble8 align_shift_right(const vdouble8& a, const vdouble8& b) {
+    return _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(a), _mm512_castpd_si512(b), i));
+  }
+  /* Extracts lane 0 as a scalar double. */
+  __forceinline double toScalar(const vdouble8& v) {
+    return _mm_cvtsd_f64(_mm512_castpd512_pd128(v));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble8 vreduce_add2(vdouble8 x) { const vdouble8 swapped = shuffle<1,0,3,2>(x); return x + swapped; }                         // neighbouring pairs
+  __forceinline vdouble8 vreduce_add4(vdouble8 x) { const vdouble8 pairs = vreduce_add2(x); return pairs + shuffle<2,3,0,1>(pairs); }          // groups of four lanes
+  __forceinline vdouble8 vreduce_add (vdouble8 x) { const vdouble8 halves = vreduce_add4(x); return halves + shuffle4<1,0>(halves); }          // all eight lanes
+  /* Same butterfly pattern for the minimum; every lane of a vreduce_* result holds the reduced value of its group. */
+  __forceinline vdouble8 vreduce_min2(vdouble8 x) { const vdouble8 swapped = shuffle<1,0,3,2>(x); return min(x, swapped); }
+  __forceinline vdouble8 vreduce_min4(vdouble8 x) { const vdouble8 pairs = vreduce_min2(x); return min(pairs, shuffle<2,3,0,1>(pairs)); }
+  __forceinline vdouble8 vreduce_min (vdouble8 x) { const vdouble8 halves = vreduce_min4(x); return min(halves, shuffle4<1,0>(halves)); }
+  /* Same butterfly pattern for the maximum. */
+  __forceinline vdouble8 vreduce_max2(vdouble8 x) { const vdouble8 swapped = shuffle<1,0,3,2>(x); return max(x, swapped); }
+  __forceinline vdouble8 vreduce_max4(vdouble8 x) { const vdouble8 pairs = vreduce_max2(x); return max(pairs, shuffle<2,3,0,1>(pairs)); }
+  __forceinline vdouble8 vreduce_max (vdouble8 x) { const vdouble8 halves = vreduce_max4(x); return max(halves, shuffle4<1,0>(halves)); }
+  /* Scalar reductions: run the full vector reduction and read lane 0. */
+  __forceinline double reduce_add(const vdouble8& v) { const vdouble8 total = vreduce_add(v); return toScalar(total); }
+  __forceinline double reduce_min(const vdouble8& v) { const vdouble8 lowest = vreduce_min(v); return toScalar(lowest); }
+  __forceinline double reduce_max(const vdouble8& v) { const vdouble8 highest = vreduce_max(v); return toScalar(highest); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Permutations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Cross-lane permute with run-time indices: result lane k = v[index[k]]. */
+  __forceinline vdouble8 permute(const vdouble8& v, const vllong8& index) { return _mm512_permutexvar_pd(index, v); }
+
+  /* Permutes a with the index vector vllong8(reverse_step) (presumably the descending lane order - confirm). */
+  __forceinline vdouble8 reverse(const vdouble8& a) {
+    return _mm512_permutexvar_pd(vllong8(reverse_step), a);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Streams the eight lanes as "<v0, v1, ..., v7>". */
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble8& v) {
+    cout << "<";
+    for (size_t lane=0; lane<8; lane++) cout << (lane ? ", " : "") << v[lane];
+    cout << ">";
+    return cout;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h b/thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h
new file mode 100644
index 0000000000..aed2419b77
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h
@@ -0,0 +1,771 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 16-wide AVX-512 float type: the vfloat<16> specialisation wraps a single __m512 register */
+  template<>
+  struct vfloat<16>
+  {
+    ALIGNED_STRUCT_(64);
+    
+    typedef vboolf16 Bool;
+    typedef vint16   Int;
+    typedef vfloat16 Float;
+
+    enum  { size = 16 }; // number of SIMD elements
+    union {              // data
+      __m512 v; // whole register
+      float f[16]; // per-lane float view (used by operator[])
+      int i[16]; // per-lane 32-bit integer view of the same bytes
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+        
+    __forceinline vfloat() {} // leaves the lanes uninitialised
+    __forceinline vfloat(const vfloat16& t) { v = t; }
+    __forceinline vfloat16& operator =(const vfloat16& f) { v = f.v; return *this; }
+    /* wrap / unwrap the raw register; the __m256 and __m128 conversions return the low 8 / 4 lanes */
+    __forceinline vfloat(const __m512& t) { v = t; }
+    __forceinline operator __m512() const { return v; }
+    __forceinline operator __m256() const { return _mm512_castps512_ps256(v); }
+    __forceinline operator __m128() const { return _mm512_castps512_ps128(v); }
+    /* broadcast one scalar to all 16 lanes */
+    __forceinline vfloat(float f) {
+      v = _mm512_set1_ps(f);
+    }
+    /* repeating 4-element pattern; NOTE(review): _mm512_set4_ps lists the highest element first, so lane 0 presumably receives d - confirm intended order */
+    __forceinline vfloat(float a, float b, float c, float d) {
+      v = _mm512_set4_ps(a, b, c, d);
+    }
+    /* replicate one 4-wide vector into all four 128-bit lanes */
+    __forceinline vfloat(const vfloat4& i) {
+      v = _mm512_broadcast_f32x4(i);
+    }
+    /* concatenate four 4-wide vectors: a fills lanes 0-3, b 4-7, c 8-11, d 12-15 */
+    __forceinline vfloat(const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d) {
+      v = _mm512_castps128_ps512(a);
+      v = _mm512_insertf32x4(v, b, 1);
+      v = _mm512_insertf32x4(v, c, 2);
+      v = _mm512_insertf32x4(v, d, 3);
+    }
+    /* broadcast a everywhere, then overwrite the lanes selected by mask with the broadcast of b */
+    __forceinline vfloat(const vboolf16& mask, const vfloat4& a, const vfloat4& b) {
+      v = _mm512_broadcast_f32x4(a);
+      v = _mm512_mask_broadcast_f32x4(v,mask,b);
+    }
+    /* replicate one 8-wide vector into both 256-bit halves */
+    __forceinline vfloat(const vfloat8& i) {
+      v = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(i)));
+    }
+    /* concatenate two 8-wide vectors: a is the low half, b the high half */
+    __forceinline vfloat(const vfloat8& a, const vfloat8& b) {
+      v = _mm512_castps256_ps512(a);
+#if defined(__AVX512DQ__)
+      v = _mm512_insertf32x8(v, b, 1);
+#else
+      v = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(v), _mm256_castps_pd(b), 1));
+#endif
+    }
+
+    /* WARNING: due to f64x4 the mask is considered as an 8bit mask */
+    __forceinline vfloat(const vboolf16& mask, const vfloat8& a, const vfloat8& b) {
+      __m512d aa = _mm512_broadcast_f64x4(_mm256_castps_pd(a));
+      aa = _mm512_mask_broadcast_f64x4(aa,mask,_mm256_castps_pd(b));
+      v = _mm512_castpd_ps(aa);
+    }
+    /* explicit numeric conversions from signed / unsigned 32-bit integer lanes */
+    __forceinline explicit vfloat(const vint16& a) {
+      v = _mm512_cvtepi32_ps(a);
+    }
+
+    __forceinline explicit vfloat(const vuint16& a) {
+      v = _mm512_cvtepu32_ps(a);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vfloat(ZeroTy)   : v(_mm512_setzero_ps()) {}
+    __forceinline vfloat(OneTy)    : v(_mm512_set1_ps(1.0f)) {}
+    __forceinline vfloat(PosInfTy) : v(_mm512_set1_ps(pos_inf)) {}
+    __forceinline vfloat(NegInfTy) : v(_mm512_set1_ps(neg_inf)) {}
+    __forceinline vfloat(StepTy)   : v(_mm512_set_ps(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} // lane k holds float(k)
+    __forceinline vfloat(NaNTy)    : v(_mm512_set1_ps(nan)) {}
+    __forceinline vfloat(UndefinedTy) : v(_mm512_undefined_ps()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+    /* load/store use the aligned intrinsics (64-byte aligned pointer required), loadu/storeu the unaligned ones */
+    static __forceinline vfloat16 load (const void* ptr) { return _mm512_load_ps((float*)ptr);  }
+    static __forceinline vfloat16 loadu(const void* ptr) { return _mm512_loadu_ps((float*)ptr); }
+    /* masked loads: lanes not selected by mask are zero */
+    static __forceinline vfloat16 load (const vboolf16& mask, const void* ptr) { return _mm512_mask_load_ps (_mm512_setzero_ps(),mask,(float*)ptr); }
+    static __forceinline vfloat16 loadu(const vboolf16& mask, const void* ptr) { return _mm512_mask_loadu_ps(_mm512_setzero_ps(),mask,(float*)ptr); }
+
+    static __forceinline void store (void* ptr, const vfloat16& v) { _mm512_store_ps ((float*)ptr,v); }
+    static __forceinline void storeu(void* ptr, const vfloat16& v) { _mm512_storeu_ps((float*)ptr,v); }
+    /* masked stores only write the lanes selected by mask */
+    static __forceinline void store (const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_store_ps ((float*)ptr,mask,v); }
+    static __forceinline void storeu(const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_storeu_ps((float*)ptr,mask,v); }
+    /* non-temporal (streaming) store */
+    static __forceinline void store_nt(void* __restrict__ ptr, const vfloat16& a) {
+      _mm512_stream_ps((float*)ptr,a);
+    }
+    /* loads *f and replicates it into all 16 lanes */
+    static __forceinline vfloat16 broadcast(const float* f) {
+      return _mm512_set1_ps(*f);
+    }
+    /* compact: packs the mask-selected lanes of the last argument to the front; the remaining lanes come from the first vector argument */
+    static __forceinline vfloat16 compact(const vboolf16& mask, vfloat16 &v) {
+      return _mm512_mask_compress_ps(v, mask, v);
+    }
+    static __forceinline vfloat16 compact(const vboolf16& mask, vfloat16 &a, const vfloat16& b) {
+      return _mm512_mask_compress_ps(a, mask, b);
+    }
+    /* expand: spreads the leading lanes of a into the mask-selected positions; the other lanes come from b */
+    static __forceinline vfloat16 expand(const vboolf16& mask, const vfloat16& a, vfloat16& b) {
+      return _mm512_mask_expand_ps(b, mask, a);
+    }
+    /* reads consecutive floats from ptr into the mask-selected lanes; unselected lanes are zero */
+    static __forceinline vfloat16 loadu_compact(const vboolf16& mask, const void* ptr) {
+      return _mm512_mask_expandloadu_ps(_mm512_setzero_ps(), mask, (float*)ptr);
+    }
+    /* writes the mask-selected lanes of reg contiguously to addr */
+    static __forceinline void storeu_compact(const vboolf16& mask, float *addr, const vfloat16 reg) {
+      _mm512_mask_compressstoreu_ps(addr, mask, reg);
+    }
+    /* writes one float: lane 0 of the compressed register (mm512_cvtss_f32 presumably extracts lane 0 - confirm) */
+    static __forceinline void storeu_compact_single(const vboolf16& mask, float * addr, const vfloat16& reg) {
+      //_mm512_mask_compressstoreu_ps(addr,mask,reg);
+      *addr = mm512_cvtss_f32(_mm512_mask_compress_ps(reg, mask, reg));
+    }
+    /* gather/scatter: lane k accesses the float at byte offset index[k]*scale from ptr */
+    template<int scale = 4>
+    static __forceinline vfloat16 gather(const float* ptr, const vint16& index) {
+      return _mm512_i32gather_ps(index, ptr, scale);
+    }
+    /* masked gather: only selected lanes are read, unselected lanes are zero */
+    template<int scale = 4>
+    static __forceinline vfloat16 gather(const vboolf16& mask, const float* ptr, const vint16& index) {
+      vfloat16 r = zero;
+      return _mm512_mask_i32gather_ps(r, mask, index, ptr, scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(float* ptr, const vint16& index, const vfloat16& v) {
+      _mm512_i32scatter_ps(ptr, index, v, scale);
+    }
+    /* masked scatter: only the lanes selected by mask are written */
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf16& mask, float* ptr, const vint16& index, const vfloat16& v) {
+      _mm512_mask_i32scatter_ps(ptr, mask, index, v, scale);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    /* per-lane access through the float view of the union; the index is only checked by assert */
+    __forceinline       float& operator [](size_t index)       { assert(index < 16); return f[index]; }
+    __forceinline const float& operator [](size_t index) const { assert(index < 16); return f[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16 asFloat(const vint16&   a) { return _mm512_castsi512_ps(a); } // as*: bit casts, the lane bits are not converted
+  __forceinline vint16   asInt  (const vfloat16& a) { return _mm512_castps_si512(a); }
+  __forceinline vuint16  asUInt (const vfloat16& a) { return _mm512_castps_si512(a); }
+  /* to*: conversions through the explicit constructors (toFloat is the numeric int -> float conversion; toInt presumably its counterpart in vint16 - confirm) */
+  __forceinline vint16   toInt  (const vfloat16& a) { return vint16(a); }
+  __forceinline vfloat16 toFloat(const vint16&   a) { return vfloat16(a); }
+  /* unary minus is implemented as a multiplication by -1 */
+  __forceinline vfloat16 operator +(const vfloat16& a) { return a; }
+  __forceinline vfloat16 operator -(const vfloat16& a) { return _mm512_mul_ps(a,vfloat16(-1)); }
+  /* abs clears the sign bit of every lane, signmsk keeps only the sign bit */
+  __forceinline vfloat16 abs    (const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x7FFFFFFF))); }
+  __forceinline vfloat16 signmsk(const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x80000000))); }
+
+  __forceinline vfloat16 rcp(const vfloat16& a) { // approximate 1/a per lane
+#if defined(__AVX512ER__)
+    return _mm512_rcp28_ps(a);
+#else
+    const vfloat16 r = _mm512_rcp14_ps(a); // 14-bit estimate
+    return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vfloat16(2.0f))); // one Newton-Raphson step: r*(2 - r*a)
+#endif
+  }
+  /* sqr: a*a per lane; sqrt: full-precision square root per lane */
+  __forceinline vfloat16 sqr (const vfloat16& a) { return _mm512_mul_ps(a,a); }
+  __forceinline vfloat16 sqrt(const vfloat16& a) { return _mm512_sqrt_ps(a); }
+
+  /* Approximate 1/sqrt(a) per lane. Mirrors rcp(): the 28-bit estimate is used only when AVX512ER is available, otherwise rsqrt14 is refined by one Newton-Raphson step (1.5*r - 0.5*a*r^3), so plain AVX512F builds compile too. */
+  __forceinline vfloat16 rsqrt(const vfloat16& a) {
+#if defined(__AVX512ER__)
+    return _mm512_rsqrt28_ps(a);
+#else
+    const vfloat16 r = _mm512_rsqrt14_ps(a);
+    return _mm512_fmadd_ps(_mm512_set1_ps(1.5f), r,
+                           _mm512_mul_ps(_mm512_mul_ps(_mm512_mul_ps(a, _mm512_set1_ps(-0.5f)), r), _mm512_mul_ps(r, r)));
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16 operator +(const vfloat16& a, const vfloat16& b) { return _mm512_add_ps(a, b); }
+  __forceinline vfloat16 operator +(const vfloat16& a, float           b) { return _mm512_add_ps(a, vfloat16(b)); }
+  __forceinline vfloat16 operator +(float           a, const vfloat16& b) { return _mm512_add_ps(vfloat16(a), b); }
+  /* Lane-wise arithmetic; scalar operands are broadcast by the vfloat16(float) constructor. */
+  __forceinline vfloat16 operator -(const vfloat16& a, const vfloat16& b) { return _mm512_sub_ps(a, b); }
+  __forceinline vfloat16 operator -(const vfloat16& a, float           b) { return _mm512_sub_ps(a, vfloat16(b)); }
+  __forceinline vfloat16 operator -(float           a, const vfloat16& b) { return _mm512_sub_ps(vfloat16(a), b); }
+
+  __forceinline vfloat16 operator *(const vfloat16& a, const vfloat16& b) { return _mm512_mul_ps(a, b); }
+  __forceinline vfloat16 operator *(const vfloat16& a, float           b) { return _mm512_mul_ps(a, vfloat16(b)); }
+  __forceinline vfloat16 operator *(float           a, const vfloat16& b) { return _mm512_mul_ps(vfloat16(a), b); }
+
+  __forceinline vfloat16 operator /(const vfloat16& a, const vfloat16& b) { return _mm512_div_ps(a,b); }
+  __forceinline vfloat16 operator /(const vfloat16& a, float           b) { return _mm512_div_ps(a,vfloat16(b)); }
+  __forceinline vfloat16 operator /(float           a, const vfloat16& b) { return _mm512_div_ps(vfloat16(a),b); }
+  /* Bitwise operators act on the raw 32-bit lane patterns; XOR goes through the integer domain. */
+  __forceinline vfloat16 operator &(const vfloat16& a, const vfloat16& b) { return _mm512_and_ps(a,b); }
+  __forceinline vfloat16 operator |(const vfloat16& a, const vfloat16& b) { return _mm512_or_ps(a,b); }
+  __forceinline vfloat16 operator ^(const vfloat16& a, const vfloat16& b) {
+    const __m512i flipped = _mm512_xor_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b)); return _mm512_castsi512_ps(flipped);
+  }
+  
+  /* Lane-wise minimum. The scalar overloads broadcast through vfloat16(float) and reuse the
+   * vector/vector overload. */
+  __forceinline vfloat16 min(const vfloat16& a, const vfloat16& b) { return _mm512_min_ps(a,b); }
+  __forceinline vfloat16 min(const vfloat16& a, float b) {
+    return min(a, vfloat16(b));
+  }
+  __forceinline vfloat16 min(const float& a, const vfloat16& b) {
+    return min(vfloat16(a), b);
+  }
+
+  /* Lane-wise maximum. The scalar overloads broadcast through vfloat16(float) and reuse the
+   * vector/vector overload. */
+  __forceinline vfloat16 max(const vfloat16& a, const vfloat16& b) { return _mm512_max_ps(a,b); }
+  __forceinline vfloat16 max(const vfloat16& a, float b) {
+    return max(a, vfloat16(b));
+  }
+  __forceinline vfloat16 max(const float& a, const vfloat16& b) {
+    return max(vfloat16(a), b);
+  }
+
+  /* Masked binary operations:
+   *   result[k] = mask[k] ? op(a[k], b[k]) : c[k]
+   * i.e. lanes whose mask bit is clear are passed through from c.
+   * Function bodies end with '}' only; a trailing ';' would be an empty declaration (-Wextra-semi). */
+  __forceinline vfloat16 mask_add(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) { return _mm512_mask_add_ps(c,mask,a,b); }
+  __forceinline vfloat16 mask_min(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) { return _mm512_mask_min_ps(c,mask,a,b); }
+  __forceinline vfloat16 mask_max(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) { return _mm512_mask_max_ps(c,mask,a,b); }
+
+  /* Integer-compare variant of min: without AVX512ER the lane bit patterns are compared as signed
+   * 32-bit integers; AVX512ER (KNL) builds fall back to the float min. */
+  __forceinline vfloat16 mini(const vfloat16& a, const vfloat16& b) {
+#if !defined(__AVX512ER__) // SKX
+    const __m512i smaller = _mm512_min_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b));
+    return _mm512_castsi512_ps(smaller);
+#else // KNL
+    return min(a,b);
+#endif
+  }
+
+  /* Integer-compare variant of max: without AVX512ER the lane bit patterns are compared as signed
+   * 32-bit integers; AVX512ER (KNL) builds fall back to the float max. */
+  __forceinline vfloat16 maxi(const vfloat16& a, const vfloat16& b) {
+#if !defined(__AVX512ER__) // SKX
+    const __m512i larger = _mm512_max_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b));
+    return _mm512_castsi512_ps(larger);
+#else // KNL
+    return max(a,b);
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(a,b,c); } //  a*b + c (fused)
+  __forceinline vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a,b,c); } //  a*b - c (fused)
+  __forceinline vfloat16 nmadd(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(a,b,c); } // -(a*b) + c (fused)
+  __forceinline vfloat16 nmsub(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmsub_ps(a,b,c); } // -(a*b) - c (fused)
+  /* masked msub: lanes selected by mask receive a*b - c, the other lanes keep a */
+  __forceinline vfloat16 mask_msub(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_fmsub_ps(a,mask,b,c); }
+  /* variants with permuted operand roles */
+  __forceinline vfloat16 madd231 (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(c,b,a); } //  c*b + a
+  __forceinline vfloat16 msub213 (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a,b,c); } //  a*b - c
+  __forceinline vfloat16 msub231 (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(c,b,a); } //  c*b - a
+  __forceinline vfloat16 msubr231(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(c,b,a); } // -(c*b) + a
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Operators with rounding
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vfloat16 madd_round_down(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_round_ps(a,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }
+  __forceinline vfloat16 madd_round_up  (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_round_ps(a,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); }
+  /* *_round_down rounds towards -infinity, *_round_up towards +infinity; _MM_FROUND_NO_EXC suppresses floating-point exceptions */
+  __forceinline vfloat16 mul_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_mul_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }
+  __forceinline vfloat16 mul_round_up  (const vfloat16& a, const vfloat16& b) { return _mm512_mul_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); }
+
+  __forceinline vfloat16 add_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_add_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }
+  __forceinline vfloat16 add_round_up  (const vfloat16& a, const vfloat16& b) { return _mm512_add_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); }
+
+  __forceinline vfloat16 sub_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_sub_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }
+  __forceinline vfloat16 sub_round_up  (const vfloat16& a, const vfloat16& b) { return _mm512_sub_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); }
+
+  __forceinline vfloat16 div_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_div_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }
+  __forceinline vfloat16 div_round_up  (const vfloat16& a, const vfloat16& b) { return _mm512_div_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); }
+  /* masked msub: lanes selected by mask receive a*b - c, the other lanes keep a */
+  __forceinline vfloat16 mask_msub_round_down(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_fmsub_round_ps(a,mask,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }
+  __forceinline vfloat16 mask_msub_round_up  (const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_fmsub_round_ps(a,mask,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); }
+  /* masked mul: lanes selected by mask receive b*c, the other lanes keep a */
+  __forceinline vfloat16 mask_mul_round_down(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_mul_round_ps(a,mask,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }
+  __forceinline vfloat16 mask_mul_round_up  (const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_mul_round_ps(a,mask,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); }
+  /* masked sub: lanes selected by mask receive b - c, the other lanes keep a */
+  __forceinline vfloat16 mask_sub_round_down(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_sub_round_ps(a,mask,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }
+  __forceinline vfloat16 mask_sub_round_up  (const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_sub_round_ps(a,mask,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); }
+
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16& operator +=(vfloat16& a, const vfloat16& b) { a = a + b; return a; }
+  __forceinline vfloat16& operator +=(vfloat16& a, float           b) { a = a + b; return a; }
+  /* Compound assignments: evaluate the matching binary operator, store the result in a and return a. */
+  __forceinline vfloat16& operator -=(vfloat16& a, const vfloat16& b) { a = a - b; return a; }
+  __forceinline vfloat16& operator -=(vfloat16& a, float           b) { a = a - b; return a; }
+  
+  __forceinline vfloat16& operator *=(vfloat16& a, const vfloat16& b) { a = a * b; return a; }
+  __forceinline vfloat16& operator *=(vfloat16& a, float           b) { a = a * b; return a; }
+
+  __forceinline vfloat16& operator /=(vfloat16& a, const vfloat16& b) { a = a / b; return a; }
+  __forceinline vfloat16& operator /=(vfloat16& a, float           b) { a = a / b; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf16 operator ==(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf16 operator ==(const vfloat16& a, float           b) { return _mm512_cmp_ps_mask(a,vfloat16(b),_MM_CMPINT_EQ); }
+  __forceinline vboolf16 operator ==(float           a, const vfloat16& b) { return _mm512_cmp_ps_mask(vfloat16(a),b,_MM_CMPINT_EQ); }
+  /* All comparisons yield a vboolf16 lane mask; scalar operands are broadcast by the vfloat16(float) constructor. */
+  __forceinline vboolf16 operator !=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 operator !=(const vfloat16& a, float           b) { return _mm512_cmp_ps_mask(a,vfloat16(b),_MM_CMPINT_NE); }
+  __forceinline vboolf16 operator !=(float           a, const vfloat16& b) { return _mm512_cmp_ps_mask(vfloat16(a),b,_MM_CMPINT_NE); }
+
+  __forceinline vboolf16 operator < (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 operator < (const vfloat16& a, float           b) { return _mm512_cmp_ps_mask(a,vfloat16(b),_MM_CMPINT_LT); }
+  __forceinline vboolf16 operator < (float           a, const vfloat16& b) { return _mm512_cmp_ps_mask(vfloat16(a),b,_MM_CMPINT_LT); }
+  /* NOTE(review): _MM_CMPINT_* is the integer predicate enum; for GE/GT it appears to select the unordered NLT/NLE float predicates (true for NaN lanes) - confirm. */
+  __forceinline vboolf16 operator >=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 operator >=(const vfloat16& a, float           b) { return _mm512_cmp_ps_mask(a,vfloat16(b),_MM_CMPINT_GE); }
+  __forceinline vboolf16 operator >=(float           a, const vfloat16& b) { return _mm512_cmp_ps_mask(vfloat16(a),b,_MM_CMPINT_GE); }
+
+  __forceinline vboolf16 operator > (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 operator > (const vfloat16& a, float           b) { return _mm512_cmp_ps_mask(a,vfloat16(b),_MM_CMPINT_GT); }
+  __forceinline vboolf16 operator > (float           a, const vfloat16& b) { return _mm512_cmp_ps_mask(vfloat16(a),b,_MM_CMPINT_GT); }
+
+  __forceinline vboolf16 operator <=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 operator <=(const vfloat16& a, float           b) { return _mm512_cmp_ps_mask(a,vfloat16(b),_MM_CMPINT_LE); }
+  __forceinline vboolf16 operator <=(float           a, const vfloat16& b) { return _mm512_cmp_ps_mask(vfloat16(a),b,_MM_CMPINT_LE); }
+
+  __forceinline vboolf16 eq(const vfloat16& a, const vfloat16& b) { return a == b; } // named forms of the comparison operators
+  __forceinline vboolf16 ne(const vfloat16& a, const vfloat16& b) { return a != b; }
+  __forceinline vboolf16 lt(const vfloat16& a, const vfloat16& b) { return a <  b; }
+  __forceinline vboolf16 ge(const vfloat16& a, const vfloat16& b) { return a >= b; }
+  __forceinline vboolf16 gt(const vfloat16& a, const vfloat16& b) { return a >  b; }
+  __forceinline vboolf16 le(const vfloat16& a, const vfloat16& b) { return a <= b; }
+
+  __forceinline vboolf16 eq(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { const vboolf16 r = _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_EQ); return r; } // masked compare: mask is handed to the intrinsic as its write-mask
+  __forceinline vboolf16 ne(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { const vboolf16 r = _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_NE); return r; }
+  __forceinline vboolf16 lt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { const vboolf16 r = _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LT); return r; }
+  __forceinline vboolf16 ge(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { const vboolf16 r = _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GE); return r; }
+  __forceinline vboolf16 gt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { const vboolf16 r = _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GT); return r; }
+  __forceinline vboolf16 le(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { const vboolf16 r = _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LE); return r; }
+  
+  __forceinline vfloat16 select(const vboolf16& s, const vfloat16& t, const vfloat16& f) { // per-lane choice between t and f, steered by mask s
+    return _mm512_mask_blend_ps(s, f, t); // lanes whose bit in s is set come from t, all other lanes from f
+  }
+
+  __forceinline vfloat16 lerp(const vfloat16& a, const vfloat16& b, const vfloat16& t) { // interpolation between a and b with weight t
+    return madd(t,b-a,a); // presumably t*(b-a)+a (a at t=0, b at t=1); madd is defined outside this section - confirm
+  }
+
+  __forceinline void xchg(vboolf16 m, vfloat16& a, vfloat16& b)
+  {
+    const vfloat16 old_a = a;  // snapshot of a, needed because a is overwritten first
+    a = select(m,b,old_a);     // lanes selected by m receive b, the rest keep a
+    b = select(m,old_a,b);     // the same lanes of b receive the saved a
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vfloat16 floor(const vfloat16& a) { // forwards to the _mm512_floor_ps intrinsic
+    return _mm512_floor_ps(a);
+  }
+  __forceinline vfloat16 ceil (const vfloat16& a) { // forwards to the _mm512_ceil_ps intrinsic
+    return _mm512_ceil_ps(a);
+  }
+  __forceinline vfloat16 round (const vfloat16& a) { // round-to-nearest mode with exceptions suppressed (_MM_FROUND_NO_EXC)
+    return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  }
+  __forceinline vint16 floori (const vfloat16& a) { // float -> int32 conversion using the toward-negative-infinity rounding mode
+    return _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16 unpacklo(const vfloat16& a, const vfloat16& b) { return _mm512_unpacklo_ps(a, b); } // per 128-bit group: a0,b0,a1,b1
+  __forceinline vfloat16 unpackhi(const vfloat16& a, const vfloat16& b) { return _mm512_unpackhi_ps(a, b); } // per 128-bit group: a2,b2,a3,b3
+
+  template<int i>
+  __forceinline vfloat16 shuffle(const vfloat16& v) { // replicates element i of every group of four floats across that group
+    return _mm512_permute_ps(v, _MM_SHUFFLE(i, i, i, i));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat16 shuffle(const vfloat16& v) { // same pattern in every group of four floats: out[0..3] = v[i0], v[i1], v[i2], v[i3]
+    return _mm512_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); // _MM_SHUFFLE takes the selectors highest element first
+  }
+
+  template<int i>
+  __forceinline vfloat16 shuffle4(const vfloat16& v) { // copies the i-th 128-bit group of v into all four 128-bit groups
+    return _mm512_shuffle_f32x4(v, v ,_MM_SHUFFLE(i, i, i, i));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat16 shuffle4(const vfloat16& v) { // rearranges whole 128-bit groups: output group k is input group i_k
+    return _mm512_shuffle_f32x4(v, v, _MM_SHUFFLE(i3, i2, i1, i0)); // both sources are v, so every selector indexes v
+  }
+
+  __forceinline vfloat16 interleave_even(const vfloat16& a, const vfloat16& b) { // result: a0,b0,a2,b2,... (even-indexed floats of a and b)
+    return _mm512_castsi512_ps(_mm512_mask_shuffle_epi32(_mm512_castps_si512(a), mm512_int2mask(0xaaaa), _mm512_castps_si512(b), (_MM_PERM_ENUM)0xb1)); // 0xb1 swaps neighbours in b; mask 0xaaaa writes them to the odd lanes of a
+  }
+
+  __forceinline vfloat16 interleave_odd(const vfloat16& a, const vfloat16& b) { // result: a1,b1,a3,b3,... (odd-indexed floats of a and b)
+    return _mm512_castsi512_ps(_mm512_mask_shuffle_epi32(_mm512_castps_si512(b), mm512_int2mask(0x5555), _mm512_castps_si512(a), (_MM_PERM_ENUM)0xb1)); // 0xb1 swaps neighbours in a; mask 0x5555 writes them to the even lanes of b
+  }
+
+  __forceinline vfloat16 interleave2_even(const vfloat16& a, const vfloat16& b) { // like interleave_even, but on 64-bit units (pairs of floats)
+    /* mask should be 8-bit but is 16-bit to reuse for interleave_even */
+    return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a), mm512_int2mask(0xaaaa), _mm512_castps_si512(b), (_MM_PERM_ENUM)0xb1));
+  }
+
+  __forceinline vfloat16 interleave2_odd(const vfloat16& a, const vfloat16& b) { // like interleave_odd, but on 64-bit units (pairs of floats)
+    /* mask should be 8-bit but is 16-bit to reuse for interleave_odd */
+    return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b), mm512_int2mask(0x5555), _mm512_castps_si512(a), (_MM_PERM_ENUM)0xb1));
+  }
+
+  __forceinline vfloat16 interleave4_even(const vfloat16& a, const vfloat16& b) { // on 128-bit groups: A0,B0,A2,B2
+    return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a), mm512_int2mask(0xcc), _mm512_castps_si512(b), (_MM_PERM_ENUM)0x4e)); // 0x4e swaps the 128-bit halves of each 256-bit half of b
+  }
+
+  __forceinline vfloat16 interleave4_odd(const vfloat16& a, const vfloat16& b) { // on 128-bit groups: A1,B1,A3,B3
+    return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b), mm512_int2mask(0x33), _mm512_castps_si512(a), (_MM_PERM_ENUM)0x4e)); // 0x4e swaps the 128-bit halves of each 256-bit half of a
+  }
+
+  __forceinline vfloat16 permute(vfloat16 v, __m512i index) { // full 16-lane permutation: result lane k = v[index lane k]
+    return _mm512_castsi512_ps(_mm512_permutexvar_epi32(index, _mm512_castps_si512(v))); // lanes are moved as raw 32-bit patterns
+  }
+
+  __forceinline vfloat16 reverse(const vfloat16& v) { // lane k of the result is lane 15-k of v
+    return permute(v,_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0));
+  }
+
+  template<int i> // i: number of 32-bit lanes to shift by
+  __forceinline vfloat16 align_shift_right(const vfloat16& a, const vfloat16& b) { // shifts the 32-lane sequence [b (low), a (high)] right by i lanes, keeps the low 16
+    return _mm512_castsi512_ps(_mm512_alignr_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b),i)); // lane k = b[k+i] while k+i < 16, else a[k+i-16]
+  }
+
+  template<int i> // i: number of 32-bit lanes to shift by
+  __forceinline vfloat16 mask_align_shift_right(const vboolf16& mask, vfloat16& c, const vfloat16& a, const vfloat16& b) { // masked variant: c supplies the lanes whose mask bit is clear
+    return _mm512_castsi512_ps(_mm512_mask_alignr_epi32(_mm512_castps_si512(c),mask,_mm512_castps_si512(a),_mm512_castps_si512(b),i)); // c is only read here, never written
+  }
+ 
+  __forceinline vfloat16 shift_left_1(const vfloat16& a) { // result: 0, a0, a1, ..., a14 (every element moves one lane up)
+    vfloat16 z = zero;
+    return mask_align_shift_right<15>(0xfffe,z,a,a); // rotate a up by one lane; mask 0xfffe makes lane 0 come from z instead of a15
+  }
+
+  __forceinline vfloat16 shift_right_1(const vfloat16& x) { // result: x1, x2, ..., x15, 0 (every element moves one lane down)
+    return align_shift_right<1>(zero,x);
+  }
+
+  __forceinline float toScalar(const vfloat16& v) { return mm512_cvtss_f32(v); } // presumably returns lane 0; mm512_cvtss_f32 is a helper defined outside this section
+
+
+  template<int i> __forceinline vfloat16 insert4(const vfloat16& a, const vfloat4& b) { return _mm512_insertf32x4(a, b, i); } // copy of a with its i-th 128-bit group replaced by b
+
+  template<int N, int i> // N: width of the extracted vector (4 or 8), i: which N-wide group of v to take
+  vfloat<N> extractN(const vfloat16& v); // only the explicit specializations below are defined
+
+  template<> __forceinline vfloat4 extractN<4,0>(const vfloat16& v) { return _mm512_castps512_ps128(v);    } // group 0 needs only a cast
+  template<> __forceinline vfloat4 extractN<4,1>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 1); }
+  template<> __forceinline vfloat4 extractN<4,2>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 2); }
+  template<> __forceinline vfloat4 extractN<4,3>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 3); }
+
+  template<> __forceinline vfloat8 extractN<8,0>(const vfloat16& v) { return _mm512_castps512_ps256(v);    } // low 256 bits: cast only
+  template<> __forceinline vfloat8 extractN<8,1>(const vfloat16& v) { return _mm512_extractf32x8_ps(v, 1); } // high 256 bits
+
+  template<int i> __forceinline vfloat4 extract4   (const vfloat16& v) { return _mm512_extractf32x4_ps(v, i); } // i-th 128-bit group of v
+  template<>      __forceinline vfloat4 extract4<0>(const vfloat16& v) { return _mm512_castps512_ps128(v);    } // group 0 specialised to a plain cast
+
+  template<int i> __forceinline vfloat8 extract8   (const vfloat16& v) { return _mm512_extractf32x8_ps(v, i); } // i-th 256-bit half of v
+  template<>      __forceinline vfloat8 extract8<0>(const vfloat16& v) { return _mm512_castps512_ps256(v);    } // half 0 specialised to a plain cast
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Transpose
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3, // input rows
+                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3) // outputs: a 4x4 transpose done independently in each group of four lanes
+  {
+#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+    vfloat16 a0a1_c0c1 = interleave_even(r0, r1); // even elements of rows 0 and 1, interleaved
+    vfloat16 a2a3_c2c3 = interleave_even(r2, r3); // even elements of rows 2 and 3, interleaved
+    vfloat16 b0b1_d0d1 = interleave_odd (r0, r1); // odd elements of rows 0 and 1, interleaved
+    vfloat16 b2b3_d2d3 = interleave_odd (r2, r3); // odd elements of rows 2 and 3, interleaved
+
+    c0 = interleave2_even(a0a1_c0c1, a2a3_c2c3); // per group: element 0 of r0,r1,r2,r3
+    c1 = interleave2_even(b0b1_d0d1, b2b3_d2d3); // per group: element 1 of r0,r1,r2,r3
+    c2 = interleave2_odd (a0a1_c0c1, a2a3_c2c3); // per group: element 2 of r0,r1,r2,r3
+    c3 = interleave2_odd (b0b1_d0d1, b2b3_d2d3); // per group: element 3 of r0,r1,r2,r3
+#else
+    vfloat16 a0a2_b0b2 = unpacklo(r0, r2); // per group: r0[0],r2[0],r0[1],r2[1]
+    vfloat16 c0c2_d0d2 = unpackhi(r0, r2); // per group: r0[2],r2[2],r0[3],r2[3]
+    vfloat16 a1a3_b1b3 = unpacklo(r1, r3); // per group: r1[0],r3[0],r1[1],r3[1]
+    vfloat16 c1c3_d1d3 = unpackhi(r1, r3); // per group: r1[2],r3[2],r1[3],r3[3]
+
+    c0 = unpacklo(a0a2_b0b2, a1a3_b1b3); // per group: element 0 of r0,r1,r2,r3
+    c1 = unpackhi(a0a2_b0b2, a1a3_b1b3); // per group: element 1 of r0,r1,r2,r3
+    c2 = unpacklo(c0c2_d0d2, c1c3_d1d3); // per group: element 2 of r0,r1,r2,r3
+    c3 = unpackhi(c0c2_d0d2, c1c3_d1d3); // per group: element 3 of r0,r1,r2,r3
+#endif
+  }
+
+  __forceinline void transpose(const vfloat4& r0,  const vfloat4& r1,  const vfloat4& r2,  const vfloat4& r3, // sixteen 4-wide input rows
+                               const vfloat4& r4,  const vfloat4& r5,  const vfloat4& r6,  const vfloat4& r7,
+                               const vfloat4& r8,  const vfloat4& r9,  const vfloat4& r10, const vfloat4& r11,
+                               const vfloat4& r12, const vfloat4& r13, const vfloat4& r14, const vfloat4& r15,
+                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3) // four 16-wide outputs
+  {
+    return transpose(vfloat16(r0, r4, r8, r12), vfloat16(r1, r5, r9, r13), vfloat16(r2, r6, r10, r14), vfloat16(r3, r7, r11, r15), // rows k, k+4, k+8, k+12 are packed into one vfloat16 (4-argument constructor defined elsewhere)
+                     c0, c1, c2, c3); // then handed to the 4-row vfloat16 transpose
+  }
+
+  __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3, // eight input rows
+                               const vfloat16& r4, const vfloat16& r5, const vfloat16& r6, const vfloat16& r7,
+                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3, // eight outputs: an 8x8 transpose within each 256-bit half
+                               vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7)
+  {
+    vfloat16 a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3; // 4x4-transposed rows 0..3
+    transpose(r0, r1, r2, r3, a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3);
+
+    vfloat16 a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7; // 4x4-transposed rows 4..7
+    transpose(r4, r5, r6, r7, a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7);
+
+    c0 = interleave4_even(a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7); // even 128-bit groups of both partial results -> outputs 0..3
+    c1 = interleave4_even(b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7);
+    c2 = interleave4_even(c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7);
+    c3 = interleave4_even(d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7);
+    c4 = interleave4_odd (a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7); // odd 128-bit groups of both partial results -> outputs 4..7
+    c5 = interleave4_odd (b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7);
+    c6 = interleave4_odd (c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7);
+    c7 = interleave4_odd (d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7);
+  }
+
+  __forceinline void transpose(const vfloat8& r0,  const vfloat8& r1,  const vfloat8& r2,  const vfloat8& r3, // sixteen 8-wide input rows
+                               const vfloat8& r4,  const vfloat8& r5,  const vfloat8& r6,  const vfloat8& r7,
+                               const vfloat8& r8,  const vfloat8& r9,  const vfloat8& r10, const vfloat8& r11,
+                               const vfloat8& r12, const vfloat8& r13, const vfloat8& r14, const vfloat8& r15,
+                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3, // eight 16-wide outputs
+                               vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7)
+  {
+    return transpose(vfloat16(r0, r8),  vfloat16(r1, r9),  vfloat16(r2, r10), vfloat16(r3, r11), // rows k and k+8 are packed into one vfloat16 (2-argument constructor defined elsewhere)
+                     vfloat16(r4, r12), vfloat16(r5, r13), vfloat16(r6, r14), vfloat16(r7, r15),
+                     c0, c1, c2, c3, c4, c5, c6, c7); // then handed to the 8-row vfloat16 transpose
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16 vreduce_add2(vfloat16 x) { const vfloat16 swapped = shuffle<1,0,3,2>(x); return x + swapped; } // combine neighbouring pairs
+  __forceinline vfloat16 vreduce_add4(vfloat16 x) { const vfloat16 s = vreduce_add2(x); return s + shuffle<2,3,0,1>(s); } // then the two pairs of each group of four
+  __forceinline vfloat16 vreduce_add8(vfloat16 x) { const vfloat16 s = vreduce_add4(x); return s + shuffle4<1,0,3,2>(s); } // then neighbouring 128-bit groups
+  __forceinline vfloat16 vreduce_add (vfloat16 x) { const vfloat16 s = vreduce_add8(x); return s + shuffle4<2,3,0,1>(s); } // then the two 256-bit halves
+  // vreduce_min*: identical reduction tree, combining with min() instead of +.
+  __forceinline vfloat16 vreduce_min2(vfloat16 x) { const vfloat16 swapped = shuffle<1,0,3,2>(x); return min(x, swapped); }
+  __forceinline vfloat16 vreduce_min4(vfloat16 x) { const vfloat16 s = vreduce_min2(x); return min(s, shuffle<2,3,0,1>(s)); }
+  __forceinline vfloat16 vreduce_min8(vfloat16 x) { const vfloat16 s = vreduce_min4(x); return min(s, shuffle4<1,0,3,2>(s)); }
+  __forceinline vfloat16 vreduce_min (vfloat16 x) { const vfloat16 s = vreduce_min8(x); return min(s, shuffle4<2,3,0,1>(s)); }
+  // vreduce_max*: identical reduction tree, combining with max() instead of +.
+  __forceinline vfloat16 vreduce_max2(vfloat16 x) { const vfloat16 swapped = shuffle<1,0,3,2>(x); return max(x, swapped); }
+  __forceinline vfloat16 vreduce_max4(vfloat16 x) { const vfloat16 s = vreduce_max2(x); return max(s, shuffle<2,3,0,1>(s)); }
+  __forceinline vfloat16 vreduce_max8(vfloat16 x) { const vfloat16 s = vreduce_max4(x); return max(s, shuffle4<1,0,3,2>(s)); }
+  __forceinline vfloat16 vreduce_max (vfloat16 x) { const vfloat16 s = vreduce_max8(x); return max(s, shuffle4<2,3,0,1>(s)); }
+
+  __forceinline float reduce_add(const vfloat16& v) { const vfloat16 total   = vreduce_add(v); return toScalar(total);   } // scalar form of vreduce_add
+  __forceinline float reduce_min(const vfloat16& v) { const vfloat16 lowest  = vreduce_min(v); return toScalar(lowest);  } // scalar form of vreduce_min
+  __forceinline float reduce_max(const vfloat16& v) { const vfloat16 highest = vreduce_max(v); return toScalar(highest); } // scalar form of vreduce_max
+ 
+  __forceinline size_t select_min(const vfloat16& v) { // lane index of the minimum; bsf (defined elsewhere) presumably picks the lowest set bit, i.e. the first such lane
+    return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_min(v)),_MM_CMPINT_EQ))); // equality is tested on the raw 32-bit patterns, not as floats
+  }
+
+  __forceinline size_t select_max(const vfloat16& v) { // lane index of the maximum, found the same way as in select_min
+    return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_max(v)),_MM_CMPINT_EQ))); // bitwise equality against the broadcast maximum
+  }
+
+  __forceinline size_t select_min(const vboolf16& valid, const vfloat16& v) // lane index of the minimum among the lanes enabled in valid
+  { 
+    const vfloat16 a = select(valid,v,vfloat16(pos_inf)); // disabled lanes become +inf so they cannot undercut an enabled lane
+    const vbool16 valid_min = valid & (a == vreduce_min(a)); // enabled lanes that equal the minimum
+    return bsf(movemask(any(valid_min) ? valid_min : valid)); // if no lane compares equal, fall back to the valid mask itself
+  }
+
+  __forceinline size_t select_max(const vboolf16& valid, const vfloat16& v) // lane index of the maximum among the lanes enabled in valid
+  { 
+    const vfloat16 a = select(valid,v,vfloat16(neg_inf)); // disabled lanes become -inf so they cannot exceed an enabled lane
+    const vbool16 valid_max = valid & (a == vreduce_max(a)); // enabled lanes that equal the maximum
+    return bsf(movemask(any(valid_max) ? valid_max : valid)); // if no lane compares equal, fall back to the valid mask itself
+  }
+  
+  __forceinline vfloat16 prefix_sum(const vfloat16& a) // inclusive prefix sum: lane k ends up holding a[0] + ... + a[k]
+  {
+    const vfloat16 z(zero); // filler for the lanes vacated by each shift
+    vfloat16 v = a;
+    v = v + align_shift_right<16-1>(v,z); // add v moved up by 1 lane
+    v = v + align_shift_right<16-2>(v,z); // add v moved up by 2 lanes
+    v = v + align_shift_right<16-4>(v,z); // add v moved up by 4 lanes
+    v = v + align_shift_right<16-8>(v,z); // add v moved up by 8 lanes
+    return v;  
+  }
+
+  __forceinline vfloat16 reverse_prefix_sum(const vfloat16& a) // inclusive suffix sum: lane k ends up holding a[k] + ... + a[15]
+  {
+    const vfloat16 z(zero); // filler for the lanes vacated by each shift
+    vfloat16 v = a;
+    v = v + align_shift_right<1>(z,v); // add v moved down by 1 lane
+    v = v + align_shift_right<2>(z,v); // add v moved down by 2 lanes
+    v = v + align_shift_right<4>(z,v); // add v moved down by 4 lanes
+    v = v + align_shift_right<8>(z,v); // add v moved down by 8 lanes
+    return v;  
+  }
+
+  __forceinline vfloat16 prefix_min(const vfloat16& a) // running minimum: lane k ends up holding min(a[0..k])
+  {
+    const vfloat16 z(pos_inf); // +inf filler for the vacated lanes
+    vfloat16 v = a;
+    v = min(v,align_shift_right<16-1>(v,z)); // combine with v moved up by 1 lane
+    v = min(v,align_shift_right<16-2>(v,z)); // ... by 2 lanes
+    v = min(v,align_shift_right<16-4>(v,z)); // ... by 4 lanes
+    v = min(v,align_shift_right<16-8>(v,z)); // ... by 8 lanes
+    return v;  
+  }
+
+  __forceinline vfloat16 prefix_max(const vfloat16& a) // running maximum: lane k ends up holding max(a[0..k])
+  {
+    const vfloat16 z(neg_inf); // -inf filler for the vacated lanes
+    vfloat16 v = a;
+    v = max(v,align_shift_right<16-1>(v,z)); // combine with v moved up by 1 lane
+    v = max(v,align_shift_right<16-2>(v,z)); // ... by 2 lanes
+    v = max(v,align_shift_right<16-4>(v,z)); // ... by 4 lanes
+    v = max(v,align_shift_right<16-8>(v,z)); // ... by 8 lanes
+    return v;  
+  }
+
+
+  __forceinline vfloat16 reverse_prefix_min(const vfloat16& a) // running minimum from the top: lane k ends up holding min(a[k..15])
+  {
+    const vfloat16 z(pos_inf); // +inf filler for the vacated lanes
+    vfloat16 v = a;
+    v = min(v,align_shift_right<1>(z,v)); // combine with v moved down by 1 lane
+    v = min(v,align_shift_right<2>(z,v)); // ... by 2 lanes
+    v = min(v,align_shift_right<4>(z,v)); // ... by 4 lanes
+    v = min(v,align_shift_right<8>(z,v)); // ... by 8 lanes
+    return v;  
+  }
+
+  __forceinline vfloat16 reverse_prefix_max(const vfloat16& a) // running maximum from the top: lane k ends up holding max(a[k..15])
+  {
+    const vfloat16 z(neg_inf); // -inf filler for the vacated lanes
+    vfloat16 v = a;
+    v = max(v,align_shift_right<1>(z,v)); // combine with v moved down by 1 lane
+    v = max(v,align_shift_right<2>(z,v)); // ... by 2 lanes
+    v = max(v,align_shift_right<4>(z,v)); // ... by 4 lanes
+    v = max(v,align_shift_right<8>(z,v)); // ... by 8 lanes
+    return v;  
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Memory load and store operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16 loadAOS4to16f(const float& x, const float& y, const float& z) // builds the repeating pattern x,y,z,0 in all four groups of four lanes
+  {
+    vfloat16 f = zero; // lanes 3,7,11,15 keep this zero
+    f = select(0x1111,vfloat16::broadcast(&x),f); // lanes 0,4,8,12  <- x
+    f = select(0x2222,vfloat16::broadcast(&y),f); // lanes 1,5,9,13  <- y
+    f = select(0x4444,vfloat16::broadcast(&z),f); // lanes 2,6,10,14 <- z
+    return f;
+  }
+
+  __forceinline vfloat16 loadAOS4to16f(unsigned int index, // lane of x, y and z to read; not range-checked here
+                                       const vfloat16& x,
+                                       const vfloat16& y,
+                                       const vfloat16& z)
+  {
+    vfloat16 f = zero; // lanes 3,7,11,15 keep this zero
+    f = select(0x1111,vfloat16::broadcast((float*)&x + index),f); // lanes 0,4,8,12  <- x[index]
+    f = select(0x2222,vfloat16::broadcast((float*)&y + index),f); // lanes 1,5,9,13  <- y[index]
+    f = select(0x4444,vfloat16::broadcast((float*)&z + index),f); // lanes 2,6,10,14 <- z[index]
+    return f;
+  }
+
+  __forceinline vfloat16 loadAOS4to16f(unsigned int index, // lane of x, y and z to read; not range-checked here
+                                       const vfloat16& x,
+                                       const vfloat16& y,
+                                       const vfloat16& z,
+                                       const vfloat16& fill) // supplies the lanes 3,7,11,15 that x/y/z do not overwrite
+  {
+    vfloat16 f = fill;
+    f = select(0x1111,vfloat16::broadcast((float*)&x + index),f); // lanes 0,4,8,12  <- x[index]
+    f = select(0x2222,vfloat16::broadcast((float*)&y + index),f); // lanes 1,5,9,13  <- y[index]
+    f = select(0x4444,vfloat16::broadcast((float*)&z + index),f); // lanes 2,6,10,14 <- z[index]
+    return f;
+  }
+
+  __forceinline vfloat16 rcp_safe(const vfloat16& a) { // reciprocal that never feeds an exact zero to rcp
+    return rcp(select(a != vfloat16(zero), a, vfloat16(min_rcp_input))); // lanes equal to zero are replaced by min_rcp_input (constant defined elsewhere) first
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat16& v)
+  {
+    // Prints all 16 lanes as "<v0, v1, ..., v15>": the first lane is preceded by "<", every later lane by ", ".
+    for (int lane=0; lane<16; lane++) cout << (lane == 0 ? "<" : ", ") << v[lane];
+    cout << ">";
+    return cout;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h b/thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h
new file mode 100644
index 0000000000..5732c0fbc8
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h
@@ -0,0 +1,925 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 4-wide SSE float type (the __aarch64__ branches below provide NEON-backed variants of some members) */
+  template<>
+  struct vfloat<4>
+  {
+    ALIGNED_STRUCT_(16); // project macro defined elsewhere; presumably provides 16-byte aligned allocation - confirm
+
+    typedef vboolf4 Bool;
+    typedef vint4   Int;
+    typedef vfloat4 Float;
+
+    enum  { size = 4 };                        // number of SIMD elements
+    union { __m128 v; float f[4]; int i[4]; }; // data: one 128-bit register, also addressable as 4 floats or 4 ints
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vfloat() {} // leaves all lanes uninitialized
+    __forceinline vfloat(const vfloat4& other) { v = other.v; }
+    __forceinline vfloat4& operator =(const vfloat4& other) { v = other.v; return *this; }
+
+    __forceinline vfloat(__m128 a) : v(a) {} // implicit wrap of a raw register
+    __forceinline operator const __m128&() const { return v; } // implicit unwrap, so a vfloat4 can be passed straight to intrinsics
+    __forceinline operator       __m128&()       { return v; }
+
+    __forceinline vfloat(float a) : v(_mm_set1_ps(a)) {} // broadcasts a to all four lanes
+    __forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {} // a -> lane 0 ... d -> lane 3 (_mm_set_ps takes the highest lane first)
+
+    __forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {} // numeric int32 -> float conversion
+#if defined(__aarch64__)
+    __forceinline explicit vfloat(const vuint4& x) { // NEON offers a direct unsigned -> float conversion
+        v = vcvtq_f32_u32(vreinterpretq_u32_s32(x.v));
+    }
+#else
+    __forceinline explicit vfloat(const vuint4& x) { // only the signed conversion is used here, so the top bit is handled separately
+      const __m128i a   = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF)); // low 31 bits of every lane
+      const __m128i b   = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31
+      const __m128  af  = _mm_cvtepi32_ps(a); // low 31 bits converted as a non-negative int
+      const __m128  bf  = _mm_castsi128_ps(b); // 2^31 as a float where the top bit was set, 0.0f elsewhere
+      v  = _mm_add_ps(af,bf);
+    }
+#endif
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vfloat(ZeroTy)   : v(_mm_setzero_ps()) {}
+    __forceinline vfloat(OneTy)    : v(_mm_set1_ps(1.0f)) {}
+    __forceinline vfloat(PosInfTy) : v(_mm_set1_ps(pos_inf)) {}
+    __forceinline vfloat(NegInfTy) : v(_mm_set1_ps(neg_inf)) {}
+    __forceinline vfloat(StepTy)   : v(_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)) {} // lanes 0..3 hold 0, 1, 2, 3
+    __forceinline vfloat(NaNTy)    : v(_mm_set1_ps(nan)) {}
+    __forceinline vfloat(UndefinedTy) : v(_mm_undefined_ps()) {} // contents unspecified
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vfloat4 load (const void* a) { return _mm_load_ps((float*)a); } // aligned variant (_mm_load_ps expects a 16-byte aligned address)
+    static __forceinline vfloat4 loadu(const void* a) { return _mm_loadu_ps((float*)a); } // unaligned variant
+
+    static __forceinline void store (void* ptr, const vfloat4& v) { _mm_store_ps((float*)ptr,v); } // aligned variant
+    static __forceinline void storeu(void* ptr, const vfloat4& v) { _mm_storeu_ps((float*)ptr,v); } // unaligned variant
+
+#if defined(__AVX512VL__)
+
+    static __forceinline vfloat4 compact(const vboolf4& mask, vfloat4 &v) { // packs the mask-selected lanes of v into the low lanes; remaining lanes keep v's values
+      return _mm_mask_compress_ps(v, mask, v);
+    }
+    static __forceinline vfloat4 compact(const vboolf4& mask, vfloat4 &a, const vfloat4& b) { // packs the mask-selected lanes of b; remaining lanes come from a
+      return _mm_mask_compress_ps(a, mask, b);
+    }
+
+    static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_ps (_mm_setzero_ps(),mask,(float*)ptr); } // masked-off lanes read as zero
+    static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_ps(_mm_setzero_ps(),mask,(float*)ptr); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_store_ps ((float*)ptr,mask,v); } // only mask-selected lanes are written
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_storeu_ps((float*)ptr,mask,v); }
+#elif defined(__AVX__)
+    static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); } // AVX masked load; load and loadu share the same intrinsic
+    static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); } // AVX masked store; store and storeu share the same intrinsic
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); }
+#else
+    static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_load_ps ((float*)ptr),mask); } // fallback: all 16 bytes are read, masked-off lanes are then zeroed
+    static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_loadu_ps((float*)ptr),mask); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { store (ptr,select(mask,v,load (ptr))); } // fallback: read-modify-write of all 16 bytes
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { storeu(ptr,select(mask,v,loadu(ptr))); }
+#endif
+
+#if defined(__AVX__)
+    static __forceinline vfloat4 broadcast(const void* a) { return _mm_broadcast_ss((float*)a); } // one float from memory replicated to all lanes
+#else
+    static __forceinline vfloat4 broadcast(const void* a) { return _mm_set1_ps(*(float*)a); } // same result without the AVX broadcast instruction
+#endif
+
+    static __forceinline vfloat4 load_nt (const float* ptr) { // uses the streaming-load intrinsic when SSE4.1 is available, a plain aligned load otherwise
+#if defined (__SSE4_1__)
+    return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)ptr));
+#else
+    return _mm_load_ps(ptr);
+#endif
+  }
+
+#if defined(__aarch64__)
+    static __forceinline vfloat4 load(const int8_t* ptr) { // four int8 values -> four floats (helper intrinsic is defined by the aarch64 port)
+        return __m128(_mm_load4epi8_f32(((__m128i*)ptr)));
+    }
+#elif defined(__SSE4_1__)
+    static __forceinline vfloat4 load(const int8_t* ptr) { // note: issues a 16-byte load although only the first 4 bytes are converted
+      return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr)));
+    }
+#else
+    static __forceinline vfloat4 load(const int8_t* ptr) { // scalar fallback: reads exactly four elements
+      return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+#endif
+
+#if defined(__aarch64__)
+    static __forceinline vfloat4 load(const uint8_t* ptr) { // four uint8 values -> four floats (helper intrinsic is defined by the aarch64 port)
+        return __m128(_mm_load4epu8_f32(((__m128i*)ptr)));
+    }
+#elif defined(__SSE4_1__)
+    static __forceinline vfloat4 load(const uint8_t* ptr) { // note: issues a 16-byte load although only the first 4 bytes are converted
+      return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)));
+    }
+#else
+    static __forceinline vfloat4 load(const uint8_t* ptr) { // scalar fallback: reads exactly four elements
+      //return _mm_cvtpu8_ps(*(__m64*)ptr); // don't enable, will use MMX instructions
+      return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+#endif
+
+#if defined(__aarch64__)
+    static __forceinline vfloat4 load(const short* ptr) { // four int16 values -> four floats (helper intrinsic is defined by the aarch64 port)
+        return __m128(_mm_load4epi16_f32(((__m128i*)ptr)));
+    }
+#elif defined(__SSE4_1__)
+    static __forceinline vfloat4 load(const short* ptr) { // note: issues a 16-byte load although only the first 8 bytes are converted
+      return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr)));
+    }
+#else
+    static __forceinline vfloat4 load(const short* ptr) { // scalar fallback: reads exactly four elements
+      return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+#endif
+
+    static __forceinline vfloat4 load(const unsigned short* ptr) { // unlike the other integer loads, the result is scaled by 1/65535
+      return _mm_mul_ps(vfloat4(vint4::load(ptr)),vfloat4(1.0f/65535.0f)); // vint4::load(const unsigned short*) is defined elsewhere; presumably widens four uint16 values
+    }
+
+    static __forceinline void store_nt(void* ptr, const vfloat4& v) // uses the streaming-store intrinsic when SSE4.1 is defined, a plain aligned store otherwise
+    {
+#if defined (__SSE4_1__)
+#if defined(__aarch64__)
+      _mm_stream_ps((float*)ptr,vreinterpretq_s32_f32(v.v));
+#else
+      _mm_stream_ps((float*)ptr,v);
+#endif
+#else
+      _mm_store_ps((float*)ptr,v);
+#endif
+    }
+
+    template<int scale = 4> // scale: byte multiplier applied to every index (4 = index counts floats)
+    static __forceinline vfloat4 gather(const float* ptr, const vint4& index) { // lane k is read from byte address ptr + scale*index[k]
+#if defined(__AVX2__) && !defined(__aarch64__)
+      return _mm_i32gather_ps(ptr, index, scale);
+#else
+      return vfloat4(
+        *(float*)(((int8_t*)ptr)+scale*index[0]),
+        *(float*)(((int8_t*)ptr)+scale*index[1]),
+        *(float*)(((int8_t*)ptr)+scale*index[2]),
+        *(float*)(((int8_t*)ptr)+scale*index[3]));
+#endif
+    }
+
+    template<int scale = 4> // scale: byte multiplier applied to every index
+    static __forceinline vfloat4 gather(const vboolf4& mask, const float* ptr, const vint4& index) { // masked gather: lanes not selected by mask are returned as zero
+      vfloat4 r = zero;
+#if defined(__AVX512VL__)
+      return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale);
+#elif defined(__AVX2__)  && !defined(__aarch64__)
+      return _mm_mask_i32gather_ps(r, ptr, index, mask, scale);
+#else
+      if (likely(mask[0])) r[0] = *(float*)(((int8_t*)ptr)+scale*index[0]); // scalar fallback: memory is touched only for selected lanes
+      if (likely(mask[1])) r[1] = *(float*)(((int8_t*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(float*)(((int8_t*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(float*)(((int8_t*)ptr)+scale*index[3]);
+      return r;
+#endif
+    }
+
+    template<int scale = 4> // scale: byte multiplier applied to every index
+    static __forceinline void scatter(void* ptr, const vint4& index, const vfloat4& v) // lane k of v is written to byte address ptr + scale*index[k]
+    {
+#if defined(__AVX512VL__)
+      _mm_i32scatter_ps((float*)ptr, index, v, scale);
+#else
+      *(float*)(((int8_t*)ptr)+scale*index[0]) = v[0];
+      *(float*)(((int8_t*)ptr)+scale*index[1]) = v[1];
+      *(float*)(((int8_t*)ptr)+scale*index[2]) = v[2];
+      *(float*)(((int8_t*)ptr)+scale*index[3]) = v[3];
+#endif
+    }
+
+    template<int scale = 4> // scale: byte multiplier applied to every index
+    static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vfloat4& v) // masked scatter: only mask-selected lanes are written
+    {
+#if defined(__AVX512VL__)
+      _mm_mask_i32scatter_ps((float*)ptr ,mask, index, v, scale);
+#else
+      if (likely(mask[0])) *(float*)(((int8_t*)ptr)+scale*index[0]) = v[0];
+      if (likely(mask[1])) *(float*)(((int8_t*)ptr)+scale*index[1]) = v[1];
+      if (likely(mask[2])) *(float*)(((int8_t*)ptr)+scale*index[2]) = v[2];
+      if (likely(mask[3])) *(float*)(((int8_t*)ptr)+scale*index[3]) = v[3];
+#endif
+    }
+
+    static __forceinline void store(const vboolf4& mask, int8_t* ptr, const vint4& ofs, const vfloat4& v) { // ofs holds byte offsets (scale 1)
+      scatter<1>(mask,ptr,ofs,v);
+    }
+    static __forceinline void store(const vboolf4& mask, float* ptr, const vint4& ofs, const vfloat4& v) { // ofs holds float-element offsets (scale 4)
+      scatter<4>(mask,ptr,ofs,v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const float& operator [](size_t index) const { assert(index < 4); return f[index]; } // lane access through the union's float view; bounds are checked by assert only
+    __forceinline       float& operator [](size_t index)       { assert(index < 4); return f[index]; }
+
+    friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) { // per lane: t where m is set, f otherwise
+#if defined(__AVX512VL__)
+      return _mm_mask_blend_ps(m, f, t);
+#elif defined(__SSE4_1__) || (defined(__aarch64__))
+      return _mm_blendv_ps(f, t, m);
+#else
+      return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); // bitwise blend: (m & t) | (~m & f)
+#endif
+    }
+  };
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat4 asFloat(const vint4&   a) { return _mm_castsi128_ps(a); } // reinterprets the bits, no numeric conversion
+  __forceinline vint4   asInt  (const vfloat4& a) { return _mm_castps_si128(a); } // reinterprets the bits, no numeric conversion
+  __forceinline vuint4  asUInt (const vfloat4& a) { return _mm_castps_si128(a); } // reinterprets the bits, no numeric conversion
+
+  __forceinline vint4   toInt  (const vfloat4& a) { return vint4(a); } // value conversion through vint4's constructor (defined elsewhere)
+  __forceinline vfloat4 toFloat(const vint4&   a) { return vfloat4(a); } // value conversion through the explicit vfloat4(const vint4&) constructor
+
+  __forceinline vfloat4 operator +(const vfloat4& a) { return a; } // unary plus returns a copy
+#if defined(__aarch64__)
+  __forceinline vfloat4 operator -(const vfloat4& a) { // unary minus via the NEON negate intrinsic
+    return vnegq_f32(a);
+  }
+#else
+  __forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } // unary minus by flipping the sign bit of every lane
+#endif
+
+#if defined(__aarch64__)
+  __forceinline vfloat4 abs(const vfloat4& a) { return _mm_abs_ps(a); } // _mm_abs_ps is supplied by the aarch64 port, not by standard SSE headers
+#else
+  __forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); } // clears the sign bit of every lane
+#endif
+
+#if defined(__AVX512VL__)
+  __forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); } // -1 where a < 0, +1 in every other lane (including a == 0)
+#else
+  __forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); } // same intent; blendv_ps is a helper defined elsewhere, presumably picking the second operand where the mask is set
+#endif
+
+#if defined(__aarch64__)
+  __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a, vreinterpretq_f32_u32(v0x80000000)); } // v0x80000000 is defined elsewhere; presumably 0x80000000 in every lane
+#else
+  __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } // keeps only the sign bit of every lane
+#endif
+
+  __forceinline vfloat4 rcp(const vfloat4& a) // per-lane reciprocal 1/a: hardware estimate refined by Newton-Raphson (plain divide on BUILD_IOS)
+  {
+#if defined(__aarch64__)
+#if defined(BUILD_IOS)
+    return vfloat4(vdivq_f32(vdupq_n_f32(1.0f),a.v)); // 1.0f / a with a real divide, no estimate involved
+#else //BUILD_IOS
+    __m128 reciprocal = _mm_rcp_ps(a); // initial estimate
+    reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); // vrecpsq_f32(a,x) = 2 - a*x, so each step is x <- x*(2 - a*x)
+    reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+    // +1 round since NEON's reciprocal estimate instruction has less accuracy than SSE2's rcp.
+    reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+    return (const vfloat4)reciprocal;
+#endif // BUILD_IOS
+#else
+
+#if defined(__AVX512VL__)
+    const vfloat4 r = _mm_rcp14_ps(a); // initial estimate
+#else
+    const vfloat4 r = _mm_rcp_ps(a); // initial estimate
+#endif
+
+#if defined(__AVX2__)
+    return _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); // one refinement step r*(2 - r*a), using a fused negate-multiply-add
+#else
+    return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); // one refinement step r*(2 - r*a)
+#endif
+
+#endif  //defined(__aarch64__)
+  }
+  __forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); } // per-lane a*a
+  __forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); }  // per-lane square root
+
+  __forceinline vfloat4 rsqrt(const vfloat4& a) // per-lane 1/sqrt(a): hardware estimate refined by Newton-Raphson
+  {
+#if defined(__aarch64__)
+    vfloat4 r = _mm_rsqrt_ps(a); // initial estimate
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); // vrsqrtsq_f32(x,y) = (3 - x*y)/2, so each step is r <- r*(3 - a*r*r)/2
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
+    return r;
+#else
+
+#if defined(__AVX512VL__)
+    const vfloat4 r = _mm_rsqrt14_ps(a); // initial estimate
+#else
+    const vfloat4 r = _mm_rsqrt_ps(a); // initial estimate
+#endif
+
+#if defined(__AVX2__)
+    return _mm_fmadd_ps(_mm_set1_ps(1.5f), r, // one refinement step: 1.5*r + (-0.5*a*r)*(r*r)
+                        _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#else
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r), // one refinement step: 1.5*r + (-0.5*a*r)*(r*r)
+                      _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#endif
+
+#endif
+  }
+
+  __forceinline vboolf4 isnan(const vfloat4& a) { // lane mask set where a is NaN, decided with integer compares on the bit pattern
+#if defined(__aarch64__)
+    const vfloat4 b = _mm_and_ps(a, vreinterpretq_f32_u32(v0x7fffffff)); // v0x7fffffff is defined elsewhere; presumably the same clear-sign-bit mask as below
+#else
+    const vfloat4 b = _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); // clear the sign bit
+#endif
+#if defined(__AVX512VL__)
+    return _mm_cmp_epi32_mask(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000), _MM_CMPINT_GT); // bit patterns above +inf (0x7f800000) are NaN
+#else
+    return _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000))); // bit patterns above +inf (0x7f800000) are NaN
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat4 operator +(const vfloat4& a, const vfloat4& b) { return _mm_add_ps(a, b); } // per-lane a + b
+  __forceinline vfloat4 operator +(const vfloat4& a, float          b) { return a + vfloat4(b); }   // scalar operand is wrapped in a vfloat4 first
+  __forceinline vfloat4 operator +(float          a, const vfloat4& b) { return vfloat4(a) + b; }
+
+  __forceinline vfloat4 operator -(const vfloat4& a, const vfloat4& b) { return _mm_sub_ps(a, b); } // per-lane a - b
+  __forceinline vfloat4 operator -(const vfloat4& a, float          b) { return a - vfloat4(b); }
+  __forceinline vfloat4 operator -(float          a, const vfloat4& b) { return vfloat4(a) - b; }
+
+  __forceinline vfloat4 operator *(const vfloat4& a, const vfloat4& b) { return _mm_mul_ps(a, b); } // per-lane a * b
+  __forceinline vfloat4 operator *(const vfloat4& a, float          b) { return a * vfloat4(b); }
+  __forceinline vfloat4 operator *(float          a, const vfloat4& b) { return vfloat4(a) * b; }
+
+  __forceinline vfloat4 operator /(const vfloat4& a, const vfloat4& b) { return _mm_div_ps(a,b); } // per-lane a / b (true divide, not the rcp estimate)
+  __forceinline vfloat4 operator /(const vfloat4& a, float          b) { return a/vfloat4(b); }
+  __forceinline vfloat4 operator /(float          a, const vfloat4& b) { return vfloat4(a)/b; }
+
+  __forceinline vfloat4 operator &(const vfloat4& a, const vfloat4& b) { return _mm_and_ps(a,b); } // bitwise AND of the raw 128 bits
+  __forceinline vfloat4 operator |(const vfloat4& a, const vfloat4& b) { return _mm_or_ps(a,b); }  // bitwise OR of the raw 128 bits
+  __forceinline vfloat4 operator ^(const vfloat4& a, const vfloat4& b) { return _mm_xor_ps(a,b); } // bitwise XOR of the raw 128 bits
+  __forceinline vfloat4 operator ^(const vfloat4& a, const vint4&   b) { return _mm_xor_ps(a,_mm_castsi128_ps(b)); } // XOR against an integer bit pattern (reinterpreted, not converted)
+
+  __forceinline vfloat4 min(const vfloat4& a, const vfloat4& b) { return _mm_min_ps(a,b); } // per-lane minimum
+  __forceinline vfloat4 min(const vfloat4& a, float          b) { return _mm_min_ps(a,vfloat4(b)); } // scalar operand is wrapped in a vfloat4 first
+  __forceinline vfloat4 min(float          a, const vfloat4& b) { return _mm_min_ps(vfloat4(a),b); }
+
+  __forceinline vfloat4 max(const vfloat4& a, const vfloat4& b) { return _mm_max_ps(a,b); } // per-lane maximum
+  __forceinline vfloat4 max(const vfloat4& a, float          b) { return _mm_max_ps(a,vfloat4(b)); }
+  __forceinline vfloat4 max(float          a, const vfloat4& b) { return _mm_max_ps(vfloat4(a),b); }
+
+#if defined(__SSE4_1__) || defined(__aarch64__) // min/max computed with integer compares on the float bit patterns
+
+    __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { // minimum of the lanes compared as signed 32-bit integers
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_min_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+
+    __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) { // maximum of the lanes compared as signed 32-bit integers
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_max_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+
+    __forceinline vfloat4 minui(const vfloat4& a, const vfloat4& b) { // minimum of the lanes compared as unsigned 32-bit integers
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_min_epu32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+
+    __forceinline vfloat4 maxui(const vfloat4& a, const vfloat4& b) { // maximum of the lanes compared as unsigned 32-bit integers
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_max_epu32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#else // fallback: plain float min/max; minui/maxui are not defined on this path
+    __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) {
+      return min(a,b);
+    }
+
+    __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) {
+      return max(a,b);
+    }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__) // fused multiply-add instructions
+  __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmadd_ps(a,b,c); }  //  a*b+c
+  __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmsub_ps(a,b,c); }  //  a*b-c
+  __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmadd_ps(a,b,c); } // -a*b+c
+  __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); } // -a*b-c
+#else
+
+#if defined(__aarch64__) // _mm_madd_ps/_mm_msub_ps are not standard SSE intrinsics; presumably supplied by the NEON shim
+  __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) {
+    return _mm_madd_ps(a, b, c);  //a*b+c;
+  }
+  __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) {
+    return _mm_msub_ps(a, b, c);  //-a*b+c;
+  }
+  __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) {
+    return vnegq_f32(vfmaq_f32(c,a, b)); // -(c + a*b) = -a*b-c
+  }
+#else // unfused fallback built from the operators above
+  __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; }
+  __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;}
+  __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b-c; }
+#endif
+  __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; } // shared by the aarch64 and generic non-AVX2 paths
+
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat4& operator +=(vfloat4& a, const vfloat4& b) { a = a + b; return a; } // in-place add, yields the updated left operand
+  __forceinline vfloat4& operator +=(vfloat4& a, float          b) { a = a + b; return a; } // scalar form, via operator +(vfloat4,float)
+
+  __forceinline vfloat4& operator -=(vfloat4& a, const vfloat4& b) { a = a - b; return a; } // in-place subtract
+  __forceinline vfloat4& operator -=(vfloat4& a, float          b) { a = a - b; return a; }
+
+  __forceinline vfloat4& operator *=(vfloat4& a, const vfloat4& b) { a = a * b; return a; } // in-place multiply
+  __forceinline vfloat4& operator *=(vfloat4& a, float          b) { a = a * b; return a; }
+
+  __forceinline vfloat4& operator /=(vfloat4& a, const vfloat4& b) { a = a / b; return a; } // in-place divide
+  __forceinline vfloat4& operator /=(vfloat4& a, float          b) { a = a / b; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__) // results are AVX-512 mask registers. NOTE(review): _MM_CMPINT_* are the integer-compare predicate names; confirm their values match the intended _CMP_* float predicates
+  __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_EQ); }
+  __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_NE); }
+  __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LT); }
+  __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GE); }
+  __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GT); }
+  __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LE); }
+#else // results are per-lane all-ones / all-zeros masks
+  __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); }
+  __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a, b); }
+  __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); }
+#if defined(__aarch64__)
+  __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpge_ps (a, b); }
+  __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpgt_ps (a, b); }
+#else // NOTE(review): "not less than" / "not less or equal" are also true when an operand is NaN, which presumably differs from the cmpge/cmpgt path above -- confirm callers do not depend on it
+  __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); }
+  __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); }
+#endif
+  __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); }
+#endif
+
+  __forceinline vboolf4 operator ==(const vfloat4& a, float          b) { return a == vfloat4(b); } // scalar overloads: wrap the float in a vfloat4, then use the vector comparison
+  __forceinline vboolf4 operator ==(float          a, const vfloat4& b) { return vfloat4(a) == b; }
+
+  __forceinline vboolf4 operator !=(const vfloat4& a, float          b) { return a != vfloat4(b); }
+  __forceinline vboolf4 operator !=(float          a, const vfloat4& b) { return vfloat4(a) != b; }
+
+  __forceinline vboolf4 operator < (const vfloat4& a, float          b) { return a <  vfloat4(b); }
+  __forceinline vboolf4 operator < (float          a, const vfloat4& b) { return vfloat4(a) <  b; }
+
+  __forceinline vboolf4 operator >=(const vfloat4& a, float          b) { return a >= vfloat4(b); }
+  __forceinline vboolf4 operator >=(float          a, const vfloat4& b) { return vfloat4(a) >= b; }
+
+  __forceinline vboolf4 operator > (const vfloat4& a, float          b) { return a >  vfloat4(b); }
+  __forceinline vboolf4 operator > (float          a, const vfloat4& b) { return vfloat4(a) >  b; }
+
+  __forceinline vboolf4 operator <=(const vfloat4& a, float          b) { return a <= vfloat4(b); }
+  __forceinline vboolf4 operator <=(float          a, const vfloat4& b) { return vfloat4(a) <= b; }
+
+  __forceinline vboolf4 eq(const vfloat4& a, const vfloat4& b) { return a == b; } // named aliases for the comparison operators
+  __forceinline vboolf4 ne(const vfloat4& a, const vfloat4& b) { return a != b; }
+  __forceinline vboolf4 lt(const vfloat4& a, const vfloat4& b) { return a <  b; }
+  __forceinline vboolf4 ge(const vfloat4& a, const vfloat4& b) { return a >= b; }
+  __forceinline vboolf4 gt(const vfloat4& a, const vfloat4& b) { return a >  b; }
+  __forceinline vboolf4 le(const vfloat4& a, const vfloat4& b) { return a <= b; }
+
+#if defined(__AVX512VL__) // masked comparisons: a lane can only be set in the result if it is set in mask
+  __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); }
+  __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); }
+  __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); }
+  __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); }
+  __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); }
+#else // same contract, expressed as mask AND unmasked comparison
+  __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a == b); }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a != b); }
+  __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a <  b); }
+  __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a >= b); }
+  __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a >  b); }
+  __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a <= b); }
+#endif
+
+  template<int mask>
+    __forceinline vfloat4 select(const vfloat4& t, const vfloat4& f) // compile-time blend: bit i of mask set -> lane i from t, otherwise from f
+  {
+#if defined(__SSE4_1__)
+    return _mm_blend_ps(f, t, mask);
+#else
+    return select(vboolf4(mask), t, f); // vboolf4(int) is not visible here; presumably expands the bit pattern into a lane mask
+#endif
+  }
+
+#if defined(__aarch64__) // one specialization per 4-bit mask; the vXXXX lane-mask constants are defined elsewhere
+    template<> __forceinline vfloat4 select<0>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vzero));
+    }
+    template<> __forceinline vfloat4 select<1>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v000F)); // presumably an all-ones lane 0 only -- TODO confirm the constant's lane order
+    }
+    template<> __forceinline vfloat4 select<2>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v00F0));
+    }
+    template<> __forceinline vfloat4 select<3>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v00FF));
+    }
+    template<> __forceinline vfloat4 select<4>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0F00));
+    }
+    template<> __forceinline vfloat4 select<5>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0F0F));
+    }
+    template<> __forceinline vfloat4 select<6>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0FF0));
+    }
+    template<> __forceinline vfloat4 select<7>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0FFF));
+    }
+    template<> __forceinline vfloat4 select<8>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF000));
+    }
+    template<> __forceinline vfloat4 select<9>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF00F));
+    }
+    template<> __forceinline vfloat4 select<10>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF0F0));
+    }
+    template<> __forceinline vfloat4 select<11>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF0FF));
+    }
+    template<> __forceinline vfloat4 select<12>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFF00));
+    }
+    template<> __forceinline vfloat4 select<13>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFF0F));
+    }
+    template<> __forceinline vfloat4 select<14>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFFF0));
+    }
+    template<> __forceinline vfloat4 select<15>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFFFF));
+    }
+#endif
+
+  __forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t) { // per-lane linear interpolation: a at t=0, b at t=1
+    return madd(t,b-a,a); // t*(b-a) + a
+  }
+
+  __forceinline bool isvalid(const vfloat4& v) { // true only if every lane lies strictly inside (-FLT_LARGE, +FLT_LARGE); FLT_LARGE is defined elsewhere
+    return all((v > vfloat4(-FLT_LARGE)) & (v < vfloat4(+FLT_LARGE)));
+  }
+
+  __forceinline bool is_finite(const vfloat4& a) { // true only if every lane lies within [-FLT_MAX, +FLT_MAX]
+    return all((a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX)));
+  }
+
+  __forceinline bool is_finite(const vboolf4& valid, const vfloat4& a) { // masked variant: the range test is passed to all(valid, ...) (helper defined elsewhere)
+    return all(valid, (a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX)));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__)
+  __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } // towards -inf
+  __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } // toward +inf
+  __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } // towards 0
+  __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } // to nearest, ties to even. NOTE(LTE): arm clang uses vrndnq, old gcc uses vrndqn?
+#elif defined (__SSE4_1__)
+  __forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF   ); }
+  __forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF   ); }
+  __forceinline vfloat4 trunc(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO      ); }
+  __forceinline vfloat4 round(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } // (even) https://www.felixcloutier.com/x86/roundpd
+#else // scalar fallback: apply the C library function to each lane
+  __forceinline vfloat4 floor(const vfloat4& a) { return vfloat4(floorf(a[0]),floorf(a[1]),floorf(a[2]),floorf(a[3])); }
+  __forceinline vfloat4 ceil (const vfloat4& a) { return vfloat4(ceilf (a[0]),ceilf (a[1]),ceilf (a[2]),ceilf (a[3])); }
+  __forceinline vfloat4 trunc(const vfloat4& a) { return vfloat4(truncf(a[0]),truncf(a[1]),truncf(a[2]),truncf(a[3])); }
+  __forceinline vfloat4 round(const vfloat4& a) { return vfloat4(roundf(a[0]),roundf(a[1]),roundf(a[2]),roundf(a[3])); } // NOTE(review): roundf rounds halfway cases away from zero, unlike the ties-to-even paths above
+#endif
+  __forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); } // fractional part relative to floor, so the result is >= 0 for finite inputs
+
+  __forceinline vint4 floori(const vfloat4& a) { // per-lane floor(a), returned as 32-bit integers
+#if defined(__aarch64__)
+    return vcvtq_s32_f32(floor(a));
+#else
+    // Floor first on every x86 path. The former plain-SSE2 fallback, vint4(a-0.5f), depended on
+    // the rounding of the float->int conversion and was off by one for odd integral inputs (1.0f -> 0).
+    return vint4(floor(a));
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a, b); } // interleave the low halves:  (a[0], b[0], a[1], b[1])
+  __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); } // interleave the high halves: (a[2], b[2], a[3], b[3])
+
+#if defined(__aarch64__) // NEON path: byte table lookups; _MN_SHUFFLE/_MF_SHUFFLE (defined elsewhere) presumably build the byte index vectors
+      template<int i0, int i1, int i2, int i3>
+      __forceinline vfloat4 shuffle(const vfloat4& v) {
+          return vreinterpretq_f32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
+      }
+      template<int i0, int i1, int i2, int i3>
+      __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) {
+          return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
+      }
+#else
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat4 shuffle(const vfloat4& v) { // result = (v[i0], v[i1], v[i2], v[i3])
+    return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0))); // _MM_SHUFFLE lists lanes highest first, hence the reversed order
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) { // result = (a[i0], a[i1], b[i2], b[i3])
+    return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+#endif
+
+#if defined (__SSSE3__)
+  __forceinline vfloat4 shuffle8(const vfloat4& a, const vint4& shuf) { // byte-granular shuffle of a's 16 bytes, driven by the byte indices in shuf
+    return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf));
+  }
+#endif
+
+#if defined(__aarch64__) // specializations for three common patterns; v0022/v1133/v0101 are index tables defined elsewhere
+  template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v0022 )); }
+  template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v1133)); }
+  template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v0101)); }
+#elif defined(__SSE3__)
+  template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); } // duplicate the even lanes
+  template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); } // duplicate the odd lanes
+  template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); } // duplicate the low 64 bits
+#endif
+
+  template<int i>
+  __forceinline vfloat4 shuffle(const vfloat4& v) { // broadcast lane i of v to all four lanes
+    return shuffle<i,i,i,i>(v);
+  }
+
+#if defined(__aarch64__) // extract<i>(a): return lane i as a scalar; only i = 0..3 are specialized on this path
+  template<int i> __forceinline float extract(const vfloat4& a);
+  template<> __forceinline float extract<0>(const vfloat4& b) {
+      return b[0];
+  }
+  template<> __forceinline float extract<1>(const vfloat4& b) {
+      return b[1];
+  }
+  template<> __forceinline float extract<2>(const vfloat4& b) {
+      return b[2];
+  }
+  template<> __forceinline float extract<3>(const vfloat4& b) {
+      return b[3];
+  }
+#elif defined (__SSE4_1__) && !defined(__GNUC__) // NOTE(review): this branch is compiled out for GNU-compatible compilers; the reason is not visible here
+  template<int i> __forceinline float extract(const vfloat4& a) { return _mm_cvtss_f32(_mm_extract_ps(a,i)); }
+  template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); }
+#else
+  template<int i> __forceinline float extract(const vfloat4& a) { return _mm_cvtss_f32(shuffle<i,i,i,i>(a)); } // broadcast lane i, then read lane 0
+  template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); } // lane 0 needs no shuffle
+#endif
+
+
+#if defined(__aarch64__) // insert<dst>(a, b): copy of a with lane dst replaced by b; only dst = 0..3 are specialized on this path
+  template<int dst>  __forceinline vfloat4 insert(const vfloat4& a, float b);
+  template<> __forceinline vfloat4 insert<0>(const vfloat4& a, float b)
+  {
+        vfloat4 c = a;
+        c[0] = b;
+        return c;
+  }
+  template<> __forceinline vfloat4 insert<1>(const vfloat4& a, float b)
+  {
+        vfloat4 c = a;
+        c[1] = b;
+        return c;
+  }
+  template<> __forceinline vfloat4 insert<2>(const vfloat4& a, float b)
+  {
+        vfloat4 c = a;
+        c[2] = b;
+        return c;
+  }
+  template<> __forceinline vfloat4 insert<3>(const vfloat4& a, float b)
+  {
+        vfloat4 c = a;
+        c[3] = b;
+        return c;
+  }
+#elif defined (__SSE4_1__) // _mm_insert_ps immediate: bits 7:6 = source lane of b, bits 5:4 = destination lane in a, bits 3:0 = lanes to zero
+  template<int dst, int src, int clr> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
+  template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return insert<dst, src, 0>(a, b); } // no lanes zeroed
+  template<int dst> __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert<dst, 0>(a, _mm_set_ss(b)); } // scalar b is placed in lane 0 of a temporary first
+#else // scalar fallback through operator[]; indices are wrapped with &3
+  template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { vfloat4 c = a; c[dst&3] = b[src&3]; return c; }
+  template<int dst>  __forceinline vfloat4 insert(const vfloat4& a, float b) { vfloat4 c = a; c[dst&3] = b; return c; }
+#endif
+
+#if defined(__aarch64__)
+  __forceinline float toScalar(const vfloat4& v) { // lane 0 as a scalar
+    return v[0];
+  }
+#else
+  __forceinline float toScalar(const vfloat4& v) { return _mm_cvtss_f32(v); } // lane 0 as a scalar
+#endif
+  __forceinline vfloat4 broadcast4f(const vfloat4& a, size_t k) { // runtime-indexed variant: hands the address of lane k to vfloat4::broadcast (defined elsewhere; presumably replicates that float to all lanes)
+    return vfloat4::broadcast(&a[k]);
+  }
+
+  __forceinline vfloat4 shift_right_1(const vfloat4& x) { // move every lane down by one: (x[1], x[2], x[3], 0)
+    return _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(x), 4)); // whole-register shift by 4 bytes = one float lane
+  }
+
+#if defined (__AVX2__)
+  __forceinline vfloat4 permute(const vfloat4 &a, const __m128i &index) { // runtime permutation: result lane i = a[index lane i & 3]
+    return _mm_permutevar_ps(a,index);
+  }
+
+  __forceinline vfloat4 broadcast1f(const void* a) { return _mm_broadcast_ss((float*)a); } // load one float from a and replicate it to all four lanes
+
+#endif
+
+#if defined(__AVX512VL__)
+  template<int i>
+  __forceinline vfloat4 align_shift_right(const vfloat4& a, const vfloat4& b) { // treat a:b as one 8-lane value (a in the upper half), shift right by i lanes, keep the low four
+    return _mm_castsi128_ps(_mm_alignr_epi32(_mm_castps_si128(a), _mm_castps_si128(b), i));
+  }
+#endif
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Sorting Network
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat4 sort_ascending(const vfloat4& v) // three-stage min/max network; smallest value ends up in lane 0
+  {
+    const vfloat4 in0   = v;                                  // stage 1: order each adjacent pair (0,1) and (2,3)
+    const vfloat4 swap0 = shuffle<1,0,3,2>(in0);
+    const vfloat4 lo0   = min(in0,swap0);
+    const vfloat4 hi0   = max(in0,swap0);
+    const vfloat4 in1   = select<0x5 /* 0b0101 */>(lo0,hi0);  // stage 2: compare across the two halves
+    const vfloat4 swap1 = shuffle<2,3,0,1>(in1);
+    const vfloat4 lo1   = min(in1,swap1);
+    const vfloat4 hi1   = max(in1,swap1);
+    const vfloat4 in2   = select<0x3 /* 0b0011 */>(lo1,hi1);  // lanes 0 and 3 now hold the overall min and max
+    const vfloat4 swap2 = shuffle<0,2,1,3>(in2);              // stage 3: order the two middle lanes
+    const vfloat4 lo2   = min(in2,swap2);
+    const vfloat4 hi2   = max(in2,swap2);
+    const vfloat4 sorted = select<0x2 /* 0b0010 */>(lo2,hi2);
+    return sorted;
+  }
+
+  __forceinline vfloat4 sort_descending(const vfloat4& v) // three-stage min/max network; largest value ends up in lane 0
+  {
+    const vfloat4 in0   = v;                                  // stage 1: order each adjacent pair (0,1) and (2,3)
+    const vfloat4 swap0 = shuffle<1,0,3,2>(in0);
+    const vfloat4 hi0   = max(in0,swap0);
+    const vfloat4 lo0   = min(in0,swap0);
+    const vfloat4 in1   = select<0x5 /* 0b0101 */>(hi0,lo0);  // stage 2: compare across the two halves
+    const vfloat4 swap1 = shuffle<2,3,0,1>(in1);
+    const vfloat4 hi1   = max(in1,swap1);
+    const vfloat4 lo1   = min(in1,swap1);
+    const vfloat4 in2   = select<0x3 /* 0b0011 */>(hi1,lo1);  // lanes 0 and 3 now hold the overall max and min
+    const vfloat4 swap2 = shuffle<0,2,1,3>(in2);              // stage 3: order the two middle lanes
+    const vfloat4 hi2   = max(in2,swap2);
+    const vfloat4 lo2   = min(in2,swap2);
+    const vfloat4 sorted = select<0x2 /* 0b0010 */>(hi2,lo2);
+    return sorted;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Transpose
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2, vfloat4& c3)
+  {
+    const vfloat4 even_lo = unpacklo(r0,r2); // (r0[0], r2[0], r0[1], r2[1])
+    const vfloat4 even_hi = unpackhi(r0,r2); // (r0[2], r2[2], r0[3], r2[3])
+    const vfloat4 odd_lo  = unpacklo(r1,r3); // (r1[0], r3[0], r1[1], r3[1])
+    const vfloat4 odd_hi  = unpackhi(r1,r3); // (r1[2], r3[2], r1[3], r3[3])
+    c0 = unpacklo(even_lo,odd_lo); // lane 0 of every row
+    c1 = unpackhi(even_lo,odd_lo); // lane 1 of every row
+    c2 = unpacklo(even_hi,odd_hi); // lane 2 of every row
+    c3 = unpackhi(even_hi,odd_hi); // lane 3 of every row
+  }
+
+  __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2)
+  {
+    const vfloat4 even_lo = unpacklo(r0,r2); // (r0[0], r2[0], r0[1], r2[1])
+    const vfloat4 even_hi = unpackhi(r0,r2); // (r0[2], r2[2], r0[3], r2[3])
+    const vfloat4 odd_lo  = unpacklo(r1,r3); // (r1[0], r3[0], r1[1], r3[1])
+    const vfloat4 odd_hi  = unpackhi(r1,r3); // (r1[2], r3[2], r1[3], r3[3])
+    c0 = unpacklo(even_lo,odd_lo); // lane 0 of every row
+    c1 = unpackhi(even_lo,odd_lo); // lane 1 of every row
+    c2 = unpacklo(even_hi,odd_hi); // lane 2 of every row; lane 3 is not produced by this overload
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+#if defined(__aarch64__) // vreduce_*: horizontal reduction whose result is replicated into all four lanes
+      __forceinline vfloat4 vreduce_min(const vfloat4& v) { float h = vminvq_f32(v); return vdupq_n_f32(h); }
+      __forceinline vfloat4 vreduce_max(const vfloat4& v) { float h = vmaxvq_f32(v); return vdupq_n_f32(h); }
+      __forceinline vfloat4 vreduce_add(const vfloat4& v) { float h = vaddvq_f32(v); return vdupq_n_f32(h); }
+#else // combine neighbouring lanes first, then the two halves, so every lane ends with the full result
+  __forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
+  __forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
+  __forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v)   + v ; return shuffle<2,3,0,1>(h)   + h ; }
+#endif
+
+#if defined(__aarch64__) // reduce_*: horizontal reduction returned as a scalar
+  __forceinline float reduce_min(const vfloat4& v) { return vminvq_f32(v); }
+  __forceinline float reduce_max(const vfloat4& v) { return vmaxvq_f32(v); }
+  __forceinline float reduce_add(const vfloat4& v) { return vaddvq_f32(v); }
+#else // read lane 0 of the replicated vreduce_* result
+  __forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); }
+  __forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); }
+  __forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); }
+#endif
+
+  __forceinline size_t select_min(const vboolf4& valid, const vfloat4& v)
+  {
+    const vfloat4 candidates = select(valid,v,vfloat4(pos_inf)); // lanes outside valid are replaced by pos_inf
+    const vbool4 at_min = valid & (candidates == vreduce_min(candidates)); // valid lanes that equal the minimum
+    return bsf(movemask(any(at_min) ? at_min : valid)); // bsf over the lane bits (helpers defined elsewhere); uses valid itself when no lane matched
+  }
+  __forceinline size_t select_max(const vboolf4& valid, const vfloat4& v)
+  {
+    const vfloat4 candidates = select(valid,v,vfloat4(neg_inf)); // lanes outside valid are replaced by neg_inf
+    const vbool4 at_max = valid & (candidates == vreduce_max(candidates)); // valid lanes that equal the maximum
+    return bsf(movemask(any(at_max) ? at_max : valid)); // bsf over the lane bits (helpers defined elsewhere); uses valid itself when no lane matched
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float dot(const vfloat4& a, const vfloat4& b) { // sum of the per-lane products over all four lanes (lane 3 included)
+    return reduce_add(a*b);
+  }
+
+  __forceinline vfloat4 cross(const vfloat4& a, const vfloat4& b)
+  {
+    // Rotate both operands to (y,z,x,w), combine them with prod_diff (defined elsewhere;
+    // presumably a*b - c*d per lane), then rotate the result once more by the same pattern.
+    const vfloat4 b_rot = shuffle<1,2,0,3>(b);
+    const vfloat4 a_rot = shuffle<1,2,0,3>(a);
+    return shuffle<1,2,0,3>(prod_diff(a,b_rot,a_rot,b));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat4& a) { // writes the four lanes as <a0, a1, a2, a3> and returns the stream
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
+  }
+
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vfloat8_avx.h b/thirdparty/embree-aarch64/common/simd/vfloat8_avx.h
new file mode 100644
index 0000000000..3c7e4a8cdc
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vfloat8_avx.h
@@ -0,0 +1,847 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 8-wide AVX float type: one __m256 register that is also viewable lane-wise as float[8] / int[8] */
+  template<>
+  struct vfloat<8>
+  {
+    ALIGNED_STRUCT_(32);
+   
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 };                        // number of SIMD elements
+    union { __m256 v; float f[8]; int i[8]; }; // data: register view, per-lane float view, per-lane int view
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vfloat() {} // leaves the lanes uninitialized
+    __forceinline vfloat(const vfloat8& other) { v = other.v; }
+    __forceinline vfloat8& operator =(const vfloat8& other) { v = other.v; return *this; }
+
+    __forceinline vfloat(__m256 a) : v(a) {} // implicit wrap of a raw register
+    __forceinline operator const __m256&() const { return v; }
+    __forceinline operator       __m256&()       { return v; }
+
+    __forceinline explicit vfloat(const vfloat4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {} // a copied into both 128-bit halves
+    __forceinline vfloat(const vfloat4& a, const vfloat4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} // a -> lanes 0-3, b -> lanes 4-7
+
+    __forceinline explicit vfloat(const int8_t* a) : v(_mm256_loadu_ps((const float*)a)) {} // unaligned load of 8 floats; the pointer is reinterpreted, bytes are not converted (contrast with load(const int8_t*))
+    __forceinline vfloat(float a) : v(_mm256_set1_ps(a)) {} // a in every lane
+    __forceinline vfloat(float a, float b) : v(_mm256_set_ps(b, a, b, a, b, a, b, a)) {} // lanes: a,b,a,b,a,b,a,b (lane 0 = a)
+    __forceinline vfloat(float a, float b, float c, float d) : v(_mm256_set_ps(d, c, b, a, d, c, b, a)) {} // lanes: a,b,c,d repeated in both halves
+    __forceinline vfloat(float a, float b, float c, float d, float e, float f, float g, float h) : v(_mm256_set_ps(h, g, f, e, d, c, b, a)) {} // lane 0 = a ... lane 7 = h
+
+    __forceinline explicit vfloat(__m256i a) : v(_mm256_cvtepi32_ps(a)) {} // numeric int32 -> float conversion per lane (not a bit cast)
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vfloat(ZeroTy)   : v(_mm256_setzero_ps()) {}
+    __forceinline vfloat(OneTy)    : v(_mm256_set1_ps(1.0f)) {}
+    __forceinline vfloat(PosInfTy) : v(_mm256_set1_ps(pos_inf)) {}
+    __forceinline vfloat(NegInfTy) : v(_mm256_set1_ps(neg_inf)) {}
+    __forceinline vfloat(StepTy)   : v(_mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)) {} // lane k holds the value k
+    __forceinline vfloat(NaNTy)    : v(_mm256_set1_ps(nan)) {}
+    __forceinline vfloat(UndefinedTy) : v(_mm256_undefined_ps()) {} // contents unspecified
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vfloat8 broadcast(const void* a) { // reads one float at a and replicates it to all 8 lanes
+      return _mm256_broadcast_ss((float*)a); 
+    }
+
+    static __forceinline vfloat8 broadcast2(const float* a, const float* b) { // *a -> lanes 0-3, *b -> lanes 4-7 on both branches
+#if defined(__INTEL_COMPILER)
+      const vfloat8 v0 = _mm256_broadcast_ss(a); 
+      const vfloat8 v1 = _mm256_broadcast_ss(b); 
+      return _mm256_blend_ps(v1, v0, 0xf); // mask 0xf: low four lanes come from v0
+#else
+      return _mm256_set_ps(*b,*b,*b,*b,*a,*a,*a,*a);
+#endif
+    }
+
+    static __forceinline vfloat8 broadcast4f(const vfloat4* ptr) { // loads one 128-bit vector and replicates it to both halves
+      return _mm256_broadcast_ps((__m128*)ptr); 
+    }
+
+    static __forceinline vfloat8 load(const int8_t* ptr) { // converts 8 signed bytes to 8 floats
+#if defined(__AVX2__)
+      return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr))); // NOTE(review): loads 16 bytes although only the low 8 are converted - confirm callers guarantee readable padding
+#else
+      return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4));
+#endif
+    }
+
+    static __forceinline vfloat8 load(const uint8_t* ptr) { // converts 8 unsigned bytes to 8 floats
+#if defined(__AVX2__)
+      return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr))); // NOTE(review): same 16-byte read as the int8_t overload
+#else
+      return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4));
+#endif
+    }
+
+    static __forceinline vfloat8 load(const short* ptr) { // converts 8 signed 16-bit values to 8 floats
+#if defined(__AVX2__)
+      return _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr)));
+#else
+      return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4));
+#endif
+    }
+      
+    static __forceinline vfloat8 load (const void* ptr) { return _mm256_load_ps((float*)ptr); } // aligned load intrinsic
+    static __forceinline vfloat8 loadu(const void* ptr) { return _mm256_loadu_ps((float*)ptr); } // unaligned load intrinsic
+
+    static __forceinline void store (void* ptr, const vfloat8& v) { return _mm256_store_ps((float*)ptr,v); } // aligned store intrinsic
+    static __forceinline void storeu(void* ptr, const vfloat8& v) { return _mm256_storeu_ps((float*)ptr,v); } // unaligned store intrinsic
+
+#if defined(__AVX512VL__) // masked load/store variants: one implementation per target follows
+
+    static __forceinline vfloat8 compact(const vboolf8& mask, vfloat8 &v) {
+      return _mm256_mask_compress_ps(v, mask, v);
+    }
+    static __forceinline vfloat8 compact(const vboolf8& mask, vfloat8 &a, const vfloat8& b) {
+      return _mm256_mask_compress_ps(a, mask, b);
+    }
+
+    static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_ps (_mm256_setzero_ps(),mask,(float*)ptr); } // masked-off lanes read as 0
+    static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_ps(_mm256_setzero_ps(),mask,(float*)ptr); } // masked-off lanes read as 0
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask,v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask,v); }
+#elif defined(__aarch64__) // same as the generic branch below, but the mask register is taken from mask.v
+    static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask.v); }
+    static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask.v); }
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,v); }
+#else // generic AVX: load and loadu both map to _mm256_maskload_ps, store and storeu both to _mm256_maskstore_ps
+    static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); }
+    static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); }
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); }
+#endif
+    
+#if defined(__AVX2__) // load_nt exists only when AVX2 is available
+    static __forceinline vfloat8 load_nt(void* ptr) {
+      return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i*)ptr));
+    }
+#endif
+    
+    static __forceinline void store_nt(void* ptr, const vfloat8& v) { // streaming store intrinsic
+      _mm256_stream_ps((float*)ptr,v);
+    }
+
+    template<int scale = 4>
+    static __forceinline vfloat8 gather(const float* ptr, const vint8& index) { // lane k = float at byte offset scale*index[k] from ptr
+#if defined(__AVX2__) && !defined(__aarch64__)
+      return _mm256_i32gather_ps(ptr, index ,scale);
+#else
+      return vfloat8(
+          *(float*)(((int8_t*)ptr)+scale*index[0]),
+          *(float*)(((int8_t*)ptr)+scale*index[1]),
+          *(float*)(((int8_t*)ptr)+scale*index[2]),
+          *(float*)(((int8_t*)ptr)+scale*index[3]),
+          *(float*)(((int8_t*)ptr)+scale*index[4]),
+          *(float*)(((int8_t*)ptr)+scale*index[5]),
+          *(float*)(((int8_t*)ptr)+scale*index[6]),
+          *(float*)(((int8_t*)ptr)+scale*index[7]));
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline vfloat8 gather(const vboolf8& mask, const float* ptr, const vint8& index) { // masked gather; lanes with the mask off are not read
+      vfloat8 r = zero; // masked-off lanes keep this zero
+#if defined(__AVX512VL__)
+      return _mm256_mmask_i32gather_ps(r, mask, index, ptr, scale);
+#elif defined(__AVX2__) && !defined(__aarch64__)
+      return _mm256_mask_i32gather_ps(r, ptr, index, mask, scale);
+#else
+      if (likely(mask[0])) r[0] = *(float*)(((int8_t*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(float*)(((int8_t*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(float*)(((int8_t*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(float*)(((int8_t*)ptr)+scale*index[3]);
+      if (likely(mask[4])) r[4] = *(float*)(((int8_t*)ptr)+scale*index[4]);
+      if (likely(mask[5])) r[5] = *(float*)(((int8_t*)ptr)+scale*index[5]);
+      if (likely(mask[6])) r[6] = *(float*)(((int8_t*)ptr)+scale*index[6]);
+      if (likely(mask[7])) r[7] = *(float*)(((int8_t*)ptr)+scale*index[7]);
+      return r;
+    #endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vfloat8& v) // writes v[k] at byte offset scale*ofs[k] from ptr
+    {
+#if defined(__AVX512VL__)
+      _mm256_i32scatter_ps((float*)ptr, ofs, v, scale);
+#else
+      *(float*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
+      *(float*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
+      *(float*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
+      *(float*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
+      *(float*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
+      *(float*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
+      *(float*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
+      *(float*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vfloat8& v) // as above, but only lanes with the mask on are written
+    {
+#if defined(__AVX512VL__)
+      _mm256_mask_i32scatter_ps((float*)ptr, mask, ofs, v, scale);
+#else
+      if (likely(mask[0])) *(float*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
+      if (likely(mask[1])) *(float*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
+      if (likely(mask[2])) *(float*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
+      if (likely(mask[3])) *(float*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
+      if (likely(mask[4])) *(float*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
+      if (likely(mask[5])) *(float*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
+      if (likely(mask[6])) *(float*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
+      if (likely(mask[7])) *(float*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    static __forceinline void store(const vboolf8& mask, int8_t* ptr, const vint8& ofs, const vfloat8& v) { // int8_t* base: ofs is used as byte offsets (scale 1)
+      scatter<1>(mask,ptr,ofs,v);
+    }
+    static __forceinline void store(const vboolf8& mask, float* ptr, const vint8& ofs, const vfloat8& v) { // float* base: ofs is multiplied by 4 (scale 4)
+      scatter<4>(mask,ptr,ofs,v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const float& operator [](size_t index) const { assert(index < 8); return f[index]; } // lane access through the float view of the union
+    __forceinline       float& operator [](size_t index)       { assert(index < 8); return f[index]; }
+  };
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat8 asFloat(const vint8&   a) { return _mm256_castsi256_ps(a); } // bit reinterpretation, no value conversion
+  __forceinline vint8   asInt  (const vfloat8& a) { return _mm256_castps_si256(a); } // bit reinterpretation, no value conversion
+
+  __forceinline vint8   toInt  (const vfloat8& a) { return vint8(a); } // uses a vint8 constructor defined elsewhere (presumably a numeric conversion - confirm)
+  __forceinline vfloat8 toFloat(const vint8&   a) { return vfloat8(a); } // presumably resolves to the explicit __m256i constructor (int32 -> float conversion) - confirm
+
+  __forceinline vfloat8 operator +(const vfloat8& a) { return a; } // unary plus: identity
+#if !defined(__aarch64__) // unary minus, x86 form
+  __forceinline vfloat8 operator -(const vfloat8& a) {
+    const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); // sign bit of every lane
+    return _mm256_xor_ps(a, mask); // XOR flips the sign bit and leaves the other bits unchanged
+  }
+#else // aarch64 form: negate each 128-bit half with NEON
+  __forceinline vfloat8 operator -(const vfloat8& a) {
+      __m256 res;
+      res.lo = vnegq_f32(a.v.lo);
+      res.hi = vnegq_f32(a.v.hi);
+      return res;
+}
+#endif
+
+#if !defined(__aarch64__) // absolute value, x86 form
+__forceinline vfloat8 abs(const vfloat8& a) {
+  const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)); // every bit except the sign bit
+  return _mm256_and_ps(a, mask); // AND clears the sign bit of each lane
+}
+#else // aarch64 form: vabsq_f32 on each 128-bit half
+__forceinline vfloat8 abs(const vfloat8& a) {
+    __m256 res;
+    res.lo = vabsq_f32(a.v.lo);
+    res.hi = vabsq_f32(a.v.hi);
+    return res;
+}
+#endif
+
+#if !defined(__aarch64__) // sign(): +1 or -1 per lane
+  __forceinline vfloat8 sign   (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmp_ps(a, vfloat8(zero), _CMP_NGE_UQ)); } // -1 where "not (a >= 0)" holds (unordered predicate, so NaN lanes give -1), otherwise +1
+#else
+  __forceinline vfloat8 sign   (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmplt_ps(a, vfloat8(zero))); } // -1 where a < 0, otherwise +1; NOTE(review): NaN handling may differ from the x86 branch - confirm against _mm256_cmplt_ps
+#endif
+  __forceinline vfloat8 signmsk(const vfloat8& a) { return _mm256_and_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); } // keeps only the sign bit of each lane
+
+
+  static __forceinline vfloat8 rcp(const vfloat8& a) // per-lane reciprocal: exact divide on iOS/aarch64, refined hardware estimate elsewhere
+  {
+#if defined(BUILD_IOS) && defined(__aarch64__)
+    // ios devices are faster doing full divide, no need for NR fixup
+    vfloat8 ret;
+    const float32x4_t one = vdupq_n_f32(1.0f);
+    ret.v.lo = vdivq_f32(one, a.v.lo);
+    ret.v.hi = vdivq_f32(one, a.v.hi);
+    return ret;
+#else // estimate + refinement; only compiled when the exact-divide branch above is not, so no code follows a return
+#if defined(__AVX512VL__)
+    const vfloat8 r = _mm256_rcp14_ps(a);
+#else
+    const vfloat8 r = _mm256_rcp_ps(a);
+#endif
+      
+#if defined(__AVX2__) //&& !defined(aarch64)
+    return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vfloat8(2.0f))); // one Newton-Raphson step r*(2 - r*a), using a fused multiply-add
+#else
+    return _mm256_mul_ps(r, _mm256_sub_ps(vfloat8(2.0f), _mm256_mul_ps(r, a))); // same step with separate multiply and subtract
+#endif
+#endif
+  }
+  __forceinline vfloat8 sqr (const vfloat8& a) { return _mm256_mul_ps(a,a); } // a*a per lane
+  __forceinline vfloat8 sqrt(const vfloat8& a) { return _mm256_sqrt_ps(a); } // square root per lane
+
+  static __forceinline vfloat8 rsqrt(const vfloat8& a) // per-lane 1/sqrt(a): hardware estimate followed by one refinement step
+  {
+#if defined(__AVX512VL__)
+    const vfloat8 r = _mm256_rsqrt14_ps(a);
+#else
+    const vfloat8 r = _mm256_rsqrt_ps(a);
+#endif
+
+#if defined(__AVX2__) // both branches evaluate 1.5*r + (-0.5*a*r)*(r*r); this one fuses the final multiply-add
+    return _mm256_fmadd_ps(_mm256_set1_ps(1.5f), r,
+                           _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r))); 
+#else
+    return _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(1.5f), r),
+                         _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r)));
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat8 operator +(const vfloat8& a, const vfloat8& b) { return _mm256_add_ps(a, b); } // lane-wise arithmetic; the float overloads below broadcast the scalar first
+  __forceinline vfloat8 operator +(const vfloat8& a, float          b) { return a + vfloat8(b); }
+  __forceinline vfloat8 operator +(float          a, const vfloat8& b) { return vfloat8(a) + b; }
+
+  __forceinline vfloat8 operator -(const vfloat8& a, const vfloat8& b) { return _mm256_sub_ps(a, b); }
+  __forceinline vfloat8 operator -(const vfloat8& a, float          b) { return a - vfloat8(b); }
+  __forceinline vfloat8 operator -(float          a, const vfloat8& b) { return vfloat8(a) - b; }
+
+  __forceinline vfloat8 operator *(const vfloat8& a, const vfloat8& b) { return _mm256_mul_ps(a, b); }
+  __forceinline vfloat8 operator *(const vfloat8& a, float          b) { return a * vfloat8(b); }
+  __forceinline vfloat8 operator *(float          a, const vfloat8& b) { return vfloat8(a) * b; }
+
+  __forceinline vfloat8 operator /(const vfloat8& a, const vfloat8& b) { return _mm256_div_ps(a, b); }
+  __forceinline vfloat8 operator /(const vfloat8& a, float          b) { return a / vfloat8(b); }
+  __forceinline vfloat8 operator /(float          a, const vfloat8& b) { return vfloat8(a) / b; }
+
+  __forceinline vfloat8 operator &(const vfloat8& a, const vfloat8& b) { return _mm256_and_ps(a,b); } // bitwise operators act on the raw lane bits
+  __forceinline vfloat8 operator |(const vfloat8& a, const vfloat8& b) { return _mm256_or_ps(a,b); }
+  __forceinline vfloat8 operator ^(const vfloat8& a, const vfloat8& b) { return _mm256_xor_ps(a,b); }
+  __forceinline vfloat8 operator ^(const vfloat8& a, const vint8&   b) { return _mm256_xor_ps(a,_mm256_castsi256_ps(b)); } // b's integer bits are reinterpreted, not converted
+
+  __forceinline vfloat8 min(const vfloat8& a, const vfloat8& b) { return _mm256_min_ps(a, b); } // lane-wise float minimum
+  __forceinline vfloat8 min(const vfloat8& a, float          b) { return _mm256_min_ps(a, vfloat8(b)); }
+  __forceinline vfloat8 min(float          a, const vfloat8& b) { return _mm256_min_ps(vfloat8(a), b); }
+
+  __forceinline vfloat8 max(const vfloat8& a, const vfloat8& b) { return _mm256_max_ps(a, b); } // lane-wise float maximum
+  __forceinline vfloat8 max(const vfloat8& a, float          b) { return _mm256_max_ps(a, vfloat8(b)); }
+  __forceinline vfloat8 max(float          a, const vfloat8& b) { return _mm256_max_ps(vfloat8(a), b); }
+
+  /* need "static __forceinline" for MSVC, otherwise we'll link the wrong version in debug mode */
+#if defined(__AVX2__) // integer min/max applied to the raw float bit patterns
+
+  static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) { // minimum of the lane bits compared as signed 32-bit integers
+    const vint8 ai = _mm256_castps_si256(a);
+    const vint8 bi = _mm256_castps_si256(b);
+    const vint8 ci = _mm256_min_epi32(ai,bi);
+    return _mm256_castsi256_ps(ci);
+  }
+
+  static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) { // maximum of the lane bits compared as signed 32-bit integers
+    const vint8 ai = _mm256_castps_si256(a);
+    const vint8 bi = _mm256_castps_si256(b);
+    const vint8 ci = _mm256_max_epi32(ai,bi);
+    return _mm256_castsi256_ps(ci);
+  }
+
+  static __forceinline vfloat8 minui(const vfloat8& a, const vfloat8& b) { // minimum of the lane bits compared as unsigned 32-bit integers
+    const vint8 ai = _mm256_castps_si256(a);
+    const vint8 bi = _mm256_castps_si256(b);
+    const vint8 ci = _mm256_min_epu32(ai,bi);
+    return _mm256_castsi256_ps(ci);
+  }
+
+  static __forceinline vfloat8 maxui(const vfloat8& a, const vfloat8& b) { // maximum of the lane bits compared as unsigned 32-bit integers
+    const vint8 ai = _mm256_castps_si256(a);
+    const vint8 bi = _mm256_castps_si256(b);
+    const vint8 ci = _mm256_max_epu32(ai,bi);
+    return _mm256_castsi256_ps(ci);
+  }
+
+#else // without AVX2 only mini/maxi are provided, via the vint8 min/max defined elsewhere; no minui/maxui on this branch
+
+  static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) {
+    return asFloat(min(asInt(a),asInt(b)));
+  }
+
+  static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) {
+    return asFloat(max(asInt(a),asInt(b)));
+  }
+
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__) // fused multiply-add intrinsics
+  static __forceinline vfloat8 madd  (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmadd_ps(a,b,c); } //  a*b + c
+  static __forceinline vfloat8 msub  (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmsub_ps(a,b,c); } //  a*b - c
+  static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmadd_ps(a,b,c); } // -(a*b) + c
+  static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmsub_ps(a,b,c); } // -(a*b) - c
+#else // same formulas built from separate multiply and add/subtract operators
+  static __forceinline vfloat8 madd  (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b+c; }
+  static __forceinline vfloat8 msub  (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b-c; }
+  static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b+c;}
+  static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b-c; }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat8& operator +=(vfloat8& a, const vfloat8& b) { return a = a + b; } // each compound operator forwards to the matching binary operator and returns a
+  __forceinline vfloat8& operator +=(vfloat8& a, float          b) { return a = a + b; }
+
+  __forceinline vfloat8& operator -=(vfloat8& a, const vfloat8& b) { return a = a - b; }
+  __forceinline vfloat8& operator -=(vfloat8& a, float          b) { return a = a - b; }
+
+  __forceinline vfloat8& operator *=(vfloat8& a, const vfloat8& b) { return a = a * b; }
+  __forceinline vfloat8& operator *=(vfloat8& a, float          b) { return a = a * b; }
+
+  __forceinline vfloat8& operator /=(vfloat8& a, const vfloat8& b) { return a = a / b; }
+  __forceinline vfloat8& operator /=(vfloat8& a, float          b) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__) // AVX-512VL: comparisons produce a mask value via _mm256_cmp_ps_mask
+  static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_EQ); }
+  static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_NE); }
+  static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LT); }
+  static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GE); }
+  static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GT); }
+  static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LE); }
+
+  static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { // lane = t where m is set, otherwise f
+    return _mm256_mask_blend_ps(m, f, t);
+  }
+#elif !defined(__aarch64__) // plain AVX: comparisons produce a per-lane mask register via _mm256_cmp_ps
+  __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);  } // ordered: false for NaN operands
+  __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); } // unordered: true for NaN operands
+  __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS);  } // ordered: false for NaN operands
+  __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } // "not less-than", unordered: true for NaN operands
+  __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } // "not less-or-equal", unordered: true for NaN operands
+  __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS);  } // ordered: false for NaN operands
+
+  __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { // lane = t where m is set, otherwise f
+    return _mm256_blendv_ps(f, t, m); 
+  }
+#else // aarch64: named _mm256_cmpXX_ps helpers (declared outside this file) are used instead of _mm256_cmp_ps
+  __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmpeq_ps(a, b);  }
+  __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpneq_ps(a, b); }
+  __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmplt_ps(a, b);  }
+  __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpge_ps(a, b);  }
+  __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmpgt_ps(a, b);  }
+  __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmple_ps(a, b);  }
+
+  __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { // lane = t where m is set, otherwise f
+    return _mm256_blendv_ps(f, t, m);
+  }
+
+#endif
+
+  template<int mask>
+    __forceinline vfloat8 select(const vfloat8& t, const vfloat8& f) { // compile-time blend: lane k takes t when bit k of mask is 1, otherwise f
+    return _mm256_blend_ps(f, t, mask);
+  }
+
+  __forceinline vboolf8 operator ==(const vfloat8& a, const float&   b) { return a == vfloat8(b); } // scalar overloads broadcast the float and reuse the vector comparison
+  __forceinline vboolf8 operator ==(const float&   a, const vfloat8& b) { return vfloat8(a) == b; }
+
+  __forceinline vboolf8 operator !=(const vfloat8& a, const float&   b) { return a != vfloat8(b); }
+  __forceinline vboolf8 operator !=(const float&   a, const vfloat8& b) { return vfloat8(a) != b; }
+
+  __forceinline vboolf8 operator < (const vfloat8& a, const float&   b) { return a <  vfloat8(b); }
+  __forceinline vboolf8 operator < (const float&   a, const vfloat8& b) { return vfloat8(a) <  b; }
+
+  __forceinline vboolf8 operator >=(const vfloat8& a, const float&   b) { return a >= vfloat8(b); }
+  __forceinline vboolf8 operator >=(const float&   a, const vfloat8& b) { return vfloat8(a) >= b; }
+
+  __forceinline vboolf8 operator > (const vfloat8& a, const float&   b) { return a >  vfloat8(b); }
+  __forceinline vboolf8 operator > (const float&   a, const vfloat8& b) { return vfloat8(a) >  b; }
+
+  __forceinline vboolf8 operator <=(const vfloat8& a, const float&   b) { return a <= vfloat8(b); }
+  __forceinline vboolf8 operator <=(const float&   a, const vfloat8& b) { return vfloat8(a) <= b; }
+
+  __forceinline vboolf8 eq(const vfloat8& a, const vfloat8& b) { return a == b; } // named aliases for the comparison operators
+  __forceinline vboolf8 ne(const vfloat8& a, const vfloat8& b) { return a != b; }
+  __forceinline vboolf8 lt(const vfloat8& a, const vfloat8& b) { return a <  b; }
+  __forceinline vboolf8 ge(const vfloat8& a, const vfloat8& b) { return a >= b; }
+  __forceinline vboolf8 gt(const vfloat8& a, const vfloat8& b) { return a >  b; }
+  __forceinline vboolf8 le(const vfloat8& a, const vfloat8& b) { return a <= b; }
+
+#if defined(__AVX512VL__) // masked comparisons: the mask is passed straight to the compare intrinsic
+  static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); }
+  static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); }
+  static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); }
+  static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); }
+  static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); }
+  static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); }
+#else // otherwise: full comparison ANDed with the mask
+  static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a == b); }
+  static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a != b); }
+  static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a <  b); }
+  static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a >= b); }
+  static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a >  b); }
+  static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a <= b); }
+#endif
+
+  __forceinline vfloat8 lerp(const vfloat8& a, const vfloat8& b, const vfloat8& t) { // linear interpolation per lane: a + t*(b-a)
+    return madd(t,b-a,a);
+  }
+
+  __forceinline bool isvalid (const vfloat8& v) { // true only if every lane lies strictly inside (-FLT_LARGE, +FLT_LARGE)
+    return all((v > vfloat8(-FLT_LARGE)) & (v < vfloat8(+FLT_LARGE)));
+  }
+
+  __forceinline bool is_finite (const vfloat8& a) { // true only if every lane lies inside [-FLT_MAX, +FLT_MAX]
+    return all((a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX)));
+  }
+
+  __forceinline bool is_finite (const vboolf8& valid, const vfloat8& a) { // same range test, passed to the two-argument all() together with valid
+    return all(valid, (a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX)));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if !defined(__aarch64__) // x86: all four rounding modes through _mm256_round_ps
+  __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF    ); }
+  __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF    ); }
+  __forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO       ); }
+  __forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
+#else // aarch64: only floor and ceil are defined here; NOTE(review): trunc/round have no vfloat8 overload on this branch
+  __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_floor_ps(a); }
+  __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_ceil_ps(a); }
+#endif
+
+
+  __forceinline vfloat8 frac (const vfloat8& a) { return a-floor(a); } // fractional part relative to floor (a - floor(a))
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat8 unpacklo(const vfloat8& a, const vfloat8& b) { return _mm256_unpacklo_ps(a, b); } // interleaves the low two elements of a and b within each 128-bit half
+  __forceinline vfloat8 unpackhi(const vfloat8& a, const vfloat8& b) { return _mm256_unpackhi_ps(a, b); } // interleaves the high two elements of a and b within each 128-bit half
+
+  template<int i>
+  __forceinline vfloat8 shuffle(const vfloat8& v) { // replicates element i of each 128-bit half across that half
+    return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i));
+  }
+
+  template<int i0, int i1>
+  __forceinline vfloat8 shuffle4(const vfloat8& v) { // 128-bit granularity: low half = half i0 of v, high half = half i1 of v
+    return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1>
+  __forceinline vfloat8 shuffle4(const vfloat8& a, const vfloat8& b) { // selectors 0/1 pick the halves of a, 2/3 the halves of b
+    return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat8 shuffle(const vfloat8& v) { // same 4-element permutation in both 128-bit halves: out[k] = in[ik]
+    return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat8 shuffle(const vfloat8& a, const vfloat8& b) { // per 128-bit half: elements 0,1 come from a[i0],a[i1]; elements 2,3 from b[i2],b[i3]
+    return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+#if !defined(__aarch64__) // dedicated duplicate instructions for three common patterns (x86 only)
+  template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v); } // duplicates the even elements
+  template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v); } // duplicates the odd elements
+  template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } // duplicates the low 64 bits of each 128-bit half
+#endif
+
+  __forceinline vfloat8 broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); } // *ptr replicated to all 8 lanes
+  template<size_t i> __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a, b, i); } // copy of a with 128-bit half i replaced by b
+  template<size_t i> __forceinline vfloat4 extract4   (const vfloat8& a) { return _mm256_extractf128_ps(a, i); } // 128-bit half i of a
+  template<>         __forceinline vfloat4 extract4<0>(const vfloat8& a) { return _mm256_castps256_ps128(a);   } // low half via a cast intrinsic
+
+  __forceinline float toScalar(const vfloat8& v) { return _mm_cvtss_f32(_mm256_castps256_ps128(v)); } // lane 0 as a scalar
+
+  __forceinline vfloat8 assign(const vfloat4& a) { return _mm256_castps128_ps256(a); } // a in the low half; the upper 128 bits are not set by this cast
+
+#if defined (__AVX2__) && !defined(__aarch64__) // full cross-lane permute needs AVX2 and is not provided on aarch64
+  __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) { // lane k of the result is lane index[k] of a
+    return _mm256_permutevar8x32_ps(a, index);
+  }
+#endif
+
+#if defined(__AVX512VL__) // only available with AVX-512VL
+  template<int i>
+  static __forceinline vfloat8 align_shift_right(const vfloat8& a, const vfloat8& b) { // treats a:b as 16 elements (a on top), shifts right by i elements, keeps the low 8
+    return _mm256_castsi256_ps(_mm256_alignr_epi32(_mm256_castps_si256(a), _mm256_castps_si256(b), i));
+  }  
+#endif
+
+#if defined (__AVX_I__) // half-float conversions, compiled only when __AVX_I__ is defined
+  template<const int mode>
+  static __forceinline vint4 convert_to_hf16(const vfloat8& a) { // 8 floats -> 8 half floats held in a vint4; mode is forwarded to _mm256_cvtps_ph
+    return _mm256_cvtps_ph(a, mode);
+  }
+
+  static __forceinline vfloat8 convert_from_hf16(const vint4& a) { // 8 half floats held in a vint4 -> 8 floats
+    return _mm256_cvtph_ps(a);
+  }
+#endif
+
+  __forceinline vfloat4 broadcast4f(const vfloat8& a, const size_t k) { // passes the address of lane k of a to vfloat4::broadcast; operator[] asserts k < 8
+    return vfloat4::broadcast(&a[k]);
+  }
+
+  __forceinline vfloat8 broadcast8f(const vfloat8& a, const size_t k) { // lane k of a replicated to all 8 lanes; operator[] asserts k < 8
+    return vfloat8::broadcast(&a[k]);
+  }
+
+#if defined(__AVX512VL__) // moves every element one lane down: result lane k = x[k+1]
+  static __forceinline vfloat8 shift_right_1(const vfloat8& x) {
+    return align_shift_right<1>(zero,x); // lane 7 comes from the `zero` operand
+  }
+#else
+  static __forceinline vfloat8 shift_right_1(const vfloat8& x) { // NOTE(review): this branch puts x[0] in lane 7 (a rotation), unlike the AVX-512VL branch - confirm callers ignore lane 7
+    const vfloat8 t0 = shuffle<1,2,3,0>(x); // rotate within each 128-bit half: (x1,x2,x3,x0 | x5,x6,x7,x4)
+    const vfloat8 t1 = shuffle4<1,0>(t0); // swap the halves: (x5,x6,x7,x4 | x1,x2,x3,x0)
+    return _mm256_blend_ps(t0,t1,0x88); // lanes 3 and 7 from t1: (x1,x2,x3,x4,x5,x6,x7,x0)
+  }
+#endif
+
+  __forceinline vint8 floori(const vfloat8& a) { // floor(a), then handed to a vint8 constructor defined elsewhere (presumably a float -> int conversion - confirm)
+    return vint8(floor(a));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Transpose
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3)
+  { // 4x4 transpose applied to each 128-bit half independently; every input is read before any output is written
+    const vfloat8 even_lo = unpacklo(r0,r2);
+    const vfloat8 odd_lo  = unpacklo(r1,r3);
+    const vfloat8 even_hi = unpackhi(r0,r2);
+    const vfloat8 odd_hi  = unpackhi(r1,r3);
+    c0 = unpacklo(even_lo,odd_lo);
+    c1 = unpackhi(even_lo,odd_lo);
+    c2 = unpacklo(even_hi,odd_hi);
+    c3 = unpackhi(even_hi,odd_hi);
+  }
+
+  __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2)
+  { /* same as the four-output overload, but the fourth output column is not produced */
+    const vfloat8 lo02 = unpacklo(r0,r2);
+    const vfloat8 hi02 = unpackhi(r0,r2);
+    const vfloat8 lo13 = unpacklo(r1,r3);
+    const vfloat8 hi13 = unpackhi(r1,r3);
+    c0 = unpacklo(lo02,lo13);
+    c1 = unpackhi(lo02,lo13);
+    c2 = unpacklo(hi02,hi13);
+  }
+
+  __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7,
+                               vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3, vfloat8& c4, vfloat8& c5, vfloat8& c6, vfloat8& c7)
+  { /* transpose rows 0-3 and rows 4-7 with the 4-row overload, then recombine their 128-bit halves with shuffle4 */
+    vfloat8 top0,top1,top2,top3; transpose(r0,r1,r2,r3,top0,top1,top2,top3);
+    vfloat8 bot0,bot1,bot2,bot3; transpose(r4,r5,r6,r7,bot0,bot1,bot2,bot3);
+    c0 = shuffle4<0,2>(top0,bot0);
+    c1 = shuffle4<0,2>(top1,bot1);
+    c2 = shuffle4<0,2>(top2,bot2);
+    c3 = shuffle4<0,2>(top3,bot3);
+    c4 = shuffle4<1,3>(top0,bot0);
+    c5 = shuffle4<1,3>(top1,bot1);
+    c6 = shuffle4<1,3>(top2,bot2);
+    c7 = shuffle4<1,3>(top3,bot3);
+  }
+
+  __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7,
+                               vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3)
+  { // pairs row k with row k+4 into one vfloat8 and forwards to the 4-row vfloat8 transpose
+    transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2, c3);
+  }
+
+  __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7,
+                               vfloat8& c0, vfloat8& c1, vfloat8& c2)
+  { // same pairing, forwarding to the three-output overload
+    transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+#if !defined(__aarch64__)
+  __forceinline vfloat8 vreduce_min2(const vfloat8& v) { return min(v,shuffle<1,0,3,2>(v)); } // combines v with a shuffled copy of itself; presumably leaves each adjacent pair holding its minimum
+  __forceinline vfloat8 vreduce_min4(const vfloat8& v) { vfloat8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } // builds on the 2-wide step
+  __forceinline vfloat8 vreduce_min (const vfloat8& v) { vfloat8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } // builds on the 4-wide step; result is still a vector
+
+  __forceinline vfloat8 vreduce_max2(const vfloat8& v) { return max(v,shuffle<1,0,3,2>(v)); } // max family: same three steps with max
+  __forceinline vfloat8 vreduce_max4(const vfloat8& v) { vfloat8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+  __forceinline vfloat8 vreduce_max (const vfloat8& v) { vfloat8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); }
+
+  __forceinline vfloat8 vreduce_add2(const vfloat8& v) { return v + shuffle<1,0,3,2>(v); } // add family: same three steps with +
+  __forceinline vfloat8 vreduce_add4(const vfloat8& v) { vfloat8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); }
+  __forceinline vfloat8 vreduce_add (const vfloat8& v) { vfloat8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); }
+
+  __forceinline float reduce_min(const vfloat8& v) { return toScalar(vreduce_min(v)); } // scalar forms read element 0 of the vector result
+  __forceinline float reduce_max(const vfloat8& v) { return toScalar(vreduce_max(v)); }
+  __forceinline float reduce_add(const vfloat8& v) { return toScalar(vreduce_add(v)); }
+#else
+  __forceinline float reduce_min(const vfloat8& v) { return vminvq_f32(_mm_min_ps(v.v.lo,v.v.hi)); } // aarch64: combine the lo/hi halves, then a NEON across-vector reduction
+  __forceinline float reduce_max(const vfloat8& v) { return vmaxvq_f32(_mm_max_ps(v.v.lo,v.v.hi)); }
+  __forceinline vfloat8 vreduce_min(const vfloat8& v) { return vfloat8(reduce_min(v)); } // vector forms are built from the scalar result
+  __forceinline vfloat8 vreduce_max(const vfloat8& v) { return vfloat8(reduce_max(v)); }
+  __forceinline float reduce_add(const vfloat8& v) { return vaddvq_f32(_mm_add_ps(v.v.lo,v.v.hi)); } // note: this branch defines no vreduce_add and no *2/*4 helpers
+
+#endif
+  __forceinline size_t select_min(const vboolf8& valid, const vfloat8& v)
+  { /* index of the lowest-numbered valid lane equal to the minimum; if no lane matches, the first valid lane */
+    const vfloat8 masked = select(valid,v,vfloat8(pos_inf));
+    const vbool8 is_min = valid & (masked == vreduce_min(masked));
+    return bsf(movemask(any(is_min) ? is_min : valid));
+  }
+
+  __forceinline size_t select_max(const vboolf8& valid, const vfloat8& v)
+  { /* index of the lowest-numbered valid lane equal to the maximum; if no lane matches, the first valid lane */
+    const vfloat8 masked = select(valid,v,vfloat8(neg_inf));
+    const vbool8 is_max = valid & (masked == vreduce_max(masked));
+    return bsf(movemask(any(is_max) ? is_max : valid));
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators (pairs of Vec3fa's)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  //__forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) {
+  //  return vreduce_add4(a*b);
+  //}
+
+  __forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) { // 3-component dot product, computed separately in each 128-bit half
+    return _mm256_dp_ps(a,b,0x7F); // 0x7: multiply elements 0..2 only; 0xF: write the sum to all four elements of the half
+  }
+
+  __forceinline vfloat8 cross(const vfloat8& a, const vfloat8& b)
+  {
+    /* shuffle<1,2,0,3> is applied to b, to a and to the combined result; */
+    /* the inputs are used directly rather than through local copies */
+    const vfloat8 b_shuf = shuffle<1,2,0,3>(b);
+    const vfloat8 a_shuf = shuffle<1,2,0,3>(a);
+    return shuffle<1,2,0,3>(msub(a,b_shuf,a_shuf*b));
+  }
+
+  //__forceinline float sqr_length (const vfloat<8>& a) { return dot(a,a); }
+  //__forceinline float rcp_length (const vfloat<8>& a) { return rsqrt(dot(a,a)); }
+  //__forceinline float rcp_length2(const vfloat<8>& a) { return rcp(dot(a,a)); }
+  //__forceinline float length     (const vfloat<8>& a) { return sqrt(dot(a,a)); }
+  __forceinline vfloat<8> normalize(const vfloat<8>& a) { return a*rsqrt(dot(a,a)); } // scales a by rsqrt(dot(a,a)); there is no zero-length guard here
+  //__forceinline float distance(const vfloat<8>& a, const vfloat<8>& b) { return length(a-b); }
+  //__forceinline float halfArea(const vfloat<8>& d) { return madd(d.x,(d.y+d.z),d.y*d.z); }
+  //__forceinline float area    (const vfloat<8>& d) { return 2.0f*halfArea(d); }
+  //__forceinline vfloat<8> reflect(const vfloat<8>& V, const vfloat<8>& N) { return 2.0f*dot(V,N)*N-V; }
+
+  //__forceinline vfloat<8> normalize_safe(const vfloat<8>& a) {
+  //  const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
+  //}
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// In Register Sorting
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat8 sort_ascending(const vfloat8& v)
+  { // six stages; each builds a shuffled copy, takes element-wise min and max, and merges them with select<mask>
+    const vfloat8 a0 = v;
+    const vfloat8 b0 = shuffle<1,0,3,2>(a0); // stage 1
+    const vfloat8 c0 = min(a0,b0);
+    const vfloat8 d0 = max(a0,b0);
+    const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); // NOTE(review): which operand a set mask bit picks is defined by select<>, outside this section
+    const vfloat8 b1 = shuffle<2,3,0,1>(a1); // stage 2
+    const vfloat8 c1 = min(a1,b1);
+    const vfloat8 d1 = max(a1,b1);
+    const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1);
+    const vfloat8 b2 = shuffle<1,0,3,2>(a2); // stage 3
+    const vfloat8 c2 = min(a2,b2);
+    const vfloat8 d2 = max(a2,b2);
+    const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2);
+    const vfloat8 b3 = shuffle4<1,0>(a3); // stage 4: the only stage using shuffle4
+    const vfloat8 c3 = min(a3,b3);
+    const vfloat8 d3 = max(a3,b3);
+    const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3);
+    const vfloat8 b4 = shuffle<2,3,0,1>(a4); // stage 5
+    const vfloat8 c4 = min(a4,b4);
+    const vfloat8 d4 = max(a4,b4);
+    const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4);
+    const vfloat8 b5 = shuffle<1,0,3,2>(a5); // stage 6
+    const vfloat8 c5 = min(a5,b5);
+    const vfloat8 d5 = max(a5,b5);
+    const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5);
+    return a6; // result of the final stage
+  }
+
+   __forceinline vfloat8 sort_descending(const vfloat8& v)
+  { // same shuffles and select masks as sort_ascending, with min and max exchanged in every stage
+    const vfloat8 a0 = v;
+    const vfloat8 b0 = shuffle<1,0,3,2>(a0); // stage 1
+    const vfloat8 c0 = max(a0,b0);
+    const vfloat8 d0 = min(a0,b0);
+    const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0);
+    const vfloat8 b1 = shuffle<2,3,0,1>(a1); // stage 2
+    const vfloat8 c1 = max(a1,b1);
+    const vfloat8 d1 = min(a1,b1);
+    const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1);
+    const vfloat8 b2 = shuffle<1,0,3,2>(a2); // stage 3
+    const vfloat8 c2 = max(a2,b2);
+    const vfloat8 d2 = min(a2,b2);
+    const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2);
+    const vfloat8 b3 = shuffle4<1,0>(a3); // stage 4: the only stage using shuffle4
+    const vfloat8 c3 = max(a3,b3);
+    const vfloat8 d3 = min(a3,b3);
+    const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3);
+    const vfloat8 b4 = shuffle<2,3,0,1>(a4); // stage 5
+    const vfloat8 c4 = max(a4,b4);
+    const vfloat8 d4 = min(a4,b4);
+    const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4);
+    const vfloat8 b5 = shuffle<1,0,3,2>(a5); // stage 6
+    const vfloat8 c5 = max(a5,b5);
+    const vfloat8 d5 = min(a5,b5);
+    const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5);
+    return a6; // result of the final stage
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat8& a) { // prints the 8 elements as "<e0, e1, ..., e7>"
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vint16_avx512.h b/thirdparty/embree-aarch64/common/simd/vint16_avx512.h
new file mode 100644
index 0000000000..3249bc2b45
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vint16_avx512.h
@@ -0,0 +1,490 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{ 
+  /* 16-wide AVX-512 integer type */
+  template<>
+  struct vint<16> // 16 x 32-bit int in one __m512i; i[] views the same storage element-wise
+  {
+    ALIGNED_STRUCT_(64);
+    
+    typedef vboolf16 Bool;
+    typedef vint16   Int;
+    typedef vfloat16 Float;
+
+    enum  { size = 16 }; // number of SIMD elements
+    union {              // data
+      __m512i v; 
+      int i[16]; 
+    };
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+       
+    __forceinline vint() {} // leaves v uninitialized
+    __forceinline vint(const vint16& t) { v = t.v; }
+    __forceinline vint16& operator =(const vint16& f) { v = f.v; return *this; }
+
+    __forceinline vint(const __m512i& t) { v = t; }
+    __forceinline operator __m512i() const { return v; }
+    __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } // low 256 bits only
+
+    __forceinline vint(int i) { // every lane = i
+      v = _mm512_set1_epi32(i);
+    }
+    
+    __forceinline vint(int a, int b, int c, int d) { // lanes repeat a,b,c,d four times
+      v = _mm512_set4_epi32(d,c,b,a);      
+    }
+
+    __forceinline vint(int a0 , int a1 , int a2 , int a3,
+                       int a4 , int a5 , int a6 , int a7,
+                       int a8 , int a9 , int a10, int a11,
+                       int a12, int a13, int a14, int a15)
+    { // lane k = ak (set_epi32 takes its arguments highest lane first)
+      v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0);
+    }
+
+    __forceinline vint(const vint4& i) { // i replicated into all four 128-bit blocks
+      v = _mm512_broadcast_i32x4(i);
+    }
+
+    __forceinline vint(const vint4& a, const vint4& b, const vint4& c, const vint4& d) { // 128-bit blocks 0..3 = a,b,c,d
+      v = _mm512_castsi128_si512(a);
+      v = _mm512_inserti32x4(v, b, 1);
+      v = _mm512_inserti32x4(v, c, 2);
+      v = _mm512_inserti32x4(v, d, 3);
+    }
+
+    __forceinline vint(const vint8& i) { // i replicated into both 256-bit halves (via a 64x4 broadcast on reinterpreted bits)
+      v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i))));
+    }
+
+    __forceinline vint(const vint8& a, const vint8& b) { // low half = a, high half = b
+      v = _mm512_castsi256_si512(a);
+      v = _mm512_inserti64x4(v, b, 1);
+    }
+   
+    __forceinline explicit vint(const __m512& f) { // numeric float -> int32 conversion, not a bit cast
+      v = _mm512_cvtps_epi32(f);
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vint(ZeroTy)   : v(_mm512_setzero_epi32()) {}
+    __forceinline vint(OneTy)    : v(_mm512_set1_epi32(1)) {}
+    __forceinline vint(PosInfTy) : v(_mm512_set1_epi32(pos_inf)) {}
+    __forceinline vint(NegInfTy) : v(_mm512_set1_epi32(neg_inf)) {}
+    __forceinline vint(StepTy)   : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} // lane k = k
+    __forceinline vint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} // lane k = 15-k
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vint16 load (const void* addr) { return _mm512_load_si512(addr); } // aligned load; the intrinsic takes const void*, so no cast is needed
+
+    static __forceinline vint16 load(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*)ptr)); } // 16 bytes, zero-extended to int32
+    static __forceinline vint16 load(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_load_si256((const __m256i*)ptr)); } // 16 shorts, zero-extended
+
+    static __forceinline vint16 loadu(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*)ptr)); } // unaligned variants of the above
+    static __forceinline vint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)ptr)); }
+
+    static __forceinline vint16 loadu(const void* addr) { return _mm512_loadu_si512(addr); }
+
+    static __forceinline vint16 load (const vboolf16& mask, const void* addr) { return _mm512_mask_load_epi32 (_mm512_setzero_epi32(),mask,addr); } // masked-off lanes are zero
+    static __forceinline vint16 loadu(const vboolf16& mask, const void* addr) { return _mm512_mask_loadu_epi32(_mm512_setzero_epi32(),mask,addr); }
+
+    static __forceinline void store (void* ptr, const vint16& v) { _mm512_store_si512 (ptr,v); }
+    static __forceinline void storeu(void* ptr, const vint16& v) { _mm512_storeu_si512(ptr,v); }
+ 
+    static __forceinline void store (const vboolf16& mask, void* addr, const vint16& v2) { _mm512_mask_store_epi32(addr,mask,v2); } // only lanes set in mask are written
+    static __forceinline void storeu(const vboolf16& mask, void* ptr,  const vint16& f) { _mm512_mask_storeu_epi32((int*)ptr,mask,f); }
+
+    static __forceinline void store_nt(void* __restrict__ ptr, const vint16& a) { _mm512_stream_si512((__m512i*)ptr,a); } // non-temporal (streaming) store
+
+    /* pass by value to avoid compiler generating inefficient code */
+    static __forceinline void storeu_compact(const vboolf16 mask, void* addr, vint16 reg) { // writes the active lanes contiguously to addr
+      _mm512_mask_compressstoreu_epi32(addr,mask,reg);
+    }
+
+    static __forceinline void storeu_compact_single(const vboolf16 mask, void* addr, vint16 reg) { // stores one 32-bit value taken from the compressed vector
+      //_mm512_mask_compressstoreu_epi32(addr,mask,reg);
+      *(float*)addr = mm512_cvtss_f32(_mm512_mask_compress_ps(_mm512_castsi512_ps(reg),mask,_mm512_castsi512_ps(reg))); // mm512_cvtss_f32 is declared elsewhere; presumably extracts element 0
+    }
+
+    static __forceinline vint16 compact64bit(const vboolf16& mask, vint16 &v) { // compresses v as eight 64-bit elements
+      return _mm512_mask_compress_epi64(v,mask,v);
+    }
+
+    static __forceinline vint16 compact(const vboolf16& mask, vint16 &v) { // active lanes packed to the front, remaining lanes keep v
+      return _mm512_mask_compress_epi32(v,mask,v);
+    }
+
+    static __forceinline vint16 compact(const vboolf16& mask, const vint16 &a, vint16 &b) { // active lanes of b packed to the front, remaining lanes from a
+      return _mm512_mask_compress_epi32(a,mask,b);
+    }
+
+    static __forceinline vint16 expand(const vboolf16& mask, const vint16& a, vint16& b) { // leading lanes of a spread into the active lanes, inactive lanes from b
+      return _mm512_mask_expand_epi32(b,mask,a);
+    }
+
+    template<int scale = 4>
+    static __forceinline vint16 gather(const int* ptr, const vint16& index) { // lane k read from byte offset index[k]*scale
+      return _mm512_i32gather_epi32(index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vint16 gather(const vboolf16& mask, const int* ptr, const vint16& index) { // masked-off lanes are undefined
+      return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vint16 gather(const vboolf16& mask, vint16& dest, const int* ptr, const vint16& index) { // masked-off lanes keep dest
+      return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(int* ptr, const vint16& index, const vint16& v) { // lane k written to byte offset index[k]*scale
+      _mm512_i32scatter_epi32((int*)ptr,index,v,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf16& mask, int* ptr, const vint16& index, const vint16& v) { // only lanes set in mask are written
+      _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale);
+    }
+
+    static __forceinline vint16 broadcast64bit(size_t v) { // v replicated into all eight 64-bit elements
+      return _mm512_set1_epi64(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline       int& operator [](size_t index)       { assert(index < 16); return i[index]; }
+    __forceinline const int& operator [](size_t index) const { assert(index < 16); return i[index]; }
+
+    __forceinline unsigned int uint    (size_t index) const { assert(index < 16); return ((const unsigned int*)i)[index]; } // lane reinterpreted as unsigned
+    __forceinline size_t&      uint64_t(size_t index) const { assert(index < 8);  return ((size_t*)i)[index]; } // 64-bit view; NOTE(review): returns a mutable reference from a const method
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf16 asBool(const vint16& a) { return _mm512_movepi32_mask(a); } // mask bit k = sign bit of lane k
+
+  __forceinline vint16 operator +(const vint16& a) { return a; }
+  __forceinline vint16 operator -(const vint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); } // negation computed as 0 - a
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint16 operator +(const vint16& a, const vint16& b) { return _mm512_add_epi32(a, b); }
+  __forceinline vint16 operator +(const vint16& a, int           b) { return a + vint16(b); } // scalar overloads broadcast the int and reuse the vector form
+  __forceinline vint16 operator +(int           a, const vint16& b) { return vint16(a) + b; }
+
+  __forceinline vint16 operator -(const vint16& a, const vint16& b) { return _mm512_sub_epi32(a, b); }
+  __forceinline vint16 operator -(const vint16& a, int           b) { return a - vint16(b); }
+  __forceinline vint16 operator -(int           a, const vint16& b) { return vint16(a) - b; }
+
+  __forceinline vint16 operator *(const vint16& a, const vint16& b) { return _mm512_mullo_epi32(a, b); } // keeps the low 32 bits of each product
+  __forceinline vint16 operator *(const vint16& a, int           b) { return a * vint16(b); }
+  __forceinline vint16 operator *(int           a, const vint16& b) { return vint16(a) * b; }
+
+  __forceinline vint16 operator &(const vint16& a, const vint16& b) { return _mm512_and_epi32(a, b); }
+  __forceinline vint16 operator &(const vint16& a, int           b) { return a & vint16(b); }
+  __forceinline vint16 operator &(int           a, const vint16& b) { return vint16(a) & b; }
+
+  __forceinline vint16 operator |(const vint16& a, const vint16& b) { return _mm512_or_epi32(a, b); }
+  __forceinline vint16 operator |(const vint16& a, int           b) { return a | vint16(b); }
+  __forceinline vint16 operator |(int           a, const vint16& b) { return vint16(a) | b; }
+
+  __forceinline vint16 operator ^(const vint16& a, const vint16& b) { return _mm512_xor_epi32(a, b); }
+  __forceinline vint16 operator ^(const vint16& a, int           b) { return a ^ vint16(b); }
+  __forceinline vint16 operator ^(int           a, const vint16& b) { return vint16(a) ^ b; }
+
+  __forceinline vint16 operator <<(const vint16& a, int n) { return _mm512_slli_epi32(a, n); }
+  __forceinline vint16 operator >>(const vint16& a, int n) { return _mm512_srai_epi32(a, n); } // arithmetic (sign-extending) shift
+
+  __forceinline vint16 operator <<(const vint16& a, const vint16& n) { return _mm512_sllv_epi32(a, n); } // per-lane shift counts
+  __forceinline vint16 operator >>(const vint16& a, const vint16& n) { return _mm512_srav_epi32(a, n); }
+
+  __forceinline vint16 sll (const vint16& a, int b) { return _mm512_slli_epi32(a, b); }
+  __forceinline vint16 sra (const vint16& a, int b) { return _mm512_srai_epi32(a, b); }
+  __forceinline vint16 srl (const vint16& a, int b) { return _mm512_srli_epi32(a, b); } // logical (zero-filling) right shift
+  
+  __forceinline vint16 min(const vint16& a, const vint16& b) { return _mm512_min_epi32(a, b); } // signed comparison
+  __forceinline vint16 min(const vint16& a, int           b) { return min(a,vint16(b)); }
+  __forceinline vint16 min(int           a, const vint16& b) { return min(vint16(a),b); }
+
+  __forceinline vint16 max(const vint16& a, const vint16& b) { return _mm512_max_epi32(a, b); }
+  __forceinline vint16 max(const vint16& a, int           b) { return max(a,vint16(b)); }
+  __forceinline vint16 max(int           a, const vint16& b) { return max(vint16(a),b); }
+  
+  __forceinline vint16 umin(const vint16& a, const vint16& b) { return _mm512_min_epu32(a, b); } // lanes compared as unsigned
+  __forceinline vint16 umax(const vint16& a, const vint16& b) { return _mm512_max_epu32(a, b); }
+
+  __forceinline vint16 mask_add(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } // a+b where mask is set, c elsewhere; c itself is not modified
+  __forceinline vint16 mask_sub(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); }
+
+  __forceinline vint16 mask_and(const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_and_epi32(c,m,a,b); }
+  __forceinline vint16 mask_or (const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_or_epi32(c,m,a,b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint16& operator +=(vint16& a, const vint16& b) { return a = a + b; } // each compound form is the binary operator followed by assignment
+  __forceinline vint16& operator +=(vint16& a, int           b) { return a = a + b; }
+  
+  __forceinline vint16& operator -=(vint16& a, const vint16& b) { return a = a - b; }
+  __forceinline vint16& operator -=(vint16& a, int           b) { return a = a - b; }
+
+  __forceinline vint16& operator *=(vint16& a, const vint16& b) { return a = a * b; }
+  __forceinline vint16& operator *=(vint16& a, int           b) { return a = a * b; }
+  
+  __forceinline vint16& operator &=(vint16& a, const vint16& b) { return a = a & b; }
+  __forceinline vint16& operator &=(vint16& a, int           b) { return a = a & b; }
+  
+  __forceinline vint16& operator |=(vint16& a, const vint16& b) { return a = a | b; }
+  __forceinline vint16& operator |=(vint16& a, int           b) { return a = a | b; }
+  
+  __forceinline vint16& operator <<=(vint16& a, int b) { return a = a << b; }
+  __forceinline vint16& operator >>=(vint16& a, int b) { return a = a >> b; } // arithmetic shift, as operator >>
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf16 operator ==(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } // comparisons return a 16-bit lane mask
+  __forceinline vboolf16 operator ==(const vint16& a, int           b) { return a == vint16(b); }
+  __forceinline vboolf16 operator ==(int           a, const vint16& b) { return vint16(a) == b; }
+  
+  __forceinline vboolf16 operator !=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 operator !=(const vint16& a, int           b) { return a != vint16(b); }
+  __forceinline vboolf16 operator !=(int           a, const vint16& b) { return vint16(a) != b; }
+  
+  __forceinline vboolf16 operator < (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } // ordered comparisons are signed
+  __forceinline vboolf16 operator < (const vint16& a, int           b) { return a <  vint16(b); }
+  __forceinline vboolf16 operator < (int           a, const vint16& b) { return vint16(a) <  b; }
+  
+  __forceinline vboolf16 operator >=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 operator >=(const vint16& a, int           b) { return a >= vint16(b); }
+  __forceinline vboolf16 operator >=(int           a, const vint16& b) { return vint16(a) >= b; }
+
+  __forceinline vboolf16 operator > (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 operator > (const vint16& a, int           b) { return a >  vint16(b); }
+  __forceinline vboolf16 operator > (int           a, const vint16& b) { return vint16(a) >  b; }
+
+  __forceinline vboolf16 operator <=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 operator <=(const vint16& a, int           b) { return a <= vint16(b); }
+  __forceinline vboolf16 operator <=(int           a, const vint16& b) { return vint16(a) <= b; }
+
+  __forceinline vboolf16 eq(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } // named forms of the operators above
+  __forceinline vboolf16 ne(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 lt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 ge(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 gt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 le(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 uint_le(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } // lanes compared as unsigned
+  __forceinline vboolf16 uint_gt(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); }
+
+  __forceinline vboolf16 eq(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_EQ); } // masked forms: result bits are zero wherever mask is clear
+  __forceinline vboolf16 ne(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 lt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 ge(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 uint_le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 uint_gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); }
+    
+ 
+  __forceinline vint16 select(const vboolf16& m, const vint16& t, const vint16& f) { // lane k = m[k] ? t[k] : f[k]
+    return _mm512_mask_or_epi32(f,m,t,t); // masked OR of t with itself yields t where m is set and passes f through elsewhere
+  }
+
+  __forceinline void xchg(const vboolf16& m, vint16& a, vint16& b) { /* swaps a and b in the lanes selected by m */
+    const vint16 old_a = a; a = select(m,b,old_a); b = select(m,old_a,b);
+  }
+
+  __forceinline vboolf16 test(const vboolf16& m, const vint16& a, const vint16& b) { // as below, but result bits are zero wherever m is clear
+    return _mm512_mask_test_epi32_mask(m,a,b);
+  }
+
+  __forceinline vboolf16 test(const vint16& a, const vint16& b) { // mask bit k set when (a[k] & b[k]) != 0
+    return _mm512_test_epi32_mask(a,b);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint16 unpacklo(const vint16& a, const vint16& b) { return _mm512_unpacklo_epi32(a, b); } // interleaves the low two elements of a and b within each 128-bit block
+  __forceinline vint16 unpackhi(const vint16& a, const vint16& b) { return _mm512_unpackhi_epi32(a, b); } // same for the high two elements
+
+  template<int i>
+    __forceinline vint16 shuffle(const vint16& v) { // element i of each 128-bit block replicated across that block
+    return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint16 shuffle(const vint16& v) { // within each 128-bit block, output element k = input element ik
+    return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i>
+  __forceinline vint16 shuffle4(const vint16& v) { // 128-bit block i replicated into all four blocks
+    return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint16 shuffle4(const vint16& v) { // output block k = input block ik
+    return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i> // i: number of 32-bit elements to shift by
+  __forceinline vint16 align_shift_right(const vint16& a, const vint16& b) { // a:b is treated as 32 elements with b in the low half
+    return _mm512_alignr_epi32(a, b, i); // shift right by i elements, keep the low 16
+  }
+
+  __forceinline int toScalar(const vint16& v) { // lane 0 as a scalar
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(v));
+  }
+
+  template<int i> __forceinline vint16 insert4(const vint16& a, const vint4& b) { return _mm512_inserti32x4(a, b, i); } // copy of a with 128-bit block i replaced by b
+
+  __forceinline size_t extract64bit(const vint16& v) { // low 64 bits (lanes 0 and 1) as one integer
+    return _mm_cvtsi128_si64(_mm512_castsi512_si128(v));
+  }
+
+  template<int N, int i> // N: width of the extracted vector, i: which N-wide block
+  vint<N> extractN(const vint16& v); // declared only; the specializations below supply the definitions
+
+  template<> __forceinline vint4 extractN<4,0>(const vint16& v) { return _mm512_castsi512_si128(v);       } // block 0 is a plain cast
+  template<> __forceinline vint4 extractN<4,1>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 1); }
+  template<> __forceinline vint4 extractN<4,2>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 2); }
+  template<> __forceinline vint4 extractN<4,3>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 3); }
+
+  template<> __forceinline vint8 extractN<8,0>(const vint16& v) { return _mm512_castsi512_si256(v);       }
+  template<> __forceinline vint8 extractN<8,1>(const vint16& v) { return _mm512_extracti32x8_epi32(v, 1); } // NOTE(review): extracti32x8 is an AVX512DQ intrinsic - confirm the build guards cover it
+
+  template<int i> __forceinline vint4 extract4   (const vint16& v) { return _mm512_extracti32x4_epi32(v, i); } // 128-bit block i
+  template<>      __forceinline vint4 extract4<0>(const vint16& v) { return _mm512_castsi512_si128(v);       }
+
+  template<int i> __forceinline vint8 extract8   (const vint16& v) { return _mm512_extracti32x8_epi32(v, i); } // 256-bit half i
+  template<>      __forceinline vint8 extract8<0>(const vint16& v) { return _mm512_castsi512_si256(v);       }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint16 vreduce_min2(vint16 x) {                      return min(x, shuffle<1,0,3,2>(x)); } // each adjacent pair holds its minimum
+  __forceinline vint16 vreduce_min4(vint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } // each 128-bit block holds its minimum
+  __forceinline vint16 vreduce_min8(vint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } // each 256-bit half holds its minimum
+  __forceinline vint16 vreduce_min (vint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } // every lane holds the overall minimum
+
+  __forceinline vint16 vreduce_max2(vint16 x) {                      return max(x, shuffle<1,0,3,2>(x)); } // max, and, or, add: same four steps with a different operator
+  __forceinline vint16 vreduce_max4(vint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vint16 vreduce_max8(vint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); }
+  __forceinline vint16 vreduce_max (vint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); }
+
+  __forceinline vint16 vreduce_and2(vint16 x) {                      return x & shuffle<1,0,3,2>(x); }
+  __forceinline vint16 vreduce_and4(vint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); }
+  __forceinline vint16 vreduce_and8(vint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); }
+  __forceinline vint16 vreduce_and (vint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); }
+
+  __forceinline vint16 vreduce_or2(vint16 x) {                     return x | shuffle<1,0,3,2>(x); }
+  __forceinline vint16 vreduce_or4(vint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); }
+  __forceinline vint16 vreduce_or8(vint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); }
+  __forceinline vint16 vreduce_or (vint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); }
+
+  __forceinline vint16 vreduce_add2(vint16 x) {                      return x + shuffle<1,0,3,2>(x); }
+  __forceinline vint16 vreduce_add4(vint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }
+  __forceinline vint16 vreduce_add8(vint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); }
+  __forceinline vint16 vreduce_add (vint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); }
+  
+  __forceinline int reduce_min(const vint16& v) { return toScalar(vreduce_min(v)); } // scalar forms read lane 0 of the vector result
+  __forceinline int reduce_max(const vint16& v) { return toScalar(vreduce_max(v)); }
+  __forceinline int reduce_and(const vint16& v) { return toScalar(vreduce_and(v)); }
+  __forceinline int reduce_or (const vint16& v) { return toScalar(vreduce_or (v)); }
+  __forceinline int reduce_add(const vint16& v) { return toScalar(vreduce_add(v)); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Memory load and store operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint16 conflict(const vint16& index)
+  { // lane k = bitmask of the lower-numbered lanes holding the same value as index[k]
+    return _mm512_conflict_epi32(index);
+  }
+
+  __forceinline vint16 conflict(const vboolf16& mask, vint16& dest, const vint16& index)
+  { // masked form: lanes with a clear mask bit are taken from dest
+    return _mm512_mask_conflict_epi32(dest,mask,index);
+  }    
+
+  __forceinline vint16 convert_uint32_t(const __m512& f) { // numeric float -> unsigned 32-bit conversion
+    return _mm512_cvtps_epu32(f);
+  }
+
+  __forceinline vint16 permute(vint16 v, vint16 index) { // result[k] = v[index[k] & 15]
+    return _mm512_permutexvar_epi32(index,v);  
+  }
+
+  __forceinline vint16 reverse(const vint16 &a) { // lane order reversed: permute with indices 15,14,...,0
+    return permute(a,vint16(reverse_step));
+  }
+
+  __forceinline vint16 prefix_sum(const vint16& a)
+  { /* inclusive scan: lane k = a[0] + ... + a[k], built in four doubling steps */
+    const vint16 zeros(zero);
+    vint16 sum = a;
+    sum += align_shift_right<15>(sum,zeros); /* adds the lane 1 below, zero shifted in */
+    sum += align_shift_right<14>(sum,zeros); /* 2 below */
+    sum += align_shift_right<12>(sum,zeros); /* 4 below */
+    sum += align_shift_right<8>(sum,zeros);  /* 8 below */
+    return sum;
+  }
+
+  __forceinline vint16 reverse_prefix_sum(const vint16& a)
+  { /* inclusive suffix scan: lane k = a[k] + ... + a[15], built in four doubling steps */
+    const vint16 zeros(zero);
+    vint16 sum = a;
+    sum += align_shift_right<1>(zeros,sum); /* adds the lane 1 above, zero shifted in */
+    sum += align_shift_right<2>(zeros,sum); /* 2 above */
+    sum += align_shift_right<4>(zeros,sum); /* 4 above */
+    sum += align_shift_right<8>(zeros,sum); /* 8 above */
+    return sum;
+  }
+
+  /* this should use a vbool8 and a vint8_64...*/
+  template<int scale = 1, int hint = _MM_HINT_T0> // scale: byte multiplier for the offsets; hint: prefetch locality hint
+    __forceinline void gather_prefetch64(const void* base_addr, const vbool16& mask, const vint16& offset)
+  { // the body is empty unless __AVX512PF__ is defined
+#if defined(__AVX512PF__)
+    _mm512_mask_prefetch_i64gather_pd(offset, mask, base_addr, scale, hint); // offset is consumed as 64-bit indices by this intrinsic
+#endif
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vint16& v)
+  { /* prints the 16 lanes as "<l0, l1, ..., l15>" */
+    cout << "<" << v[0];
+    for (size_t lane=1; lane<16; ++lane) { cout << ", "; cout << v[lane]; }
+    cout << ">";
+    return cout;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vint4_sse2.h b/thirdparty/embree-aarch64/common/simd/vint4_sse2.h
new file mode 100644
index 0000000000..96f105a7c5
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vint4_sse2.h
@@ -0,0 +1,681 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../math/math.h"
+
+namespace embree
+{
+  /* 4-wide SSE integer type */
+  template<>
+  struct vint<4>
+  {
+    ALIGNED_STRUCT_(16);
+    
+    typedef vboolf4 Bool;
+    typedef vint4   Int;
+    typedef vfloat4 Float;
+
+    enum  { size = 4 };             // number of SIMD elements
+    union { __m128i v; int i[4]; }; // data
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint() {} // leaves v uninitialized
+    __forceinline vint(const vint4& a) { v = a.v; }
+    __forceinline vint4& operator =(const vint4& a) { v = a.v; return *this; }
+
+    __forceinline vint(__m128i a) : v(a) {} // implicit wrap of a raw register
+    __forceinline operator const __m128i&() const { return v; }
+    __forceinline operator       __m128i&()       { return v; }
+
+    __forceinline vint(int a) : v(_mm_set1_epi32(a)) {} // broadcast a to all 4 lanes
+    __forceinline vint(int a, int b, int c, int d) : v(_mm_set_epi32(d, c, b, a)) {} // a lands in lane 0; _mm_set_epi32 takes the highest lane first
+
+    __forceinline explicit vint(__m128 a) : v(_mm_cvtps_epi32(a)) {} // numeric float->int conversion, not a bit cast
+#if defined(__AVX512VL__)
+    __forceinline explicit vint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} // expands the mask register into per-lane values
+#else
+    __forceinline explicit vint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {} // reinterprets the float-register mask bits unchanged
+#endif
+
+    __forceinline vint(long long a, long long b) : v(_mm_set_epi64x(b,a)) {} // two 64-bit values; a occupies the low half
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint(ZeroTy)        : v(_mm_setzero_si128()) {}                // every lane 0
+    __forceinline vint(OneTy)         : v(_mm_set1_epi32(1)) {}                  // every lane 1
+    __forceinline vint(PosInfTy)      : v(_mm_set1_epi32(pos_inf)) {}            // every lane = int value of pos_inf
+    __forceinline vint(NegInfTy)      : v(_mm_set1_epi32(neg_inf)) {}            // every lane = int value of neg_inf
+    __forceinline vint(StepTy)        : v(_mm_set_epi32(3, 2, 1, 0)) {}          // lanes 0..3 hold 0,1,2,3
+    __forceinline vint(ReverseStepTy) : v(_mm_set_epi32(0, 1, 2, 3)) {}          // lanes 0..3 hold 3,2,1,0
+
+    __forceinline vint(TrueTy)   : v(_mm_set1_epi32(-1)) {} // all bits set in every lane; built from a constant rather than by comparing the still-uninitialized member with itself
+    __forceinline vint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {} // contents deliberately left unspecified
+
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vint4 load (const void* a) { return _mm_load_si128((__m128i*)a); } // aligned 16-byte load
+    static __forceinline vint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); } // unaligned 16-byte load
+
+    static __forceinline void store (void* ptr, const vint4& v) { _mm_store_si128((__m128i*)ptr,v); } // aligned 16-byte store
+    static __forceinline void storeu(void* ptr, const vint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } // unaligned 16-byte store
+
+#if defined(__AVX512VL__) // masked memory access: native mask-register forms
+
+    static __forceinline vint4 compact(const vboolf4& mask, vint4 &v) { // compress the mask-selected lanes of v; v is also the intrinsic's source operand
+      return _mm_mask_compress_epi32(v, mask, v);
+    }
+    static __forceinline vint4 compact(const vboolf4& mask, vint4 &a, const vint4& b) { // compress the mask-selected lanes of b; a is the intrinsic's source operand
+      return _mm_mask_compress_epi32(a, mask, b);
+    }
+
+    static __forceinline vint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); } // zero vector is the source for masked-off lanes
+    static __forceinline vint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_store_epi32 (ptr,mask,v); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); }
+#elif defined(__AVX__) // AVX: float mask-load/store intrinsics on reinterpreted data; load/loadu and store/storeu are identical here
+    static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+    static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+#else // plain SSE: emulated with full-width accesses
+    static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); } // reads all 16 bytes, then ANDs with the mask
+    static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { store (ptr,select(mask,i,load (ptr))); } // read-modify-write: loads and rewrites all 16 bytes
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); }
+#endif
+
+
+#if defined(__aarch64__) // load/loadu(const uint8_t*): turn 4 consecutive bytes into 4 int lanes
+    static __forceinline vint4 load(const uint8_t* ptr) {
+        return _mm_load4epu8_epi32(((__m128i*)ptr));
+    }
+    static __forceinline vint4 loadu(const uint8_t* ptr) { // same body as load
+        return  _mm_load4epu8_epi32(((__m128i*)ptr));
+    }
+#elif defined(__SSE4_1__)
+    static __forceinline vint4 load(const uint8_t* ptr) { // NOTE(review): _mm_loadl_epi64 reads 8 bytes although only 4 are widened -- presumably callers guarantee readable padding; confirm
+      return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
+    }
+
+    static __forceinline vint4 loadu(const uint8_t* ptr) { // same body as load
+      return  _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
+    }
+#else
+
+    static __forceinline vint4 load(const uint8_t* ptr) { // scalar fallback: reads exactly 4 bytes
+      return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+
+    static __forceinline vint4 loadu(const uint8_t* ptr) { // same body as load
+      return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+
+#endif
+
+    static __forceinline vint4 load(const unsigned short* ptr) { // widens 4 consecutive unsigned 16-bit values to 4 int lanes
+#if defined(__aarch64__)
+      return __m128i(vmovl_u16(vld1_u16(ptr)));
+#elif defined (__SSE4_1__)
+      return _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i*)ptr)); // only the low 64 bits are converted, so load just those 8 bytes instead of 16
+#else
+      return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); // scalar fallback
+#endif
+    }
+
+    static __forceinline void store(uint8_t* ptr, const vint4& v) { // narrows the 4 lanes to bytes and writes 4 bytes at ptr
+#if defined(__aarch64__)
+        int32x4_t x = v;
+        uint16x4_t y = vqmovn_u32(uint32x4_t(x)); // saturating narrow 32 -> 16, lanes treated as unsigned
+        uint8x8_t z = vqmovn_u16(vcombine_u16(y, y)); // saturating narrow 16 -> 8
+        vst1_lane_u32((uint32_t *)ptr,uint32x2_t(z), 0); // write the low 4 bytes in one 32-bit store
+#elif defined(__SSE4_1__)
+      __m128i x = v;
+      x = _mm_packus_epi32(x, x); // pack 32 -> 16 with unsigned saturation
+      x = _mm_packus_epi16(x, x); // pack 16 -> 8 with unsigned saturation
+      *(int*)ptr = _mm_cvtsi128_si32(x); // write the low 4 bytes in one 32-bit store
+#else
+      for (size_t i=0;i<4;i++)
+        ptr[i] = (uint8_t)v[i]; // NOTE(review): plain cast truncates, whereas the SIMD paths above saturate -- out-of-range lanes differ between builds
+#endif
+    }
+
+    static __forceinline void store(unsigned short* ptr, const vint4& v) { // narrows the 4 lanes to 16 bits and writes 4 values at ptr
+#if defined(__aarch64__)
+      uint32x4_t x = uint32x4_t(v.v);
+      uint16x4_t y = vqmovn_u32(x); // saturating narrow, lanes treated as unsigned
+      vst1_u16(ptr, y);
+#else
+      for (size_t i=0;i<4;i++)
+        ptr[i] = (unsigned short)v[i]; // NOTE(review): plain cast truncates, whereas the aarch64 path saturates
+#endif
+    }
+
+    static __forceinline vint4 load_nt(void* ptr) { // streaming load where the intrinsic is available, otherwise an ordinary aligned load
+#if defined(__aarch64__) || defined(__SSE4_1__)
+      return _mm_stream_load_si128((__m128i*)ptr);
+#else
+      return _mm_load_si128((__m128i*)ptr);
+#endif
+    }
+
+    static __forceinline void store_nt(void* ptr, const vint4& v) { // streaming store on x86 with SSE4.1; every other build (including aarch64) uses an ordinary aligned store
+#if !defined(__aarch64__) && defined(__SSE4_1__)
+      _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); // float streaming store on the reinterpreted bits
+#else
+      _mm_store_si128((__m128i*)ptr,v);
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline vint4 gather(const int* ptr, const vint4& index) { // lane k = int read at byte address ptr + scale*index[k]
+#if defined(__AVX2__) && !defined(__aarch64__)
+      return _mm_i32gather_epi32(ptr, index, scale);
+#else
+      return vint4( // scalar emulation: four independent reads through a byte pointer
+          *(int*)(((int8_t*)ptr)+scale*index[0]),
+          *(int*)(((int8_t*)ptr)+scale*index[1]),
+          *(int*)(((int8_t*)ptr)+scale*index[2]),
+          *(int*)(((int8_t*)ptr)+scale*index[3]));
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline vint4 gather(const vboolf4& mask, const int* ptr, const vint4& index) { // masked gather; lanes whose mask is off stay 0
+      vint4 r = zero; // result for masked-off lanes
+#if defined(__AVX512VL__)
+      return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
+#elif defined(__AVX2__) && !defined(__aarch64__)
+      return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale);
+#else
+      if (likely(mask[0])) r[0] = *(int*)(((int8_t*)ptr)+scale*index[0]); // scalar emulation: memory is touched only for active lanes
+      if (likely(mask[1])) r[1] = *(int*)(((int8_t*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(int*)(((int8_t*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(int*)(((int8_t*)ptr)+scale*index[3]);
+      return r;
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint4& index, const vint4& v) // writes v[k] to byte address ptr + scale*index[k] for each lane k
+    {
+#if defined(__AVX512VL__)
+      _mm_i32scatter_epi32((int*)ptr, index, v, scale);
+#else
+      *(int*)(((int8_t*)ptr)+scale*index[0]) = v[0]; // scalar emulation, lanes written in order 0..3
+      *(int*)(((int8_t*)ptr)+scale*index[1]) = v[1];
+      *(int*)(((int8_t*)ptr)+scale*index[2]) = v[2];
+      *(int*)(((int8_t*)ptr)+scale*index[3]) = v[3];
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vint4& v) // masked scatter: only lanes with mask set are written
+    {
+#if defined(__AVX512VL__)
+      _mm_mask_i32scatter_epi32((int*)ptr, mask, index, v, scale);
+#else
+      if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*index[0]) = v[0]; // scalar emulation, lanes written in order 0..3
+      if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*index[1]) = v[1];
+      if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*index[2]) = v[2];
+      if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*index[3]) = v[3];
+#endif
+    }
+
+#if defined(__x86_64__) || defined(__aarch64__) // only provided on 64-bit targets
+    static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); } // replicates the 64-bit value into both halves of the register
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const int& operator [](size_t index) const { assert(index < 4); return i[index]; } // lane access through the int[4] view of the union; range checked only by assert
+    __forceinline       int& operator [](size_t index)       { assert(index < 4); return i[index]; }
+
+    friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) { // per-lane choice: t where m is set, f elsewhere
+#if defined(__AVX512VL__)
+      return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t);
+#elif defined(__aarch64__)
+      return _mm_castps_si128(_mm_blendv_ps((__m128)f.v,(__m128) t.v, (__m128)m.v));
+#elif defined(__SSE4_1__)
+      return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
+#else
+      return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); // bitwise (m & t) | (~m & f)
+#endif
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__) // asBool: turn an integer vector into a vboolf4 mask
+  __forceinline vboolf4 asBool(const vint4& a) { return _mm_movepi32_mask(a); } // builds a mask register from the lanes of a
+#else
+  __forceinline vboolf4 asBool(const vint4& a) { return _mm_castsi128_ps(a); } // reinterprets the bits unchanged
+#endif
+
+  __forceinline vint4 operator +(const vint4& a) { return a; }
+  __forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } // 0 - a per lane
+#if defined(__aarch64__) // abs has no definition here when neither aarch64 nor SSSE3 is available
+  __forceinline vint4 abs(const vint4& a) { return vabsq_s32(a.v); }
+#elif defined(__SSSE3__)
+  __forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint4 operator +(const vint4& a, const vint4& b) { return _mm_add_epi32(a, b); } // lane-wise 32-bit add
+  __forceinline vint4 operator +(const vint4& a, int          b) { return a + vint4(b); } // scalar overloads broadcast the int first
+  __forceinline vint4 operator +(int          a, const vint4& b) { return vint4(a) + b; }
+
+  __forceinline vint4 operator -(const vint4& a, const vint4& b) { return _mm_sub_epi32(a, b); }
+  __forceinline vint4 operator -(const vint4& a, int          b) { return a - vint4(b); }
+  __forceinline vint4 operator -(int          a, const vint4& b) { return vint4(a) - b; }
+
+#if (defined(__aarch64__)) || defined(__SSE4_1__)
+  __forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); } // low 32 bits of each lane product
+#else
+  __forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } // scalar fallback, one multiply per lane
+#endif
+  __forceinline vint4 operator *(const vint4& a, int          b) { return a * vint4(b); }
+  __forceinline vint4 operator *(int          a, const vint4& b) { return vint4(a) * b; }
+
+  __forceinline vint4 operator &(const vint4& a, const vint4& b) { return _mm_and_si128(a, b); }
+  __forceinline vint4 operator &(const vint4& a, int          b) { return a & vint4(b); }
+  __forceinline vint4 operator &(int          a, const vint4& b) { return vint4(a) & b; }
+
+  __forceinline vint4 operator |(const vint4& a, const vint4& b) { return _mm_or_si128(a, b); }
+  __forceinline vint4 operator |(const vint4& a, int          b) { return a | vint4(b); }
+  __forceinline vint4 operator |(int          a, const vint4& b) { return vint4(a) | b; }
+
+  __forceinline vint4 operator ^(const vint4& a, const vint4& b) { return _mm_xor_si128(a, b); }
+  __forceinline vint4 operator ^(const vint4& a, int          b) { return a ^ vint4(b); }
+  __forceinline vint4 operator ^(int          a, const vint4& b) { return vint4(a) ^ b; }
+
+  __forceinline vint4 operator <<(const vint4& a, const int n) { return _mm_slli_epi32(a, n); } // every lane shifted left by the same count
+  __forceinline vint4 operator >>(const vint4& a, const int n) { return _mm_srai_epi32(a, n); } // arithmetic (sign-preserving) right shift
+
+  __forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); } // shift left logical
+  __forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); } // shift right arithmetic
+  __forceinline vint4 srl (const vint4& a, int b) { return _mm_srli_epi32(a, b); } // shift right logical (zero fill)
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint4& operator +=(vint4& a, const vint4& b) { return a = a + b; } // compound forms are defined in terms of the binary operators
+  __forceinline vint4& operator +=(vint4& a, int          b) { return a = a + b; }
+
+  __forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; }
+  __forceinline vint4& operator -=(vint4& a, int          b) { return a = a - b; }
+
+  // operator* is defined for every build configuration (intrinsic or scalar fallback), so the compound form needs no ISA guard
+  __forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; } // lane-wise multiply-assign
+  __forceinline vint4& operator *=(vint4& a, int          b) { return a = a * b; } // multiply every lane by the scalar b
+
+
+  __forceinline vint4& operator &=(vint4& a, const vint4& b) { return a = a & b; } // compound forms are defined in terms of the binary operators
+  __forceinline vint4& operator &=(vint4& a, int          b) { return a = a & b; }
+
+  __forceinline vint4& operator |=(vint4& a, const vint4& b) { return a = a | b; }
+  __forceinline vint4& operator |=(vint4& a, int          b) { return a = a | b; }
+
+  __forceinline vint4& operator <<=(vint4& a, int b) { return a = a << b; }
+  __forceinline vint4& operator >>=(vint4& a, int b) { return a = a >> b; } // arithmetic shift, as operator>> uses _mm_srai_epi32
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__) // comparisons produce a mask register directly
+  __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
+#else // SSE has only ==, < and > as intrinsics; the other three are their negations
+  __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); }
+  __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return !(a == b); }
+  __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmplt_epi32(a, b)); } // signed compare
+  __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return !(a <  b); }
+  __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epi32(a, b)); } // signed compare
+  __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return !(a >  b); }
+#endif
+
+  __forceinline vboolf4 operator ==(const vint4& a, int          b) { return a == vint4(b); } // scalar overloads broadcast the int and defer to the vector form
+  __forceinline vboolf4 operator ==(int          a, const vint4& b) { return vint4(a) == b; }
+
+  __forceinline vboolf4 operator !=(const vint4& a, int          b) { return a != vint4(b); }
+  __forceinline vboolf4 operator !=(int          a, const vint4& b) { return vint4(a) != b; }
+
+  __forceinline vboolf4 operator < (const vint4& a, int          b) { return a <  vint4(b); }
+  __forceinline vboolf4 operator < (int          a, const vint4& b) { return vint4(a) <  b; }
+
+  __forceinline vboolf4 operator >=(const vint4& a, int          b) { return a >= vint4(b); }
+  __forceinline vboolf4 operator >=(int          a, const vint4& b) { return vint4(a) >= b; }
+
+  __forceinline vboolf4 operator > (const vint4& a, int          b) { return a >  vint4(b); }
+  __forceinline vboolf4 operator > (int          a, const vint4& b) { return vint4(a) >  b; }
+
+  __forceinline vboolf4 operator <=(const vint4& a, int          b) { return a <= vint4(b); }
+  __forceinline vboolf4 operator <=(int          a, const vint4& b) { return vint4(a) <= b; }
+
+  __forceinline vboolf4 eq(const vint4& a, const vint4& b) { return a == b; } // named aliases of the comparison operators
+  __forceinline vboolf4 ne(const vint4& a, const vint4& b) { return a != b; }
+  __forceinline vboolf4 lt(const vint4& a, const vint4& b) { return a <  b; }
+  __forceinline vboolf4 ge(const vint4& a, const vint4& b) { return a >= b; }
+  __forceinline vboolf4 gt(const vint4& a, const vint4& b) { return a >  b; }
+  __forceinline vboolf4 le(const vint4& a, const vint4& b) { return a <= b; }
+
+#if defined(__AVX512VL__) // masked comparisons: native masked-compare intrinsics
+  __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); }
+  __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); }
+  __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); }
+  __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); }
+  __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); }
+#else // otherwise: full comparison ANDed with the incoming mask
+  __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a == b); }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a != b); }
+  __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a <  b); }
+  __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a >= b); }
+  __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a >  b); }
+  __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a <= b); }
+#endif
+
+  template<int mask>
+  __forceinline vint4 select(const vint4& t, const vint4& f) { // compile-time select: bit k of mask picks t (set) or f (clear) for lane k
+#if defined(__SSE4_1__)
+    return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask));
+#else
+    return select(vboolf4(mask), t, f); // defers to the runtime select via vboolf4's int constructor
+#endif
+  }
+
+      
+#if defined(__aarch64__) || defined(__SSE4_1__)
+  __forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); } // signed lane-wise min/max
+  __forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); }
+
+  __forceinline vint4 umin(const vint4& a, const vint4& b) { return _mm_min_epu32(a, b); } // lanes compared as unsigned
+  __forceinline vint4 umax(const vint4& a, const vint4& b) { return _mm_max_epu32(a, b); }
+
+#else // fallback built from compare + select; umin/umax are not defined in this configuration
+  __forceinline vint4 min(const vint4& a, const vint4& b) { return select(a < b,a,b); }
+  __forceinline vint4 max(const vint4& a, const vint4& b) { return select(a < b,b,a); }
+#endif
+
+  __forceinline vint4 min(const vint4& a, int          b) { return min(a,vint4(b)); } // scalar overloads broadcast the int
+  __forceinline vint4 min(int          a, const vint4& b) { return min(vint4(a),b); }
+  __forceinline vint4 max(const vint4& a, int          b) { return max(a,vint4(b)); }
+  __forceinline vint4 max(int          a, const vint4& b) { return max(vint4(a),b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } // interleaves the low two lanes of a and b, using the float unpack on reinterpreted bits
+  __forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } // interleaves the high two lanes of a and b
+
+#if defined(__aarch64__) // NEON: shuffles are byte table lookups driven by the _MN_SHUFFLE/_MF_SHUFFLE macros (defined outside this view)
+    template<int i0, int i1, int i2, int i3>
+    __forceinline vint4 shuffle(const vint4& v) {
+        return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
+    }
+    template<int i0, int i1, int i2, int i3>
+    __forceinline vint4 shuffle(const vint4& a, const vint4& b) {
+        return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
+    }
+#else // x86: template arguments are in lane order, so they are reversed for _MM_SHUFFLE
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint4 shuffle(const vint4& v) {
+    return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint4 shuffle(const vint4& a, const vint4& b) { // two-source form goes through the float shuffle on reinterpreted bits
+    return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+#endif
+#if defined(__SSE3__) // specializations that map these three patterns onto dedicated duplicate instructions
+  template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
+  template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
+  template<> __forceinline vint4 shuffle<0, 1, 0, 1>(const vint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); }
+#endif
+
+  template<int i>
+  __forceinline vint4 shuffle(const vint4& v) { // single-index form: uses lane i for all four positions
+    return shuffle<i,i,i,i>(v);
+  }
+
+#if defined(__aarch64__) // aarch64: declarations only; the specializations for lanes 0..3 follow below
+    template<int src> __forceinline int extract(const vint4& b);
+    template<int dst> __forceinline vint4 insert(const vint4& a, const int b);
+#elif defined(__SSE4_1__)
+  template<int src> __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); } // read lane src
+  template<int dst> __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); } // copy of a with lane dst replaced by b
+#else
+  template<int src> __forceinline int extract(const vint4& b) { return b[src&3]; } // index wrapped into 0..3
+  template<int dst> __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; }
+#endif
+
+#if defined(__aarch64__) // explicit specializations for each lane, plus toScalar/toSizeT
+    template<> __forceinline int extract<0>(const vint4& b) {
+        return b.v[0];
+    }
+    template<> __forceinline int extract<1>(const vint4& b) {
+        return b.v[1];
+    }
+    template<> __forceinline int extract<2>(const vint4& b) {
+        return b.v[2];
+    }
+    template<> __forceinline int extract<3>(const vint4& b) {
+        return b.v[3];
+    }
+    template<> __forceinline vint4 insert<0>(const vint4& a, int b) // each insert<N> returns a copy of a with lane N set to b
+    {
+        vint4 c = a;
+        c[0] = b;
+        return c;
+    }
+    template<> __forceinline vint4 insert<1>(const vint4& a, int b)
+    {
+        vint4 c = a;
+        c[1] = b;
+        return c;
+    }
+    template<> __forceinline vint4 insert<2>(const vint4& a, int b)
+    {
+        vint4 c = a;
+        c[2] = b;
+        return c;
+    }
+    template<> __forceinline vint4 insert<3>(const vint4& a, int b)
+    {
+        vint4 c = a;
+        c[3] = b;
+        return c;
+    }
+      
+    __forceinline int toScalar(const vint4& v) { // lane 0
+        return v[0];
+    }
+      
+    __forceinline size_t toSizeT(const vint4& v) { // low 64 bits of the register (lanes 0 and 1 viewed as one value)
+        uint64x2_t x = uint64x2_t(v.v);
+        return x[0];
+    }
+#else
+  template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); } // lane 0 has a dedicated move instruction
+
+  __forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); } // lane 0
+
+  __forceinline size_t toSizeT(const vint4& v) { // low 64 bits of the register, except on 32-bit Windows where only lane 0 is returned
+#if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround
+    return toScalar(v);
+#elif defined(__ARM_NEON)
+    // FIXME(LTE): Do we need a swap(i.e. use lane 1)?
+    return vgetq_lane_u64(*(reinterpret_cast<const uint64x2_t *>(&v)), 0);
+#else
+    return _mm_cvtsi128_si64(v);
+#endif
+  }
+#endif
+      
+#if defined(__AVX512VL__) // these two helpers exist only in AVX512VL builds
+
+  __forceinline vint4 permute(const vint4 &a, const vint4 &index) { // runtime lane permutation of a selected by index, via the float permute on reinterpreted bits
+    return  _mm_castps_si128(_mm_permutevar_ps(_mm_castsi128_ps(a),index));
+  }
+
+  template<int i>
+  __forceinline vint4 align_shift_right(const vint4& a, const vint4& b) { // thin wrapper: forwards a, b and the compile-time count i to the alignr intrinsic
+    return _mm_alignr_epi32(a, b, i);
+  }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__) || defined(__SSE4_1__) // vector reductions; vreduce_* and select_* exist only in this branch
+      
+#if defined(__aarch64__)
+    __forceinline vint4 vreduce_min(const vint4& v) { int h = vminvq_s32(v); return vdupq_n_s32(h); } // horizontal reduction, result broadcast to all lanes
+    __forceinline vint4 vreduce_max(const vint4& v) { int h = vmaxvq_s32(v); return vdupq_n_s32(h); }
+    __forceinline vint4 vreduce_add(const vint4& v) { int h = vaddvq_s32(v); return vdupq_n_s32(h); }
+      
+    __forceinline int reduce_min(const vint4& v) { return vminvq_s32(v); } // scalar result
+    __forceinline int reduce_max(const vint4& v) { return vmaxvq_s32(v); }
+    __forceinline int reduce_add(const vint4& v) { return vaddvq_s32(v); }
+#else
+  __forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } // two shuffle+combine steps leave the result in every lane
+  __forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
+  __forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<1,0,3,2>(v)   + v ; return shuffle<2,3,0,1>(h)   + h ; }
+
+  __forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); } // lane 0 of the broadcast result
+  __forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); }
+  __forceinline int reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); }
+#endif
+      
+  __forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); } // index of the lowest lane holding the minimum
+  __forceinline size_t select_max(const vint4& v) { return bsf(movemask(v == vreduce_max(v))); } // index of the lowest lane holding the maximum
+
+  __forceinline size_t select_min(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } // invalid lanes are replaced by pos_inf before reducing
+  __forceinline size_t select_max(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } // invalid lanes are replaced by neg_inf before reducing
+
+#else // scalar fallback: only the int-returning reductions are provided
+
+  __forceinline int reduce_min(const vint4& v) { return min(v[0],v[1],v[2],v[3]); }
+  __forceinline int reduce_max(const vint4& v) { return max(v[0],v[1],v[2],v[3]); }
+  __forceinline int reduce_add(const vint4& v) { return v[0]+v[1]+v[2]+v[3]; }
+
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Sorting networks
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if (defined(__aarch64__)) || defined(__SSE4_1__) // 4-lane sorting network on unsigned values using native unsigned min/max
+
+  __forceinline vint4 usort_ascending(const vint4& v) // returns the lanes of v sorted ascending, compared as unsigned
+  {
+    const vint4 a0 = v;
+    const vint4 b0 = shuffle<1,0,3,2>(a0);               // stage 1: compare-exchange neighbours (0,1) and (2,3)
+    const vint4 c0 = umin(a0,b0);
+    const vint4 d0 = umax(a0,b0);
+    const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0);    // min into lanes 0,2; max into lanes 1,3
+    const vint4 b1 = shuffle<2,3,0,1>(a1);               // stage 2: compare-exchange pairs (0,2) and (1,3)
+    const vint4 c1 = umin(a1,b1);
+    const vint4 d1 = umax(a1,b1);
+    const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1);    // min into lanes 0,1; max into lanes 2,3
+    const vint4 b2 = shuffle<0,2,1,3>(a2);               // stage 3: compare-exchange the middle pair (1,2)
+    const vint4 c2 = umin(a2,b2);
+    const vint4 d2 = umax(a2,b2);
+    const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2);    // min into lane 1; lanes 0 and 3 are unchanged by max with themselves
+    return a3;
+  }
+
+  __forceinline vint4 usort_descending(const vint4& v) // same network with umin/umax swapped, giving descending order
+  {
+    const vint4 a0 = v;
+    const vint4 b0 = shuffle<1,0,3,2>(a0);
+    const vint4 c0 = umax(a0,b0);
+    const vint4 d0 = umin(a0,b0);
+    const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
+    const vint4 b1 = shuffle<2,3,0,1>(a1);
+    const vint4 c1 = umax(a1,b1);
+    const vint4 d1 = umin(a1,b1);
+    const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
+    const vint4 b2 = shuffle<0,2,1,3>(a2);
+    const vint4 c2 = umax(a2,b2);
+    const vint4 d2 = umin(a2,b2);
+    const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
+    return a3;
+  }
+
+#else // no unsigned min/max here: bias by 0x80000000 so the signed min/max order the lanes as unsigned
+
+  __forceinline vint4 usort_ascending(const vint4& v) // returns the lanes of v sorted ascending, compared as unsigned
+  {
+    const vint4 a0 = v-vint4(0x80000000);                // apply the bias
+    const vint4 b0 = shuffle<1,0,3,2>(a0);
+    const vint4 c0 = min(a0,b0);
+    const vint4 d0 = max(a0,b0);
+    const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
+    const vint4 b1 = shuffle<2,3,0,1>(a1);
+    const vint4 c1 = min(a1,b1);
+    const vint4 d1 = max(a1,b1);
+    const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
+    const vint4 b2 = shuffle<0,2,1,3>(a2);
+    const vint4 c2 = min(a2,b2);
+    const vint4 d2 = max(a2,b2);
+    const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
+    return a3+vint4(0x80000000);                         // undo the bias
+  }
+
+  __forceinline vint4 usort_descending(const vint4& v) // biased network with min/max swapped, giving descending order
+  {
+    const vint4 a0 = v-vint4(0x80000000);                // apply the bias
+    const vint4 b0 = shuffle<1,0,3,2>(a0);
+    const vint4 c0 = max(a0,b0);
+    const vint4 d0 = min(a0,b0);
+    const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
+    const vint4 b1 = shuffle<2,3,0,1>(a1);
+    const vint4 c1 = max(a1,b1);
+    const vint4 d1 = min(a1,b1);
+    const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
+    const vint4 b2 = shuffle<0,2,1,3>(a2);
+    const vint4 c2 = max(a2,b2);
+    const vint4 d2 = min(a2,b2);
+    const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
+    return a3+vint4(0x80000000);                         // undo the bias
+  }
+
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vint4& a) { // prints the four lanes as "<a0, a1, a2, a3>"
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/common/simd/vint8_avx.h b/thirdparty/embree-aarch64/common/simd/vint8_avx.h
new file mode 100644
index 0000000000..25a771284d
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vint8_avx.h
@@ -0,0 +1,464 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 8-wide AVX integer type */
+  template<>
+  struct vint<8> // eight 32-bit int lanes backed by a single __m256i
+  {
+    ALIGNED_STRUCT_(32);
+    
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 };        // number of SIMD elements
+    union {                    // data
+      __m256i v;                   // all eight lanes as one 256-bit register
+      struct { __m128i vl,vh; };   // the same storage viewed as two 128-bit halves (vl first, then vh)
+      int i[8];                    // per-lane scalar view, used by operator[]
+    }; 
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint() {} // leaves the lanes uninitialized
+    __forceinline vint(const vint8& a) { v = a.v; }
+    __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; }
+
+    __forceinline vint(__m256i a) : v(a) {}
+    __forceinline operator const __m256i&() const { return v; }
+    __forceinline operator       __m256i&()       { return v; }
+
+    __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} // a duplicated into both halves
+    __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} // a = low half, b = high half
+    __forceinline vint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {}
+ 
+    __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} // unaligned load of 8 ints
+    __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {} // broadcast a to every lane
+    __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} // lanes: a,b,a,b,a,b,a,b
+    __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} // lanes: a,b,c,d,a,b,c,d
+    __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} // lanes: a..h; last parameter named h so it no longer shadows member vh
+
+    __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} // float -> int value conversion, not a bit cast
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint(ZeroTy)        : v(_mm256_setzero_si256()) {}
+    __forceinline vint(OneTy)         : v(_mm256_set_epi32(1,1,1,1,1,1,1,1)) {}
+    __forceinline vint(PosInfTy)      : v(_mm256_set_epi32(pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf)) {}
+    __forceinline vint(NegInfTy)      : v(_mm256_set_epi32(neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf)) {}
+    __forceinline vint(StepTy)        : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} // lanes: 0,1,...,7
+    __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {} // lanes: 7,6,...,0
+    __forceinline vint(UndefinedTy)   : v(_mm256_undefined_si256()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); }  // aligned load
+    static __forceinline vint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); } // unaligned load
+
+    static __forceinline vint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } // masked load
+    static __forceinline vint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } // same body as the masked load above
+
+    static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); }  // aligned store
+    static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } // unaligned store
+    
+#if !defined(__aarch64__)
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
+#else
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } // aarch64 build casts mask.v rather than mask
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); }
+#endif
+
+    static __forceinline void store_nt(void* ptr, const vint8& v) { // store via _mm256_stream_ps
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+    }
+
+    static __forceinline vint8 load(const uint8_t* ptr) { // two vint4::load calls on ptr+0 and ptr+4 (presumably widening 4 bytes each)
+      vint4 il = vint4::load(ptr+0);
+      vint4 ih = vint4::load(ptr+4);
+      return vint8(il,ih);
+    }
+
+    static __forceinline vint8 loadu(const uint8_t* ptr) { // as above, through vint4::loadu
+      vint4 il = vint4::loadu(ptr+0);
+      vint4 ih = vint4::loadu(ptr+4);
+      return vint8(il,ih);
+    }
+
+    static __forceinline vint8 load(const unsigned short* ptr) { // two vint4::load calls on ptr+0 and ptr+4
+      vint4 il = vint4::load(ptr+0);
+      vint4 ih = vint4::load(ptr+4);
+      return vint8(il,ih);
+    }
+
+    static __forceinline vint8 loadu(const unsigned short* ptr) { // as above, through vint4::loadu
+      vint4 il = vint4::loadu(ptr+0);
+      vint4 ih = vint4::loadu(ptr+4);
+      return vint8(il,ih);
+    }
+
+    static __forceinline void store(uint8_t* ptr, const vint8& i) { // stores each 128-bit half through vint4::store at ptr+0 / ptr+4
+      vint4 il(i.vl);
+      vint4 ih(i.vh);
+      vint4::store(ptr + 0,il);
+      vint4::store(ptr + 4,ih);
+    }
+
+    static __forceinline void store(unsigned short* ptr, const vint8& v) { // scalar loop; each lane is truncated to unsigned short
+      for (size_t i=0;i<8;i++)
+        ptr[i] = (unsigned short)v[i];
+    }
+
+    template<int scale = 4> // scale = byte multiplier applied to every index
+    static __forceinline vint8 gather(const int* ptr, const vint8& index) { // scalar gather: one int read per lane; casts keep ptr const
+      return vint8(
+          *(const int*)(((const int8_t*)ptr)+scale*index[0]),
+          *(const int*)(((const int8_t*)ptr)+scale*index[1]),
+          *(const int*)(((const int8_t*)ptr)+scale*index[2]),
+          *(const int*)(((const int8_t*)ptr)+scale*index[3]),
+          *(const int*)(((const int8_t*)ptr)+scale*index[4]),
+          *(const int*)(((const int8_t*)ptr)+scale*index[5]),
+          *(const int*)(((const int8_t*)ptr)+scale*index[6]),
+          *(const int*)(((const int8_t*)ptr)+scale*index[7]));
+    }
+
+    template<int scale = 4> // scale = byte multiplier applied to every index
+    static __forceinline vint8 gather(const vboolf8& mask, const int* ptr, const vint8& index) { // masked gather: inactive lanes stay 0
+      vint8 r = zero;
+      if (likely(mask[0])) r[0] = *(const int*)(((const int8_t*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(const int*)(((const int8_t*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(const int*)(((const int8_t*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(const int*)(((const int8_t*)ptr)+scale*index[3]);
+      if (likely(mask[4])) r[4] = *(const int*)(((const int8_t*)ptr)+scale*index[4]);
+      if (likely(mask[5])) r[5] = *(const int*)(((const int8_t*)ptr)+scale*index[5]);
+      if (likely(mask[6])) r[6] = *(const int*)(((const int8_t*)ptr)+scale*index[6]);
+      if (likely(mask[7])) r[7] = *(const int*)(((const int8_t*)ptr)+scale*index[7]);
+      return r;
+    }
+
+    template<int scale = 4> // scale = byte multiplier applied to every offset
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v)
+    { // scalar scatter: lane k is written to ptr + scale*ofs[k], in lane order
+      *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
+      *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
+      *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
+      *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
+      *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
+      *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
+      *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
+      *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
+    }
+
+    template<int scale = 4> // scale = byte multiplier applied to every offset
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v)
+    { // masked scatter: only lanes whose mask entry is set are written
+      if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
+      if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
+      if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
+      if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
+      if (likely(mask[4])) *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
+      if (likely(mask[5])) *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
+      if (likely(mask[6])) *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
+      if (likely(mask[7])) *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
+    }
+
+
+    static __forceinline vint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); } // replicates the 64-bit value into every 64-bit slot
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; } // bounds checked by assert only
+    __forceinline       int& operator [](size_t index)       { assert(index < 8); return i[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } // bit reinterpretation via a cast intrinsic, no value conversion
+
+  __forceinline vint8 operator +(const vint8& a) { return a; }
+  __forceinline vint8 operator -(const vint8& a) { return vint8(_mm_sub_epi32(_mm_setzero_si128(), a.vl), _mm_sub_epi32(_mm_setzero_si128(), a.vh)); } // 0 - a, per 128-bit half
+  __forceinline vint8 abs       (const vint8& a) { return vint8(_mm_abs_epi32(a.vl), _mm_abs_epi32(a.vh)); } // per 128-bit half
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 operator +(const vint8& a, const vint8& b) { return vint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); } // 128-bit integer op on each half
+  __forceinline vint8 operator +(const vint8& a, int          b) { return a + vint8(b); } // scalar overloads broadcast the int, then reuse the vector form
+  __forceinline vint8 operator +(int          a, const vint8& b) { return vint8(a) + b; }
+
+  __forceinline vint8 operator -(const vint8& a, const vint8& b) { return vint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); }
+  __forceinline vint8 operator -(const vint8& a, int          b) { return a - vint8(b); }
+  __forceinline vint8 operator -(int          a, const vint8& b) { return vint8(a) - b; }
+
+  __forceinline vint8 operator *(const vint8& a, const vint8& b) { return vint8(_mm_mullo_epi32(a.vl, b.vl), _mm_mullo_epi32(a.vh, b.vh)); } // keeps the low 32 bits of each product
+  __forceinline vint8 operator *(const vint8& a, int          b) { return a * vint8(b); }
+  __forceinline vint8 operator *(int          a, const vint8& b) { return vint8(a) * b; }
+
+  __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } // bitwise op done with the 256-bit float-typed intrinsic
+  __forceinline vint8 operator &(const vint8& a, int          b) { return a & vint8(b); }
+  __forceinline vint8 operator &(int          a, const vint8& b) { return vint8(a) & b; }
+
+  __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+  __forceinline vint8 operator |(const vint8& a, int          b) { return a | vint8(b); }
+  __forceinline vint8 operator |(int          a, const vint8& b) { return vint8(a) | b; }
+
+  __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+  __forceinline vint8 operator ^(const vint8& a, int          b) { return a ^ vint8(b); }
+  __forceinline vint8 operator ^(int          a, const vint8& b) { return vint8(a) ^ b; }
+
+  __forceinline vint8 operator <<(const vint8& a, int n) { return vint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); } // logical left shift
+  __forceinline vint8 operator >>(const vint8& a, int n) { return vint8(_mm_srai_epi32(a.vl, n), _mm_srai_epi32(a.vh, n)); } // arithmetic right shift (same intrinsic as sra)
+
+  __forceinline vint8 sll (const vint8& a, int b) { return vint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); } // shift left logical
+  __forceinline vint8 sra (const vint8& a, int b) { return vint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); } // shift right arithmetic
+  __forceinline vint8 srl (const vint8& a, int b) { return vint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); } // shift right logical
+  
+  __forceinline vint8 min(const vint8& a, const vint8& b) { return vint8(_mm_min_epi32(a.vl, b.vl), _mm_min_epi32(a.vh, b.vh)); } // signed minimum
+  __forceinline vint8 min(const vint8& a, int          b) { return min(a,vint8(b)); }
+  __forceinline vint8 min(int          a, const vint8& b) { return min(vint8(a),b); }
+
+  __forceinline vint8 max(const vint8& a, const vint8& b) { return vint8(_mm_max_epi32(a.vl, b.vl), _mm_max_epi32(a.vh, b.vh)); } // signed maximum
+  __forceinline vint8 max(const vint8& a, int          b) { return max(a,vint8(b)); }
+  __forceinline vint8 max(int          a, const vint8& b) { return max(vint8(a),b); }
+
+  __forceinline vint8 umin(const vint8& a, const vint8& b) { return vint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); } // unsigned minimum
+  __forceinline vint8 umin(const vint8& a, int          b) { return umin(a,vint8(b)); }
+  __forceinline vint8 umin(int          a, const vint8& b) { return umin(vint8(a),b); }
+
+  __forceinline vint8 umax(const vint8& a, const vint8& b) { return vint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); } // unsigned maximum
+  __forceinline vint8 umax(const vint8& a, int          b) { return umax(a,vint8(b)); }
+  __forceinline vint8 umax(int          a, const vint8& b) { return umax(vint8(a),b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8& operator +=(vint8& a, const vint8& b) { a = a + b; return a; } // compound forms: compute with the binary operator, store into a
+  __forceinline vint8& operator +=(vint8& a, int          b) { a = a + b; return a; }
+  
+  __forceinline vint8& operator -=(vint8& a, const vint8& b) { a = a - b; return a; }
+  __forceinline vint8& operator -=(vint8& a, int          b) { a = a - b; return a; }
+  
+  __forceinline vint8& operator *=(vint8& a, const vint8& b) { a = a * b; return a; }
+  __forceinline vint8& operator *=(vint8& a, int          b) { a = a * b; return a; }
+  
+  __forceinline vint8& operator &=(vint8& a, const vint8& b) { a = a & b; return a; }
+  __forceinline vint8& operator &=(vint8& a, int          b) { a = a & b; return a; }
+  
+  __forceinline vint8& operator |=(vint8& a, const vint8& b) { a = a | b; return a; }
+  __forceinline vint8& operator |=(vint8& a, int          b) { a = a | b; return a; }
+  
+  __forceinline vint8& operator <<=(vint8& a, int b) { a = a << b; return a; }
+  __forceinline vint8& operator >>=(vint8& a, int b) { a = a >> b; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)),
+                                                                                     _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); } // compare each 128-bit half, combine into one mask
+  __forceinline vboolf8 operator ==(const vint8& a, int          b) { return a == vint8(b); } // scalar overloads broadcast the int first
+  __forceinline vboolf8 operator ==(int          a, const vint8& b) { return vint8(a) == b; }
+  
+  __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); } // negation of ==
+  __forceinline vboolf8 operator !=(const vint8& a, int          b) { return a != vint8(b); }
+  __forceinline vboolf8 operator !=(int          a, const vint8& b) { return vint8(a) != b; }
+  
+  __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epi32 (a.vl, b.vl)),
+                                                                                     _mm_castsi128_ps(_mm_cmplt_epi32 (a.vh, b.vh))); } // signed less-than
+  __forceinline vboolf8 operator < (const vint8& a, int          b) { return a <  vint8(b); }
+  __forceinline vboolf8 operator < (int          a, const vint8& b) { return vint8(a) <  b; }
+  
+  __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a <  b); } // negation of <
+  __forceinline vboolf8 operator >=(const vint8& a, int          b) { return a >= vint8(b); }
+  __forceinline vboolf8 operator >=(int          a, const vint8& b) { return vint8(a) >= b; }
+
+  __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epi32 (a.vl, b.vl)),
+                                                                                     _mm_castsi128_ps(_mm_cmpgt_epi32 (a.vh, b.vh))); } // signed greater-than
+  __forceinline vboolf8 operator > (const vint8& a, int          b) { return a >  vint8(b); }
+  __forceinline vboolf8 operator > (int          a, const vint8& b) { return vint8(a) >  b; }
+
+  __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a >  b); } // negation of >
+  __forceinline vboolf8 operator <=(const vint8& a, int          b) { return a <= vint8(b); }
+  __forceinline vboolf8 operator <=(int          a, const vint8& b) { return vint8(a) <= b; }
+
+  __forceinline vboolf8 eq(const vint8& a, const vint8& b) { return a == b; } // named aliases for the comparison operators
+  __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; }
+  __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a <  b; }
+  __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; }
+  __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a >  b; }
+  __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; }
+
+  __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); } // masked forms: comparison result ANDed with mask
+  __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); }
+  __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <  b); }
+  __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); }
+  __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >  b); }
+  __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); }
+
+  __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { // per lane: t where m is set, otherwise f
+    return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); 
+  }
+
+  __forceinline vint8 notand(const vboolf8& m, const vint8& f) { // (~m) & f
+    return _mm256_castps_si256(_mm256_andnot_ps(m, _mm256_castsi256_ps(f))); 
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+  __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+
+  template<int i>
+  __forceinline vint8 shuffle(const vint8& v) { // same index i for all four positions of the permute
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1>
+  __forceinline vint8 shuffle4(const vint8& v) { // 128-bit half selection: i0 in the low nibble of the immediate, i1 in the high nibble
+    return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1>
+  __forceinline vint8 shuffle4(const vint8& a, const vint8& b) { // two-source form of the half selection above
+    return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint8 shuffle(const vint8& v) { // 4-index permute; _mm256_permute_ps applies it within each 128-bit half
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint8 shuffle(const vint8& a, const vint8& b) { // two-source shuffle via _mm256_shuffle_ps
+    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } // specialization using a dedicated duplicate instruction
+  template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); }
+  template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); }
+
+  __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } // loads one int and replicates it to all lanes
+  template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); } // replaces 128-bit half i of a with b
+  template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); } // returns 128-bit half i
+  template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); } // half 0 is obtained with a cast intrinsic instead of an extract
+
+  __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } // value of lane 0
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 vreduce_min2(const vint8& v) { const vint8 swapped = shuffle<1,0,3,2>(v); return min(v,swapped); } // *2/*4/unsuffixed: reduce over pairs, quads, then both halves
+  __forceinline vint8 vreduce_min4(const vint8& v) { vint8 pairMin = vreduce_min2(v); return min(pairMin,shuffle<2,3,0,1>(pairMin)); }
+  __forceinline vint8 vreduce_min (const vint8& v) { vint8 quadMin = vreduce_min4(v); return min(quadMin,shuffle4<1,0>(quadMin)); }
+
+  __forceinline vint8 vreduce_max2(const vint8& v) { const vint8 swapped = shuffle<1,0,3,2>(v); return max(v,swapped); }
+  __forceinline vint8 vreduce_max4(const vint8& v) { vint8 pairMax = vreduce_max2(v); return max(pairMax,shuffle<2,3,0,1>(pairMax)); }
+  __forceinline vint8 vreduce_max (const vint8& v) { vint8 quadMax = vreduce_max4(v); return max(quadMax,shuffle4<1,0>(quadMax)); }
+
+  __forceinline vint8 vreduce_add2(const vint8& v) { const vint8 swapped = shuffle<1,0,3,2>(v); return v + swapped; }
+  __forceinline vint8 vreduce_add4(const vint8& v) { vint8 pairSum = vreduce_add2(v); return pairSum + shuffle<2,3,0,1>(pairSum); }
+  __forceinline vint8 vreduce_add (const vint8& v) { vint8 quadSum = vreduce_add4(v); return quadSum + shuffle4<1,0>(quadSum); }
+
+  __forceinline int reduce_min(const vint8& v) { const vint8 reduced = vreduce_min(v); return toScalar(reduced); } // scalar result taken from lane 0
+  __forceinline int reduce_max(const vint8& v) { const vint8 reduced = vreduce_max(v); return toScalar(reduced); }
+  __forceinline int reduce_add(const vint8& v) { const vint8 reduced = vreduce_add(v); return toScalar(reduced); }
+
+  __forceinline size_t select_min(const vint8& v) { const vint8 best = vreduce_min(v); return bsf(movemask(v == best)); } // bsf over the mask of lanes equal to the minimum
+  __forceinline size_t select_max(const vint8& v) { const vint8 best = vreduce_max(v); return bsf(movemask(v == best)); }
+
+  __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 masked = select(valid,v,vint8(pos_inf)); const vint8 best = vreduce_min(masked); return bsf(movemask(valid & (masked == best))); }
+  __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 masked = select(valid,v,vint8(neg_inf)); const vint8 best = vreduce_max(masked); return bsf(movemask(valid & (masked == best))); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Sorting networks
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 usort_ascending(const vint8& v)
+  { // six compare-exchange stages; each select() bit mask marks the lanes that keep the umin result
+    const vint8 key0  = v;
+    const vint8 peer0 = shuffle<1,0,3,2>(key0);
+    const vint8 lo0   = umin(key0,peer0);
+    const vint8 hi0   = umax(key0,peer0);
+    const vint8 key1  = select(0x99 /* 0b10011001 */,lo0,hi0);
+    const vint8 peer1 = shuffle<2,3,0,1>(key1);
+    const vint8 lo1   = umin(key1,peer1);
+    const vint8 hi1   = umax(key1,peer1);
+    const vint8 key2  = select(0xc3 /* 0b11000011 */,lo1,hi1);
+    const vint8 peer2 = shuffle<1,0,3,2>(key2);
+    const vint8 lo2   = umin(key2,peer2);
+    const vint8 hi2   = umax(key2,peer2);
+    const vint8 key3  = select(0xa5 /* 0b10100101 */,lo2,hi2);
+    const vint8 peer3 = shuffle4<1,0>(key3);                    // exchange the two 128-bit halves
+    const vint8 lo3   = umin(key3,peer3);
+    const vint8 hi3   = umax(key3,peer3);
+    const vint8 key4  = select(0xf /* 0b00001111 */,lo3,hi3);
+    const vint8 peer4 = shuffle<2,3,0,1>(key4);
+    const vint8 lo4   = umin(key4,peer4);
+    const vint8 hi4   = umax(key4,peer4);
+    const vint8 key5  = select(0x33 /* 0b00110011 */,lo4,hi4);
+    const vint8 peer5 = shuffle<1,0,3,2>(key5);
+    const vint8 lo5   = umin(key5,peer5);
+    const vint8 hi5   = umax(key5,peer5);
+    const vint8 key6  = select(0x55 /* 0b01010101 */,lo5,hi5);
+    return key6;
+  }
+
+  __forceinline vint8 usort_descending(const vint8& v)
+  { // six compare-exchange stages; each select() bit mask marks the lanes that keep the umax result
+    const vint8 key0  = v;
+    const vint8 peer0 = shuffle<1,0,3,2>(key0);
+    const vint8 hi0   = umax(key0,peer0);
+    const vint8 lo0   = umin(key0,peer0);
+    const vint8 key1  = select(0x99 /* 0b10011001 */,hi0,lo0);
+    const vint8 peer1 = shuffle<2,3,0,1>(key1);
+    const vint8 hi1   = umax(key1,peer1);
+    const vint8 lo1   = umin(key1,peer1);
+    const vint8 key2  = select(0xc3 /* 0b11000011 */,hi1,lo1);
+    const vint8 peer2 = shuffle<1,0,3,2>(key2);
+    const vint8 hi2   = umax(key2,peer2);
+    const vint8 lo2   = umin(key2,peer2);
+    const vint8 key3  = select(0xa5 /* 0b10100101 */,hi2,lo2);
+    const vint8 peer3 = shuffle4<1,0>(key3);                    // exchange the two 128-bit halves
+    const vint8 hi3   = umax(key3,peer3);
+    const vint8 lo3   = umin(key3,peer3);
+    const vint8 key4  = select(0xf /* 0b00001111 */,hi3,lo3);
+    const vint8 peer4 = shuffle<2,3,0,1>(key4);
+    const vint8 hi4   = umax(key4,peer4);
+    const vint8 lo4   = umin(key4,peer4);
+    const vint8 key5  = select(0x33 /* 0b00110011 */,hi4,lo4);
+    const vint8 peer5 = shuffle<1,0,3,2>(key5);
+    const vint8 hi5   = umax(key5,peer5);
+    const vint8 lo5   = umin(key5,peer5);
+    const vint8 key6  = select(0x55 /* 0b01010101 */,hi5,lo5);
+    return key6;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) { // writes the eight lanes as "<a0, a1, ..., a7>"
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vint8_avx2.h b/thirdparty/embree-aarch64/common/simd/vint8_avx2.h
new file mode 100644
index 0000000000..4937d972cf
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vint8_avx2.h
@@ -0,0 +1,512 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 8-wide AVX2 integer type: eight int lanes held in one __m256i register */
+  template<>
+  struct vint<8>
+  {
+    ALIGNED_STRUCT_(32);
+
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 }; // number of SIMD elements
+    union {             // data
+      __m256i v;        // whole-register view
+      int i[8];         // per-lane view, used by operator []
+    }; 
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint() {} // lanes are left uninitialized
+    __forceinline vint(const vint8& a) { v = a.v; }
+    __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; }
+
+    __forceinline vint(__m256i a) : v(a) {} // implicit wrap of a raw register
+    __forceinline operator const __m256i&() const { return v; }
+    __forceinline operator       __m256i&()       { return v; }
+
+    __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} // a duplicated into both 128-bit halves
+    __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} // a = low half, b = high half
+    __forceinline vint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} // same, from raw 128-bit registers
+ 
+    __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} // unaligned load of 8 consecutive ints
+    __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {} // broadcast a to all lanes
+    __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} // lanes 0..7 = a,b,a,b,a,b,a,b
+    __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} // lanes 0..7 = a,b,c,d,a,b,c,d
+    __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} // lanes 0..7 = a..h (set_epi32 lists the highest lane first)
+
+    __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} // float -> int value conversion, not a bit cast
+
+#if defined(__AVX512VL__)
+    __forceinline explicit vint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {} // each mask bit becomes an all-ones or all-zeros lane
+#else
+    __forceinline explicit vint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {} // reinterprets the bits of the mask register
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint(ZeroTy)        : v(_mm256_setzero_si256()) {}
+    __forceinline vint(OneTy)         : v(_mm256_set1_epi32(1)) {}
+    __forceinline vint(PosInfTy)      : v(_mm256_set1_epi32(pos_inf)) {}
+    __forceinline vint(NegInfTy)      : v(_mm256_set1_epi32(neg_inf)) {}
+    __forceinline vint(StepTy)        : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} // lanes 0..7 = 0,1,...,7
+    __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {} // lanes 0..7 = 7,6,...,0
+    __forceinline vint(UndefinedTy)   : v(_mm256_undefined_si256()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vint8 load(const uint8_t* ptr)  { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } // reads 8 bytes, zero-extends each to 32 bits
+    static __forceinline vint8 loadu(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } // same code as load(): the 64-bit load has no alignment requirement
+    static __forceinline vint8 load(const unsigned short* ptr)  { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } // 8 x 16 bit, zero-extended; aligned 128-bit load
+    static __forceinline vint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } // same, unaligned load
+
+    static __forceinline vint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); } // aligned 256-bit load
+    static __forceinline vint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); } // unaligned 256-bit load
+
+    static __forceinline void store (void* ptr, const vint8& v) { _mm256_store_si256((__m256i*)ptr,v); } // aligned 256-bit store
+    static __forceinline void storeu(void* ptr, const vint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); } // unaligned store through the float-typed intrinsic
+
+#if defined(__AVX512VL__)
+
+    static __forceinline vint8 compact(const vboolf8& mask, vint8 &v) { // packs the lanes selected by mask to the front; remaining lanes are taken from v
+      return _mm256_mask_compress_epi32(v, mask, v);
+    }
+    static __forceinline vint8 compact(const vboolf8& mask, vint8 &a, const vint8& b) { // packs the selected lanes of b to the front; remaining lanes are taken from a
+      return _mm256_mask_compress_epi32(a, mask, b);
+    }
+
+    static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); } // unselected lanes are zero
+    static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); }
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); } // only selected lanes are written
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); }
+#else
+    static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } // masked load through the float-typed intrinsic
+    static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } // same code as the load() overload above
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } // only selected lanes are written
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } // same code as the store() overload above
+#endif
+    
+    static __forceinline vint8 load_nt(void* ptr) { // non-temporal (streaming) 256-bit load
+      return _mm256_stream_load_si256((__m256i*)ptr);
+    }
+
+    static __forceinline void store_nt(void* ptr, const vint8& v) { // non-temporal (streaming) 256-bit store
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+    }
+
+    static __forceinline void store(uint8_t* ptr, const vint8& i) // writes one byte per lane
+    {
+      for (size_t j=0; j<8; j++)
+        ptr[j] = i[j]; // implicit int -> uint8_t conversion keeps the low 8 bits
+    }
+
+    static __forceinline void store(unsigned short* ptr, const vint8& v) { // writes one 16-bit value per lane
+      for (size_t i=0;i<8;i++)
+        ptr[i] = (unsigned short)v[i];
+    }
+
+    template<int scale = 4> // scale: byte multiplier applied to every index
+    static __forceinline vint8 gather(const int *const ptr, const vint8& index) {
+      return _mm256_i32gather_epi32(ptr, index, scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vint8 gather(const vboolf8& mask, const int *const ptr, const vint8& index) {
+      vint8 r = zero; // value passed as the gather source operand, i.e. what unselected lanes receive
+#if defined(__AVX512VL__)
+      return _mm256_mmask_i32gather_epi32(r, mask, index, ptr, scale);
+#else
+      return _mm256_mask_i32gather_epi32(r, ptr, index, mask, scale);
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale);
+#else
+      *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; // fallback: one scalar store per lane at byte offset scale*ofs[k]
+      *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
+      *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
+      *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
+      *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
+      *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
+      *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
+      *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale);
+#else
+      if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; // fallback: scalar store for each lane whose mask entry is set
+      if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
+      if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
+      if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
+      if (likely(mask[4])) *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
+      if (likely(mask[5])) *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
+      if (likely(mask[6])) *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
+      if (likely(mask[7])) *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    static __forceinline vint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); } // replicates a into all four 64-bit slots of the register
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; } // lane access via the int view; range is checked by assert only
+    __forceinline       int& operator [](size_t index)       { assert(index < 8); return i[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_movepi32_mask(a); } // mask bit k = most significant bit of lane k
+#else
+  static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } // bit-for-bit reinterpretation, no value conversion
+#endif
+
+  __forceinline vint8 operator +(const vint8& a) { return a; } // unary plus: identity
+  __forceinline vint8 operator -(const vint8& a) { return _mm256_sub_epi32(_mm256_setzero_si256(), a); } // negation computed as 0 - a per lane
+  __forceinline vint8 abs       (const vint8& a) { return _mm256_abs_epi32(a); } // lane-wise absolute value
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 operator +(const vint8& a, const vint8& b) { const __m256i sum = _mm256_add_epi32(a, b); return sum; } // lane-wise sum
+  __forceinline vint8 operator +(const vint8& a, int          b) { return _mm256_add_epi32(a, _mm256_set1_epi32(b)); } // scalar is broadcast to all lanes first
+  __forceinline vint8 operator +(int          a, const vint8& b) { return _mm256_add_epi32(_mm256_set1_epi32(a), b); }
+  /* lane-wise difference; a scalar operand is broadcast to all lanes */
+  __forceinline vint8 operator -(const vint8& a, const vint8& b) { const __m256i diff = _mm256_sub_epi32(a, b); return diff; }
+  __forceinline vint8 operator -(const vint8& a, int          b) { return _mm256_sub_epi32(a, _mm256_set1_epi32(b)); }
+  __forceinline vint8 operator -(int          a, const vint8& b) { return _mm256_sub_epi32(_mm256_set1_epi32(a), b); }
+  /* lane-wise product keeping the low 32 bits of each result (mullo); a scalar operand is broadcast */
+  __forceinline vint8 operator *(const vint8& a, const vint8& b) { const __m256i prod = _mm256_mullo_epi32(a, b); return prod; }
+  __forceinline vint8 operator *(const vint8& a, int          b) { return _mm256_mullo_epi32(a, _mm256_set1_epi32(b)); }
+  __forceinline vint8 operator *(int          a, const vint8& b) { return _mm256_mullo_epi32(_mm256_set1_epi32(a), b); }
+
+  __forceinline vint8 operator &(const vint8& a, const vint8& b) { const __m256i bits = _mm256_and_si256(a, b); return bits; } // bitwise AND over the whole register
+  __forceinline vint8 operator &(const vint8& a, int          b) { return _mm256_and_si256(a, _mm256_set1_epi32(b)); } // scalar is broadcast to all lanes first
+  __forceinline vint8 operator &(int          a, const vint8& b) { return _mm256_and_si256(_mm256_set1_epi32(a), b); }
+  /* bitwise OR; a scalar operand is broadcast to all lanes */
+  __forceinline vint8 operator |(const vint8& a, const vint8& b) { const __m256i bits = _mm256_or_si256(a, b); return bits; }
+  __forceinline vint8 operator |(const vint8& a, int          b) { return _mm256_or_si256(a, _mm256_set1_epi32(b)); }
+  __forceinline vint8 operator |(int          a, const vint8& b) { return _mm256_or_si256(_mm256_set1_epi32(a), b); }
+  /* bitwise XOR; a scalar operand is broadcast to all lanes */
+  __forceinline vint8 operator ^(const vint8& a, const vint8& b) { const __m256i bits = _mm256_xor_si256(a, b); return bits; }
+  __forceinline vint8 operator ^(const vint8& a, int          b) { return _mm256_xor_si256(a, _mm256_set1_epi32(b)); }
+  __forceinline vint8 operator ^(int          a, const vint8& b) { return _mm256_xor_si256(_mm256_set1_epi32(a), b); }
+
+  __forceinline vint8 operator <<(const vint8& a, int n) { return _mm256_slli_epi32(a, n); } // left shift, same count n for every lane
+  __forceinline vint8 operator >>(const vint8& a, int n) { return _mm256_srai_epi32(a, n); } // arithmetic (sign-propagating) right shift
+  // vector-count variants: lane k of a is shifted by lane k of n
+  __forceinline vint8 operator <<(const vint8& a, const vint8& n) { return _mm256_sllv_epi32(a, n); }
+  __forceinline vint8 operator >>(const vint8& a, const vint8& n) { return _mm256_srav_epi32(a, n); }
+  // named forms: sll = shift left logical, sra = shift right arithmetic, srl = shift right logical (zero fill)
+  __forceinline vint8 sll(const vint8& a, int b) { return _mm256_slli_epi32(a, b); }
+  __forceinline vint8 sra(const vint8& a, int b) { return _mm256_srai_epi32(a, b); }
+  __forceinline vint8 srl(const vint8& a, int b) { return _mm256_srli_epi32(a, b); }
+  // per-lane shift counts taken from b
+  __forceinline vint8 sll(const vint8& a, const vint8& b) { return _mm256_sllv_epi32(a, b); }
+  __forceinline vint8 sra(const vint8& a, const vint8& b) { return _mm256_srav_epi32(a, b); }
+  __forceinline vint8 srl(const vint8& a, const vint8& b) { return _mm256_srlv_epi32(a, b); }
+  
+  __forceinline vint8 min(const vint8& a, const vint8& b) { const __m256i lo = _mm256_min_epi32(a, b); return lo; } // signed lane-wise minimum
+  __forceinline vint8 min(const vint8& a, int          b) { return _mm256_min_epi32(a, _mm256_set1_epi32(b)); } // scalar is broadcast to all lanes first
+  __forceinline vint8 min(int          a, const vint8& b) { return _mm256_min_epi32(_mm256_set1_epi32(a), b); }
+  /* signed lane-wise maximum; a scalar operand is broadcast to all lanes */
+  __forceinline vint8 max(const vint8& a, const vint8& b) { const __m256i hi = _mm256_max_epi32(a, b); return hi; }
+  __forceinline vint8 max(const vint8& a, int          b) { return _mm256_max_epi32(a, _mm256_set1_epi32(b)); }
+  __forceinline vint8 max(int          a, const vint8& b) { return _mm256_max_epi32(_mm256_set1_epi32(a), b); }
+  /* the same operations with the lanes interpreted as unsigned 32-bit values */
+  __forceinline vint8 umin(const vint8& a, const vint8& b) { const __m256i lo = _mm256_min_epu32(a, b); return lo; }
+  __forceinline vint8 umax(const vint8& a, const vint8& b) { const __m256i hi = _mm256_max_epu32(a, b); return hi; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8& operator +=(vint8& a, const vint8& b) { a = a + b; return a; } // each compound operator stores the binary result back into a
+  __forceinline vint8& operator +=(vint8& a, int          b) { a = a + b; return a; }
+  /* in-place subtraction */
+  __forceinline vint8& operator -=(vint8& a, const vint8& b) { a = a - b; return a; }
+  __forceinline vint8& operator -=(vint8& a, int          b) { a = a - b; return a; }
+  /* in-place multiplication */
+  __forceinline vint8& operator *=(vint8& a, const vint8& b) { a = a * b; return a; }
+  __forceinline vint8& operator *=(vint8& a, int          b) { a = a * b; return a; }
+  /* in-place bitwise AND */
+  __forceinline vint8& operator &=(vint8& a, const vint8& b) { a = a & b; return a; }
+  __forceinline vint8& operator &=(vint8& a, int          b) { a = a & b; return a; }
+  /* in-place bitwise OR */
+  __forceinline vint8& operator |=(vint8& a, const vint8& b) { a = a | b; return a; }
+  __forceinline vint8& operator |=(vint8& a, int          b) { a = a | b; return a; }
+  /* in-place shifts by a scalar count */
+  __forceinline vint8& operator <<=(vint8& a, const int b) { a = a << b; return a; }
+  __forceinline vint8& operator >>=(vint8& a, const int b) { a = a >> b; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } // AVX-512VL path: signed lane compares that yield a mask
+  static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
+  static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
+  static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
+  static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
+  static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
+  // select: lane k of the result is t[k] where mask bit k is set, otherwise f[k]
+  static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) {
+    return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t);
+  }
+#else
+  static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); } // integer compare result reinterpreted as a float-typed mask
+  static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); } // derived by negating ==
+  static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b, a)); } // a < b expressed as b > a (signed)
+  static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a <  b); } // derived by negating <
+  static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, b)); } // signed greater-than
+  static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a >  b); } // derived by negating >
+  // select: blendv takes t[k] where the sign bit of mask lane k is set, otherwise f[k]
+  static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) {
+    return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
+  }
+#endif
+  // compile-time mask variant: bit k of mask set -> lane k from t, otherwise from f
+  template<int mask>
+  __forceinline vint8 select(const vint8& t, const vint8& f) {
+    return _mm256_blend_epi32(f, t, mask);
+  }
+
+  __forceinline vboolf8 operator ==(const vint8& a, int          b) { const vint8 rhs(b); return a == rhs; } // scalar side is broadcast, then the vector compare is used
+  __forceinline vboolf8 operator ==(int          a, const vint8& b) { const vint8 lhs(a); return lhs == b; }
+  /* not-equal against a broadcast scalar */
+  __forceinline vboolf8 operator !=(const vint8& a, int          b) { const vint8 rhs(b); return a != rhs; }
+  __forceinline vboolf8 operator !=(int          a, const vint8& b) { const vint8 lhs(a); return lhs != b; }
+  /* less-than against a broadcast scalar */
+  __forceinline vboolf8 operator < (const vint8& a, int          b) { const vint8 rhs(b); return a <  rhs; }
+  __forceinline vboolf8 operator < (int          a, const vint8& b) { const vint8 lhs(a); return lhs <  b; }
+  /* greater-or-equal against a broadcast scalar */
+  __forceinline vboolf8 operator >=(const vint8& a, int          b) { const vint8 rhs(b); return a >= rhs; }
+  __forceinline vboolf8 operator >=(int          a, const vint8& b) { const vint8 lhs(a); return lhs >= b; }
+  /* greater-than against a broadcast scalar */
+  __forceinline vboolf8 operator > (const vint8& a, int          b) { const vint8 rhs(b); return a >  rhs; }
+  __forceinline vboolf8 operator > (int          a, const vint8& b) { const vint8 lhs(a); return lhs >  b; }
+  /* less-or-equal against a broadcast scalar */
+  __forceinline vboolf8 operator <=(const vint8& a, int          b) { const vint8 rhs(b); return a <= rhs; }
+  __forceinline vboolf8 operator <=(int          a, const vint8& b) { const vint8 lhs(a); return lhs <= b; }
+
+  __forceinline vboolf8 eq(const vint8& a, const vint8& b) { return a == b; } // named aliases for the comparison operators
+  __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; }
+  __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a <  b; }
+  __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; }
+  __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a >  b; }
+  __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; }
+  // masked variants: a result lane can only be set where the incoming mask is set
+#if defined(__AVX512VL__)
+  static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); } // mask applied by the compare instruction itself
+  static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); }
+  static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); }
+  static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); }
+  static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); }
+  static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+  static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); } // unmasked compare, then AND with mask
+  static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); }
+  static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <  b); }
+  static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); }
+  static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >  b); }
+  static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_unpacklo_epi32(a, b); } // interleaves the low two lanes of a and b within each 128-bit half
+  __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_unpackhi_epi32(a, b); } // interleaves the high two lanes of a and b within each 128-bit half
+  // shuffle<i>: replicates element i of each 128-bit half across that half
+  template<int i>
+  __forceinline vint8 shuffle(const vint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+  // shuffle4<i0,i1>: result low half = 128-bit half i0 of v, result high half = half i1 of v
+  template<int i0, int i1>
+  __forceinline vint8 shuffle4(const vint8& v) {
+    return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
+  }
+  // two-source form: selectors 0/1 pick the low/high half of a, 2/3 the low/high half of b
+  template<int i0, int i1>
+  __forceinline vint8 shuffle4(const vint8& a, const vint8& b) {
+    return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+  }
+  // shuffle<i0,i1,i2,i3>: within each 128-bit half, result element k is source element ik
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint8 shuffle(const vint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+  // two-source form: per half, elements 0 and 1 come from a (i0, i1), elements 2 and 3 from b (i2, i3)
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint8 shuffle(const vint8& a, const vint8& b) {
+    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+  // specializations that map common patterns onto dedicated duplicate instructions
+  template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); }
+  template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); }
+  template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); }
+
+  __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } // loads one 32-bit value and replicates it to all lanes
+  // insert4<i> replaces 128-bit half i of a with b; extract4<i> returns 128-bit half i of a
+  template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); }
+  template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); }
+  template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); } // low half via a register cast instead of an extract
+  // toScalar returns lane 0
+  __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
+
+#if !defined(__aarch64__)
+  // the three functions below are compiled only when __aarch64__ is not defined
+__forceinline vint8 permute(const vint8& v, const __m256i& index) { // permute across all 8 lanes, source lane chosen by the matching lane of index
+    return _mm256_permutevar8x32_epi32(v, index);
+  }
+
+  __forceinline vint8 shuffle(const vint8& v, const __m256i& index) { // variable shuffle that stays within each 128-bit half
+    return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index));
+  }
+  // align_shift_right<i>: concatenates a (upper) and b (lower) and shifts right by i 32-bit elements.
+  // NOTE(review): _mm256_alignr_epi32 works on the full 256-bit pair, while _mm256_alignr_epi8 works
+  // independently on each 128-bit half, so the two paths may not be equivalent -- confirm against callers.
+  template<int i>
+  static __forceinline vint8 align_shift_right(const vint8& a, const vint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_alignr_epi32(a, b, i);    
+#else
+    return _mm256_alignr_epi8(a, b, 4*i); // count is in bytes here, hence 4*i
+#endif
+  }  
+
+#endif
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 vreduce_min2(const vint8& v) { const vint8 swapped = shuffle<1,0,3,2>(v); return min(v,swapped); } // minimum of each adjacent lane pair
+  __forceinline vint8 vreduce_min4(const vint8& v) { const vint8 pairs = vreduce_min2(v); const vint8 rotated = shuffle<2,3,0,1>(pairs); return min(pairs,rotated); } // minimum of each 128-bit half
+  __forceinline vint8 vreduce_min (const vint8& v) { const vint8 halves = vreduce_min4(v); const vint8 flipped = shuffle4<1,0>(halves); return min(halves,flipped); } // minimum of all 8 lanes, present in every lane
+  /* same three-step reduction using max */
+  __forceinline vint8 vreduce_max2(const vint8& v) { const vint8 swapped = shuffle<1,0,3,2>(v); return max(v,swapped); }
+  __forceinline vint8 vreduce_max4(const vint8& v) { const vint8 pairs = vreduce_max2(v); const vint8 rotated = shuffle<2,3,0,1>(pairs); return max(pairs,rotated); }
+  __forceinline vint8 vreduce_max (const vint8& v) { const vint8 halves = vreduce_max4(v); const vint8 flipped = shuffle4<1,0>(halves); return max(halves,flipped); }
+  /* same three-step reduction using addition */
+  __forceinline vint8 vreduce_add2(const vint8& v) { const vint8 swapped = shuffle<1,0,3,2>(v); return v + swapped; }
+  __forceinline vint8 vreduce_add4(const vint8& v) { const vint8 pairs = vreduce_add2(v); const vint8 rotated = shuffle<2,3,0,1>(pairs); return pairs + rotated; }
+  __forceinline vint8 vreduce_add (const vint8& v) { const vint8 halves = vreduce_add4(v); const vint8 flipped = shuffle4<1,0>(halves); return halves + flipped; }
+  /* scalar results: lane 0 of the corresponding vector reduction */
+  __forceinline int reduce_min(const vint8& v) { const vint8 all = vreduce_min(v); return toScalar(all); }
+  __forceinline int reduce_max(const vint8& v) { const vint8 all = vreduce_max(v); return toScalar(all); }
+  __forceinline int reduce_add(const vint8& v) { const vint8 all = vreduce_add(v); return toScalar(all); }
+
+  __forceinline size_t select_min(const vint8& v) { const vboolf8 isMin = (v == vreduce_min(v)); return bsf(movemask(isMin)); } // index of the first lane equal to the minimum
+  __forceinline size_t select_max(const vint8& v) { const vboolf8 isMax = (v == vreduce_max(v)); return bsf(movemask(isMax)); } // index of the first lane equal to the maximum
+  /* masked variants: inactive lanes are replaced by pos_inf / neg_inf before reducing and excluded from the result */
+  __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 fill(pos_inf); const vint8 a = select(valid,v,fill); const vboolf8 hit = valid & (a == vreduce_min(a)); return bsf(movemask(hit)); }
+  __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 fill(neg_inf); const vint8 a = select(valid,v,fill); const vboolf8 hit = valid & (a == vreduce_max(a)); return bsf(movemask(hit)); }
+
+  /* assign: widens a vint4 into the low half of a vint8 through a register cast */
+  __forceinline vint8 assign(const vint4& a) { const __m256i widened = _mm256_castsi128_si256(a); return widened; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Sorting networks
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Compare-exchange network over the 8 lanes using unsigned umin/umax; ascending order per the function name. */
+  __forceinline vint8 usort_ascending(const vint8& v)
+  {
+    const vint8 swap0 = shuffle<1,0,3,2>(v);
+    const vint8 lo0   = umin(v,swap0);
+    const vint8 hi0   = umax(v,swap0);
+    const vint8 s1    = select<0x99 /* 0b10011001 */>(lo0,hi0);
+    const vint8 swap1 = shuffle<2,3,0,1>(s1);
+    const vint8 lo1   = umin(s1,swap1);
+    const vint8 hi1   = umax(s1,swap1);
+    const vint8 s2    = select<0xc3 /* 0b11000011 */>(lo1,hi1);
+    const vint8 swap2 = shuffle<1,0,3,2>(s2);
+    const vint8 lo2   = umin(s2,swap2);
+    const vint8 hi2   = umax(s2,swap2);
+    const vint8 s3    = select<0xa5 /* 0b10100101 */>(lo2,hi2);
+    /* shuffle4<1,0> swaps the two 128-bit halves; every other stage pairs lanes within a half */
+    const vint8 swap3 = shuffle4<1,0>(s3);
+    const vint8 lo3   = umin(s3,swap3);
+    const vint8 hi3   = umax(s3,swap3);
+    const vint8 s4    = select<0xf /* 0b00001111 */>(lo3,hi3);
+    const vint8 swap4 = shuffle<2,3,0,1>(s4);
+    const vint8 lo4   = umin(s4,swap4);
+    const vint8 hi4   = umax(s4,swap4);
+    const vint8 s5    = select<0x33 /* 0b00110011 */>(lo4,hi4);
+    const vint8 swap5 = shuffle<1,0,3,2>(s5);
+    const vint8 lo5   = umin(s5,swap5);
+    const vint8 hi5   = umax(s5,swap5);
+    return select<0x55 /* 0b01010101 */>(lo5,hi5);
+  }
+
+  /* Compare-exchange network over the 8 lanes using unsigned umin/umax; descending order per the function name. */
+  __forceinline vint8 usort_descending(const vint8& v)
+  {
+    const vint8 swap0 = shuffle<1,0,3,2>(v);
+    const vint8 hi0   = umax(v,swap0);
+    const vint8 lo0   = umin(v,swap0);
+    const vint8 s1    = select<0x99 /* 0b10011001 */>(hi0,lo0);
+    const vint8 swap1 = shuffle<2,3,0,1>(s1);
+    const vint8 hi1   = umax(s1,swap1);
+    const vint8 lo1   = umin(s1,swap1);
+    const vint8 s2    = select<0xc3 /* 0b11000011 */>(hi1,lo1);
+    const vint8 swap2 = shuffle<1,0,3,2>(s2);
+    const vint8 hi2   = umax(s2,swap2);
+    const vint8 lo2   = umin(s2,swap2);
+    const vint8 s3    = select<0xa5 /* 0b10100101 */>(hi2,lo2);
+    /* shuffle4<1,0> swaps the two 128-bit halves; every other stage pairs lanes within a half */
+    const vint8 swap3 = shuffle4<1,0>(s3);
+    const vint8 hi3   = umax(s3,swap3);
+    const vint8 lo3   = umin(s3,swap3);
+    const vint8 s4    = select<0xf /* 0b00001111 */>(hi3,lo3);
+    const vint8 swap4 = shuffle<2,3,0,1>(s4);
+    const vint8 hi4   = umax(s4,swap4);
+    const vint8 lo4   = umin(s4,swap4);
+    const vint8 s5    = select<0x33 /* 0b00110011 */>(hi4,lo4);
+    const vint8 swap5 = shuffle<1,0,3,2>(s5);
+    const vint8 hi5   = umax(s5,swap5);
+    const vint8 lo5   = umin(s5,swap5);
+    return select<0x55 /* 0b01010101 */>(hi5,lo5);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) { // writes the lanes as "<a0, a1, ..., a7>"
+    cout << "<" << a[0]; for (size_t lane=1; lane<8; lane++) cout << ", " << a[lane]; return cout << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vllong4_avx2.h b/thirdparty/embree-aarch64/common/simd/vllong4_avx2.h
new file mode 100644
index 0000000000..de3ebc16a7
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vllong4_avx2.h
@@ -0,0 +1,358 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{ 
+  /* 4-wide AVX2 64-bit long long type: four 64-bit lanes stored in one __m256i */
+  template<>
+  struct vllong<4>
+  {
+    ALIGNED_STRUCT_(32);
+    
+    typedef vboold4 Bool;
+
+    enum  { size = 4 }; // number of SIMD elements
+    union {             // data
+      __m256i v; // whole-register view used by the intrinsics
+      long long i[4]; // per-lane view used by operator[]
+    };
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+       
+    __forceinline vllong() {} // leaves v uninitialized
+    __forceinline vllong(const vllong4& t) { v = t.v; }
+    __forceinline vllong4& operator =(const vllong4& f) { v = f.v; return *this; }
+
+    __forceinline vllong(const __m256i& t) { v = t; }
+    __forceinline operator __m256i() const { return v; }
+    __forceinline operator __m256d() const { return _mm256_castsi256_pd(v); } // bit-cast, no numeric conversion
+
+
+    __forceinline vllong(long long i) { // broadcast i to all four lanes
+      v = _mm256_set1_epi64x(i);
+    }
+    
+    __forceinline vllong(long long a, long long b, long long c, long long d) { // lanes 0..3 = a, b, c, d
+      v = _mm256_set_epi64x(d,c,b,a); // _mm256_set_epi64x takes its arguments from the highest lane down
+    }
+   
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vllong(ZeroTy) : v(_mm256_setzero_si256()) {}
+    __forceinline vllong(OneTy)  : v(_mm256_set1_epi64x(1)) {}
+    __forceinline vllong(StepTy) : v(_mm256_set_epi64x(3,2,1,0)) {} // lanes 0..3 = 0, 1, 2, 3
+    __forceinline vllong(ReverseStepTy) : v(_mm256_set_epi64x(0,1,2,3)) {} // lanes 0..3 = 3, 2, 1, 0
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline void store_nt(void* __restrict__ ptr, const vllong4& a) { // non-temporal (streaming) store of all 32 bytes
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(a)); // bit-cast to float lanes only to reach the streaming-store intrinsic
+    }
+
+    static __forceinline vllong4 loadu(const void* addr) // unaligned 32-byte load
+    {
+      return _mm256_loadu_si256((__m256i*)addr);
+    }
+
+    static __forceinline vllong4 load(const vllong4* addr) { // aligned 32-byte load
+      return _mm256_load_si256((__m256i*)addr);
+    }
+
+    static __forceinline vllong4 load(const long long* addr) { // aligned 32-byte load; _mm256_load_si256 requires a 32-byte aligned address
+      return _mm256_load_si256((__m256i*)addr);
+    }
+
+    static __forceinline void store(void* ptr, const vllong4& v) { // aligned 32-byte store
+      _mm256_store_si256((__m256i*)ptr,v);
+    }
+
+    static __forceinline void storeu(void* ptr, const vllong4& v) { // unaligned 32-byte store
+      _mm256_storeu_si256((__m256i*)ptr,v);
+    }
+
+    static __forceinline void storeu(const vboold4& mask, long long* ptr, const vllong4& f) { // masked store: only lanes selected by mask are written
+#if defined(__AVX512VL__)
+      _mm256_mask_storeu_epi64(ptr,mask,f);
+#else
+      _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f)); // maskstore_pd keys on the sign bit of each 64-bit mask lane
+#endif
+    }
+
+    static __forceinline void store(const vboold4& mask, void* ptr, const vllong4& f) { // masked store; the non-AVX512VL path is the same maskstore_pd call as storeu above
+#if defined(__AVX512VL__)
+      _mm256_mask_store_epi64(ptr,mask,f);
+#else
+      _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f));
+#endif
+    }
+
+    static __forceinline vllong4 broadcast64bit(size_t v) { // replicate v into all four lanes
+      return _mm256_set1_epi64x(v);
+    }
+
+    static __forceinline size_t extract64bit(const vllong4& v) // returns lane 0
+    {
+      return _mm_cvtsi128_si64(_mm256_castsi256_si128(v));
+    }
+
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline       long long& operator [](size_t index)       { assert(index < 4); return i[index]; } // lane access through the union's array view
+    __forceinline const long long& operator [](size_t index) const { assert(index < 4); return i[index]; }
+
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vllong4 select(const vboold4& m, const vllong4& t, const vllong4& f) { // per lane: t where m is set, otherwise f
+  #if defined(__AVX512VL__)
+    return _mm256_mask_blend_epi64(m, f, t);
+  #else
+    return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(f), _mm256_castsi256_pd(t), m)); // blendv_pd keys on the sign bit of each 64-bit mask lane
+  #endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboold4 asBool(const vllong4& a) { return _mm256_movepi64_mask(a); } // mask bit = most significant bit of each 64-bit lane
+#else
+  __forceinline vboold4 asBool(const vllong4& a) { return _mm256_castsi256_pd(a); } // pure bit-cast: the lane bits are handed to vboold4 unchanged
+#endif
+
+  __forceinline vllong4 operator +(const vllong4& a) { return a; }
+  __forceinline vllong4 operator -(const vllong4& a) { return _mm256_sub_epi64(_mm256_setzero_si256(), a); } // negation computed as 0 - a
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong4 operator +(const vllong4& a, const vllong4& b) { return _mm256_add_epi64(a, b); } // lane-wise 64-bit add
+  __forceinline vllong4 operator +(const vllong4& a, long long      b) { return a + vllong4(b); } // scalar overloads broadcast the scalar first
+  __forceinline vllong4 operator +(long long      a, const vllong4& b) { return vllong4(a) + b; }
+
+  __forceinline vllong4 operator -(const vllong4& a, const vllong4& b) { return _mm256_sub_epi64(a, b); }
+  __forceinline vllong4 operator -(const vllong4& a, long long      b) { return a - vllong4(b); }
+  __forceinline vllong4 operator -(long long      a, const vllong4& b) { return vllong4(a) - b; }
+
+  /* NOTE: _mm256_mul_epi32 multiplies only the signed low 32 bits of each 64-bit lane (giving a 64-bit product); the high 32 bits of the inputs are ignored */
+  __forceinline vllong4 operator *(const vllong4& a, const vllong4& b) { return _mm256_mul_epi32(a, b); }
+  __forceinline vllong4 operator *(const vllong4& a, long long      b) { return a * vllong4(b); }
+  __forceinline vllong4 operator *(long long      a, const vllong4& b) { return vllong4(a) * b; }
+
+  __forceinline vllong4 operator &(const vllong4& a, const vllong4& b) { return _mm256_and_si256(a, b); }
+  __forceinline vllong4 operator &(const vllong4& a, long long      b) { return a & vllong4(b); }
+  __forceinline vllong4 operator &(long long      a, const vllong4& b) { return vllong4(a) & b; }
+
+  __forceinline vllong4 operator |(const vllong4& a, const vllong4& b) { return _mm256_or_si256(a, b); }
+  __forceinline vllong4 operator |(const vllong4& a, long long      b) { return a | vllong4(b); }
+  __forceinline vllong4 operator |(long long      a, const vllong4& b) { return vllong4(a) | b; }
+
+  __forceinline vllong4 operator ^(const vllong4& a, const vllong4& b) { return _mm256_xor_si256(a, b); }
+  __forceinline vllong4 operator ^(const vllong4& a, long long      b) { return a ^ vllong4(b); }
+  __forceinline vllong4 operator ^(long long      a, const vllong4& b) { return vllong4(a) ^ b; }
+
+  __forceinline vllong4 operator <<(const vllong4& a, long long n) { return _mm256_slli_epi64(a, (int)n); } // same shift count for every lane; n is narrowed to int
+  //__forceinline vllong4 operator >>(const vllong4& a, long long n) { return _mm256_srai_epi64(a, n); }
+
+  __forceinline vllong4 operator <<(const vllong4& a, const vllong4& n) { return _mm256_sllv_epi64(a, n); } // per-lane shift counts taken from n
+  //__forceinline vllong4 operator >>(const vllong4& a, const vllong4& n) { return _mm256_srav_epi64(a, n); }
+  //__forceinline vllong4 sra(const vllong4& a, long long b) { return _mm256_srai_epi64(a, b); }
+
+  __forceinline vllong4 srl(const vllong4& a, long long b) { return _mm256_srli_epi64(a, (int)b); } // logical (zero-filling) right shift; b is narrowed to int
+  
+  //__forceinline vllong4 min(const vllong4& a, const vllong4& b) { return _mm256_min_epi64(a, b); }
+  //__forceinline vllong4 min(const vllong4& a, long long      b) { return min(a,vllong4(b)); }
+  //__forceinline vllong4 min(long long      a, const vllong4& b) { return min(vllong4(a),b); }
+
+  //__forceinline vllong4 max(const vllong4& a, const vllong4& b) { return _mm256_max_epi64(a, b); }
+  //__forceinline vllong4 max(const vllong4& a, long long      b) { return max(a,vllong4(b)); }
+  //__forceinline vllong4 max(long long      a, const vllong4& b) { return max(vllong4(a),b); }
+
+#if defined(__AVX512VL__) // both variants: lanes where m is set receive the combined a/b value, all other lanes keep c
+  __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_and_epi64(c,m,a,b); }
+  __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_or_epi64(c,m,a,b); }
+#else
+  __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a & b, c); } // fallback: compute a & b, then blend with c
+  __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a | b, c); } // fallback: compute a | b, then blend with c
+#endif
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong4& operator +=(vllong4& a, const vllong4& b) { return a = a + b; } // every compound assignment here forwards to the matching binary operator
+  __forceinline vllong4& operator +=(vllong4& a, long long      b) { return a = a + b; }
+  
+  __forceinline vllong4& operator -=(vllong4& a, const vllong4& b) { return a = a - b; }
+  __forceinline vllong4& operator -=(vllong4& a, long long      b) { return a = a - b; }
+
+  __forceinline vllong4& operator *=(vllong4& a, const vllong4& b) { return a = a * b; } // uses operator* above, so only the low 32 bits of each lane are multiplied
+  __forceinline vllong4& operator *=(vllong4& a, long long      b) { return a = a * b; }
+  
+  __forceinline vllong4& operator &=(vllong4& a, const vllong4& b) { return a = a & b; }
+  __forceinline vllong4& operator &=(vllong4& a, long long      b) { return a = a & b; }
+  
+  __forceinline vllong4& operator |=(vllong4& a, const vllong4& b) { return a = a | b; }
+  __forceinline vllong4& operator |=(vllong4& a, long long      b) { return a = a | b; }
+  
+  __forceinline vllong4& operator <<=(vllong4& a, long long      b) { return a = a << b; }
+  //__forceinline vllong4& operator >>=(vllong4& a, long long      b) { return a = a >> b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__) // AVX512VL: every comparison maps directly onto a signed 64-bit compare-into-mask
+  __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }
+#else // AVX2: only == and signed > exist as intrinsics; the rest are derived by swapping operands or negating the mask
+  __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmpeq_epi64(a,b); }
+  __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return !(a == b); }
+  __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(a,b); }
+  __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(b,a); } // a < b expressed as b > a
+  __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return !(a < b); }
+  __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return !(a > b); }
+#endif
+
+  __forceinline vboold4 operator ==(const vllong4& a, long long      b) { return a == vllong4(b); } // scalar overloads broadcast the scalar and reuse the vector comparison
+  __forceinline vboold4 operator ==(long long      a, const vllong4& b) { return vllong4(a) == b; }
+
+  __forceinline vboold4 operator !=(const vllong4& a, long long      b) { return a != vllong4(b); }
+  __forceinline vboold4 operator !=(long long      a, const vllong4& b) { return vllong4(a) != b; }
+
+  __forceinline vboold4 operator > (const vllong4& a, long long      b) { return a >  vllong4(b); }
+  __forceinline vboold4 operator > (long long      a, const vllong4& b) { return vllong4(a) >  b; }
+
+  __forceinline vboold4 operator < (const vllong4& a, long long      b) { return a <  vllong4(b); }
+  __forceinline vboold4 operator < (long long      a, const vllong4& b) { return vllong4(a) <  b; }
+
+  __forceinline vboold4 operator >=(const vllong4& a, long long      b) { return a >= vllong4(b); }
+  __forceinline vboold4 operator >=(long long      a, const vllong4& b) { return vllong4(a) >= b; }
+
+  __forceinline vboold4 operator <=(const vllong4& a, long long      b) { return a <= vllong4(b); }
+  __forceinline vboold4 operator <=(long long      a, const vllong4& b) { return vllong4(a) <= b; }
+
+  __forceinline vboold4 eq(const vllong4& a, const vllong4& b) { return a == b; } // named aliases of the operators above
+  __forceinline vboold4 ne(const vllong4& a, const vllong4& b) { return a != b; }
+  __forceinline vboold4 lt(const vllong4& a, const vllong4& b) { return a <  b; }
+  __forceinline vboold4 ge(const vllong4& a, const vllong4& b) { return a >= b; }
+  __forceinline vboold4 gt(const vllong4& a, const vllong4& b) { return a >  b; }
+  __forceinline vboold4 le(const vllong4& a, const vllong4& b) { return a <= b; }
+
+#if defined(__AVX512VL__) // masked comparisons: a result lane can only be set where mask is set
+  __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_NE); }
+  __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LT); }
+  __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GE); }
+  __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GT); }
+  __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LE); }
+#else // fallback: compare all lanes, then AND the result with mask
+  __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a == b); }
+  __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a != b); }
+  __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a <  b); }
+  __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a >= b); }
+  __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a >  b); }
+  __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a <= b); }
+#endif
+
+  __forceinline void xchg(const vboold4& m, vllong4& a, vllong4& b) { // swaps a and b in the lanes where m is set; other lanes are left unchanged
+    const vllong4 c = a; a = select(m,b,a); b = select(m,c,b); // c keeps the original a so b can still receive it after a is overwritten
+  }
+
+  __forceinline vboold4 test(const vllong4& a, const vllong4& b) { // per-lane test: a lane is true where (a & b) is non-zero in that 64-bit lane
+#if defined(__AVX512VL__)
+    return _mm256_test_epi64_mask(a,b);
+#else
+    return (a & b) != vllong4(_mm256_setzero_si256()); // lane-wise like the AVX512VL path; _mm256_testz_si256 would give one inverted whole-vector flag instead of a lane mask
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int i0, int i1> // in-half shuffle: result lanes are { v[i0], v[i1], v[2+i0], v[2+i1] }
+  __forceinline vllong4 shuffle(const vllong4& v) {
+    return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(v), (i1 << 3) | (i0 << 2) | (i1 << 1) | i0)); // permute_pd has one selector bit per lane and never crosses the 128-bit halves
+  }
+
+  template<int i> // single-index form: repeats the same in-half index for both lanes of each half
+  __forceinline vllong4 shuffle(const vllong4& v) {
+    return shuffle<i, i>(v);
+  }
+
+  template<int i0, int i1> // 128-bit half shuffle: low half of the result is half i0 of v, high half is half i1 of v
+  __forceinline vllong4 shuffle2(const vllong4& v) {
+    return _mm256_castpd_si256(_mm256_permute2f128_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(v), (i1 << 4) | i0)); // both source operands are v
+  }
+
+  __forceinline long long toScalar(const vllong4& v) { // returns lane 0 as a scalar
+    return _mm_cvtsi128_si64(_mm256_castsi256_si128(v)); // take the low 128 bits, then their low 64 bits
+  }
+
+#if defined(__AVX512VL__)
+  __forceinline vllong4 permute(const vllong4& a, const __m256i& index) { // result lane k = a[index[k]]
+    // workaround for GCC 7.x
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
+    return _mm256_permutex2var_epi64(a,index,a); // two-table permute with both tables set to a, so the result equals the one-table form below
+#else
+    return _mm256_permutexvar_epi64(index,a);
+#endif
+  }
+
+  __forceinline vllong4 permutex2var(const vllong4& index, const vllong4& a, const vllong4& b) { // each result lane is picked from a or b according to index
+    return _mm256_permutex2var_epi64(a,index,b); // note the intrinsic's operand order: (a, index, b)
+  }
+
+#endif
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+  
+
+  __forceinline vllong4 vreduce_and2(const vllong4& x) { return x & shuffle<1,0>(x); } // combines the two lanes inside each 128-bit half
+  __forceinline vllong4 vreduce_and (const vllong4& y) { const vllong4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); } // then combines the two halves, so every lane holds the AND of all four
+
+  __forceinline vllong4 vreduce_or2(const vllong4& x) { return x | shuffle<1,0>(x); }
+  __forceinline vllong4 vreduce_or (const vllong4& y) { const vllong4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); } // every lane holds the OR of all four
+
+  __forceinline vllong4 vreduce_add2(const vllong4& x) { return x + shuffle<1,0>(x); }
+  __forceinline vllong4 vreduce_add (const vllong4& y) { const vllong4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); } // every lane holds the sum of all four
+
+  __forceinline long long reduce_add(const vllong4& a) { return toScalar(vreduce_add(a)); } // scalar forms read lane 0 of the vector reduction
+  __forceinline long long reduce_or (const vllong4& a) { return toScalar(vreduce_or(a)); }
+  __forceinline long long reduce_and(const vllong4& a) { return toScalar(vreduce_and(a)); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vllong4& v) // writes the four lanes as "<v0, v1, v2, v3>" and returns the stream
+  {
+    cout << "<" << v[0];
+    for (size_t i=1; i<4; i++) cout << ", " << v[i]; // remaining lanes, each preceded by ", "
+    cout << ">";
+    return cout;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vllong8_avx512.h b/thirdparty/embree-aarch64/common/simd/vllong8_avx512.h
new file mode 100644
index 0000000000..76dddd8991
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vllong8_avx512.h
@@ -0,0 +1,381 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{ 
+  /* 8-wide AVX-512 64-bit long long type: eight 64-bit lanes stored in one __m512i */
+  template<>
+  struct vllong<8>
+  {
+    ALIGNED_STRUCT_(64);
+        
+    typedef vboold8 Bool;
+
+    enum  { size = 8 }; // number of SIMD elements
+    union {             // data
+      __m512i v; // whole-register view used by the intrinsics
+      long long i[8]; // per-lane view used by operator[]
+    };
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+       
+    __forceinline vllong() {} // leaves v uninitialized
+    __forceinline vllong(const vllong8& t) { v = t.v; }
+    __forceinline vllong8& operator =(const vllong8& f) { v = f.v; return *this; }
+
+    __forceinline vllong(const __m512i& t) { v = t; }
+    __forceinline operator __m512i() const { return v; }
+    __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } // low 256 bits only (lanes 0..3)
+
+    __forceinline vllong(long long i) { // broadcast i to all eight lanes
+      v = _mm512_set1_epi64(i);
+    }
+    
+    __forceinline vllong(long long a, long long b, long long c, long long d) { // lanes = a, b, c, d, a, b, c, d
+      v = _mm512_set4_epi64(d,c,b,a); // set4 repeats the 4-element sequence; its last argument lands in lane 0
+    }
+
+    __forceinline vllong(long long a0, long long a1, long long a2, long long a3,
+                         long long a4, long long a5, long long a6, long long a7)
+    {
+      v = _mm512_set_epi64(a7,a6,a5,a4,a3,a2,a1,a0); // set_epi64 takes the highest lane first, so lane k = ak
+    }
+   
+    __forceinline vllong(const vllong<4>& i) { // copies the four lanes of i into both 256-bit halves
+      v = _mm512_broadcast_i64x4(i);
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vllong(ZeroTy) : v(_mm512_setzero_epi32()) {}
+    __forceinline vllong(OneTy)  : v(_mm512_set1_epi64(1)) {}
+    __forceinline vllong(StepTy) : v(_mm512_set_epi64(7,6,5,4,3,2,1,0)) {} // lanes 0..7 = 0..7
+    __forceinline vllong(ReverseStepTy) : v(_mm512_setr_epi64(7,6,5,4,3,2,1,0)) {} // setr takes lane 0 first, so lanes 0..7 = 7..0
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline void store_nt(void* __restrict__ ptr, const vllong8& a) { // non-temporal (streaming) store of all 64 bytes
+      _mm512_stream_si512((__m512i*)ptr,a);
+    }
+
+    static __forceinline vllong8 loadu(const void* addr) { // unaligned 64-byte load
+      return _mm512_loadu_si512(addr);
+    }
+
+    static __forceinline vllong8 load(const vllong8* addr) { // aligned 64-byte load
+      return _mm512_load_si512(addr);
+    }
+
+    static __forceinline vllong8 load(const long long* addr) { // aligned 64-byte load; _mm512_load_si512 requires a 64-byte aligned address
+      return _mm512_load_si512(addr);
+    }
+
+    static __forceinline vllong8 load(const uint8_t* ptr) { // zero-extends the 8 bytes at ptr into the eight 64-bit lanes
+      return _mm512_cvtepu8_epi64(_mm_loadl_epi64((const __m128i*)ptr)); // read exactly the 8 bytes consumed, with no 16-byte alignment or over-read requirement
+    }
+
+    static __forceinline void store(void* ptr, const vllong8& v) { // aligned 64-byte store
+      _mm512_store_si512(ptr,v);
+    }
+
+    static __forceinline void storeu(void* ptr, const vllong8& v) { // unaligned 64-byte store
+      _mm512_storeu_si512(ptr,v);
+    }
+
+    static __forceinline void storeu(const vboold8& mask, long long* ptr, const vllong8& f) { // masked unaligned store: only lanes selected by mask are written
+      _mm512_mask_storeu_epi64(ptr,mask,f);
+    }
+
+    static __forceinline void store(const vboold8& mask, void* addr, const vllong8& v2) { // masked aligned store: only lanes selected by mask are written
+      _mm512_mask_store_epi64(addr,mask,v2);
+    }
+
+    /* pass by value to avoid compiler generating inefficient code */
+    static __forceinline void storeu_compact(const vboold8 mask, void* addr, const vllong8& reg) { // writes the lanes selected by mask contiguously to addr
+      _mm512_mask_compressstoreu_epi64(addr,mask,reg);
+    }
+
+    static __forceinline vllong8 compact64bit(const vboold8& mask, vllong8& v) { // packs the selected lanes of v to the low end; remaining lanes keep v's values
+      return _mm512_mask_compress_epi64(v,mask,v);
+    }
+
+    static __forceinline vllong8 compact64bit(const vboold8& mask, vllong8& dest, const vllong8& source) { // packs the selected lanes of source; remaining lanes come from dest
+      return _mm512_mask_compress_epi64(dest,mask,source);
+    }
+
+    static __forceinline vllong8 compact(const vboold8& mask, vllong8& v) { // identical to the two-argument compact64bit
+      return _mm512_mask_compress_epi64(v,mask,v);
+    }
+
+    static __forceinline vllong8 compact(const vboold8& mask, const vllong8& a, vllong8& b) { // packs the selected lanes of b; remaining lanes come from a
+      return _mm512_mask_compress_epi64(a,mask,b);
+    }
+
+    static __forceinline vllong8 expand(const vboold8& mask, const vllong8& a, vllong8& b) { // spreads consecutive low lanes of a into the selected lanes; other lanes come from b
+      return _mm512_mask_expand_epi64(b,mask,a);
+    }
+
+    static __forceinline vllong8 broadcast64bit(size_t v) { // replicate v into all eight lanes
+      return _mm512_set1_epi64(v);
+    }
+
+    static __forceinline size_t extract64bit(const vllong8& v) // returns lane 0
+    {
+      return _mm_cvtsi128_si64(_mm512_castsi512_si128(v));
+    }
+
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline       long long& operator [](size_t index)       { assert(index < 8); return i[index]; } // lane access through the union's array view
+    __forceinline const long long& operator [](size_t index) const { assert(index < 8); return i[index]; }
+
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold8 asBool(const vllong8& a) { return _mm512_movepi64_mask(a); } // mask bit = most significant bit of each 64-bit lane
+
+  __forceinline vllong8 operator +(const vllong8& a) { return a; }
+  __forceinline vllong8 operator -(const vllong8& a) { return _mm512_sub_epi64(_mm512_setzero_epi32(), a); } // negation computed as 0 - a
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong8 operator +(const vllong8& a, const vllong8& b) { return _mm512_add_epi64(a, b); } // lane-wise 64-bit add
+  __forceinline vllong8 operator +(const vllong8& a, long long      b) { return a + vllong8(b); } // scalar overloads broadcast the scalar first
+  __forceinline vllong8 operator +(long long      a, const vllong8& b) { return vllong8(a) + b; }
+
+  __forceinline vllong8 operator -(const vllong8& a, const vllong8& b) { return _mm512_sub_epi64(a, b); }
+  __forceinline vllong8 operator -(const vllong8& a, long long      b) { return a - vllong8(b); }
+  __forceinline vllong8 operator -(long long      a, const vllong8& b) { return vllong8(a) - b; }
+
+  __forceinline vllong8 operator *(const vllong8& a, const vllong8& b) { return _mm512_mullo_epi64(a, b); } // keeps the low 64 bits of each 64x64-bit product
+  __forceinline vllong8 operator *(const vllong8& a, long long      b) { return a * vllong8(b); }
+  __forceinline vllong8 operator *(long long      a, const vllong8& b) { return vllong8(a) * b; }
+
+  __forceinline vllong8 operator &(const vllong8& a, const vllong8& b) { return _mm512_and_epi64(a, b); }
+  __forceinline vllong8 operator &(const vllong8& a, long long      b) { return a & vllong8(b); }
+  __forceinline vllong8 operator &(long long      a, const vllong8& b) { return vllong8(a) & b; }
+
+  __forceinline vllong8 operator |(const vllong8& a, const vllong8& b) { return _mm512_or_epi64(a, b); }
+  __forceinline vllong8 operator |(const vllong8& a, long long      b) { return a | vllong8(b); }
+  __forceinline vllong8 operator |(long long      a, const vllong8& b) { return vllong8(a) | b; }
+
+  __forceinline vllong8 operator ^(const vllong8& a, const vllong8& b) { return _mm512_xor_epi64(a, b); }
+  __forceinline vllong8 operator ^(const vllong8& a, long long      b) { return a ^ vllong8(b); }
+  __forceinline vllong8 operator ^(long long      a, const vllong8& b) { return vllong8(a) ^ b; }
+
+  __forceinline vllong8 operator <<(const vllong8& a, long long n) { return _mm512_slli_epi64(a, n); } // same shift count for every lane
+  __forceinline vllong8 operator >>(const vllong8& a, long long n) { return _mm512_srai_epi64(a, n); } // arithmetic (sign-filling) right shift
+
+  __forceinline vllong8 operator <<(const vllong8& a, const vllong8& n) { return _mm512_sllv_epi64(a, n); } // per-lane shift counts taken from n
+  __forceinline vllong8 operator >>(const vllong8& a, const vllong8& n) { return _mm512_srav_epi64(a, n); } // per-lane arithmetic right shift
+
+  __forceinline vllong8 sll (const vllong8& a, long long b) { return _mm512_slli_epi64(a, b); } // shift left, zero-filling
+  __forceinline vllong8 sra (const vllong8& a, long long b) { return _mm512_srai_epi64(a, b); } // shift right, sign-filling
+  __forceinline vllong8 srl (const vllong8& a, long long b) { return _mm512_srli_epi64(a, b); } // shift right, zero-filling
+
+  __forceinline vllong8 min(const vllong8& a, const vllong8& b) { return _mm512_min_epi64(a, b); } // lane-wise signed minimum
+  __forceinline vllong8 min(const vllong8& a, long long      b) { return min(a,vllong8(b)); }
+  __forceinline vllong8 min(long long      a, const vllong8& b) { return min(vllong8(a),b); }
+
+  __forceinline vllong8 max(const vllong8& a, const vllong8& b) { return _mm512_max_epi64(a, b); } // lane-wise signed maximum
+  __forceinline vllong8 max(const vllong8& a, long long      b) { return max(a,vllong8(b)); }
+  __forceinline vllong8 max(long long      a, const vllong8& b) { return max(vllong8(a),b); }
+  
+  __forceinline vllong8 mask_add(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_add_epi64(c,m,a,b); } // masked forms: lanes where m is set get the a/b result, others keep c
+  __forceinline vllong8 mask_sub(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_sub_epi64(c,m,a,b); }
+
+  __forceinline vllong8 mask_and(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_and_epi64(c,m,a,b); }
+  __forceinline vllong8 mask_or (const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_or_epi64(c,m,a,b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong8& operator +=(vllong8& a, const vllong8&  b) { return a = a + b; } // every compound assignment here forwards to the matching binary operator
+  __forceinline vllong8& operator +=(vllong8& a, long long       b) { return a = a + b; }
+  
+  __forceinline vllong8& operator -=(vllong8& a, const vllong8&  b) { return a = a - b; }
+  __forceinline vllong8& operator -=(vllong8& a, long long       b) { return a = a - b; }
+
+  __forceinline vllong8& operator *=(vllong8& a, const vllong8&  b) { return a = a * b; }
+  __forceinline vllong8& operator *=(vllong8& a, long long       b) { return a = a * b; }
+  
+  __forceinline vllong8& operator &=(vllong8& a, const vllong8&  b) { return a = a & b; }
+  __forceinline vllong8& operator &=(vllong8& a, long long       b) { return a = a & b; }
+  
+  __forceinline vllong8& operator |=(vllong8& a, const vllong8&  b) { return a = a | b; }
+  __forceinline vllong8& operator |=(vllong8& a, long long       b) { return a = a | b; }
+  
+  __forceinline vllong8& operator <<=(vllong8& a, long long b) { return a = a << b; }
+  __forceinline vllong8& operator >>=(vllong8& a, long long b) { return a = a >> b; } // arithmetic shift, via operator >> above
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold8 operator ==(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } // all vector comparisons are signed 64-bit compares into a lane mask
+  __forceinline vboold8 operator ==(const vllong8& a, long long      b) { return a == vllong8(b); } // scalar overloads broadcast the scalar first
+  __forceinline vboold8 operator ==(long long      a, const vllong8& b) { return vllong8(a) == b; }
+  
+  __forceinline vboold8 operator !=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboold8 operator !=(const vllong8& a, long long      b) { return a != vllong8(b); }
+  __forceinline vboold8 operator !=(long long      a, const vllong8& b) { return vllong8(a) != b; }
+  
+  __forceinline vboold8 operator < (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboold8 operator < (const vllong8& a, long long      b) { return a <  vllong8(b); }
+  __forceinline vboold8 operator < (long long      a, const vllong8& b) { return vllong8(a) <  b; }
+  
+  __forceinline vboold8 operator >=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboold8 operator >=(const vllong8& a, long long      b) { return a >= vllong8(b); }
+  __forceinline vboold8 operator >=(long long      a, const vllong8& b) { return vllong8(a) >= b; }
+
+  __forceinline vboold8 operator > (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboold8 operator > (const vllong8& a, long long      b) { return a >  vllong8(b); }
+  __forceinline vboold8 operator > (long long      a, const vllong8& b) { return vllong8(a) >  b; }
+
+  __forceinline vboold8 operator <=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboold8 operator <=(const vllong8& a, long long      b) { return a <= vllong8(b); }
+  __forceinline vboold8 operator <=(long long      a, const vllong8& b) { return vllong8(a) <= b; }
+
+  __forceinline vboold8 eq(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } // named forms issue the same intrinsic as the operators above
+  __forceinline vboold8 ne(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboold8 lt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboold8 ge(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboold8 gt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboold8 le(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }
+    
+  __forceinline vboold8 eq(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_EQ); } // masked forms: a result lane can only be set where mask is set
+  __forceinline vboold8 ne(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_NE); }
+  __forceinline vboold8 lt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LT); }
+  __forceinline vboold8 ge(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GE); }
+  __forceinline vboold8 gt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GT); }
+  __forceinline vboold8 le(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LE); }
+
+  __forceinline vllong8 select(const vboold8& m, const vllong8& t, const vllong8& f) {
+    return _mm512_mask_or_epi64(f,m,t,t); // lanes with m set receive t|t (== t); all other lanes pass f through
+  }
+
+  __forceinline void xchg(const vboold8& m, vllong8& a, vllong8& b) { // swaps a and b in the lanes where m is set
+    const vllong8 a0 = a, b0 = b; a = select(m,b0,a0); b = select(m,a0,b0);
+  }
+
+  __forceinline vboold8 test(const vboold8& m, const vllong8& a, const vllong8& b) {
+    return _mm512_mask_test_epi64_mask(m,a,b);
+  }
+
+  __forceinline vboold8 test(const vllong8& a, const vllong8& b) {
+    return _mm512_test_epi64_mask(a,b);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int i0, int i1>
+  __forceinline vllong8 shuffle(const vllong8& v) {
+    return _mm512_castpd_si512(_mm512_permute_pd(_mm512_castsi512_pd(v), (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0));
+  }
+
+  template<int i>
+  __forceinline vllong8 shuffle(const vllong8& v) {
+    return shuffle<i, i>(v);
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vllong8 shuffle(const vllong8& v) {
+    return _mm512_permutex_epi64(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<int i0, int i1>
+  __forceinline vllong8 shuffle4(const vllong8& v) {
+    return _mm512_shuffle_i64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2));
+  }
+
+  template<int i>
+  __forceinline vllong8 shuffle4(const vllong8& v) {
+    return shuffle4<i, i>(v);
+  }
+
+  template<int i>
+  __forceinline vllong8 align_shift_right(const vllong8& a, const vllong8& b) {
+    return _mm512_alignr_epi64(a, b, i); // lanes i..i+7 of the 16-lane concatenation a:b (b occupies the low half)
+  }
+
+  __forceinline long long toScalar(const vllong8& v) {
+    return _mm_cvtsi128_si64(_mm512_castsi512_si128(v));
+  }
+
+  __forceinline vllong8 zeroExtend32Bit(const __m512i& a) {
+    return _mm512_cvtepu32_epi64(_mm512_castsi512_si256(a));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong8 vreduce_min2(vllong8 x) {                      return min(x, shuffle<1,0,3,2>(x)); }
+  __forceinline vllong8 vreduce_min4(vllong8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vllong8 vreduce_min (vllong8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); }
+
+  __forceinline vllong8 vreduce_max2(vllong8 x) {                      return max(x, shuffle<1,0,3,2>(x)); }
+  __forceinline vllong8 vreduce_max4(vllong8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vllong8 vreduce_max (vllong8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); }
+
+  __forceinline vllong8 vreduce_and2(vllong8 x) {                      return x & shuffle<1,0,3,2>(x); }
+  __forceinline vllong8 vreduce_and4(vllong8 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); }
+  __forceinline vllong8 vreduce_and (vllong8 x) { x = vreduce_and4(x); return x & shuffle4<1,0>(x); }
+
+  __forceinline vllong8 vreduce_or2(vllong8 x) {                     return x | shuffle<1,0,3,2>(x); }
+  __forceinline vllong8 vreduce_or4(vllong8 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); }
+  __forceinline vllong8 vreduce_or (vllong8 x) { x = vreduce_or4(x); return x | shuffle4<1,0>(x); }
+
+  __forceinline vllong8 vreduce_add2(vllong8 x) {                      return x + shuffle<1,0,3,2>(x); }
+  __forceinline vllong8 vreduce_add4(vllong8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }
+  __forceinline vllong8 vreduce_add (vllong8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); }
+
+  __forceinline long long reduce_min(const vllong8& v) { return toScalar(vreduce_min(v)); }
+  __forceinline long long reduce_max(const vllong8& v) { return toScalar(vreduce_max(v)); }
+  __forceinline long long reduce_and(const vllong8& v) { return toScalar(vreduce_and(v)); }
+  __forceinline long long reduce_or (const vllong8& v) { return toScalar(vreduce_or (v)); }
+  __forceinline long long reduce_add(const vllong8& v) { return toScalar(vreduce_add(v)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Permutation Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong8 permute(const vllong8& v, const vllong8& index) {
+    return _mm512_permutexvar_epi64(index,v);  
+  }
+
+  __forceinline vllong8 reverse(const vllong8& a) {
+    return permute(a,vllong8(reverse_step));
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vllong8& v)
+  {
+    /* prints the eight lanes as "<v0, v1, ..., v7>" */
+    for (size_t lane=0; lane<8; lane++) cout << (lane == 0 ? "<" : ", ") << v[lane];
+    cout << ">";
+    return cout;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vuint16_avx512.h b/thirdparty/embree-aarch64/common/simd/vuint16_avx512.h
new file mode 100644
index 0000000000..39752611bb
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vuint16_avx512.h
@@ -0,0 +1,443 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{ 
+  /* 16-wide AVX-512 unsigned integer type */
+  template<>
+  struct vuint<16>
+  {
+    ALIGNED_STRUCT_(64);   
+
+    typedef vboolf16 Bool;
+    typedef vuint16  UInt;
+    typedef vfloat16 Float;
+
+    enum  { size = 16 }; // number of SIMD elements
+    union {              // data
+      __m512i v; 
+      unsigned int i[16]; 
+    };
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+       
+    __forceinline vuint() {}
+    __forceinline vuint(const vuint16& t) { v = t.v; }
+    __forceinline vuint16& operator =(const vuint16& f) { v = f.v; return *this; }
+
+    __forceinline vuint(const __m512i& t) { v = t; }
+    __forceinline operator __m512i() const { return v; }
+    __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); }
+
+    __forceinline vuint(unsigned int i) {
+      v = _mm512_set1_epi32(i);
+    }
+
+    __forceinline vuint(const vuint4& i) {
+      v = _mm512_broadcast_i32x4(i);
+    }
+
+    __forceinline vuint(const vuint8& i) {
+      v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i))));
+    }
+    
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) {
+      v = _mm512_set4_epi32(d,c,b,a);      
+    }
+
+    __forceinline vuint(unsigned int a0 , unsigned int a1 , unsigned int a2 , unsigned int a3,
+                        unsigned int a4 , unsigned int a5 , unsigned int a6 , unsigned int a7,
+                        unsigned int a8 , unsigned int a9 , unsigned int a10, unsigned int a11,
+                        unsigned int a12, unsigned int a13, unsigned int a14, unsigned int a15)
+    {
+      v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0);
+    }
+   
+    __forceinline explicit vuint(const __m512& f) {
+      v = _mm512_cvtps_epu32(f);
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vuint(ZeroTy) : v(_mm512_setzero_epi32()) {}
+    __forceinline vuint(OneTy)  : v(_mm512_set1_epi32(1)) {}
+    __forceinline vuint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {}
+    __forceinline vuint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline void store_nt(void* __restrict__ ptr, const vuint16& a) {
+      _mm512_stream_si512((__m512i*)ptr,a);
+    }
+
+    static __forceinline vuint16 loadu(const void* addr)
+    {
+      return _mm512_loadu_si512(addr);
+    }
+
+    static __forceinline vuint16 loadu(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); }
+    static __forceinline vuint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); }
+
+    static __forceinline vuint16 load(const vuint16* addr) {
+      return _mm512_load_si512(addr);
+    }
+
+    static __forceinline vuint16 load(const unsigned int* addr) {
+      return _mm512_load_si512(addr);
+    }
+
+    static __forceinline vuint16 load(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_load_si256((const __m256i*)ptr)); } // zero-extends 16 x u16 to 32-bit lanes; aligned load, so ptr is assumed 32-byte aligned
+
+
+    static __forceinline void store(void* ptr, const vuint16& v) {
+      _mm512_store_si512(ptr,v);
+    }
+
+    static __forceinline void storeu(void* ptr, const vuint16& v) {
+      _mm512_storeu_si512(ptr,v);
+    }
+
+    static __forceinline void storeu(const vboolf16& mask, void* ptr, const vuint16& f) {
+      _mm512_mask_storeu_epi32(ptr,mask,f);
+    }
+
+    static __forceinline void store(const vboolf16& mask, void* addr, const vuint16& v2) {
+      _mm512_mask_store_epi32(addr,mask,v2);
+    }
+
+    /* pass by value to avoid compiler generating inefficient code */
+    static __forceinline void storeu_compact(const vboolf16 mask, void* addr, const vuint16 reg) {
+      _mm512_mask_compressstoreu_epi32(addr,mask,reg);
+    }
+
+    static __forceinline void storeu_compact_single(const vboolf16 mask, void* addr, vuint16 reg) {
+      /* writes a single 32-bit value instead of a full _mm512_mask_compressstoreu_epi32(addr,mask,reg): the mask-selected lanes are packed to the front and the scalar taken by mm512_cvtss_f32 (helper defined elsewhere, presumably lane 0) is stored */
+      *(float*)addr = mm512_cvtss_f32(_mm512_mask_compress_ps(_mm512_castsi512_ps(reg),mask,_mm512_castsi512_ps(reg)));
+    }
+
+    static __forceinline vuint16 compact64bit(const vboolf16& mask, vuint16& v) {
+      return _mm512_mask_compress_epi64(v,mask,v);
+    }
+
+    static __forceinline vuint16 compact(const vboolf16& mask, vuint16& v) {
+      return _mm512_mask_compress_epi32(v,mask,v);
+    }
+
+    static __forceinline vuint16 compact(const vboolf16& mask, const vuint16& a, vuint16& b) {
+      return _mm512_mask_compress_epi32(a,mask,b);
+    }
+
+    static __forceinline vuint16 expand(const vboolf16& mask, const vuint16& a, vuint16& b) {
+      return _mm512_mask_expand_epi32(b,mask,a);
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint16 gather(const unsigned int* ptr, const vint16& index) {
+      return _mm512_i32gather_epi32(index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint16 gather(const vboolf16& mask, const unsigned int* ptr, const vint16& index) {
+      return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint16 gather(const vboolf16& mask, vuint16& dest, const unsigned int* ptr, const vint16& index) {
+      return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(unsigned int* ptr, const vint16& index, const vuint16& v) {
+      _mm512_i32scatter_epi32((int*)ptr,index,v,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf16& mask, unsigned int* ptr, const vint16& index, const vuint16& v) {
+      _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale);
+    }
+
+    static __forceinline vuint16 broadcast64bit(size_t v) {
+      return _mm512_set1_epi64(v);
+    }
+
+    static __forceinline size_t extract64bit(const vuint16& v)
+    {
+      return _mm_cvtsi128_si64(_mm512_castsi512_si128(v));
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline       unsigned int& operator [](size_t index)       { assert(index < 16); return i[index]; }
+    __forceinline const unsigned int& operator [](size_t index) const { assert(index < 16); return i[index]; }
+
+    __forceinline unsigned int uint    (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; }
+    __forceinline size_t&      uint64_t(size_t index) const { assert(index < 8);  return ((size_t*)i)[index]; } // NOTE(review): views the lanes as 8 size_t values (assumes 64-bit size_t); the cast drops const, so the returned reference is writable even on a const object
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf16 asBool(const vuint16& a) { return _mm512_movepi32_mask(a); }
+
+  __forceinline vuint16 operator +(const vuint16& a) { return a; }
+  __forceinline vuint16 operator -(const vuint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint16 operator +(const vuint16& a, const vuint16& b) { return _mm512_add_epi32(a, b); }
+  __forceinline vuint16 operator +(const vuint16& a, unsigned int   b) { return a + vuint16(b); }
+  __forceinline vuint16 operator +(unsigned int   a, const vuint16& b) { return vuint16(a) + b; }
+
+  __forceinline vuint16 operator -(const vuint16& a, const vuint16& b) { return _mm512_sub_epi32(a, b); }
+  __forceinline vuint16 operator -(const vuint16& a, unsigned int   b) { return a - vuint16(b); }
+  __forceinline vuint16 operator -(unsigned int   a, const vuint16& b) { return vuint16(a) - b; }
+
+  __forceinline vuint16 operator *(const vuint16& a, const vuint16& b) { return _mm512_mullo_epi32(a, b); } // lane-wise product mod 2^32 (low half is the same for signed and unsigned); _mm512_mul_epu32 would multiply only the even lanes into 64-bit results
+  __forceinline vuint16 operator *(const vuint16& a, unsigned int   b) { return a * vuint16(b); }
+  __forceinline vuint16 operator *(unsigned int   a, const vuint16& b) { return vuint16(a) * b; }
+
+  __forceinline vuint16 operator &(const vuint16& a, const vuint16& b) { return _mm512_and_epi32(a, b); }
+  __forceinline vuint16 operator &(const vuint16& a, unsigned int   b) { return a & vuint16(b); }
+  __forceinline vuint16 operator &(unsigned int   a, const vuint16& b) { return vuint16(a) & b; }
+
+  __forceinline vuint16 operator |(const vuint16& a, const vuint16& b) { return _mm512_or_epi32(a, b); }
+  __forceinline vuint16 operator |(const vuint16& a, unsigned int   b) { return a | vuint16(b); }
+  __forceinline vuint16 operator |(unsigned int   a, const vuint16& b) { return vuint16(a) | b; }
+
+  __forceinline vuint16 operator ^(const vuint16& a, const vuint16& b) { return _mm512_xor_epi32(a, b); }
+  __forceinline vuint16 operator ^(const vuint16& a, unsigned int   b) { return a ^ vuint16(b); }
+  __forceinline vuint16 operator ^(unsigned int   a, const vuint16& b) { return vuint16(a) ^ b; }
+
+  __forceinline vuint16 operator <<(const vuint16& a, unsigned int n) { return _mm512_slli_epi32(a, n); }
+  __forceinline vuint16 operator >>(const vuint16& a, unsigned int n) { return _mm512_srli_epi32(a, n); }
+
+  __forceinline vuint16 operator <<(const vuint16& a, const vuint16& n) { return _mm512_sllv_epi32(a, n); }
+  __forceinline vuint16 operator >>(const vuint16& a, const vuint16& n) { return _mm512_srlv_epi32(a, n); }
+
+  __forceinline vuint16 sll (const vuint16& a, unsigned int b) { return _mm512_slli_epi32(a, b); }
+  __forceinline vuint16 sra (const vuint16& a, unsigned int b) { return _mm512_srai_epi32(a, b); }
+  __forceinline vuint16 srl (const vuint16& a, unsigned int b) { return _mm512_srli_epi32(a, b); }
+  
+  __forceinline vuint16 min(const vuint16& a, const vuint16& b) { return _mm512_min_epu32(a, b); }
+  __forceinline vuint16 min(const vuint16& a, unsigned int   b) { return min(a,vuint16(b)); }
+  __forceinline vuint16 min(unsigned int   a, const vuint16& b) { return min(vuint16(a),b); }
+
+  __forceinline vuint16 max(const vuint16& a, const vuint16& b) { return _mm512_max_epu32(a, b); }
+  __forceinline vuint16 max(const vuint16& a, unsigned int   b) { return max(a,vuint16(b)); }
+  __forceinline vuint16 max(unsigned int   a, const vuint16& b) { return max(vuint16(a),b); }
+  
+  __forceinline vuint16 mask_add(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); }
+  __forceinline vuint16 mask_sub(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); }
+
+  __forceinline vuint16 mask_and(const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_and_epi32(c,m,a,b); }
+  __forceinline vuint16 mask_or (const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_or_epi32(c,m,a,b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint16& operator +=(vuint16& a, const vuint16& b) { return a = a + b; }
+  __forceinline vuint16& operator +=(vuint16& a, unsigned int   b) { return a = a + b; }
+  
+  __forceinline vuint16& operator -=(vuint16& a, const vuint16& b) { return a = a - b; }
+  __forceinline vuint16& operator -=(vuint16& a, unsigned int   b) { return a = a - b; }
+
+  __forceinline vuint16& operator *=(vuint16& a, const vuint16& b) { return a = a * b; }
+  __forceinline vuint16& operator *=(vuint16& a, unsigned int   b) { return a = a * b; }
+  
+  __forceinline vuint16& operator &=(vuint16& a, const vuint16& b) { return a = a & b; }
+  __forceinline vuint16& operator &=(vuint16& a, unsigned int   b) { return a = a & b; }
+  
+  __forceinline vuint16& operator |=(vuint16& a, const vuint16& b) { return a = a | b; }
+  __forceinline vuint16& operator |=(vuint16& a, unsigned int   b) { return a = a | b; }
+  
+  __forceinline vuint16& operator <<=(vuint16& a, unsigned int b) { return a = a << b; }
+  __forceinline vuint16& operator >>=(vuint16& a, unsigned int b) { return a = a >> b; }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf16 operator ==(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf16 operator ==(const vuint16& a, unsigned int   b) { return a == vuint16(b); }
+  __forceinline vboolf16 operator ==(unsigned int   a, const vuint16& b) { return vuint16(a) == b; }
+  
+  __forceinline vboolf16 operator !=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 operator !=(const vuint16& a, unsigned int   b) { return a != vuint16(b); }
+  __forceinline vboolf16 operator !=(unsigned int   a, const vuint16& b) { return vuint16(a) != b; }
+  
+  __forceinline vboolf16 operator < (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 operator < (const vuint16& a, unsigned int   b) { return a <  vuint16(b); }
+  __forceinline vboolf16 operator < (unsigned int   a, const vuint16& b) { return vuint16(a) <  b; }
+  
+  __forceinline vboolf16 operator >=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 operator >=(const vuint16& a, unsigned int   b) { return a >= vuint16(b); }
+  __forceinline vboolf16 operator >=(unsigned int   a, const vuint16& b) { return vuint16(a) >= b; }
+
+  __forceinline vboolf16 operator > (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 operator > (const vuint16& a, unsigned int   b) { return a >  vuint16(b); }
+  __forceinline vboolf16 operator > (unsigned int   a, const vuint16& b) { return vuint16(a) >  b; }
+
+  __forceinline vboolf16 operator <=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 operator <=(const vuint16& a, unsigned int   b) { return a <= vuint16(b); }
+  __forceinline vboolf16 operator <=(unsigned int   a, const vuint16& b) { return vuint16(a) <= b; }
+
+  __forceinline vboolf16 eq(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf16 ne(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 lt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 ge(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 gt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 le(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); }
+
+  __forceinline vboolf16 eq(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf16 ne(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 lt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 ge(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 gt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 le(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); }
+    
+ 
+  __forceinline vuint16 select(const vboolf16& m, const vuint16& t, const vuint16& f) {
+    return _mm512_mask_or_epi32(f,m,t,t); // lanes with m set receive t|t (== t); all other lanes pass f through
+  }
+
+  __forceinline void xchg(const vboolf16& m, vuint16& a, vuint16& b) { // swaps a and b in the lanes where m is set
+    const vuint16 a0 = a, b0 = b; a = select(m,b0,a0); b = select(m,a0,b0);
+  }
+
+  __forceinline vboolf16 test(const vboolf16& m, const vuint16& a, const vuint16& b) {
+    return _mm512_mask_test_epi32_mask(m,a,b);
+  }
+
+  __forceinline vboolf16 test(const vuint16& a, const vuint16& b) {
+    return _mm512_test_epi32_mask(a,b);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int i>
+  __forceinline vuint16 shuffle(const vuint16& v) {
+    return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint16 shuffle(const vuint16& v) {
+    return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i>
+  __forceinline vuint16 shuffle4(const vuint16& v) {
+    return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v) ,_MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint16 shuffle4(const vuint16& v) {
+    return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i>
+  __forceinline vuint16 align_shift_right(const vuint16& a, const vuint16& b) {
+    return _mm512_alignr_epi32(a, b, i); // lanes i..i+15 of the 32-lane concatenation a:b (b occupies the low half)
+  }
+
+  __forceinline unsigned int toScalar(const vuint16& v) {
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(v));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint16 vreduce_min2(vuint16 x) {                      return min(x, shuffle<1,0,3,2>(x)); }
+  __forceinline vuint16 vreduce_min4(vuint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vuint16 vreduce_min8(vuint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); }
+  __forceinline vuint16 vreduce_min (vuint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); }
+
+  __forceinline vuint16 vreduce_max2(vuint16 x) {                      return max(x, shuffle<1,0,3,2>(x)); }
+  __forceinline vuint16 vreduce_max4(vuint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vuint16 vreduce_max8(vuint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); }
+  __forceinline vuint16 vreduce_max (vuint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); }
+
+  __forceinline vuint16 vreduce_and2(vuint16 x) {                      return x & shuffle<1,0,3,2>(x); }
+  __forceinline vuint16 vreduce_and4(vuint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); }
+  __forceinline vuint16 vreduce_and8(vuint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); }
+  __forceinline vuint16 vreduce_and (vuint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); }
+
+  __forceinline vuint16 vreduce_or2(vuint16 x) {                     return x | shuffle<1,0,3,2>(x); }
+  __forceinline vuint16 vreduce_or4(vuint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); }
+  __forceinline vuint16 vreduce_or8(vuint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); }
+  __forceinline vuint16 vreduce_or (vuint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); }
+
+  __forceinline vuint16 vreduce_add2(vuint16 x) {                      return x + shuffle<1,0,3,2>(x); }
+  __forceinline vuint16 vreduce_add4(vuint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }
+  __forceinline vuint16 vreduce_add8(vuint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); }
+  __forceinline vuint16 vreduce_add (vuint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); }
+
+  __forceinline unsigned int reduce_min(const vuint16& v) { return toScalar(vreduce_min(v)); }
+  __forceinline unsigned int reduce_max(const vuint16& v) { return toScalar(vreduce_max(v)); }
+  __forceinline unsigned int reduce_and(const vuint16& v) { return toScalar(vreduce_and(v)); }
+  __forceinline unsigned int reduce_or (const vuint16& v) { return toScalar(vreduce_or (v)); }
+  __forceinline unsigned int reduce_add(const vuint16& v) { return toScalar(vreduce_add(v)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Permutation and Prefix-Sum Operations
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vuint16 permute(vuint16 v, vuint16 index) {
+    return _mm512_permutexvar_epi32(index,v);  
+  }
+
+  __forceinline vuint16 reverse(const vuint16& a) {
+    return permute(a,vuint16(reverse_step));
+  }
+
+  __forceinline vuint16 prefix_sum(const vuint16& a)
+  { // inclusive scan: lane j ends up holding a[0] + ... + a[j], built in four doubling steps
+    const vuint16 zeros(zero);
+    vuint16 sum = a;
+    sum += align_shift_right<15>(sum,zeros); // add lanes moved up by 1, zeros shifted in
+    sum += align_shift_right<14>(sum,zeros); // add lanes moved up by 2
+    sum += align_shift_right<12>(sum,zeros); // add lanes moved up by 4
+    sum += align_shift_right<8>(sum,zeros);  // add lanes moved up by 8
+    return sum;
+  }
+
+  __forceinline vuint16 reverse_prefix_sum(const vuint16& a)
+  { // suffix scan: lane j ends up holding a[j] + ... + a[15], built in four doubling steps
+    const vuint16 zeros(zero);
+    vuint16 sum = a;
+    sum += align_shift_right<1>(zeros,sum); // add lanes moved down by 1, zeros shifted in
+    sum += align_shift_right<2>(zeros,sum); // add lanes moved down by 2
+    sum += align_shift_right<4>(zeros,sum); // add lanes moved down by 4
+    sum += align_shift_right<8>(zeros,sum); // add lanes moved down by 8
+    return sum;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vuint16& v)
+  {
+    /* prints the sixteen lanes as "<v0, v1, ..., v15>" */
+    for (int lane=0; lane<16; lane++) cout << (lane == 0 ? "<" : ", ") << v[lane];
+    cout << ">";
+    return cout;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vuint4_sse2.h b/thirdparty/embree-aarch64/common/simd/vuint4_sse2.h
new file mode 100644
index 0000000000..a3f393ebf2
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vuint4_sse2.h
@@ -0,0 +1,499 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../math/math.h"
+
+namespace embree
+{
+  /* 4-wide SSE integer type: four unsigned 32-bit lanes held in a single __m128i */
+  template<>
+  struct vuint<4>
+  {
+    ALIGNED_STRUCT_(16);
+    
+    typedef vboolf4 Bool;
+    typedef vuint4   Int;
+    typedef vfloat4 Float;
+
+    enum  { size = 4 }; // number of SIMD elements
+    union { __m128i v; unsigned int i[4]; }; // data: register view (v) and per-lane view (i) of the same 16 bytes
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vuint() {} // leaves the lanes uninitialized
+    __forceinline vuint(const vuint4& a) { v = a.v; }
+    __forceinline vuint4& operator =(const vuint4& a) { v = a.v; return *this; }
+
+    __forceinline vuint(const __m128i a) : v(a) {} // implicit wrap of a raw register
+    __forceinline operator const __m128i&() const { return v; }
+    __forceinline operator       __m128i&()       { return v; }
+
+
+    __forceinline vuint(unsigned int a) : v(_mm_set1_epi32(a)) {} // broadcast a to all four lanes
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm_set_epi32(d, c, b, a)) {} // lane 0 = a ... lane 3 = d
+
+#if defined(__AVX512VL__)
+    __forceinline explicit vuint(__m128 a) : v(_mm_cvtps_epu32(a)) {} // numeric float -> unsigned conversion per lane
+#endif
+
+#if defined(__AVX512VL__)
+    __forceinline explicit vuint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} // expand each mask bit into a full lane
+#else
+    __forceinline explicit vuint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {} // bit-cast of the mask lanes
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint(ZeroTy)   : v(_mm_setzero_si128()) {}
+    __forceinline vuint(OneTy)    : v(_mm_set1_epi32(1)) {}
+    __forceinline vuint(PosInfTy) : v(_mm_set1_epi32(unsigned(pos_inf))) {}
+    __forceinline vuint(StepTy)   : v(_mm_set_epi32(3, 2, 1, 0)) {} // lanes hold 0,1,2,3
+    __forceinline vuint(TrueTy)   : v(_mm_set1_epi32(-1)) {} // all bits set, without reading the not-yet-initialized member
+    __forceinline vuint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vuint4 load (const void* a) { return _mm_load_si128((__m128i*)a); } // aligned 16-byte load
+    static __forceinline vuint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); } // unaligned 16-byte load
+
+    static __forceinline void store (void* ptr, const vuint4& v) { _mm_store_si128((__m128i*)ptr,v); } // aligned 16-byte store
+    static __forceinline void storeu(void* ptr, const vuint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } // unaligned 16-byte store
+    
+#if defined(__AVX512VL__) // masked variants: lanes whose mask is off load as 0 / are not stored
+    static __forceinline vuint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); }
+    static __forceinline vuint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_store_epi32 (ptr,mask,v); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); }
+#elif defined(__AVX__) // load and loadu share the same maskload instruction here
+    static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+    static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+#else // plain SSE: all 16 bytes are always read, masking happens afterwards
+    static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); }
+    static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { store (ptr,select(mask,i,load (ptr))); } // read-modify-write of the full 16 bytes
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } // read-modify-write of the full 16 bytes
+#endif
+
+#if defined(__aarch64__) // widening loads of four 8-bit values; no overload exists when neither branch is active
+    static __forceinline vuint4 load(const uint8_t* ptr) {
+        return _mm_load4epu8_epi32(((__m128i*)ptr));
+    }
+    static __forceinline vuint4 loadu(const uint8_t* ptr) {
+        return _mm_load4epu8_epi32(((__m128i*)ptr));
+    }
+#elif defined(__SSE4_1__)
+    static __forceinline vuint4 load(const uint8_t* ptr) {
+      return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); // NOTE(review): reads 8 bytes although only 4 are converted
+    }
+
+    static __forceinline vuint4 loadu(const uint8_t* ptr) {
+      return  _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); // NOTE(review): reads 8 bytes although only 4 are converted
+    }
+
+#endif
+
+    static __forceinline vuint4 load(const unsigned short* ptr) { // zero-extends four 16-bit values to 32-bit lanes
+#if defined(__aarch64__)
+      return _mm_load4epu16_epi32(((__m128i*)ptr));
+#elif defined (__SSE4_1__)
+      return _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i*)ptr)); // 8-byte load: exactly the four source values, no read past them
+#else
+      return vuint4(ptr[0],ptr[1],ptr[2],ptr[3]);
+#endif
+    } 
+
+    static __forceinline void store_uint8(uint8_t* ptr, const vuint4& v) { // writes 4 bytes, one per lane
+#if defined(__aarch64__) 
+        uint32x4_t x = uint32x4_t(v.v);
+        uint16x4_t y = vqmovn_u32(x); // saturating narrow 32 -> 16
+        uint8x8_t z = vqmovn_u16(vcombine_u16(y, y)); // saturating narrow 16 -> 8
+        vst1_lane_u32((uint32_t *)ptr, uint32x2_t(z), 0);
+#elif defined(__SSE4_1__)
+      __m128i x = v;
+      x = _mm_packus_epi32(x, x); // saturating pack 32 -> 16
+      x = _mm_packus_epi16(x, x); // saturating pack 16 -> 8
+      *(unsigned*)ptr = _mm_cvtsi128_si32(x);
+#else
+      for (size_t i=0;i<4;i++)
+        ptr[i] = (uint8_t)v[i]; // NOTE(review): truncates, whereas the SIMD branches above saturate
+#endif
+    }
+
+    static __forceinline void store_uint8(unsigned short* ptr, const vuint4& v) { // despite the name, writes four 16-bit values
+#if defined(__aarch64__)
+        uint32x4_t x = (uint32x4_t)v.v;
+        uint16x4_t y = vqmovn_u32(x); // saturating narrow 32 -> 16
+        vst1_u16(ptr, y);
+#else
+      for (size_t i=0;i<4;i++)
+        ptr[i] = (unsigned short)v[i]; // truncating cast
+#endif
+    }
+
+    static __forceinline vuint4 load_nt(void* ptr) { // streaming load where available, ordinary aligned load otherwise
+#if (defined(__aarch64__)) || defined(__SSE4_1__)
+      return _mm_stream_load_si128((__m128i*)ptr); 
+#else
+      return _mm_load_si128((__m128i*)ptr); 
+#endif
+    }
+    
+    static __forceinline void store_nt(void* ptr, const vuint4& v) { // streaming store on x86 SSE4.1 only; aarch64 takes the ordinary store
+#if !defined(__aarch64__) && defined(__SSE4_1__)
+      _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); 
+#else
+      _mm_store_si128((__m128i*)ptr,v);
+#endif
+    }
+
+    template<int scale = 4> // scale: byte multiplier applied to every index
+    static __forceinline vuint4 gather(const unsigned int* ptr, const vint4& index) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+      return _mm_i32gather_epi32((const int*)ptr, index, scale);
+#else
+      return vuint4( // scalar fallback: one load per lane at byte offset scale*index[k]
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[0]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[1]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[2]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[3]));
+#endif
+    }
+
+    template<int scale = 4> // scale: byte multiplier applied to every index
+    static __forceinline vuint4 gather(const vboolf4& mask, const unsigned int* ptr, const vint4& index) {
+      vuint4 r = zero; // lanes whose mask is off keep this 0
+#if defined(__AVX512VL__)
+      return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
+#elif defined(__AVX2__) && !defined(__aarch64__)
+      return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale);
+#else
+      if (likely(mask[0])) r[0] = *(unsigned int*)(((int8_t*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(unsigned int*)(((int8_t*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(unsigned int*)(((int8_t*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(unsigned int*)(((int8_t*)ptr)+scale*index[3]);
+      return r;
+#endif
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const unsigned int& operator [](size_t index) const { assert(index < 4); return i[index]; }
+    __forceinline       unsigned int& operator [](size_t index)       { assert(index < 4); return i[index]; }
+
+    friend __forceinline vuint4 select(const vboolf4& m, const vuint4& t, const vuint4& f) { // per lane: t where m is set, f elsewhere
+#if defined(__AVX512VL__)
+      return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t);
+#elif defined(__SSE4_1__)
+      return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); 
+#else
+      return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); 
+#endif
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboolf4 asBool(const vuint4& a) { return _mm_movepi32_mask(a); } // mask bit k is taken from the top bit of lane k
+#else
+  __forceinline vboolf4 asBool(const vuint4& a) { return _mm_castsi128_ps(a); } // bit-cast: the lanes are reused unchanged as the mask
+#endif
+
+  __forceinline vuint4 operator +(const vuint4& a) { return vuint4(a); } // unary plus: returns a copy
+  __forceinline vuint4 operator -(const vuint4& a) { const __m128i zeros = _mm_setzero_si128(); return _mm_sub_epi32(zeros, a); } // 0 - a per lane
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint4 operator +(const vuint4& a, const vuint4& b) { const __m128i sum = _mm_add_epi32(a, b); return sum; } // lane-wise add
+  __forceinline vuint4 operator +(const vuint4& a, unsigned int  b) { const vuint4 rhs(b); return a + rhs; } // scalar is broadcast first
+  __forceinline vuint4 operator +(unsigned int  a, const vuint4& b) { const vuint4 lhs(a); return lhs + b; }
+
+  __forceinline vuint4 operator -(const vuint4& a, const vuint4& b) { const __m128i diff = _mm_sub_epi32(a, b); return diff; } // lane-wise subtract
+  __forceinline vuint4 operator -(const vuint4& a, unsigned int  b) { const vuint4 rhs(b); return a - rhs; }
+  __forceinline vuint4 operator -(unsigned int  a, const vuint4& b) { const vuint4 lhs(a); return lhs - b; }
+
+//#if defined(__SSE4_1__)
+//  __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return _mm_mullo_epu32(a, b); }
+//#else
+//  __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return vuint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); }
+//#endif
+//  __forceinline vuint4 operator *(const vuint4& a, unsigned int  b) { return a * vuint4(b); }
+//  __forceinline vuint4 operator *(unsigned int  a, const vuint4& b) { return vuint4(a) * b; }
+
+  __forceinline vuint4 operator &(const vuint4& a, const vuint4& b) { const __m128i bits = _mm_and_si128(a, b); return bits; } // bitwise AND
+  __forceinline vuint4 operator &(const vuint4& a, unsigned int  b) { const vuint4 rhs(b); return a & rhs; }
+  __forceinline vuint4 operator &(unsigned int  a, const vuint4& b) { const vuint4 lhs(a); return lhs & b; }
+
+  __forceinline vuint4 operator |(const vuint4& a, const vuint4& b) { const __m128i bits = _mm_or_si128(a, b); return bits; } // bitwise OR
+  __forceinline vuint4 operator |(const vuint4& a, unsigned int  b) { const vuint4 rhs(b); return a | rhs; }
+  __forceinline vuint4 operator |(unsigned int  a, const vuint4& b) { const vuint4 lhs(a); return lhs | b; }
+
+  __forceinline vuint4 operator ^(const vuint4& a, const vuint4& b) { const __m128i bits = _mm_xor_si128(a, b); return bits; } // bitwise XOR
+  __forceinline vuint4 operator ^(const vuint4& a, unsigned int  b) { const vuint4 rhs(b); return a ^ rhs; }
+  __forceinline vuint4 operator ^(unsigned int  a, const vuint4& b) { const vuint4 lhs(a); return lhs ^ b; }
+
+  __forceinline vuint4 operator <<(const vuint4& a, unsigned int n) { const __m128i r = _mm_slli_epi32(a, n); return r; } // left shift of every lane by n
+  __forceinline vuint4 operator >>(const vuint4& a, unsigned int n) { const __m128i r = _mm_srli_epi32(a, n); return r; } // logical (zero-filling) right shift
+
+  __forceinline vuint4 sll (const vuint4& a, unsigned int b) { const __m128i r = _mm_slli_epi32(a, b); return r; } // shift left logical
+  __forceinline vuint4 sra (const vuint4& a, unsigned int b) { const __m128i r = _mm_srai_epi32(a, b); return r; } // shift right arithmetic: top bit is replicated
+  __forceinline vuint4 srl (const vuint4& a, unsigned int b) { const __m128i r = _mm_srli_epi32(a, b); return r; } // shift right logical
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint4& operator +=(vuint4& a, const vuint4& b) { a = a + b; return a; } // in-place lane-wise add
+  __forceinline vuint4& operator +=(vuint4& a, unsigned int  b) { a = a + b; return a; }
+  
+  __forceinline vuint4& operator -=(vuint4& a, const vuint4& b) { a = a - b; return a; } // in-place lane-wise subtract
+  __forceinline vuint4& operator -=(vuint4& a, unsigned int  b) { a = a - b; return a; }
+
+//#if defined(__SSE4_1__)
+//  __forceinline vuint4& operator *=(vuint4& a, const vuint4& b) { return a = a * b; }
+//  __forceinline vuint4& operator *=(vuint4& a, unsigned int  b) { return a = a * b; }
+//#endif
+  
+  __forceinline vuint4& operator &=(vuint4& a, const vuint4& b) { a = a & b; return a; } // in-place bitwise AND
+  __forceinline vuint4& operator &=(vuint4& a, unsigned int  b) { a = a & b; return a; }
+  
+  __forceinline vuint4& operator |=(vuint4& a, const vuint4& b) { a = a | b; return a; } // in-place bitwise OR
+  __forceinline vuint4& operator |=(vuint4& a, unsigned int  b) { a = a | b; return a; }
+  
+  __forceinline vuint4& operator <<=(vuint4& a, unsigned int  b) { a = a << b; return a; } // in-place shifts
+  __forceinline vuint4& operator >>=(vuint4& a, unsigned int  b) { a = a >> b; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__) // comparisons return an AVX-512 mask; the ordered ones below are commented out in both branches
+  __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } // per-lane equality
+  __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } // per-lane inequality
+  //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LT); }
+  //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GE); }
+  //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GT); }
+  //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LE); }
+#else // comparison result is a vector of all-ones / all-zero lanes
+  __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } // per-lane equality
+  __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return !(a == b); } // negation of the equality mask
+  //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmplt_epu32(a, b)); }
+  //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return !(a <  b); }
+  //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epu32(a, b)); }
+  //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return !(a >  b); }
+#endif
+
+  __forceinline vboolf4 operator ==(const vuint4& a, unsigned int  b) { const vuint4 rhs(b); return a == rhs; } // scalar is broadcast before comparing
+  __forceinline vboolf4 operator ==(unsigned int  a, const vuint4& b) { const vuint4 lhs(a); return lhs == b; }
+
+  __forceinline vboolf4 operator !=(const vuint4& a, unsigned int  b) { const vuint4 rhs(b); return a != rhs; }
+  __forceinline vboolf4 operator !=(unsigned int  a, const vuint4& b) { const vuint4 lhs(a); return lhs != b; }
+
+  //__forceinline vboolf4 operator < (const vuint4& a, unsigned int  b) { return a <  vuint4(b); }
+  //__forceinline vboolf4 operator < (unsigned int  a, const vuint4& b) { return vuint4(a) <  b; }
+
+  //__forceinline vboolf4 operator >=(const vuint4& a, unsigned int  b) { return a >= vuint4(b); }
+  //__forceinline vboolf4 operator >=(unsigned int  a, const vuint4& b) { return vuint4(a) >= b; }
+
+  //__forceinline vboolf4 operator > (const vuint4& a, unsigned int  b) { return a >  vuint4(b); }
+  //__forceinline vboolf4 operator > (unsigned int  a, const vuint4& b) { return vuint4(a) >  b; }
+
+  //__forceinline vboolf4 operator <=(const vuint4& a, unsigned int  b) { return a <= vuint4(b); }
+  //__forceinline vboolf4 operator <=(unsigned int  a, const vuint4& b) { return vuint4(a) <= b; }
+
+  __forceinline vboolf4 eq(const vuint4& a, const vuint4& b) { const vboolf4 same = (a == b); return same; } // named form of operator ==
+  __forceinline vboolf4 ne(const vuint4& a, const vuint4& b) { const vboolf4 differ = (a != b); return differ; } // named form of operator !=
+  //__forceinline vboolf4 lt(const vuint4& a, const vuint4& b) { return a <  b; }
+  //__forceinline vboolf4 ge(const vuint4& a, const vuint4& b) { return a >= b; }
+  //__forceinline vboolf4 gt(const vuint4& a, const vuint4& b) { return a >  b; }
+  //__forceinline vboolf4 le(const vuint4& a, const vuint4& b) { return a <= b; }
+
+#if defined(__AVX512VL__) // masked comparisons: the incoming mask is passed straight to the compare intrinsic
+  __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); }
+  //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); }
+  //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); }
+  //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); }
+  //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); }
+#else // masked comparisons: full comparison, then AND with the incoming mask
+  __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a == b); }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a != b); }
+  //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a <  b); }
+  //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a >= b); }
+  //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a >  b); }
+  //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a <= b); }
+#endif
+
+  template<int mask> // compile-time lane selector: bit k set -> lane k comes from t, otherwise from f
+  __forceinline vuint4 select(const vuint4& t, const vuint4& f) {
+#if defined(__SSE4_1__) 
+    return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); // immediate blend on the float view of the lanes
+#else
+    return select(vboolf4(mask), t, f); // builds a runtime mask from the integer and defers to the three-argument select
+#endif    
+  }
+
+/*#if defined(__SSE4_1__)
+  __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return _mm_min_epu32(a, b); }
+  __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return _mm_max_epu32(a, b); }
+
+#else
+  __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return select(a < b,a,b); }
+  __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return select(a < b,b,a); }
+#endif
+
+  __forceinline vuint4 min(const vuint4& a, unsigned int  b) { return min(a,vuint4(b)); }
+  __forceinline vuint4 min(unsigned int  a, const vuint4& b) { return min(vuint4(a),b); }
+  __forceinline vuint4 max(const vuint4& a, unsigned int  b) { return max(a,vuint4(b)); }
+  __forceinline vuint4 max(unsigned int  a, const vuint4& b) { return max(vuint4(a),b); }*/
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { const __m128 fa = _mm_castsi128_ps(a); const __m128 fb = _mm_castsi128_ps(b); return _mm_castps_si128(_mm_unpacklo_ps(fa, fb)); } // interleave via the float view: a0,b0,a1,b1
+  __forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { const __m128 fa = _mm_castsi128_ps(a); const __m128 fb = _mm_castsi128_ps(b); return _mm_castps_si128(_mm_unpackhi_ps(fa, fb)); } // interleave via the float view: a2,b2,a3,b3
+
+#if defined(__aarch64__) // NEON path: shuffles are byte-table lookups; _MN_SHUFFLE/_MF_SHUFFLE are defined elsewhere (presumably they build the byte index table)
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint4 shuffle(const vuint4& v) {
+    return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); // single-source table lookup
+  }
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) {
+    return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); // two-source table lookup over a and b
+  }
+#else // x86 path: template indices are passed to _MM_SHUFFLE in reversed order, so i0 selects result lane 0
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint4 shuffle(const vuint4& v) {
+    return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); // result lane k = v[ik]
+  }
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) {
+    return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); // lanes 0,1 are picked from a, lanes 2,3 from b
+  }
+#endif
+#if defined(__SSE3__) // dedicated duplicate instructions for three common index patterns
+  template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
+  template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
+  template<> __forceinline vuint4 shuffle<0, 1, 0, 1>(const vuint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); }
+#endif
+
+  template<int i> // single-index form: every result lane is v[i]
+  __forceinline vuint4 shuffle(const vuint4& v) {
+    return shuffle<i,i,i,i>(v);
+  }
+
+#if defined(__aarch64__) // declarations only: the per-index specializations for 0..3 follow below
+  template<int src> __forceinline unsigned int extract(const vuint4& b);
+  template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b);
+#elif defined(__SSE4_1__) // single-instruction lane read / write
+  template<int src> __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b, src); }
+  template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a, b, dst); }
+#else // array-view fallback: the index is wrapped into 0..3 with &3
+  template<int src> __forceinline unsigned int extract(const vuint4& b) { return b[src&3]; }
+  template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { vuint4 c = a; c[dst&3] = b; return c; }
+#endif
+
+#if defined(__aarch64__) // extract<k>: read lane k through the array view
+  template<> __forceinline unsigned int extract<0>(const vuint4& b) {
+    return b[0];
+  }
+  template<> __forceinline unsigned int extract<1>(const vuint4& b) {
+    return b[1];
+  }
+  template<> __forceinline unsigned int extract<2>(const vuint4& b) {
+    return b[2];
+  }
+  template<> __forceinline unsigned int extract<3>(const vuint4& b) {
+    return b[3];
+  }
+  // insert<k>: copy a, overwrite lane k with b, return the copy (a itself is not modified)
+  template<> __forceinline vuint4 insert<0>(const vuint4& a, unsigned b){
+    vuint4 c = a;
+    c[0] = b;
+    return c;
+  }
+  template<> __forceinline vuint4 insert<1>(const vuint4& a, unsigned b){
+    vuint4 c = a;
+    c[1] = b;
+    return c;
+  }
+  template<> __forceinline vuint4 insert<2>(const vuint4& a, unsigned b){
+    vuint4 c = a;
+    c[2] = b;
+    return c;
+  }
+  template<> __forceinline vuint4 insert<3>(const vuint4& a, unsigned b){
+    vuint4 c = a;
+    c[3] = b;
+    return c;
+  }
+                                                                               
+  __forceinline unsigned int toScalar(const vuint4& v) { // lane 0 as a scalar
+    return v[0];
+  }
+#else // x86: lane 0 is read with a register move instead of the generic extract
+  template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b); }
+
+  __forceinline unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v); } // lane 0 as a scalar
+#endif
+                                                                               
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if 0 // reductions are compiled out; NOTE(review): presumably because the vuint4 min/max and '<' they use are commented out in this file - confirm
+#if defined(__SSE4_1__)
+
+  __forceinline vuint4 vreduce_min(const vuint4& v) { vuint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
+  __forceinline vuint4 vreduce_max(const vuint4& v) { vuint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
+  __forceinline vuint4 vreduce_add(const vuint4& v) { vuint4 h = shuffle<1,0,3,2>(v)   + v ; return shuffle<2,3,0,1>(h)   + h ; }
+
+  __forceinline unsigned int reduce_min(const vuint4& v) { return toScalar(vreduce_min(v)); }
+  __forceinline unsigned int reduce_max(const vuint4& v) { return toScalar(vreduce_max(v)); }
+  __forceinline unsigned int reduce_add(const vuint4& v) { return toScalar(vreduce_add(v)); }
+
+  __forceinline size_t select_min(const vuint4& v) { return bsf(movemask(v == vreduce_min(v))); }
+  __forceinline size_t select_max(const vuint4& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+  //__forceinline size_t select_min(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+  //__forceinline size_t select_max(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+#else
+
+  __forceinline unsigned int reduce_min(const vuint4& v) { return min(v[0],v[1],v[2],v[3]); }
+  __forceinline unsigned int reduce_max(const vuint4& v) { return max(v[0],v[1],v[2],v[3]); }
+  __forceinline unsigned int reduce_add(const vuint4& v) { return v[0]+v[1]+v[2]+v[3]; }
+
+#endif
+#endif // #if 0
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vuint4& a) { // writes "<a0, a1, a2, a3>"
+    cout << "<" << a[0]; for (size_t lane=1; lane<4; lane++) cout << ", " << a[lane]; return cout << ">";
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/common/simd/vuint8_avx.h b/thirdparty/embree-aarch64/common/simd/vuint8_avx.h
new file mode 100644
index 0000000000..d4e86ae92d
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vuint8_avx.h
@@ -0,0 +1,379 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 8-wide unsigned integer type for plain AVX; the register is also exposed as two 128-bit halves (vl/vh) */
+  template<>
+  struct vuint<8>
+  {
+    ALIGNED_STRUCT_(32);   
+
+    typedef vboolf8 Bool;
+    typedef vuint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 };        // number of SIMD elements
+    union {                    // data: three views of the same 256 bits
+      __m256i v;                 // whole register
+      struct { __m128i vl,vh; }; // low half (lanes 0-3) and high half (lanes 4-7)
+      unsigned int i[8];         // per-lane scalar access, used by operator []
+    }; 
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint() {} // lanes are left uninitialized
+    __forceinline vuint(const vuint8& a) { v = a.v; }
+    __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; }
+
+    __forceinline vuint(__m256i a) : v(a) {}
+    __forceinline operator const __m256i&() const { return v; }
+    __forceinline operator       __m256i&()       { return v; }
+
+    __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} // a replicated into both halves
+    __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} // a -> low half, b -> high half
+    __forceinline vuint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {} // a -> low half, b -> high half
+ 
+    __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} // unaligned load of 8 values
+    __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {} // broadcast a to all lanes
+    __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} // lanes 0..7 = a,b,a,b,a,b,a,b
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} // lanes 0..7 = a,b,c,d,a,b,c,d
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} // lanes 0..7 = a..h; last parameter is named h so it does not shadow the member vh
+
+    __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} // numeric float -> 32-bit integer conversion, not a bit cast
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint(ZeroTy)   : v(_mm256_setzero_si256()) {}
+    __forceinline vuint(OneTy)    : v(_mm256_set1_epi32(1)) {}
+    __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(0xFFFFFFFF)) {} // all bits set: largest unsigned value in every lane
+    __forceinline vuint(StepTy)   : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} // lane k holds k
+    __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vuint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); } // aligned 256-bit load through the float-domain intrinsic
+    static __forceinline vuint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); } // unaligned variant
+
+    static __forceinline vuint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } // masked load; load and loadu share the same intrinsic
+    static __forceinline vuint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); }
+
+    static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } // aligned store
+    static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } // unaligned store
+    
+#if !defined(__aarch64__)
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
+#else
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } // aarch64 build casts the mask's raw .v member instead of the mask object
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); }
+#endif
+    static __forceinline void store_nt(void* ptr, const vuint8& v) { // streaming (non-temporal) store
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+    }
+
+    static __forceinline vuint8 load(const uint8_t* ptr) { // 8 bytes -> 8 lanes, delegating to vuint4::load for elements 0-3 and 4-7
+      vuint4 il = vuint4::load(ptr+0);
+      vuint4 ih = vuint4::load(ptr+4);
+      return vuint8(il,ih);
+    }
+
+    static __forceinline vuint8 loadu(const uint8_t* ptr) { // as above, through vuint4::loadu
+      vuint4 il = vuint4::loadu(ptr+0);
+      vuint4 ih = vuint4::loadu(ptr+4);
+      return vuint8(il,ih);
+    }
+
+    static __forceinline vuint8 load(const unsigned short* ptr) { // 8 shorts -> 8 lanes, delegating to vuint4::load for elements 0-3 and 4-7
+      vuint4 il = vuint4::load(ptr+0);
+      vuint4 ih = vuint4::load(ptr+4);
+      return vuint8(il,ih);
+    }
+
+    static __forceinline vuint8 loadu(const unsigned short* ptr) { // as above, through vuint4::loadu
+      vuint4 il = vuint4::loadu(ptr+0);
+      vuint4 ih = vuint4::loadu(ptr+4);
+      return vuint8(il,ih);
+    }
+
+    static __forceinline void store(uint8_t* ptr, const vuint8& i) { // lanes -> 8 bytes, delegating to vuint4::store per half
+      vuint4 il(i.vl);
+      vuint4 ih(i.vh);
+      vuint4::store(ptr + 0,il);
+      vuint4::store(ptr + 4,ih);
+    }
+
+    static __forceinline void store(unsigned short* ptr, const vuint8& v) { // lanes -> 8 shorts, each lane truncated by the cast
+      for (size_t i=0;i<8;i++)
+        ptr[i] = (unsigned short)v[i];
+    }
+
+    template<int scale = 4> // scale: byte multiplier applied to every index
+    static __forceinline vuint8 gather(const unsigned int* ptr, const vint8& index) { // lane k = value at byte address ptr + scale*index[k]
+      return vuint8(
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[0]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[1]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[2]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[3]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[4]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[5]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[6]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[7]));
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int* ptr, const vint8& index) { // masked gather
+      vuint8 r = zero; // lanes whose mask entry is false stay zero and their address is never read
+      if (likely(mask[0])) r[0] = *(unsigned int*)(((int8_t*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(unsigned int*)(((int8_t*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(unsigned int*)(((int8_t*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(unsigned int*)(((int8_t*)ptr)+scale*index[3]);
+      if (likely(mask[4])) r[4] = *(unsigned int*)(((int8_t*)ptr)+scale*index[4]);
+      if (likely(mask[5])) r[5] = *(unsigned int*)(((int8_t*)ptr)+scale*index[5]);
+      if (likely(mask[6])) r[6] = *(unsigned int*)(((int8_t*)ptr)+scale*index[6]);
+      if (likely(mask[7])) r[7] = *(unsigned int*)(((int8_t*)ptr)+scale*index[7]);
+      return r;
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v) // lane k is written to byte address ptr + scale*ofs[k]
+    {
+      *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
+      *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
+      *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
+      *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
+      *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
+      *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
+      *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
+      *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v) // masked scatter: only lanes with a true mask entry are written
+    {
+      if (likely(mask[0])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
+      if (likely(mask[1])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
+      if (likely(mask[2])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
+      if (likely(mask[3])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
+      if (likely(mask[4])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
+      if (likely(mask[5])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
+      if (likely(mask[6])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
+      if (likely(mask[7])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
+    }
+
+
+    static __forceinline vuint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); } // replicates the 64-bit value into all four 64-bit slots
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; } // bounds are checked by assert only
+    __forceinline       unsigned int& operator [](size_t index)       { assert(index < 8); return i[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); } // reinterprets the 256 bits as a mask; no value conversion
+
+  __forceinline vuint8 operator +(const vuint8& a) { return a; } // unary plus: identity
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return vuint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); } // lane-wise add, computed separately on the low and high 128-bit halves
+  __forceinline vuint8 operator +(const vuint8& a, unsigned int          b) { return a + vuint8(b); } // scalar operand is broadcast first
+  __forceinline vuint8 operator +(unsigned int          a, const vuint8& b) { return vuint8(a) + b; }
+
+  __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return vuint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); } // lane-wise subtract, computed separately on the low and high 128-bit halves
+  __forceinline vuint8 operator -(const vuint8& a, unsigned int          b) { return a - vuint8(b); } // scalar operand is broadcast first
+  __forceinline vuint8 operator -(unsigned int          a, const vuint8& b) { return vuint8(a) - b; }
+
+  //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return vuint8(_mm_mullo_epu32(a.vl, b.vl), _mm_mullo_epu32(a.vh, b.vh)); }
+  //__forceinline vuint8 operator *(const vuint8& a, unsigned int          b) { return a * vuint8(b); }
+  //__forceinline vuint8 operator *(unsigned int          a, const vuint8& b) { return vuint8(a) * b; }
+
+  __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } // bitwise AND done with the 256-bit float-domain intrinsic; the casts only reinterpret bits
+  __forceinline vuint8 operator &(const vuint8& a, unsigned int          b) { return a & vuint8(b); } // scalar operand is broadcast first
+  __forceinline vuint8 operator &(unsigned int          a, const vuint8& b) { return vuint8(a) & b; }
+
+  __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } // bitwise OR, same float-domain technique
+  __forceinline vuint8 operator |(const vuint8& a, unsigned int          b) { return a | vuint8(b); }
+  __forceinline vuint8 operator |(unsigned int          a, const vuint8& b) { return vuint8(a) | b; }
+
+  __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } // bitwise XOR, same float-domain technique
+  __forceinline vuint8 operator ^(const vuint8& a, unsigned int          b) { return a ^ vuint8(b); }
+  __forceinline vuint8 operator ^(unsigned int          a, const vuint8& b) { return vuint8(a) ^ b; }
+
+  __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return vuint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); } // logical left shift of every lane by n bits, one 128-bit half at a time
+  __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return vuint8(_mm_srli_epi32(a.vl, n), _mm_srli_epi32(a.vh, n)); } // logical (zero-filling) right shift on BOTH halves: lanes are unsigned, so the low half must not use the sign-extending _mm_srai_epi32
+
+  __forceinline vuint8 sll (const vuint8& a, unsigned int b) { return vuint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); } // shift left logical, per 128-bit half
+  __forceinline vuint8 sra (const vuint8& a, unsigned int b) { return vuint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); } // shift right arithmetic: replicates bit 31 of each lane even though the lanes are unsigned
+  __forceinline vuint8 srl (const vuint8& a, unsigned int b) { return vuint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); } // shift right logical: zero-fills
+  
+  __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return vuint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); } // lane-wise unsigned minimum, per 128-bit half
+  __forceinline vuint8 min(const vuint8& a, unsigned int          b) { return min(a,vuint8(b)); } // scalar operand is broadcast first
+  __forceinline vuint8 min(unsigned int          a, const vuint8& b) { return min(vuint8(a),b); }
+
+  __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return vuint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); } // lane-wise unsigned maximum, per 128-bit half
+  __forceinline vuint8 max(const vuint8& a, unsigned int          b) { return max(a,vuint8(b)); } // scalar operand is broadcast first
+  __forceinline vuint8 max(unsigned int          a, const vuint8& b) { return max(vuint8(a),b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; } // each compound assignment forwards to the matching binary operator
+  __forceinline vuint8& operator +=(vuint8& a, unsigned int          b) { return a = a + b; }
+  
+  __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; }
+  __forceinline vuint8& operator -=(vuint8& a, unsigned int          b) { return a = a - b; }
+  
+  //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; }
+  //__forceinline vuint8& operator *=(vuint8& a, unsigned int          b) { return a = a * b; }
+  
+  __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; }
+  __forceinline vuint8& operator &=(vuint8& a, unsigned int          b) { return a = a & b; }
+  
+  __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; }
+  __forceinline vuint8& operator |=(vuint8& a, unsigned int          b) { return a = a | b; }
+  
+  __forceinline vuint8& operator <<=(vuint8& a, unsigned int b) { return a = a << b; } // uses operator << (left shift by b bits)
+  __forceinline vuint8& operator >>=(vuint8& a, unsigned int b) { return a = a >> b; } // uses operator >> (right shift by b bits)
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)), // lane-wise equality, low half
+                                                                                       _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); } // high half; result is an 8-lane mask
+  __forceinline vboolf8 operator ==(const vuint8& a, unsigned int          b) { return a == vuint8(b); } // scalar operand is broadcast first
+  __forceinline vboolf8 operator ==(unsigned int          a, const vuint8& b) { return vuint8(a) == b; }
+  
+  __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); } // negation of the equality mask
+  __forceinline vboolf8 operator !=(const vuint8& a, unsigned int          b) { return a != vuint8(b); }
+  __forceinline vboolf8 operator !=(unsigned int          a, const vuint8& b) { return vuint8(a) != b; }
+  
+  //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epu32 (a.vl, b.vl)),
+  //                                                                                     _mm_castsi128_ps(_mm_cmplt_epu32 (a.vh, b.vh))); }
+  //__forceinline vboolf8 operator < (const vuint8& a, unsigned int          b) { return a <  vuint8(b); }
+  //__forceinline vboolf8 operator < (unsigned int          a, const vuint8& b) { return vuint8(a) <  b; }
+  
+  //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a <  b); }
+  //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int          b) { return a >= vuint8(b); }
+  //__forceinline vboolf8 operator >=(unsigned int          a, const vuint8& b) { return vuint8(a) >= b; }
+
+  //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epu32 (a.vl, b.vl)),
+  //                                                                                     _mm_castsi128_ps(_mm_cmpgt_epu32 (a.vh, b.vh))); }
+  //__forceinline vboolf8 operator > (const vuint8& a, unsigned int          b) { return a >  vuint8(b); }
+  //__forceinline vboolf8 operator > (unsigned int          a, const vuint8& b) { return vuint8(a) >  b; }
+
+  //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a >  b); }
+  //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int          b) { return a <= vuint8(b); }
+  //__forceinline vboolf8 operator <=(unsigned int          a, const vuint8& b) { return vuint8(a) <= b; }
+
+  __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; } // named form of operator ==
+  __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; } // named form of operator !=
+
+  __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); } // comparison result restricted to the lanes enabled in mask
+  __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); }
+
+  __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { // per lane: t where the mask m is set, otherwise f
+    return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); // blendv picks its second operand where the mask lane's top bit is set
+  }
+
+  __forceinline vuint8 notand(const vboolf8& m, const vuint8& f) { // per lane: (~m) & f
+    return _mm256_castps_si256(_mm256_andnot_ps(m, _mm256_castsi256_ps(f))); // andnot complements its FIRST operand
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } // interleaves the low two lanes of a and b within each 128-bit half
+  __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } // interleaves the high two lanes of a and b within each 128-bit half
+
+  template<int i> // broadcast lane i of each 128-bit half to all four lanes of that half
+  __forceinline vuint8 shuffle(const vuint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1> // whole-half permute of v: result low half = half i0, result high half = half i1
+  __forceinline vuint8 shuffle4(const vuint8& v) {
+    return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1> // two-source variant: i0/i1 are passed straight into the permute2f128 control byte
+  __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) {
+    return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1, int i2, int i3> // in-half permute: result lanes 0..3 of each half = source lanes i0..i3 of the same half
+  __forceinline vuint8 shuffle(const vuint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i0, int i1, int i2, int i3> // two-source in-half shuffle: lanes 0,1 come from a (i0,i1), lanes 2,3 from b (i2,i3)
+  __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) {
+    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } // specialization using the dedicated duplicate-even-lanes intrinsic
+  template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } // specialization using the dedicated duplicate-odd-lanes intrinsic
+  template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } // specialization duplicating the low 64 bits of each half
+
+  __forceinline vuint8 broadcast(const unsigned int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } // loads one 32-bit value and replicates it to all 8 lanes
+  template<int i> __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); } // copy of a with 128-bit half i replaced by b
+  template<int i> __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); } // 128-bit half i of a
+  template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); } // the low half only needs a cast
+
+  __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } // lane 0; NOTE: returned as signed int although the lanes are unsigned
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); }
+  //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
+  //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
+
+  //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); }
+  //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+  //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); }
+
+  __forceinline vuint8 vreduce_add2(const vuint8& v) { return shuffle<1,0,3,2>(v) + v; } // each lane plus its neighbour in the pair
+  __forceinline vuint8 vreduce_add4(const vuint8& v) { const vuint8 pairSums = vreduce_add2(v); return shuffle<2,3,0,1>(pairSums) + pairSums; } // sum of the four lanes of each 128-bit half
+  __forceinline vuint8 vreduce_add (const vuint8& v) { const vuint8 halfSums = vreduce_add4(v); return shuffle4<1,0>(halfSums) + halfSums; } // sum of all eight lanes, present in every lane
+
+  //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); }
+  //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); }
+  __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); } // scalar sum of all 8 lanes; NOTE: the return type is signed int although the lanes are unsigned
+
+  //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); }
+  //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+  //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+  //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) { // writes the lanes as "<a0, a1, ..., a7>"
+    cout << "<" << a[0]; for (size_t lane=1; lane<8; lane++) cout << ", " << a[lane]; return cout << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vuint8_avx2.h b/thirdparty/embree-aarch64/common/simd/vuint8_avx2.h
new file mode 100644
index 0000000000..b2a965448d
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vuint8_avx2.h
@@ -0,0 +1,439 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 8-wide unsigned integer type, AVX2 variant (with AVX-512VL code paths where that macro is defined) */
+  template<>
+  struct vuint<8>
+  {
+    ALIGNED_STRUCT_(32);
+        
+    typedef vboolf8 Bool;
+    typedef vuint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 }; // number of SIMD elements
+    union {             // data: two views of the same 256 bits
+      __m256i v;          // whole register
+      unsigned int i[8];  // per-lane scalar access, used by operator []
+    }; 
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint() {} // lanes are left uninitialized
+    __forceinline vuint(const vuint8& a) { v = a.v; }
+    __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; }
+
+    __forceinline vuint(__m256i a) : v(a) {}
+    __forceinline operator const __m256i&() const { return v; }
+    __forceinline operator       __m256i&()       { return v; }
+
+    __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} // a replicated into both 128-bit halves
+    __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} // a -> low half, b -> high half
+    __forceinline vuint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} // a -> low half, b -> high half
+ 
+    __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} // unaligned load of 8 values
+    __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {} // broadcast a to all lanes
+    __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} // lanes 0..7 = a,b,a,b,a,b,a,b
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} // lanes 0..7 = a,b,c,d,a,b,c,d
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} // lanes 0..7 = a..h
+
+    __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} // numeric float -> 32-bit integer conversion, not a bit cast
+
+#if defined(__AVX512VL__)
+    __forceinline explicit vuint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {} // expands the mask register into per-lane all-ones / all-zeros
+#else
+    __forceinline explicit vuint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {} // reinterprets the mask's bits
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint(ZeroTy)   : v(_mm256_setzero_si256()) {}
+    __forceinline vuint(OneTy)    : v(_mm256_set1_epi32(1)) {}
+    __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(pos_inf)) {} // NOTE(review): the lane value depends on how pos_inf converts to the intrinsic's int argument - confirm it is the unsigned maximum
+    __forceinline vuint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {} // NOTE(review): likewise depends on neg_inf's conversion to int - confirm the intended unsigned value
+    __forceinline vuint(StepTy)   : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} // lane k holds k
+    __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vuint8 load(const uint8_t* ptr)  { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } // reads 8 bytes and zero-extends each to a 32-bit lane
+    static __forceinline vuint8 loadu(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } // identical to load
+    static __forceinline vuint8 load(const unsigned short* ptr)  { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } // aligned 16-byte read, each short zero-extended to a lane
+    static __forceinline vuint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } // unaligned variant
+
+    static __forceinline vuint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); } // aligned 256-bit load
+    static __forceinline vuint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); } // unaligned 256-bit load
+
+    static __forceinline void store (void* ptr, const vuint8& v) { _mm256_store_si256((__m256i*)ptr,v); } // aligned store
+    static __forceinline void storeu(void* ptr, const vuint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); } // unaligned store through the float-domain intrinsic
+
+#if defined(__AVX512VL__)
+
+    static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &v) { // packs the mask-selected lanes of v to the front; remaining lanes keep v's values
+      return _mm256_mask_compress_epi32(v, mask, v);
+    }
+    static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &a, const vuint8& b) { // packs the mask-selected lanes of b to the front; remaining lanes come from a
+      return _mm256_mask_compress_epi32(a, mask, b);
+    }
+
+    static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); } // masked load; lanes not selected by mask are zero
+    static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); }
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); } // masked store; only selected lanes are written
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); }
+#else
+    static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } // masked load; load and loadu share the same intrinsic here
+    static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); }
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } // masked store; store and storeu share the same intrinsic here
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); }
+#endif
+    
+    static __forceinline vuint8 load_nt(void* ptr) { // streaming (non-temporal) load
+      return _mm256_stream_load_si256((__m256i*)ptr);
+    }
+
+    static __forceinline void store_nt(void* ptr, const vuint8& v) { // streaming (non-temporal) store
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+    }
+
+    static __forceinline void store(uint8_t* ptr, const vuint8& i) // lanes -> 8 bytes
+    {
+      for (size_t j=0; j<8; j++)
+        ptr[j] = i[j]; // implicit narrowing: only the low 8 bits of each lane are kept
+    }
+
+    static __forceinline void store(unsigned short* ptr, const vuint8& v) { // lanes -> 8 shorts, each lane truncated by the cast
+      for (size_t i=0;i<8;i++)
+        ptr[i] = (unsigned short)v[i];
+    }
+
+    template<int scale = 4> // scale: byte multiplier applied to every index
+    static __forceinline vuint8 gather(const unsigned int *const ptr, const vint8& index) { // hardware gather: lane k = value at byte address ptr + scale*index[k]
+      return _mm256_i32gather_epi32((const int*) ptr, index, scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int *const ptr, const vint8& index) { // masked gather
+      vuint8 r = zero; // source for the lanes the mask leaves out, so those lanes come back as zero
+#if defined(__AVX512VL__)
+      return _mm256_mmask_i32gather_epi32(r, mask, index, (const int*) ptr, scale);
+#else
+      return _mm256_mask_i32gather_epi32(r, (const int*) ptr, index, mask, scale);
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v) // lane k is written to byte address ptr + scale*ofs[k]
+    {
+#if defined(__AVX512VL__)
+      _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale);
+#else
+      *(unsigned int*)(((int8_t*)ptr) + scale * ofs[0]) = v[0]; // no scatter intrinsic on this path: one scalar store per lane
+      *(unsigned int*)(((int8_t*)ptr) + scale * ofs[1]) = v[1];
+      *(unsigned int*)(((int8_t*)ptr) + scale * ofs[2]) = v[2];
+      *(unsigned int*)(((int8_t*)ptr) + scale * ofs[3]) = v[3];
+      *(unsigned int*)(((int8_t*)ptr) + scale * ofs[4]) = v[4];
+      *(unsigned int*)(((int8_t*)ptr) + scale * ofs[5]) = v[5];
+      *(unsigned int*)(((int8_t*)ptr) + scale * ofs[6]) = v[6];
+      *(unsigned int*)(((int8_t*)ptr) + scale * ofs[7]) = v[7];
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v) // masked scatter: only lanes with a true mask entry are written
+    {
+#if defined(__AVX512VL__)
+      _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale);
+#else
+      if (likely(mask[0])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
+      if (likely(mask[1])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
+      if (likely(mask[2])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
+      if (likely(mask[3])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
+      if (likely(mask[4])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
+      if (likely(mask[5])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
+      if (likely(mask[6])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
+      if (likely(mask[7])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    static __forceinline vuint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); } // replicates the 64-bit value into all four 64-bit slots
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; } // bounds are checked by assert only
+    __forceinline       unsigned int& operator [](size_t index)       { assert(index < 8); return i[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_movepi32_mask(a); } // packs the most significant bit of each 32-bit lane into a mask
+#else
+  __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); } // pure reinterpretation of the register, no bit is changed
+#endif
+
+  __forceinline vuint8 operator +(const vuint8& a) { return a; } // unary plus is the identity
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return _mm256_add_epi32(a, b); } // lane-wise 32-bit addition
+  __forceinline vuint8 operator +(const vuint8& a, unsigned int          b) { return a + vuint8(b); } // scalar operand goes through vuint8(b) first
+  __forceinline vuint8 operator +(unsigned int          a, const vuint8& b) { return vuint8(a) + b; }
+
+  __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return _mm256_sub_epi32(a, b); } // lane-wise 32-bit subtraction
+  __forceinline vuint8 operator -(const vuint8& a, unsigned int          b) { return a - vuint8(b); } // scalar operand goes through vuint8(b) first
+  __forceinline vuint8 operator -(unsigned int          a, const vuint8& b) { return vuint8(a) - b; }
+
+  //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return _mm256_mullo_epu32(a, b); }
+  //__forceinline vuint8 operator *(const vuint8& a, unsigned int          b) { return a * vuint8(b); }
+  //__forceinline vuint8 operator *(unsigned int          a, const vuint8& b) { return vuint8(a) * b; }
+
+  __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_and_si256(a, b); } // bitwise AND of the two 256-bit registers
+  __forceinline vuint8 operator &(const vuint8& a, unsigned int          b) { return a & vuint8(b); } // scalar operand goes through vuint8(b) first
+  __forceinline vuint8 operator &(unsigned int          a, const vuint8& b) { return vuint8(a) & b; }
+
+  __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_or_si256(a, b); } // bitwise OR of the two 256-bit registers
+  __forceinline vuint8 operator |(const vuint8& a, unsigned int          b) { return a | vuint8(b); }
+  __forceinline vuint8 operator |(unsigned int          a, const vuint8& b) { return vuint8(a) | b; }
+
+  __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_xor_si256(a, b); } // bitwise XOR of the two 256-bit registers
+  __forceinline vuint8 operator ^(const vuint8& a, unsigned int          b) { return a ^ vuint8(b); }
+  __forceinline vuint8 operator ^(unsigned int          a, const vuint8& b) { return vuint8(a) ^ b; }
+
+  __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return _mm256_slli_epi32(a, n); } // every lane shifted left by the same count n
+  __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return _mm256_srli_epi32(a, n); } // logical right shift: zeros are shifted in
+
+  __forceinline vuint8 operator <<(const vuint8& a, const vuint8& n) { return _mm256_sllv_epi32(a, n); } // per-lane shift counts taken from n
+  __forceinline vuint8 operator >>(const vuint8& a, const vuint8& n) { return _mm256_srlv_epi32(a, n); } // logical, per-lane shift counts
+
+  __forceinline vuint8 sll(const vuint8& a, unsigned int b) { return _mm256_slli_epi32(a, b); } // same intrinsic as operator <<
+  __forceinline vuint8 sra(const vuint8& a, unsigned int b) { return _mm256_srai_epi32(a, b); } // arithmetic shift: replicates bit 31 although the lanes are declared unsigned
+  __forceinline vuint8 srl(const vuint8& a, unsigned int b) { return _mm256_srli_epi32(a, b); } // same intrinsic as operator >>
+
+  __forceinline vuint8 sll(const vuint8& a, const vuint8& b) { return _mm256_sllv_epi32(a, b); }
+  __forceinline vuint8 sra(const vuint8& a, const vuint8& b) { return _mm256_srav_epi32(a, b); } // arithmetic shift with per-lane counts
+  __forceinline vuint8 srl(const vuint8& a, const vuint8& b) { return _mm256_srlv_epi32(a, b); }
+  
+  __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return _mm256_min_epu32(a, b); } // lane-wise minimum, lanes compared as unsigned 32-bit values
+  __forceinline vuint8 min(const vuint8& a, unsigned int          b) { return min(a,vuint8(b)); } // scalar operand goes through vuint8(b) first
+  __forceinline vuint8 min(unsigned int          a, const vuint8& b) { return min(vuint8(a),b); }
+
+  __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return _mm256_max_epu32(a, b); } // lane-wise maximum, lanes compared as unsigned 32-bit values
+  __forceinline vuint8 max(const vuint8& a, unsigned int          b) { return max(a,vuint8(b)); }
+  __forceinline vuint8 max(unsigned int          a, const vuint8& b) { return max(vuint8(a),b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; } // each compound assignment evaluates the binary operator, stores into a and returns a
+  __forceinline vuint8& operator +=(vuint8& a, unsigned int          b) { return a = a + b; }
+  
+  __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; }
+  __forceinline vuint8& operator -=(vuint8& a, unsigned int          b) { return a = a - b; }
+  
+  //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; }
+  //__forceinline vuint8& operator *=(vuint8& a, unsigned int          b) { return a = a * b; }
+  
+  __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; }
+  __forceinline vuint8& operator &=(vuint8& a, unsigned int          b) { return a = a & b; }
+  
+  __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; }
+  __forceinline vuint8& operator |=(vuint8& a, unsigned int          b) { return a = a | b; }
+  
+  __forceinline vuint8& operator <<=(vuint8& a, const unsigned int b) { return a = a << b; } // uniform shift count only; no per-lane variant is defined here
+  __forceinline vuint8& operator >>=(vuint8& a, const unsigned int b) { return a = a >> b; } // logical right shift, see operator >>
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__) // with AVX-512VL the comparisons are unsigned and produce a bit mask
+  __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LE); }
+
+  __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { // lane from t where the mask bit is set, otherwise from f
+    return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t);
+  }
+#else // without AVX-512VL only == and != are available; the ordered comparisons below are commented out
+  __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); } // all-ones lane where equal, reinterpreted as a float register
+  __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); } // negation of == through vboolf8's operator !
+  //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(b, a)); }
+  //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a <  b); }
+  //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(a, b)); }
+  //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a >  b); }
+
+  __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { // blendv takes the lane from t where the top bit of the mask lane is set, otherwise from f
+    return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
+  }
+#endif
+
+  template<int mask> // compile-time selection: bit k of mask set -> lane k comes from t, otherwise from f
+  __forceinline vuint8 select(const vuint8& t, const vuint8& f) {
+    return _mm256_blend_epi32(f, t, mask);
+  }
+
+  __forceinline vboolf8 operator ==(const vuint8& a, unsigned int          b) { return a == vuint8(b); } // scalar operand goes through vuint8(b) first
+  __forceinline vboolf8 operator ==(unsigned int          a, const vuint8& b) { return vuint8(a) == b; }
+
+  __forceinline vboolf8 operator !=(const vuint8& a, unsigned int          b) { return a != vuint8(b); } // scalar operand goes through vuint8(b) first
+  __forceinline vboolf8 operator !=(unsigned int          a, const vuint8& b) { return vuint8(a) != b; }
+
+  //__forceinline vboolf8 operator < (const vuint8& a, unsigned int          b) { return a <  vuint8(b); }
+  //__forceinline vboolf8 operator < (unsigned int          a, const vuint8& b) { return vuint8(a) <  b; }
+
+  //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int          b) { return a >= vuint8(b); }
+  //__forceinline vboolf8 operator >=(unsigned int          a, const vuint8& b) { return vuint8(a) >= b; }
+
+  //__forceinline vboolf8 operator > (const vuint8& a, unsigned int          b) { return a >  vuint8(b); }
+  //__forceinline vboolf8 operator > (unsigned int          a, const vuint8& b) { return vuint8(a) >  b; }
+
+  //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int          b) { return a <= vuint8(b); }
+  //__forceinline vboolf8 operator <=(unsigned int          a, const vuint8& b) { return vuint8(a) <= b; }
+
+  __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; } // named form of operator ==
+  __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; } // named form of operator !=
+  //__forceinline vboolf8 lt(const vuint8& a, const vuint8& b) { return a <  b; }
+  //__forceinline vboolf8 ge(const vuint8& a, const vuint8& b) { return a >= b; }
+  //__forceinline vboolf8 gt(const vuint8& a, const vuint8& b) { return a >  b; }
+  //__forceinline vboolf8 le(const vuint8& a, const vuint8& b) { return a <= b; }
+
+#if defined(__AVX512VL__) // masked unsigned comparisons: the incoming mask is passed to the compare intrinsic itself
+  __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); }
+  __forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); }
+  __forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); }
+  __forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); }
+  __forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); }
+#else // fallback: compare first, then AND with the mask; only eq and ne exist because the ordered operators are disabled
+  __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); }
+  __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); }
+  //__forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a <  b); }
+  //__forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a >= b); }
+  //__forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a >  b); }
+  //__forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a <= b); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_unpacklo_epi32(a, b); } // interleaves the two low lanes of a and b, separately within each 128-bit half
+  __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_unpackhi_epi32(a, b); } // interleaves the two high lanes of a and b, separately within each 128-bit half
+
+  template<int i> // replicates lane i of each 128-bit half across that half
+  __forceinline vuint8 shuffle(const vuint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1> // picks whole 128-bit halves of v: i0 selects the low result half, i1 the high one
+  __forceinline vuint8 shuffle4(const vuint8& v) {
+    return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1> // same, but the 128-bit halves are drawn from the pair (a, b)
+  __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) {
+    return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1, int i2, int i3> // the same 4-lane permutation applied inside each 128-bit half
+  __forceinline vuint8 shuffle(const vuint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i0, int i1, int i2, int i3> // per 128-bit half: result lanes 0,1 come from a (i0,i1), lanes 2,3 from b (i2,i3)
+  __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) {
+    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } // specialization: duplicate the even lanes
+  template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } // specialization: duplicate the odd lanes
+  template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } // specialization: duplicate the low 64 bits of each half
+
+  __forceinline vuint8 broadcast(const unsigned int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } // loads one 32-bit value from ptr into all 8 lanes
+
+  template<int i> __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); } // copy of a with 128-bit half i replaced by b
+  template<int i> __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); } // 128-bit half i of a
+  template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); } // the low half is obtained with a plain cast
+
+  __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } // lane 0, returned as a signed int
+
+#if !defined(__aarch64__) // the three functions below are compiled out on aarch64 builds
+
+  __forceinline vuint8 permute(const vuint8& v, const __m256i& index) { // full 8-lane permutation: indices may cross the 128-bit halves
+    return _mm256_permutevar8x32_epi32(v, index);
+  }
+
+  __forceinline vuint8 shuffle(const vuint8& v, const __m256i& index) { // variable permutation restricted to each 128-bit half
+    return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index));
+  }
+
+  template<int i>
+  __forceinline vuint8 align_shift_right(const vuint8& a, const vuint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_alignr_epi32(a, b, i);    // shifts the 512-bit concatenation a:b right by i lanes and keeps the low 256 bits
+#else
+    return _mm256_alignr_epi8(a, b, 4*i); // NOTE(review): this intrinsic shifts each 128-bit half separately, so the result differs from the AVX-512VL path - confirm callers tolerate that
+#endif
+  }
+
+#endif
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); }
+  //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
+  //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
+
+  //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); }
+  //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+  //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); }
+
+  __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + shuffle<1,0,3,2>(v); } // each lane becomes the sum of its adjacent lane pair
+  __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } // each lane becomes the sum of its 128-bit half
+  __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } // every lane becomes the sum of all 8 lanes
+
+  //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); }
+  //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); }
+  __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); } // sum of all lanes, handed back as int by toScalar
+
+  //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); }
+  //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+  //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+  //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+  __forceinline vuint8 assign(const vuint4& a) { return _mm256_castsi128_si256(a); } // a becomes the low 128 bits; the cast leaves the upper 128 bits undefined
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) { // prints the 8 lanes in index order as "<v0, v1, ..., v7>"
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/sys/alloc.cpp b/thirdparty/embree-aarch64/common/sys/alloc.cpp
new file mode 100644
index 0000000000..12f143f131
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/alloc.cpp
@@ -0,0 +1,327 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "alloc.h"
+#include "intrinsics.h"
+#include "sysinfo.h"
+#include "mutex.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// All Platforms
+////////////////////////////////////////////////////////////////////////////////
+  
+namespace embree
+{
+  /*! Returns 'size' bytes from _mm_malloc aligned to 'align' (asserted to have at most one
+   *  bit set). A zero size yields nullptr; an allocation failure aborts the process. */
+  void* alignedMalloc(size_t size, size_t align) 
+  {
+    if (size == 0) return nullptr;
+
+    assert((align & (align-1)) == 0);
+    void* const block = _mm_malloc(size,align);
+
+    /* size is known to be non-zero here, so a null result always means failure */
+    // -- GODOT start --
+    // throw std::bad_alloc();
+    if (block == nullptr) abort();
+    // -- GODOT end --
+    return block;
+  }
+  
+  /*! Releases memory through _mm_free; a null pointer is ignored. */
+  void alignedFree(void* ptr)
+  {
+    if (ptr != nullptr) _mm_free(ptr);
+  }
+
+  static bool huge_pages_enabled = false; // written by os_init(), read by isHugePageCandidate()
+  static MutexSys os_init_mutex; // locked for the whole body of os_init()
+
+  /*! true if huge pages are enabled and rounding 'bytes' up to 2MB pages wastes little memory */
+  __forceinline bool isHugePageCandidate(const size_t bytes) 
+  {
+    if (!huge_pages_enabled) return false;
+
+    /* padding that rounding the request up to whole 2MB pages would add */
+    const size_t padding = ((bytes+PAGE_SIZE_2M-1) & ~size_t(PAGE_SIZE_2M-1)) - bytes;
+    return 66*padding < bytes; // at most 1.5% overhead
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef _WIN32
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <malloc.h>
+
+namespace embree
+{
+  /*! Tries to enable SeLockMemoryPrivilege on the token of the current process. Returns true on
+   *  success; with 'verbose' set, each failure is reported on std::cout. The token is always closed. */
+  bool win_enable_selockmemoryprivilege (bool verbose)
+  {
+    HANDLE hToken;
+    if (!OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY | TOKEN_ADJUST_PRIVILEGES, &hToken)) {
+      if (verbose) std::cout << "WARNING: OpenProcessToken failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl;
+      return false;
+    }
+
+    TOKEN_PRIVILEGES tp;
+    tp.PrivilegeCount = 1;
+    tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+
+    bool enabled = false;
+    if (!LookupPrivilegeValueW(nullptr, L"SeLockMemoryPrivilege", &tp.Privileges[0].Luid)) {
+      if (verbose) std::cout << "WARNING: LookupPrivilegeValue failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl;
+    } else {
+      SetLastError(ERROR_SUCCESS); // AdjustTokenPrivileges may succeed and still leave ERROR_NOT_ALL_ASSIGNED behind
+      if (!AdjustTokenPrivileges(hToken, FALSE, &tp, sizeof(tp), nullptr, 0)) {
+        if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed while trying to enable SeLockMemoryPrivilege" << std::endl;
+      } else if (GetLastError() == ERROR_NOT_ALL_ASSIGNED) {
+        if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed to enable SeLockMemoryPrivilege: Add SeLockMemoryPrivilege for current user and run process in elevated mode (Run as administrator)." << std::endl;
+      } else {
+        enabled = true;
+      }
+    }
+    CloseHandle(hToken); // release the token handle on every path that opened it
+    return enabled;
+  }
+
+  /*! Decides, while holding os_init_mutex, whether huge pages may be used from now on.
+   *  Returns true if huge pages were not requested, or were requested and the system
+   *  large page size equals PAGE_SIZE_2M; returns false otherwise.
+   *  'verbose' is not used by this implementation. */
+  bool os_init(bool hugepages, bool verbose) 
+  {
+    Lock<MutexSys> lock(os_init_mutex);
+
+    /* GetLargePageMinimum() is only queried when huge pages were asked for */
+    const bool usable = hugepages && GetLargePageMinimum() == PAGE_SIZE_2M;
+    huge_pages_enabled = usable;
+
+    /* declining huge pages always counts as success */
+    if (!hugepages)
+      return true;
+    return usable;
+  }
+
+  /*! Reserves and commits 'bytes' bytes of read/write memory through VirtualAlloc; 'hugepages' reports
+   *  whether large pages back the block. Zero bytes yields nullptr; a failed 4k allocation aborts. */
+  void* os_malloc(size_t bytes, bool& hugepages)
+  {
+    if (bytes == 0) {
+      hugepages = false;
+      return nullptr;
+    }
+
+    const int commit = MEM_COMMIT | MEM_RESERVE;
+    /* try direct huge page allocation first */
+    if (isHugePageCandidate(bytes)) {
+      void* const large = VirtualAlloc(nullptr,bytes,commit | MEM_LARGE_PAGES,PAGE_READWRITE);
+      if (large != nullptr) {
+        hugepages = true;
+        return large;
+      }
+    }
+
+    /* fall back to 4k pages */
+    void* const block = VirtualAlloc(nullptr,bytes,commit,PAGE_READWRITE);
+    // -- GODOT start --
+    // if (ptr == nullptr) throw std::bad_alloc();
+    if (block == nullptr) abort();
+    // -- GODOT end --
+    hugepages = false;
+    return block;
+  }
+
+  /*! Decommits the tail of a block so that only bytesNew bytes, rounded up to 4k pages, stay
+   *  committed. Returns the size that remains; aborts if VirtualFree fails. */
+  size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) 
+  {
+    if (hugepages) // decommitting huge pages seems not to work under Windows
+      return bytesOld;
+
+    /* only 4k-page blocks get here, so both sizes are rounded up to 4k multiples */
+    const size_t pageMask = size_t(PAGE_SIZE_4K)-1;
+    const size_t keep  = (bytesNew+pageMask) & ~pageMask;
+    const size_t total = (bytesOld+pageMask) & ~pageMask;
+    if (keep >= total) return total;
+
+    // -- GODOT start --
+    // throw std::bad_alloc();
+    if (!VirtualFree((char*)ptr+keep,total-keep,MEM_DECOMMIT)) abort();
+    // -- GODOT end --
+    return keep;
+  }
+
+  void os_free(void* ptr, size_t bytes, bool hugepages) 
+  {
+    /* a zero-sized block is never passed to VirtualFree */
+    if (bytes == 0) return;
+
+    const BOOL released = VirtualFree(ptr,0,MEM_RELEASE);
+    // -- GODOT start --
+    // throw std::bad_alloc();
+    if (!released) abort();
+    // -- GODOT end --
+  }
+
+  void os_advise(void *ptr, size_t bytes) // Windows variant: the body is empty, both arguments are ignored
+  {
+  }
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__)
+
+#include <sys/mman.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sstream>
+
+#if defined(__MACOSX__)
+#include <mach/vm_statistics.h>
+#endif
+
+namespace embree
+{
+  bool os_init(bool hugepages, bool verbose) 
+  {
+    Lock<MutexSys> lock(os_init_mutex);
+
+    if (!hugepages) {
+      huge_pages_enabled = false;
+      return true;
+    }
+
+#if defined(__LINUX__)
+    /* huge page size in bytes as read from /proc/meminfo; stays 0 when no usable entry is found */
+    unsigned long long hugepagesize = 0;
+    std::ifstream file; 
+    file.open("/proc/meminfo",std::ios::in);
+    if (!file.is_open()) {
+      if (verbose) std::cout << "WARNING: Could not open /proc/meminfo. Huge page support cannot get enabled!" << std::endl;
+      huge_pages_enabled = false;
+      return false;
+    }
+    
+    std::string line;
+    while (getline(file,line))
+    {
+      std::stringstream sline(line);
+      while (!sline.eof() && sline.peek() == ' ') sline.ignore();
+      std::string tag; getline(sline,tag,' ');
+      while (!sline.eof() && sline.peek() == ' ') sline.ignore();
+      std::string val; getline(sline,val,' ');
+      while (!sline.eof() && sline.peek() == ' ') sline.ignore();
+      std::string unit; getline(sline,unit,' ');
+      if (tag == "Hugepagesize:" && unit == "kB") {
+        /* strtoull yields 0 on malformed text instead of throwing like std::stoi; the 64-bit product avoids the int overflow for sizes of 2GB and up */
+        hugepagesize = strtoull(val.c_str(),nullptr,10)*1024ull;
+        break;
+      }
+    }
+    
+    if (hugepagesize != (unsigned long long)PAGE_SIZE_2M) 
+    {
+      if (verbose) std::cout << "WARNING: Only 2MB huge pages supported. Huge page support cannot get enabled!" << std::endl;
+      huge_pages_enabled = false;
+      return false;
+    }
+#endif
+
+    huge_pages_enabled = true;
+    return true;
+  }
+
+  void* os_malloc(size_t bytes, bool& hugepages)
+  { 
+    if (bytes == 0) {
+      hugepages = false;
+      return nullptr;
+    }
+
+    const int prot  = PROT_READ | PROT_WRITE;
+    const int flags = MAP_PRIVATE | MAP_ANON;
+
+    /* a huge page mapping is attempted first when the request qualifies */
+    if (isHugePageCandidate(bytes)) 
+    {
+#if defined(__MACOSX__)
+      void* huge = mmap(0, bytes, prot, flags, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); // the superpage flag travels in the fd argument here
+#elif defined(MAP_HUGETLB)
+      void* huge = mmap(0, bytes, prot, flags | MAP_HUGETLB, -1, 0);
+#else
+      void* huge = MAP_FAILED; // no huge page mapping available on this platform
+#endif
+      if (huge != MAP_FAILED) {
+        hugepages = true;
+        return huge;
+      }
+    } 
+
+    /* otherwise map regular 4k pages; failure here aborts the process */
+    void* mem = mmap(0, bytes, prot, flags, -1, 0);
+    // -- GODOT start --
+    // if (ptr == MAP_FAILED) throw std::bad_alloc();
+    if (mem == MAP_FAILED) abort();
+    // -- GODOT end --
+    hugepages = false;
+    /* advise huge page hint for THP */
+    os_advise(mem,bytes);
+    return mem;
+  }
+
+  /*! Unmaps the tail of a mapping so that bytesNew bytes, rounded up to the page size in use,
+   *  remain. Returns the size that stays mapped; aborts if munmap fails. */
+  size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) 
+  {
+    const size_t pageMask = size_t(hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K)-1;
+    const size_t keep  = (bytesNew+pageMask) & ~pageMask;
+    const size_t total = (bytesOld+pageMask) & ~pageMask;
+    if (keep >= total) return total;
+
+    char* const tail = (char*)ptr+keep;
+    // -- GODOT start --
+    // throw std::bad_alloc();
+    if (munmap(tail,total-keep) == -1) abort();
+    // -- GODOT end --
+    return keep;
+  }
+
+  /*! Unmaps 'bytes' bytes (rounded up to the page size in use) at ptr; zero bytes is a no-op. */
+  void os_free(void* ptr, size_t bytes, bool hugepages) 
+  {
+    if (bytes == 0) return;
+
+    /* for hugepages we need to also align the size */
+    const size_t pageMask = size_t(hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K)-1;
+    const size_t mapped = (bytes+pageMask) & ~pageMask;
+
+    // -- GODOT start --
+    // throw std::bad_alloc();
+    if (munmap(ptr,mapped) == -1) abort();
+    // -- GODOT end --
+  }
+
+  /* hint for transparent huge pages (THP); best effort, the result of madvise is not checked */
+  void os_advise(void* pptr, size_t bytes)
+  {
+#if defined(MADV_HUGEPAGE)
+    madvise(pptr,bytes,MADV_HUGEPAGE); // only compiled where the platform headers define MADV_HUGEPAGE
+#endif
+  }
+}
+
+#endif
diff --git a/thirdparty/embree-aarch64/common/sys/alloc.h b/thirdparty/embree-aarch64/common/sys/alloc.h
new file mode 100644
index 0000000000..5898ecda70
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/alloc.h
@@ -0,0 +1,164 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include <vector>
+#include <set>
+
+namespace embree
+{
+#define ALIGNED_STRUCT_(align) /* class-scope new/delete, scalar and array forms, routed to alignedMalloc/alignedFree */ \
+  void* operator new(size_t size) { return alignedMalloc(size,align); } \
+  void operator delete(void* ptr) { alignedFree(ptr); }                 \
+  void* operator new[](size_t size) { return alignedMalloc(size,align); } \
+  void operator delete[](void* ptr) { alignedFree(ptr); }
+
+#define ALIGNED_CLASS_(align) /* ALIGNED_STRUCT_ operators declared public, after which access is switched to private */ \
+ public:                                                               \
+    ALIGNED_STRUCT_(align)                                              \
+ private:
+  
+  /*! aligned allocation (defined in alloc.cpp on top of _mm_malloc/_mm_free) */
+  void* alignedMalloc(size_t size, size_t align); // returns nullptr for size 0, aborts the process if the allocation fails
+  void alignedFree(void* ptr); // a null ptr is ignored
+  
+  /*! allocator that performs aligned allocations */
+  template<typename T, size_t alignment>
+    struct aligned_allocator
+    {
+      using value_type      = T;
+      using pointer         = T*;
+      using const_pointer   = const T*;
+      using reference       = T&;
+      using const_reference = const T&;
+      using size_type       = std::size_t;
+      using difference_type = std::ptrdiff_t;
+      /*! raw storage for n objects, obtained from alignedMalloc with the class alignment */
+      __forceinline pointer allocate( size_type n ) {
+        return static_cast<pointer>(alignedMalloc(n*sizeof(value_type),alignment));
+      }
+      /*! hands the storage to alignedFree; the element count is not needed */
+      __forceinline void deallocate( pointer p, size_type n ) {
+        alignedFree(p);
+      }
+      /*! copy-constructs val into the storage at p */
+      __forceinline void construct( pointer p, const_reference val ) {
+        new (p) T(val);
+      }
+      /*! runs the destructor of the object at p; the storage itself is kept */
+      __forceinline void destroy( pointer p ) {
+        p->~T();
+      }
+    };
+
+  /*! allocates pages directly from OS (implemented per platform in alloc.cpp) */
+  bool win_enable_selockmemoryprivilege(bool verbose); // defined in the _WIN32 section of alloc.cpp only
+  bool os_init(bool hugepages, bool verbose); // switches huge page use on or off; false if huge pages were requested but are unavailable
+  void* os_malloc (size_t bytes, bool& hugepages); // hugepages is an output: whether huge pages back the returned block
+  size_t os_shrink (void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages); // returns the size that remains allocated
+  void  os_free   (void* ptr, size_t bytes, bool hugepages); // does nothing for bytes == 0
+  void  os_advise (void* ptr, size_t bytes); // transparent huge page hint on Unix, empty on Windows
+
+  /*! allocator that performs OS allocations */
+  template<typename T>
+    struct os_allocator
+    {
+      using value_type      = T;
+      using pointer         = T*;
+      using const_pointer   = const T*;
+      using reference       = T&;
+      using const_reference = const T&;
+      using size_type       = std::size_t;
+      using difference_type = std::ptrdiff_t;
+
+      __forceinline os_allocator () : hugepages(false) {}
+
+      /*! storage for n objects from os_malloc, which also updates the hugepages flag */
+      __forceinline pointer allocate( size_type n ) {
+        return static_cast<pointer>(os_malloc(n*sizeof(value_type),hugepages));
+      }
+      /*! returns the storage through os_free, passing the current hugepages flag */
+      __forceinline void deallocate( pointer p, size_type n ) {
+        os_free(p,n*sizeof(value_type),hugepages);
+      }
+      /*! copy-constructs val into the storage at p */
+      __forceinline void construct( pointer p, const_reference val ) {
+        new (p) T(val);
+      }
+      /*! runs the destructor of the object at p; the storage itself is kept */
+      __forceinline void destroy( pointer p ) {
+        p->~T();
+      }
+
+      bool hugepages; //!< written by os_malloc in allocate(), read back in deallocate()
+    };
+
+  /*! allocator for IDs: hands out values of type T and reuses released ones first;
+   *  max_id bounds the values that allocate() and add() accept */
+  template<typename T, size_t max_id>
+    struct IDPool
+    {
+      typedef T value_type;
+
+      /*! starts with no released IDs and 0 as the first fresh ID */
+      IDPool ()
+      : nextID(0) {}
+
+      /*! Hands out an ID: the smallest released one if there is any, otherwise the
+       *  next never-used value. Returns -1 (converted to T) once nextID+1 would
+       *  exceed max_id. */
+      T allocate() 
+      {
+        /* recycle an ID from the free set first */
+        auto lowest = IDs.begin();
+        if (lowest != IDs.end()) 
+        {
+          const T recycled = *lowest;
+          IDs.erase(lowest);
+          return recycled;
+        } 
+
+        /* otherwise take a fresh ID, unless the range is used up */
+        if (size_t(nextID)+1 > max_id)
+          return -1;
+
+        return nextID++;
+      }
+
+      /*! Marks a caller-chosen ID as used. Returns false when the ID is above
+       *  max_id, or lies below nextID without being in the free set. */
+      bool add(T id)
+      {
+        if (id > max_id)
+          return false;
+        
+        /* an ID below nextID can only be taken while it sits in the free set */
+        if (id < nextID)
+          return IDs.erase(id) != 0;
+
+        /* otherwise every skipped value in [nextID,id) becomes a free ID */
+        for (T skipped=nextID; skipped<id; skipped++)
+          IDs.insert(skipped);
+
+        nextID = id+1;
+        return true;
+      }
+
+      /*! Puts an ID into the free set so that allocate() can hand it out again.
+       *  Releasing an unknown or already released ID is caught by the asserts only. */
+      void deallocate( T id ) 
+      {
+        assert(id < nextID);
+        /* .second is false when the ID was already in the free set */
+        MAYBE_UNUSED auto inserted = IDs.insert(id).second;
+        assert(inserted);
+      }
+
+    private:
+      std::set<T> IDs;   //!< stores deallocated IDs to be reused
+      T nextID;          //!< next ID to use when IDs vector is empty
+    };
+}
+
diff --git a/thirdparty/embree-aarch64/common/sys/array.h b/thirdparty/embree-aarch64/common/sys/array.h
new file mode 100644
index 0000000000..77722a39f6
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/array.h
@@ -0,0 +1,222 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include "alloc.h"
+
+namespace embree
+{
+  /*! static array with static size: thin wrapper around a plain T[N] */
+  template<typename T, size_t N>
+    class array_t
+    {
+    public:
+
+      /********************** Iterators  ****************************/
+      /* NOTE(review): these const members return a non-const T*; looks like that only compiles while they stay uninstantiated -- confirm */
+      __forceinline T* begin() const { return items; };
+      __forceinline T* end  () const { return items+N; };
+
+
+      /********************** Capacity ****************************/
+      /* the size is the compile time constant N, so size and max_size coincide */
+      __forceinline bool   empty    () const { return N == 0; }
+      __forceinline size_t size     () const { return N; }
+      __forceinline size_t max_size () const { return N; }
+            
+
+      /******************** Element access **************************/
+      /* operator[] and at() are identical: the index is only checked by assert */
+      __forceinline       T& operator[](size_t i)       { assert(i < N); return items[i]; }
+      __forceinline const T& operator[](size_t i) const { assert(i < N); return items[i]; }
+
+      __forceinline       T& at(size_t i)       { assert(i < N); return items[i]; }
+      __forceinline const T& at(size_t i) const { assert(i < N); return items[i]; }
+      /* NOTE(review): same const/non-const mismatch as begin()/end() -- confirm */
+      __forceinline T& front() const { assert(N > 0); return items[0]; };
+      __forceinline T& back () const { assert(N > 0); return items[N-1]; };
+
+      __forceinline       T* data()       { return items; };
+      __forceinline const T* data() const { return items; };
+
+    private:
+      T items[N];
+    };
+
+  /*! static array with dynamic size: storage for N elements of which the first M are in use */
+  template<typename T, size_t N>
+    class darray_t
+    {
+    public:
+
+      __forceinline darray_t () : M(0) {}
+      /*! fills the whole storage with v, but the array still starts out empty (M == 0) */
+      __forceinline darray_t (const T& v) : M(0) {
+        for (size_t i=0; i<N; i++) items[i] = v;
+      }
+
+      /********************** Iterators  ****************************/
+
+      __forceinline T* begin() const { return items; };
+      __forceinline T* end  () const { return items+M; };
+
+
+      /********************** Capacity ****************************/
+
+      __forceinline bool   empty    () const { return M == 0; }
+      __forceinline size_t size     () const { return M; }
+      __forceinline size_t capacity () const { return N; }
+      __forceinline size_t max_size () const { return N; }
+      /*! sets the number of used elements; all N slots may be in use, hence <= */
+      void resize(size_t new_size) {
+        assert(new_size <= max_size());
+        M = new_size;
+      }
+
+      /******************** Modifiers **************************/
+
+      __forceinline void push_back(const T& v) 
+      {
+        assert(M < max_size());  // there must be one free slot; the last slot (index N-1) is usable
+        items[M++] = v;
+      }
+
+      __forceinline void pop_back() 
+      {
+        assert(!empty());
+        M--;
+      }
+
+      __forceinline void clear() {
+        M = 0;
+      }
+
+      /******************** Element access **************************/
+
+      __forceinline       T& operator[](size_t i)       { assert(i < M); return items[i]; }
+      __forceinline const T& operator[](size_t i) const { assert(i < M); return items[i]; }
+
+      __forceinline       T& at(size_t i)       { assert(i < M); return items[i]; }
+      __forceinline const T& at(size_t i) const { assert(i < M); return items[i]; }
+
+      __forceinline T& front() const { assert(M > 0); return items[0]; };
+      __forceinline T& back () const { assert(M > 0); return items[M-1]; };
+
+      __forceinline       T* data()       { return items; };
+      __forceinline const T* data() const { return items; };
+
+    private:
+      size_t M;     //!< number of elements currently in use
+      T items[N];   //!< fixed storage for up to N elements
+    };
+
+  /*! Array of N elements that lives in an embedded buffer when N*sizeof(Ty) fits into max_stack_bytes and in an aligned heap block otherwise. */
+#define dynamic_large_stack_array(Ty,Name,N,max_stack_bytes) StackArray<Ty,max_stack_bytes> Name(N)
+  template<typename Ty, size_t max_stack_bytes>
+    struct __aligned(64) StackArray
+  {
+    __forceinline StackArray (const size_t N)
+      : N(N)
+    {
+      /* small requests use the embedded buffer, larger ones a 64 byte aligned heap block */
+      const bool fitsInline = N*sizeof(Ty) <= max_stack_bytes;
+      if (fitsInline) data = arr;
+      else            data = (Ty*) alignedMalloc(N*sizeof(Ty),64);
+    }
+
+    __forceinline ~StackArray () {
+      if (data != arr) alignedFree(data);  // only the heap block has to be released
+    }
+    /* implicit conversions to the element pointer */
+    __forceinline operator       Ty* ()       { return data; }
+    __forceinline operator const Ty* () const { return data; }
+    /* element access for the common index types; bounds are only checked by assert */
+    __forceinline       Ty& operator[](const int i)       { assert(i>=0 && i<N); return data[i]; }
+    __forceinline const Ty& operator[](const int i) const { assert(i>=0 && i<N); return data[i]; }
+
+    __forceinline       Ty& operator[](const unsigned i)       { assert(i<N); return data[i]; }
+    __forceinline const Ty& operator[](const unsigned i) const { assert(i<N); return data[i]; }
+
+#if defined(__X86_64__) || defined(__aarch64__)
+    __forceinline       Ty& operator[](const size_t i)       { assert(i<N); return data[i]; }
+    __forceinline const Ty& operator[](const size_t i) const { assert(i<N); return data[i]; }
+#endif
+
+  private:
+    Ty arr[max_stack_bytes/sizeof(Ty)];  //!< embedded buffer
+    Ty* data;                            //!< points at arr or at the heap block
+    size_t N;                            //!< number of elements requested at construction
+
+  private:
+    StackArray (const StackArray& other) DELETED; // do not implement
+    StackArray& operator= (const StackArray& other) DELETED; // do not implement
+
+  };
+
+  /*! Array that starts in an embedded buffer of max_stack_elements and switches to a heap buffer of max_total_elements once a larger index is accessed. */
+  template<typename Ty, size_t max_stack_elements, size_t max_total_elements>
+    struct __aligned(64) DynamicStackArray
+  {
+    __forceinline DynamicStackArray ()
+      : data(&arr[0]) {}
+    /*! only the heap buffer has to be released; the embedded buffer is part of the object */
+    __forceinline ~DynamicStackArray ()
+    {
+      if (!isStackAllocated())
+        delete[] data;
+    }
+    /*! true while data still points at the embedded buffer arr */
+    __forceinline bool isStackAllocated() const {
+      return data == &arr[0];
+    }
+    /*! returns the capacity of the buffer currently in use, not a count of written elements */
+    __forceinline size_t size() const
+    {
+      if (isStackAllocated()) return max_stack_elements;
+      else return max_total_elements;
+    }
+    /*! makes sure M elements are addressable, moving to the heap buffer if M exceeds max_stack_elements */
+    __forceinline void resize(size_t M)
+    {
+      assert(M <= max_total_elements);
+      if (likely(M <= max_stack_elements)) return;  // embedded buffer is large enough
+      if (likely(!isStackAllocated())) return;      // already moved to the heap buffer
+
+      data = new Ty[max_total_elements];
+      /* carry over everything stored in the embedded buffer so far */
+      for (size_t i=0; i<max_stack_elements; i++)
+        data[i] = arr[i];
+    }
+
+    __forceinline operator       Ty* ()       { return data; }
+    __forceinline operator const Ty* () const { return data; }
+    /* indexing calls resize(i+1) first, so an access beyond max_stack_elements triggers the move to the heap */
+    __forceinline       Ty& operator[](const int i)      { assert(i>=0 && i<max_total_elements); resize(i+1); return data[i]; }
+    __forceinline       Ty& operator[](const unsigned i) { assert(i<max_total_elements); resize(i+1); return data[i]; }
+
+#if defined(__X86_64__) || defined(__aarch64__)
+    __forceinline       Ty& operator[](const size_t i) { assert(i<max_total_elements); resize(i+1); return data[i]; }
+#endif
+    /*! copies other.size() elements one by one through operator[] */
+    __forceinline DynamicStackArray (const DynamicStackArray& other)
+      : data(&arr[0]) 
+    {
+      for (size_t i=0; i<other.size(); i++)
+        this->operator[] (i) = other[i];
+    }
+     
+    DynamicStackArray& operator= (const DynamicStackArray& other)
+    {
+      for (size_t i=0; i<other.size(); i++)
+        this->operator[] (i) = other[i];
+
+      return *this;
+    }
+
+  private:
+    Ty arr[max_stack_elements];
+    Ty* data;
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/sys/atomic.h b/thirdparty/embree-aarch64/common/sys/atomic.h
new file mode 100644
index 0000000000..ebfb8552c3
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/atomic.h
@@ -0,0 +1,59 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <atomic>
+#include "intrinsics.h"
+
+namespace embree
+{
+/* compiler memory barriers: __memory_barrier() is defined per compiler below (no definition is provided here for the Intel compiler) */
+#if defined(__INTEL_COMPILER)
+//#define __memory_barrier() __memory_barrier()
+#elif defined(__GNUC__) || defined(__clang__)
+#  define __memory_barrier() asm volatile("" ::: "memory")
+#elif  defined(_MSC_VER)
+#  define __memory_barrier() _ReadWriteBarrier()
+#endif
+
+  template <typename T>
+    struct atomic : public std::atomic<T>
+  {
+    atomic () {}
+    /*! initializes the atomic with the value a */
+    atomic (const T& a)
+      : std::atomic<T>(a) {}
+    /*! copy construction: stores the value currently held by a */
+    atomic (const atomic<T>& a) {
+      std::atomic<T>::store(a.load());
+    }
+    /*! assignment: stores the value currently held by other */
+    atomic& operator=(const atomic<T>& other) {
+      std::atomic<T>::store(other.load());
+      return *this;
+    }
+  };
+  /*! atomically lowers aref to bref if bref is smaller than the value stored in aref */
+  template<typename T>
+    __forceinline void atomic_min(std::atomic<T>& aref, const T& bref)
+  {
+    const T b = bref;  // bref is a plain T, not an atomic, so it is copied directly
+    while (true) {
+      T a = aref.load();
+      if (a <= b) break;  // stored value is already small enough
+      if (aref.compare_exchange_strong(a,b)) break;  // otherwise retry: aref changed concurrently
+    }
+  }
+  /*! atomically raises aref to bref if bref is larger than the value stored in aref */
+  template<typename T>
+    __forceinline void atomic_max(std::atomic<T>& aref, const T& bref)
+  {
+    const T b = bref;  // bref is a plain T, not an atomic, so it is copied directly
+    while (true) {
+      T a = aref.load();
+      if (a >= b) break;  // stored value is already large enough
+      if (aref.compare_exchange_strong(a,b)) break;  // otherwise retry: aref changed concurrently
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/sys/barrier.cpp b/thirdparty/embree-aarch64/common/sys/barrier.cpp
new file mode 100644
index 0000000000..0061d18db2
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/barrier.cpp
@@ -0,0 +1,289 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "barrier.h"
+#include "condition.h"
+#include "regression.h"
+#include "thread.h"
+
+#if defined (__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+  struct BarrierSysImplementation  // Windows barrier built from two event objects and atomic counters
+  {
+    __forceinline BarrierSysImplementation (size_t N) 
+      : i(0), enterCount(0), exitCount(0), barrierSize(0) 
+    {
+      events[0] = CreateEvent(nullptr, TRUE, FALSE, nullptr);
+      events[1] = CreateEvent(nullptr, TRUE, FALSE, nullptr);
+      init(N);
+    }
+    /*! closes both event handles */
+    __forceinline ~BarrierSysImplementation ()
+    {
+      CloseHandle(events[0]);
+      CloseHandle(events[1]);
+    }
+    /*! sets the number of participating threads and re-arms both counters with it */
+    __forceinline void init(size_t N) 
+    {
+      barrierSize = N;
+      enterCount.store(N);
+      exitCount.store(N);
+    }
+    /*! blocks until barrierSize threads have called wait() */
+    __forceinline void wait()
+    {
+      /* every thread entering the barrier decrements this count */
+      size_t i0 = i;  // event index of this round; the last thread flips i for the next round
+      size_t cnt0 = enterCount--;
+
+      /* all threads except the last one wait in the barrier */
+      if (cnt0 > 1) 
+      {
+        if (WaitForSingleObject(events[i0], INFINITE) != WAIT_OBJECT_0)
+          THROW_RUNTIME_ERROR("WaitForSingleObjects failed");
+      }
+      
+      /* the last thread starts all threads waiting at the barrier */
+      else 
+      {
+        i = 1-i;
+        enterCount.store(barrierSize);
+        if (SetEvent(events[i0]) == 0)
+          THROW_RUNTIME_ERROR("SetEvent failed");
+      }
+
+      /* every thread leaving the barrier decrements this count */
+      size_t cnt1 = exitCount--;
+
+      /* the last thread that left the barrier resets the event again */
+      if (cnt1 == 1) 
+      {
+        exitCount.store(barrierSize);
+        if (ResetEvent(events[i0]) == 0)
+          THROW_RUNTIME_ERROR("ResetEvent failed");
+      }
+    }
+
+  public:
+    HANDLE events[2];          //!< one event per round parity, selected by i
+    atomic<size_t> i;          //!< index of the event used by the current round
+    atomic<size_t> enterCount; //!< threads still expected to enter the current round
+    atomic<size_t> exitCount;  //!< threads still expected to leave the current round
+    size_t barrierSize;        //!< number of threads per round
+  };
+}
+
+#else
+
+namespace embree
+{
+  struct BarrierSysImplementation
+  {
+    __forceinline BarrierSysImplementation (size_t N) 
+      : count(0), barrierSize(0), generation(0)
+    {
+      init(N);
+    }
+    /*! sets the number of threads per round; expects that no thread is inside wait() (asserted via count == 0) */
+    __forceinline void init(size_t N) 
+    {
+      assert(count == 0);
+      count = 0;
+      barrierSize = N;
+    }
+    /*! blocks the caller until barrierSize threads have entered wait() */
+    __forceinline void wait()
+    {
+      mutex.lock();
+      const size_t myGeneration = generation;
+      count++;
+      if (count == barrierSize) {
+        count = 0;
+        generation = myGeneration+1;  // marks this round as released before waking the waiters
+        cond.notify_all();
+      } else {
+        /* a condition wait may return spuriously: only leave once this round was released */
+        while (generation == myGeneration) cond.wait(mutex);
+      }
+      mutex.unlock();
+    }
+
+  public:
+    MutexSys mutex;
+    ConditionSys cond;
+    volatile size_t count;        //!< threads that have arrived in the current round
+    volatile size_t barrierSize;  //!< number of threads per round
+    volatile size_t generation;   //!< incremented each time a round is released; guarded by mutex
+  };
+}
+
+#endif
+
+namespace embree
+{
+  BarrierSys::BarrierSys (size_t N) {
+    opaque = static_cast<void*>(new BarrierSysImplementation(N));
+  }
+  /* the platform specific implementation is owned through the opaque pointer */
+  BarrierSys::~BarrierSys () {
+    delete static_cast<BarrierSysImplementation*>(opaque);
+  }
+  /* forwards to the platform specific implementation */
+  void BarrierSys::init(size_t count) {
+    static_cast<BarrierSysImplementation*>(opaque)->init(count);
+  }
+  /* forwards to the platform specific implementation */
+  void BarrierSys::wait() {
+    static_cast<BarrierSysImplementation*>(opaque)->wait();
+  }
+
+  LinearBarrierActive::LinearBarrierActive (size_t N) 
+    : count0(nullptr), count1(nullptr), mode(0), flag0(0), flag1(0), threadCount(0)
+  { 
+    /* a thread count of zero selects the number of logical threads */
+    init(N == 0 ? getNumberOfLogicalThreads() : N);
+  }
+
+  LinearBarrierActive::~LinearBarrierActive() 
+  {
+    delete[] count1;
+    delete[] count0;
+  }
+
+  void LinearBarrierActive::init(size_t N) 
+  {
+    /* the per-thread counters are only reallocated when the thread count changes */
+    const bool resized = threadCount != N;
+    if (resized) {
+      threadCount = N;
+      delete[] count0; count0 = new unsigned char[N];
+      delete[] count1; count1 = new unsigned char[N];
+    }
+    mode = 0; flag0 = 0; flag1 = 0;
+    for (size_t t=0; t<N; t++) count0[t] = 0;
+    for (size_t t=0; t<N; t++) count1[t] = 0;
+  }
+  /*! Spinning barrier: thread 0 waits for the counters of all other threads and then releases them through a flag; the two phases (mode 0/1) alternate. */
+  void LinearBarrierActive::wait (const size_t threadIndex)
+  {
+    if (mode == 0)
+    {
+      if (threadIndex == 0)
+      {
+        for (size_t i=0; i<threadCount; i++)
+          count1[i] = 0;
+        /* spin until every other thread has announced its arrival in count0 */
+        for (size_t i=1; i<threadCount; i++)
+        {
+          while (likely(count0[i] == 0)) 
+            pause_cpu();
+        }
+        mode  = 1;
+        flag1 = 0;
+        __memory_barrier();
+        flag0 = 1;
+      }
+      else
+      {
+        count0[threadIndex] = 1;
+        {
+          while (likely(flag0 == 0))
+            pause_cpu();
+        }
+        
+      }
+    }
+    else
+    {
+      if (threadIndex == 0)
+      {
+        for (size_t i=0; i<threadCount; i++)
+          count0[i] = 0;
+        /* spin until every other thread has announced its arrival in count1 */
+        for (size_t i=1; i<threadCount; i++)
+        {
+          while (likely(count1[i] == 0))
+            pause_cpu();
+        }
+        /* switch to the other phase, clear its flag, then raise the flag of this phase */
+        mode  = 0;
+        flag0 = 0;
+        __memory_barrier();
+        flag1 = 1;
+      }
+      else
+      {
+        count1[threadIndex] = 1;
+        {
+          while (likely(flag1 == 0))
+            pause_cpu();
+        }
+      }
+    }
+  }
+  /*! Regression test for BarrierSys: worker threads write their index between two barrier waits and the main thread checks the results. */
+  struct barrier_sys_regression_test : public RegressionTest
+  {
+    BarrierSys barrier;
+    std::atomic<size_t> threadID;
+    std::atomic<size_t> numFailed;
+    std::vector<size_t> threadResults;
+
+    barrier_sys_regression_test() 
+      : RegressionTest("barrier_sys_regression_test"), threadID(0), numFailed(0)
+    {
+      registerRegressionTest(this);
+    }
+    /*! worker body: takes the next thread index, then writes it to threadResults 1000 times, each write enclosed by two barrier waits */
+    static void thread_alloc(barrier_sys_regression_test* This)
+    {
+      size_t tid = This->threadID++;
+      for (size_t j=0; j<1000; j++)
+      {
+        This->barrier.wait();
+        This->threadResults[tid] = tid;
+        This->barrier.wait();
+      }
+    }
+    /*! returns true if no mismatch was counted */
+    bool run ()
+    {
+      threadID.store(0);
+      numFailed.store(0);
+
+      size_t numThreads = getNumberOfLogicalThreads();
+      threadResults.resize(numThreads);
+      barrier.init(numThreads+1);  // the workers plus the calling thread take part in the barrier
+
+      /* create threads */
+      std::vector<thread_t> threads;
+      for (size_t i=0; i<numThreads; i++)
+        threads.push_back(createThread((thread_func)thread_alloc,this));
+
+      /* run test: clear the results, let the workers write between the two waits, then compare */ 
+      for (size_t i=0; i<1000; i++)
+      {
+        for (size_t i=0; i<numThreads; i++) threadResults[i] = 0;  // inner i shadows the round counter
+        barrier.wait();
+        barrier.wait();
+        for (size_t i=0; i<numThreads; i++) numFailed += threadResults[i] != i;
+      }
+
+      /* destroy threads */
+      for (size_t i=0; i<numThreads; i++)
+        join(threads[i]);
+
+      return numFailed == 0;
+    }
+  };
+
+  barrier_sys_regression_test barrier_sys_regression_test;
+}
+
+
diff --git a/thirdparty/embree-aarch64/common/sys/barrier.h b/thirdparty/embree-aarch64/common/sys/barrier.h
new file mode 100644
index 0000000000..89607b8685
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/barrier.h
@@ -0,0 +1,112 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "intrinsics.h"
+#include "sysinfo.h"
+#include "atomic.h"
+
+namespace embree
+{
+  /*! system barrier implemented with operating system primitives; the implementation is hidden behind an opaque pointer */
+  class BarrierSys
+  {
+  public:
+
+    /*! construction / destruction */
+    BarrierSys (size_t N = 0);
+    ~BarrierSys ();
+
+  private:
+    /*! class is non-copyable */
+    BarrierSys (const BarrierSys& other) DELETED; // do not implement
+    BarrierSys& operator= (const BarrierSys& other) DELETED; // do not implement
+
+  public:
+    /*! initializes the barrier with some number of threads */
+    void init(size_t count);
+
+    /*! lets calling thread wait in barrier */
+    void wait();
+
+  private:
+    void* opaque;  //!< pointer to the platform specific implementation object
+  };
+
+  /*! fast active (spinning) barrier using an atomic counter */
+  struct BarrierActive 
+  {
+  public:
+    BarrierActive () 
+      : cntr(0) {}
+    /*! sets the counter back to 0; wait() itself never resets it */
+    void reset() {
+      cntr.store(0);
+    }
+    /*! increments the counter and spins until it equals numThreads */
+    void wait (size_t numThreads) 
+    {
+      cntr++;
+      while (cntr.load() != numThreads) 
+        pause_cpu();
+    }
+
+  private:
+    std::atomic<size_t> cntr;  //!< number of threads that have arrived since the last reset
+  };
+
+  /*! fast active barrier that does not require initialization to some number of threads */
+  struct BarrierActiveAutoReset
+  {
+  public:
+    BarrierActiveAutoReset () 
+      : cntr0(0), cntr1(0) {}
+    /*! four spin phases: both counters are counted up to threadCount and then back down to 0, which leaves them reset */
+    void wait (size_t threadCount) 
+    {
+      cntr0.fetch_add(1);
+      while (cntr0 != threadCount) pause_cpu();
+      cntr1.fetch_add(1);
+      while (cntr1 != threadCount) pause_cpu();
+      cntr0.fetch_add(-1);  // -1 converts to the largest size_t, so the unsigned addition wraps and decrements
+      while (cntr0 != 0) pause_cpu();
+      cntr1.fetch_add(-1);
+      while (cntr1 != 0) pause_cpu();
+    }
+
+  private:
+    std::atomic<size_t> cntr0;
+    std::atomic<size_t> cntr1;
+  };
+  /*! active (spinning) barrier that is driven by per-thread counters and two flags */
+  class LinearBarrierActive
+  {
+  public:
+
+    /*! construction and destruction */
+    LinearBarrierActive (size_t threadCount = 0);
+    ~LinearBarrierActive();
+    
+  private:
+    /*! class is non-copyable */
+    LinearBarrierActive (const LinearBarrierActive& other) DELETED; // do not implement
+    LinearBarrierActive& operator= (const LinearBarrierActive& other) DELETED; // do not implement
+
+  public:
+    /*! initializes the barrier with some number of threads */
+    void init(size_t threadCount);
+    
+    /*! thread with threadIndex waits in the barrier */
+    void wait (const size_t threadIndex);
+    
+  private:
+    volatile unsigned char* count0;
+    volatile unsigned char* count1; 
+    volatile unsigned int mode;
+    volatile unsigned int flag0;
+    volatile unsigned int flag1;
+    volatile size_t threadCount;
+  };
+}
+
diff --git a/thirdparty/embree-aarch64/common/sys/condition.cpp b/thirdparty/embree-aarch64/common/sys/condition.cpp
new file mode 100644
index 0000000000..0e7ca7af39
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/condition.cpp
@@ -0,0 +1,81 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "condition.h"
+
+#if defined(__WIN32__) && !defined(PTHREADS_WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+  struct ConditionImplementation  // condition variable based on the Windows CONDITION_VARIABLE API
+  {
+    __forceinline ConditionImplementation () {
+      InitializeConditionVariable(&cond);
+    }
+    /* nothing is released here */
+    __forceinline ~ConditionImplementation () {
+    }
+    /*! waits on the condition without timeout; mutex_in.mutex is passed on as a critical section and the return value is ignored */
+    __forceinline void wait(MutexSys& mutex_in) {
+      SleepConditionVariableCS(&cond, (LPCRITICAL_SECTION)mutex_in.mutex, INFINITE);
+    }
+    /*! wakes all threads waiting on the condition */
+    __forceinline void notify_all() {
+      WakeAllConditionVariable(&cond);
+    }
+
+  public:
+    CONDITION_VARIABLE cond;
+  };
+}
+#endif
+
+#if defined(__UNIX__) || defined(PTHREADS_WIN32)
+#include <pthread.h>
+namespace embree
+{
+  struct ConditionImplementation  // condition variable based on pthread_cond_t; return codes are ignored throughout
+  {
+    __forceinline ConditionImplementation () { 
+      pthread_cond_init(&cond,nullptr); 
+    }
+    
+    __forceinline ~ConditionImplementation() { 
+      pthread_cond_destroy(&cond);
+    } 
+    /*! waits on the condition; mutex.mutex is passed on as the pthread mutex */
+    __forceinline void wait(MutexSys& mutex) { 
+      pthread_cond_wait(&cond, (pthread_mutex_t*)mutex.mutex); 
+    }
+    /*! wakes all threads waiting on the condition */
+    __forceinline void notify_all() { 
+      pthread_cond_broadcast(&cond); 
+    }
+    
+  public:
+    pthread_cond_t cond;
+  };
+}
+#endif
+
+namespace embree 
+{
+  ConditionSys::ConditionSys () { 
+    cond = static_cast<void*>(new ConditionImplementation);
+  }
+  /* the platform specific implementation is owned through the cond pointer */
+  ConditionSys::~ConditionSys() { 
+    delete static_cast<ConditionImplementation*>(cond);
+  }
+  /* forwards to the platform specific implementation */
+  void ConditionSys::wait(MutexSys& mutex) { 
+    static_cast<ConditionImplementation*>(cond)->wait(mutex);
+  }
+  /* forwards to the platform specific implementation */
+  void ConditionSys::notify_all() { 
+    static_cast<ConditionImplementation*>(cond)->notify_all();
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/sys/condition.h b/thirdparty/embree-aarch64/common/sys/condition.h
new file mode 100644
index 0000000000..7a3a05aa81
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/condition.h
@@ -0,0 +1,31 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "mutex.h"
+
+namespace embree
+{
+  class ConditionSys  // condition variable whose platform specific implementation is hidden behind the cond pointer
+  {
+  public:
+    ConditionSys();
+    ~ConditionSys();
+    void wait( class MutexSys& mutex );  // single wait on the condition using the given mutex
+    void notify_all();                   // notifies all waiting threads
+    /*! waits repeatedly until pred() returns true; pred is checked before the first wait */
+    template<typename Predicate>
+      __forceinline void wait( class MutexSys& mutex, const Predicate& pred )
+    {
+      while (!pred()) wait(mutex);
+    }
+
+  private:
+    ConditionSys (const ConditionSys& other) DELETED; // do not implement
+    ConditionSys& operator= (const ConditionSys& other) DELETED; // do not implement
+
+  protected:
+    void* cond;  //!< pointer to the platform specific implementation object
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/sys/filename.cpp b/thirdparty/embree-aarch64/common/sys/filename.cpp
new file mode 100644
index 0000000000..86182c1afb
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/filename.cpp
@@ -0,0 +1,138 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "filename.h"
+#include "sysinfo.h"
+
+namespace embree
+{
+#ifdef __WIN32__
+  const char path_sep = '\\';  // path separator used when __WIN32__ is defined
+#else
+  const char path_sep = '/';   // path separator used otherwise
+#endif
+
+  /*! create an empty filename */
+  FileName::FileName () {}
+
+  /*! Creates a filename from a C string: both kinds of slashes become path_sep and trailing separators are dropped. */
+  FileName::FileName (const char* in) {
+    filename = in;
+    for (char& c : filename)
+      if (c == '/' || c == '\\')
+        c = path_sep;
+    while (!filename.empty() && filename.back() == path_sep)
+      filename.pop_back();
+  }
+
+  /*! Creates a filename from a std::string: both kinds of slashes become path_sep and trailing separators are dropped. */
+  FileName::FileName (const std::string& in) {
+    filename = in;
+    for (char& c : filename)
+      if (c == '/' || c == '\\')
+        c = path_sep;
+    while (!filename.empty() && filename.back() == path_sep)
+      filename.pop_back();
+  }
+  
+  /*! Returns the home folder named by the environment (UserProfile or HOME); empty if the variable is not set. */
+  FileName FileName::homeFolder() 
+  {
+#ifdef __WIN32__
+    const char* const envName = "UserProfile";
+#else
+    const char* const envName = "HOME";
+#endif
+    const char* const home = getenv(envName);
+    return FileName(home ? home : "");
+  }
+
+  /*! returns the folder of the executable: the path() part of getExecutableFileName() */
+  FileName FileName::executableFolder() {
+    return FileName(getExecutableFileName()).path();
+  }
+
+  /*! Returns the directory part, i.e. everything before the last path separator. */
+  FileName FileName::path() const {
+    const std::string::size_type cut = filename.find_last_of(path_sep);
+    if (cut != std::string::npos) return FileName(filename.substr(0,cut));
+    return FileName();  // no separator, hence no directory part
+  }
+
+  /*! Returns the file part, i.e. everything after the last path separator. */
+  std::string FileName::base() const {
+    const std::string::size_type cut = filename.find_last_of(path_sep);
+    if (cut != std::string::npos) return filename.substr(cut+1);
+    return filename;  // no separator: the whole string is the basename
+  }
+
+  /*! returns the extension: the text after the last '.' of the basename, or "" if the basename has none */
+  std::string FileName::ext() const {
+    size_t pos = filename.find_last_of('.');
+    if (pos == std::string::npos || filename.find(path_sep, pos) != std::string::npos) return "";  // a '.' followed by a separator belongs to a directory
+    return filename.substr(pos+1);
+  }
+
+  /*! drops the extension: removes the last '.' of the basename and everything after it (unchanged if the basename has none) */
+  FileName FileName::dropExt() const {
+    size_t pos = filename.find_last_of('.');
+    if (pos == std::string::npos || filename.find(path_sep, pos) != std::string::npos) return filename;  // a '.' followed by a separator belongs to a directory
+    return filename.substr(0,pos);
+  }
+
+  /*! Returns the file name without directory and without extension. */
+  std::string FileName::name() const {
+    const size_t sep   = filename.find_last_of(path_sep);
+    const size_t first = (sep == std::string::npos) ? 0 : sep+1;
+    const size_t dot   = filename.find_last_of('.');
+    const bool hasExt  = dot != std::string::npos && dot >= first;
+    return filename.substr(first, (hasExt ? dot : filename.size()) - first);
+  }
+
+  /*! Replaces the extension of the basename; ext is appended verbatim, so it has to bring its own '.'. */
+  FileName FileName::setExt(const std::string& ext) const {
+    const size_t sep   = filename.find_last_of(path_sep);
+    const size_t first = (sep == std::string::npos) ? 0 : sep+1;
+    const size_t dot   = filename.find_last_of('.');
+    const bool hasExt  = dot != std::string::npos && dot >= first;
+    return FileName((hasExt ? filename.substr(0,dot) : filename) + ext);
+  }
+
+  /*! adds the extension: ext is appended verbatim to the filename (no '.' is inserted) */
+  FileName FileName::addExt(const std::string& ext) const {
+    return FileName(filename+ext);
+  }
+
+  /*! Joins two filenames to this/other; an empty left side yields other unchanged. */
+  FileName FileName::operator +( const FileName& other ) const {
+    if (!filename.empty()) return FileName(filename + path_sep + other.filename);
+    return FileName(other);
+  }
+
+  /*! Joins this filename and a string to this/other by converting the string to a FileName first. */
+  FileName FileName::operator +( const std::string& other ) const {
+    return this->operator+(FileName(other));
+  }
+
+  /*! removes the leading path 'base' from a filename; returns *this unchanged if the filename does not start with base at a path boundary */
+  FileName FileName::operator -( const FileName& base ) const {
+    const size_t n = base.filename.size();  // the match must cover all of base and end at a separator or at the end
+    if (n == 0 || filename.compare(0,n,base.filename) != 0 || (filename.size() > n && filename[n] != path_sep)) return *this;
+    return FileName(filename.size() > n ? filename.substr(n+1) : std::string());  // skip base and the separator after it
+  }
+
+  /*! == operator: two filenames are equal if their stored strings are equal */
+  bool operator== (const FileName& a, const FileName& b) {
+    return a.filename.compare(b.filename) == 0;
+  }
+  
+  /*! != operator: negation of the string equality */
+  bool operator!= (const FileName& a, const FileName& b) {
+    return !(a.filename == b.filename);
+  }
+
+  /*! output operator: writes the stored string to the stream */
+  std::ostream& operator<<(std::ostream& cout, const FileName& filename) {
+    cout << filename.filename; return cout;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/sys/filename.h b/thirdparty/embree-aarch64/common/sys/filename.h
new file mode 100644
index 0000000000..58f881b14d
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/filename.h
@@ -0,0 +1,81 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+namespace embree
+{
+  /*! Convenience class for handling file names and paths; the name is stored as a single string. */
+  class FileName
+  {
+  public:
+
+    /*! create an empty filename */
+    FileName ();
+
+    /*! create a valid filename from a C string */
+    FileName (const char* filename);
+
+    /*! create a valid filename from a std::string */
+    FileName (const std::string& filename);
+    
+    /*! returns path to home folder */
+    static FileName homeFolder();
+
+    /*! returns path to the folder of the executable */
+    static FileName executableFolder();
+
+    /*! auto convert into a string */
+    operator std::string() const { return filename; }
+
+    /*! returns a copy of the filename as a string */
+    const std::string str() const { return filename; }
+
+    /*! returns a c-string of the filename; the pointer refers to the string stored in this object */
+    const char* c_str() const { return filename.c_str(); }
+
+    /*! returns the path of a filename */
+    FileName path() const;
+
+    /*! returns the file part of a filename (without the path) */
+    std::string base() const;
+
+    /*! returns the base of a filename without extension */
+    std::string name() const;
+
+    /*! returns the file extension */
+    std::string ext() const;
+
+    /*! drops the file extension */
+    FileName dropExt() const;
+
+    /*! replaces the file extension */
+    FileName setExt(const std::string& ext = "") const;
+
+    /*! adds file extension */
+    FileName addExt(const std::string& ext = "") const;
+
+    /*! concatenates two filenames to this/other */
+    FileName operator +( const FileName& other ) const;
+
+    /*! concatenates this filename and a string to this/other */
+    FileName operator +( const std::string& other ) const;
+
+    /*! removes the base from a filename (if possible) */
+    FileName operator -( const FileName& base ) const;
+
+    /*! == operator */
+    friend bool operator==(const FileName& a, const FileName& b);
+
+    /*! != operator */
+    friend bool operator!=(const FileName& a, const FileName& b);
+
+    /*! output operator */
+    friend embree_ostream operator<<(embree_ostream cout, const FileName& filename);
+   
+  private:
+    std::string filename;  //!< the stored file name
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/sys/intrinsics.h b/thirdparty/embree-aarch64/common/sys/intrinsics.h
new file mode 100644
index 0000000000..44cdbd8f0f
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/intrinsics.h
@@ -0,0 +1,559 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+#if defined(__WIN32__)
+#include <intrin.h>
+#endif
+
+#if defined(__ARM_NEON)
+#include "../math/SSE2NEON.h"
+#if defined(NEON_AVX2_EMULATION)
+#include "../math/AVX2NEON.h"
+#endif
+#else
+#include <immintrin.h>
+#endif
+
+#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
+  #if !defined(_tzcnt_u32)
+    #define _tzcnt_u32 __tzcnt_u32
+  #endif
+  #if !defined(_tzcnt_u64)
+    #define _tzcnt_u64 __tzcnt_u64
+  #endif
+#endif
+
+#if defined(__aarch64__) // no x86 lzcnt here: map both widths onto the compiler's count-leading-zeros builtins
+#if !defined(_lzcnt_u32)
+  #define _lzcnt_u32 __builtin_clz
+#endif
+#if !defined(_lzcnt_u64) // guard the 64-bit name (a repeated _lzcnt_u32 guard would leave this branch dead)
+  #define _lzcnt_u64 __builtin_clzll
+#endif
+#else
+#if defined(__LZCNT__)
+  #if !defined(_lzcnt_u32)
+    #define _lzcnt_u32 __lzcnt32
+  #endif
+  #if !defined(_lzcnt_u64)
+    #define _lzcnt_u64 __lzcnt64
+  #endif
+#endif
+#endif
+
+#if defined(__WIN32__)
+#  ifndef NOMINMAX
+#  define NOMINMAX
+#  endif
+#  include <windows.h>
+#endif
+
+/* normally defined in pmmintrin.h, but we always need this */
+#if !defined(_MM_SET_DENORMALS_ZERO_MODE)
+#define _MM_DENORMALS_ZERO_ON   (0x0040)
+#define _MM_DENORMALS_ZERO_OFF  (0x0000)
+#define _MM_DENORMALS_ZERO_MASK (0x0040)
+#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
+#endif
+
+namespace embree
+{
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__WIN32__)
+
+  __forceinline size_t read_tsc() // Windows build: returns the QueryPerformanceCounter value
+  {
+    LARGE_INTEGER ticks;
+    QueryPerformanceCounter(&ticks); // fills `ticks` with the current counter reading
+    return (size_t)ticks.QuadPart;
+  }
+
+  __forceinline int bsf(int v) { // bit scan forward over a 32-bit signed mask
+#if defined(__AVX2__) && !defined(__aarch64__)
+    return _tzcnt_u32(v);
+#else
+    unsigned long pos = 0; _BitScanForward(&pos,v); return pos;
+#endif
+  }
+
+  __forceinline unsigned bsf(unsigned v) { // same scan for an unsigned 32-bit mask
+#if defined(__AVX2__) && !defined(__aarch64__)
+    return _tzcnt_u32(v);
+#else
+    unsigned long pos = 0; _BitScanForward(&pos,v); return pos;
+#endif
+  }
+
+#if defined(__X86_64__)
+  __forceinline size_t bsf(size_t v) { // 64-bit mask variant, only built for x86-64
+#if defined(__AVX2__)
+    return _tzcnt_u64(v);
+#else
+    unsigned long pos = 0; _BitScanForward64(&pos,v); return pos;
+#endif
+  }
+#endif
+
+  __forceinline int bscf(int& v)
+  {
+    const int lowest = bsf(v); // index reported by bsf for the current mask
+    v &= v-1;                  // clear the lowest set bit in the caller's mask
+    return lowest;
+  }
+
+  __forceinline unsigned bscf(unsigned& v)
+  {
+    const unsigned lowest = bsf(v); // index reported by bsf for the current mask
+    v &= v-1;                       // clear the lowest set bit in the caller's mask
+    return lowest;
+  }
+
+#if defined(__X86_64__)
+  __forceinline size_t bscf(size_t& v)
+  {
+    const size_t lowest = bsf(v); // index reported by bsf for the current mask
+    v &= v-1;                     // clear the lowest set bit in the caller's mask
+    return lowest;
+  }
+#endif
+
+  __forceinline int bsr(int v) { // bit scan reverse over a 32-bit signed mask
+#if defined(__AVX2__)  && !defined(__aarch64__)
+    return 31 - _lzcnt_u32(v);
+#else
+    unsigned long top = 0; _BitScanReverse(&top,v); return top;
+#endif
+  }
+
+  __forceinline unsigned bsr(unsigned v) { // same scan for an unsigned 32-bit mask
+#if defined(__AVX2__) && !defined(__aarch64__)
+    return 31 - _lzcnt_u32(v);
+#else
+    unsigned long top = 0; _BitScanReverse(&top,v); return top;
+#endif
+  }
+
+#if defined(__X86_64__)
+  __forceinline size_t bsr(size_t v) { // 64-bit mask variant, only built for x86-64
+#if defined(__AVX2__)
+    return 63 -_lzcnt_u64(v);
+#else
+    unsigned long top = 0; _BitScanReverse64(&top, v); return top;
+#endif
+  }
+#endif
+
+  __forceinline int lzcnt(const int x) // number of leading zero bits in a 32-bit value
+  {
+#if defined(__AVX2__) && !defined(__aarch64__)
+    return _lzcnt_u32(x);
+#else
+    if (likely(x != 0)) return 31 - bsr(x); // common case: derive the count from the bsr result
+    return 32;                              // x == 0: all 32 bits are zero
+#endif
+  }
+
+  __forceinline int btc(int v, int i) { // returns v after _bittestandcomplement on bit i
+    long r = v; _bittestandcomplement(&r,i); return r;
+  }
+
+  __forceinline int bts(int v, int i) { // returns v after _bittestandset on bit i
+    long r = v; _bittestandset(&r,i); return r;
+  }
+
+  __forceinline int btr(int v, int i) { // returns v after _bittestandreset on bit i
+    long r = v; _bittestandreset(&r,i); return r;
+  }
+
+#if defined(__X86_64__)
+  /* 64-bit overloads of btc/bts/btr; each works on a local copy and returns the modified value */
+  __forceinline size_t btc(size_t v, size_t i) {
+    size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
+  }
+
+  __forceinline size_t bts(size_t v, size_t i) {
+    __int64 r = v; _bittestandset64(&r,i); return r;
+  }
+
+  __forceinline size_t btr(size_t v, size_t i) {
+    __int64 r = v; _bittestandreset64(&r,i); return r;
+  }
+
+#endif
+
+  __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) {
+    return _InterlockedCompareExchange((volatile long*)p,v,c); // argument order is swapped on purpose: v (new value) first, c (comparand) second
+  }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#else
+
+#if defined(__i386__) && defined(__PIC__)
+
+  __forceinline void __cpuid(int out[4], int op) // variant for i386 PIC builds (see the enclosing #if)
+  {
+    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" // swap ebx with operand %1 before cpuid runs
+                  "cpuid\n\t"
+                  "xchg{l}\t{%%}ebx, %1\n\t" // swap back: %1 (out[1]) gets cpuid's ebx, ebx gets its old value
+                  : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
+                  : "0"(op));
+  }
+
+  __forceinline void __cpuid_count(int out[4], int op1, int op2) // i386 PIC variant taking a second input (op2)
+  {
+    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" // swap ebx with operand %1 before cpuid runs
+                  "cpuid\n\t"
+                  "xchg{l}\t{%%}ebx, %1\n\t" // swap back: %1 (out[1]) gets cpuid's ebx, ebx gets its old value
+                  : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
+                  : "0" (op1), "2" (op2)); // op1 shares the register of output 0 (eax), op2 that of output 2 (ecx)
+  }
+
+#else
+
+  __forceinline void __cpuid(int out[4], int op) {
+#if defined(__ARM_NEON)
+    if (op == 0) { // Get CPU name. NOTE(review): for any other op, out[] is left unwritten
+      out[0] = 0x41524d20; // bytes 0x41 0x52 0x4d 0x20 are ASCII "ARM "
+      out[1] = 0x41524d20;
+      out[2] = 0x41524d20;
+      out[3] = 0x41524d20;
+    }
+#else
+    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op)); // eax..edx -> out[0..3]
+#endif
+  }
+
+#if !defined(__ARM_NEON) // no ARM counterpart is provided for this overload
+  __forceinline void __cpuid_count(int out[4], int op1, int op2) {
+    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2)); // op1 in eax, op2 in ecx
+  }
+#endif
+
+#endif
+
+  __forceinline uint64_t read_tsc()  {
+#if defined(__ARM_NEON)
+    return 0; // FIXME(LTE): mimic rdtsc
+#else
+    uint32_t hi,lo; // rdtsc delivers its result split across edx (hi) and eax (lo)
+    asm volatile ("rdtsc" : "=d"(hi), "=a"(lo));
+    return (((uint64_t)hi) << 32) + (uint64_t)lo;
+#endif
+  }
+
+  // index of the lowest set bit of a 32-bit signed mask
+  __forceinline int bsf(int v) {
+#if defined(__ARM_NEON)
+    return __builtin_ctz(v);
+#elif defined(__AVX2__)
+    return _tzcnt_u32(v);
+#else
+    // plain x86 fallback: issue the bsf instruction directly
+    int pos = 0; asm ("bsf %1,%0" : "=r"(pos) : "r"(v)); return pos;
+#endif
+  }
+
+#if defined(__X86_64__) || defined(__aarch64__)
+  // unsigned overload; only provided on x86-64 and aarch64 builds
+  __forceinline unsigned bsf(unsigned v)
+  {
+#if defined(__ARM_NEON)
+    return __builtin_ctz(v);
+#elif defined(__AVX2__)
+    return _tzcnt_u32(v);
+#else
+    // plain x86 fallback: issue the bsf instruction directly
+    unsigned pos = 0; asm ("bsf %1,%0" : "=r"(pos) : "r"(v)); return pos;
+#endif
+  }
+#endif
+
+  __forceinline size_t bsf(size_t v) { // pointer-width overload
+#if defined(__AVX2__) && !defined(__aarch64__)
+#if defined(__X86_64__)
+    return _tzcnt_u64(v);
+#else
+    return _tzcnt_u32(v);
+#endif
+#elif defined(__ARM_NEON)
+    return __builtin_ctzl(v);
+#else
+    size_t pos = 0; asm ("bsf %1,%0" : "=r"(pos) : "r"(v)); return pos;
+#endif
+  }
+
+  __forceinline int bscf(int& v)
+  {
+    const int lowest = bsf(v); // index reported by bsf for the current mask
+    v &= v-1;                  // clear the lowest set bit in the caller's mask
+    return lowest;
+  }
+
+#if defined(__X86_64__) || defined(__aarch64__)
+  __forceinline unsigned int bscf(unsigned int& v)
+  {
+    const unsigned int lowest = bsf(v); // index reported by bsf for the current mask
+    v &= v-1;                           // clear the lowest set bit in the caller's mask
+    return lowest;
+  }
+#endif
+
+  __forceinline size_t bscf(size_t& v)
+  {
+    const size_t lowest = bsf(v); // index reported by bsf for the current mask
+    v &= v-1;                     // clear the lowest set bit in the caller's mask
+    return lowest;
+  }
+
+  __forceinline int bsr(int v) { // bit scan reverse over a 32-bit signed mask
+#if defined(__AVX2__) && !defined(__aarch64__)
+    return 31 - _lzcnt_u32(v);
+#elif defined(__ARM_NEON)
+    return __builtin_clz(v)^31;
+#else
+    int top = 0; asm ("bsr %1,%0" : "=r"(top) : "r"(v)); return top;
+#endif
+  }
+
+#if defined(__X86_64__) || defined(__aarch64__)
+  __forceinline unsigned bsr(unsigned v) { // bit scan reverse over an unsigned 32-bit mask
+#if defined(__AVX2__) && !defined(__aarch64__) // same guard as the int/size_t overloads: never take the x86 path on aarch64
+    return 31 - _lzcnt_u32(v);
+#elif defined(__ARM_NEON)
+    return __builtin_clz(v)^31; // for a clz result in 0..31, x^31 equals 31-x
+#else
+    unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+#endif
+  }
+#endif
+
+  __forceinline size_t bsr(size_t v) { // pointer-width overload
+#if defined(__AVX2__) && !defined(__aarch64__)
+#if defined(__X86_64__)
+    return 63 - _lzcnt_u64(v);
+#else
+    return 31 - _lzcnt_u32(v);
+#endif
+#elif defined(__aarch64__)
+    return (sizeof(v) * 8 - 1) - __builtin_clzl(v); // (bit width - 1) minus the leading-zero count
+#else
+    size_t top = 0; asm ("bsr %1,%0" : "=r"(top) : "r"(v)); return top;
+#endif
+  }
+
+  __forceinline int lzcnt(const int x) // number of leading zero bits in a 32-bit value
+  {
+#if defined(__AVX2__) && !defined(__aarch64__)
+    return _lzcnt_u32(x);
+#else
+    if (likely(x != 0)) return 31 - bsr(x); // common case: derive the count from the bsr result
+    return 32;                              // x == 0: all 32 bits are zero
+#endif
+  }
+
+  __forceinline size_t blsr(size_t v) { // returns v with its lowest set bit cleared (see the portable branch)
+#if defined(__AVX2__) && !defined(__aarch64__)
+#if defined(__INTEL_COMPILER)
+    return _blsr_u64(v);
+#else
+#if defined(__X86_64__)
+    return __blsr_u64(v);
+#else
+    return __blsr_u32(v);
+#endif
+#endif
+#else
+    return v & (v-1); // portable form used when the intrinsic path is not selected
+#endif
+  }
+
+  __forceinline int btc(int v, int i) {
+#if defined(__aarch64__)
+    // Portable stand-in modelled on _bittestandcomplement(long *a, long b):
+    //   unsigned char x = (*a >> b) & 1;
+    //   *a = *a ^ (1 << b);
+    //   return x;
+    //
+    // We only need the updated `*a`, i.e. v with bit i flipped.
+    return (v ^ (1 << i));
+#else
+    int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
+#endif
+  }
+
+  __forceinline int bts(int v, int i) {
+#if defined(__aarch64__)
+    // Portable stand-in modelled on _bittestandset(long *a, long b):
+    //   *a = *a | (1 << b);   (the tested-bit return value is not needed here)
+    // The mask is the constant 1 shifted by i, NOT v shifted by i: a shifted
+    // copy of v would set unrelated bits and miss bit i whenever v's bit 0 is clear.
+    return (v | (1 << i));
+#else
+    int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+#endif
+  }
+
+  __forceinline int btr(int v, int i) {
+#if defined(__aarch64__)
+    // Portable stand-in modelled on _bittestandreset(long *a, long b):
+    //   *a = *a & ~(1 << b);  (the tested-bit return value is not needed here)
+    // The mask is the constant 1 shifted by i, NOT v shifted by i: a shifted
+    // copy of v would clear unrelated bits and miss bit i whenever v's bit 0 is clear.
+    return (v & ~(1 << i));
+#else
+    int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+#endif
+  }
+
+  __forceinline size_t btc(size_t v, size_t i) { // returns v with bit i flipped
+#if defined(__aarch64__)
+    return (v ^ (size_t(1) << i)); // size_t-wide mask: a plain `1 << i` is an int shift and is wrong for i >= 31
+#else
+    size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
+#endif
+  }
+
+  __forceinline size_t bts(size_t v, size_t i) { // returns v with bit i set
+#if defined(__aarch64__)
+    return (v | (size_t(1) << i)); // mask is 1 shifted by i (size_t-wide), not v shifted by i
+#else
+    size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+#endif
+  }
+
+  __forceinline size_t btr(size_t v, size_t i) { // returns v with bit i cleared
+#if defined(__ARM_NEON)
+    return (v & ~(size_t(1) << i)); // mask is 1 shifted by i (size_t-wide), not v shifted by i
+#else
+    size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+#endif
+  }
+
+  __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) {
+    return __sync_val_compare_and_swap(value, comparand, input); // forwards to the GCC/Clang builtin and returns its result
+  }
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// All Platforms
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__clang__) || defined(__GNUC__) // fallbacks for _mm*_undefined_* when no macro of that name exists
+#if !defined(_mm_undefined_ps)
+  __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); } // zero-filled stand-in
+#endif
+#if !defined(_mm_undefined_si128)
+  __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); } // zero-filled stand-in
+#endif
+#if !defined(_mm256_undefined_ps) && defined(__AVX__)
+  __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); } // zero-filled stand-in
+#endif
+#if !defined(_mm256_undefined_si256) && defined(__AVX__)
+  __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); } // zero-filled stand-in
+#endif
+#if !defined(_mm512_undefined_ps) && defined(__AVX512F__)
+  __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); } // zero-filled stand-in
+#endif
+#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__)
+  __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); } // zero-filled stand-in
+#endif
+#endif
+
+#if defined(__SSE4_2__) || defined(__ARM_NEON) // NOTE(review): on NEON, _mm_popcnt_* presumably comes from SSE2NEON.h - confirm
+
+  __forceinline int popcnt(int in) { // population count of a 32-bit signed value
+    return _mm_popcnt_u32(in);
+  }
+
+  __forceinline unsigned popcnt(unsigned in) { // population count of a 32-bit unsigned value
+    return _mm_popcnt_u32(in);
+  }
+
+#if defined(__X86_64__) || defined(__ARM_NEON)
+  __forceinline size_t popcnt(size_t in) { // 64-bit overload via _mm_popcnt_u64
+    return _mm_popcnt_u64(in);
+  }
+#endif
+
+#endif
+
+  __forceinline uint64_t rdtsc() // read_tsc() bracketed by two __cpuid calls
+  {
+    int scratch[4];               // receives the cpuid output; its contents are ignored
+    __cpuid(scratch,0);           // NOTE(review): presumably issued as an ordering fence before the read
+    const uint64_t stamp = read_tsc();
+    __cpuid(scratch,0);           // and again after the read
+    return stamp;
+  }
+
+  __forceinline void pause_cpu(const size_t N = 8) // issues N back-to-back _mm_pause() calls (8 by default)
+  {
+    for (size_t remaining=N; remaining>0; remaining--)
+      _mm_pause();
+  }
+
+  /* prefetches: thin wrappers around _mm_prefetch, one per hint constant */
+  __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); }
+  __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); }
+  __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); }
+  __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); }
+  __forceinline void prefetchEX (const void* ptr) { // _MM_HINT_ET0 is only used with the Intel compiler
+#if defined(__INTEL_COMPILER)
+    _mm_prefetch((const char*)ptr,_MM_HINT_ET0);
+#else
+    _mm_prefetch((const char*)ptr,_MM_HINT_T0); // other compilers: identical to prefetchL1
+#endif
+  }
+
+  __forceinline void prefetchL1EX(const void* ptr) { // currently just forwards to prefetchEX
+    prefetchEX(ptr);
+  }
+
+  __forceinline void prefetchL2EX(const void* ptr) { // currently just forwards to prefetchEX
+    prefetchEX(ptr);
+  }
+#if defined(__AVX2__) && !defined(__aarch64__) // pext/pdep wrappers are only provided on this x86 path
+   __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
+   __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
+#if defined(__X86_64__) // 64-bit overloads
+   __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); }
+   __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); }
+#endif
+#endif
+
+#if defined(__AVX512F__) // mm512_* helpers: use the intrinsics with the Intel compiler, hand-written equivalents otherwise
+#if defined(__INTEL_COMPILER)
+   __forceinline float mm512_cvtss_f32(__m512 v) {
+     return _mm512_cvtss_f32(v);
+   }
+   __forceinline int mm512_mask2int(__mmask16 k1) {
+     return _mm512_mask2int(k1);
+   }
+   __forceinline __mmask16 mm512_int2mask(int mask) {
+     return _mm512_int2mask(mask);
+   }
+#else
+   __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3
+     return _mm_cvtss_f32(_mm512_castps512_ps128(v)); // cast to the 128-bit vector, then take its scalar float
+   }
+   __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3
+     return (int)k1; // plain integer cast of the mask
+   }
+   __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3
+     return (__mmask16)mask; // plain cast back to the mask type
+   }
+#endif
+#endif
+}
diff --git a/thirdparty/embree-aarch64/common/sys/library.cpp b/thirdparty/embree-aarch64/common/sys/library.cpp
new file mode 100644
index 0000000000..899267a1e4
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/library.cpp
@@ -0,0 +1,83 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "library.h"
+#include "sysinfo.h"
+#include "filename.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+  /* opens the shared library "<file>.dll" located next to the executable */
+  lib_t openLibrary(const std::string& file)
+  {
+    std::string fullName = file+".dll";
+    FileName executable = getExecutableFileName();
+    HMODULE handle = LoadLibraryA((executable.path() + fullName).c_str()); // ANSI entry point: the path is a narrow string, so this also builds with UNICODE defined
+    return lib_t(handle);
+  }
+
+  /* returns address of a symbol from the library */
+  void* getSymbol(lib_t lib, const std::string& sym) {
+    return reinterpret_cast<void *>(GetProcAddress(HMODULE(lib),sym.c_str()));
+  }
+
+  /* closes the shared library */
+  void closeLibrary(lib_t lib) {
+    FreeLibrary(HMODULE(lib));
+  }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__)
+
+#include <dlfcn.h>
+
+namespace embree
+{
+  /* opens the shared library "lib<file>.so" ("lib<file>.dylib" on macOS) */
+  lib_t openLibrary(const std::string& file)
+  {
+#if defined(__MACOSX__)
+    std::string fullName = "lib"+file+".dylib";
+#else
+    std::string fullName = "lib"+file+".so";
+#endif
+    void* lib = dlopen(fullName.c_str(), RTLD_NOW); // first attempt: bare library name
+    if (lib) return lib_t(lib);
+    FileName executable = getExecutableFileName();
+    lib = dlopen((executable.path() + fullName).c_str(),RTLD_NOW); // second attempt: next to the executable
+    if (lib == nullptr) {
+      const char* error = dlerror();
+      if (error) {
+        THROW_RUNTIME_ERROR(error);
+      } else {
+        THROW_RUNTIME_ERROR("could not load library "+fullName); // name the library that failed, not the executable
+      }
+    }
+    return lib_t(lib);
+  }
+
+  /* returns address of a symbol from the library */
+  void* getSymbol(lib_t lib, const std::string& sym) {
+    return dlsym(lib,sym.c_str());
+  }
+
+  /* closes the shared library */
+  void closeLibrary(lib_t lib) {
+    dlclose(lib);
+  }
+}
+#endif
diff --git a/thirdparty/embree-aarch64/common/sys/library.h b/thirdparty/embree-aarch64/common/sys/library.h
new file mode 100644
index 0000000000..c2164e9fbe
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/library.h
@@ -0,0 +1,21 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+namespace embree
+{
+  /*! type for shared library */
+  typedef struct opaque_lib_t* lib_t;
+
+  /*! loads a shared library */
+  lib_t openLibrary(const std::string& file);
+
+  /*! returns address of a symbol from the library */
+  void* getSymbol(lib_t lib, const std::string& sym);
+
+  /*! unloads a shared library */
+  void closeLibrary(lib_t lib);
+}
diff --git a/thirdparty/embree-aarch64/common/sys/mutex.cpp b/thirdparty/embree-aarch64/common/sys/mutex.cpp
new file mode 100644
index 0000000000..11779bc9b9
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/mutex.cpp
@@ -0,0 +1,58 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "mutex.h"
+#include "regression.h"
+
+#if defined(__WIN32__) && !defined(PTHREADS_WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{ /* MutexSys on Windows: `mutex` points to a heap-allocated CRITICAL_SECTION */
+  MutexSys::MutexSys() { mutex = new CRITICAL_SECTION; InitializeCriticalSection((CRITICAL_SECTION*)mutex); }
+  MutexSys::~MutexSys() { DeleteCriticalSection((CRITICAL_SECTION*)mutex); delete (CRITICAL_SECTION*)mutex; }
+  void MutexSys::lock() { EnterCriticalSection((CRITICAL_SECTION*)mutex); }
+  bool MutexSys::try_lock() { return TryEnterCriticalSection((CRITICAL_SECTION*)mutex) != 0; } // true when the section was entered
+  void MutexSys::unlock() { LeaveCriticalSection((CRITICAL_SECTION*)mutex); }
+}
+#endif
+
+#if defined(__UNIX__) || defined(PTHREADS_WIN32)
+#include <pthread.h>
+namespace embree
+{
+  /*! system mutex using pthreads; `mutex` points to a heap-allocated pthread_mutex_t */
+  MutexSys::MutexSys()
+  {
+    mutex = new pthread_mutex_t;
+    const int rc = pthread_mutex_init((pthread_mutex_t*)mutex, nullptr); // nullptr: default attributes
+    if (rc != 0) THROW_RUNTIME_ERROR("pthread_mutex_init failed");
+  }
+
+  MutexSys::~MutexSys()
+  {
+    MAYBE_UNUSED const int rc = pthread_mutex_destroy((pthread_mutex_t*)mutex);
+    assert(rc == 0); // only checked when assertions are enabled
+    delete (pthread_mutex_t*)mutex;
+    mutex = nullptr;
+  }
+
+  void MutexSys::lock()
+  {
+    const int rc = pthread_mutex_lock((pthread_mutex_t*)mutex);
+    if (rc != 0) THROW_RUNTIME_ERROR("pthread_mutex_lock failed");
+  }
+
+  bool MutexSys::try_lock() { // true when the mutex was acquired
+    return pthread_mutex_trylock((pthread_mutex_t*)mutex) == 0;
+  }
+
+  void MutexSys::unlock()
+  {
+    const int rc = pthread_mutex_unlock((pthread_mutex_t*)mutex);
+    if (rc != 0) THROW_RUNTIME_ERROR("pthread_mutex_unlock failed");
+  }
+}
+#endif
diff --git a/thirdparty/embree-aarch64/common/sys/mutex.h b/thirdparty/embree-aarch64/common/sys/mutex.h
new file mode 100644
index 0000000000..1164210f23
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/mutex.h
@@ -0,0 +1,98 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include "intrinsics.h"
+#include "atomic.h"
+
+namespace embree
+{
+  /*! system mutex; the platform object is held behind the opaque `mutex` pointer */
+  class MutexSys {
+    friend struct ConditionImplementation;
+  public:
+    MutexSys();
+    ~MutexSys();
+
+  private:
+    MutexSys (const MutexSys& other) DELETED; // do not implement
+    MutexSys& operator= (const MutexSys& other) DELETED; // do not implement
+
+  public:
+    void lock();
+    bool try_lock(); // returns true when the lock was taken
+    void unlock();
+
+  protected:
+    void* mutex; // untyped handle; the implementation file casts it to the platform mutex type
+  };
+
+  /*! spinning mutex built on a single atomic<bool> flag (true = locked) */
+  class SpinLock
+  {
+  public:
+    /*! starts out unlocked */
+    SpinLock ()
+      : flag(false) {}
+
+    __forceinline bool isLocked() { // reads the current flag value
+      return flag.load();
+    }
+
+    __forceinline void lock()
+    {
+      while (true) 
+      {
+        while (flag.load()) // inner loop: spin on plain loads while the flag reads true
+        {
+          _mm_pause(); 
+          _mm_pause();
+        }
+        // flag read false: try to claim it; if the exchange fails, go back to spinning
+        bool expected = false;
+        if (flag.compare_exchange_strong(expected,true,std::memory_order_acquire))
+          break;
+      }
+    }
+    
+    __forceinline bool try_lock() // single attempt; returns true when the flag was claimed
+    {
+      bool expected = false;
+      if (flag.load() != expected) { // already locked: skip the compare-exchange
+        return false;
+      }
+      return flag.compare_exchange_strong(expected,true,std::memory_order_acquire);
+    }
+
+    __forceinline void unlock() {
+      flag.store(false,std::memory_order_release);
+    }
+    
+    __forceinline void wait_until_unlocked() // spins until the flag reads false; does not take the lock
+    {
+      while(flag.load())
+      {
+        _mm_pause(); 
+        _mm_pause();
+      }
+    }
+
+  public:
+    atomic<bool> flag; // true while the lock is held
+  };
+
+  /*! safe mutex lock and unlock helper: the destructor unlocks the mutex if `locked` is set */
+  template<typename Mutex> class Lock {
+  public:
+    Lock (Mutex& mutex) : mutex(mutex), locked(true) { mutex.lock(); } // locks immediately
+    Lock (Mutex& mutex, bool locked) : mutex(mutex), locked(locked) {} // does not lock; `locked` records the state the caller passes in
+    ~Lock() { if (locked) mutex.unlock(); }
+    __forceinline void lock() { assert(!locked); locked = true; mutex.lock(); } // must not already be marked locked
+    __forceinline bool isLocked() const { return locked; }
+  protected:
+    Mutex& mutex; // referenced, not owned
+    bool locked;  // whether the destructor should unlock
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/sys/platform.h b/thirdparty/embree-aarch64/common/sys/platform.h
new file mode 100644
index 0000000000..737f14aa6e
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/platform.h
@@ -0,0 +1,387 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define _CRT_SECURE_NO_WARNINGS
+
+#include <cstddef>
+#include <cassert>
+#include <cstdlib>
+#include <cstdio>
+#include <memory>
+#include <stdexcept>
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <string>
+#include <cstring>
+#include <stdint.h>
+#include <functional>
+
+////////////////////////////////////////////////////////////////////////////////
+/// detect platform
+////////////////////////////////////////////////////////////////////////////////
+
+/* detect 32 or 64 platform */
+#if defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
+#define __X86_64__
+#endif
+
+/* detect Linux platform */
+#if defined(linux) || defined(__linux__) || defined(__LINUX__)
+#  if !defined(__LINUX__)
+#     define __LINUX__
+#  endif
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* detect FreeBSD platform */
+#if defined(__FreeBSD__) || defined(__FREEBSD__)
+#  if !defined(__FREEBSD__)
+#     define __FREEBSD__
+#  endif
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* detect Windows 95/98/NT/2000/XP/Vista/7/8/10 platform */
+#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)) && !defined(__CYGWIN__)
+#  if !defined(__WIN32__)
+#     define __WIN32__
+#  endif
+#endif
+
+/* detect Cygwin platform */
+#if defined(__CYGWIN__)
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* detect MAC OS X platform */
+#if defined(__APPLE__) || defined(MACOSX) || defined(__MACOSX__)
+#  if !defined(__MACOSX__)
+#     define __MACOSX__
+#  endif
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* try to detect other Unix systems */
+#if defined(__unix__) || defined (unix) || defined(__unix) || defined(_unix)
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Macros
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef __WIN32__
+#define dll_export __declspec(dllexport)
+#define dll_import __declspec(dllimport)
+#else
+#define dll_export __attribute__ ((visibility ("default")))
+#define dll_import
+#endif
+
+#ifdef __WIN32__
+#if !defined(__noinline)
+#define __noinline             __declspec(noinline)
+#endif
+//#define __forceinline        __forceinline
+//#define __restrict           __restrict
+#if defined(__INTEL_COMPILER)
+#define __restrict__           __restrict
+#else
+#define __restrict__           //__restrict // causes issues with MSVC
+#endif
+#if !defined(__thread)
+// NOTE: Require `-fms-extensions` for clang
+#define __thread               __declspec(thread)
+#endif
+#if !defined(__aligned)
+#if defined(__MINGW32__)
+#define __aligned(...)           __attribute__((aligned(__VA_ARGS__)))
+#else
+#define __aligned(...)           __declspec(align(__VA_ARGS__))
+#endif
+#endif
+//#define __FUNCTION__           __FUNCTION__
+#define debugbreak()           __debugbreak()
+
+#else
+#if !defined(__noinline)
+#define __noinline             __attribute__((noinline))
+#endif
+#if !defined(__forceinline)
+#define __forceinline          inline __attribute__((always_inline))
+#endif
+//#define __restrict             __restrict
+//#define __thread               __thread
+#if !defined(__aligned)
+#define __aligned(...)           __attribute__((aligned(__VA_ARGS__)))
+#endif
+#if !defined(__FUNCTION__)
+#define __FUNCTION__           __PRETTY_FUNCTION__
+#endif
+#define debugbreak()           asm ("int $3")
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+  #define MAYBE_UNUSED __attribute__((unused))
+#else
+  #define MAYBE_UNUSED
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER < 1900) // before VS2015 deleted functions are not supported properly
+  #define DELETED
+#else
+  #define DELETED  = delete
+#endif
+
+// -- GODOT start --
+#ifndef likely
+// -- GODOT end --
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#define   likely(expr) (expr)
+#define unlikely(expr) (expr)
+#else
+#define   likely(expr) __builtin_expect((bool)(expr),true )
+#define unlikely(expr) __builtin_expect((bool)(expr),false)
+#endif
+// -- GODOT start --
+#endif
+// -- GODOT end --
+
+////////////////////////////////////////////////////////////////////////////////
+/// Error handling and debugging
+////////////////////////////////////////////////////////////////////////////////
+
+/* debug printing macros */
+#define STRING(x) #x
+#define TOSTRING(x) STRING(x)
+#define PING embree_cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << embree_endl
+#define PRINT(x) embree_cout << STRING(x) << " = " << (x) << embree_endl
+#define PRINT2(x,y) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << embree_endl
+#define PRINT3(x,y,z) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << embree_endl
+#define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl
+
+#if defined(DEBUG) // only report file and line in debug mode
+  // -- GODOT start --
+  // #define THROW_RUNTIME_ERROR(str)
+  //   throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+  #define THROW_RUNTIME_ERROR(str) \
+    fprintf(stderr, "%s (%d): %s\n", __FILE__, __LINE__, std::string(str).c_str()), abort();
+  // -- GODOT end --
+#else
+  // -- GODOT start --
+  // #define THROW_RUNTIME_ERROR(str)
+  //   throw std::runtime_error(str);
+  #define THROW_RUNTIME_ERROR(str) \
+    abort();
+  // -- GODOT end --
+#endif
+
+#define FATAL(x)   THROW_RUNTIME_ERROR(x)
+#define WARNING(x) { std::cerr << "Warning: " << x << embree_endl << std::flush; }
+
+#define NOT_IMPLEMENTED FATAL(std::string(__FUNCTION__) + " not implemented")
+
+////////////////////////////////////////////////////////////////////////////////
+/// Basic types
+////////////////////////////////////////////////////////////////////////////////
+
+/* default floating-point type */
+namespace embree {
+  typedef float real;
+}
+
+/* windows does not have ssize_t */
+#if defined(__WIN32__)
+#if defined(__X86_64__) || defined(__aarch64__)
+typedef int64_t ssize_t;
+#else
+typedef int32_t ssize_t;
+#endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Basic utility functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline std::string toString(long long value) { // decimal text of an integer
+  return std::to_string(value); // thin wrapper around the standard library conversion
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Disable some compiler warnings
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__INTEL_COMPILER)
+//#pragma warning(disable:265 ) // floating-point operation result is out of range
+//#pragma warning(disable:383 ) // value copied to temporary, reference to temporary used
+//#pragma warning(disable:869 ) // parameter was never referenced
+//#pragma warning(disable:981 ) // operands are evaluated in unspecified order
+//#pragma warning(disable:1418) // external function definition with no prior declaration
+//#pragma warning(disable:1419) // external declaration in primary source file
+//#pragma warning(disable:1572) // floating-point equality and inequality comparisons are unreliable
+//#pragma warning(disable:94  ) // the size of an array must be greater than zero
+//#pragma warning(disable:1599) // declaration hides parameter
+//#pragma warning(disable:424 ) // extra ";" ignored
+#pragma warning(disable:2196) // routine is both "inline" and "noinline"
+//#pragma warning(disable:177 ) // label was declared but never referenced
+//#pragma warning(disable:114 ) // function was referenced but not defined
+//#pragma warning(disable:819 ) // template nesting depth does not match the previous declaration of function
+#pragma warning(disable:15335)  // was not vectorized: vectorization possible but seems inefficient
+#endif
+
+#if defined(_MSC_VER)
+//#pragma warning(disable:4200) // nonstandard extension used : zero-sized array in struct/union
+#pragma warning(disable:4800) // forcing value to bool 'true' or 'false' (performance warning)
+//#pragma warning(disable:4267) // '=' : conversion from 'size_t' to 'unsigned long', possible loss of data
+#pragma warning(disable:4244) // 'argument' : conversion from 'ssize_t' to 'unsigned int', possible loss of data
+//#pragma warning(disable:4355) // 'this' : used in base member initializer list
+//#pragma warning(disable:391 ) // '<=' : signed / unsigned mismatch
+//#pragma warning(disable:4018) // '<' : signed / unsigned mismatch
+//#pragma warning(disable:4305) // 'initializing' : truncation from 'double' to 'float'
+//#pragma warning(disable:4068) // unknown pragma
+//#pragma warning(disable:4146) // unary minus operator applied to unsigned type, result still unsigned
+//#pragma warning(disable:4838) // conversion from 'unsigned int' to 'const int' requires a narrowing conversion)
+//#pragma warning(disable:4227) // anachronism used : qualifiers on reference are ignored
+#pragma warning(disable:4503) // decorated name length exceeded, name was truncated
+#pragma warning(disable:4180) // qualifier applied to function type has no meaning; ignored
+#pragma warning(disable:4258) // definition from the for loop is ignored; the definition from the enclosing scope is used
+
+#  if _MSC_VER < 1910 // prior to Visual studio 2017 (V141)
+#    pragma warning(disable:4101) // warning C4101: 'x': unreferenced local variable // a compiler bug issues wrong warnings
+#    pragma warning(disable:4789) // buffer '' of size 8 bytes will be overrun; 32 bytes will be written starting at offset 0
+#  endif
+
+#endif
+
+#if defined(__clang__) && !defined(__INTEL_COMPILER)
+//#pragma clang diagnostic ignored "-Wunknown-pragmas"
+//#pragma clang diagnostic ignored "-Wunused-variable"
+//#pragma clang diagnostic ignored "-Wreorder"
+//#pragma clang diagnostic ignored "-Wmicrosoft"
+//#pragma clang diagnostic ignored "-Wunused-private-field"
+//#pragma clang diagnostic ignored "-Wunused-local-typedef"
+//#pragma clang diagnostic ignored "-Wunused-function"
+//#pragma clang diagnostic ignored "-Wnarrowing"
+//#pragma clang diagnostic ignored "-Wc++11-narrowing"
+//#pragma clang diagnostic ignored "-Wdeprecated-register"
+//#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
+#pragma GCC diagnostic ignored "-Wpragmas"
+//#pragma GCC diagnostic ignored "-Wnarrowing"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+//#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+//#pragma GCC diagnostic ignored "-Warray-bounds"
+#pragma GCC diagnostic ignored "-Wattributes"
+#pragma GCC diagnostic ignored "-Wmisleading-indentation"
+#pragma GCC diagnostic ignored "-Wsign-compare"
+#pragma GCC diagnostic ignored "-Wparentheses"
+#endif
+
+#if defined(__clang__) && defined(__WIN32__)
+#pragma clang diagnostic ignored "-Wunused-parameter"
+#pragma clang diagnostic ignored "-Wmicrosoft-cast"
+#pragma clang diagnostic ignored "-Wmicrosoft-enum-value"
+#pragma clang diagnostic ignored "-Wmicrosoft-include"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wunknown-pragmas"
+#endif
+
+/* macros to switch the deprecation warning off and back on; use only around code that intentionally calls deprecated Embree API functions */
+#if defined(__WIN32__) && defined(__INTEL_COMPILER)
+#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 1478)) // warning: function was declared deprecated
+#define ENABLE_DEPRECATED_WARNING  __pragma(warning (enable:  1478)) // warning: function was declared deprecated
+#elif defined(__INTEL_COMPILER)
+#define DISABLE_DEPRECATED_WARNING _Pragma("warning (disable: 1478)") // warning: function was declared deprecated
+#define ENABLE_DEPRECATED_WARNING  _Pragma("warning (enable : 1478)") // warning: function was declared deprecated
+#elif defined(__clang__)
+#define DISABLE_DEPRECATED_WARNING _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#define ENABLE_DEPRECATED_WARNING  _Pragma("clang diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#elif defined(__GNUC__)
+#define DISABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#define ENABLE_DEPRECATED_WARNING  _Pragma("GCC diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#elif defined(_MSC_VER)
+#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 4996)) // warning: function was declared deprecated
+#define ENABLE_DEPRECATED_WARNING  __pragma(warning (default: 4996)) // MSVC has no 'enable' specifier; 'default' restores the warning
+#endif
+
+/* embree output stream: aliases for std::ostream& / std::endl; embree_cout and embree_cout_uniform both expand to std::cout */
+#define embree_ostream std::ostream&
+#define embree_cout std::cout
+#define embree_cout_uniform std::cout
+#define embree_endl std::endl
+  
+////////////////////////////////////////////////////////////////////////////////
+/// Macros for static profiling: IACA_START / IACA_END emit marker byte sequences via inline assembly
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined (__GNUC__)
+#define IACA_SSC_MARK( MARK_ID )						\
+__asm__ __volatile__ (									\
+					  "\n\t  movl $"#MARK_ID", %%ebx"	\
+					  "\n\t  .byte 0x64, 0x67, 0x90"	\
+					  : : : "memory" );
+
+#define IACA_UD_BYTES __asm__ __volatile__ ("\n\t .byte 0x0F, 0x0B");
+
+#else
+#define IACA_UD_BYTES {__asm _emit 0x0F \
+	__asm _emit 0x0B}
+
+#define IACA_SSC_MARK(x) {__asm  mov ebx, x\
+	__asm  _emit 0x64 \
+	__asm  _emit 0x67 \
+	__asm  _emit 0x90 }
+
+#define IACA_VC64_START __writegsbyte(111, 111);
+#define IACA_VC64_END   __writegsbyte(222, 222);
+
+#endif
+
+#define IACA_START {IACA_UD_BYTES \
+					IACA_SSC_MARK(111)}
+#define IACA_END {IACA_SSC_MARK(222) \
+					IACA_UD_BYTES}
+
+namespace embree
+{
+  /*! scope guard: runs f on destruction unless deactivated; moving disarms the source so f runs only once */
+  template<typename Closure> struct OnScopeExitHelper
+  {
+    OnScopeExitHelper (const Closure f) : active(true), f(f) {}
+    OnScopeExitHelper (OnScopeExitHelper&& other) : active(other.active), f(other.f) { other.active = false; }
+    ~OnScopeExitHelper() { if (active) f(); }
+    void deactivate() { active = false; }
+    bool active; const Closure f;
+  };
+
+  /*! builds the scope-exit helper for closure f, deducing the closure type from the argument */
+  template <typename Closure>
+  OnScopeExitHelper<Closure> OnScopeExit(const Closure f)
+  { return OnScopeExitHelper<Closure>(f); }
+
+#define STRING_JOIN2(arg1, arg2) DO_STRING_JOIN2(arg1, arg2) // extra level so that arguments such as __LINE__ are expanded before pasting
+#define DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2 // pastes the two tokens into one
+#define ON_SCOPE_EXIT(code)                                             \
+  auto STRING_JOIN2(on_scope_exit_, __LINE__) = OnScopeExit([&](){code;})
+
+  /*! wraps the raw pointer in a std::unique_ptr of the same element type */
+  template<typename Ty>
+  std::unique_ptr<Ty> make_unique(Ty* ptr)
+  { std::unique_ptr<Ty> owner(ptr); return owner; }
+
+}
diff --git a/thirdparty/embree-aarch64/common/sys/ref.h b/thirdparty/embree-aarch64/common/sys/ref.h
new file mode 100644
index 0000000000..24648e6234
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/ref.h
@@ -0,0 +1,122 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "atomic.h"
+
+namespace embree
+{
+  struct NullTy { // empty tag type; Ref accepts it for construction, assignment and comparison with an empty reference
+  };
+
+  extern MAYBE_UNUSED NullTy null;
+  
+  /*! base class for reference-counted objects: refDec() deletes the object when the count drops from 1 to 0 */
+  class RefCount
+  {
+  public:
+    RefCount(int val = 0) : refCounter(val) {}
+    virtual ~RefCount() {}
+    virtual RefCount* refInc() { ++refCounter; return this; }
+    virtual void refDec() { const size_t before = refCounter.fetch_sub(1); if (before == 1) delete this; }
+  private:
+    std::atomic<size_t> refCounter;
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reference to single object: shares ownership by calling refInc()/refDec() on the pointee
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename Type>
+  class Ref
+  {
+  public:
+    Type* ptr; // referenced object, or nullptr for an empty reference
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Ref() : ptr(nullptr) {}
+    __forceinline Ref(NullTy) : ptr(nullptr) {}
+    __forceinline Ref(const Ref& input) : ptr(input.ptr) { if (ptr) ptr->refInc(); }
+    __forceinline Ref(Ref&& input) : ptr(input.ptr) { input.ptr = nullptr; }
+
+    __forceinline Ref(Type* const input) : ptr(input)
+    {
+      if (ptr)
+        ptr->refInc();
+    }
+
+    __forceinline ~Ref()
+    {
+      if (ptr)
+        ptr->refDec();
+    }
+
+    __forceinline Ref& operator =(const Ref& input)
+    {
+      /* read input.ptr before releasing: refDec() may destroy the object that holds 'input' (e.g. cur = cur->next) */
+      Type* const incoming = input.ptr;
+      if (incoming) incoming->refInc();
+      if (ptr) ptr->refDec();
+      ptr = incoming;
+      return *this;
+    }
+
+    __forceinline Ref& operator =(Ref&& input)
+    {
+      Type* const outgoing = ptr; // released last: refDec() may destroy the object that holds 'input'
+      ptr = input.ptr;
+      input.ptr = nullptr;
+      if (outgoing) outgoing->refDec();
+      return *this;
+    }
+
+    __forceinline Ref& operator =(Type* const input)
+    {
+      if (input)
+        input->refInc();
+      if (ptr)
+        ptr->refDec();
+      ptr = input;
+      return *this;
+    }
+
+    __forceinline Ref& operator =(NullTy)
+    {
+      if (ptr)
+        ptr->refDec();
+      ptr = nullptr;
+      return *this;
+    }
+
+    __forceinline operator bool() const { return ptr != nullptr; }
+
+    __forceinline const Type& operator  *() const { return *ptr; }
+    __forceinline       Type& operator  *()       { return *ptr; }
+    __forceinline const Type* operator ->() const { return  ptr; }
+    __forceinline       Type* operator ->()       { return  ptr; }
+
+    template<typename TypeOut>
+    __forceinline       Ref<TypeOut> cast()       { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); }
+    template<typename TypeOut>
+    __forceinline const Ref<TypeOut> cast() const { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); }
+
+    template<typename TypeOut>
+    __forceinline       Ref<TypeOut> dynamicCast()       { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); }
+    template<typename TypeOut>
+    __forceinline const Ref<TypeOut> dynamicCast() const { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); }
+  };
+
+  template<typename Type> __forceinline bool operator < (const Ref<Type>& lhs, const Ref<Type>& rhs) { return rhs.ptr > lhs.ptr; } // orders by raw pointer value
+
+  template<typename Type> __forceinline bool operator ==(const Ref<Type>& ref, NullTy) { return !ref.ptr; } // true for an empty reference
+  template<typename Type> __forceinline bool operator ==(NullTy, const Ref<Type>& ref) { return !ref.ptr; }
+  template<typename Type> __forceinline bool operator ==(const Ref<Type>& lhs, const Ref<Type>& rhs) { return !(lhs.ptr != rhs.ptr); } // same pointee
+
+  template<typename Type> __forceinline bool operator !=(const Ref<Type>& ref, NullTy) { return ref.ptr ? true : false; } // true for a non-empty reference
+  template<typename Type> __forceinline bool operator !=(NullTy, const Ref<Type>& ref) { return ref.ptr ? true : false; }
+  template<typename Type> __forceinline bool operator !=(const Ref<Type>& lhs, const Ref<Type>& rhs) { return !(lhs.ptr == rhs.ptr); } // different pointee
+}
diff --git a/thirdparty/embree-aarch64/common/sys/regression.cpp b/thirdparty/embree-aarch64/common/sys/regression.cpp
new file mode 100644
index 0000000000..d95ff8dfe0
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/regression.cpp
@@ -0,0 +1,30 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "regression.h"
+
+namespace embree
+{
+  /* The list of registered tests lives in a function-local static
+   * because registerRegressionTest is invoked from static
+   * initializers: a global static variable would be subject to
+   * static variable initialization order issues. */
+  std::vector<RegressionTest*>& get_regression_tests()
+  {
+    static std::vector<RegressionTest*> registry;
+    return registry;
+  }
+
+  void registerRegressionTest(RegressionTest* test) {
+    std::vector<RegressionTest*>& registry = get_regression_tests(); // list shared by all registrations
+    registry.push_back(test);
+  }
+
+  RegressionTest* getRegressionTest(size_t index)
+  {
+    /* an index at or beyond the number of registered tests yields nullptr */
+    const std::vector<RegressionTest*>& registry = get_regression_tests();
+    if (index < registry.size()) return registry[index];
+    return nullptr;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/sys/regression.h b/thirdparty/embree-aarch64/common/sys/regression.h
new file mode 100644
index 0000000000..632f8d92cf
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/regression.h
@@ -0,0 +1,25 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+#include <vector>
+
+namespace embree
+{
+  /*! virtual interface for all regression tests: run() is implemented by each test, name identifies it */
+  struct RegressionTest {
+    RegressionTest (std::string name) : name(name) {}
+    virtual ~RegressionTest() {} // polymorphic base: makes destruction through a base pointer well defined
+    virtual bool run() = 0;
+    std::string name;
+  };
+ 
+  /*! registers a regression test */
+  void registerRegressionTest(RegressionTest* test);
+
+  /*! returns the registered regression test at the given index, or nullptr if the index is out of range */
+  RegressionTest* getRegressionTest(size_t index);
+}
diff --git a/thirdparty/embree-aarch64/common/sys/string.cpp b/thirdparty/embree-aarch64/common/sys/string.cpp
new file mode 100644
index 0000000000..931244383e
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/string.cpp
@@ -0,0 +1,42 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "string.h"
+
+#include <algorithm>
+#include <ctype.h>
+
+namespace embree
+{
+  char to_lower(char c) { return char(tolower(static_cast<unsigned char>(c))); } // tolower needs an unsigned char value (or EOF); passing a negative char is undefined
+  char to_upper(char c) { return char(toupper(static_cast<unsigned char>(c))); } // same cast as in to_lower, for the same reason
+  std::string toLowerCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_lower); return dst; } // lower-cased copy of s
+  std::string toUpperCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_upper); return dst; } // upper-cased copy of s
+
+  Vec2f string_to_Vec2f ( std::string str )
+  {
+    size_t used = 0; // characters consumed by std::stof while parsing x
+    const float x = std::stof(str,&used);
+    const std::string rest = str.substr(used+1); // skips exactly one separator character after x
+    return Vec2f(x,std::stof(rest));
+  }
+  
+  Vec3f string_to_Vec3f ( std::string str )
+  {
+    /* components are parsed left to right; one separator character is skipped after each of x and y */
+    size_t used = 0;
+    const float x = std::stof(str,&used); const std::string yz = str.substr(used+1);
+    const float y = std::stof(yz,&used);  const std::string z  = yz.substr(used+1);
+    return Vec3f(x,y,std::stof(z));
+  }
+  
+  Vec4f string_to_Vec4f ( std::string str )
+  {
+    /* components are parsed left to right; one separator character is skipped after each of x, y and z */
+    size_t used = 0;
+    const float x = std::stof(str,&used); const std::string yzw = str.substr(used+1);
+    const float y = std::stof(yzw,&used); const std::string zw  = yzw.substr(used+1);
+    const float z = std::stof(zw,&used);  const std::string w   = zw.substr(used+1);
+    return Vec4f(x,y,z,std::stof(w));
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/sys/string.h b/thirdparty/embree-aarch64/common/sys/string.h
new file mode 100644
index 0000000000..2e9b0f88c3
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/string.h
@@ -0,0 +1,37 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include "../math/vec2.h"
+#include "../math/vec3.h"
+#include "../math/vec4.h"
+
+namespace embree
+{
+  /*! saves the format flags and precision of a stream and writes both back when destroyed */
+  class IOStreamStateRestorer
+  {
+  public:
+    IOStreamStateRestorer(std::ostream& iostream)
+      : stream(iostream),
+        saved_flags(iostream.flags()),
+        saved_precision(iostream.precision())
+    {}
+
+    ~IOStreamStateRestorer() { stream.flags(saved_flags); stream.precision(saved_precision); }
+
+  private:
+    std::ostream& stream;               // stream whose state is written back
+    std::ios::fmtflags saved_flags;     // flags captured at construction
+    std::streamsize saved_precision;    // precision captured at construction
+  };
+
+  std::string toLowerCase(const std::string& s);
+  std::string toUpperCase(const std::string& s);
+
+  Vec2f string_to_Vec2f ( std::string str );
+  Vec3f string_to_Vec3f ( std::string str );
+  Vec4f string_to_Vec4f ( std::string str );
+}
diff --git a/thirdparty/embree-aarch64/common/sys/sysinfo.cpp b/thirdparty/embree-aarch64/common/sys/sysinfo.cpp
new file mode 100644
index 0000000000..1d11436770
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/sysinfo.cpp
@@ -0,0 +1,676 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "sysinfo.h"
+#include "intrinsics.h"
+#include "string.h"
+#include "ref.h"
+#if defined(__FREEBSD__)
+#include <sys/cpuset.h>
+#include <pthread_np.h>
+typedef cpuset_t cpu_set_t;
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// All Platforms
+////////////////////////////////////////////////////////////////////////////////
+
+namespace embree
+{
+  NullTy null;
+
+  std::string getPlatformName()
+  { // returns a label chosen at compile time; conditions are tested top to bottom and the first match wins
+#if defined(__LINUX__) && defined(__ANDROID__) && defined(__aarch64__) && defined(__ARM_NEON)
+    return "Android Linux (aarch64 / arm64)";
+#elif defined(__LINUX__) && defined(__ANDROID__) && defined(__X86_64__)
+    return "Android Linux (x64)";
+#elif defined(__LINUX__) && defined(__ANDROID__) && (defined(_X86_) || defined(__X86__) || defined(_M_IX86))
+    return "Android Linux (x86)";
+#elif defined(__LINUX__) && !defined(__X86_64__) && !defined(__aarch64__) // aarch64 is 64-bit: leave it to the aarch64 branch below
+    return "Linux (32bit)";
+#elif defined(__LINUX__) && defined(__X86_64__)
+    return "Linux (64bit)";
+#elif defined(__FREEBSD__) && !defined(__X86_64__) && !defined(__aarch64__) // aarch64 is 64-bit: leave it to the aarch64 branch below
+    return "FreeBSD (32bit)";
+#elif defined(__FREEBSD__) && defined(__X86_64__)
+    return "FreeBSD (64bit)";
+#elif defined(__CYGWIN__) && !defined(__X86_64__)
+    return "Cygwin (32bit)";
+#elif defined(__CYGWIN__) && defined(__X86_64__)
+    return "Cygwin (64bit)";
+#elif defined(__WIN32__) && !defined(__X86_64__)
+    return "Windows (32bit)";
+#elif defined(__WIN32__) && defined(__X86_64__)
+    return "Windows (64bit)";
+#elif defined(TARGET_IPHONE_SIMULATOR) && defined(__X86_64__)
+    return "iOS Simulator (x64)";
+#elif defined(TARGET_OS_IPHONE) && defined(__aarch64__) && defined(__ARM_NEON)
+    return "iOS (aarch64 / arm64)";
+#elif defined(__MACOSX__) && !defined(__X86_64__) && !defined(__aarch64__) // aarch64 is 64-bit: leave it to the aarch64 branch below
+    return "Mac OS X (32bit)";
+#elif defined(__MACOSX__) && defined(__X86_64__)
+    return "Mac OS X (64bit)";
+#elif defined(__UNIX__) && defined(__aarch64__) // also reached by aarch64 Linux/FreeBSD/macOS builds, provided __UNIX__ is defined for them
+    return "Unix (aarch64)";
+#elif defined(__UNIX__) && !defined(__X86_64__)
+    return "Unix (32bit)";
+#elif defined(__UNIX__) && defined(__X86_64__)
+    return "Unix (64bit)";
+#else
+    return "Unknown";
+#endif
+  }
+
+  std::string getCompilerName()
+  {
+#if defined(__INTEL_COMPILER)
+    /* decode __INTEL_COMPILER: the last two digits give the minor version,
+     * the two digits before them the major version */
+    const int icc_major = (__INTEL_COMPILER / 100) % 100;
+    const int icc_minor = __INTEL_COMPILER % 100;
+    std::string label = "Intel Compiler " + toString(icc_major) + "." + toString(icc_minor);
+#if defined(__INTEL_COMPILER_UPDATE)
+    label += "." + toString(__INTEL_COMPILER_UPDATE);
+#endif
+    return label;
+#elif defined(__clang__)
+    return "CLANG " __clang_version__;
+#elif defined (__GNUC__)
+    return "GCC " __VERSION__;
+#elif defined(_MSC_VER)
+    /* _MSC_FULL_VER is printed as digits and then split with dots:
+     * offsets 4 and 9 are applied first, offset 2 last */
+    std::string digits = toString(_MSC_FULL_VER);
+    digits.insert(4,".").insert(9,".").insert(2,".");
+    return "Visual C++ Compiler " + digits;
+#else
+    return "Unknown Compiler";
+#endif
+  }
+
+  std::string getCPUVendor()
+  {
+    /* query leaf 0; array slots 1, 3 and 2 (in that order) carry the vendor text */
+    int regs[4];
+    __cpuid (regs, 0);
+
+    /* the trailing zero word terminates the C string read below */
+    const int vendor[4] = { regs[1], regs[3], regs[2], 0 };
+    const char* const text = reinterpret_cast<const char*>(vendor);
+    return text;
+  }
+
+  CPU getCPUModel()
+  {
+    /* the signature table below is only consulted for the "GenuineIntel" vendor */
+    if (getCPUVendor() != "GenuineIntel")
+      return CPU::UNKNOWN;
+
+    /* leaf 1 is queried only if leaf 0 reports it as available */
+    int regs[4];
+    __cpuid(regs, 0);
+    if (regs[0] < 1) return CPU::UNKNOWN;
+    __cpuid(regs, 1);
+
+    /* please see CPUID documentation for these formulas */
+    const uint32_t family_ID          = (regs[0] >>  8) & 0x0F;
+    const uint32_t extended_family_ID = (regs[0] >> 20) & 0xFF;
+    const uint32_t model_ID           = (regs[0] >>  4) & 0x0F;
+    const uint32_t extended_model_ID  = (regs[0] >> 16) & 0x0F;
+
+    /* the extended fields only contribute for the family values tested here */
+    const uint32_t DisplayFamily = (family_ID == 0x0F) ? family_ID + extended_family_ID : family_ID;
+    const bool uses_extended_model = (family_ID == 0x06 || family_ID == 0x0F);
+    const uint32_t DisplayModel = uses_extended_model ? model_ID + (extended_model_ID << 4) : model_ID;
+    const uint32_t signature = (DisplayFamily << 8) + (DisplayModel << 0);
+
+    // Data from Intel® 64 and IA-32 Architectures, Volume 4, Chapter 2, Table 2-1 (CPUID Signature Values of DisplayFamily_DisplayModel)
+    switch (signature)
+    {
+    case 0x067D: return CPU::CORE_ICE_LAKE;
+    case 0x067E: return CPU::CORE_ICE_LAKE;
+    case 0x068C: return CPU::CORE_TIGER_LAKE;
+    case 0x06A5: return CPU::CORE_COMET_LAKE;
+    case 0x06A6: return CPU::CORE_COMET_LAKE;
+    case 0x0666: return CPU::CORE_CANNON_LAKE;
+    case 0x068E: return CPU::CORE_KABY_LAKE;
+    case 0x069E: return CPU::CORE_KABY_LAKE;
+    case 0x066A: return CPU::XEON_ICE_LAKE;
+    case 0x066C: return CPU::XEON_ICE_LAKE;
+    case 0x0655: return CPU::XEON_SKY_LAKE;
+    case 0x064E: return CPU::CORE_SKY_LAKE;
+    case 0x065E: return CPU::CORE_SKY_LAKE;
+    case 0x0656: return CPU::XEON_BROADWELL;
+    case 0x064F: return CPU::XEON_BROADWELL;
+    case 0x0647: return CPU::CORE_BROADWELL;
+    case 0x063D: return CPU::CORE_BROADWELL;
+    case 0x063F: return CPU::XEON_HASWELL;
+    case 0x063C: return CPU::CORE_HASWELL;
+    case 0x0645: return CPU::CORE_HASWELL;
+    case 0x0646: return CPU::CORE_HASWELL;
+    case 0x063E: return CPU::XEON_IVY_BRIDGE;
+    case 0x063A: return CPU::CORE_IVY_BRIDGE;
+    case 0x062D: return CPU::SANDY_BRIDGE;
+    case 0x062F: return CPU::SANDY_BRIDGE;
+    case 0x062A: return CPU::SANDY_BRIDGE;
+    case 0x062E: return CPU::NEHALEM;
+    case 0x0625: return CPU::NEHALEM;
+    case 0x062C: return CPU::NEHALEM;
+    case 0x061E: return CPU::NEHALEM;
+    case 0x061F: return CPU::NEHALEM;
+    case 0x061A: return CPU::NEHALEM;
+    case 0x061D: return CPU::NEHALEM;
+    case 0x0617: return CPU::CORE2;
+    case 0x060F: return CPU::CORE2;
+    case 0x060E: return CPU::CORE1;
+
+    case 0x0685: return CPU::XEON_PHI_KNIGHTS_MILL;
+    case 0x0657: return CPU::XEON_PHI_KNIGHTS_LANDING;
+    default: break;
+    }
+    return CPU::UNKNOWN;
+  }
+
+  std::string stringOfCPUModel(CPU model) {
+    const char* label = "Unknown CPU (error)"; // kept when model matches none of the cases below
+    switch (model) {
+    case CPU::XEON_ICE_LAKE           : label = "Xeon Ice Lake"; break;
+    case CPU::CORE_ICE_LAKE           : label = "Core Ice Lake"; break;
+    case CPU::CORE_TIGER_LAKE         : label = "Core Tiger Lake"; break;
+    case CPU::CORE_COMET_LAKE         : label = "Core Comet Lake"; break;
+    case CPU::CORE_CANNON_LAKE        : label = "Core Cannon Lake"; break;
+    case CPU::CORE_KABY_LAKE          : label = "Core Kaby Lake"; break;
+    case CPU::XEON_SKY_LAKE           : label = "Xeon Sky Lake"; break;
+    case CPU::CORE_SKY_LAKE           : label = "Core Sky Lake"; break;
+    case CPU::XEON_PHI_KNIGHTS_MILL   : label = "Xeon Phi Knights Mill"; break;
+    case CPU::XEON_PHI_KNIGHTS_LANDING: label = "Xeon Phi Knights Landing"; break;
+    case CPU::XEON_BROADWELL          : label = "Xeon Broadwell"; break;
+    case CPU::CORE_BROADWELL          : label = "Core Broadwell"; break;
+    case CPU::XEON_HASWELL            : label = "Xeon Haswell"; break;
+    case CPU::CORE_HASWELL            : label = "Core Haswell"; break;
+    case CPU::XEON_IVY_BRIDGE         : label = "Xeon Ivy Bridge"; break;
+    case CPU::CORE_IVY_BRIDGE         : label = "Core Ivy Bridge"; break;
+    case CPU::SANDY_BRIDGE            : label = "Sandy Bridge"; break;
+    case CPU::NEHALEM                 : label = "Nehalem"; break;
+    case CPU::CORE2                   : label = "Core2"; break;
+    case CPU::CORE1                   : label = "Core"; break;
+    case CPU::ARM                     : label = "Arm"; break;
+    case CPU::UNKNOWN                 : label = "Unknown CPU"; break;
+    }
+    return label;
+  }
+
+#if !defined(__ARM_NEON)
+  /* constants to access destination registers of CPUID instruction */
+  static const int EAX = 0;
+  static const int EBX = 1;
+  static const int ECX = 2;
+  static const int EDX = 3;
+
+  /* cpuid[eax=1].ecx */
+  static const int CPU_FEATURE_BIT_SSE3   = 1 << 0;
+  static const int CPU_FEATURE_BIT_SSSE3  = 1 << 9;
+  static const int CPU_FEATURE_BIT_FMA3   = 1 << 12;
+  static const int CPU_FEATURE_BIT_SSE4_1 = 1 << 19;
+  static const int CPU_FEATURE_BIT_SSE4_2 = 1 << 20;
+  //static const int CPU_FEATURE_BIT_MOVBE  = 1 << 22;
+  static const int CPU_FEATURE_BIT_POPCNT = 1 << 23;
+  //static const int CPU_FEATURE_BIT_XSAVE  = 1 << 26;
+  static const int CPU_FEATURE_BIT_OXSAVE = 1 << 27;
+  static const int CPU_FEATURE_BIT_AVX    = 1 << 28;
+  static const int CPU_FEATURE_BIT_F16C   = 1 << 29;
+  static const int CPU_FEATURE_BIT_RDRAND = 1 << 30;
+
+  /* cpuid[eax=1].edx */
+  static const int CPU_FEATURE_BIT_SSE  = 1 << 25;
+  static const int CPU_FEATURE_BIT_SSE2 = 1 << 26;
+
+  /* cpuid[eax=0x80000001].ecx */
+  static const int CPU_FEATURE_BIT_LZCNT = 1 << 5;
+
+  /* cpuid[eax=7,ecx=0].ebx (entries are not strictly in bit order: AVX512IFMA, bit 21, is listed last) */
+  static const int CPU_FEATURE_BIT_BMI1    = 1 << 3;
+  static const int CPU_FEATURE_BIT_AVX2    = 1 << 5;
+  static const int CPU_FEATURE_BIT_BMI2    = 1 << 8;
+  static const int CPU_FEATURE_BIT_AVX512F = 1 << 16;     // AVX512F  (foundation)
+  static const int CPU_FEATURE_BIT_AVX512DQ = 1 << 17;    // AVX512DQ (doubleword and quadword instructions)
+  static const int CPU_FEATURE_BIT_AVX512PF = 1 << 26;    // AVX512PF (prefetch gather/scatter instructions)
+  static const int CPU_FEATURE_BIT_AVX512ER = 1 << 27;    // AVX512ER (exponential and reciprocal instructions)
+  static const int CPU_FEATURE_BIT_AVX512CD = 1 << 28;    // AVX512CD (conflict detection instructions)
+  static const int CPU_FEATURE_BIT_AVX512BW = 1 << 30;    // AVX512BW (byte and word instructions)
+  static const int CPU_FEATURE_BIT_AVX512VL = 1 << 31;    // AVX512VL (vector length extensions)
+  static const int CPU_FEATURE_BIT_AVX512IFMA = 1 << 21;  // AVX512IFMA (integer fused multiply-add instructions)
+
+  /* cpuid[eax=7,ecx=0].ecx */
+  static const int CPU_FEATURE_BIT_AVX512VBMI = 1 << 1;   // AVX512VBMI (vector bit manipulation instructions)
+#endif
+
+#if !defined(__ARM_NEON)
+  __noinline int64_t get_xcr0()
+  {
+    // reads XCR0 by executing xgetbv with index 0; see https://github.com/opencv/opencv/blob/master/modules/core/src/system.cpp#L466
+#if defined (__WIN32__) && defined(_XCR_XFEATURE_ENABLED_MASK)
+    int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32
+    xcr0 = _xgetbv(0);
+    return xcr0;
+#else
+    int xcr0 = 0;
+    __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); // only EAX (the low 32 bits) is kept; EDX is declared as clobbered
+    return xcr0;
+#endif
+  }
+#endif
+
+  int getCPUFeatures()
+  { // returns a bit mask of CPU_FEATURE_* flags: fixed at compile time on NEON builds, detected via CPUID otherwise
+#if defined(__ARM_NEON)
+      int cpu_features = CPU_FEATURE_NEON|CPU_FEATURE_SSE|CPU_FEATURE_SSE2;
+#if defined(NEON_AVX2_EMULATION)
+      cpu_features |= CPU_FEATURE_SSE3|CPU_FEATURE_SSSE3|CPU_FEATURE_SSE42;
+      cpu_features |= CPU_FEATURE_XMM_ENABLED;
+      cpu_features |= CPU_FEATURE_YMM_ENABLED;
+      cpu_features |= CPU_FEATURE_SSE41 | CPU_FEATURE_RDRAND | CPU_FEATURE_F16C;
+      cpu_features |= CPU_FEATURE_POPCNT;
+      cpu_features |= CPU_FEATURE_AVX;
+      cpu_features |= CPU_FEATURE_AVX2;
+      cpu_features |= CPU_FEATURE_FMA3;
+      cpu_features |= CPU_FEATURE_LZCNT;
+      cpu_features |= CPU_FEATURE_BMI1;
+      cpu_features |= CPU_FEATURE_BMI2;
+      cpu_features |= CPU_FEATURE_NEON_2X;
+
+
+ 
+#endif
+     return cpu_features;
+      
+#else
+    /* cache CPU features access: a non-zero result from an earlier call is returned as is */
+    static int cpu_features = 0;
+    if (cpu_features)
+      return cpu_features;
+
+    /* get number of CPUID leaves */
+    int cpuid_leaf0[4];
+    __cpuid(cpuid_leaf0, 0x00000000);
+    unsigned nIds = cpuid_leaf0[EAX];
+
+    /* get number of extended CPUID leaves */
+    int cpuid_leafe[4];
+    __cpuid(cpuid_leafe, 0x80000000);
+    unsigned nExIds = cpuid_leafe[EAX];
+
+    /* get CPUID leaves for EAX = 1,7, and 0x80000001 */
+    int cpuid_leaf_1[4] = { 0,0,0,0 };
+    int cpuid_leaf_7[4] = { 0,0,0,0 };
+    int cpuid_leaf_e1[4] = { 0,0,0,0 };
+    if (nIds >= 1) __cpuid (cpuid_leaf_1,0x00000001);
+#if _WIN32
+#if _MSC_VER && (_MSC_FULL_VER < 160040219) // these MSVC versions skip the leaf 7 query, leaving cpuid_leaf_7 zeroed
+#else
+    if (nIds >= 7) __cpuidex(cpuid_leaf_7,0x00000007,0);
+#endif
+#else
+    if (nIds >= 7) __cpuid_count(cpuid_leaf_7,0x00000007,0);
+#endif
+    if (nExIds >= 0x80000001) __cpuid(cpuid_leaf_e1,0x80000001);
+
+    /* detect if OS saves XMM, YMM, and ZMM states */
+    bool xmm_enabled = true;
+    bool ymm_enabled = false;
+    bool zmm_enabled = false;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_OXSAVE) {
+      int64_t xcr0 = get_xcr0();
+      xmm_enabled = ((xcr0 & 0x02) == 0x02);                /* checks if xmm are enabled in XCR0 */
+      ymm_enabled = xmm_enabled && ((xcr0 & 0x04) == 0x04); /* checks if ymm state are enabled in XCR0 */
+      zmm_enabled = ymm_enabled && ((xcr0 & 0xE0) == 0xE0); /* checks if OPMASK state, upper 256-bit of ZMM0-ZMM15 and ZMM16-ZMM31 state are enabled in XCR0 */
+    }
+    if (xmm_enabled) cpu_features |= CPU_FEATURE_XMM_ENABLED;
+    if (ymm_enabled) cpu_features |= CPU_FEATURE_YMM_ENABLED;
+    if (zmm_enabled) cpu_features |= CPU_FEATURE_ZMM_ENABLED;
+
+    if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE   ) cpu_features |= CPU_FEATURE_SSE;
+    if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE2  ) cpu_features |= CPU_FEATURE_SSE2;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE3  ) cpu_features |= CPU_FEATURE_SSE3;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSSE3 ) cpu_features |= CPU_FEATURE_SSSE3;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_1) cpu_features |= CPU_FEATURE_SSE41;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_2) cpu_features |= CPU_FEATURE_SSE42;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_POPCNT) cpu_features |= CPU_FEATURE_POPCNT;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_AVX   ) cpu_features |= CPU_FEATURE_AVX;
+
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_F16C  ) cpu_features |= CPU_FEATURE_F16C;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_RDRAND) cpu_features |= CPU_FEATURE_RDRAND;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX2  ) cpu_features |= CPU_FEATURE_AVX2;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_FMA3  ) cpu_features |= CPU_FEATURE_FMA3;
+    if (cpuid_leaf_e1[ECX] & CPU_FEATURE_BIT_LZCNT) cpu_features |= CPU_FEATURE_LZCNT;
+    if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI1 ) cpu_features |= CPU_FEATURE_BMI1;
+    if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI2 ) cpu_features |= CPU_FEATURE_BMI2;
+
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512F   ) cpu_features |= CPU_FEATURE_AVX512F;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512DQ  ) cpu_features |= CPU_FEATURE_AVX512DQ;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512PF  ) cpu_features |= CPU_FEATURE_AVX512PF;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512ER  ) cpu_features |= CPU_FEATURE_AVX512ER;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512CD  ) cpu_features |= CPU_FEATURE_AVX512CD;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512BW  ) cpu_features |= CPU_FEATURE_AVX512BW;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512IFMA) cpu_features |= CPU_FEATURE_AVX512IFMA;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512VL  ) cpu_features |= CPU_FEATURE_AVX512VL;
+    if (cpuid_leaf_7[ECX] & CPU_FEATURE_BIT_AVX512VBMI) cpu_features |= CPU_FEATURE_AVX512VBMI;
+
+    return cpu_features;
+#endif
+  }
+
+  std::string stringOfCPUFeatures(int features) {
+    std::string str; // names are appended in the fixed order below, each followed by a space
+    auto append_if = [&](int mask, const char* label) { if (features & mask) str += label; };
+    append_if(CPU_FEATURE_XMM_ENABLED, "XMM ");
+    append_if(CPU_FEATURE_YMM_ENABLED, "YMM ");
+    append_if(CPU_FEATURE_ZMM_ENABLED, "ZMM ");
+    append_if(CPU_FEATURE_SSE, "SSE ");
+    append_if(CPU_FEATURE_SSE2, "SSE2 ");
+    append_if(CPU_FEATURE_SSE3, "SSE3 ");
+    append_if(CPU_FEATURE_SSSE3, "SSSE3 ");
+    append_if(CPU_FEATURE_SSE41, "SSE4.1 ");
+    append_if(CPU_FEATURE_SSE42, "SSE4.2 ");
+    append_if(CPU_FEATURE_POPCNT, "POPCNT ");
+    append_if(CPU_FEATURE_AVX, "AVX ");
+    append_if(CPU_FEATURE_F16C, "F16C ");
+    append_if(CPU_FEATURE_RDRAND, "RDRAND ");
+    append_if(CPU_FEATURE_AVX2, "AVX2 ");
+    append_if(CPU_FEATURE_FMA3, "FMA3 ");
+    append_if(CPU_FEATURE_LZCNT, "LZCNT ");
+    append_if(CPU_FEATURE_BMI1, "BMI1 ");
+    append_if(CPU_FEATURE_BMI2, "BMI2 ");
+    append_if(CPU_FEATURE_AVX512F, "AVX512F ");
+    append_if(CPU_FEATURE_AVX512DQ, "AVX512DQ ");
+    append_if(CPU_FEATURE_AVX512PF, "AVX512PF ");
+    append_if(CPU_FEATURE_AVX512ER, "AVX512ER ");
+    append_if(CPU_FEATURE_AVX512CD, "AVX512CD ");
+    append_if(CPU_FEATURE_AVX512BW, "AVX512BW ");
+    append_if(CPU_FEATURE_AVX512VL, "AVX512VL ");
+    append_if(CPU_FEATURE_AVX512IFMA, "AVX512IFMA ");
+    append_if(CPU_FEATURE_AVX512VBMI, "AVX512VBMI ");
+    append_if(CPU_FEATURE_NEON, "NEON ");
+    append_if(CPU_FEATURE_NEON_2X, "2xNEON ");
+    return str;
+  }
+
+  /*! Converts an ISA bitvector into its name. The value has to match one of
+   *  the ISA constants exactly; any other value yields "UNKNOWN". */
+  std::string stringOfISA (int isa)
+  {
+    if (isa == SSE) return "SSE";
+    if (isa == SSE2) return "SSE2";
+    if (isa == SSE3) return "SSE3";
+    if (isa == SSSE3) return "SSSE3";
+    if (isa == SSE41) return "SSE4.1";
+    if (isa == SSE42) return "SSE4.2";
+    if (isa == AVX) return "AVX";
+    /* AVXI is a selectable ISA and is listed by supportedTargetList, so name it here too */
+    if (isa == AVXI) return "AVXI";
+    if (isa == AVX2) return "AVX2";
+    if (isa == AVX512KNL) return "AVX512KNL";
+    if (isa == AVX512SKX) return "AVX512SKX";
+    if (isa == NEON) return "NEON";
+    if (isa == NEON_2X) return "2xNEON";
+    return "UNKNOWN";
+  }
+
+  /*! Tests whether every feature bit required by `isa` is present in `features`. */
+  bool hasISA(int features, int isa)
+  {
+    const int missing = isa & ~features;
+    return missing == 0;
+  }
+
+  /*! Lists the names of all ISAs whose complete feature set is contained in
+   *  `features`. Each name is followed by a single blank. */
+  std::string supportedTargetList (int features)
+  {
+    /* (ISA bitvector, printed name) pairs, listed in output order */
+    static const struct { int isa; const char* name; } targets[] = {
+      { SSE,       "SSE " },
+      { SSE2,      "SSE2 " },
+      { SSE3,      "SSE3 " },
+      { SSSE3,     "SSSE3 " },
+      { SSE41,     "SSE4.1 " },
+      { SSE42,     "SSE4.2 " },
+      { AVX,       "AVX " },
+      { AVXI,      "AVXI " },
+      { AVX2,      "AVX2 " },
+      { AVX512KNL, "AVX512KNL " },
+      { AVX512SKX, "AVX512SKX " },
+      { NEON,      "NEON " },
+      { NEON_2X,   "2xNEON " },
+    };
+
+    std::string list;
+    for (const auto& target : targets) {
+      if (hasISA(features,target.isa))
+        list += target.name;
+    }
+    return list;
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <psapi.h>
+
+namespace embree
+{
+  /*! Returns the path of the running executable as reported by
+   *  GetModuleFileName, or an empty string when that call fails. */
+  std::string getExecutableFileName()
+  {
+    char path[1024];
+    const DWORD length = GetModuleFileName(nullptr, path, sizeof(path));
+    return length ? std::string(path) : std::string();
+  }
+
+  /*! Returns the number of logical processors. The processor-group API is
+   *  looked up at runtime and used when available, so that systems with
+   *  more than one processor group are counted completely; otherwise the
+   *  count comes from GetSystemInfo. The result is cached after the first
+   *  call. */
+  unsigned int getNumberOfLogicalThreads()
+  {
+    static int nThreads = -1;
+    if (nThreads != -1) return nThreads;
+
+    typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)();
+    typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD);
+
+    /* Kernel32 is already mapped into the process; GetModuleHandle does not
+       take an extra module reference, so there is nothing to release. */
+    HMODULE hlib = GetModuleHandle("Kernel32");
+    GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = nullptr;
+    GetActiveProcessorCountFunc      pGetActiveProcessorCount      = nullptr;
+    if (hlib)
+    {
+      pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount");
+      pGetActiveProcessorCount      = (GetActiveProcessorCountFunc)     GetProcAddress(hlib, "GetActiveProcessorCount");
+    }
+
+    if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount)
+    {
+      /* sum the active processors of all processor groups */
+      int groups = pGetActiveProcessorGroupCount();
+      int totalProcessors = 0;
+      for (int i = 0; i < groups; i++)
+        totalProcessors += pGetActiveProcessorCount((WORD)i);
+      nThreads = totalProcessors;
+    }
+    else
+    {
+      /* fallback when the processor-group functions could not be resolved */
+      SYSTEM_INFO sysinfo;
+      GetSystemInfo(&sysinfo);
+      nThreads = sysinfo.dwNumberOfProcessors;
+    }
+    assert(nThreads);
+    return nThreads;
+  }
+
+  /*! Returns the width of the console screen buffer in characters. Falls
+   *  back to 80 columns when no console information is available (for
+   *  example when the standard output handle is missing or the console
+   *  query fails), matching the fallback of the Unix implementation. */
+  int getTerminalWidth()
+  {
+    const int fallbackWidth = 80;
+
+    HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE);
+    if (handle == INVALID_HANDLE_VALUE || handle == nullptr) return fallbackWidth;
+
+    CONSOLE_SCREEN_BUFFER_INFO info;
+    memset(&info,0,sizeof(info));
+    /* without this check a failed query would report a width of 0 */
+    if (!GetConsoleScreenBufferInfo(handle, &info)) return fallbackWidth;
+    return info.dwSize.X;
+  }
+
+  /*! Returns the current performance counter value converted to seconds. */
+  double getSeconds()
+  {
+    LARGE_INTEGER ticksPerSecond, ticks;
+    QueryPerformanceFrequency(&ticksPerSecond);
+    QueryPerformanceCounter(&ticks);
+
+    const double counter   = (double)ticks.QuadPart;
+    const double frequency = (double)ticksPerSecond.QuadPart;
+    return counter / frequency;
+  }
+
+  /*! Suspends the calling thread for t seconds (passed to Sleep in milliseconds). */
+  void sleepSeconds(double t)
+  {
+    const DWORD milliseconds = DWORD(1000.0*t);
+    Sleep(milliseconds);
+  }
+
+  /*! Returns the QuotaPeakPagedPoolUsage counter of the current process, or
+   *  0 when the process memory counters cannot be queried.
+   *  NOTE(review): the function name suggests virtual memory, yet the value
+   *  reported is the peak paged pool usage - confirm this is intended. */
+  size_t getVirtualMemoryBytes()
+  {
+    PROCESS_MEMORY_COUNTERS info;
+    memset(&info,0,sizeof(info));
+    info.cb = sizeof(info);
+    /* do not read the counters when the query failed */
+    if (!GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ))
+      return 0;
+    return (size_t)info.QuotaPeakPagedPoolUsage;
+  }
+
+  /*! Returns the working set size of the current process in bytes, or 0
+   *  when the process memory counters cannot be queried. */
+  size_t getResidentMemoryBytes()
+  {
+    PROCESS_MEMORY_COUNTERS info;
+    memset(&info,0,sizeof(info));
+    info.cb = sizeof(info);
+    /* do not read the counters when the query failed */
+    if (!GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ))
+      return 0;
+    return (size_t)info.WorkingSetSize;
+  }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Linux Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__LINUX__)
+
+#include <stdio.h>
+#include <unistd.h>
+
+namespace embree
+{
+  /*! Returns the target of the /proc/<pid>/exe symbolic link of this
+   *  process, or an empty string when the link cannot be read. */
+  std::string getExecutableFileName()
+  {
+    const std::string link = "/proc/" + toString(getpid()) + "/exe";
+
+    /* zero-filled buffer: readlink does not append a terminator itself */
+    char target[4096];
+    memset(target,0,sizeof(target));
+    const ssize_t length = readlink(link.c_str(), target, sizeof(target)-1);
+    if (length == -1) return std::string();
+    return std::string(target);
+  }
+
+  /*! Returns the first field of /proc/self/statm multiplied by the system
+   *  page size. Returns 0 when the file cannot be read. */
+  size_t getVirtualMemoryBytes()
+  {
+    /* zero-initialized so that a failed read yields 0 instead of garbage */
+    size_t virt = 0, resident = 0, shared = 0;
+    std::ifstream buffer("/proc/self/statm");
+    buffer >> virt >> resident >> shared;
+    return virt*sysconf(_SC_PAGE_SIZE);
+  }
+
+  /*! Returns the second field of /proc/self/statm multiplied by the system
+   *  page size. Returns 0 when the file cannot be read. */
+  size_t getResidentMemoryBytes()
+  {
+    /* zero-initialized so that a failed read yields 0 instead of garbage */
+    size_t virt = 0, resident = 0, shared = 0;
+    std::ifstream buffer("/proc/self/statm");
+    buffer >> virt >> resident >> shared;
+    return resident*sysconf(_SC_PAGE_SIZE);
+  }
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// FreeBSD Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined (__FreeBSD__)
+
+#include <sys/sysctl.h>
+
+namespace embree
+{
+  /*! Returns the executable path obtained through the
+   *  CTL_KERN / KERN_PROC / KERN_PROC_PATHNAME sysctl, or an empty string
+   *  when the sysctl call fails. */
+  std::string getExecutableFileName()
+  {
+    char path[4096];
+    memset(path,0,sizeof(path));
+    size_t length = sizeof(path)-1;
+
+    const int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1 };
+    const int status = sysctl(mib, 4, path, &length, 0x0, 0);
+    return (status == -1) ? std::string() : std::string(path);
+  }
+
+  /*! Not implemented on this platform; always reports 0. */
+  size_t getVirtualMemoryBytes()
+  {
+    return size_t(0);
+  }
+
+  /*! Not implemented on this platform; always reports 0. */
+  size_t getResidentMemoryBytes()
+  {
+    return size_t(0);
+  }
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Mac OS X Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__MACOSX__)
+
+#include <mach-o/dyld.h>
+
+namespace embree
+{
+  /*! Returns the executable path reported by _NSGetExecutablePath, or an
+   *  empty string when that call reports a failure. */
+  std::string getExecutableFileName()
+  {
+    char path[4096];
+    uint32_t capacity = sizeof(path);
+    const bool ok = (_NSGetExecutablePath(path, &capacity) == 0);
+    return ok ? std::string(path) : std::string();
+  }
+
+  /*! Not implemented on this platform; always reports 0. */
+  size_t getVirtualMemoryBytes()
+  {
+    return size_t(0);
+  }
+
+  /*! Not implemented on this platform; always reports 0. */
+  size_t getResidentMemoryBytes()
+  {
+    return size_t(0);
+  }
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__)
+
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <pthread.h>
+
+namespace embree
+{
+  /*! Returns the number of logical threads available to this process. On
+   *  Mac OS X and Android the count of online processors is used; elsewhere
+   *  the size of the calling thread's affinity set is used, with the online
+   *  processor count as fallback. The result is at least 1 and is cached
+   *  after the first call. */
+  unsigned int getNumberOfLogicalThreads()
+  {
+    static int nThreads = -1;
+    if (nThreads != -1) return nThreads;
+
+    int count = -1;
+#if defined(__MACOSX__) || defined(__ANDROID__)
+    count = (int) sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container
+#else
+    cpu_set_t set;
+    if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0)
+      count = CPU_COUNT(&set);
+    else
+      count = (int) sysconf(_SC_NPROCESSORS_ONLN); // affinity query failed
+#endif
+
+    /* a failed query must never leave -1 behind: converted to the unsigned
+       return type it would report billions of threads */
+    if (count < 1) count = 1;
+    nThreads = count;
+
+    assert(nThreads);
+    return nThreads;
+  }
+
+  /*! Returns the column count of the terminal attached to standard output,
+   *  or 80 when the window size query fails. */
+  int getTerminalWidth()
+  {
+    struct winsize ws;
+    const int status = ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws);
+    return (status < 0) ? 80 : ws.ws_col;
+  }
+
+  /*! Returns the time reported by gettimeofday as seconds, including the
+   *  microsecond fraction. */
+  double getSeconds()
+  {
+    struct timeval now;
+    gettimeofday(&now,nullptr);
+
+    const double seconds  = double(now.tv_sec);
+    const double fraction = double(now.tv_usec)/1E6;
+    return seconds + fraction;
+  }
+
+  /*! Suspends the calling thread for t seconds (passed to usleep in microseconds). */
+  void sleepSeconds(double t)
+  {
+    const useconds_t microseconds = 1000000.0*t;
+    usleep(microseconds);
+  }
+}
+#endif
+
diff --git a/thirdparty/embree-aarch64/common/sys/sysinfo.h b/thirdparty/embree-aarch64/common/sys/sysinfo.h
new file mode 100644
index 0000000000..8e313a59b3
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/sysinfo.h
@@ -0,0 +1,192 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define CACHELINE_SIZE 64
+
+#if !defined(PAGE_SIZE)
+  #define PAGE_SIZE 4096
+#endif
+
+#define PAGE_SIZE_2M (2*1024*1024)
+#define PAGE_SIZE_4K (4*1024)
+
+#include "platform.h"
+
+/* define isa namespace and ISA bitvector */
+#if defined (__AVX512VL__)
+#  define isa avx512skx
+#  define ISA AVX512SKX
+#  define ISA_STR "AVX512SKX"
+#elif defined (__AVX512F__)
+#  define isa avx512knl
+#  define ISA AVX512KNL
+#  define ISA_STR "AVX512KNL"
+#elif defined (__AVX2__)
+#  define isa avx2
+#  define ISA AVX2
+#  define ISA_STR "AVX2"
+#elif defined(__AVXI__)
+#  define isa avxi
+#  define ISA AVXI
+#  define ISA_STR "AVXI"
+#elif defined(__AVX__)
+#  define isa avx
+#  define ISA AVX
+#  define ISA_STR "AVX"
+#elif defined (__SSE4_2__)
+#  define isa sse42
+#  define ISA SSE42
+#  define ISA_STR "SSE4.2"
+//#elif defined (__SSE4_1__) //  we demote this to SSE2, MacOSX code compiles with SSE41 by default with XCode 11
+//#  define isa sse41
+//#  define ISA SSE41
+//#  define ISA_STR "SSE4.1"
+//#elif defined(__SSSE3__) // we demote this to SSE2, MacOSX code compiles with SSSE3 by default with ICC
+//#  define isa ssse3
+//#  define ISA SSSE3
+//#  define ISA_STR "SSSE3"
+//#elif defined(__SSE3__) // we demote this to SSE2, MacOSX code compiles with SSE3 by default with clang
+//#  define isa sse3
+//#  define ISA SSE3
+//#  define ISA_STR "SSE3"
+#elif defined(__SSE2__) || defined(__SSE3__) || defined(__SSSE3__)
+#  define isa sse2
+#  define ISA SSE2
+#  define ISA_STR "SSE2"
+#elif defined(__SSE__)
+#  define isa sse
+#  define ISA SSE
+#  define ISA_STR "SSE"
+#elif defined(__ARM_NEON)
+// NOTE(LTE): Use sse2 for `isa` for the compatibility at the moment.
+#define isa sse2
+#define ISA NEON
+#define ISA_STR "NEON"
+#else
+#error Unknown ISA
+#endif
+
+namespace embree
+{
+  /*! CPU models distinguished by getCPUModel() and printed by
+   *  stringOfCPUModel(). UNKNOWN is the last enumerator. */
+  enum class CPU
+  {
+    XEON_ICE_LAKE,
+    CORE_ICE_LAKE,
+    CORE_TIGER_LAKE,
+    CORE_COMET_LAKE,
+    CORE_CANNON_LAKE,
+    CORE_KABY_LAKE,
+    XEON_SKY_LAKE,
+    CORE_SKY_LAKE,
+    XEON_PHI_KNIGHTS_MILL,
+    XEON_PHI_KNIGHTS_LANDING,
+    XEON_BROADWELL,
+    CORE_BROADWELL,
+    XEON_HASWELL,
+    CORE_HASWELL,
+    XEON_IVY_BRIDGE,
+    CORE_IVY_BRIDGE,
+    SANDY_BRIDGE,
+    NEHALEM,
+    CORE2,
+    CORE1,
+    ARM,
+    UNKNOWN,
+  };
+  
+  /*! get the full path to the running executable */
+  std::string getExecutableFileName();
+
+  /*! return platform name */
+  std::string getPlatformName();
+
+  /*! get the full name of the compiler */
+  std::string getCompilerName();
+
+  /*! return the vendor name of the CPU */
+  std::string getCPUVendor();
+
+  /*! get microprocessor model */
+  CPU getCPUModel(); 
+
+  /*! converts CPU model into string */
+  std::string stringOfCPUModel(CPU model);
+
+  /*! CPU features */
+  static const int CPU_FEATURE_SSE    = 1 << 0;
+  static const int CPU_FEATURE_SSE2   = 1 << 1;
+  static const int CPU_FEATURE_SSE3   = 1 << 2;
+  static const int CPU_FEATURE_SSSE3  = 1 << 3;
+  static const int CPU_FEATURE_SSE41  = 1 << 4;
+  static const int CPU_FEATURE_SSE42  = 1 << 5;
+  static const int CPU_FEATURE_POPCNT = 1 << 6;
+  static const int CPU_FEATURE_AVX    = 1 << 7;
+  static const int CPU_FEATURE_F16C   = 1 << 8;
+  static const int CPU_FEATURE_RDRAND = 1 << 9;
+  static const int CPU_FEATURE_AVX2   = 1 << 10;
+  static const int CPU_FEATURE_FMA3   = 1 << 11;
+  static const int CPU_FEATURE_LZCNT  = 1 << 12;
+  static const int CPU_FEATURE_BMI1   = 1 << 13;
+  static const int CPU_FEATURE_BMI2   = 1 << 14;
+  static const int CPU_FEATURE_AVX512F = 1 << 16;
+  static const int CPU_FEATURE_AVX512DQ = 1 << 17;
+  static const int CPU_FEATURE_AVX512PF = 1 << 18;
+  static const int CPU_FEATURE_AVX512ER = 1 << 19;
+  static const int CPU_FEATURE_AVX512CD = 1 << 20;
+  static const int CPU_FEATURE_AVX512BW = 1 << 21;
+  static const int CPU_FEATURE_AVX512VL = 1 << 22;
+  static const int CPU_FEATURE_AVX512IFMA = 1 << 23;
+  static const int CPU_FEATURE_AVX512VBMI = 1 << 24;
+  static const int CPU_FEATURE_XMM_ENABLED = 1 << 25;
+  static const int CPU_FEATURE_YMM_ENABLED = 1 << 26;
+  static const int CPU_FEATURE_ZMM_ENABLED = 1 << 27;
+  static const int CPU_FEATURE_NEON = 1 << 28;
+  static const int CPU_FEATURE_NEON_2X = 1 << 29;
+
+  /*! get CPU features */
+  int getCPUFeatures();
+
+  /*! convert CPU features into a string */
+  std::string stringOfCPUFeatures(int features);
+
+  /*! creates a string listing all targets (ISAs) supported by the given feature set */
+  std::string supportedTargetList (int isa);
+
+  /*! ISAs */
+  static const int SSE    = CPU_FEATURE_SSE | CPU_FEATURE_XMM_ENABLED;
+  static const int SSE2   = SSE | CPU_FEATURE_SSE2;
+  static const int SSE3   = SSE2 | CPU_FEATURE_SSE3;
+  static const int SSSE3  = SSE3 | CPU_FEATURE_SSSE3;
+  static const int SSE41  = SSSE3 | CPU_FEATURE_SSE41;
+  static const int SSE42  = SSE41 | CPU_FEATURE_SSE42 | CPU_FEATURE_POPCNT;
+  static const int AVX    = SSE42 | CPU_FEATURE_AVX | CPU_FEATURE_YMM_ENABLED;
+  static const int AVXI   = AVX | CPU_FEATURE_F16C | CPU_FEATURE_RDRAND;
+  static const int AVX2   = AVXI | CPU_FEATURE_AVX2 | CPU_FEATURE_FMA3 | CPU_FEATURE_BMI1 | CPU_FEATURE_BMI2 | CPU_FEATURE_LZCNT;
+  static const int AVX512KNL = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512PF | CPU_FEATURE_AVX512ER | CPU_FEATURE_AVX512CD | CPU_FEATURE_ZMM_ENABLED;
+  static const int AVX512SKX = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512DQ | CPU_FEATURE_AVX512CD | CPU_FEATURE_AVX512BW | CPU_FEATURE_AVX512VL | CPU_FEATURE_ZMM_ENABLED;
+  static const int NEON = CPU_FEATURE_NEON | CPU_FEATURE_SSE | CPU_FEATURE_SSE2;
+  static const int NEON_2X = CPU_FEATURE_NEON_2X | AVX2;
+
+  /*! converts ISA bitvector into a string */
+  std::string stringOfISA(int features);
+
+  /*! return the number of logical threads of the system */
+  unsigned int getNumberOfLogicalThreads();
+
+  /*! returns the size of the terminal window in characters */
+  int getTerminalWidth();
+
+  /*! returns performance counter in seconds */
+  double getSeconds();
+
+  /*! sleeps the specified number of seconds */
+  void sleepSeconds(double t);
+
+  /*! returns virtual address space occupied by process */
+  size_t getVirtualMemoryBytes();
+
+  /*! returns resident memory required by process */
+  size_t getResidentMemoryBytes();
+}
diff --git a/thirdparty/embree-aarch64/common/sys/thread.cpp b/thirdparty/embree-aarch64/common/sys/thread.cpp
new file mode 100644
index 0000000000..f9ea5b7d96
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/thread.cpp
@@ -0,0 +1,429 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "thread.h"
+#include "sysinfo.h"
+#include "string.h"
+
+#include <iostream>
+#if defined(__ARM_NEON)
+#include "../math/SSE2NEON.h"
+#else
+#include <xmmintrin.h>
+#endif
+
+#if defined(PTHREADS_WIN32)
+#pragma comment (lib, "pthreadVC.lib")
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+  /*! Sets the affinity of the given thread to the logical processor with
+   *  index `affinity`, counted across all processor groups. The
+   *  processor-group API is resolved at runtime; when it is unavailable the
+   *  classic single-group affinity functions are used. Failures are only
+   *  reported as warnings. If `affinity` is not smaller than the total
+   *  number of active processors, group 0 / processor 0 is selected. */
+  void setAffinity(HANDLE thread, ssize_t affinity)
+  {
+    typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)();
+    typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD);
+    typedef BOOL (WINAPI *SetThreadGroupAffinityFunc)(HANDLE, const GROUP_AFFINITY *, PGROUP_AFFINITY);
+    typedef BOOL (WINAPI *SetThreadIdealProcessorExFunc)(HANDLE, PPROCESSOR_NUMBER, PPROCESSOR_NUMBER);
+
+    /* Kernel32 is already mapped into the process. GetModuleHandle does not
+       take an additional module reference, so repeated calls of this
+       function no longer accumulate references that are never released. */
+    HMODULE hlib = GetModuleHandle("Kernel32");
+    GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = nullptr;
+    GetActiveProcessorCountFunc pGetActiveProcessorCount = nullptr;
+    SetThreadGroupAffinityFunc pSetThreadGroupAffinity = nullptr;
+    SetThreadIdealProcessorExFunc pSetThreadIdealProcessorEx = nullptr;
+    if (hlib)
+    {
+      pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount");
+      pGetActiveProcessorCount = (GetActiveProcessorCountFunc)GetProcAddress(hlib, "GetActiveProcessorCount");
+      pSetThreadGroupAffinity = (SetThreadGroupAffinityFunc)GetProcAddress(hlib, "SetThreadGroupAffinity");
+      pSetThreadIdealProcessorEx = (SetThreadIdealProcessorExFunc)GetProcAddress(hlib, "SetThreadIdealProcessorEx");
+    }
+
+    if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount && pSetThreadGroupAffinity && pSetThreadIdealProcessorEx)
+    {
+      /* translate the global processor index into (group, number within group) */
+      int groups = pGetActiveProcessorGroupCount();
+      int totalProcessors = 0, group = 0, number = 0;
+      for (int i = 0; i<groups; i++) {
+        int processors = pGetActiveProcessorCount((WORD)i);
+        if (totalProcessors + processors > affinity) {
+          group = i;
+          number = (int)affinity - totalProcessors;
+          break;
+        }
+        totalProcessors += processors;
+      }
+
+      GROUP_AFFINITY groupAffinity;
+      groupAffinity.Group = (WORD)group;
+      groupAffinity.Mask = (KAFFINITY)(uint64_t(1) << number);
+      groupAffinity.Reserved[0] = 0;
+      groupAffinity.Reserved[1] = 0;
+      groupAffinity.Reserved[2] = 0;
+      if (!pSetThreadGroupAffinity(thread, &groupAffinity, nullptr))
+        WARNING("SetThreadGroupAffinity failed"); // on purpose only a warning
+
+      PROCESSOR_NUMBER processorNumber;
+      processorNumber.Group = group;
+      processorNumber.Number = number;
+      processorNumber.Reserved = 0;
+      if (!pSetThreadIdealProcessorEx(thread, &processorNumber, nullptr))
+        WARNING("SetThreadIdealProcessorEx failed"); // on purpose only a warning
+    }
+    else
+    {
+      /* fallback without processor-group support */
+      if (!SetThreadAffinityMask(thread, DWORD_PTR(uint64_t(1) << affinity)))
+        WARNING("SetThreadAffinityMask failed"); // on purpose only a warning
+      if (SetThreadIdealProcessor(thread, (DWORD)affinity) == (DWORD)-1)
+        WARNING("SetThreadIdealProcessor failed"); // on purpose only a warning
+    }
+  }
+
+  /*! Sets the affinity of the calling thread by forwarding its pseudo
+   *  handle to the handle-based overload. */
+  void setAffinity(ssize_t affinity)
+  {
+    HANDLE self = GetCurrentThread();
+    setAffinity(self, affinity);
+  }
+
+  /*! Bundles the user function and its argument so both can be handed to
+   *  the thread entry point through a single pointer. */
+  struct ThreadStartupData
+  {
+    ThreadStartupData (thread_func f, void* arg)
+      : f(f), arg(arg)
+    {
+    }
+
+    thread_func f;  //!< function the new thread executes
+    void* arg;      //!< argument passed to f
+  };
+
+  /*! Entry point handed to CreateThread: sets the FTZ and DAZ bits of the
+   *  MXCSR register, runs the user function and then frees the startup
+   *  data it was given. */
+  DWORD WINAPI threadStartup(LPVOID ptr)
+  {
+    ThreadStartupData* startup = static_cast<ThreadStartupData*>(ptr);
+
+    const unsigned int flushToZero      = 1<<15; // FTZ
+    const unsigned int denormalsAreZero = 1<<6;  // DAZ
+    _mm_setcsr(_mm_getcsr() | flushToZero | denormalsAreZero);
+
+    startup->f(startup->arg);
+    delete startup;
+    startup = nullptr;
+    return 0;
+  }
+
+#if !defined(PTHREADS_WIN32)
+
+  /*! Creates a thread that runs f(arg). A non-negative threadID pins the
+   *  new thread to that logical processor; a negative value leaves the
+   *  affinity untouched. The returned handle has to be passed to join(). */
+  thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID)
+  {
+    ThreadStartupData* startup = new ThreadStartupData(f,arg);
+    HANDLE thread = CreateThread(nullptr, stack_size, threadStartup, startup, 0, nullptr);
+    if (thread == nullptr) {
+      /* the thread never started, so threadStartup will not free the data */
+      delete startup;
+      FATAL("CreateThread failed");
+    }
+    if (threadID >= 0) setAffinity(thread, threadID);
+    return thread_t(thread);
+  }
+
+  /*! Yields the calling thread via SwitchToThread. */
+  void yield()
+  {
+    SwitchToThread();
+  }
+
+  /*! Blocks until the given thread has terminated and then closes its handle. */
+  void join(thread_t tid)
+  {
+    const HANDLE thread = HANDLE(tid);
+    WaitForSingleObject(thread, INFINITE);
+    CloseHandle(thread);
+  }
+
+  /*! Allocates a thread local storage index and returns it wrapped in a tls_t. */
+  tls_t createTls()
+  {
+    const DWORD index = TlsAlloc();
+    return tls_t(size_t(index));
+  }
+
+  /*! Stores ptr in the given thread local storage slot of the calling thread. */
+  void setTls(tls_t tls, void* const ptr)
+  {
+    const DWORD index = DWORD(size_t(tls));
+    TlsSetValue(index, ptr);
+  }
+
+  /*! Returns the pointer stored in the given thread local storage slot of the calling thread. */
+  void* getTls(tls_t tls)
+  {
+    const DWORD index = DWORD(size_t(tls));
+    return TlsGetValue(index);
+  }
+
+  /*! Releases the thread local storage index wrapped by tls. */
+  void destroyTls(tls_t tls)
+  {
+    const DWORD index = DWORD(size_t(tls));
+    TlsFree(index);
+  }
+#endif
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Linux Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__LINUX__)
+
+#include <fstream>
+#include <sstream>
+#include <algorithm>
+
+#if defined(__ANDROID__)
+#include <pthread.h>
+#endif
+
+namespace embree
+{
+  static MutexSys mutex;
+  static std::vector<size_t> threadIDs;
+
+#if !defined(__ANDROID__) // TODO(LTE): Implement for Android target
+  /* Changes the thread ID mapping such that we first fill up all threads of
+   * one core. The mapping table is built on first use from the
+   * thread_siblings_list files below /sys/devices/system/cpu and kept in the
+   * file-level threadIDs vector; access is serialized through the file-level
+   * mutex. The mapped ID is then translated into the index of the
+   * corresponding CPU inside the affinity set of the calling thread. When no
+   * mapping or no matching CPU is found, the ID is passed through unchanged. */
+  size_t mapThreadID(size_t threadID)
+  {
+    Lock<MutexSys> lock(mutex);
+
+    /* the table is (re-)built whenever it is empty, i.e. also on every call
+       after parsing failed or the table was rejected below */
+    if (threadIDs.size() == 0)
+    {
+      /* parse thread/CPU topology */
+      for (size_t cpuID=0;;cpuID++)
+      {
+        std::fstream fs;
+        std::string cpu = std::string("/sys/devices/system/cpu/cpu") + std::to_string((long long)cpuID) + std::string("/topology/thread_siblings_list");
+        fs.open (cpu.c_str(), std::fstream::in);
+        /* the first CPU directory that cannot be opened ends the scan */
+        if (fs.fail()) break;
+
+        /* append every sibling ID not seen before, in file order */
+        /* NOTE(review): only ',' separators are skipped; if the file uses a
+           range notation such as "0-3", the text after the first number
+           would be extracted as a negative value - confirm the format */
+        int i;
+        while (fs >> i)
+        {
+          if (std::none_of(threadIDs.begin(),threadIDs.end(),[&] (int id) { return id == i; }))
+            threadIDs.push_back(i);
+          if (fs.peek() == ',')
+            fs.ignore();
+        }
+        fs.close();
+      }
+
+#if 0
+      for (size_t i=0;i<threadIDs.size();i++)
+        std::cout << i << " -> " << threadIDs[i] << std::endl;
+#endif
+
+      /* verify the mapping and do not use it if the mapping has errors */
+      /* (any duplicate entry discards the whole table; clearing it also
+         terminates both loops because size() becomes 0) */
+      for (size_t i=0;i<threadIDs.size();i++) {
+        for (size_t j=0;j<threadIDs.size();j++) {
+          if (i != j && threadIDs[i] == threadIDs[j]) {
+            threadIDs.clear();
+          }
+        }
+      }
+    }
+
+    /* re-map threadIDs if mapping is available */
+    size_t ID = threadID;
+    if (threadID < threadIDs.size())
+      ID = threadIDs[threadID];
+
+    /* find correct thread to affinitize to */
+    /* (pick the ID-th CPU that is set in the current affinity mask; ID stays
+       unchanged if the mask holds fewer CPUs or cannot be queried) */
+    cpu_set_t set;
+    if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0)
+    {
+      for (int i=0, j=0; i<CPU_SETSIZE; i++)
+      {
+        if (!CPU_ISSET(i,&set)) continue;
+
+        if (j == ID) {
+          ID = i;
+          break;
+        }
+        j++;
+      }
+    }
+
+    return ID;
+  }
+#endif
+
+  /*! Pins the calling thread to the CPU that mapThreadID() selects for the
+   *  given affinity index. The result of the affinity call is not checked.
+   *  Does nothing on Android. */
+  void setAffinity(ssize_t affinity)
+  {
+#if !defined(__ANDROID__)
+    const size_t cpu = mapThreadID(affinity);
+
+    cpu_set_t cpus;
+    CPU_ZERO(&cpus);
+    CPU_SET(cpu, &cpus);
+    pthread_setaffinity_np(pthread_self(), sizeof(cpus), &cpus);
+#else
+    // TODO(LTE): Implement
+#endif
+  }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// FreeBSD Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__FreeBSD__)
+
+#include <pthread_np.h>
+
+namespace embree
+{
+  /*! Pins the calling thread to the CPU with the given index. The result
+   *  of the affinity call is not checked. */
+  void setAffinity(ssize_t affinity)
+  {
+    cpuset_t cpus;
+    CPU_ZERO(&cpus);
+    CPU_SET(affinity, &cpus);
+
+    const pthread_t self = pthread_self();
+    pthread_setaffinity_np(self, sizeof(cpus), &cpus);
+  }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// MacOSX Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__MACOSX__)
+
+#include <mach/thread_act.h>
+#include <mach/thread_policy.h>
+#include <mach/mach_init.h>
+
+namespace embree
+{
+  /*! Applies a thread affinity policy with the given affinity tag to the
+   *  calling thread. A failure is only reported as a warning. */
+  void setAffinity(ssize_t affinity)
+  {
+    thread_affinity_policy policy;
+    policy.affinity_tag = affinity;
+
+    const kern_return_t status = thread_policy_set(mach_thread_self(),THREAD_AFFINITY_POLICY,(thread_policy_t)&policy,THREAD_AFFINITY_POLICY_COUNT);
+    if (status != KERN_SUCCESS)
+      WARNING("setting thread affinity failed"); // on purpose only a warning
+  }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__) || defined(PTHREADS_WIN32)
+
+#include <pthread.h>
+#include <sched.h>
+
+#if defined(__USE_NUMA__)
+#include <numa.h>
+#endif
+
+namespace embree
+{
+  /*! Bundles the user function, its argument and the requested affinity so
+   *  they can be handed to the thread entry point through a single pointer. */
+  struct ThreadStartupData
+  {
+  public:
+    /* affinity is taken as ssize_t, matching both the member and the value
+       passed by createThread, so it is not narrowed through int */
+    ThreadStartupData (thread_func f, void* arg, ssize_t affinity)
+      : f(f), arg(arg), affinity(affinity) {}
+  public:
+    thread_func f;     //!< function the new thread executes
+    void* arg;         //!< argument passed to f
+    ssize_t affinity;  //!< requested logical thread, negative for none
+  };
+
+  /*! Entry point handed to pthread_create: sets the FTZ and DAZ bits of the
+   *  MXCSR register, applies the affinity on Mac OS X, runs the user
+   *  function and then frees the startup data. The parameter is declared as
+   *  void* so that the function has exactly the type pthread_create calls
+   *  it through. */
+  static void* threadStartup(void* ptr)
+  {
+    ThreadStartupData* parg = static_cast<ThreadStartupData*>(ptr);
+    _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6));
+
+    /*! Mac OS X does not support setting affinity at thread creation time */
+#if defined(__MACOSX__)
+    if (parg->affinity >= 0)
+      setAffinity(parg->affinity);
+#endif
+
+    parg->f(parg->arg);
+    delete parg;
+    parg = nullptr;
+    return nullptr;
+  }
+
+  /*! Creates a thread that runs f(arg). A positive stack_size is applied to
+   *  the thread attributes; 0 keeps the default. A non-negative threadID
+   *  requests pinning to that logical thread (done here on Linux and
+   *  FreeBSD, inside the new thread on Mac OS X). The returned handle owns
+   *  a heap-allocated pthread_t that join() releases. */
+  thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID)
+  {
+    /* set stack size */
+    pthread_attr_t attr;
+    pthread_attr_init(&attr);
+    if (stack_size > 0) pthread_attr_setstacksize (&attr, stack_size);
+
+    /* create thread */
+    pthread_t* tid = new pthread_t;
+    ThreadStartupData* startup = new ThreadStartupData(f,arg,threadID);
+    if (pthread_create(tid,&attr,(void*(*)(void*))threadStartup,startup) != 0) {
+      pthread_attr_destroy(&attr);
+      /* the thread never started, so threadStartup will not free the data */
+      delete startup;
+      delete tid;
+      FATAL("pthread_create failed");
+    }
+    pthread_attr_destroy(&attr);
+
+    /* set affinity */
+#if defined(__LINUX__) && !defined(__ANDROID__)
+    if (threadID >= 0) {
+      cpu_set_t cset;
+      CPU_ZERO(&cset);
+      threadID = mapThreadID(threadID);
+      CPU_SET(threadID, &cset);
+      pthread_setaffinity_np(*tid, sizeof(cset), &cset);
+    }
+#elif defined(__FreeBSD__)
+    if (threadID >= 0) {
+      cpuset_t cset;
+      CPU_ZERO(&cset);
+      CPU_SET(threadID, &cset);
+      pthread_setaffinity_np(*tid, sizeof(cset), &cset);
+    }
+#endif
+
+    return thread_t(tid);
+  }
+
+  /*! Yields the calling thread via sched_yield. */
+  void yield()
+  {
+    sched_yield();
+  }
+
+  /*! Blocks until the given thread has terminated, then frees the
+   *  pthread_t behind the handle. */
+  void join(thread_t tid)
+  {
+    pthread_t* handle = (pthread_t*)tid;
+    const int status = pthread_join(*handle, nullptr);
+    if (status != 0)
+      FATAL("pthread_join failed");
+    delete handle;
+  }
+
+  /*! Creates a pthread key without destructor and returns it as a
+   *  heap-allocated handle; the handle is released by destroyTls(). */
+  tls_t createTls()
+  {
+    pthread_key_t* key = new pthread_key_t;
+    const int status = pthread_key_create(key,nullptr);
+    if (status != 0) {
+      delete key;
+      FATAL("pthread_key_create failed");
+    }
+    return tls_t(key);
+  }
+
+  /*! Returns the pointer the calling thread stored under the given key. */
+  void* getTls(tls_t tls)
+  {
+    assert(tls);
+    pthread_key_t* key = (pthread_key_t*)tls;
+    return pthread_getspecific(*key);
+  }
+
+  /*! Stores ptr under the given key for the calling thread. */
+  void setTls(tls_t tls, void* const ptr)
+  {
+    assert(tls);
+    pthread_key_t* key = (pthread_key_t*)tls;
+    const int status = pthread_setspecific(*key, ptr);
+    if (status != 0)
+      FATAL("pthread_setspecific failed");
+  }
+
+  /*! Deletes the pthread key and frees the handle created by createTls(). */
+  void destroyTls(tls_t tls)
+  {
+    assert(tls);
+    pthread_key_t* key = (pthread_key_t*)tls;
+    const int status = pthread_key_delete(*key);
+    if (status != 0)
+      FATAL("pthread_key_delete failed");
+    delete key;
+  }
+}
+
+#endif
diff --git a/thirdparty/embree-aarch64/common/sys/thread.h b/thirdparty/embree-aarch64/common/sys/thread.h
new file mode 100644
index 0000000000..45da6e6a70
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/thread.h
@@ -0,0 +1,46 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include "mutex.h"
+#include "alloc.h"
+#include "vector.h"
+#include <vector>
+
+namespace embree
+{
+  /*! opaque handle type for a thread, returned by createThread and consumed by join */
+  typedef struct opaque_thread_t* thread_t;
+
+  /*! signature of thread start function; receives the arg pointer given to createThread */
+  typedef void (*thread_func)(void*);
+
+  /*! creates a thread running f(arg); stack_size defaults to 0 and threadID
+   *  to -1 (a negative threadID requests no pinning to a logical thread) */
+  thread_t createThread(thread_func f, void* arg, size_t stack_size = 0, ssize_t threadID = -1);
+
+  /*! set affinity of the calling thread */
+  void setAffinity(ssize_t affinity);
+
+  /*! the thread calling this function gets yielded */
+  void yield();
+
+  /*! waits until the given thread has terminated */
+  void join(thread_t tid);
+
+  /*! opaque handle type for a thread local storage slot */
+  typedef struct opaque_tls_t* tls_t;
+
+  /*! creates thread local storage */
+  tls_t createTls();
+
+  /*! set the thread local storage pointer */
+  void setTls(tls_t tls, void* const ptr);
+
+  /*! return the thread local storage pointer */
+  void* getTls(tls_t tls);
+
+  /*! destroys thread local storage identifier */
+  void destroyTls(tls_t tls);
+}
diff --git a/thirdparty/embree-aarch64/common/sys/vector.h b/thirdparty/embree-aarch64/common/sys/vector.h
new file mode 100644
index 0000000000..e41794de7c
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/vector.h
@@ -0,0 +1,242 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "alloc.h"
+#include <algorithm>
+
+namespace embree
+{
+   template<typename T, typename allocator>
+    class vector_t // growable array of T; storage is obtained from and returned to `allocator`
+    {
+    public:
+      typedef T value_type;
+      typedef T* iterator;             // iterators are plain pointers into the item array
+      typedef const T* const_iterator;
+    
+      __forceinline vector_t () // empty vector, nothing allocated
+        : size_active(0), size_alloced(0), items(nullptr) {}
+    
+      __forceinline explicit vector_t (size_t sz) // sz value-initialized elements, capacity == sz
+        : size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); }
+    
+      template<typename M>
+      __forceinline explicit vector_t (M alloc, size_t sz) // as above, with the allocator member constructed from alloc
+      : alloc(alloc), size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); }
+    
+      __forceinline ~vector_t() {
+        clear(); // destroys all elements and releases the storage
+      }
+    
+      __forceinline vector_t (const vector_t& other) // NOTE: the allocator member is default-constructed, not copied from other
+      {
+        size_active = other.size_active;
+        size_alloced = other.size_alloced;
+        items = alloc.allocate(size_alloced); // same capacity as other, requested even when it is 0
+        for (size_t i=0; i<size_active; i++) 
+          ::new (&items[i]) value_type(other.items[i]);
+      }
+    
+      __forceinline vector_t (vector_t&& other) // takes over allocator and storage; other is left empty
+        : alloc(std::move(other.alloc))
+      {
+        size_active = other.size_active; other.size_active = 0;
+        size_alloced = other.size_alloced; other.size_alloced = 0;
+        items = other.items; other.items = nullptr;
+      }
+
+      __forceinline vector_t& operator=(const vector_t& other) // copies the elements only; this->alloc is left untouched
+      {
+        resize(other.size_active);
+        for (size_t i=0; i<size_active; i++)
+          items[i] = value_type(other.items[i]); // assigns from a temporary copy of the source element
+        return *this;
+      }
+
+      __forceinline vector_t& operator=(vector_t&& other) // releases own contents, then takes over other's allocator and storage
+      {
+        clear();
+        alloc = std::move(other.alloc);
+        size_active = other.size_active; other.size_active = 0;
+        size_alloced = other.size_alloced; other.size_alloced = 0;
+        items = other.items; other.items = nullptr;
+        return *this;
+      }
+
+      /********************** Iterators  ****************************/
+    
+      __forceinline       iterator begin()       { return items; };
+      __forceinline const_iterator begin() const { return items; };
+
+      __forceinline       iterator end  ()       { return items+size_active; }; // one past the last active element
+      __forceinline const_iterator end  () const { return items+size_active; };
+
+
+      /********************** Capacity ****************************/
+
+      __forceinline bool   empty    () const { return size_active == 0; }
+      __forceinline size_t size     () const { return size_active; }  // number of constructed elements
+      __forceinline size_t capacity () const { return size_alloced; } // number of elements the storage can hold
+
+
+      __forceinline void resize(size_t new_size) { // grows capacity by doubling when needed; added elements are default-initialized
+        internal_resize(new_size,internal_grow_size(new_size));
+      }
+
+      __forceinline void reserve(size_t new_alloced) 
+      {
+        /* do nothing if container already large enough */
+        if (new_alloced <= size_alloced) 
+          return;
+
+        /* resize exact otherwise */
+        internal_resize(size_active,new_alloced);
+      }
+
+      __forceinline void shrink_to_fit() { // reallocates so that capacity equals the current size
+        internal_resize(size_active,size_active);
+      }
+
+      /******************** Element access **************************/
+
+      __forceinline       T& operator[](size_t i)       { assert(i < size_active); return items[i]; } // bounds are checked by assert only
+      __forceinline const T& operator[](size_t i) const { assert(i < size_active); return items[i]; }
+
+      __forceinline       T& at(size_t i)       { assert(i < size_active); return items[i]; } // same check as operator[]; nothing is thrown here
+      __forceinline const T& at(size_t i) const { assert(i < size_active); return items[i]; }
+
+      __forceinline T& front() const { assert(size_active > 0); return items[0]; }; // NOTE(review): const member returning a mutable reference
+      __forceinline T& back () const { assert(size_active > 0); return items[size_active-1]; }; // NOTE(review): same as front()
+
+      __forceinline       T* data()       { return items; };
+      __forceinline const T* data() const { return items; };
+
+     
+      /******************** Modifiers **************************/
+
+      __forceinline void push_back(const T& nt) 
+      {
+        const T v = nt; // need local copy as input reference could point to this vector
+        internal_resize(size_active,internal_grow_size(size_active+1)); // only ensures capacity; the size changes on the next line
+        ::new (&items[size_active++]) T(v);
+      }
+
+      __forceinline void pop_back() 
+      {
+        assert(!empty());
+        size_active--;
+        alloc.destroy(&items[size_active]); // the capacity is kept
+      }
+
+      __forceinline void clear() 
+      {
+        /* destroy elements */
+        for (size_t i=0; i<size_active; i++)
+          alloc.destroy(&items[i]);
+        
+        /* free memory */
+        alloc.deallocate(items,size_alloced); // also reached with items == nullptr when nothing was allocated
+        items = nullptr;
+        size_active = size_alloced = 0;
+      }
+
+    /******************** Comparisons **************************/
+    
+    friend bool operator== (const vector_t& a, const vector_t& b) // equal sizes and element-wise equality (uses T's operator!=)
+    {
+      if (a.size() != b.size()) return false;
+      for (size_t i=0; i<a.size(); i++)
+        if (a[i] != b[i])
+          return false;
+      return true;
+    }
+
+    friend bool operator!= (const vector_t& a, const vector_t& b) {
+      return !(a==b);
+    }
+
+    private:
+
+      __forceinline void internal_resize_init(size_t new_active) // constructor helper: requires an empty vector, allocates exactly new_active elements
+      {
+        assert(size_active == 0); 
+        assert(size_alloced == 0);
+        assert(items == nullptr);
+        if (new_active == 0) return;
+        items = alloc.allocate(new_active);
+        for (size_t i=0; i<new_active; i++) ::new (&items[i]) T(); // value-initialization
+        size_active = new_active;
+        size_alloced = new_active;
+      }
+
+      __forceinline void internal_resize(size_t new_active, size_t new_alloced) // sets size to new_active and capacity to new_alloced
+      {
+        assert(new_active <= new_alloced); 
+
+        /* destroy elements */
+        if (new_active < size_active) 
+        {
+          for (size_t i=new_active; i<size_active; i++)
+            alloc.destroy(&items[i]);
+          size_active = new_active;
+        }
+
+        /* only reallocate if necessary */
+        if (new_alloced == size_alloced) {
+          for (size_t i=size_active; i<new_active; i++) ::new (&items[i]) T; // default-initialization (no parentheses)
+          size_active = new_active;
+          return;
+        }
+
+        /* reallocate and copy items */
+        T* old_items = items;
+        items = alloc.allocate(new_alloced);
+        for (size_t i=0; i<size_active; i++) {
+          ::new (&items[i]) T(std::move(old_items[i])); // move-construct into the new storage ...
+          alloc.destroy(&old_items[i]);                 // ... then destroy the moved-from source
+        }
+
+        for (size_t i=size_active; i<new_active; i++) {
+          ::new (&items[i]) T; // default-initialize the added tail
+        }
+
+        alloc.deallocate(old_items,size_alloced);
+        size_active = new_active;
+        size_alloced = new_alloced;
+      }
+
+      __forceinline size_t internal_grow_size(size_t new_alloced) // returns the capacity to use for holding new_alloced elements
+      {
+        /* do nothing if container already large enough */
+        if (new_alloced <= size_alloced) 
+          return size_alloced;
+
+        /* otherwise keep doubling the current capacity (starting from 1 if it is 0) until it fits */
+        size_t new_size_alloced = size_alloced;
+        while (new_size_alloced < new_alloced) {
+          new_size_alloced = std::max(size_t(1),2*new_size_alloced);
+        }
+        return new_size_alloced;
+      }
+
+    private:
+      allocator alloc;       // allocator instance used for all storage operations
+      size_t size_active;    // number of valid items
+      size_t size_alloced;   // number of items allocated
+      T* items;              // data array
+    };
+
+  /*! vector_t instantiated with std::allocator<T> (standard allocations) */
+  template<typename T>
+    using vector = vector_t<T,std::allocator<T>>;
+
+  /*! vector_t instantiated with aligned_allocator, using the natural alignment of T */
+  template<typename T>
+    using avector = vector_t<T,aligned_allocator<T,std::alignment_of<T>::value> >;
+  
+  /*! vector_t instantiated with os_allocator<T> (OS allocations) */
+  template<typename T>
+    using ovector = vector_t<T,os_allocator<T> >;
+}
diff --git a/thirdparty/embree-aarch64/common/tasking/taskscheduler.h b/thirdparty/embree-aarch64/common/tasking/taskscheduler.h
new file mode 100644
index 0000000000..9940e068d0
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/tasking/taskscheduler.h
@@ -0,0 +1,17 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#if defined(TASKING_INTERNAL)
+#  include "taskschedulerinternal.h"
+#elif defined(TASKING_GCD) && defined(BUILD_IOS)
+#  include "taskschedulergcd.h"
+#elif defined(TASKING_TBB)
+#  include "taskschedulertbb.h"
+#elif defined(TASKING_PPL)
+#  include "taskschedulerppl.h"
+#else
+#  error "no tasking system enabled"
+#endif
+
diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h b/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h
new file mode 100644
index 0000000000..d31f8bb478
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/alloc.h"
+#include "../sys/barrier.h"
+#include "../sys/thread.h"
+#include "../sys/mutex.h"
+#include "../sys/condition.h"
+#include "../sys/ref.h"
+
+#include <dispatch/dispatch.h>
+
+namespace embree
+{
+  struct TaskScheduler
+  {
+    /*! initializes the task scheduler */
+    static void create(size_t numThreads, bool set_affinity, bool start_threads);
+
+    /*! destroys the task scheduler again (nothing to do in this backend) */
+    static void destroy() {}
+
+    /* returns the ID of the current thread; forwards to threadIndex(), so each call also advances the shared counter */
+    static __forceinline size_t threadID()
+    {
+      return threadIndex();
+    }
+
+    /* returns an index in 0..threadCount-1 by advancing a shared static counter; the value is not bound to the calling thread */
+    static __forceinline size_t threadIndex()
+    {
+        currentThreadIndex = (currentThreadIndex + 1) % GCDNumThreads; // NOTE(review): plain size_t without synchronization; assumes GCDNumThreads != 0 - confirm in create()
+        return currentThreadIndex;
+    }
+
+    /* returns the total number of threads */
+    static __forceinline size_t threadCount()
+    {
+        return GCDNumThreads;
+    }
+
+    private:
+      static size_t GCDNumThreads;       // value reported by threadCount(); defined outside this header
+      static size_t currentThreadIndex;  // rotating counter behind threadIndex()
+
+  };
+
+};
+
diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp
new file mode 100644
index 0000000000..ebf656d1a0
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp
@@ -0,0 +1,426 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "taskschedulerinternal.h"
+#include "../math/math.h"
+#include "../sys/sysinfo.h"
+#include <algorithm>
+
+namespace embree
+{
+  RTC_NAMESPACE_BEGIN
+  
+  static MutexSys g_mutex; // taken by ThreadPool::setNumThreads and by TaskScheduler::instance
+  size_t TaskScheduler::g_numThreads = 0;
+  __thread TaskScheduler* TaskScheduler::g_instance = nullptr; // per-thread scheduler, created lazily by instance()
+  std::vector<Ref<TaskScheduler>> g_instance_vector; // holds a reference to every scheduler created by instance()
+  __thread TaskScheduler::Thread* TaskScheduler::thread_local_thread = nullptr; // Thread record of the calling thread, see swapThread()
+  TaskScheduler::ThreadPool* TaskScheduler::threadPool = nullptr; // created by create(), deleted by destroy()
+
+  template<typename Predicate, typename Body>
+  __forceinline void TaskScheduler::steal_loop(Thread& thread, const Predicate& pred, const Body& body) // steals work and runs body() until pred() is false
+  {
+    while (true) // the only exit is pred() returning false below
+    {
+      /*! some rounds that yield */
+      for (size_t i=0; i<32; i++)
+      {
+        /*! some spinning rounds */
+        const size_t threadCount = thread.threadCount();
+        for (size_t j=0; j<1024; j+=threadCount) // the stride is the scheduler's thread counter, so more threads mean fewer spins each
+        {
+          if (!pred()) return;
+          if (thread.scheduler->steal_from_other_threads(thread)) {
+            i=j=0; // a task was stolen: restart both round counters
+            body(); // let the caller process the stolen work
+          }
+        }
+        yield(); // give up the CPU after each batch of spinning rounds
+      }
+    }
+  }
+
+  /*! run this task: execute the closure unless it was stolen, wait for outstanding dependencies, then signal the parent */
+  void TaskScheduler::Task::run_internal (Thread& thread) // FIXME: avoid as many dll_exports as possible
+  {
+    /* try to run if not already stolen */
+    if (try_switch_state(INITIALIZED,DONE))
+    {
+      Task* prevTask = thread.task; // remember the enclosing task so it can be restored afterwards
+      thread.task = this;
+      // -- GODOT start --
+      // try {
+      // if (thread.scheduler->cancellingException == nullptr)
+          closure->execute();
+      // } catch (...) {
+      //   if (thread.scheduler->cancellingException == nullptr)
+      //     thread.scheduler->cancellingException = std::current_exception();
+      // }
+      // -- GODOT end --
+      thread.task = prevTask;
+      add_dependencies(-1); // drop the initial dependency the Task constructors start with
+    }
+
+    /* steal until all dependencies have completed */
+    steal_loop(thread,
+               [&] () { return dependencies>0; },
+               [&] () { while (thread.tasks.execute_local_internal(thread,this)); }); // run stolen work, stopping at this task
+
+    /* now signal our parent task that we are finished */
+    if (parent)
+      parent->add_dependencies(-1);
+  }
+
+    /*! exported entry point for running this task; forwards to run_internal */
+  dll_export void TaskScheduler::Task::run (Thread& thread) {
+    run_internal(thread);
+  }
+
+  bool TaskScheduler::TaskQueue::execute_local_internal(Thread& thread, Task* parent) // runs the rightmost local task; false once the queue is empty or parent is reached
+  {
+    /* stop if we run out of local tasks or reach the waiting task */
+    if (right == 0 || &tasks[right-1] == parent)
+      return false;
+
+    /* execute task */
+    size_t oldRight = right;
+    tasks[right-1].run_internal(thread);
+    if (right != oldRight) { // the task changed the queue depth, i.e. it did not wait for what it spawned
+      THROW_RUNTIME_ERROR("you have to wait for spawned subtasks");
+    }
+
+    /* pop task and closure from stack */
+    right--;
+    if (tasks[right].stackPtr != size_t(-1)) // size_t(-1) is what the stolen-task constructor stores; such tasks leave the closure stack alone
+      stackPtr = tasks[right].stackPtr;
+
+    /* also move left pointer */
+    if (left >= right) left.store(right.load());
+
+    return right != 0;
+  }
+
+  dll_export bool TaskScheduler::TaskQueue::execute_local(Thread& thread, Task* parent) { // exported wrapper
+    return execute_local_internal(thread,parent);
+  }
+
+  bool TaskScheduler::TaskQueue::steal(Thread& thread)
+  {
+    /* snapshot both ends of this queue: left first, then right */
+    const size_t seenLeft  = left;
+    const size_t seenRight = right;
+    if (seenLeft >= seenRight) return false;
+
+    /* claim the leftmost slot; bail out if the claimed index lies beyond the snapshot */
+    const size_t slot = left++;
+    if (slot >= seenRight)
+      return false;
+
+    /* re-create the task at the right end of the stealing thread's own queue */
+    TaskQueue& thiefQueue = thread.tasks;
+    if (!tasks[slot].try_steal(thiefQueue.tasks[thiefQueue.right]))
+      return false;
+    thiefQueue.right++;
+    return true;
+  }
+
+  /* we steal from the left: returns the size hint N of the leftmost task, or 0 if the queue has none */
+  size_t TaskScheduler::TaskQueue::getTaskSizeAtLeft()
+  {
+    if (left >= right) return 0;
+    return tasks[left].N;
+  }
+
+  void threadPoolFunction(std::pair<TaskScheduler::ThreadPool*,size_t>* pair)
+  {
+    /* copy the (pool, thread index) argument out, free it, then enter the pool's worker loop */
+    const std::pair<TaskScheduler::ThreadPool*,size_t> args = *pair;
+    delete pair;
+    args.first->thread_loop(args.second);
+  }
+
+  TaskScheduler::ThreadPool::ThreadPool(bool set_affinity) // starts with zero threads and not running
+    : numThreads(0), numThreadsRunning(0), set_affinity(set_affinity), running(false) {}
+
+  dll_export void TaskScheduler::ThreadPool::startThreads() // no-op when the pool is already running
+  {
+    if (running) return;
+    setNumThreads(numThreads,true); // re-apply the configured size, this time starting the threads
+  }
+
+  void TaskScheduler::ThreadPool::setNumThreads(size_t newNumThreads, bool startThreads) // resizes the pool; threads are only created/joined once it is running
+  {
+    Lock<MutexSys> lock(g_mutex); // serializes pool reconfiguration
+    assert(newNumThreads);
+    newNumThreads = min(newNumThreads, (size_t) getNumberOfLogicalThreads()); // never exceed the logical thread count
+
+    /* The original code had an `#if defined(__aarch64__) && defined(BUILD_IOS)` branch here
+       that stored newNumThreads*2 into numThreads, immediately followed by an unconditional
+       `numThreads = newNumThreads;` that overwrote it on every platform. The branch was
+       therefore dead and has been removed; the effective value is unchanged.
+       NOTE(review): if the doubling is really wanted on iOS/aarch64, numThreadsRunning and
+       the start/stop loops below have to be revisited together with it. */
+    numThreads = newNumThreads;
+    if (!startThreads && !running) return; // only record the size while the pool has not been started
+    running = true;
+    size_t numThreadsActive = numThreadsRunning; // thread count the pool was running with before this call
+
+    mutex.lock();
+    numThreadsRunning = newNumThreads;
+    mutex.unlock();
+    condition.notify_all(); // wake waiting workers so they re-check their index against the new count
+
+    /* start new threads */
+    for (size_t t=numThreadsActive; t<numThreads; t++)
+    {
+      if (t == 0) continue; // no thread is created for index 0
+      auto pair = new std::pair<TaskScheduler::ThreadPool*,size_t>(this,t); // freed by threadPoolFunction
+      threads.push_back(createThread((thread_func)threadPoolFunction,pair,4*1024*1024,set_affinity ? t : -1));
+    }
+
+    /* stop some threads if we reduce the number of threads */
+    for (ssize_t t=numThreadsActive-1; t>=ssize_t(numThreadsRunning); t--) {
+      if (t == 0) continue;
+      embree::join(threads.back());
+      threads.pop_back();
+    }
+  }
+
+  TaskScheduler::ThreadPool::~ThreadPool()
+  {
+    /* leave all taskschedulers */
+    mutex.lock();
+    numThreadsRunning = 0; // no global thread index is below 0, so every worker exits ThreadPool::thread_loop
+    mutex.unlock();
+    condition.notify_all(); // wake workers blocked waiting for a scheduler
+
+    /* wait for threads to terminate */
+    for (size_t i=0; i<threads.size(); i++)
+      embree::join(threads[i]);
+  }
+
+  dll_export void TaskScheduler::ThreadPool::add(const Ref<TaskScheduler>& scheduler) // registers a scheduler for the worker threads
+  {
+    mutex.lock();
+    schedulers.push_back(scheduler);
+    mutex.unlock();
+    condition.notify_all(); // wake workers waiting in ThreadPool::thread_loop
+  }
+
+  dll_export void TaskScheduler::ThreadPool::remove(const Ref<TaskScheduler>& scheduler) // unregisters the first matching entry, if any
+  {
+    Lock<MutexSys> lock(mutex);
+    for (std::list<Ref<TaskScheduler> >::iterator it = schedulers.begin(); it != schedulers.end(); it++) {
+      if (scheduler == *it) {
+        schedulers.erase(it); // the iterator is not used again: we return right away
+        return;
+      }
+    }
+  }
+
+  void TaskScheduler::ThreadPool::thread_loop(size_t globalThreadIndex) // worker main loop; runs while its index is below numThreadsRunning
+  {
+    while (globalThreadIndex < numThreadsRunning)
+    {
+      Ref<TaskScheduler> scheduler = NULL;
+      ssize_t threadIndex = -1;
+      {
+        Lock<MutexSys> lock(mutex);
+        condition.wait(mutex, [&] () { return globalThreadIndex >= numThreadsRunning || !schedulers.empty(); }); // sleep until retired or a scheduler is registered
+        if (globalThreadIndex >= numThreadsRunning) break;
+        scheduler = schedulers.front(); // always serve the oldest registered scheduler
+        threadIndex = scheduler->allocThreadIndex(); // index is allocated while the pool mutex is held
+      }
+      scheduler->thread_loop(threadIndex); // work for that scheduler outside the lock
+    }
+  }
+
+  TaskScheduler::TaskScheduler() // sizes the per-thread slot table and clears every slot
+    : threadCounter(0), anyTasksRunning(0), hasRootTask(false)
+  {
+    threadLocal.resize(2*getNumberOfLogicalThreads()); // FIXME: this has to be 2x as in the compatibility join mode with rtcCommitScene the worker threads also join. When disallowing rtcCommitScene to join a build we can remove the 2x.
+    for (size_t i=0; i<threadLocal.size(); i++)
+      threadLocal[i].store(nullptr);
+  }
+
+  TaskScheduler::~TaskScheduler()
+  {
+    assert(threadCounter == 0); // every allocated thread index must have been released by now
+  }
+
+  dll_export size_t TaskScheduler::threadID()
+  {
+    /* without a thread-local Thread record the calling thread reports 0 */
+    Thread* const current = TaskScheduler::thread();
+    return current ? current->threadIndex : 0;
+  }
+
+  dll_export size_t TaskScheduler::threadIndex()
+  {
+    /* index stored in the calling thread's Thread record, or 0 when none is installed */
+    Thread* const current = TaskScheduler::thread();
+    return current ? current->threadIndex : 0;
+  }
+
+  dll_export size_t TaskScheduler::threadCount() { // NOTE(review): threadPool is dereferenced unchecked - assumes create() ran before
+    return threadPool->size();
+  }
+
+  dll_export TaskScheduler* TaskScheduler::instance() // returns the calling thread's scheduler, creating it on first use
+  {
+    if (g_instance == NULL) { // g_instance is declared __thread, so this check is per thread
+      Lock<MutexSys> lock(g_mutex); // protects the shared g_instance_vector
+      g_instance = new TaskScheduler;
+      g_instance_vector.push_back(g_instance); // the vector holds a Ref to the new scheduler
+    }
+    return g_instance;
+  }
+
+  void TaskScheduler::create(size_t numThreads, bool set_affinity, bool start_threads)
+  {
+    if (!threadPool) threadPool = new TaskScheduler::ThreadPool(set_affinity); // set_affinity only takes effect when the pool is first created
+    threadPool->setNumThreads(numThreads,start_threads);
+  }
+
+  void TaskScheduler::destroy() { // deletes the shared thread pool; its destructor joins the worker threads
+    delete threadPool; threadPool = nullptr;
+  }
+
+  dll_export ssize_t TaskScheduler::allocThreadIndex() // hands out the next index by post-incrementing threadCounter
+  {
+    size_t threadIndex = threadCounter++;
+    assert(threadIndex < threadLocal.size()); // the index must fit the slot table sized in the constructor
+    return threadIndex;
+  }
+
+  void TaskScheduler::join() // lets the calling thread take part in this scheduler's work
+  {
+    mutex.lock();
+    size_t threadIndex = allocThreadIndex();
+    condition.wait(mutex, [&] () { return hasRootTask.load(); }); // block until spawn_root has set hasRootTask
+    mutex.unlock();
+    // -- GODOT start --
+    // std::exception_ptr except = thread_loop(threadIndex);
+    // if (except != nullptr) std::rethrow_exception(except);
+    thread_loop(threadIndex);
+    // -- GODOT end --
+  }
+
+  void TaskScheduler::reset() { // clears the flag that join() waits on
+    hasRootTask = false;
+  }
+
+  void TaskScheduler::wait_for_threads(size_t threadCount) // busy-waits until threadCounter reaches threadCount-1
+  {
+    while (threadCounter < threadCount-1) // NOTE(review): threadCount == 0 wraps around (size_t); callers presumably pass >= 1
+      pause_cpu();
+  }
+
+  dll_export TaskScheduler::Thread* TaskScheduler::thread() { // Thread record installed for the calling thread, or nullptr
+    return thread_local_thread;
+  }
+
+  dll_export TaskScheduler::Thread* TaskScheduler::swapThread(Thread* thread) // installs thread as the caller's record and returns the previous one
+  {
+    Thread* old = thread_local_thread;
+    thread_local_thread = thread;
+    return old;
+  }
+
+  dll_export bool TaskScheduler::wait() // runs local tasks down to the current task; true if no cancelling exception is recorded
+  {
+    Thread* thread = TaskScheduler::thread();
+    if (thread == nullptr) return true; // no Thread record installed for this thread: nothing to wait for
+    while (thread->tasks.execute_local_internal(*thread,thread->task)) {};
+    return thread->scheduler->cancellingException == nullptr;
+  }
+
+// -- GODOT start --
+//   std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
+  void TaskScheduler::thread_loop(size_t threadIndex) // worker loop of one thread inside this scheduler
+// -- GODOT end --
+  {
+    /* allocate thread structure */
+    std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
+    Thread& thread = *mthread;
+    threadLocal[threadIndex].store(&thread); // publish the record so other threads can steal from it
+    Thread* oldThread = swapThread(&thread);
+
+    /* main thread loop */
+    while (anyTasksRunning)
+    {
+      steal_loop(thread,
+                 [&] () { return anyTasksRunning > 0; },
+                 [&] () {
+                   anyTasksRunning++; // counted as running while the stolen work is processed
+                   while (thread.tasks.execute_local_internal(thread,nullptr));
+                   anyTasksRunning--;
+                 });
+    }
+    threadLocal[threadIndex].store(nullptr); // unpublish before the Thread record is destroyed
+    swapThread(oldThread);
+
+    /* remember exception to throw */
+    // -- GODOT start --
+    // std::exception_ptr except = nullptr;
+    // if (cancellingException != nullptr) except = cancellingException;
+    // -- GODOT end --
+    /* wait for all threads to terminate */
+    threadCounter--;
+#if defined(__WIN32__)
+	size_t loopIndex = 1;
+#endif
+#define LOOP_YIELD_THRESHOLD (4096)
+	while (threadCounter > 0) { // wait until every thread has released its index
+#if defined(__WIN32__)
+          if ((loopIndex % LOOP_YIELD_THRESHOLD) == 0) // Windows: yield only every LOOP_YIELD_THRESHOLD-th iteration
+            yield();
+          else
+            _mm_pause();
+	  loopIndex++;
+#else
+          yield();
+#endif
+	}
+    // -- GODOT start --
+    // return except;
+    return;
+    // -- GODOT end --
+  }
+
+  bool TaskScheduler::steal_from_other_threads(Thread& thread) // one pass over the other threads; true as soon as a task was stolen
+  {
+    const size_t threadIndex = thread.threadIndex;
+    const size_t threadCount = this->threadCounter; // snapshot of the number of allocated thread indices
+
+    for (size_t i=1; i<threadCount; i++)
+    {
+      pause_cpu(32);
+      size_t otherThreadIndex = threadIndex+i; // visit the other indices in order, starting right after our own
+      if (otherThreadIndex >= threadCount) otherThreadIndex -= threadCount; // wrap around
+
+      Thread* othread = threadLocal[otherThreadIndex].load();
+      if (!othread) // slot currently has no Thread record published
+        continue;
+
+      if (othread->tasks.steal(thread))
+        return true;
+    }
+
+    return false;
+  }
+
+  dll_export void TaskScheduler::startThreads() { // forwards to the shared thread pool
+    threadPool->startThreads();
+  }
+
+  dll_export void TaskScheduler::addScheduler(const Ref<TaskScheduler>& scheduler) { // forwards to ThreadPool::add
+    threadPool->add(scheduler);
+  }
+
+  dll_export void TaskScheduler::removeScheduler(const Ref<TaskScheduler>& scheduler) { // forwards to ThreadPool::remove
+    threadPool->remove(scheduler);
+  }
+
+  RTC_NAMESPACE_END
+}
diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h
new file mode 100644
index 0000000000..8bd70b2b8c
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h
@@ -0,0 +1,386 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/alloc.h"
+#include "../sys/barrier.h"
+#include "../sys/thread.h"
+#include "../sys/mutex.h"
+#include "../sys/condition.h"
+#include "../sys/ref.h"
+#include "../sys/atomic.h"
+#include "../math/range.h"
+#include "../../include/embree3/rtcore.h"
+
+#include <list>
+
+namespace embree
+{
+
+  /* The tasking system exports some symbols to be used by the tutorials. Thus we 
+     hide it also in the API namespace when requested. */
+  RTC_NAMESPACE_BEGIN
+
+  struct TaskScheduler : public RefCount
+  {
+    ALIGNED_STRUCT_(64);
+    friend class Device;
+
+    static const size_t TASK_STACK_SIZE = 4*1024;           //!< number of Task entries in each TaskQueue
+    static const size_t CLOSURE_STACK_SIZE = 512*1024;    //!< bytes of closure storage in each TaskQueue
+
+    struct Thread;
+
+    /*! virtual interface for all tasks */
+    struct TaskFunction {
+      virtual void execute() = 0;
+    };
+
+    /*! builds a task interface from a closure; a copy of the closure is stored */
+    template<typename Closure>
+    struct ClosureTaskFunction : public TaskFunction
+    {
+      Closure closure;
+      __forceinline ClosureTaskFunction (const Closure& closure) : closure(closure) {}
+      void execute() { closure(); };
+    };
+
+    struct __aligned(64) Task
+    {
+      /*! states a task can be in */
+      enum { DONE, INITIALIZED };
+
+      /*! switch from one state to another; the transition is expected to succeed (checked by assert) */
+      __forceinline void switch_state(int from, int to)
+      {
+	__memory_barrier();
+        MAYBE_UNUSED bool success = state.compare_exchange_strong(from,to);
+	assert(success);
+      }
+
+      /*! try to switch from one state to another; returns whether the compare-exchange succeeded */
+      __forceinline bool try_switch_state(int from, int to) {
+	__memory_barrier();
+	return state.compare_exchange_strong(from,to);
+      }
+
+       /*! increment/decrement dependency counter */
+      void add_dependencies(int n) {
+	dependencies+=n;
+      }
+
+      /*! initialize all tasks to DONE state by default; the other members are not initialized here */
+      __forceinline Task()
+	: state(DONE) {}
+
+      /*! construction of new task; starts with one dependency and registers itself with the parent */
+      __forceinline Task (TaskFunction* closure, Task* parent, size_t stackPtr, size_t N)
+        : dependencies(1), stealable(true), closure(closure), parent(parent), stackPtr(stackPtr), N(N)
+      {
+        if (parent) parent->add_dependencies(+1);
+	switch_state(DONE,INITIALIZED);
+      }
+
+      /*! construction of stolen task, stealing thread will decrement initial dependency */
+      __forceinline Task (TaskFunction* closure, Task* parent)
+        : dependencies(1), stealable(false), closure(closure), parent(parent), stackPtr(-1), N(1)
+      {
+	switch_state(DONE,INITIALIZED);
+      }
+
+      /*! try to steal this task; on success a non-stealable task with this task as parent is constructed in child */
+      bool try_steal(Task& child)
+      {
+        if (!stealable) return false;
+	if (!try_switch_state(INITIALIZED,DONE)) return false;
+	new (&child) Task(closure, this);
+        return true;
+      }
+
+      /*! run this task */
+      dll_export void run(Thread& thread);
+
+      void run_internal(Thread& thread);
+
+    public:
+      std::atomic<int> state;            //!< state this task is in
+      std::atomic<int> dependencies;     //!< dependencies to wait for
+      std::atomic<bool> stealable;       //!< true if task can be stolen
+      TaskFunction* closure;             //!< the closure to execute
+      Task* parent;                      //!< parent task to signal when we are finished
+      size_t stackPtr;                   //!< stack location where closure is stored
+      size_t N;                          //!< approximative size of task
+    };
+
+    struct TaskQueue
+    {
+      TaskQueue ()
+      : left(0), right(0), stackPtr(0) {}
+
+      __forceinline void* alloc(size_t bytes, size_t align = 64) // carves bytes from the closure stack at an offset rounded up to align
+      {
+        size_t ofs = bytes + ((align - stackPtr) & (align-1)); // payload plus padding; the mask form relies on align being a power of two
+        if (stackPtr + ofs > CLOSURE_STACK_SIZE)
+          // -- GODOT start --
+          // throw std::runtime_error("closure stack overflow");
+          abort(); // body of the if above
+          // -- GODOT end --
+        stackPtr += ofs;
+        return &stack[stackPtr-bytes];
+      }
+
+      template<typename Closure>
+      __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure) // appends a task running a copy of closure
+      {
+        if (right >= TASK_STACK_SIZE)
+          // -- GODOT start --
+          // throw std::runtime_error("task stack overflow");
+          abort(); // body of the if above
+          // -- GODOT end --
+
+	/* allocate new task on right side of stack */
+        size_t oldStackPtr = stackPtr; // stored in the task so the closure storage can be released when it is popped
+        TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction<Closure>))) ClosureTaskFunction<Closure>(closure);
+        /* gcc 8 or later fails to compile without explicit .load() */
+        new (&(tasks[right.load()])) Task(func,thread.task,oldStackPtr,size);
+        right++;
+
+	/* also move left pointer */
+	if (left >= right-1) left = right-1;
+      }
+
+      dll_export bool execute_local(Thread& thread, Task* parent);
+      bool execute_local_internal(Thread& thread, Task* parent);
+      bool steal(Thread& thread);
+      size_t getTaskSizeAtLeft();
+
+      bool empty() { return right == 0; }
+
+    public:
+
+      /* task stack */
+      Task tasks[TASK_STACK_SIZE];
+      __aligned(64) std::atomic<size_t> left;   //!< threads steal from left
+      __aligned(64) std::atomic<size_t> right;  //!< new tasks are added to the right
+
+      /* closure stack */
+      __aligned(64) char stack[CLOSURE_STACK_SIZE];
+      size_t stackPtr; // offset of the first free byte in stack
+    };
+
+    /*! thread local structure for each thread */
+    struct Thread
+    {
+      ALIGNED_STRUCT_(64);
+
+      Thread (size_t threadIndex, const Ref<TaskScheduler>& scheduler)
+      : threadIndex(threadIndex), task(nullptr), scheduler(scheduler) {}
+
+      __forceinline size_t threadCount() { // current value of the scheduler's threadCounter
+        return scheduler->threadCounter;
+      }
+
+      size_t threadIndex;              //!< ID of this thread
+      TaskQueue tasks;                 //!< local task queue
+      Task* task;                      //!< current active task
+      Ref<TaskScheduler> scheduler;     //!< pointer to task scheduler
+    };
+
+    /*! pool of worker threads */
+    struct ThreadPool
+    {
+      ThreadPool (bool set_affinity);
+      ~ThreadPool ();
+
+      /*! starts the threads */
+      dll_export void startThreads();
+
+      /*! sets number of threads to use */
+      void setNumThreads(size_t numThreads, bool startThreads = false);
+
+      /*! adds a task scheduler object for scheduling */
+      dll_export void add(const Ref<TaskScheduler>& scheduler);
+
+      /*! remove the task scheduler object again */
+      dll_export void remove(const Ref<TaskScheduler>& scheduler);
+
+      /*! returns number of threads of the thread pool */
+      size_t size() const { return numThreads; }
+
+      /*! main loop for all threads */
+      void thread_loop(size_t threadIndex);
+
+    private:
+      std::atomic<size_t> numThreads;          // configured pool size, reported by size()
+      std::atomic<size_t> numThreadsRunning;   // workers with an index at or above this value leave thread_loop
+      bool set_affinity;
+      std::atomic<bool> running;
+      std::vector<thread_t> threads;           // handles of the created worker threads
+
+    private:
+      MutexSys mutex;
+      ConditionSys condition;
+      std::list<Ref<TaskScheduler> > schedulers; // registered schedulers; workers take the front entry
+    };
+
+    TaskScheduler ();
+    ~TaskScheduler ();
+
+    /*! initializes the task scheduler */
+    static void create(size_t numThreads, bool set_affinity, bool start_threads);
+
+    /*! destroys the task scheduler again */
+    static void destroy();
+
+    /*! lets new worker threads join the tasking system */
+    void join();
+    void reset();
+
+    /*! let a worker thread allocate a thread index */
+    dll_export ssize_t allocThreadIndex();
+
+    /*! wait for some number of threads available (threadCount includes main thread) */
+    void wait_for_threads(size_t threadCount);
+
+    /*! thread loop for all worker threads */
+    // -- GODOT start --
+    // std::exception_ptr thread_loop(size_t threadIndex);
+    void thread_loop(size_t threadIndex);
+    // -- GODOT end --
+
+    /*! steals a task from a different thread */
+    bool steal_from_other_threads(Thread& thread);
+
+    template<typename Predicate, typename Body>
+      static void steal_loop(Thread& thread, const Predicate& pred, const Body& body);
+
+    /* spawn a root task: registers the calling thread with this scheduler, runs closure and returns when all work is done */
+    template<typename Closure>
+      void spawn_root(const Closure& closure, size_t size = 1, bool useThreadPool = true)
+    {
+      if (useThreadPool) startThreads();
+
+      size_t threadIndex = allocThreadIndex();
+      std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
+      Thread& thread = *mthread;
+      assert(threadLocal[threadIndex].load() == nullptr);
+      threadLocal[threadIndex] = &thread;
+      Thread* oldThread = swapThread(&thread);
+      thread.tasks.push_right(thread,size,closure);
+      {
+        Lock<MutexSys> lock(mutex);
+	anyTasksRunning++;
+        hasRootTask = true;
+        condition.notify_all(); // releases threads blocked in join()
+      }
+
+      if (useThreadPool) addScheduler(this);
+
+      while (thread.tasks.execute_local(thread,nullptr));
+      anyTasksRunning--;
+      if (useThreadPool) removeScheduler(this);
+
+      threadLocal[threadIndex] = nullptr;
+      swapThread(oldThread);
+
+      /* remember exception to throw */
+      std::exception_ptr except = nullptr;
+      if (cancellingException != nullptr) except = cancellingException;
+
+      /* wait for all threads to terminate */
+      threadCounter--;
+      while (threadCounter > 0) yield();
+      cancellingException = nullptr;
+
+      /* re-throw proper exception */
+      if (except != nullptr)
+        std::rethrow_exception(except);
+    }
+
+    /* spawn a new task at the top of the threads task stack, or a root task when the caller has no Thread record */
+    template<typename Closure>
+    static __forceinline void spawn(size_t size, const Closure& closure)
+    {
+      Thread* thread = TaskScheduler::thread();
+      if (likely(thread != nullptr)) thread->tasks.push_right(*thread,size,closure);
+      else                           instance()->spawn_root(closure,size);
+    }
+
+    /* spawn a new task at the top of the threads task stack (size hint 1) */
+    template<typename Closure>
+    static __forceinline void spawn(const Closure& closure) {
+      spawn(1,closure);
+    }
+
+    /* spawn a new task set: [begin,end) is halved recursively until a piece has at most blockSize elements */
+    template<typename Index, typename Closure>
+    static void spawn(const Index begin, const Index end, const Index blockSize, const Closure& closure)
+    {
+      spawn(end-begin, [=]()
+        {
+	  if (end-begin <= blockSize) {
+	    return closure(range<Index>(begin,end));
+	  }
+	  const Index center = (begin+end)/2;
+	  spawn(begin,center,blockSize,closure);
+	  spawn(center,end  ,blockSize,closure);
+	  wait();
+	});
+    }
+
+    /* work on spawned subtasks and wait until all have finished */
+    dll_export static bool wait();
+
+    /* returns the ID of the current thread */
+    dll_export static size_t threadID();
+
+    /* returns the index (0..threadCount-1) of the current thread */
+    dll_export static size_t threadIndex();
+
+    /* returns the total number of threads */
+    dll_export static size_t threadCount();
+
+  private:
+
+    /* returns the thread local task list of this worker thread */
+    dll_export static Thread* thread();
+
+    /* sets the thread local task list of this worker thread */
+    dll_export static Thread* swapThread(Thread* thread);
+
+    /*! returns the taskscheduler object to be used by the master thread */
+    dll_export static TaskScheduler* instance();
+
+    /*! starts the threads */
+    dll_export static void startThreads();
+
+    /*! adds a task scheduler object for scheduling */
+    dll_export static void addScheduler(const Ref<TaskScheduler>& scheduler);
+
+    /*! remove the task scheduler object again */
+    dll_export static void removeScheduler(const Ref<TaskScheduler>& scheduler);
+
+  private:
+    std::vector<atomic<Thread*>> threadLocal;   // one slot per thread index
+    std::atomic<size_t> threadCounter;          // number of thread indices currently allocated
+    std::atomic<size_t> anyTasksRunning;
+    std::atomic<bool> hasRootTask;              // set by spawn_root, cleared by reset()
+    std::exception_ptr cancellingException;
+    MutexSys mutex;
+    ConditionSys condition;
+
+  private:
+    static size_t g_numThreads;
+    static __thread TaskScheduler* g_instance;
+    static __thread Thread* thread_local_thread;
+    static ThreadPool* threadPool;
+  };
+
+  RTC_NAMESPACE_END
+
+#if defined(RTC_NAMESPACE)
+    using RTC_NAMESPACE::TaskScheduler;
+#endif
+}
diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h b/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h
new file mode 100644
index 0000000000..776f98cdac
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h
@@ -0,0 +1,46 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/alloc.h"
+#include "../sys/barrier.h"
+#include "../sys/thread.h"
+#include "../sys/mutex.h"
+#include "../sys/condition.h"
+#include "../sys/ref.h"
+
+#if !defined(__WIN32__)
+#error PPL tasking system only available under windows
+#endif
+
+#include <ppl.h>
+
+namespace embree
+{
+  struct TaskScheduler
+  {
+    /*! initializes the task scheduler (declaration only; not defined in this header) */
+    static void create(size_t numThreads, bool set_affinity, bool start_threads);
+
+    /*! destroys the task scheduler again (declaration only; not defined in this header) */
+    static void destroy();
+
+    /* returns the ID of the current thread, as reported by GetCurrentThreadId() */
+    static __forceinline size_t threadID() {
+      return GetCurrentThreadId();
+    }
+
+    /* nominally returns the index (0..threadCount-1) of the current thread */
+    /* FIXME: threadIndex is NOT supported by PPL, so this backend returns 0 for every thread! */
+    static __forceinline size_t threadIndex() {
+      return 0;
+    }
+
+    /* returns the total number of threads, computed as the maximum processor count over all processor groups plus one */
+    static __forceinline size_t threadCount() {
+      return GetMaximumProcessorCount(ALL_PROCESSOR_GROUPS) + 1; // NOTE(review): the +1 presumably accounts for the calling thread - confirm
+    }
+  };
+};
diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h b/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h
new file mode 100644
index 0000000000..98dba26871
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h
@@ -0,0 +1,67 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/alloc.h"
+#include "../sys/barrier.h"
+#include "../sys/thread.h"
+#include "../sys/mutex.h"
+#include "../sys/condition.h"
+#include "../sys/ref.h"
+
+#if defined(__WIN32__)
+#  define NOMINMAX
+#endif
+
+// We need to define these to avoid implicit linkage against
+// tbb_debug.lib under Windows. When removing these lines debug build
+// under Windows fails.
+#define __TBB_NO_IMPLICIT_LINKAGE 1
+#define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1
+#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1
+#define TBB_PREVIEW_ISOLATED_TASK_GROUP 1
+#include "tbb/tbb.h"
+#include "tbb/parallel_sort.h"
+
+namespace embree
+{
+  struct TaskScheduler
+  {
+    /*! initializes the task scheduler (declaration only; not defined in this header) */
+    static void create(size_t numThreads, bool set_affinity, bool start_threads);
+
+    /*! destroys the task scheduler again (declaration only; not defined in this header) */
+    static void destroy();
+
+    /* returns the ID of the current thread; in this backend that is simply threadIndex(), not an OS thread ID */
+    static __forceinline size_t threadID()
+    {
+      return threadIndex();
+    }
+
+    /* returns the index of the current thread as reported by TBB; the API used depends on TBB_INTERFACE_VERSION */
+    static __forceinline size_t threadIndex()
+    {
+#if TBB_INTERFACE_VERSION >= 9100
+      return tbb::this_task_arena::current_thread_index(); // NOTE(review): TBB documents a negative "not initialized" result, which would wrap when converted to size_t - confirm
+#elif TBB_INTERFACE_VERSION >= 9000
+      return tbb::task_arena::current_thread_index();
+#else
+      return 0; // no thread index query is used for TBB interface versions below 9000
+#endif
+    }
+
+    /* returns the total number of threads: the current arena's max_concurrency() when available, otherwise TBB's default thread count */
+    static __forceinline size_t threadCount() {
+#if TBB_INTERFACE_VERSION >= 9100
+      return tbb::this_task_arena::max_concurrency();
+#else
+      return tbb::task_scheduler_init::default_num_threads();
+#endif
+    }
+
+  };
+
+};
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore.h b/thirdparty/embree-aarch64/include/embree3/rtcore.h
new file mode 100644
index 0000000000..5830bb5880
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore.h
@@ -0,0 +1,14 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_config.h"
+#include "rtcore_common.h"
+#include "rtcore_device.h"
+#include "rtcore_buffer.h"
+#include "rtcore_ray.h"
+#include "rtcore_geometry.h"
+#include "rtcore_scene.h"
+#include "rtcore_builder.h"
+#include "rtcore_quaternion.h"
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_buffer.h b/thirdparty/embree-aarch64/include/embree3/rtcore_buffer.h
new file mode 100644
index 0000000000..400b604aa5
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore_buffer.h
@@ -0,0 +1,51 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_device.h"
+
+RTC_NAMESPACE_BEGIN
+
+/* Types of buffers. Values are explicit and sparse: the groups below start at 0, 8, 16 and 32. */
+enum RTCBufferType
+{
+  RTC_BUFFER_TYPE_INDEX            = 0,
+  RTC_BUFFER_TYPE_VERTEX           = 1,
+  RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE = 2,
+  RTC_BUFFER_TYPE_NORMAL           = 3,
+  RTC_BUFFER_TYPE_TANGENT          = 4,
+  RTC_BUFFER_TYPE_NORMAL_DERIVATIVE = 5,
+
+  RTC_BUFFER_TYPE_GRID                 = 8,
+
+  RTC_BUFFER_TYPE_FACE                 = 16,
+  RTC_BUFFER_TYPE_LEVEL                = 17,
+  RTC_BUFFER_TYPE_EDGE_CREASE_INDEX    = 18,
+  RTC_BUFFER_TYPE_EDGE_CREASE_WEIGHT   = 19,
+  RTC_BUFFER_TYPE_VERTEX_CREASE_INDEX  = 20,
+  RTC_BUFFER_TYPE_VERTEX_CREASE_WEIGHT = 21,
+  RTC_BUFFER_TYPE_HOLE                 = 22,
+
+  RTC_BUFFER_TYPE_FLAGS = 32
+};
+
+/* Opaque buffer type: a pointer to the incomplete struct RTCBufferTy, so the layout is hidden from users of this header. */
+typedef struct RTCBufferTy* RTCBuffer;
+
+/* Creates a new buffer on the given device; byteSize gives its size in bytes. */
+RTC_API RTCBuffer rtcNewBuffer(RTCDevice device, size_t byteSize);
+
+/* Creates a new shared buffer from the caller-supplied pointer ptr and its size in bytes. */
+RTC_API RTCBuffer rtcNewSharedBuffer(RTCDevice device, void* ptr, size_t byteSize);
+
+/* Returns a pointer to the buffer data. */
+RTC_API void* rtcGetBufferData(RTCBuffer buffer);
+
+/* Retains the buffer (increments the reference count). */
+RTC_API void rtcRetainBuffer(RTCBuffer buffer);
+
+/* Releases the buffer (decrements the reference count). */
+RTC_API void rtcReleaseBuffer(RTCBuffer buffer);
+
+RTC_NAMESPACE_END
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_builder.h b/thirdparty/embree-aarch64/include/embree3/rtcore_builder.h
new file mode 100644
index 0000000000..d62a7f72cc
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore_builder.h
@@ -0,0 +1,125 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_scene.h"
+
+RTC_NAMESPACE_BEGIN
+  
+/* Opaque BVH type: a pointer to the incomplete struct RTCBVHTy. */
+typedef struct RTCBVHTy* RTCBVH;
+
+/* Input build primitive for the builder: lower/upper box corners tagged with a geometry ID and a primitive ID; declared with 32-byte alignment. */
+struct RTC_ALIGN(32) RTCBuildPrimitive
+{
+  float lower_x, lower_y, lower_z; // lower corner
+  unsigned int geomID;
+  float upper_x, upper_y, upper_z; // upper corner
+  unsigned int primID;
+};
+
+/* Opaque thread local allocator type: a pointer to the incomplete struct RTCThreadLocalAllocatorTy. */
+typedef struct RTCThreadLocalAllocatorTy* RTCThreadLocalAllocator;
+
+/* Callback to create a node for childCount children; returns the node as an untyped pointer. */
+typedef void* (*RTCCreateNodeFunction) (RTCThreadLocalAllocator allocator, unsigned int childCount, void* userPtr);
+
+/* Callback to set the pointers to all childCount children of the node nodePtr. */
+typedef void (*RTCSetNodeChildrenFunction) (void* nodePtr, void** children, unsigned int childCount, void* userPtr);
+
+/* Callback to set the bounds of all childCount children of the node nodePtr. */
+typedef void (*RTCSetNodeBoundsFunction) (void* nodePtr, const struct RTCBounds** bounds, unsigned int childCount, void* userPtr);
+
+/* Callback to create a leaf node from primitiveCount build primitives; returns the leaf as an untyped pointer. */
+typedef void* (*RTCCreateLeafFunction) (RTCThreadLocalAllocator allocator, const struct RTCBuildPrimitive* primitives, size_t primitiveCount, void* userPtr);
+
+/* Callback to split a build primitive at position along dimension; leftBounds and rightBounds are the writable output bounds. */
+typedef void (*RTCSplitPrimitiveFunction) (const struct RTCBuildPrimitive* primitive, unsigned int dimension, float position, struct RTCBounds* leftBounds, struct RTCBounds* rightBounds, void* userPtr);
+
+/* Build flags: bit values (NONE is 0, DYNAMIC is bit 0). */
+enum RTCBuildFlags
+{
+  RTC_BUILD_FLAG_NONE    = 0,
+  RTC_BUILD_FLAG_DYNAMIC = (1 << 0),
+};
+
+enum RTCBuildConstants // builder limits; the value below is the default maxLeafSize set by rtcDefaultBuildArguments
+{
+  RTC_BUILD_MAX_PRIMITIVES_PER_LEAF = 32
+};
+
+/* Input for builders: tuning parameters, the primitive array, and the user callbacks passed to rtcBuildBVH. */
+struct RTCBuildArguments
+{
+  size_t byteSize;                    // rtcDefaultBuildArguments sets this to sizeof(struct RTCBuildArguments)
+
+  enum RTCBuildQuality buildQuality;  // default: RTC_BUILD_QUALITY_MEDIUM
+  enum RTCBuildFlags buildFlags;      // default: RTC_BUILD_FLAG_NONE
+  unsigned int maxBranchingFactor;    // default: 2
+  unsigned int maxDepth;              // default: 32
+  unsigned int sahBlockSize;          // default: 1
+  unsigned int minLeafSize;           // default: 1
+  unsigned int maxLeafSize;           // default: RTC_BUILD_MAX_PRIMITIVES_PER_LEAF
+  float traversalCost;                // default: 1.0f
+  float intersectionCost;             // default: 1.0f
+
+  RTCBVH bvh;                         // default: NULL
+  struct RTCBuildPrimitive* primitives; // default: NULL
+  size_t primitiveCount;              // default: 0
+  size_t primitiveArrayCapacity;      // default: 0
+
+  RTCCreateNodeFunction createNode;   // all callbacks and userPtr default to NULL
+  RTCSetNodeChildrenFunction setNodeChildren;
+  RTCSetNodeBoundsFunction setNodeBounds;
+  RTCCreateLeafFunction createLeaf;
+  RTCSplitPrimitiveFunction splitPrimitive;
+  RTCProgressMonitorFunction buildProgress;
+  void* userPtr;                      // last parameter of the builder callback typedefs
+};
+
+/* Produces an RTCBuildArguments value with every field set to its default (tuning defaults filled in; data pointers, counts and callbacks cleared). */
+RTC_FORCEINLINE struct RTCBuildArguments rtcDefaultBuildArguments()
+{
+  struct RTCBuildArguments defaults;
+  defaults.byteSize = sizeof(defaults);
+  defaults.bvh = NULL;                       /* no target BVH and no primitives yet */
+  defaults.primitives = NULL;
+  defaults.primitiveCount = 0;
+  defaults.primitiveArrayCapacity = 0;
+  defaults.userPtr = NULL;                   /* no callbacks registered */
+  defaults.createNode = NULL;
+  defaults.setNodeChildren = NULL;
+  defaults.setNodeBounds = NULL;
+  defaults.createLeaf = NULL;
+  defaults.splitPrimitive = NULL;
+  defaults.buildProgress = NULL;
+  defaults.buildQuality = RTC_BUILD_QUALITY_MEDIUM;  /* tuning parameters */
+  defaults.buildFlags = RTC_BUILD_FLAG_NONE;
+  defaults.maxBranchingFactor = 2;
+  defaults.maxDepth = 32;
+  defaults.sahBlockSize = 1;
+  defaults.minLeafSize = 1;
+  defaults.maxLeafSize = RTC_BUILD_MAX_PRIMITIVES_PER_LEAF;
+  defaults.traversalCost = 1.0f;
+  defaults.intersectionCost = 1.0f;
+  return defaults;
+}
+
+/* Creates a new BVH object for the given device. */
+RTC_API RTCBVH rtcNewBVH(RTCDevice device);
+
+/* Builds a BVH from the settings in args; returns an untyped pointer (presumably the root produced by the user callbacks - confirm). */
+RTC_API void* rtcBuildBVH(const struct RTCBuildArguments* args);
+
+/* Allocates memory using the thread local allocator; bytes and align give the requested size and alignment. */
+RTC_API void* rtcThreadLocalAlloc(RTCThreadLocalAllocator allocator, size_t bytes, size_t align);
+
+/* Retains the BVH (increments reference count). */
+RTC_API void rtcRetainBVH(RTCBVH bvh);
+
+/* Releases the BVH (decrements reference count). */
+RTC_API void rtcReleaseBVH(RTCBVH bvh);
+
+RTC_NAMESPACE_END
+
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_common.h b/thirdparty/embree-aarch64/include/embree3/rtcore_common.h
new file mode 100644
index 0000000000..890e06faa3
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore_common.h
@@ -0,0 +1,326 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <stddef.h>
+#include <sys/types.h>
+#include <stdbool.h>
+
+#include "rtcore_config.h"
+
+RTC_NAMESPACE_BEGIN
+
+#if defined(_WIN32) /* Windows: provide ssize_t here, sized to match the pointer width of the target */
+#if defined(_WIN64) || defined(_M_X64) /* any 64-bit Windows target (x64 and ARM64), not only x64 */
+typedef long long ssize_t;
+#else /* 32-bit Windows targets */
+typedef int ssize_t;
+#endif
+#endif
+
+#if defined(_WIN32) && !defined(__MINGW32__) /* RTC_ALIGN(n): alignment specifier; __declspec form on Windows except MinGW */
+#  define RTC_ALIGN(...) __declspec(align(__VA_ARGS__))
+#else /* GNU attribute form everywhere else, including MinGW */
+#  define RTC_ALIGN(...) __attribute__((aligned(__VA_ARGS__)))
+#endif
+
+#if !defined (RTC_DEPRECATED) /* RTC_DEPRECATED: deprecation marker; a definition made before this header is kept */
+#ifdef __GNUC__
+  #define RTC_DEPRECATED __attribute__((deprecated))
+#elif defined(_MSC_VER)
+  #define RTC_DEPRECATED __declspec(deprecated)
+#else /* unknown compiler: expands to nothing */
+  #define RTC_DEPRECATED
+#endif
+#endif
+
+#if defined(_WIN32) /* RTC_FORCEINLINE: used on the inline helper functions defined in these headers */
+#  define RTC_FORCEINLINE __forceinline
+#else
+#  define RTC_FORCEINLINE inline __attribute__((always_inline))
+#endif
+
+/* Invalid geometry ID: (unsigned int)-1, i.e. the largest unsigned int value */
+#define RTC_INVALID_GEOMETRY_ID ((unsigned int)-1)
+
+/* Maximum number of time steps */
+#define RTC_MAX_TIME_STEP_COUNT 129
+
+/* Formats of buffers and other data structures. Each family starts at an explicit base value; unnumbered enumerators that follow take consecutive values. */
+enum RTCFormat
+{
+  RTC_FORMAT_UNDEFINED = 0,
+
+  /* 8-bit unsigned integer (1 to 4 components) */
+  RTC_FORMAT_UCHAR = 0x1001,
+  RTC_FORMAT_UCHAR2,
+  RTC_FORMAT_UCHAR3,
+  RTC_FORMAT_UCHAR4,
+
+  /* 8-bit signed integer (1 to 4 components) */
+  RTC_FORMAT_CHAR = 0x2001,
+  RTC_FORMAT_CHAR2,
+  RTC_FORMAT_CHAR3,
+  RTC_FORMAT_CHAR4,
+
+  /* 16-bit unsigned integer (1 to 4 components) */
+  RTC_FORMAT_USHORT = 0x3001,
+  RTC_FORMAT_USHORT2,
+  RTC_FORMAT_USHORT3,
+  RTC_FORMAT_USHORT4,
+
+  /* 16-bit signed integer (1 to 4 components) */
+  RTC_FORMAT_SHORT = 0x4001,
+  RTC_FORMAT_SHORT2,
+  RTC_FORMAT_SHORT3,
+  RTC_FORMAT_SHORT4,
+
+  /* 32-bit unsigned integer (1 to 4 components) */
+  RTC_FORMAT_UINT = 0x5001,
+  RTC_FORMAT_UINT2,
+  RTC_FORMAT_UINT3,
+  RTC_FORMAT_UINT4,
+
+  /* 32-bit signed integer (1 to 4 components) */
+  RTC_FORMAT_INT = 0x6001,
+  RTC_FORMAT_INT2,
+  RTC_FORMAT_INT3,
+  RTC_FORMAT_INT4,
+
+  /* 64-bit unsigned integer (1 to 4 components) */
+  RTC_FORMAT_ULLONG = 0x7001,
+  RTC_FORMAT_ULLONG2,
+  RTC_FORMAT_ULLONG3,
+  RTC_FORMAT_ULLONG4,
+
+  /* 64-bit signed integer (1 to 4 components) */
+  RTC_FORMAT_LLONG = 0x8001,
+  RTC_FORMAT_LLONG2,
+  RTC_FORMAT_LLONG3,
+  RTC_FORMAT_LLONG4,
+
+  /* 32-bit float (1 to 16 components, values 0x9001 through 0x9010) */
+  RTC_FORMAT_FLOAT = 0x9001,
+  RTC_FORMAT_FLOAT2,
+  RTC_FORMAT_FLOAT3,
+  RTC_FORMAT_FLOAT4,
+  RTC_FORMAT_FLOAT5,
+  RTC_FORMAT_FLOAT6,
+  RTC_FORMAT_FLOAT7,
+  RTC_FORMAT_FLOAT8,
+  RTC_FORMAT_FLOAT9,
+  RTC_FORMAT_FLOAT10,
+  RTC_FORMAT_FLOAT11,
+  RTC_FORMAT_FLOAT12,
+  RTC_FORMAT_FLOAT13,
+  RTC_FORMAT_FLOAT14,
+  RTC_FORMAT_FLOAT15,
+  RTC_FORMAT_FLOAT16,
+
+  /* 32-bit float matrix (row-major order); values are 0x91AB for the AxB variant */
+  RTC_FORMAT_FLOAT2X2_ROW_MAJOR = 0x9122,
+  RTC_FORMAT_FLOAT2X3_ROW_MAJOR = 0x9123,
+  RTC_FORMAT_FLOAT2X4_ROW_MAJOR = 0x9124,
+  RTC_FORMAT_FLOAT3X2_ROW_MAJOR = 0x9132,
+  RTC_FORMAT_FLOAT3X3_ROW_MAJOR = 0x9133,
+  RTC_FORMAT_FLOAT3X4_ROW_MAJOR = 0x9134,
+  RTC_FORMAT_FLOAT4X2_ROW_MAJOR = 0x9142,
+  RTC_FORMAT_FLOAT4X3_ROW_MAJOR = 0x9143,
+  RTC_FORMAT_FLOAT4X4_ROW_MAJOR = 0x9144,
+
+  /* 32-bit float matrix (column-major order); values are 0x92AB for the AxB variant */
+  RTC_FORMAT_FLOAT2X2_COLUMN_MAJOR = 0x9222,
+  RTC_FORMAT_FLOAT2X3_COLUMN_MAJOR = 0x9223,
+  RTC_FORMAT_FLOAT2X4_COLUMN_MAJOR = 0x9224,
+  RTC_FORMAT_FLOAT3X2_COLUMN_MAJOR = 0x9232,
+  RTC_FORMAT_FLOAT3X3_COLUMN_MAJOR = 0x9233,
+  RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR = 0x9234,
+  RTC_FORMAT_FLOAT4X2_COLUMN_MAJOR = 0x9242,
+  RTC_FORMAT_FLOAT4X3_COLUMN_MAJOR = 0x9243,
+  RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR = 0x9244,
+
+  /* special 12-byte format for grids */
+  RTC_FORMAT_GRID = 0xA001
+};
+
+/* Build quality levels (LOW, MEDIUM, HIGH numbered 0..2, plus REFIT = 3); used by RTCBuildArguments and rtcSetGeometryBuildQuality. */
+enum RTCBuildQuality
+{
+  RTC_BUILD_QUALITY_LOW    = 0,
+  RTC_BUILD_QUALITY_MEDIUM = 1,
+  RTC_BUILD_QUALITY_HIGH   = 2,
+  RTC_BUILD_QUALITY_REFIT  = 3,
+};
+
+/* Axis-aligned bounding box representation: lower and upper corner, each followed by one extra float (align0/align1) so every corner occupies four floats. */
+struct RTC_ALIGN(16) RTCBounds
+{
+  float lower_x, lower_y, lower_z, align0;
+  float upper_x, upper_y, upper_z, align1;
+};
+
+/* Linear axis-aligned bounding box representation: a pair of RTCBounds (bounds0, bounds1). */
+struct RTC_ALIGN(16) RTCLinearBounds
+{
+  struct RTCBounds bounds0;
+  struct RTCBounds bounds1;
+};
+
+/* Intersection context flags. NONE and INCOHERENT both evaluate to 0, so incoherent is the value of an unset flags field; COHERENT is bit 0. */
+enum RTCIntersectContextFlags
+{
+  RTC_INTERSECT_CONTEXT_FLAG_NONE       = 0,
+  RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT = (0 << 0), // optimize for incoherent rays
+  RTC_INTERSECT_CONTEXT_FLAG_COHERENT   = (1 << 0)  // optimize for coherent rays
+};
+
+/* Arguments for RTCFilterFunctionN: the callback receives a single const pointer to this struct. */
+struct RTCFilterFunctionNArguments
+{
+  int* valid;                          // NOTE(review): presumably one entry per ray of the N-wide packet - confirm
+  void* geometryUserPtr;
+  struct RTCIntersectContext* context;
+  struct RTCRayN* ray;
+  struct RTCHitN* hit;
+  unsigned int N;
+};
+
+/* Filter callback function; installed per geometry via the rtcSetGeometry*FilterFunction calls or per query via RTCIntersectContext::filter. */
+typedef void (*RTCFilterFunctionN)(const struct RTCFilterFunctionNArguments* args);
+
+/* Intersection context passed to intersect/occluded calls. The member set depends on RTC_MAX_INSTANCE_LEVEL_COUNT and RTC_MIN_WIDTH, so the layout is configuration-specific. */
+struct RTCIntersectContext
+{
+  enum RTCIntersectContextFlags flags;               // intersection flags
+  RTCFilterFunctionN filter;                         // filter function to execute
+
+#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
+  unsigned int instStackSize;                        // Number of instances currently on the stack (member only exists for multi-level instancing).
+#endif
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // The current stack of instance ids.
+
+#if RTC_MIN_WIDTH
+  float minWidthDistanceFactor;                      // curve radius is set to this factor times distance to ray origin
+#endif
+};
+
+/* Resets an intersection context to its defaults: incoherent flag, no filter, every instID slot invalid. */
+RTC_FORCEINLINE void rtcInitIntersectContext(struct RTCIntersectContext* context)
+{
+  unsigned level;
+  context->filter = NULL;
+  context->flags = RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT;
+
+#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
+  context->instStackSize = 0; /* the instance stack starts out empty */
+#endif
+  for (level = 0; level < RTC_MAX_INSTANCE_LEVEL_COUNT; ++level)
+    context->instID[level] = RTC_INVALID_GEOMETRY_ID;
+
+#if RTC_MIN_WIDTH
+  context->minWidthDistanceFactor = 0.0f;
+#endif
+}
+
+/* Point query structure for closest point query: a single query point with time and search radius. */
+struct RTC_ALIGN(16) RTCPointQuery
+{
+  float x;                // x coordinate of the query point
+  float y;                // y coordinate of the query point
+  float z;                // z coordinate of the query point
+  float time;             // time of the point query
+  float radius;           // radius of the point query
+};
+
+/* Structure of a packet of 4 query points, stored as one array per field (structure-of-arrays layout). */
+struct RTC_ALIGN(16) RTCPointQuery4
+{
+  float x[4];                // x coordinates of the query points
+  float y[4];                // y coordinates of the query points
+  float z[4];                // z coordinates of the query points
+  float time[4];             // times of the point queries
+  float radius[4];           // radii of the point queries
+};
+
+/* Structure of a packet of 8 query points, stored as one array per field (structure-of-arrays layout). */
+struct RTC_ALIGN(32) RTCPointQuery8
+{
+  float x[8];                // x coordinates of the query points
+  float y[8];                // y coordinates of the query points
+  float z[8];                // z coordinates of the query points
+  float time[8];             // times of the point queries
+  float radius[8];           // radii of the point queries
+};
+
+/* Structure of a packet of 16 query points, stored as one array per field (structure-of-arrays layout). */
+struct RTC_ALIGN(64) RTCPointQuery16
+{
+  float x[16];                // x coordinates of the query points
+  float y[16];                // y coordinates of the query points
+  float z[16];                // z coordinates of the query points
+  float time[16];             // times of the point queries
+  float radius[16];           // radii of the point queries
+};
+
+struct RTCPointQueryN; // forward declaration only; no definition appears in this header
+
+struct RTC_ALIGN(16) RTCPointQueryContext // per-query instancing state; arrays are sized by RTC_MAX_INSTANCE_LEVEL_COUNT
+{
+  // accumulated 4x4 column major matrices from world space to instance space.
+  // undefined if size == 0 (presumably "size" means instStackSize - confirm).
+  float world2inst[RTC_MAX_INSTANCE_LEVEL_COUNT][16];
+
+  // accumulated 4x4 column major matrices from instance space to world space.
+  // undefined if size == 0 (presumably "size" means instStackSize - confirm).
+  float inst2world[RTC_MAX_INSTANCE_LEVEL_COUNT][16];
+
+  // instance ids, one slot per instance level.
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT];
+
+  // number of instances currently on the stack.
+  unsigned int instStackSize;
+};
+
+/* Initializes a point query context: empty instance stack and every instID slot set to RTC_INVALID_GEOMETRY_ID. The world2inst/inst2world matrices are not touched. */
+RTC_FORCEINLINE void rtcInitPointQueryContext(struct RTCPointQueryContext* context)
+{
+  unsigned int l = 0; /* declared up front, as in rtcInitIntersectContext, to stay friendly to C consumers */
+  context->instStackSize = 0;
+  for (; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) context->instID[l] = RTC_INVALID_GEOMETRY_ID; /* all levels, not just slot 0 */
+}
+struct RTC_ALIGN(16) RTCPointQueryFunctionArguments // argument block handed to an RTCPointQueryFunction callback
+{
+  // The (world space) query object that was passed as an argument of rtcPointQuery. The
+  // radius of the query can be decreased inside the callback to shrink the
+  // search domain. Increasing the radius or modifying the time or position of
+  // the query results in undefined behaviour.
+  struct RTCPointQuery* query;
+
+  // Used for user input/output data. Will not be read or modified internally.
+  void* userPtr;
+
+  // primitive and geometry ID of primitive
+  unsigned int  primID;
+  unsigned int  geomID;
+
+  // the context with transformation and instance ID stack
+  struct RTCPointQueryContext* context;
+
+  // Scale factor of the current instance transform M (= context->world2inst[context->instStackSize]).
+  // M is a similarity transform if there is a constant factor similarityScale such that
+  //    for all x,y: dist(Mx, My) = similarityScale * dist(x, y).
+  // The similarity scale is 0 if, and only if, the current instance transform is
+  // not a similarity transform. A non-zero similarity scale allows computing
+  // distance information in instance space and scaling the distances into world
+  // space by dividing by the similarity scale, for example, to update the
+  // query radius. If the current instance transform is not a similarity
+  // transform (similarityScale = 0), the distance computation has to be
+  // performed in world space to ensure correctness. If there is no instance
+  // transform (context->instStackSize == 0), the similarity scale is 1.
+  float similarityScale;
+};
+
+typedef bool (*RTCPointQueryFunction)(struct RTCPointQueryFunctionArguments* args); // point query callback type; registered per geometry with rtcSetGeometryPointQueryFunction
+
+RTC_NAMESPACE_END
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_config.h b/thirdparty/embree-aarch64/include/embree3/rtcore_config.h
new file mode 100644
index 0000000000..337d4e9487
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore_config.h
@@ -0,0 +1,57 @@
+
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define RTC_VERSION_MAJOR 3
+#define RTC_VERSION_MINOR 12
+#define RTC_VERSION_PATCH 1
+#define RTC_VERSION 31201 /* equals major*10000 + minor*100 + patch for the three values above */
+#define RTC_VERSION_STRING "3.12.1"
+
+#define RTC_MAX_INSTANCE_LEVEL_COUNT 1 /* sizes the instID / transform arrays in rtcore_common.h */
+
+#define EMBREE_MIN_WIDTH 0
+#define RTC_MIN_WIDTH EMBREE_MIN_WIDTH /* 0 here, so the "#if RTC_MIN_WIDTH" members and code are compiled out */
+
+#define EMBREE_STATIC_LIB /* selects the static-library branch of RTC_API_IMPORT / RTC_API_EXPORT below */
+/* #undef EMBREE_API_NAMESPACE  (left undefined: the API is declared without a wrapping namespace) */
+
+#if defined(EMBREE_API_NAMESPACE) /* NOTE(review): this branch has no namespace name filled in ("namespace  {", "using namespace ;") - confirm it is never enabled with this config */
+#  define RTC_NAMESPACE
+#  define RTC_NAMESPACE_BEGIN namespace  {
+#  define RTC_NAMESPACE_END }
+#  define RTC_NAMESPACE_USE using namespace ;
+#  define RTC_API_EXTERN_C
+#  undef EMBREE_API_NAMESPACE
+#else /* no API namespace: the namespace macros expand to nothing */
+#  define RTC_NAMESPACE_BEGIN
+#  define RTC_NAMESPACE_END
+#  define RTC_NAMESPACE_USE
+#  if defined(__cplusplus) /* give the API C linkage when compiled as C++ */
+#    define RTC_API_EXTERN_C extern "C"
+#  else
+#    define RTC_API_EXTERN_C
+#  endif
+#endif
+
+#if defined(ISPC) /* RTC_API_IMPORT / RTC_API_EXPORT: linkage decoration, chosen per build flavour */
+#  define RTC_API_IMPORT extern "C" unmasked
+#  define RTC_API_EXPORT extern "C" unmasked
+#elif defined(EMBREE_STATIC_LIB) /* static library: no import/export attributes */
+#  define RTC_API_IMPORT RTC_API_EXTERN_C
+#  define RTC_API_EXPORT RTC_API_EXTERN_C
+#elif defined(_WIN32) /* Windows DLL */
+#  define RTC_API_IMPORT RTC_API_EXTERN_C __declspec(dllimport)
+#  define RTC_API_EXPORT RTC_API_EXTERN_C __declspec(dllexport)
+#else /* other shared-library builds: exports get default visibility */
+#  define RTC_API_IMPORT RTC_API_EXTERN_C
+#  define RTC_API_EXPORT RTC_API_EXTERN_C __attribute__ ((visibility ("default")))
+#endif
+
+#if defined(RTC_EXPORT_API) /* RTC_API is the export form when RTC_EXPORT_API is defined, otherwise the import form */
+#  define RTC_API RTC_API_EXPORT
+#else
+#  define RTC_API RTC_API_IMPORT
+#endif
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_device.h b/thirdparty/embree-aarch64/include/embree3/rtcore_device.h
new file mode 100644
index 0000000000..594e2b755d
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore_device.h
@@ -0,0 +1,87 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_common.h"
+
+RTC_NAMESPACE_BEGIN
+
+/* Opaque device type: a pointer to the incomplete struct RTCDeviceTy. */
+typedef struct RTCDeviceTy* RTCDevice;
+
+/* Creates a new Embree device from the configuration string config. */
+RTC_API RTCDevice rtcNewDevice(const char* config);
+
+/* Retains the Embree device (increments the reference count). */
+RTC_API void rtcRetainDevice(RTCDevice device);
+
+/* Releases an Embree device (decrements the reference count). */
+RTC_API void rtcReleaseDevice(RTCDevice device);
+
+/* Device properties: the prop selector of rtcGetDeviceProperty / rtcSetDeviceProperty. Values are explicit, in groups starting at 0, 32, 63, 96 and 128. */
+enum RTCDeviceProperty
+{
+  RTC_DEVICE_PROPERTY_VERSION       = 0,
+  RTC_DEVICE_PROPERTY_VERSION_MAJOR = 1,
+  RTC_DEVICE_PROPERTY_VERSION_MINOR = 2,
+  RTC_DEVICE_PROPERTY_VERSION_PATCH = 3,
+
+  RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED  = 32,
+  RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED  = 33,
+  RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED = 34,
+  RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED   = 35,
+
+  RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED = 63,
+  RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED          = 64,
+  RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED    = 65,
+  RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED   = 66,
+  RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED = 67,
+  RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED       = 68,
+
+  RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED    = 96,
+  RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED        = 97,
+  RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED = 98,
+  RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED       = 99,
+  RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED        = 100,
+  RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED       = 101,
+
+  RTC_DEVICE_PROPERTY_TASKING_SYSTEM        = 128,
+  RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED = 129,
+  RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED = 130
+};
+
+/* Gets a device property; the value is returned as an ssize_t. */
+RTC_API ssize_t rtcGetDeviceProperty(RTCDevice device, enum RTCDeviceProperty prop);
+
+/* Sets a device property to the given ssize_t value. */
+RTC_API void rtcSetDeviceProperty(RTCDevice device, const enum RTCDeviceProperty prop, ssize_t value);
+  
+/* Error codes returned by rtcGetDeviceError and passed to RTCErrorFunction callbacks; RTC_ERROR_NONE (0) means no error. */
+enum RTCError
+{
+  RTC_ERROR_NONE              = 0,
+  RTC_ERROR_UNKNOWN           = 1,
+  RTC_ERROR_INVALID_ARGUMENT  = 2,
+  RTC_ERROR_INVALID_OPERATION = 3,
+  RTC_ERROR_OUT_OF_MEMORY     = 4,
+  RTC_ERROR_UNSUPPORTED_CPU   = 5,
+  RTC_ERROR_CANCELLED         = 6
+};
+
+/* Returns the error code of the given device. */
+RTC_API enum RTCError rtcGetDeviceError(RTCDevice device);
+
+/* Error callback function: receives the user pointer, the error code and a message string. */
+typedef void (*RTCErrorFunction)(void* userPtr, enum RTCError code, const char* str);
+
+/* Sets the error callback function together with the user pointer handed back to it. */
+RTC_API void rtcSetDeviceErrorFunction(RTCDevice device, RTCErrorFunction error, void* userPtr);
+
+/* Memory monitor callback function. NOTE(review): ptr is presumably the userPtr registered below, and bytes a signed size delta - confirm. */
+typedef bool (*RTCMemoryMonitorFunction)(void* ptr, ssize_t bytes, bool post);
+
+/* Sets the memory monitor callback function together with a user pointer. */
+RTC_API void rtcSetDeviceMemoryMonitorFunction(RTCDevice device, RTCMemoryMonitorFunction memoryMonitor, void* userPtr);
+
+RTC_NAMESPACE_END
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_geometry.h b/thirdparty/embree-aarch64/include/embree3/rtcore_geometry.h
new file mode 100644
index 0000000000..c70f1b0e5c
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore_geometry.h
@@ -0,0 +1,383 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_buffer.h"
+#include "rtcore_quaternion.h"
+
+RTC_NAMESPACE_BEGIN
+
+/* Opaque scene type: a pointer to the incomplete struct RTCSceneTy. */
+typedef struct RTCSceneTy* RTCScene;
+
+/* Opaque geometry type: a pointer to the incomplete struct RTCGeometryTy. */
+typedef struct RTCGeometryTy* RTCGeometry;
+
+/* Types of geometries, passed to rtcNewGeometry. Values are explicit and sparse. */
+enum RTCGeometryType
+{
+  RTC_GEOMETRY_TYPE_TRIANGLE = 0, // triangle mesh
+  RTC_GEOMETRY_TYPE_QUAD     = 1, // quad (triangle pair) mesh
+  RTC_GEOMETRY_TYPE_GRID     = 2, // grid mesh
+
+  RTC_GEOMETRY_TYPE_SUBDIVISION = 8, // Catmull-Clark subdivision surface
+
+  RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE   = 15, // cone linear curves - discontinuous at edge boundaries
+  RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE  = 16, // round (rounded cone like) linear curves
+  RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE   = 17, // flat (ribbon-like) linear curves
+
+  RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE  = 24, // round (tube-like) Bezier curves
+  RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE   = 25, // flat (ribbon-like) Bezier curves
+  RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE  = 26, // flat normal-oriented Bezier curves
+
+  RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE = 32, // round (tube-like) B-spline curves
+  RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE  = 33, // flat (ribbon-like) B-spline curves
+  RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE  = 34, // flat normal-oriented B-spline curves
+
+  RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE = 40, // round (tube-like) Hermite curves
+  RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE  = 41, // flat (ribbon-like) Hermite curves
+  RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE  = 42, // flat normal-oriented Hermite curves
+
+  RTC_GEOMETRY_TYPE_SPHERE_POINT = 50, // point primitives (sphere / disc / oriented disc, going by the enumerator names)
+  RTC_GEOMETRY_TYPE_DISC_POINT = 51,
+  RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT = 52,
+
+  RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE = 58, // round (tube-like) Catmull-Rom curves
+  RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE  = 59, // flat (ribbon-like) Catmull-Rom curves
+  RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE  = 60, // flat normal-oriented Catmull-Rom curves
+
+  RTC_GEOMETRY_TYPE_USER     = 120, // user-defined geometry
+  RTC_GEOMETRY_TYPE_INSTANCE = 121  // scene instance
+};
+
+/* Interpolation modes for subdivision surfaces (consecutive values 0..4). */
+enum RTCSubdivisionMode
+{
+  RTC_SUBDIVISION_MODE_NO_BOUNDARY     = 0,
+  RTC_SUBDIVISION_MODE_SMOOTH_BOUNDARY = 1,
+  RTC_SUBDIVISION_MODE_PIN_CORNERS     = 2,
+  RTC_SUBDIVISION_MODE_PIN_BOUNDARY    = 3,
+  RTC_SUBDIVISION_MODE_PIN_ALL         = 4,
+};
+
+/* Curve segment flags: independent bits that can be combined. */
+enum RTCCurveFlags
+{
+  RTC_CURVE_FLAG_NEIGHBOR_LEFT  = (1 << 0), // left segment exists
+  RTC_CURVE_FLAG_NEIGHBOR_RIGHT = (1 << 1)  // right segment exists
+};
+
+/* Arguments for RTCBoundsFunction: identifies one primitive at one time step and carries the output bounds pointer. */
+struct RTCBoundsFunctionArguments
+{
+  void* geometryUserPtr;
+  unsigned int primID;
+  unsigned int timeStep;
+  struct RTCBounds* bounds_o; // writable bounds pointer (the _o suffix presumably marks it as the output - confirm)
+};
+
+/* Bounding callback function; registered with rtcSetGeometryBoundsFunction. */
+typedef void (*RTCBoundsFunction)(const struct RTCBoundsFunctionArguments* args);
+
+/* Arguments for RTCIntersectFunctionN: one primitive (primID/geomID) and a packet of N ray/hit pairs. */
+struct RTCIntersectFunctionNArguments
+{
+  int* valid;
+  void* geometryUserPtr;
+  unsigned int primID;
+  struct RTCIntersectContext* context;
+  struct RTCRayHitN* rayhit;
+  unsigned int N;
+  unsigned int geomID;
+};
+
+/* Intersection callback function; registered with rtcSetGeometryIntersectFunction. */
+typedef void (*RTCIntersectFunctionN)(const struct RTCIntersectFunctionNArguments* args);
+
+/* Arguments for RTCOccludedFunctionN: same shape as the intersect arguments, but with a ray packet instead of a ray/hit packet. */
+struct RTCOccludedFunctionNArguments
+{
+  int* valid;
+  void* geometryUserPtr;
+  unsigned int primID;
+  struct RTCIntersectContext* context;
+  struct RTCRayN* ray;
+  unsigned int N;
+  unsigned int geomID;
+};
+
+/* Occlusion callback function; registered with rtcSetGeometryOccludedFunction. */
+typedef void (*RTCOccludedFunctionN)(const struct RTCOccludedFunctionNArguments* args);
+
+/* Arguments for RTCDisplacementFunctionN: read-only u/v and Ng_* arrays plus writable P_* arrays (presumably N entries each - confirm). */
+struct RTCDisplacementFunctionNArguments
+{
+  void* geometryUserPtr;
+  RTCGeometry geometry;
+  unsigned int primID;
+  unsigned int timeStep;
+  const float* u;     // const inputs
+  const float* v;
+  const float* Ng_x;
+  const float* Ng_y;
+  const float* Ng_z;
+  float* P_x;         // non-const: the only members the callback can write through
+  float* P_y;
+  float* P_z;
+  unsigned int N;
+};
+
+/* Displacement mapping callback function; receives a const pointer to the argument struct above. */
+typedef void (*RTCDisplacementFunctionN)(const struct RTCDisplacementFunctionNArguments* args);
+
+/* Creates a new geometry of the specified type on the given device. */
+RTC_API RTCGeometry rtcNewGeometry(RTCDevice device, enum RTCGeometryType type);
+
+/* Retains the geometry (increments the reference count). */
+RTC_API void rtcRetainGeometry(RTCGeometry geometry);
+
+/* Releases the geometry (decrements the reference count). */
+RTC_API void rtcReleaseGeometry(RTCGeometry geometry);
+
+/* Commits the geometry. */
+RTC_API void rtcCommitGeometry(RTCGeometry geometry);
+
+
+/* Enables the geometry. */
+RTC_API void rtcEnableGeometry(RTCGeometry geometry);
+
+/* Disables the geometry. */
+RTC_API void rtcDisableGeometry(RTCGeometry geometry);
+
+
+/* Sets the number of motion blur time steps of the geometry. */
+RTC_API void rtcSetGeometryTimeStepCount(RTCGeometry geometry, unsigned int timeStepCount);
+
+/* Sets the motion blur time range of the geometry (start and end time). */
+RTC_API void rtcSetGeometryTimeRange(RTCGeometry geometry, float startTime, float endTime);
+
+/* Sets the number of vertex attributes of the geometry. */
+RTC_API void rtcSetGeometryVertexAttributeCount(RTCGeometry geometry, unsigned int vertexAttributeCount);
+
+/* Sets the ray mask of the geometry. */
+RTC_API void rtcSetGeometryMask(RTCGeometry geometry, unsigned int mask);
+
+/* Sets the build quality of the geometry. */
+RTC_API void rtcSetGeometryBuildQuality(RTCGeometry geometry, enum RTCBuildQuality quality);
+
+/* Sets the maximal curve or point radius scale allowed by the min-width feature. */
+RTC_API void rtcSetGeometryMaxRadiusScale(RTCGeometry geometry, float maxRadiusScale);
+
+
+/* Sets a geometry buffer: binds an existing RTCBuffer to the (type, slot) pair, with byte offset, byte stride and item count. */
+RTC_API void rtcSetGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, RTCBuffer buffer, size_t byteOffset, size_t byteStride, size_t itemCount);
+
+/* Sets a shared geometry buffer: like rtcSetGeometryBuffer, but takes a caller-supplied const pointer instead of an RTCBuffer. */
+RTC_API void rtcSetSharedGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, const void* ptr, size_t byteOffset, size_t byteStride, size_t itemCount);
+
+/* Creates and sets a new geometry buffer; returns an untyped pointer (presumably to the new buffer's data - confirm). */
+RTC_API void* rtcSetNewGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, size_t byteStride, size_t itemCount);
+
+/* Returns the pointer to the data of the buffer bound to the (type, slot) pair. */
+RTC_API void* rtcGetGeometryBufferData(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot);
+
+/* Updates the geometry buffer bound to the (type, slot) pair. */
+RTC_API void rtcUpdateGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot);
+
+
+/* Sets the intersection filter callback function of the geometry. */
+RTC_API void rtcSetGeometryIntersectFilterFunction(RTCGeometry geometry, RTCFilterFunctionN filter);
+
+/* Sets the occlusion filter callback function of the geometry. */
+RTC_API void rtcSetGeometryOccludedFilterFunction(RTCGeometry geometry, RTCFilterFunctionN filter);
+
+/* Sets the user-defined data pointer of the geometry. */
+RTC_API void rtcSetGeometryUserData(RTCGeometry geometry, void* ptr);
+
+/* Gets the user-defined data pointer of the geometry. */
+RTC_API void* rtcGetGeometryUserData(RTCGeometry geometry);
+
+/* Sets the point query callback function of a geometry. */
+RTC_API void rtcSetGeometryPointQueryFunction(RTCGeometry geometry, RTCPointQueryFunction pointQuery);
+
+/* Sets the number of primitives of a user geometry. */
+RTC_API void rtcSetGeometryUserPrimitiveCount(RTCGeometry geometry, unsigned int userPrimitiveCount);
+
+/* Sets the bounding callback function to calculate bounding boxes for user primitives. */
+RTC_API void rtcSetGeometryBoundsFunction(RTCGeometry geometry, RTCBoundsFunction bounds, void* userPtr);
+
+/* Sets the intersect callback function of a user geometry. */
+RTC_API void rtcSetGeometryIntersectFunction(RTCGeometry geometry, RTCIntersectFunctionN intersect);
+
+/* Sets the occlusion callback function of a user geometry. */
+RTC_API void rtcSetGeometryOccludedFunction(RTCGeometry geometry, RTCOccludedFunctionN occluded);
+
+/* Invokes the intersection filter from the intersection callback function. */
+RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* args, const struct RTCFilterFunctionNArguments* filterArgs);
+
+/* Invokes the occlusion filter from the occlusion callback function. */
+RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* args, const struct RTCFilterFunctionNArguments* filterArgs);
+
+
+/* Sets the instanced scene of an instance geometry. */
+RTC_API void rtcSetGeometryInstancedScene(RTCGeometry geometry, RTCScene scene);
+
+/* Sets the transformation of an instance for the specified time step. */
+RTC_API void rtcSetGeometryTransform(RTCGeometry geometry, unsigned int timeStep, enum RTCFormat format, const void* xfm);
+
+/* Sets the transformation quaternion of an instance for the specified time step. */
+RTC_API void rtcSetGeometryTransformQuaternion(RTCGeometry geometry, unsigned int timeStep, const struct RTCQuaternionDecomposition* qd);
+
+/* Returns the interpolated transformation of an instance for the specified time. */
+RTC_API void rtcGetGeometryTransform(RTCGeometry geometry, float time, enum RTCFormat format, void* xfm);
+
+
+/* Sets the uniform tessellation rate of the geometry. */
+RTC_API void rtcSetGeometryTessellationRate(RTCGeometry geometry, float tessellationRate);
+
+/* Sets the number of topologies of a subdivision surface. */
+RTC_API void rtcSetGeometryTopologyCount(RTCGeometry geometry, unsigned int topologyCount);
+
+/* Sets the subdivision interpolation mode. */
+RTC_API void rtcSetGeometrySubdivisionMode(RTCGeometry geometry, unsigned int topologyID, enum RTCSubdivisionMode mode);
+
+/* Binds a vertex attribute to a topology of the geometry. */
+RTC_API void rtcSetGeometryVertexAttributeTopology(RTCGeometry geometry, unsigned int vertexAttributeID, unsigned int topologyID);
+
+/* Sets the displacement callback function of a subdivision surface. */
+RTC_API void rtcSetGeometryDisplacementFunction(RTCGeometry geometry, RTCDisplacementFunctionN displacement);
+
+/* Returns the first half edge of a face. */
+RTC_API unsigned int rtcGetGeometryFirstHalfEdge(RTCGeometry geometry, unsigned int faceID);
+
+/* Returns the face the half edge belongs to. */
+RTC_API unsigned int rtcGetGeometryFace(RTCGeometry geometry, unsigned int edgeID);
+
+/* Returns next half edge. */
+RTC_API unsigned int rtcGetGeometryNextHalfEdge(RTCGeometry geometry, unsigned int edgeID);
+
+/* Returns previous half edge. */
+RTC_API unsigned int rtcGetGeometryPreviousHalfEdge(RTCGeometry geometry, unsigned int edgeID);
+
+/* Returns opposite half edge. */
+RTC_API unsigned int rtcGetGeometryOppositeHalfEdge(RTCGeometry geometry, unsigned int topologyID, unsigned int edgeID);
+
+
+/* Arguments for rtcInterpolate; the rtcInterpolate0/1/2 wrappers fill in one of these per call */
+struct RTCInterpolateArguments
+{
+  RTCGeometry geometry;          // geometry whose vertex data is interpolated
+  unsigned int primID;           // primitive ID
+  float u;                       // u coordinate of the interpolation location
+  float v;                       // v coordinate of the interpolation location
+  enum RTCBufferType bufferType; // type of the buffer to interpolate
+  unsigned int bufferSlot;       // slot of the buffer to interpolate
+  float* P;                      // output: interpolated data
+  float* dPdu;                   // output: first order derivative in u (wrappers pass NULL when not requested)
+  float* dPdv;                   // output: first order derivative in v (wrappers pass NULL when not requested)
+  float* ddPdudu;                // output: second order derivative in u,u (wrappers pass NULL when not requested)
+  float* ddPdvdv;                // output: second order derivative in v,v (wrappers pass NULL when not requested)
+  float* ddPdudv;                // output: mixed second order derivative in u,v (wrappers pass NULL when not requested)
+  unsigned int valueCount;       // presumably the number of floats per output array -- TODO confirm
+};
+
+/* Interpolates vertex data to some u/v location and optionally calculates all derivatives. */
+RTC_API void rtcInterpolate(const struct RTCInterpolateArguments* args);
+
+/* Interpolates vertex data to some u/v location; only P is requested, all derivative pointers are passed as NULL. */
+RTC_FORCEINLINE void rtcInterpolate0(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot, float* P, unsigned int valueCount)
+{
+  struct RTCInterpolateArguments query;
+  query.geometry   = geometry;
+  query.primID     = primID;
+  query.u          = u;
+  query.v          = v;
+  query.bufferType = bufferType;
+  query.bufferSlot = bufferSlot;
+  query.valueCount = valueCount;
+  query.P          = P;
+  query.dPdu       = NULL;  /* no first order derivatives */
+  query.dPdv       = NULL;
+  query.ddPdudu    = NULL;  /* no second order derivatives */
+  query.ddPdvdv    = NULL;
+  query.ddPdudv    = NULL;
+  rtcInterpolate(&query);
+}
+
+/* Interpolates vertex data to some u/v location and calculates first order derivatives (second order pointers are NULL). */
+RTC_FORCEINLINE void rtcInterpolate1(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot,
+                                     float* P, float* dPdu, float* dPdv, unsigned int valueCount)
+{
+  struct RTCInterpolateArguments query;
+  query.geometry   = geometry;
+  query.primID     = primID;
+  query.u          = u;
+  query.v          = v;
+  query.bufferType = bufferType;
+  query.bufferSlot = bufferSlot;
+  query.valueCount = valueCount;
+  query.P          = P;
+  query.dPdu       = dPdu;  /* first order derivatives go to the caller's buffers */
+  query.dPdv       = dPdv;
+  query.ddPdudu    = NULL;  /* no second order derivatives */
+  query.ddPdvdv    = NULL;
+  query.ddPdudv    = NULL;
+  rtcInterpolate(&query);
+}
+
+/* Interpolates vertex data to some u/v location and calculates first and second order derivatives. */
+RTC_FORCEINLINE void rtcInterpolate2(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot,
+                                     float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, unsigned int valueCount)
+{
+  struct RTCInterpolateArguments query;
+  query.geometry   = geometry;
+  query.primID     = primID;
+  query.u          = u;
+  query.v          = v;
+  query.bufferType = bufferType;
+  query.bufferSlot = bufferSlot;
+  query.valueCount = valueCount;
+  query.P          = P;
+  query.dPdu       = dPdu;     /* first order derivatives */
+  query.dPdv       = dPdv;
+  query.ddPdudu    = ddPdudu;  /* second order derivatives */
+  query.ddPdvdv    = ddPdvdv;
+  query.ddPdudv    = ddPdudv;
+  rtcInterpolate(&query);
+}
+
+/* Arguments for rtcInterpolateN, which interpolates vertex data to an array of u/v locations */
+struct RTCInterpolateNArguments
+{
+  RTCGeometry geometry;          // geometry whose vertex data is interpolated
+  const void* valid;             // presumably a per-location validity mask; element type not visible here -- TODO confirm
+  const unsigned int* primIDs;   // primitive IDs (read-only)
+  const float* u;                // u coordinates (read-only)
+  const float* v;                // v coordinates (read-only)
+  unsigned int N;                // presumably the number of u/v locations -- TODO confirm
+  enum RTCBufferType bufferType; // type of the buffer to interpolate
+  unsigned int bufferSlot;       // slot of the buffer to interpolate
+  float* P;                      // output: interpolated data
+  float* dPdu;                   // output: first order derivative in u
+  float* dPdv;                   // output: first order derivative in v
+  float* ddPdudu;                // output: second order derivative in u,u
+  float* ddPdvdv;                // output: second order derivative in v,v
+  float* ddPdudv;                // output: mixed second order derivative in u,v
+  unsigned int valueCount;       // presumably the number of floats per location and output -- TODO confirm
+};
+
+/* Interpolates vertex data to an array of u/v locations. */
+RTC_API void rtcInterpolateN(const struct RTCInterpolateNArguments* args);
+
+/* RTCGrid primitive for grid mesh: describes one grid by a start vertex, a stride and a width/height */
+struct RTCGrid
+{
+  unsigned int startVertexID;  // presumably the index of the grid's first vertex -- TODO confirm
+  unsigned int stride;         // presumably the vertex distance between consecutive grid rows -- TODO confirm
+  unsigned short width,height; // max is a 32k x 32k grid
+};
+
+RTC_NAMESPACE_END
+
+
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_quaternion.h b/thirdparty/embree-aarch64/include/embree3/rtcore_quaternion.h
new file mode 100644
index 0000000000..449cdedfdc
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore_quaternion.h
@@ -0,0 +1,101 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_common.h"
+
+RTC_NAMESPACE_BEGIN
+
+/*
+ * Structure for transformation representation as a matrix decomposition using
+ * a quaternion
+ */
+struct RTC_ALIGN(16) RTCQuaternionDecomposition
+{
+  float scale_x;       // scale terms; rtcInitQuaternionDecomposition sets them to 1
+  float scale_y;
+  float scale_z;
+  float skew_xy;       // skew terms; rtcInitQuaternionDecomposition sets them to 0
+  float skew_xz;
+  float skew_yz;
+  float shift_x;       // shift terms; rtcInitQuaternionDecomposition sets them to 0
+  float shift_y;
+  float shift_z;
+  float quaternion_r;  // quaternion components; initialized to (r,i,j,k) = (1,0,0,0)
+  float quaternion_i;
+  float quaternion_j;
+  float quaternion_k;
+  float translation_x; // translation terms; rtcInitQuaternionDecomposition sets them to 0
+  float translation_y;
+  float translation_z;
+};
+
+/* Resets *qdecomp to its default state: scale 1, skew 0, shift 0, quaternion
+ * (r,i,j,k) = (1,0,0,0) and translation 0. All 16 fields are overwritten, so
+ * no field of the struct needs to hold a meaningful value on entry. */
+RTC_FORCEINLINE void rtcInitQuaternionDecomposition(struct RTCQuaternionDecomposition* qdecomp)
+{
+  /* scale terms */
+  qdecomp->scale_x = qdecomp->scale_y = qdecomp->scale_z = 1.f;
+
+  /* skew and shift terms */
+  qdecomp->skew_xy = qdecomp->skew_xz = qdecomp->skew_yz = 0.f;
+  qdecomp->shift_x = qdecomp->shift_y = qdecomp->shift_z = 0.f;
+
+  /* quaternion (r,i,j,k) = (1,0,0,0) */
+  qdecomp->quaternion_r = 1.f;
+  qdecomp->quaternion_i = qdecomp->quaternion_j = qdecomp->quaternion_k = 0.f;
+
+  /* translation terms */
+  qdecomp->translation_x = qdecomp->translation_y = qdecomp->translation_z = 0.f;
+}
+
+/* Overwrites the four quaternion components (r, i, j, k); all other fields of *qdecomp stay untouched. */
+RTC_FORCEINLINE void rtcQuaternionDecompositionSetQuaternion(
+  struct RTCQuaternionDecomposition* qdecomp, float r, float i, float j, float k)
+{
+  qdecomp->quaternion_r = r;
+  qdecomp->quaternion_i = i;
+  qdecomp->quaternion_j = j;
+  qdecomp->quaternion_k = k;
+}
+
+/* Overwrites the three scale terms; all other fields of *qdecomp stay untouched. */
+RTC_FORCEINLINE void rtcQuaternionDecompositionSetScale(
+  struct RTCQuaternionDecomposition* qdecomp, float scale_x, float scale_y, float scale_z)
+{
+  qdecomp->scale_x = scale_x;
+  qdecomp->scale_y = scale_y;
+  qdecomp->scale_z = scale_z;
+}
+
+/* Overwrites the three skew terms (xy, xz, yz); all other fields of *qdecomp stay untouched. */
+RTC_FORCEINLINE void rtcQuaternionDecompositionSetSkew(
+  struct RTCQuaternionDecomposition* qdecomp, float skew_xy, float skew_xz, float skew_yz)
+{
+  qdecomp->skew_xy = skew_xy;
+  qdecomp->skew_xz = skew_xz;
+  qdecomp->skew_yz = skew_yz;
+}
+
+/* Overwrites the three shift terms; all other fields of *qdecomp stay untouched. */
+RTC_FORCEINLINE void rtcQuaternionDecompositionSetShift(
+  struct RTCQuaternionDecomposition* qdecomp, float shift_x, float shift_y, float shift_z)
+{
+  qdecomp->shift_x = shift_x;
+  qdecomp->shift_y = shift_y;
+  qdecomp->shift_z = shift_z;
+}
+
+/* Overwrites the three translation terms; all other fields of *qdecomp stay untouched. */
+RTC_FORCEINLINE void rtcQuaternionDecompositionSetTranslation(
+  struct RTCQuaternionDecomposition* qdecomp, float translation_x, float translation_y, float translation_z)
+{
+  qdecomp->translation_x = translation_x;
+  qdecomp->translation_y = translation_y;
+  qdecomp->translation_z = translation_z;
+}
+
+RTC_NAMESPACE_END
+
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_ray.h b/thirdparty/embree-aarch64/include/embree3/rtcore_ray.h
new file mode 100644
index 0000000000..1ae3309ef1
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore_ray.h
@@ -0,0 +1,378 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_common.h"
+
+RTC_NAMESPACE_BEGIN
+
+/* Ray structure for a single ray; the RTCRayN_* accessors in this header use the same field order for SOA packets */
+struct RTC_ALIGN(16) RTCRay
+{
+  float org_x;        // x coordinate of ray origin
+  float org_y;        // y coordinate of ray origin
+  float org_z;        // z coordinate of ray origin
+  float tnear;        // start of ray segment
+
+  float dir_x;        // x coordinate of ray direction
+  float dir_y;        // y coordinate of ray direction
+  float dir_z;        // z coordinate of ray direction
+  float time;         // time of this ray for motion blur
+
+  float tfar;         // end of ray segment (set to hit distance)
+  unsigned int mask;  // ray mask
+  unsigned int id;    // ray ID
+  unsigned int flags; // ray flags
+};
+
+/* Hit structure for a single ray; the RTCHitN_* accessors in this header use the same field order for SOA packets */
+struct RTC_ALIGN(16) RTCHit
+{
+  float Ng_x;          // x coordinate of geometry normal
+  float Ng_y;          // y coordinate of geometry normal
+  float Ng_z;          // z coordinate of geometry normal
+
+  float u;             // barycentric u coordinate of hit
+  float v;             // barycentric v coordinate of hit
+
+  unsigned int primID; // primitive ID
+  unsigned int geomID; // geometry ID
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID, one entry per instance level
+};
+
+/* Combined ray/hit structure for a single ray (ray first, hit second) */
+struct RTCRayHit
+{
+  struct RTCRay ray; // the ray
+  struct RTCHit hit; // hit record stored alongside the ray
+};
+
+/* Ray structure for a packet of 4 rays (SOA: one 4-element array per RTCRay field) */
+struct RTC_ALIGN(16) RTCRay4
+{
+  float org_x[4];        // x coordinates of ray origins
+  float org_y[4];        // y coordinates of ray origins
+  float org_z[4];        // z coordinates of ray origins
+  float tnear[4];        // starts of ray segments
+
+  float dir_x[4];        // x coordinates of ray directions
+  float dir_y[4];        // y coordinates of ray directions
+  float dir_z[4];        // z coordinates of ray directions
+  float time[4];         // ray times for motion blur
+
+  float tfar[4];         // ends of ray segments
+  unsigned int mask[4];  // ray masks
+  unsigned int id[4];    // ray IDs
+  unsigned int flags[4]; // ray flags
+};
+
+/* Hit structure for a packet of 4 rays (SOA: one 4-element array per RTCHit field) */
+struct RTC_ALIGN(16) RTCHit4
+{
+  float Ng_x[4];          // x coordinates of geometry normals
+  float Ng_y[4];          // y coordinates of geometry normals
+  float Ng_z[4];          // z coordinates of geometry normals
+
+  float u[4];             // barycentric u coordinates of hits
+  float v[4];             // barycentric v coordinates of hits
+
+  unsigned int primID[4]; // primitive IDs
+  unsigned int geomID[4]; // geometry IDs
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][4]; // instance IDs, indexed [level][ray]
+};
+
+/* Combined ray/hit structure for a packet of 4 rays (ray packet first, hit packet second) */
+struct RTCRayHit4
+{
+  struct RTCRay4 ray; // the 4 rays
+  struct RTCHit4 hit; // hit records stored alongside the rays
+};
+
+/* Ray structure for a packet of 8 rays (SOA: one 8-element array per RTCRay field) */
+struct RTC_ALIGN(32) RTCRay8
+{
+  float org_x[8];        // x coordinates of ray origins
+  float org_y[8];        // y coordinates of ray origins
+  float org_z[8];        // z coordinates of ray origins
+  float tnear[8];        // starts of ray segments
+
+  float dir_x[8];        // x coordinates of ray directions
+  float dir_y[8];        // y coordinates of ray directions
+  float dir_z[8];        // z coordinates of ray directions
+  float time[8];         // ray times for motion blur
+
+  float tfar[8];         // ends of ray segments
+  unsigned int mask[8];  // ray masks
+  unsigned int id[8];    // ray IDs
+  unsigned int flags[8]; // ray flags
+};
+
+/* Hit structure for a packet of 8 rays (SOA: one 8-element array per RTCHit field) */
+struct RTC_ALIGN(32) RTCHit8
+{
+  float Ng_x[8];          // x coordinates of geometry normals
+  float Ng_y[8];          // y coordinates of geometry normals
+  float Ng_z[8];          // z coordinates of geometry normals
+
+  float u[8];             // barycentric u coordinates of hits
+  float v[8];             // barycentric v coordinates of hits
+
+  unsigned int primID[8]; // primitive IDs
+  unsigned int geomID[8]; // geometry IDs
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][8]; // instance IDs, indexed [level][ray]
+};
+
+/* Combined ray/hit structure for a packet of 8 rays (ray packet first, hit packet second) */
+struct RTCRayHit8
+{
+  struct RTCRay8 ray; // the 8 rays
+  struct RTCHit8 hit; // hit records stored alongside the rays
+};
+
+/* Ray structure for a packet of 16 rays (SOA: one 16-element array per RTCRay field) */
+struct RTC_ALIGN(64) RTCRay16
+{
+  float org_x[16];        // x coordinates of ray origins
+  float org_y[16];        // y coordinates of ray origins
+  float org_z[16];        // z coordinates of ray origins
+  float tnear[16];        // starts of ray segments
+
+  float dir_x[16];        // x coordinates of ray directions
+  float dir_y[16];        // y coordinates of ray directions
+  float dir_z[16];        // z coordinates of ray directions
+  float time[16];         // ray times for motion blur
+
+  float tfar[16];         // ends of ray segments
+  unsigned int mask[16];  // ray masks
+  unsigned int id[16];    // ray IDs
+  unsigned int flags[16]; // ray flags
+};
+
+/* Hit structure for a packet of 16 rays (SOA: one 16-element array per RTCHit field) */
+struct RTC_ALIGN(64) RTCHit16
+{
+  float Ng_x[16];          // x coordinates of geometry normals
+  float Ng_y[16];          // y coordinates of geometry normals
+  float Ng_z[16];          // z coordinates of geometry normals
+
+  float u[16];             // barycentric u coordinates of hits
+  float v[16];             // barycentric v coordinates of hits
+
+  unsigned int primID[16]; // primitive IDs
+  unsigned int geomID[16]; // geometry IDs
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; // instance IDs, indexed [level][ray]
+};
+
+/* Combined ray/hit structure for a packet of 16 rays (ray packet first, hit packet second) */
+struct RTCRayHit16
+{
+  struct RTCRay16 ray; // the 16 rays
+  struct RTCHit16 hit; // hit records stored alongside the rays
+};
+
+/* Ray structure for a packet/stream of N rays in pointer SOA layout (one pointer per RTCRay field) */
+struct RTCRayNp
+{
+  float* org_x;        // x coordinates of ray origins
+  float* org_y;        // y coordinates of ray origins
+  float* org_z;        // z coordinates of ray origins
+  float* tnear;        // starts of ray segments
+
+  float* dir_x;        // x coordinates of ray directions
+  float* dir_y;        // y coordinates of ray directions
+  float* dir_z;        // z coordinates of ray directions
+  float* time;         // ray times for motion blur
+
+  float* tfar;         // ends of ray segments
+  unsigned int* mask;  // ray masks
+  unsigned int* id;    // ray IDs
+  unsigned int* flags; // ray flags
+};
+
+/* Hit structure for a packet/stream of N rays in pointer SOA layout (one pointer per RTCHit field) */
+struct RTCHitNp
+{
+  float* Ng_x;          // x coordinates of geometry normals
+  float* Ng_y;          // y coordinates of geometry normals
+  float* Ng_z;          // z coordinates of geometry normals
+
+  float* u;             // barycentric u coordinates of hits
+  float* v;             // barycentric v coordinates of hits
+
+  unsigned int* primID; // primitive IDs
+  unsigned int* geomID; // geometry IDs
+  unsigned int* instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance IDs, one pointer per instance level
+};
+
+/* Combined ray/hit structure for a packet/stream of N rays in pointer SOA layout (ray pointers first, hit pointers second) */
+struct RTCRayHitNp
+{
+  struct RTCRayNp ray; // pointers to the ray fields
+  struct RTCHitNp hit; // pointers to the hit fields
+};
+
+struct RTCRayN;
+struct RTCHitN;
+struct RTCRayHitN;
+
+#if defined(__cplusplus)
+
+/* Accessors for ray packets of runtime size N: field k of ray i is element k*N+i of the packet viewed as a flat array */
+RTC_FORCEINLINE float& RTCRayN_org_x(RTCRayN* ray, unsigned int N, unsigned int i) { return reinterpret_cast<float*>(ray)[0*N+i]; }
+RTC_FORCEINLINE float& RTCRayN_org_y(RTCRayN* ray, unsigned int N, unsigned int i) { return reinterpret_cast<float*>(ray)[1*N+i]; }
+RTC_FORCEINLINE float& RTCRayN_org_z(RTCRayN* ray, unsigned int N, unsigned int i) { return reinterpret_cast<float*>(ray)[2*N+i]; }
+RTC_FORCEINLINE float& RTCRayN_tnear(RTCRayN* ray, unsigned int N, unsigned int i) { return reinterpret_cast<float*>(ray)[3*N+i]; }
+
+RTC_FORCEINLINE float& RTCRayN_dir_x(RTCRayN* ray, unsigned int N, unsigned int i) { return reinterpret_cast<float*>(ray)[4*N+i]; }
+RTC_FORCEINLINE float& RTCRayN_dir_y(RTCRayN* ray, unsigned int N, unsigned int i) { return reinterpret_cast<float*>(ray)[5*N+i]; }
+RTC_FORCEINLINE float& RTCRayN_dir_z(RTCRayN* ray, unsigned int N, unsigned int i) { return reinterpret_cast<float*>(ray)[6*N+i]; }
+RTC_FORCEINLINE float& RTCRayN_time (RTCRayN* ray, unsigned int N, unsigned int i) { return reinterpret_cast<float*>(ray)[7*N+i]; }
+
+RTC_FORCEINLINE float&        RTCRayN_tfar (RTCRayN* ray, unsigned int N, unsigned int i) { return reinterpret_cast<float*>(ray)[8*N+i]; }
+RTC_FORCEINLINE unsigned int& RTCRayN_mask (RTCRayN* ray, unsigned int N, unsigned int i) { return reinterpret_cast<unsigned int*>(ray)[9*N+i]; }
+RTC_FORCEINLINE unsigned int& RTCRayN_id   (RTCRayN* ray, unsigned int N, unsigned int i) { return reinterpret_cast<unsigned int*>(ray)[10*N+i]; }
+RTC_FORCEINLINE unsigned int& RTCRayN_flags(RTCRayN* ray, unsigned int N, unsigned int i) { return reinterpret_cast<unsigned int*>(ray)[11*N+i]; }
+
+/* Accessors for hit packets of runtime size N: field k of hit i is element k*N+i; instID level l adds a further N*l */
+RTC_FORCEINLINE float& RTCHitN_Ng_x(RTCHitN* hit, unsigned int N, unsigned int i) { return reinterpret_cast<float*>(hit)[0*N+i]; }
+RTC_FORCEINLINE float& RTCHitN_Ng_y(RTCHitN* hit, unsigned int N, unsigned int i) { return reinterpret_cast<float*>(hit)[1*N+i]; }
+RTC_FORCEINLINE float& RTCHitN_Ng_z(RTCHitN* hit, unsigned int N, unsigned int i) { return reinterpret_cast<float*>(hit)[2*N+i]; }
+
+RTC_FORCEINLINE float& RTCHitN_u(RTCHitN* hit, unsigned int N, unsigned int i) { return reinterpret_cast<float*>(hit)[3*N+i]; }
+RTC_FORCEINLINE float& RTCHitN_v(RTCHitN* hit, unsigned int N, unsigned int i) { return reinterpret_cast<float*>(hit)[4*N+i]; }
+
+RTC_FORCEINLINE unsigned int& RTCHitN_primID(RTCHitN* hit, unsigned int N, unsigned int i) { return reinterpret_cast<unsigned int*>(hit)[5*N+i]; }
+RTC_FORCEINLINE unsigned int& RTCHitN_geomID(RTCHitN* hit, unsigned int N, unsigned int i) { return reinterpret_cast<unsigned int*>(hit)[6*N+i]; }
+RTC_FORCEINLINE unsigned int& RTCHitN_instID(RTCHitN* hit, unsigned int N, unsigned int i, unsigned int l) { return reinterpret_cast<unsigned int*>(hit)[7*N+i+N*l]; }
+
+/* Split an RTCRayHitN packet: the ray part starts at element 0, the hit part after the 12 ray fields (element 12*N) */
+RTC_FORCEINLINE RTCRayN* RTCRayHitN_RayN(RTCRayHitN* rayhit, unsigned int N) { return reinterpret_cast<RTCRayN*>(reinterpret_cast<float*>(rayhit) + 0*N); }
+RTC_FORCEINLINE RTCHitN* RTCRayHitN_HitN(RTCRayHitN* rayhit, unsigned int N) { return reinterpret_cast<RTCHitN*>(reinterpret_cast<float*>(rayhit) + 12*N); }
+
+/* Helper structure for a ray packet of compile-time size N (same field order as RTCRay, one N-element array per field) */
+template<int N>
+struct RTCRayNt
+{
+  float org_x[N];        // x coordinates of ray origins
+  float org_y[N];        // y coordinates of ray origins
+  float org_z[N];        // z coordinates of ray origins
+  float tnear[N];        // starts of ray segments
+
+  float dir_x[N];        // x coordinates of ray directions
+  float dir_y[N];        // y coordinates of ray directions
+  float dir_z[N];        // z coordinates of ray directions
+  float time[N];         // ray times for motion blur
+
+  float tfar[N];         // ends of ray segments
+  unsigned int mask[N];  // ray masks
+  unsigned int id[N];    // ray IDs
+  unsigned int flags[N]; // ray flags
+};
+
+/* Helper structure for a hit packet of compile-time size N (same field order as RTCHit, one N-element array per field) */
+template<int N>
+struct RTCHitNt
+{
+  float Ng_x[N];          // x coordinates of geometry normals
+  float Ng_y[N];          // y coordinates of geometry normals
+  float Ng_z[N];          // z coordinates of geometry normals
+
+  float u[N];             // barycentric u coordinates of hits
+  float v[N];             // barycentric v coordinates of hits
+
+  unsigned int primID[N]; // primitive IDs
+  unsigned int geomID[N]; // geometry IDs
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][N]; // instance IDs, indexed [level][ray]
+};
+
+/* Helper structure for a combined ray/hit packet of compile-time size N (ray packet first, hit packet second) */
+template<int N>
+struct RTCRayHitNt
+{
+  RTCRayNt<N> ray; // the N rays
+  RTCHitNt<N> hit; // hit records stored alongside the rays
+};
+
+RTC_FORCEINLINE RTCRay rtcGetRayFromRayN(RTCRayN* rayN, unsigned int N, unsigned int i)
+{
+  RTCRay single;                              // stand-alone copy of ray i, gathered field by field from the packet
+  single.org_x = RTCRayN_org_x(rayN, N, i);   // origin and start of the ray segment
+  single.org_y = RTCRayN_org_y(rayN, N, i);
+  single.org_z = RTCRayN_org_z(rayN, N, i);
+  single.tnear = RTCRayN_tnear(rayN, N, i);
+  single.dir_x = RTCRayN_dir_x(rayN, N, i);   // direction and time
+  single.dir_y = RTCRayN_dir_y(rayN, N, i);
+  single.dir_z = RTCRayN_dir_z(rayN, N, i);
+  single.time  = RTCRayN_time(rayN, N, i);
+  single.tfar  = RTCRayN_tfar(rayN, N, i);    // end of the ray segment, mask, ID and flags
+  single.mask  = RTCRayN_mask(rayN, N, i);
+  single.id    = RTCRayN_id(rayN, N, i);
+  single.flags = RTCRayN_flags(rayN, N, i);
+  return single;
+}
+
+/* Gathers hit i of a hit packet of runtime size N into a stand-alone RTCHit, including every instance level. */
+RTC_FORCEINLINE RTCHit rtcGetHitFromHitN(RTCHitN* hitN, unsigned int N, unsigned int i)
+{
+  RTCHit single;
+  single.Ng_x   = RTCHitN_Ng_x(hitN, N, i);
+  single.Ng_y   = RTCHitN_Ng_y(hitN, N, i);
+  single.Ng_z   = RTCHitN_Ng_z(hitN, N, i);
+  single.u      = RTCHitN_u(hitN, N, i);
+  single.v      = RTCHitN_v(hitN, N, i);
+  single.primID = RTCHitN_primID(hitN, N, i);
+  single.geomID = RTCHitN_geomID(hitN, N, i);
+  for (unsigned int level = 0; level < RTC_MAX_INSTANCE_LEVEL_COUNT; level++) single.instID[level] = RTCHitN_instID(hitN, N, i, level);
+  return single;
+}
+
+/* Scatters the fields of *hit into slot i of a hit packet of runtime size N, including every instance level. */
+RTC_FORCEINLINE void rtcCopyHitToHitN(RTCHitN* hitN, const RTCHit* hit, unsigned int N, unsigned int i)
+{
+  RTCHitN_Ng_x(hitN, N, i)   = hit->Ng_x;
+  RTCHitN_Ng_y(hitN, N, i)   = hit->Ng_y;
+  RTCHitN_Ng_z(hitN, N, i)   = hit->Ng_z;
+  RTCHitN_u(hitN, N, i)      = hit->u;
+  RTCHitN_v(hitN, N, i)      = hit->v;
+  RTCHitN_primID(hitN, N, i) = hit->primID;
+  RTCHitN_geomID(hitN, N, i) = hit->geomID;
+  for (unsigned int level = 0; level < RTC_MAX_INSTANCE_LEVEL_COUNT; level++) RTCHitN_instID(hitN, N, i, level) = hit->instID[level];
+}
+
+/* Gathers ray i and its hit record from a combined ray/hit packet of runtime size N into a stand-alone RTCRayHit. */
+RTC_FORCEINLINE RTCRayHit rtcGetRayHitFromRayHitN(RTCRayHitN* rayhitN, unsigned int N, unsigned int i)
+{
+  RTCRayHit result;
+
+  RTCRayN* rays = RTCRayHitN_RayN(rayhitN, N);   // ray part of the packet
+  result.ray.org_x = RTCRayN_org_x(rays, N, i);
+  result.ray.org_y = RTCRayN_org_y(rays, N, i);
+  result.ray.org_z = RTCRayN_org_z(rays, N, i);
+  result.ray.tnear = RTCRayN_tnear(rays, N, i);
+  result.ray.dir_x = RTCRayN_dir_x(rays, N, i);
+  result.ray.dir_y = RTCRayN_dir_y(rays, N, i);
+  result.ray.dir_z = RTCRayN_dir_z(rays, N, i);
+  result.ray.time  = RTCRayN_time(rays, N, i);
+  result.ray.tfar  = RTCRayN_tfar(rays, N, i);
+  result.ray.mask  = RTCRayN_mask(rays, N, i);
+  result.ray.id    = RTCRayN_id(rays, N, i);
+  result.ray.flags = RTCRayN_flags(rays, N, i);
+
+  RTCHitN* hits = RTCRayHitN_HitN(rayhitN, N);   // hit part of the packet
+  result.hit.Ng_x   = RTCHitN_Ng_x(hits, N, i);
+  result.hit.Ng_y   = RTCHitN_Ng_y(hits, N, i);
+  result.hit.Ng_z   = RTCHitN_Ng_z(hits, N, i);
+  result.hit.u      = RTCHitN_u(hits, N, i);
+  result.hit.v      = RTCHitN_v(hits, N, i);
+  result.hit.primID = RTCHitN_primID(hits, N, i);
+  result.hit.geomID = RTCHitN_geomID(hits, N, i);
+  for (unsigned int level = 0; level < RTC_MAX_INSTANCE_LEVEL_COUNT; level++) result.hit.instID[level] = RTCHitN_instID(hits, N, i, level);
+
+  return result;
+}
+
+#endif
+
+RTC_NAMESPACE_END
+
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_scene.h b/thirdparty/embree-aarch64/include/embree3/rtcore_scene.h
new file mode 100644
index 0000000000..0cd6401593
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore_scene.h
@@ -0,0 +1,160 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_device.h"
+
+RTC_NAMESPACE_BEGIN
+  
+/* Forward declarations for ray structures */
+struct RTCRayHit;
+struct RTCRayHit4;
+struct RTCRayHit8;
+struct RTCRayHit16;
+struct RTCRayHitNp;
+
+/* Scene flags; every non-zero flag is a distinct bit, so several flags can be OR-ed together */
+enum RTCSceneFlags
+{
+  RTC_SCENE_FLAG_NONE                    = 0,        // no flag set
+  RTC_SCENE_FLAG_DYNAMIC                 = (1 << 0), // bit 0
+  RTC_SCENE_FLAG_COMPACT                 = (1 << 1), // bit 1
+  RTC_SCENE_FLAG_ROBUST                  = (1 << 2), // bit 2
+  RTC_SCENE_FLAG_CONTEXT_FILTER_FUNCTION = (1 << 3)  // bit 3
+};
+
+/* Creates a new scene. */
+RTC_API RTCScene rtcNewScene(RTCDevice device);
+
+/* Returns the device the scene got created in. The reference count of
+ * the device is incremented by this function. */
+RTC_API RTCDevice rtcGetSceneDevice(RTCScene hscene);
+   
+/* Retains the scene (increments the reference count). */
+RTC_API void rtcRetainScene(RTCScene scene);
+
+/* Releases the scene (decrements the reference count). */
+RTC_API void rtcReleaseScene(RTCScene scene);
+
+
+/* Attaches the geometry to a scene. */
+RTC_API unsigned int rtcAttachGeometry(RTCScene scene, RTCGeometry geometry);
+
+/* Attaches the geometry to a scene using the specified geometry ID. */
+RTC_API void rtcAttachGeometryByID(RTCScene scene, RTCGeometry geometry, unsigned int geomID);
+
+/* Detaches the geometry from the scene. */
+RTC_API void rtcDetachGeometry(RTCScene scene, unsigned int geomID);
+
+/* Gets a geometry handle from the scene. */
+RTC_API RTCGeometry rtcGetGeometry(RTCScene scene, unsigned int geomID);
+
+
+/* Commits the scene. */
+RTC_API void rtcCommitScene(RTCScene scene);
+
+/* Commits the scene from multiple threads. */
+RTC_API void rtcJoinCommitScene(RTCScene scene);
+
+
+/* Progress monitor callback function */
+typedef bool (*RTCProgressMonitorFunction)(void* ptr, double n);
+
+/* Sets the progress monitor callback function of the scene. */
+RTC_API void rtcSetSceneProgressMonitorFunction(RTCScene scene, RTCProgressMonitorFunction progress, void* ptr);
+
+/* Sets the build quality of the scene. */
+RTC_API void rtcSetSceneBuildQuality(RTCScene scene, enum RTCBuildQuality quality);
+
+/* Sets the scene flags. */
+RTC_API void rtcSetSceneFlags(RTCScene scene, enum RTCSceneFlags flags);
+
+/* Returns the scene flags. */
+RTC_API enum RTCSceneFlags rtcGetSceneFlags(RTCScene scene);
+
+/* Returns the axis-aligned bounds of the scene. */
+RTC_API void rtcGetSceneBounds(RTCScene scene, struct RTCBounds* bounds_o);
+
+/* Returns the linear axis-aligned bounds of the scene. */
+RTC_API void rtcGetSceneLinearBounds(RTCScene scene, struct RTCLinearBounds* bounds_o);
+
+
+/* Perform a closest point query of the scene. */
+RTC_API bool rtcPointQuery(RTCScene scene, struct RTCPointQuery* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void* userPtr);
+
+/* Perform a closest point query with a packet of 4 points with the scene. */
+RTC_API bool rtcPointQuery4(const int* valid, RTCScene scene, struct RTCPointQuery4* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr);
+
+/* Perform a closest point query with a packet of 8 points with the scene. */
+RTC_API bool rtcPointQuery8(const int* valid, RTCScene scene, struct RTCPointQuery8* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr);
+
+/* Perform a closest point query with a packet of 16 points with the scene. */
+RTC_API bool rtcPointQuery16(const int* valid, RTCScene scene, struct RTCPointQuery16* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr);
+
+/* Intersects a single ray with the scene. */
+RTC_API void rtcIntersect1(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit* rayhit);
+
+/* Intersects a packet of 4 rays with the scene. */
+RTC_API void rtcIntersect4(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit4* rayhit);
+
+/* Intersects a packet of 8 rays with the scene. */
+RTC_API void rtcIntersect8(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit8* rayhit);
+
+/* Intersects a packet of 16 rays with the scene. */
+RTC_API void rtcIntersect16(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit16* rayhit);
+
+/* Intersects a stream of M rays with the scene. */
+RTC_API void rtcIntersect1M(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit* rayhit, unsigned int M, size_t byteStride);
+
+/* Intersects a stream of pointers to M rays with the scene. */
+RTC_API void rtcIntersect1Mp(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit** rayhit, unsigned int M);
+
+/* Intersects a stream of M ray packets of size N in SOA format with the scene. */
+RTC_API void rtcIntersectNM(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHitN* rayhit, unsigned int N, unsigned int M, size_t byteStride);
+
+/* Intersects a stream of N rays in pointer SOA layout with the scene. */
+RTC_API void rtcIntersectNp(RTCScene scene, struct RTCIntersectContext* context, const struct RTCRayHitNp* rayhit, unsigned int N);
+
+/* Tests a single ray for occlusion with the scene. */
+RTC_API void rtcOccluded1(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay* ray);
+
+/* Tests a packet of 4 rays for occlusion with the scene. */
+RTC_API void rtcOccluded4(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay4* ray);
+
+/* Tests a packet of 8 rays for occlusion with the scene. */
+RTC_API void rtcOccluded8(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay8* ray);
+
+/* Tests a packet of 16 rays for occlusion with the scene. */
+RTC_API void rtcOccluded16(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay16* ray);
+
+/* Tests a stream of M rays for occlusion with the scene. */
+RTC_API void rtcOccluded1M(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay* ray, unsigned int M, size_t byteStride);
+
+/* Tests a stream of pointers to M rays for occlusion with the scene. */
+RTC_API void rtcOccluded1Mp(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay** ray, unsigned int M);
+
+/* Tests a stream of M ray packets of size N in SOA format for occlusion with the scene. */
+RTC_API void rtcOccludedNM(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayN* ray, unsigned int N, unsigned int M, size_t byteStride);
+
+/* Tests a stream of N rays in pointer SOA layout for occlusion with the scene. */
+RTC_API void rtcOccludedNp(RTCScene scene, struct RTCIntersectContext* context, const struct RTCRayNp* ray, unsigned int N);
+
+/*! collision callback: called with a userPtr argument and an array of num_collisions RTCCollision records, each holding two geomID/primID pairs */
+struct RTCCollision { unsigned int geomID0; unsigned int primID0; unsigned int geomID1; unsigned int primID1; };
+typedef void (*RTCCollideFunc) (void* userPtr, struct RTCCollision* collisions, unsigned int num_collisions);
+
+/*! Performs collision detection of two scenes */
+RTC_API void rtcCollide (RTCScene scene0, RTCScene scene1, RTCCollideFunc callback, void* userPtr);
+ 
+#if defined(__cplusplus)
+
+/* Bitwise OR of two scene flag sets; the result is cast back so it keeps the RTCSceneFlags type */
+inline RTCSceneFlags operator|(RTCSceneFlags a, RTCSceneFlags b) {
+  return static_cast<RTCSceneFlags>(static_cast<size_t>(a) | static_cast<size_t>(b));
+}
+
+#endif
+
+RTC_NAMESPACE_END
+
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h
new file mode 100644
index 0000000000..755ce255fb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h
@@ -0,0 +1,411 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../bvh/bvh.h"
+#include "../geometry/primitive.h"
+#include "../builders/bvh_builder_sah.h"
+#include "../builders/heuristic_binning_array_aligned.h"
+#include "../builders/heuristic_binning_array_unaligned.h"
+#include "../builders/heuristic_strand_array.h"
+
+#define NUM_HAIR_OBJECT_BINS 32
+
+namespace embree
+{
+  namespace isa
+  {
+    struct BVHBuilderHair
+    {
+      /*! settings for builder */
+      struct Settings
+      {
+        /*! initializes every option to its default value */
+        Settings ()
+          : branchingFactor(2), maxDepth(32), logBlockSize(0),
+            minLeafSize(1), maxLeafSize(7), finished_range_threshold(inf) {}
+
+        size_t branchingFactor;           //!< number of children per BVH node (branching factor of the BVH to build)
+        size_t maxDepth;                  //!< deepest level the BVH may reach
+        size_t logBlockSize;              //!< log2 of the block size used by the SAH heuristic
+        size_t minLeafSize;               //!< smallest number of primitives in a leaf
+        size_t maxLeafSize;               //!< largest number of primitives in a leaf
+        size_t finished_range_threshold;  //!< size threshold below which a range is reported as finished
+      };
+
+      template<typename NodeRef,
+        typename CreateAllocFunc,
+        typename CreateAABBNodeFunc,
+        typename SetAABBNodeFunc,
+        typename CreateOBBNodeFunc,
+        typename SetOBBNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor,
+        typename ReportFinishedRangeFunc>
+
+        class BuilderT
+        {
+          ALIGNED_CLASS_(16);
+          friend struct BVHBuilderHair;
+
+          typedef FastAllocator::CachedAllocator Allocator;
+          typedef HeuristicArrayBinningSAH<PrimRef,NUM_HAIR_OBJECT_BINS> HeuristicBinningSAH;
+          typedef UnalignedHeuristicArrayBinningSAH<PrimRef,NUM_HAIR_OBJECT_BINS> UnalignedHeuristicBinningSAH;
+          typedef HeuristicStrandSplit HeuristicStrandSplitSAH;
+
+          static const size_t MAX_BRANCHING_FACTOR =  8;         //!< maximum supported BVH branching factor
+          static const size_t MIN_LARGE_LEAF_LEVELS = 8;         //!< create balanced tree if we are that many levels before the maximum tree depth
+          static const size_t SINGLE_THREADED_THRESHOLD = 4096;  //!< threshold to switch to single threaded build
+
+          static const size_t travCostAligned = 1;
+          static const size_t travCostUnaligned = 5;
+          static const size_t intCost = 6;
+
+          BuilderT (Scene* scene,
+                    PrimRef* prims,
+                    const CreateAllocFunc& createAlloc,
+                    const CreateAABBNodeFunc& createAABBNode,
+                    const SetAABBNodeFunc& setAABBNode,
+                    const CreateOBBNodeFunc& createOBBNode,
+                    const SetOBBNodeFunc& setOBBNode,
+                    const CreateLeafFunc& createLeaf,
+                    const ProgressMonitor& progressMonitor,
+                    const ReportFinishedRangeFunc& reportFinishedRange,
+                    const Settings settings)
+            /* copies the settings; the callbacks are bound to reference members, so the caller's functors must outlive the builder */
+            : cfg(settings),
+            prims(prims),
+            createAlloc(createAlloc),
+            createAABBNode(createAABBNode),
+            setAABBNode(setAABBNode),
+            createOBBNode(createOBBNode),
+            setOBBNode(setOBBNode),
+            createLeaf(createLeaf),
+            progressMonitor(progressMonitor),
+            reportFinishedRange(reportFinishedRange),
+            alignedHeuristic(prims), unalignedHeuristic(scene,prims), strandHeuristic(scene,prims) {} // all three heuristics operate on the same primitive array
+
+          /*! returns true when all primitives of the range share one geometry ID; an empty range counts as uniform */
+          __forceinline bool sameGeometry(const PrimInfoRange& range)
+          {
+            if (range.size() == 0) return true;
+
+            /* scan forward until a primitive deviates from the geometry ID of the first one */
+            size_t idx = range.begin();
+            const unsigned int refGeomID = prims[idx++].geomID();
+            while (idx < range.end() && prims[idx].geomID() == refGeomID)
+              idx++;
+            return idx >= range.end();
+          }
+
+          /*! creates a subtree for a range that may hold more primitives (or more geometries) than a single BVH leaf supports */
+          NodeRef createLargeLeaf(size_t depth, const PrimInfoRange& pinfo, Allocator alloc)
+          {
+            /* this should never occur but is a fatal error */
+            if (depth > cfg.maxDepth)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+            /* create a regular leaf if the range is small enough and refers to a single geometry */
+            if (pinfo.size() <= cfg.maxLeafSize && sameGeometry(pinfo))
+              return createLeaf(prims,pinfo,alloc);
+
+            /* fill all children by always splitting the largest one */
+            PrimInfoRange children[MAX_BRANCHING_FACTOR];
+            unsigned numChildren = 1;
+            children[0] = pinfo;
+
+            do {
+
+              /* find the splittable child with the largest number of primitives */
+              int bestChild = -1; // -1 means that no child can be split any further
+              size_t bestSize = 0;
+              for (unsigned i=0; i<numChildren; i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.maxLeafSize && sameGeometry(children[i]))
+                  continue;
+
+                /* remember child with largest size */
+                if (children[i].size() > bestSize) {
+                  bestSize = children[i].size();
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break;
+
+              /*! split best child into left and right child: by geometry if it mixes geometries, otherwise with the fallback split */
+              __aligned(64) PrimInfoRange left, right;
+              if (!sameGeometry(children[bestChild])) {
+                alignedHeuristic.splitByGeometry(children[bestChild],left,right);
+              } else {
+                alignedHeuristic.splitFallback(children[bestChild],left,right);
+              }
+
+              /* add new children left and right: the last child moves into the freed slot, left and right go to the end */
+              children[bestChild] = children[numChildren-1];
+              children[numChildren-1] = left;
+              children[numChildren+0] = right;
+              numChildren++;
+
+            } while (numChildren < cfg.branchingFactor);
+
+            /* create node */
+            auto node = createAABBNode(alloc);
+
+            /* build each child recursively and attach it to the node together with its geometry bounds */
+            for (size_t i=0; i<numChildren; i++) {
+              const NodeRef child = createLargeLeaf(depth+1,children[i],alloc);
+              setAABBNode(node,i,child,children[i].geomBounds);
+            }
+
+            return node;
+          }
+
+          /*! splits pinfo into linfo and rinfo using the cheapest of aligned binning, unaligned binning and strand split; sets 'aligned' to false when an unaligned or strand split is performed */
+          __noinline void split(const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo, bool& aligned) // FIXME: not inlined as ICC otherwise uses much stack
+          {
+            /* variable to track the SAH of the best splitting approach */
+            float bestSAH = inf;
+            const size_t blocks = (pinfo.size()+(1ull<<cfg.logBlockSize)-1ull) >> cfg.logBlockSize; // number of blocks, rounded up
+            const float leafSAH = intCost*float(blocks)*halfArea(pinfo.geomBounds);
+
+            /* try standard binning in aligned space (only while the caller still builds an aligned node) */
+            float alignedObjectSAH = inf;
+            HeuristicBinningSAH::Split alignedObjectSplit;
+            if (aligned) {
+              alignedObjectSplit = alignedHeuristic.find(pinfo,cfg.logBlockSize);
+              alignedObjectSAH = travCostAligned*halfArea(pinfo.geomBounds) + intCost*alignedObjectSplit.splitSAH();
+              bestSAH = min(alignedObjectSAH,bestSAH);
+            }
+
+            /* try standard binning in unaligned space */
+            UnalignedHeuristicBinningSAH::Split unalignedObjectSplit;
+            LinearSpace3fa uspace;
+            float unalignedObjectSAH = inf;
+            if (bestSAH > 0.7f*leafSAH) { // only tried while the best split so far costs more than 70% of the leaf cost
+              uspace = unalignedHeuristic.computeAlignedSpace(pinfo);
+              const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(pinfo,uspace);
+              unalignedObjectSplit = unalignedHeuristic.find(sinfo,cfg.logBlockSize,uspace);
+              unalignedObjectSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*unalignedObjectSplit.splitSAH();
+              bestSAH = min(unalignedObjectSAH,bestSAH);
+            }
+
+            /* try splitting into two strands (same cost gate as above, and only for ranges of at most 256 primitives) */
+            HeuristicStrandSplitSAH::Split strandSplit;
+            float strandSAH = inf;
+            if (bestSAH > 0.7f*leafSAH && pinfo.size() <= 256) {
+              strandSplit = strandHeuristic.find(pinfo,cfg.logBlockSize);
+              strandSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*strandSplit.splitSAH();
+              bestSAH = min(strandSAH,bestSAH);
+            }
+
+            /* fallback if SAH heuristics failed (no finite cost was found) */
+            if (unlikely(!std::isfinite(bestSAH)))
+            {
+              alignedHeuristic.deterministic_order(pinfo);
+              alignedHeuristic.splitFallback(pinfo,linfo,rinfo);
+            }
+
+            /* perform aligned split if this is best */
+            else if (bestSAH == alignedObjectSAH) {
+              alignedHeuristic.split(alignedObjectSplit,pinfo,linfo,rinfo);
+            }
+
+            /* perform unaligned split if this is best */
+            else if (bestSAH == unalignedObjectSAH) {
+              unalignedHeuristic.split(unalignedObjectSplit,uspace,pinfo,linfo,rinfo);
+              aligned = false;
+            }
+
+            /* perform strand split if this is best */
+            else if (bestSAH == strandSAH) {
+              strandHeuristic.split(strandSplit,pinfo,linfo,rinfo);
+              aligned = false;
+            }
+
+            /* can never happen: a finite bestSAH was copied from one of the three candidates above */
+            else
+              assert(false);
+          }
+
+          /*! recursively builds the subtree for pinfo; 'toplevel' enables progress reporting for this call, 'alloc_barrier' requests a reportFinishedRange call once the node is built */
+          NodeRef recurse(size_t depth, const PrimInfoRange& pinfo, Allocator alloc, bool toplevel, bool alloc_barrier)
+          {
+            /* get thread local allocator */
+            if (!alloc)
+              alloc = createAlloc();
+
+            /* call progress monitor function to signal progress */
+            if (toplevel && pinfo.size() <= SINGLE_THREADED_THRESHOLD)
+              progressMonitor(pinfo.size());
+
+            PrimInfoRange children[MAX_BRANCHING_FACTOR];
+
+            /* create leaf node when close to the depth limit or when the range is small enough */
+            if (depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || pinfo.size() <= cfg.minLeafSize) {
+              alignedHeuristic.deterministic_order(pinfo);
+              return createLargeLeaf(depth,pinfo,alloc);
+            }
+
+            /* fill all children by always splitting the one with the largest surface area */
+            size_t numChildren = 1;
+            children[0] = pinfo;
+            bool aligned = true; // cleared by split() as soon as one unaligned or strand split is performed
+
+            do {
+
+              /* find best child with largest bounding box area */
+              ssize_t bestChild = -1; // -1 means that no child can be split any further
+              float bestArea = neg_inf;
+              for (size_t i=0; i<numChildren; i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.minLeafSize)
+                  continue;
+
+                /* remember child with largest area */
+                if (area(children[i].geomBounds) > bestArea) {
+                  bestArea = area(children[i].geomBounds);
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break;
+
+              /*! split best child into left and right child */
+              PrimInfoRange left, right;
+              split(children[bestChild],left,right,aligned);
+
+              /* add new children left and right: the last child moves into the freed slot, left and right go to the end */
+              children[bestChild] = children[numChildren-1];
+              children[numChildren-1] = left;
+              children[numChildren+0] = right;
+              numChildren++;
+
+            } while (numChildren < cfg.branchingFactor);
+
+            NodeRef node;
+
+            /* create aligned node */
+            if (aligned)
+            {
+              node = createAABBNode(alloc);
+
+              /* spawn tasks or ... */
+              if (pinfo.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; // child is the first range at or below the threshold
+                      setAABBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),children[i].geomBounds);
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+              }
+              /* ... continue sequentially */
+              else {
+                for (size_t i=0; i<numChildren; i++) {
+                  const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold;
+                  setAABBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),children[i].geomBounds);
+                }
+              }
+            }
+
+            /* create unaligned node */
+            else
+            {
+              node = createOBBNode(alloc);
+
+              /* spawn tasks or ... */
+              if (pinfo.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpace(children[i]);
+                      const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(children[i],space);
+                      const OBBox3fa obounds(space,sinfo.geomBounds); // bounds of the child expressed in its own space
+                      const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold;
+                      setOBBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),obounds);
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+              }
+              /* ... continue sequentially */
+              else
+              {
+                for (size_t i=0; i<numChildren; i++) {
+                  const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpace(children[i]);
+                  const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(children[i],space);
+                  const OBBox3fa obounds(space,sinfo.geomBounds);
+                  const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold;
+                  setOBBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),obounds);
+                }
+              }
+            }
+
+            /* reports a finished range of primrefs */
+            if (unlikely(alloc_barrier))
+              reportFinishedRange(pinfo);
+
+            return node;
+          }
+
+        private:
+          Settings cfg;
+          PrimRef* prims;
+          const CreateAllocFunc& createAlloc;
+          const CreateAABBNodeFunc& createAABBNode;
+          const SetAABBNodeFunc& setAABBNode;
+          const CreateOBBNodeFunc& createOBBNode;
+          const SetOBBNodeFunc& setOBBNode;
+          const CreateLeafFunc& createLeaf;
+          const ProgressMonitor& progressMonitor;
+          const ReportFinishedRangeFunc& reportFinishedRange;
+
+        private:
+          HeuristicBinningSAH alignedHeuristic;
+          UnalignedHeuristicBinningSAH unalignedHeuristic;
+          HeuristicStrandSplitSAH strandHeuristic;
+        };
+
+      /*! entry point of the hair builder: sets up a BuilderT from the callbacks and settings and builds the tree over prims as described by pinfo */
+      template<typename NodeRef,
+        typename CreateAllocFunc,
+        typename CreateAABBNodeFunc,
+        typename SetAABBNodeFunc,
+        typename CreateOBBNodeFunc,
+        typename SetOBBNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor,
+        typename ReportFinishedRangeFunc>
+        static NodeRef build (const CreateAllocFunc& createAlloc,
+                              const CreateAABBNodeFunc& createAABBNode,
+                              const SetAABBNodeFunc& setAABBNode,
+                              const CreateOBBNodeFunc& createOBBNode,
+                              const SetOBBNodeFunc& setOBBNode,
+                              const CreateLeafFunc& createLeaf,
+                              const ProgressMonitor& progressMonitor,
+                              const ReportFinishedRangeFunc& reportFinishedRange,
+                              Scene* scene,
+                              PrimRef* prims,
+                              const PrimInfo& pinfo,
+                              const Settings settings)
+        {
+          using HairBuilder = BuilderT<NodeRef,
+            CreateAllocFunc,
+            CreateAABBNodeFunc,SetAABBNodeFunc,
+            CreateOBBNodeFunc,SetOBBNodeFunc,
+            CreateLeafFunc,ProgressMonitor,
+            ReportFinishedRangeFunc>;
+
+          /* the recursion starts at depth 1 without an allocator, as a toplevel call and without alloc barrier */
+          HairBuilder hairBuilder(scene,prims,createAlloc,
+                                  createAABBNode,setAABBNode,createOBBNode,setOBBNode,
+                                  createLeaf,progressMonitor,reportFinishedRange,settings);
+          NodeRef result = hairBuilder.recurse(1,pinfo,nullptr,true,false);
+
+          _mm_mfence(); // to allow non-temporal stores during build
+          return result;
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h
new file mode 100644
index 0000000000..92be2f7e65
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h
@@ -0,0 +1,501 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/builder.h"
+#include "../../common/algorithms/parallel_reduce.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct BVHBuilderMorton
+    {
+      static const size_t MAX_BRANCHING_FACTOR = 8;          //!< maximum supported BVH branching factor
+      static const size_t MIN_LARGE_LEAF_LEVELS = 8;         //!< create balanced tree if we are that many levels before the maximum tree depth
+
+      /*! settings for morton builder */
+      struct Settings
+      {
+        /*! default settings */
+        Settings ()
+        : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {}
+
+        /*! initialize settings from API settings: starts from the defaults and overrides the fields present in the build arguments */
+        Settings (const RTCBuildArguments& settings)
+        : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024)
+        {
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth          )) maxDepth        = settings.maxDepth;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize       )) minLeafSize     = settings.minLeafSize;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize       )) maxLeafSize     = settings.maxLeafSize;
+
+          minLeafSize = min(minLeafSize,maxLeafSize); // a leaf can never be required to hold more than maxLeafSize primitives
+        }
+        /*! initialize settings from explicit values; minLeafSize is clamped to maxLeafSize like in the constructor above */
+        Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold)
+        : branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold)
+        {
+          this->minLeafSize = min(minLeafSize,maxLeafSize); // the parameters shadow the members here: write through 'this' so that the member gets clamped
+        }
+
+      public:
+        size_t branchingFactor;  //!< branching factor of BVH to build
+        size_t maxDepth;         //!< maximum depth of BVH to build
+        size_t minLeafSize;      //!< minimum size of a leaf (never larger than maxLeafSize)
+        size_t maxLeafSize;      //!< maximum size of a leaf
+        size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
+      };
+
+      /*! Build primitive consisting of morton code and primitive ID; compared and sorted by its morton code only. */
+      struct __aligned(8) BuildPrim
+      {
+        union {
+          struct {
+            unsigned int code;     //!< morton code
+            unsigned int index;    //!< index of the primitive
+          };
+          uint64_t t;              //!< code and index viewed as one 64-bit value
+        };
+
+        /*! interface for radix sort: the sort key is the morton code */
+        __forceinline operator unsigned() const { return code; }
+
+        /*! interface for standard sort: orders by morton code, the index is ignored */
+        __forceinline bool operator<(const BuildPrim &m) const { return code < m.code; }
+      };
+
+      /*! translates bounding boxes into morton codes on a regular lattice laid over the given bounds */
+      struct MortonCodeMapping
+      {
+        static const size_t LATTICE_BITS_PER_DIM = 10;
+        static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM;
+        vfloat4 base;
+        vfloat4 scale;
+
+        /*! anchors the lattice at bounds.lower; components whose extent is not above 1E-19f get a zero scale */
+        __forceinline MortonCodeMapping(const BBox3fa& bounds)
+        {
+          const vfloat4 extent = (vfloat4)bounds.upper - (vfloat4)bounds.lower;
+          const vfloat4 cellsPerUnit = rcp(extent) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f);
+          base  = (vfloat4)bounds.lower;
+          scale = select(extent > vfloat4(1E-19f), cellsPerUnit, vfloat4(0.0f));
+        }
+
+        /*! lattice coordinates of lower+upper of the box (the sum is not halved), taken relative to base */
+        __forceinline const vint4 bin (const BBox3fa& box) const
+        {
+          const vfloat4 doubledCenter = (vfloat4)box.lower + (vfloat4)box.upper;
+          return vint4((doubledCenter-base)*scale);
+        }
+
+        /*! morton code obtained by bit-interleaving the x, y and z lattice coordinates of the box */
+        __forceinline unsigned int code (const BBox3fa& box) const
+        {
+          const vint4 cell = bin(box);
+          const unsigned int cx = extract<0>(cell);
+          const unsigned int cy = extract<1>(cell);
+          const unsigned int cz = extract<2>(cell);
+          return bitInterleave(cx,cy,cz);
+        }
+      };
+
+#if defined (__AVX2__)
+
+      /*! for AVX2 there is a fast scalar bitInterleave, so each primitive is encoded and written out immediately */
+      struct MortonCodeGenerator
+      {
+        __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
+          : mapping(mapping), dest(dest), currentID(0) {} // currentID is unused in this variant but must not stay indeterminate
+        /*! writes index and morton code of box b to *dest and advances dest by one entry */
+        __forceinline void operator() (const BBox3fa& b, const unsigned index)
+        {
+          dest->index = index;
+          dest->code = mapping.code(b);
+          dest++;
+        }
+
+      public:
+        const MortonCodeMapping mapping;  //!< mapping from bounding boxes to morton codes
+        BuildPrim* dest;                  //!< next output entry to write
+        size_t currentID;                 //!< not used by this variant; kept at 0
+      };
+
+#else
+
+      /*! before AVX2 it is better to use the SSE version of bitInterleave: primitives are collected in batches of 4 and encoded together */
+      struct MortonCodeGenerator
+      {
+        __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
+          : mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {}
+        /*! flushes a partially filled batch (fewer than 4 buffered primitives) element by element */
+        __forceinline ~MortonCodeGenerator()
+        {
+          if (slots != 0)
+          {
+            const vint4 code = bitInterleave(ax,ay,az);
+            for (size_t i=0; i<slots; i++) {
+              dest[currentID-slots+i].index = ai[i];
+              dest[currentID-slots+i].code = code[i];
+            }
+          }
+        }
+        /*! buffers the lattice coordinates and index of one primitive; once 4 are buffered they are interleaved and stored as 4 BuildPrims */
+        __forceinline void operator() (const BBox3fa& b, const unsigned index)
+        {
+          const vint4 binID = mapping.bin(b);
+          ax[slots] = extract<0>(binID);
+          ay[slots] = extract<1>(binID);
+          az[slots] = extract<2>(binID);
+          ai[slots] = index;
+          slots++;
+          currentID++;
+
+          if (slots == 4)
+          {
+            const vint4 code = bitInterleave(ax,ay,az);
+            vint4::storeu(&dest[currentID-4],unpacklo(code,ai)); // first two (code,index) pairs of the batch
+            vint4::storeu(&dest[currentID-2],unpackhi(code,ai)); // last two (code,index) pairs of the batch
+            slots = 0;
+          }
+        }
+
+      public:
+        const MortonCodeMapping mapping;  //!< mapping from bounding boxes to lattice coordinates
+        BuildPrim* dest;                  //!< start of the output array
+        size_t currentID;                 //!< number of primitives consumed so far
+        size_t slots;                     //!< number of primitives currently buffered (0..3 between calls)
+        vint4 ax, ay, az, ai;             //!< buffered x, y, z lattice coordinates and primitive indices
+      };
+
+#endif
+
+      template<
+        typename ReductionTy,
+        typename Allocator,
+        typename CreateAllocator,
+        typename CreateNodeFunc,
+        typename SetNodeBoundsFunc,
+        typename CreateLeafFunc,
+        typename CalculateBounds,
+        typename ProgressMonitor>
+
+        class BuilderT : private Settings
+      {
+        ALIGNED_CLASS_(16);
+
+      public:
+
+        BuilderT (CreateAllocator& createAllocator,
+                  CreateNodeFunc& createNode,
+                  SetNodeBoundsFunc& setBounds,
+                  CreateLeafFunc& createLeaf,
+                  CalculateBounds& calculateBounds,
+                  ProgressMonitor& progressMonitor,
+                  const Settings& settings)
+          /* copies the settings into the Settings base class, stores the callbacks and starts with morton == nullptr */
+          : Settings(settings),
+          createAllocator(createAllocator),
+          createNode(createNode),
+          setBounds(setBounds),
+          createLeaf(createLeaf),
+          calculateBounds(calculateBounds),
+          progressMonitor(progressMonitor),
+          morton(nullptr) {}
+
+        ReductionTy createLargeLeaf(size_t depth, const range<unsigned>& current, Allocator alloc)
+        {
+          /* builds a subtree for a range that may exceed maxLeafSize; exceeding maxDepth should never occur but is a fatal error */
+          if (depth > maxDepth)
+            throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+          /* create leaf for few primitives */
+          if (current.size() <= maxLeafSize)
+            return createLeaf(current,alloc);
+
+          /* fill all children by always splitting the largest one */
+          range<unsigned> children[MAX_BRANCHING_FACTOR];
+          size_t numChildren = 1;
+          children[0] = current;
+
+          do {
+
+            /* find best child with largest number of primitives */
+            size_t bestChild = -1; // size_t(-1) means that no child can be split any further
+            size_t bestSize = 0;
+            for (size_t i=0; i<numChildren; i++)
+            {
+              /* ignore leaves as they cannot get split */
+              if (children[i].size() <= maxLeafSize)
+                continue;
+
+              /* remember child with largest size */
+              if (children[i].size() > bestSize) {
+                bestSize = children[i].size();
+                bestChild = i;
+              }
+            }
+            if (bestChild == size_t(-1)) break;
+
+            /*! split best child into left and right child */
+            auto split = children[bestChild].split(); // split.first and split.second are the two sub-ranges
+
+            /* add new children left and right: the last child moves into the freed slot, the halves go to the end */
+            children[bestChild] = children[numChildren-1];
+            children[numChildren-1] = split.first;
+            children[numChildren+0] = split.second;
+            numChildren++;
+
+          } while (numChildren < branchingFactor);
+
+          /* create node */
+          auto node = createNode(alloc,numChildren);
+
+          /* recurse into each child */
+          ReductionTy bounds[MAX_BRANCHING_FACTOR];
+          for (size_t i=0; i<numChildren; i++)
+            bounds[i] = createLargeLeaf(depth+1,children[i],alloc);
+
+          return setBounds(node,bounds,numChildren); // the result of setBounds is what callers receive for this subtree
+        }
+
+        /*! recreates morton codes when reaching a region where all codes are identical: the codes of the range are recomputed relative to the range's own centroid bounds and the range is sorted again */
+        __noinline void recreateMortonCodes(const range<unsigned>& current) const
+        {
+          /* fast path for small ranges (fewer than 1024 entries): everything runs sequentially */
+          if (likely(current.size() < 1024))
+          {
+            /*! recalculate centroid bounds */
+            BBox3fa centBounds(empty);
+            for (size_t i=current.begin(); i<current.end(); i++)
+              centBounds.extend(center2(calculateBounds(morton[i])));
+
+            /* recalculate morton codes */
+            MortonCodeMapping mapping(centBounds);
+            for (size_t i=current.begin(); i<current.end(); i++)
+              morton[i].code = mapping.code(calculateBounds(morton[i]));
+
+            /* sort morton codes */
+            std::sort(morton+current.begin(),morton+current.end());
+          }
+          else
+          {
+            /*! recalculate centroid bounds (reduced in parallel over chunks of the range) */
+            auto calculateCentBounds = [&] ( const range<unsigned>& r ) {
+              BBox3fa centBounds = empty;
+              for (size_t i=r.begin(); i<r.end(); i++)
+                centBounds.extend(center2(calculateBounds(morton[i])));
+              return centBounds;
+            };
+            const BBox3fa centBounds = parallel_reduce(current.begin(), current.end(), unsigned(1024),
+                                                       BBox3fa(empty), calculateCentBounds, BBox3fa::merge);
+
+            /* recalculate morton codes */
+            MortonCodeMapping mapping(centBounds);
+            parallel_for(current.begin(), current.end(), unsigned(1024), [&] ( const range<unsigned>& r ) {
+                for (size_t i=r.begin(); i<r.end(); i++) {
+                  morton[i].code = mapping.code(calculateBounds(morton[i]));
+                }
+              });
+
+            /*! sort morton codes (TBB parallel sort when built with TASKING_TBB, radix sort otherwise) */
+#if defined(TASKING_TBB)
+            tbb::parallel_sort(morton+current.begin(),morton+current.end());
+#else
+            radixsort32(morton+current.begin(),current.size());
+#endif
+          }
+        }
+
+        /*! splits the sorted range at the topmost bit in which its first and last morton code differ; recreates the codes if they are all equal and falls back to current.split() if they stay equal */
+        __forceinline void split(const range<unsigned>& current, range<unsigned>& left, range<unsigned>& right) const
+        {
+          const unsigned int code_start = morton[current.begin()].code;
+          const unsigned int code_end   = morton[current.end()-1].code;
+          unsigned int bitpos = lzcnt(code_start^code_end);
+
+          /* if all items mapped to same morton code, then re-create new morton codes for the items */
+          if (unlikely(bitpos == 32))
+          {
+            recreateMortonCodes(current);
+            bitpos = lzcnt(morton[current.begin()].code^morton[current.end()-1].code);
+
+            /* if the morton code is still the same, goto fall back split */
+            if (unlikely(bitpos == 32)) {
+              current.split(left,right);
+              return;
+            }
+          }
+
+          /* split the items at the topmost different morton code bit */
+          const unsigned int bitpos_diff = 31-bitpos;
+          const unsigned int bitmask = 1u << bitpos_diff; // unsigned literal: shifting a signed 1 into bit 31 would overflow
+
+          /* find location where bit differs using binary search: morton[begin] always has the bit cleared, everything from end on has it set */
+          unsigned begin = current.begin();
+          unsigned end   = current.end();
+          while (begin + 1 != end) {
+            const unsigned mid = (begin+end)/2;
+            const unsigned bit = morton[mid].code & bitmask;
+            if (bit == 0) begin = mid; else end = mid;
+          }
+          unsigned center = end;
+#if defined(DEBUG)
+          /* validate the whole range: begin and end were narrowed to the split position by the search above */
+          for (unsigned int i=current.begin(); i<center;        i++) assert((morton[i].code & bitmask) == 0);
+          for (unsigned int i=center;          i<current.end(); i++) assert((morton[i].code & bitmask) == bitmask);
+#endif
+
+          left = make_range(current.begin(),center);
+          right = make_range(center,current.end());
+        }
+
+        ReductionTy recurse(size_t depth, const range<unsigned>& current, Allocator alloc, bool toplevel)
+        {
+          /* get thread local allocator */
+          if (!alloc)
+            alloc = createAllocator();
+
+          /* call memory monitor function to signal progress */
+          if (toplevel && current.size() <= singleThreadThreshold)
+            progressMonitor(current.size());
+
+          /* create leaf node */
+          if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize))
+            return createLargeLeaf(depth,current,alloc);
+
+          /* fill all children by always splitting the one with the largest number of primitives */
+          range<unsigned> children[MAX_BRANCHING_FACTOR];
+          split(current,children[0],children[1]);
+          size_t numChildren = 2;
+
+          while (numChildren < branchingFactor)
+          {
+            /* find best child with largest number of primitives */
+            int bestChild = -1;
+            unsigned bestItems = 0;
+            for (unsigned int i=0; i<numChildren; i++)
+            {
+              /* ignore leaves as they cannot get split */
+              if (children[i].size() <= minLeafSize)
+                continue;
+
+              /* remember child with largest area */
+              if (children[i].size() > bestItems) {
+                bestItems = children[i].size();
+                bestChild = i;
+              }
+            }
+            if (bestChild == -1) break;
+
+            /*! split best child into left and right child */
+            range<unsigned> left, right;
+            split(children[bestChild],left,right);
+
+            /* add new children left and right */
+            children[bestChild] = children[numChildren-1];
+            children[numChildren-1] = left;
+            children[numChildren+0] = right;
+            numChildren++;
+          }
+
+          /* create leaf node if no split is possible */
+          if (unlikely(numChildren == 1))
+            return createLeaf(current,alloc);
+
+          /* allocate node */
+          auto node = createNode(alloc,numChildren);
+
+          /* process top parts of tree parallel */
+          ReductionTy bounds[MAX_BRANCHING_FACTOR];
+          if (current.size() > singleThreadThreshold)
+          {
+            /*! parallel_for is faster than spawing sub-tasks */
+            parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
+                for (size_t i=r.begin(); i<r.end(); i++) {
+                  bounds[i] = recurse(depth+1,children[i],nullptr,true);
+                  _mm_mfence(); // to allow non-temporal stores during build
+                }
+              });
+          }
+
+          /* finish tree sequentially */
+          else
+          {
+            for (size_t i=0; i<numChildren; i++)
+              bounds[i] = recurse(depth+1,children[i],alloc,false);
+          }
+
+          return setBounds(node,bounds,numChildren);
+        }
+
+        /* build function: hands the primitives to the radix sort, then builds the tree over all of them */
+        ReductionTy build(BuildPrim* src, BuildPrim* tmp, size_t numPrimitives)
+        {
+          /* 'morton' is the array that split() indexes; src is then handed to radix_sort_u32 */
+          morton = src;
+          radix_sort_u32(src,tmp,numPrimitives,singleThreadThreshold);
+          const range<unsigned> everything(0,(unsigned)numPrimitives);
+          /* the root sits at depth 1 and obtains its allocator inside recurse */
+          const ReductionTy result = recurse(1, everything, nullptr, true);
+          _mm_mfence(); // to allow non-temporal stores during build
+          return result;
+        }
+
+      public:
+        CreateAllocator& createAllocator;
+        CreateNodeFunc& createNode;
+        SetNodeBoundsFunc& setBounds;
+        CreateLeafFunc& createLeaf;
+        CalculateBounds& calculateBounds;
+        ProgressMonitor& progressMonitor;
+
+      public:
+        BuildPrim* morton;
+      };
+
+
+      /*! Convenience entry point: instantiates the BuilderT matching the given
+       *  callbacks and settings and runs it on the primitive array. */
+      template<typename ReductionTy,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename SetBoundsFunc,
+        typename CreateLeafFunc,
+        typename CalculateBoundsFunc,
+        typename ProgressMonitor>
+        static ReductionTy build(CreateAllocFunc createAllocator,
+                                 CreateNodeFunc createNode,
+                                 SetBoundsFunc setBounds,
+                                 CreateLeafFunc createLeaf,
+                                 CalculateBoundsFunc calculateBounds,
+                                 ProgressMonitor progressMonitor,
+                                 BuildPrim* src,
+                                 BuildPrim* tmp,
+                                 size_t numPrimitives,
+                                 const Settings& settings)
+        {
+          /* the allocator handle type is whatever the allocator factory returns */
+          using AllocatorTy = decltype(createAllocator());
+
+          using MortonBuilder = BuilderT<
+            ReductionTy,
+            AllocatorTy,
+            CreateAllocFunc,
+            CreateNodeFunc,
+            SetBoundsFunc,
+            CreateLeafFunc,
+            CalculateBoundsFunc,
+            ProgressMonitor>;
+
+          /* the builder is handed the callbacks that are local to this call */
+          MortonBuilder instance(createAllocator,createNode,setBounds,
+                                 createLeaf,calculateBounds,progressMonitor,
+                                 settings);
+
+          return instance.build(src,tmp,numPrimitives);
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h
new file mode 100644
index 0000000000..4c138dacdb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h
@@ -0,0 +1,692 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define MBLUR_NUM_TEMPORAL_BINS 2
+#define MBLUR_NUM_OBJECT_BINS   32
+
+#include "../bvh/bvh.h"
+#include "../common/primref_mb.h"
+#include "heuristic_binning_array_aligned.h"
+#include "heuristic_timesplit_array.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename T>
+      struct SharedVector
+      {
+        /*! leaves both members uninitialized */
+        __forceinline SharedVector() {}
+        /*! wraps ptr with an initial reference count (1 unless specified) */
+        __forceinline SharedVector(T* ptr, size_t refCount = 1)
+          : prims(ptr), refCount(refCount) {}
+        /*! registers one more user of the wrapped object */
+        __forceinline void incRef() { refCount += 1; }
+
+        /*! drops one user; the wrapped object is deleted when the count reaches zero */
+        __forceinline void decRef()
+        {
+          refCount -= 1;
+          if (refCount == 0) delete prims;
+        }
+
+        T* prims;         //!< wrapped object (deleted by decRef at count zero)
+        size_t refCount;  //!< plain (non-atomic) counter of users
+      };
+    /*! Fixed-capacity list of the children of the node under construction, plus a counted handle to the primitive vector of each child. */
+    template<typename BuildRecord, int MAX_BRANCHING_FACTOR>
+      struct LocalChildListT
+      {
+        typedef SharedVector<mvector<PrimRefMB>> SharedPrimRefVector;
+        /*! starts the list with 'record' as its only child */
+        __forceinline LocalChildListT (const BuildRecord& record)
+          : numChildren(1), numSharedPrimVecs(1)
+        {
+          /* the local root will be freed in the ancestor where it was created (thus refCount is 2) */
+          children[0] = record;
+          primvecs[0] = new (&sharedPrimVecs[0]) SharedPrimRefVector(record.prims.prims, 2);
+        }
+        /*! releases one reference per child entry */
+        __forceinline ~LocalChildListT()
+        {
+          for (size_t i = 0; i < numChildren; i++)
+            primvecs[i]->decRef();
+        }
+        /*! returns the i-th child record */
+        __forceinline BuildRecord& operator[] ( const size_t i ) {
+          return children[i];
+        }
+        /*! number of children currently in the list */
+        __forceinline size_t size() const {
+          return numChildren;
+        }
+        /*! replaces child 'bestChild' by lrecord and appends rrecord as a new child, updating the vector handles accordingly */
+        __forceinline void split(ssize_t bestChild, const BuildRecord& lrecord, const BuildRecord& rrecord, std::unique_ptr<mvector<PrimRefMB>> new_vector)
+        {
+          SharedPrimRefVector* bsharedPrimVec = primvecs[bestChild];
+          if (lrecord.prims.prims == bsharedPrimVec->prims) { // left child still uses the split child's vector: share the handle
+            primvecs[bestChild] = bsharedPrimVec;
+            bsharedPrimVec->incRef();
+          }
+          else { // left child refers to a different vector: give it a handle of its own (refCount 1)
+            primvecs[bestChild] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(lrecord.prims.prims);
+          }
+
+          if (rrecord.prims.prims == bsharedPrimVec->prims) { // same decision for the right child, stored in the new slot
+            primvecs[numChildren] = bsharedPrimVec;
+            bsharedPrimVec->incRef();
+          }
+          else {
+            primvecs[numChildren] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(rrecord.prims.prims);
+          }
+          bsharedPrimVec->decRef(); // drop the reference that was held for the child being replaced
+          new_vector.release(); // gives up ownership without deleting; NOTE(review): presumably the vector is now owned through a handle created above - confirm
+
+          children[bestChild] = lrecord;
+          children[numChildren] = rrecord;
+          numChildren++;
+        }
+
+      public:
+        array_t<BuildRecord,MAX_BRANCHING_FACTOR> children;            //!< child records, valid in [0,numChildren)
+        array_t<SharedPrimRefVector*,MAX_BRANCHING_FACTOR> primvecs;   //!< per child: handle of the primitive vector it uses
+        size_t numChildren;
+
+        array_t<SharedPrimRefVector,2*MAX_BRANCHING_FACTOR> sharedPrimVecs; //!< storage the handles are placement-constructed into
+        size_t numSharedPrimVecs;                                           //!< number of storage slots used so far
+      };
+
+    template<typename Mesh>
+      struct RecalculatePrimRef
+      {
+        Scene* scene;
+        __forceinline RecalculatePrimRef (Scene* scene) : scene(scene) {}
+
+        /*! rebuilds the reference of prim for time_range from the Mesh it belongs to */
+        __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const
+        {
+          const unsigned gid = prim.geomID();
+          const unsigned pid = prim.primID();
+          const Mesh* geom = scene->get<Mesh>(gid);
+          const LBBox3fa bounds = geom->linearBounds(pid, time_range);
+          const range<int> segments = geom->timeSegmentRange(time_range);
+          return PrimRefMB (bounds, segments.size(), geom->time_range, geom->numTimeSegments(), gid, pid);
+        }
+
+        // __noinline is workaround for ICC16 bug under MacOSX
+        __noinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const
+        {
+          const unsigned gid = prim.geomID();
+          const unsigned pid = prim.primID();
+          const Mesh* geom = scene->get<Mesh>(gid);
+          const LBBox3fa bounds = geom->linearBounds(space, pid, time_range);
+          const range<int> segments = geom->timeSegmentRange(time_range);
+          return PrimRefMB (bounds, segments.size(), geom->time_range, geom->numTimeSegments(), gid, pid);
+        }
+
+        __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const {
+          const Mesh* geom = scene->get<Mesh>(prim.geomID());
+          return geom->linearBounds(prim.primID(), time_range);
+        }
+        // __noinline is workaround for ICC16 bug under MacOSX
+        __noinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const {
+          const Mesh* geom = scene->get<Mesh>(prim.geomID());
+          return geom->linearBounds(space, prim.primID(), time_range);
+        }
+      };
+
+    struct VirtualRecalculatePrimRef
+    {
+      Scene* scene;
+      __forceinline VirtualRecalculatePrimRef (Scene* scene) : scene(scene) {}
+      /*! rebuilds the reference of prim for time_range through the Geometry interface (vlinearBounds) */
+      __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const
+      {
+        const unsigned gid = prim.geomID();
+        const unsigned pid = prim.primID();
+        const Geometry* geom = scene->get(gid);
+        const LBBox3fa bounds = geom->vlinearBounds(pid, time_range);
+        const range<int> segments = geom->timeSegmentRange(time_range);
+        return PrimRefMB (bounds, segments.size(), geom->time_range, geom->numTimeSegments(), gid, pid);
+      }
+
+      __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const
+      {
+        const unsigned gid = prim.geomID();
+        const unsigned pid = prim.primID();
+        const Geometry* geom = scene->get(gid);
+        const LBBox3fa bounds = geom->vlinearBounds(space, pid, time_range);
+        const range<int> segments = geom->timeSegmentRange(time_range);
+        return PrimRefMB (bounds, segments.size(), geom->time_range, geom->numTimeSegments(), gid, pid);
+      }
+      /*! linear bounds of prim over time_range, obtained from the geometry's vlinearBounds */
+      __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const {
+        const Geometry* geom = scene->get(prim.geomID());
+        return geom->vlinearBounds(prim.primID(), time_range);
+      }
+
+      __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const {
+        const Geometry* geom = scene->get(prim.geomID());
+        return geom->vlinearBounds(space, prim.primID(), time_range);
+      }
+    };
+
+    struct BVHBuilderMSMBlur
+    {
+      /*! settings for msmblur builder */
+      struct Settings
+      {
+        /*! default settings */
+        Settings ()
+        : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8),
+          travCost(1.0f), intCost(1.0f), singleLeafTimeSegment(false),
+          singleThreadThreshold(1024) {}
+
+        /*! explicit settings; logBlockSize is bsr(sahBlockSize), singleLeafTimeSegment starts out false, minLeafSize is clamped to maxLeafSize */
+        Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold)
+        : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize),
+          travCost(travCost), intCost(intCost), singleLeafTimeSegment(false), singleThreadThreshold(singleThreadThreshold)
+        {
+          this->minLeafSize = min(minLeafSize,maxLeafSize); // 'this->' is required: the parameter shadows the member
+        }
+
+      public:
+        size_t branchingFactor;  //!< branching factor of BVH to build
+        size_t maxDepth;         //!< maximum depth of BVH to build
+        size_t logBlockSize;     //!< log2 of blocksize for SAH heuristic
+        size_t minLeafSize;      //!< minimum size of a leaf
+        size_t maxLeafSize;      //!< maximum size of a leaf
+        float travCost;          //!< estimated cost of one traversal step
+        float intCost;           //!< estimated cost of one primitive intersection
+        bool singleLeafTimeSegment; //!< split time to single time range
+        size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
+      };
+
+      struct BuildRecord
+      {
+      public:
+        __forceinline BuildRecord () {}
+
+        /*! record at the given depth; prims is default-constructed */
+        __forceinline BuildRecord (size_t depth) : depth(depth) {}
+
+        /*! record for the primitive set prims at the given depth */
+        __forceinline BuildRecord (const SetMB& prims, size_t depth) : depth(depth), prims(prims) {}
+
+        /*! orders records by their number of primitives */
+        __forceinline friend bool operator< (const BuildRecord& lhs, const BuildRecord& rhs) {
+          return lhs.prims.size() < rhs.prims.size();
+        }
+
+        /*! number of primitives referenced by this record */
+        __forceinline size_t size() const { return prims.size(); }
+
+      public:
+        size_t depth;                     //!< Depth of the root of this subtree.
+        SetMB prims;                      //!< The list of primitives.
+      };
+
+      struct BuildRecordSplit : public BuildRecord
+      {
+        __forceinline BuildRecordSplit () {}
+
+        /*! record at the given depth; the split member is left default-initialized */
+        __forceinline BuildRecordSplit (size_t depth) : BuildRecord(depth) {}
+
+        /*! pairs a copy of an existing record with a split */
+        __forceinline BuildRecordSplit (const BuildRecord& record, const BinSplit<MBLUR_NUM_OBJECT_BINS>& split)
+          : BuildRecord(record), split(split) {}
+        BinSplit<MBLUR_NUM_OBJECT_BINS> split;  //!< split stored alongside the record
+      };
+
+      template<
+        typename NodeRef,
+        typename RecalculatePrimRef,
+        typename Allocator,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename SetNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+
+        class BuilderT
+        {
+          ALIGNED_CLASS_(16);
+          static const size_t MAX_BRANCHING_FACTOR = 16;       //!< maximum supported BVH branching factor	  
+          static const size_t MIN_LARGE_LEAF_LEVELS = 8;        //!< create balanced tree if we are that many levels before the maximum tree depth
+
+          typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+          typedef BinSplit<MBLUR_NUM_OBJECT_BINS> Split;
+          typedef mvector<PrimRefMB>* PrimRefVector;
+          typedef SharedVector<mvector<PrimRefMB>> SharedPrimRefVector;
+          typedef LocalChildListT<BuildRecord,MAX_BRANCHING_FACTOR> LocalChildList;
+          typedef LocalChildListT<BuildRecordSplit,MAX_BRANCHING_FACTOR> LocalChildListSplit;
+
+        public:
+
+          /*! stores the callbacks and settings; rejects branching factors above MAX_BRANCHING_FACTOR */
+          BuilderT (MemoryMonitorInterface* device, const RecalculatePrimRef recalculatePrimRef,
+                    const CreateAllocFunc createAlloc, const CreateNodeFunc createNode,
+                    const SetNodeFunc setNode, const CreateLeafFunc createLeaf,
+                    const ProgressMonitor progressMonitor, const Settings& settings)
+            : cfg(settings),
+            heuristicObjectSplit(),
+            heuristicTemporalSplit(device, recalculatePrimRef),
+            recalculatePrimRef(recalculatePrimRef),
+            createAlloc(createAlloc),
+            createNode(createNode),
+            setNode(setNode),
+            createLeaf(createLeaf),
+            progressMonitor(progressMonitor)
+          {
+            if (cfg.branchingFactor > MAX_BRANCHING_FACTOR) throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large");
+          }
+
+          /*! finds the split to use for a set: the object split, unless a temporal split is tried and has lower SAH */
+          const Split find(const SetMB& set)
+          {
+            /* the object split is always evaluated */
+            const Split objectSplit = heuristicObjectSplit.find(set,cfg.logBlockSize);
+            const float objectSAH = objectSplit.splitSAH();
+
+            /* an object split below half the leaf SAH is accepted without looking at time */
+            const float leafSAH = set.leafSAH(cfg.logBlockSize);
+            const bool objectSplitGoodEnough = objectSAH < 0.50f*leafSAH;
+            if (objectSplitGoodEnough)
+              return objectSplit;
+
+            /* temporal splits are only tried if the time range is big enough */
+            const bool timeRangeBigEnough = set.time_range.size() > 1.01f/float(set.max_num_time_segments);
+            if (!timeRangeBigEnough)
+              return objectSplit;
+
+            const Split temporalSplit = heuristicTemporalSplit.find(set,cfg.logBlockSize);
+            const float temporalSAH = temporalSplit.splitSAH();
+
+            /* the temporal split wins only with a strictly smaller SAH */
+            if (temporalSAH < objectSAH) return temporalSplit;
+            return objectSplit;
+          }
+
+          /*! partitions 'set' into lset/rset as described by 'split'; only the temporal path forwards a returned vector, all others return an empty pointer */
+          __forceinline std::unique_ptr<mvector<PrimRefMB>> split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            typedef std::unique_ptr<mvector<PrimRefMB>> NewVector;
+            /* object split */
+            if (likely(split.data == Split::SPLIT_OBJECT)) {
+              heuristicObjectSplit.split(split,set,lset,rset);
+              return NewVector();
+            }
+            /* temporal split: hand back whatever the temporal heuristic returns */
+            if (likely(split.data == Split::SPLIT_TEMPORAL))
+              return heuristicTemporalSplit.split(split,set,lset,rset);
+
+            /* the remaining kinds first put the set into deterministic order */
+            if (unlikely(split.data == Split::SPLIT_FALLBACK)) {
+              set.deterministic_order();
+              splitFallback(set,lset,rset);
+            }
+            else if (unlikely(split.data == Split::SPLIT_GEOMID)) {
+              set.deterministic_order();
+              splitByGeometry(set,lset,rset);
+            }
+            else
+              assert(false);
+            return NewVector();
+          }
+
+          /*! chooses the fallback split kind for a set: by geometry, temporal, or plain fallback */
+          __noinline Split findFallback(const SetMB& set)
+          {
+            /* primitives of different geometries are separated first */
+            const bool mixedGeometries = !sameGeometry(set);
+            if (mixedGeometries)
+              return Split(0.0f,Split::SPLIT_GEOMID);
+
+            /* unless leaves are restricted to a single time segment, the plain fallback split is used */
+            if (!cfg.singleLeafTimeSegment)
+              return Split(0.0f,Split::SPLIT_FALLBACK);
+
+            /* split in time at the first primitive that spans more than one time segment of the set's time range */
+            for (size_t idx=set.begin(); idx<set.end(); idx++)
+            {
+              const PrimRefMB& ref = (*set.prims)[idx];
+              const range<int> segments = ref.timeSegmentRange(set.time_range);
+              const int numSegments = segments.size();
+              assert(numSegments > 0);
+              if (numSegments <= 1) continue;
+              const int middle = (segments.begin() + segments.end())/2;
+              const float splitTime = ref.timeStep(middle);
+              return Split(0.0f,(unsigned)Split::SPLIT_TEMPORAL,0,splitTime);
+            }
+
+            /* no primitive spans several segments: plain fallback split */
+            return Split(0.0f,Split::SPLIT_FALLBACK);
+          }
+
+          /*! splits the set at the middle of its index range; the primitives themselves are only read */
+          void splitFallback(const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            mvector<PrimRefMB>& refs = *set.prims;
+            const size_t first = set.begin();
+            const size_t last  = set.end();
+            const size_t middle = (first + last)/2;
+
+            /* accumulate the primitive info of the lower half ... */
+            PrimInfoMB lowerInfo = empty;
+            for (size_t k=first; k<middle; k++) lowerInfo.add_primref(refs[k]);
+
+            /* ... and of the upper half */
+            PrimInfoMB upperInfo = empty;
+            for (size_t k=middle; k<last; k++) upperInfo.add_primref(refs[k]);
+
+            /* both halves keep referring to the same primitive vector and time range */
+            new (&lset) SetMB(lowerInfo,set.prims,range<size_t>(first,middle),set.time_range);
+            new (&rset) SetMB(upperInfo,set.prims,range<size_t>(middle,last ),set.time_range);
+          }
+
+          /*! true if every primitive of the set has the same geomID (also true for an empty set) */
+          __forceinline bool sameGeometry(const SetMB& set)
+          {
+            if (set.size() == 0) return true;
+
+            mvector<PrimRefMB>& refs = *set.prims;
+            const size_t first = set.begin();
+            const size_t last  = set.end();
+            const unsigned int reference = refs[first].geomID();
+
+            /* advance past all primitives that match the first one's geometry */
+            size_t k = first+1;
+            while (k < last && refs[k].geomID() == reference) k++;
+            return k >= last;
+          }
+
+          /* split by geometry ID: partitions on whether a primitive has the same geomID as the first one */
+          void splitByGeometry(const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            assert(set.size() > 1);
+
+            mvector<PrimRefMB>& refs = *set.prims;
+            const size_t first = set.begin();
+            const size_t last  = set.end();
+            const unsigned int pivotGeomID = refs[first].geomID();
+
+            /* predicate and reduction handed to serial_partitioning */
+            const auto hasPivotGeomID = [&] ( const PrimRefMB& ref ) { return ref.geomID() == pivotGeomID; };
+            const auto accumulate = [ ] ( PrimInfoMB& dst, const PrimRefMB& ref ) { dst.add_primref(ref); };
+
+            PrimInfoMB linfo(empty), rinfo(empty);
+            const size_t middle = serial_partitioning(refs.data(),first,last,linfo,rinfo,hasPivotGeomID,accumulate);
+            new (&lset) SetMB(linfo,set.prims,range<size_t>(first,middle),set.time_range);
+            new (&rset) SetMB(rinfo,set.prims,range<size_t>(middle,last ),set.time_range);
+          }
+          /*! Builds the subtree for 'in' from the splits chosen by findFallback, recursing until createLeaf can be called. */
+          const NodeRecordMB4D createLargeLeaf(const BuildRecord& in, Allocator alloc)
+          {
+            /* this should never occur but is a fatal error */
+            if (in.depth > cfg.maxDepth)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+            /* replace already found split by fallback split */
+            const BuildRecordSplit current(BuildRecord(in.prims,in.depth),findFallback(in.prims));
+
+            /* special case when directly creating leaf without any splits that could shrink time_range */
+            bool force_split = false;
+            if (current.depth == 1 && current.size() > 0)
+            {
+              BBox1f c = empty;
+              BBox1f p = current.prims.time_range;
+              for (size_t i=current.prims.begin(); i<current.prims.end(); i++) {
+                mvector<PrimRefMB>& prims = *current.prims.prims;
+                c.extend(prims[i].time_range);
+              }
+
+              force_split = c.lower > p.lower || c.upper < p.upper; // primitives cover less than the set's time range
+            }
+
+            /* create leaf for few primitives */
+            if (current.size() <= cfg.maxLeafSize && current.split.data < Split::SPLIT_ENFORCE && !force_split)
+              return createLeaf(current,alloc);
+
+            /* fill all children by always splitting the largest one */
+            bool hasTimeSplits = false;
+            NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+            LocalChildListSplit children(current);
+
+            do {
+              /* find the splittable child with the largest number of primitives */
+              ssize_t bestChild = -1; // signed, so the -1 sentinel test below compares like with like (same type as in recurse)
+              size_t bestSize = 0;
+              for (size_t i=0; i<children.size(); i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.maxLeafSize && children[i].split.data < Split::SPLIT_ENFORCE && !force_split)
+                  continue;
+
+                force_split = false; // the forced split only applies to the first child considered
+
+                /* remember child with largest size */
+                if (children[i].size() > bestSize) {
+                  bestSize = children[i].size();
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break;
+
+              /* perform best found split */
+              BuildRecordSplit& brecord = children[bestChild];
+              BuildRecordSplit lrecord(current.depth+1);
+              BuildRecordSplit rrecord(current.depth+1);
+              std::unique_ptr<mvector<PrimRefMB>> new_vector = split(brecord.split,brecord.prims,lrecord.prims,rrecord.prims);
+              hasTimeSplits |= new_vector != nullptr;
+
+              /* find new splits */
+              lrecord.split = findFallback(lrecord.prims);
+              rrecord.split = findFallback(rrecord.prims);
+              children.split(bestChild,lrecord,rrecord,std::move(new_vector));
+
+            } while (children.size() < cfg.branchingFactor);
+
+            /* detect time_ranges that have shrunken */
+            for (size_t i=0; i<children.size(); i++) {
+              const BBox1f c = children[i].prims.time_range;
+              const BBox1f p = in.prims.time_range;
+              hasTimeSplits |= c.lower > p.lower || c.upper < p.upper;
+            }
+
+            /* create node */
+            auto node = createNode(children.children.data(),children.numChildren,alloc,hasTimeSplits);
+
+            /* recurse into each child and perform reduction */
+            LBBox3fa gbounds = empty;
+            for (size_t i=0; i<children.size(); i++) {
+              values[i] = createLargeLeaf(children[i],alloc);
+              gbounds.extend(values[i].lbounds);
+            }
+
+            setNode(current,children.children.data(),node,values,children.numChildren);
+
+            /* calculate geometry bounds of this node: recomputed from the primitives if time was split, merged child bounds otherwise */
+            if (hasTimeSplits)
+              return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range);
+            else
+              return NodeRecordMB4D(node,gbounds,current.prims.time_range);
+          }
+          /*! Recursively builds the subtree for 'current' and returns its node record (node reference, linear bounds, time range). */
+          const NodeRecordMB4D recurse(const BuildRecord& current, Allocator alloc, bool toplevel)
+          {
+            /* obtain an allocator if the caller did not pass one */
+            if (!alloc)
+              alloc = createAlloc();
+
+            /* report progress once for each top-level subtree that is small enough to be finished sequentially */
+            if (toplevel && current.size() <= cfg.singleThreadThreshold)
+              progressMonitor(current.size());
+
+            /*! find best split */
+            const Split csplit = find(current.prims);
+
+            /*! compute leaf and split cost */
+            const float leafSAH  = cfg.intCost*current.prims.leafSAH(cfg.logBlockSize);
+            const float splitSAH = cfg.travCost*current.prims.halfArea()+cfg.intCost*csplit.splitSAH();
+            assert((current.size() == 0) || ((leafSAH >= 0) && (splitSAH >= 0)));
+
+            /*! create a leaf node when threshold reached or SAH tells us to stop */
+            if (current.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) {
+              current.prims.deterministic_order();
+              return createLargeLeaf(current,alloc);
+            }
+
+            /*! perform initial split */
+            SetMB lprims,rprims;
+            std::unique_ptr<mvector<PrimRefMB>> new_vector = split(csplit,current.prims,lprims,rprims);
+            bool hasTimeSplits = new_vector != nullptr; // only the temporal split path of split() can return a non-null vector
+            NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+            LocalChildList children(current);
+            {
+              BuildRecord lrecord(lprims,current.depth+1);
+              BuildRecord rrecord(rprims,current.depth+1);
+              children.split(0,lrecord,rrecord,std::move(new_vector));
+            }
+
+            /*! split until node is full or no child can be split further */
+            while (children.size() < cfg.branchingFactor) 
+            {
+              /*! find best child to split: the one with the largest expected approximate half area */
+              float bestArea = neg_inf;
+              ssize_t bestChild = -1;
+              for (size_t i=0; i<children.size(); i++)
+              {
+                if (children[i].size() <= cfg.minLeafSize) continue;
+                if (expectedApproxHalfArea(children[i].prims.geomBounds) > bestArea) {
+                  bestChild = i; bestArea = expectedApproxHalfArea(children[i].prims.geomBounds);
+                }
+              }
+              if (bestChild == -1) break;
+
+              /* perform split */
+              BuildRecord& brecord = children[bestChild];
+              BuildRecord lrecord(current.depth+1);
+              BuildRecord rrecord(current.depth+1);
+              Split csplit = find(brecord.prims); // NOTE: shadows the csplit of the initial split above
+              std::unique_ptr<mvector<PrimRefMB>> new_vector = split(csplit,brecord.prims,lrecord.prims,rrecord.prims); // NOTE: shadows the outer new_vector
+              hasTimeSplits |= new_vector != nullptr;
+              children.split(bestChild,lrecord,rrecord,std::move(new_vector));
+            }
+
+            /* detect time_ranges that have shrunken */
+            for (size_t i=0; i<children.size(); i++) {
+              const BBox1f c = children[i].prims.time_range;
+              const BBox1f p = current.prims.time_range;
+              hasTimeSplits |= c.lower > p.lower || c.upper < p.upper;
+            }
+
+            /* sort buildrecords for simpler shadow ray traversal */
+            //std::sort(&children[0],&children[children.size()],std::greater<BuildRecord>()); // FIXME: reduces traversal performance of bvh8.triangle4 (needs to be verified) !!
+
+            /*! create an inner node */
+            auto node = createNode(children.children.data(), children.numChildren, alloc, hasTimeSplits);
+            LBBox3fa gbounds = empty;
+
+            /* spawn tasks */
+            if (unlikely(current.size() > cfg.singleThreadThreshold))
+            {
+              /*! parallel_for is faster than spawning sub-tasks */
+              parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+                  for (size_t i=r.begin(); i<r.end(); i++) {
+                    values[i] = recurse(children[i],nullptr,true); // no allocator passed: the child call creates its own
+                    _mm_mfence(); // to allow non-temporal stores during build
+                  }
+                });
+
+              /*! merge bounding boxes */
+              for (size_t i=0; i<children.size(); i++)
+                gbounds.extend(values[i].lbounds);
+            }
+            /* recurse into each child sequentially, reusing this call's allocator */
+            else
+            {
+              //for (size_t i=0; i<children.size(); i++)
+              for (ssize_t i=children.size()-1; i>=0; i--) { // children are visited in reverse index order
+                values[i] = recurse(children[i],alloc,false);
+                gbounds.extend(values[i].lbounds);
+              }
+            }
+
+            setNode(current,children.children.data(),node,values,children.numChildren);
+
+            /* calculate geometry bounds of this node: recomputed from the primitives if time was split, merged child bounds otherwise */
+            if (unlikely(hasTimeSplits))
+              return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range);
+            else
+              return NodeRecordMB4D(node,gbounds,current.prims.time_range);
+          }
+
+          /*! builder entry function: builds the tree for the primitives described by pinfo, starting at depth 1 */
+          __forceinline const NodeRecordMB4D operator() (mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo)
+          {
+            const BuildRecord root(SetMB(pinfo,&prims),1);
+            NodeRecordMB4D result = recurse(root,nullptr,true);
+            _mm_mfence(); // to allow non-temporal stores during build
+            return result;
+          }
+
+        private:
+          Settings cfg;
+          HeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> heuristicObjectSplit;
+          HeuristicMBlurTemporalSplit<PrimRefMB,RecalculatePrimRef,MBLUR_NUM_TEMPORAL_BINS> heuristicTemporalSplit;
+          const RecalculatePrimRef recalculatePrimRef;
+          const CreateAllocFunc createAlloc;
+          const CreateNodeFunc createNode;
+          const SetNodeFunc setNode;
+          const CreateLeafFunc createLeaf;
+          const ProgressMonitor progressMonitor;
+        };
+
+      /*! Convenience entry point: instantiates the BuilderT matching the given
+       *  callbacks and settings and runs it on the primitive vector. */
+      template<typename NodeRef,
+        typename RecalculatePrimRef,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename SetNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitorFunc>
+        static const BVHNodeRecordMB4D<NodeRef> build(mvector<PrimRefMB>& prims,
+                                                      const PrimInfoMB& pinfo,
+                                                      MemoryMonitorInterface* device,
+                                                      const RecalculatePrimRef recalculatePrimRef,
+                                                      const CreateAllocFunc createAlloc,
+                                                      const CreateNodeFunc createNode,
+                                                      const SetNodeFunc setNode,
+                                                      const CreateLeafFunc createLeaf,
+                                                      const ProgressMonitorFunc progressMonitor,
+                                                      const Settings& settings)
+      {
+          /* the allocator handle type is whatever createAlloc returns */
+          using AllocatorTy = decltype(createAlloc());
+
+          using MSMBlurBuilder = BuilderT<
+            NodeRef,
+            RecalculatePrimRef,
+            AllocatorTy,
+            CreateAllocFunc,
+            CreateNodeFunc,
+            SetNodeFunc,
+            CreateLeafFunc,
+            ProgressMonitorFunc>;
+
+          /* the builder keeps its own copies of the callbacks and settings */
+          MSMBlurBuilder instance(device,recalculatePrimRef,
+                                  createAlloc,createNode,setNode,createLeaf,
+                                  progressMonitor,settings);
+
+          /* the builder object is callable: this runs the whole build */
+          return instance(prims,pinfo);
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h
new file mode 100644
index 0000000000..e477c313a3
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h
@@ -0,0 +1,526 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../bvh/bvh.h"
+#include "../geometry/primitive.h"
+#include "../builders/bvh_builder_msmblur.h"
+#include "../builders/heuristic_binning_array_aligned.h"
+#include "../builders/heuristic_binning_array_unaligned.h"
+#include "../builders/heuristic_timesplit_array.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct BVHBuilderHairMSMBlur
+    {
+      /*! settings for msmblur builder */
+      struct Settings
+      {
+        /*! defaults: binary tree, depth limit 32, SAH block size 2^0, leaves of 1 to 8 primitives */
+        Settings ()
+          : branchingFactor(2),
+            maxDepth(32), logBlockSize(0),
+            minLeafSize(1), maxLeafSize(8) {}
+        size_t branchingFactor;  //!< branching factor of BVH to build
+        size_t maxDepth;         //!< maximum depth of BVH to build
+        size_t logBlockSize;     //!< log2 of blocksize for SAH heuristic
+        size_t minLeafSize;      //!< minimum size of a leaf
+        size_t maxLeafSize;      //!< maximum size of a leaf
+      };
+
+      struct BuildRecord
+      {
+        /*! default constructor: depth is left uninitialized, prims is default constructed */
+        __forceinline BuildRecord () {}
+
+        /*! record at the given depth; prims is default constructed */
+        __forceinline BuildRecord (size_t depth) : depth(depth) {}
+
+        /*! record at the given depth over a copy of the given primitive set */
+        __forceinline BuildRecord (const SetMB& prims, size_t depth)
+          : depth(depth), prims(prims) {}
+
+        /*! number of primitives in the referenced set */
+        __forceinline size_t size() const { return prims.size(); }
+
+      public:
+        size_t depth;  //!< depth of the root of this subtree
+        SetMB prims;   //!< the list of primitives
+      };
+
+      template<typename NodeRef,
+        typename RecalculatePrimRef,
+        typename CreateAllocFunc,
+        typename CreateAABBNodeMBFunc,
+        typename SetAABBNodeMBFunc,
+        typename CreateOBBNodeMBFunc,
+        typename SetOBBNodeMBFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+
+        class BuilderT
+        {
+          ALIGNED_CLASS_(16);
+
+          static const size_t MAX_BRANCHING_FACTOR =  8;         //!< maximum supported BVH branching factor
+          static const size_t MIN_LARGE_LEAF_LEVELS = 8;         //!< create balanced tree if we are that many levels before the maximum tree depth
+          static const size_t SINGLE_THREADED_THRESHOLD = 4096;  //!< threshold to switch to single threaded build
+
+          typedef BVHNodeRecordMB<NodeRef> NodeRecordMB;
+          typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+
+          typedef FastAllocator::CachedAllocator Allocator;
+          typedef LocalChildListT<BuildRecord,MAX_BRANCHING_FACTOR> LocalChildList;
+
+          typedef HeuristicMBlurTemporalSplit<PrimRefMB,RecalculatePrimRef,MBLUR_NUM_TEMPORAL_BINS> HeuristicTemporal;
+          typedef HeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> HeuristicBinning;
+          typedef UnalignedHeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> UnalignedHeuristicBinning;
+
+        public:
+
+          /*! stores the callbacks by reference and sets up the split heuristics; throws if settings.branchingFactor exceeds MAX_BRANCHING_FACTOR */
+          BuilderT (Scene* scene,
+                    const RecalculatePrimRef& recalculatePrimRef, const CreateAllocFunc& createAlloc,
+                    const CreateAABBNodeMBFunc& createAABBNodeMB, const SetAABBNodeMBFunc& setAABBNodeMB,
+                    const CreateOBBNodeMBFunc& createOBBNodeMB, const SetOBBNodeMBFunc& setOBBNodeMB,
+                    const CreateLeafFunc& createLeaf, const ProgressMonitor& progressMonitor,
+                    const Settings settings)
+            : cfg(settings),
+            scene(scene),
+            recalculatePrimRef(recalculatePrimRef),
+            createAlloc(createAlloc),
+            createAABBNodeMB(createAABBNodeMB), setAABBNodeMB(setAABBNodeMB),
+            createOBBNodeMB(createOBBNodeMB), setOBBNodeMB(setOBBNodeMB),
+            createLeaf(createLeaf),
+            progressMonitor(progressMonitor),
+            unalignedHeuristic(scene),
+            temporalSplitHeuristic(scene->device,recalculatePrimRef)
+          {
+            /* recurse() and createLargeLeaf() fill value arrays of MAX_BRANCHING_FACTOR entries, one per child */
+            if (cfg.branchingFactor > MAX_BRANCHING_FACTOR) throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large");
+          }
+
+        private:
+
+          /*! returns true iff every primitive of the set carries the same geomID as the first primitive of the set */
+          __forceinline bool sameGeometry(const SetMB& set)
+          {
+            mvector<PrimRefMB>& refs = *set.prims;
+            const size_t first = set.begin();
+            const size_t last = set.end();
+            const unsigned int referenceID = refs[first].geomID();
+            size_t k = first+1;
+            while (k < last && refs[k].geomID() == referenceID)
+              k++;
+            return k >= last;
+          }
+          
+          /*! fallback split by index: the lower half of the range goes to lset, the upper half to rset */
+          void splitFallback(const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            const size_t first = set.begin();
+            const size_t last  = set.end();
+            const size_t mid   = (first + last)/2;
+            mvector<PrimRefMB>& refs = *set.prims;
+
+            /* accumulate the primitive info separately for both halves */
+            PrimInfoMB lowerInfo = empty;
+            PrimInfoMB upperInfo = empty;
+            for (size_t k=first; k<last; k++) {
+              PrimInfoMB& side = (k < mid) ? lowerInfo : upperInfo;
+              side.add_primref(refs[k]);
+            }
+
+            /* construct both output sets in place; each takes over the time range of the input set */
+            new (&lset) SetMB(lowerInfo,set.prims,range<size_t>(first,mid),set.time_range);
+            new (&rset) SetMB(upperInfo,set.prims,range<size_t>(mid,last ),set.time_range);
+          }
+
+          /*! runs serial_partitioning with a "geomID equals that of the first primitive" predicate; lset gets [begin,center), rset gets [center,end) */
+          void splitByGeometry(const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            assert(set.size() > 1);
+            const size_t first = set.begin();
+            const size_t last  = set.end();
+            const unsigned int pivotID = (*set.prims)[first].geomID();
+            const auto matchesPivot = [&] ( const PrimRefMB& ref ) { return ref.geomID() == pivotID; };
+            const auto accumulate   = [ ] ( PrimInfoMB& info, const PrimRefMB& ref ) { info.add_primref(ref); };
+
+            PrimInfoMB linfo(empty), rinfo(empty);
+            const size_t mid = serial_partitioning(set.prims->data(),first,last,linfo,rinfo,matchesPivot,accumulate);
+            new (&lset) SetMB(linfo,set.prims,range<size_t>(first,mid ),set.time_range);
+            new (&rset) SetMB(rinfo,set.prims,range<size_t>(mid  ,last),set.time_range);
+          }
+
+          /*! creates a leaf via createLeaf, or a subtree built with createAABBNodeMB when the set exceeds maxLeafSize, mixes geometries, or a split is forced */
+          NodeRecordMB4D createLargeLeaf(BuildRecord& current, Allocator alloc)
+          {
+            /* this should never occur but is a fatal error */
+            if (current.depth > cfg.maxDepth)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+            /* special case when directly creating leaf without any splits that could shrink time_range */
+            bool force_split = false;
+            if (current.depth == 1 && current.size() > 0)
+            {
+              BBox1f c = empty;
+              BBox1f p = current.prims.time_range;
+              for (size_t i=current.prims.begin(); i<current.prims.end(); i++) {
+                mvector<PrimRefMB>& prims = *current.prims.prims;
+                c.extend(prims[i].time_range);
+              }
+              /* force a split when the union of the primitive time ranges is narrower than the time range of the set */
+              force_split = c.lower > p.lower || c.upper < p.upper;
+            }
+
+            /* create leaf for few primitives */
+            if (current.size() <= cfg.maxLeafSize && sameGeometry(current.prims) && !force_split)
+              return createLeaf(current.prims,alloc);
+
+            /* fill all children by always splitting the largest one */
+            LocalChildList children(current);
+            NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+
+            do {
+
+              /* find the splittable child with the largest number of primitives */
+              int bestChild = -1;
+              size_t bestSize = 0;
+              for (unsigned i=0; i<children.size(); i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.maxLeafSize && sameGeometry(children[i].prims) && !force_split)
+                  continue;
+
+                force_split = false; // the forced split is consumed by the first child that is not skipped
+
+                /* remember child with largest size */
+                if (children[i].size() > bestSize) {
+                  bestSize = children[i].size();
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break;
+
+              /*! split best child into left and right child: by geometry if it mixes geomIDs, by index otherwise */
+              BuildRecord left(current.depth+1);
+              BuildRecord right(current.depth+1);
+              if (!sameGeometry(children[bestChild].prims)) {
+                splitByGeometry(children[bestChild].prims,left.prims,right.prims);
+              } else {
+                splitFallback(children[bestChild].prims,left.prims,right.prims);
+              }
+              children.split(bestChild,left,right,std::unique_ptr<mvector<PrimRefMB>>());
+
+            } while (children.size() < cfg.branchingFactor);
+
+
+            /* detect time_ranges that have shrunken */
+            bool timesplit = false;
+            for (size_t i=0; i<children.size(); i++) {
+              const BBox1f c = children[i].prims.time_range;
+              const BBox1f p = current.prims.time_range;
+              timesplit |= c.lower > p.lower || c.upper < p.upper;
+            }
+            
+            /* create node */
+            NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,timesplit);
+
+            /* recurse into each child and merge the linear bounds of the results */
+            LBBox3fa bounds = empty;
+            for (size_t i=0; i<children.size(); i++) {
+              values[i] = createLargeLeaf(children[i],alloc);
+              bounds.extend(values[i].lbounds);
+            }
+
+            setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+
+            /* with shrunken child time ranges the merged bounds are replaced by bounds recomputed from the primitives */
+            if (timesplit)
+              bounds = current.prims.linearBounds(recalculatePrimRef);
+              
+            return NodeRecordMB4D(node,bounds,current.prims.time_range);
+          }
+
+          /*! performs the cheapest of aligned object, unaligned object and temporal split; only ever sets aligned to false and timesplit to true; returns the vector handed back by the temporal split, an empty pointer otherwise */
+          std::unique_ptr<mvector<PrimRefMB>> split(const BuildRecord& current, BuildRecord& lrecord, BuildRecord& rrecord, bool& aligned, bool& timesplit)
+          {
+            /* variable to track the SAH of the best splitting approach */
+            float bestSAH = inf;
+            const float leafSAH = current.prims.leafSAH(cfg.logBlockSize);
+
+            /* perform standard binning in aligned space */
+            HeuristicBinning::Split alignedObjectSplit = alignedHeuristic.find(current.prims,cfg.logBlockSize);
+            float alignedObjectSAH = alignedObjectSplit.splitSAH();
+            bestSAH = min(alignedObjectSAH,bestSAH);
+
+            /* perform standard binning in unaligned space, only if the aligned split SAH exceeds 70% of the leaf SAH */
+            UnalignedHeuristicBinning::Split unalignedObjectSplit;
+            LinearSpace3fa uspace;
+            float unalignedObjectSAH = inf;
+            if (alignedObjectSAH > 0.7f*leafSAH) {
+              uspace = unalignedHeuristic.computeAlignedSpaceMB(scene,current.prims);
+              const SetMB sset = current.prims.primInfo(recalculatePrimRef,uspace);
+              unalignedObjectSplit = unalignedHeuristic.find(sset,cfg.logBlockSize,uspace);
+              unalignedObjectSAH = 1.3f*unalignedObjectSplit.splitSAH(); // makes unaligned splits more expensive
+              bestSAH = min(unalignedObjectSAH,bestSAH);
+            }
+
+            /* do temporal splits only if previous approaches failed to produce good SAH and the time range is large enough */
+            float temporal_split_sah = inf;
+            typename HeuristicTemporal::Split temporal_split;
+            if (bestSAH > 0.5f*leafSAH) {
+              if (current.prims.time_range.size() > 1.01f/float(current.prims.max_num_time_segments)) {
+                temporal_split = temporalSplitHeuristic.find(current.prims,cfg.logBlockSize);
+                temporal_split_sah = temporal_split.splitSAH();
+                bestSAH = min(temporal_split_sah,bestSAH);
+              }
+            }
+
+            /* perform fallback split if SAH heuristics failed */
+            if (unlikely(!std::isfinite(bestSAH))) {
+              current.prims.deterministic_order();
+              splitFallback(current.prims,lrecord.prims,rrecord.prims);
+            }
+            /* perform aligned split if this is best */
+            else if (likely(bestSAH == alignedObjectSAH)) {
+              alignedHeuristic.split(alignedObjectSplit,current.prims,lrecord.prims,rrecord.prims);
+            }
+            /* perform unaligned split if this is best */
+            else if (likely(bestSAH == unalignedObjectSAH)) {
+              unalignedHeuristic.split(unalignedObjectSplit,uspace,current.prims,lrecord.prims,rrecord.prims);
+              aligned = false;
+            }
+            /* perform temporal split if this is best */
+            else if (likely(bestSAH == temporal_split_sah)) {
+              timesplit = true;
+              return temporalSplitHeuristic.split(temporal_split,current.prims,lrecord.prims,rrecord.prims);
+            }
+            else
+              assert(false); // bestSAH is always the minimum of the three candidates above, so one branch must match
+
+            return std::unique_ptr<mvector<PrimRefMB>>();
+          }
+
+          /*! recursively builds the subtree of a record: large leaf near the depth limit or for tiny sets, otherwise a time-split, aligned or unaligned node over up to branchingFactor children; toplevel enables progress reporting */
+          NodeRecordMB4D recurse(BuildRecord& current, Allocator alloc, bool toplevel)
+          {
+            /* get thread local allocator */
+            if (!alloc)
+              alloc = createAlloc();
+
+            /* report progress for top-level subtrees that are small enough to be built sequentially */
+            if (toplevel && current.size() <= SINGLE_THREADED_THRESHOLD)
+              progressMonitor(current.size());
+
+            /* create leaf node */
+            if (current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || current.size() <= cfg.minLeafSize) {
+              current.prims.deterministic_order();
+              return createLargeLeaf(current,alloc);
+            }
+
+            /* fill all children by always splitting the one with the largest surface area */
+            NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+            LocalChildList children(current);
+            bool aligned = true;
+            bool timesplit = false;
+
+            do {
+
+              /* find best child with largest bounding box area */
+              ssize_t bestChild = -1;
+              float bestArea = neg_inf;
+              for (size_t i=0; i<children.size(); i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.minLeafSize)
+                  continue;
+
+                /* remember child with largest area */
+                const float A = children[i].prims.halfArea();
+                if (A > bestArea) {
+                  bestArea = A; // reuse the area computed above instead of evaluating halfArea() a second time
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break;
+
+              /*! split best child into left and right child */
+              BuildRecord left(current.depth+1);
+              BuildRecord right(current.depth+1);
+              std::unique_ptr<mvector<PrimRefMB>> new_vector = split(children[bestChild],left,right,aligned,timesplit);
+              children.split(bestChild,left,right,std::move(new_vector));
+
+            } while (children.size() < cfg.branchingFactor);
+
+            /* detect time_ranges that have shrunken */
+            for (size_t i=0; i<children.size(); i++) {
+              const BBox1f c = children[i].prims.time_range;
+              const BBox1f p = current.prims.time_range;
+              timesplit |= c.lower > p.lower || c.upper < p.upper;
+            }
+
+            /* create time split node */
+            if (timesplit)
+            {
+              const NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,true);
+
+              /* spawn tasks or ... */
+              if (current.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      values[i] = recurse(children[i],nullptr,true);
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+              }
+              /* ... continue sequential */
+              else {
+                for (size_t i=0; i<children.size(); i++) {
+                  values[i] = recurse(children[i],alloc,false);
+                }
+              }
+
+              setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+
+              /* bounds are recomputed from the primitives rather than merged from the children */
+              const LBBox3fa bounds = current.prims.linearBounds(recalculatePrimRef);
+              return NodeRecordMB4D(node,bounds,current.prims.time_range);
+            }
+
+            /* create aligned node */
+            else if (aligned)
+            {
+              const NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,true);
+
+              /* spawn tasks or ... */
+              if (current.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                LBBox3fa cbounds[MAX_BRANCHING_FACTOR];
+                parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      values[i] = recurse(children[i],nullptr,true);
+                      cbounds[i] = values[i].lbounds;
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+
+                /* merge the child bounds after all tasks have finished */
+                LBBox3fa bounds = empty;
+                for (size_t i=0; i<children.size(); i++)
+                  bounds.extend(cbounds[i]);
+                setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+                return NodeRecordMB4D(node,bounds,current.prims.time_range);
+              }
+              /* ... continue sequentially */
+              else
+              {
+                LBBox3fa bounds = empty;
+                for (size_t i=0; i<children.size(); i++) {
+                  values[i] = recurse(children[i],alloc,false);
+                  bounds.extend(values[i].lbounds);
+                }
+                setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+                return NodeRecordMB4D(node,bounds,current.prims.time_range);
+              }
+            }
+
+            /* create unaligned node */
+            else
+            {
+              const NodeRef node = createOBBNodeMB(alloc);
+
+              /* spawn tasks or ... */
+              if (current.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpaceMB(scene,children[i].prims);
+                      const LBBox3fa lbounds = children[i].prims.linearBounds(recalculatePrimRef,space);
+                      const auto child = recurse(children[i],nullptr,true);
+                      setOBBNodeMB(node,i,child.ref,space,lbounds,children[i].prims.time_range);
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+              }
+              /* ... continue sequentially */
+              else
+              {
+                for (size_t i=0; i<children.size(); i++) {
+                  const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpaceMB(scene,children[i].prims);
+                  const LBBox3fa lbounds = children[i].prims.linearBounds(recalculatePrimRef,space);
+                  const auto child = recurse(children[i],alloc,false);
+                  setOBBNodeMB(node,i,child.ref,space,lbounds,children[i].prims.time_range);
+                }
+              }
+
+              /* bounds are recomputed from the primitives rather than merged from the children */
+              const LBBox3fa bounds = current.prims.linearBounds(recalculatePrimRef);
+              return NodeRecordMB4D(node,bounds,current.prims.time_range);
+            }
+          }
+
+        public:
+
+          /*! entry point: wraps all primitives into a record at depth 1 and starts a top-level recursion */
+          NodeRecordMB4D operator() (mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo) {
+            const SetMB allPrims(pinfo,&prims);
+            BuildRecord rootRecord(allPrims,1);
+            NodeRecordMB4D result = recurse(rootRecord,nullptr,true);
+            _mm_mfence(); // to allow non-temporal stores during build
+            return result;
+          }
+
+        private:
+          Settings cfg;
+          Scene* scene;
+          const RecalculatePrimRef& recalculatePrimRef;
+          const CreateAllocFunc& createAlloc;
+          const CreateAABBNodeMBFunc& createAABBNodeMB;
+          const SetAABBNodeMBFunc& setAABBNodeMB;
+          const CreateOBBNodeMBFunc& createOBBNodeMB;
+          const SetOBBNodeMBFunc& setOBBNodeMB;
+          const CreateLeafFunc& createLeaf;
+          const ProgressMonitor& progressMonitor;
+
+        private:
+          HeuristicBinning alignedHeuristic;
+          UnalignedHeuristicBinning unalignedHeuristic;
+          HeuristicTemporal temporalSplitHeuristic;
+        };
+
+      template<typename NodeRef,
+        typename RecalculatePrimRef,
+        typename CreateAllocFunc,
+        typename CreateAABBNodeMBFunc,
+        typename SetAABBNodeMBFunc,
+        typename CreateOBBNodeMBFunc,
+        typename SetOBBNodeMBFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+        /*! one-shot entry point: instantiates a BuilderT from the given callbacks and settings and runs it over prims */
+        static BVHNodeRecordMB4D<NodeRef> build (Scene* scene, mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo,
+                                               const RecalculatePrimRef& recalculatePrimRef,
+                                               const CreateAllocFunc& createAlloc,
+                                               const CreateAABBNodeMBFunc& createAABBNodeMB,
+                                               const SetAABBNodeMBFunc& setAABBNodeMB,
+                                               const CreateOBBNodeMBFunc& createOBBNodeMB,
+                                               const SetOBBNodeMBFunc& setOBBNodeMB,
+                                               const CreateLeafFunc& createLeaf,
+                                               const ProgressMonitor& progressMonitor,
+                                               const Settings settings)
+        {
+          /* builder type specialized on the callback types of this call */
+          using Builder = BuilderT<
+            NodeRef,RecalculatePrimRef,CreateAllocFunc,
+            CreateAABBNodeMBFunc,SetAABBNodeMBFunc,CreateOBBNodeMBFunc,SetOBBNodeMBFunc,
+            CreateLeafFunc,ProgressMonitor>;
+          Builder instance(
+            scene,recalculatePrimRef,createAlloc,
+            createAABBNodeMB,setAABBNodeMB,createOBBNodeMB,setOBBNodeMB,
+            createLeaf,progressMonitor,settings);
+
+          return instance(prims,pinfo);
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h
new file mode 100644
index 0000000000..3f7e678a10
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h
@@ -0,0 +1,669 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning_array_aligned.h"
+#include "heuristic_spatial_array.h"
+#include "heuristic_openmerge_array.h"
+
+#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+#  define NUM_OBJECT_BINS 16
+#  define NUM_SPATIAL_BINS 16
+#else
+#  define NUM_OBJECT_BINS 32
+#  define NUM_SPATIAL_BINS 16
+#endif
+
+namespace embree
+{
+  namespace isa
+  {
+    MAYBE_UNUSED static const float travCost = 1.0f;
+    MAYBE_UNUSED static const size_t DEFAULT_SINGLE_THREAD_THRESHOLD = 1024;
+
+    struct GeneralBVHBuilder
+    {
+      static const size_t MAX_BRANCHING_FACTOR = 16;       //!< maximum supported BVH branching factor
+      static const size_t MIN_LARGE_LEAF_LEVELS = 8;       //!< create balanced tree if we are that many levels before the maximum tree depth
+      
+
+      /*! settings for SAH builder */
+      struct Settings
+      {
+        /*! default settings */
+        Settings ()
+        : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7),
+          travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf) {}
+
+        /*! initialize settings from API settings; fields for which RTC_BUILD_ARGUMENTS_HAS holds override the defaults */
+        Settings (const RTCBuildArguments& settings)
+        : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7),
+          travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf)
+        {
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth          )) maxDepth        = settings.maxDepth;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,sahBlockSize      )) logBlockSize    = bsr(static_cast<size_t>(settings.sahBlockSize));
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize       )) minLeafSize     = settings.minLeafSize;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize       )) maxLeafSize     = settings.maxLeafSize;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,traversalCost     )) travCost        = settings.traversalCost;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,intersectionCost  )) intCost         = settings.intersectionCost;
+          /* keep the minimum leaf size from exceeding the maximum leaf size */
+          minLeafSize = min(minLeafSize,maxLeafSize);
+        }
+        /*! explicit settings; branchingFactor and maxDepth keep their defaults and minLeafSize is clamped to maxLeafSize */
+        Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold, size_t primrefarrayalloc = inf)
+        : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize),
+          travCost(travCost), intCost(intCost), singleThreadThreshold(singleThreadThreshold), primrefarrayalloc(primrefarrayalloc)
+        {
+          this->minLeafSize = min(minLeafSize,maxLeafSize); // the parameter hides the member here, so the member has to be named explicitly
+        }
+
+      public:
+        size_t branchingFactor;  //!< branching factor of BVH to build
+        size_t maxDepth;         //!< maximum depth of BVH to build
+        size_t logBlockSize;     //!< log2 of blocksize for SAH heuristic
+        size_t minLeafSize;      //!< minimum size of a leaf
+        size_t maxLeafSize;      //!< maximum size of a leaf
+        float travCost;          //!< estimated cost of one traversal step
+        float intCost;           //!< estimated cost of one primitive intersection
+        size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
+        size_t primrefarrayalloc;  //!< builder uses prim ref array to allocate nodes and leaves when a subtree of that size is finished
+      };
+
+      /*! recursive state of the builder: subtree depth, allocation barrier flag and the primitives to process */
+      template<typename Set, typename Split>
+        struct BuildRecordT
+        {
+          /*! default constructor: depth and alloc_barrier are left uninitialized */
+          __forceinline BuildRecordT () {}
+
+          /*! record at the given depth whose primitive set is constructed from empty */
+          __forceinline BuildRecordT (size_t depth) : depth(depth), alloc_barrier(false), prims(empty) {}
+
+          /*! record at the given depth over a copy of the given primitive set */
+          __forceinline BuildRecordT (size_t depth, const Set& prims) : depth(depth), alloc_barrier(false), prims(prims) {}
+
+          /*! number of primitives and geometry bounds of the primitive set */
+          __forceinline size_t size() const { return prims.size(); }
+          __forceinline BBox3fa bounds() const { return prims.geomBounds; }
+
+          /*! records compare by their number of primitives */
+          __forceinline friend bool operator< (const BuildRecordT& lhs, const BuildRecordT& rhs) { return lhs.prims.size() < rhs.prims.size(); }
+          __forceinline friend bool operator> (const BuildRecordT& lhs, const BuildRecordT& rhs) { return lhs.prims.size() > rhs.prims.size(); }
+
+          size_t depth;       //!< Depth of the root of this subtree.
+          bool alloc_barrier; //!< barrier used to reuse primref-array blocks to allocate nodes
+          Set prims;          //!< The list of primitives.
+        };
+
+      template<typename PrimRef, typename Set>
+      struct DefaultCanCreateLeafFunc {
+        /*! default predicate: reports every primitive set as suitable for a leaf */
+        __forceinline bool operator()(const PrimRef*, const Set&) const { return true; }
+      };
+
+      template<typename PrimRef, typename Set>
+      struct DefaultCanCreateLeafSplitFunc {
+        /*! default split callback: does nothing and leaves both output sets untouched */
+        __forceinline void operator()(PrimRef*, const Set&, Set&, Set&) const { }
+      };
+
+      template<typename BuildRecord,
+        typename Heuristic,
+        typename Set,
+        typename PrimRef,
+        typename ReductionTy,
+        typename Allocator,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename CanCreateLeafFunc,
+        typename CanCreateLeafSplitFunc,
+        typename ProgressMonitor>
+
+        class BuilderT
+        {
+          friend struct GeneralBVHBuilder;
+
+          BuilderT (PrimRef* prims, // private constructor; GeneralBVHBuilder is declared a friend of this class
+                    Heuristic& heuristic,
+                    const CreateAllocFunc& createAlloc,
+                    const CreateNodeFunc& createNode,
+                    const UpdateNodeFunc& updateNode,
+                    const CreateLeafFunc& createLeaf,
+                    const CanCreateLeafFunc& canCreateLeaf,
+                    const CanCreateLeafSplitFunc& canCreateLeafSplit,
+                    const ProgressMonitor& progressMonitor,
+                    const Settings& settings) :
+                    cfg(settings),
+                    prims(prims),
+                    heuristic(heuristic),
+                    createAlloc(createAlloc),
+                    createNode(createNode),
+                    updateNode(updateNode),
+                    createLeaf(createLeaf),
+                    canCreateLeaf(canCreateLeaf),
+                    canCreateLeafSplit(canCreateLeafSplit),
+                    progressMonitor(progressMonitor)
+          {
+            if (cfg.branchingFactor > MAX_BRANCHING_FACTOR) // createLargeLeaf keeps its children in arrays of MAX_BRANCHING_FACTOR entries
+              throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large");
+          }
+
+          const ReductionTy createLargeLeaf(const BuildRecord& current, Allocator alloc) // creates a leaf, or a subtree of leaves when the set is too large or canCreateLeaf rejects it
+          {
+            /* this should never occur but is a fatal error */
+            if (current.depth > cfg.maxDepth)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+            /* create leaf for few primitives */
+            if (current.prims.size() <= cfg.maxLeafSize && canCreateLeaf(prims,current.prims))
+              return createLeaf(prims,current.prims,alloc);
+
+            /* fill all children by always splitting the largest one */
+            ReductionTy values[MAX_BRANCHING_FACTOR];
+            BuildRecord children[MAX_BRANCHING_FACTOR];
+            size_t numChildren = 1;
+            children[0] = current;
+            do {
+
+              /* find the splittable child with the largest number of primitives */
+              size_t bestChild = -1;
+              size_t bestSize = 0;
+              for (size_t i=0; i<numChildren; i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].prims.size() <= cfg.maxLeafSize && canCreateLeaf(prims,children[i].prims))
+                  continue;
+
+                /* remember child with largest size */
+                if (children[i].prims.size() > bestSize) {
+                  bestSize = children[i].prims.size();
+                  bestChild = i;
+                }
+              }
+              if (bestChild == (size_t)-1) break;
+
+              /*! split best child into left and right child: via canCreateLeafSplit if no leaf may be created from it, via the fallback split otherwise */
+              BuildRecord left(current.depth+1);
+              BuildRecord right(current.depth+1);
+              if (!canCreateLeaf(prims,children[bestChild].prims)) {
+                canCreateLeafSplit(prims,children[bestChild].prims,left.prims,right.prims);
+              } else {
+                heuristic.splitFallback(children[bestChild].prims,left.prims,right.prims);
+              }
+
+              /* add new children left and right: the last child moves into the freed slot, left and right go to the end */
+              children[bestChild] = children[numChildren-1];
+              children[numChildren-1] = left;
+              children[numChildren+0] = right;
+              numChildren++;
+
+            } while (numChildren < cfg.branchingFactor);
+
+            /* set barrier for primrefarrayalloc */
+            if (unlikely(current.size() > cfg.primrefarrayalloc))
+              for (size_t i=0; i<numChildren; i++)
+                children[i].alloc_barrier = children[i].size() <= cfg.primrefarrayalloc;
+
+            /* create node */
+            auto node = createNode(children,numChildren,alloc);
+
+            /* recurse into each child  and perform reduction */
+            for (size_t i=0; i<numChildren; i++)
+              values[i] = createLargeLeaf(children[i],alloc);
+
+            /* perform reduction */
+            return updateNode(current,children,node,values,numChildren);
+          }
+
+          const ReductionTy recurse(BuildRecord& current, Allocator alloc, bool toplevel)
+          { /* builds the subtree for 'current' using SAH splits and returns its reduction value; large records build their children via parallel_for */
+            /* get thread local allocator (created here when the caller passed none) */
+            if (!alloc)
+              alloc = createAlloc();
+
+            /* call memory monitor function to signal progress (only for top-level records at or below singleThreadThreshold) */
+            if (toplevel && current.size() <= cfg.singleThreadThreshold)
+              progressMonitor(current.size());
+
+            /*! find best split */
+            auto split = heuristic.find(current.prims,cfg.logBlockSize);
+
+            /*! compute leaf and split cost */
+            const float leafSAH  = cfg.intCost*current.prims.leafSAH(cfg.logBlockSize);
+            const float splitSAH = cfg.travCost*halfArea(current.prims.geomBounds)+cfg.intCost*split.splitSAH();
+            assert((current.prims.size() == 0) || ((leafSAH >= 0) && (splitSAH >= 0)));
+
+            /*! create a leaf node when too few primitives remain, the depth limit is near, or a leaf is allowed and not costlier than the split */
+            if (current.prims.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.prims.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) {
+              heuristic.deterministic_order(current.prims);
+              return createLargeLeaf(current,alloc);
+            }
+
+            /*! perform initial split */
+            Set lprims,rprims;
+            heuristic.split(split,current.prims,lprims,rprims);
+	    
+            /*! initialize child list with initial split */
+            ReductionTy values[MAX_BRANCHING_FACTOR];
+            BuildRecord children[MAX_BRANCHING_FACTOR];
+            children[0] = BuildRecord(current.depth+1,lprims);
+            children[1] = BuildRecord(current.depth+1,rprims);
+            size_t numChildren = 2;
+
+            /*! split until node is full or no child has more than minLeafSize primitives */
+            while (numChildren < cfg.branchingFactor)
+            {
+              /*! find best child to split */
+              float bestArea = neg_inf;
+              ssize_t bestChild = -1; // -1 means "no child can be split further"
+              for (size_t i=0; i<numChildren; i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].prims.size() <= cfg.minLeafSize) continue;
+
+                /* find child with largest surface area */
+                if (halfArea(children[i].prims.geomBounds) > bestArea) {
+                  bestChild = i;
+                  bestArea = halfArea(children[i].prims.geomBounds);
+                }
+              }
+              if (bestChild == -1) break;
+
+              /* perform best found split */
+              BuildRecord& brecord = children[bestChild];
+              BuildRecord lrecord(current.depth+1);
+              BuildRecord rrecord(current.depth+1);
+              auto split = heuristic.find(brecord.prims,cfg.logBlockSize); // shadows the 'split' computed for 'current'
+              heuristic.split(split,brecord.prims,lrecord.prims,rrecord.prims);
+              children[bestChild  ] = lrecord;
+              children[numChildren] = rrecord;
+              numChildren++;
+            }
+
+            /* set barrier for primrefarrayalloc: flag children at or below the threshold when 'current' is above it */
+            if (unlikely(current.size() > cfg.primrefarrayalloc))
+              for (size_t i=0; i<numChildren; i++)
+                children[i].alloc_barrier = children[i].size() <= cfg.primrefarrayalloc;
+
+            /* sort buildrecords for faster shadow ray traversal (descending by BuildRecord's operator>) */
+            std::sort(&children[0],&children[numChildren],std::greater<BuildRecord>());
+
+            /*! create an inner node */
+            auto node = createNode(children,numChildren,alloc);
+
+            /* spawn tasks */
+            if (current.size() > cfg.singleThreadThreshold)
+            {
+              /*! parallel_for is faster than spawning sub-tasks */
+              parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { // FIXME: no range here
+                  for (size_t i=r.begin(); i<r.end(); i++) {
+                    values[i] = recurse(children[i],nullptr,true); // nullptr: the callee creates its own allocator
+                    _mm_mfence(); // to allow non-temporal stores during build
+                  }
+                });
+
+              return updateNode(current,children,node,values,numChildren);
+            }
+            /* recurse into each child sequentially, reusing this call's allocator */
+            else
+            {
+              for (size_t i=0; i<numChildren; i++)
+                values[i] = recurse(children[i],alloc,false);
+
+              return updateNode(current,children,node,values,numChildren);
+            }
+          }
+
+        private:
+          Settings cfg;                                      //!< build settings, held by value
+          PrimRef* prims;                                    //!< primitive array handed to the leaf callbacks
+          Heuristic& heuristic;                              //!< split heuristic (find/split/splitFallback/deterministic_order)
+          const CreateAllocFunc& createAlloc;                //!< held by reference: the functor must outlive the builder
+          const CreateNodeFunc& createNode;                  //!< held by reference: the functor must outlive the builder
+          const UpdateNodeFunc& updateNode;                  //!< held by reference: the functor must outlive the builder
+          const CreateLeafFunc& createLeaf;                  //!< held by reference: the functor must outlive the builder
+          const CanCreateLeafFunc& canCreateLeaf;            //!< held by reference: the functor must outlive the builder
+          const CanCreateLeafSplitFunc& canCreateLeafSplit;  //!< held by reference: the functor must outlive the builder
+          const ProgressMonitor& progressMonitor;            //!< held by reference: the functor must outlive the builder
+        };
+
+      template<
+      typename ReductionTy,
+        typename Heuristic,
+        typename Set,
+        typename PrimRef,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+        /*! builds the hierarchy over 'set' with the default canCreateLeaf/canCreateLeafSplit functors and returns the reduction value of the root */
+        __noinline static ReductionTy build(Heuristic& heuristic,
+                                            PrimRef* prims,
+                                            const Set& set,
+                                            CreateAllocFunc createAlloc,
+                                            CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+                                            const CreateLeafFunc& createLeaf,
+                                            const ProgressMonitor& progressMonitor,
+                                            const Settings& settings)
+      {
+        typedef BuildRecordT<Set,typename Heuristic::Split> BuildRecord;
+
+        typedef BuilderT<
+          BuildRecord,
+          Heuristic,
+          Set,
+          PrimRef,
+          ReductionTy,
+          decltype(createAlloc()),
+          CreateAllocFunc,
+          CreateNodeFunc,
+          UpdateNodeFunc,
+          CreateLeafFunc,
+          DefaultCanCreateLeafFunc<PrimRef, Set>,
+          DefaultCanCreateLeafSplitFunc<PrimRef, Set>,
+          ProgressMonitor> Builder;
+        /* instantiate builder; it only keeps references to its functors, so the defaults are named locals that outlive it */
+        const auto canCreateLeaf = DefaultCanCreateLeafFunc<PrimRef, Set>();
+        const auto canCreateLeafSplit = DefaultCanCreateLeafSplitFunc<PrimRef, Set>();
+        Builder builder(prims,
+                        heuristic,
+                        createAlloc,
+                        createNode,
+                        updateNode,
+                        createLeaf,
+                        canCreateLeaf,
+                        canCreateLeafSplit,
+                        progressMonitor,
+                        settings);
+        /* build hierarchy: root record at depth 1, no allocator yet, flagged as toplevel */
+        BuildRecord record(1,set);
+        const ReductionTy root = builder.recurse(record,nullptr,true);
+        _mm_mfence(); // to allow non-temporal stores during build
+        return root;
+      }
+
+      template<
+      typename ReductionTy,
+        typename Heuristic,
+        typename Set,
+        typename PrimRef,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename CanCreateLeafFunc,
+        typename CanCreateLeafSplitFunc,
+        typename ProgressMonitor>
+        /*! builds the hierarchy over 'set' with caller-provided canCreateLeaf/canCreateLeafSplit functors and returns the reduction value of the root */
+        __noinline static ReductionTy build(Heuristic& heuristic,
+                                            PrimRef* prims,
+                                            const Set& set,
+                                            CreateAllocFunc createAlloc,
+                                            CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+                                            const CreateLeafFunc& createLeaf,
+                                            const CanCreateLeafFunc& canCreateLeaf,
+                                            const CanCreateLeafSplitFunc& canCreateLeafSplit,
+                                            const ProgressMonitor& progressMonitor,
+                                            const Settings& settings)
+      {
+        typedef BuildRecordT<Set,typename Heuristic::Split> BuildRecord;
+
+        typedef BuilderT<
+          BuildRecord,
+          Heuristic,
+          Set,
+          PrimRef,
+          ReductionTy,
+          decltype(createAlloc()),
+          CreateAllocFunc,
+          CreateNodeFunc,
+          UpdateNodeFunc,
+          CreateLeafFunc,
+          CanCreateLeafFunc,
+          CanCreateLeafSplitFunc,
+          ProgressMonitor> Builder;
+
+        /* instantiate builder (all functors passed here outlive it: they are parameters of this function) */
+        Builder builder(prims,
+                        heuristic,
+                        createAlloc,
+                        createNode,
+                        updateNode,
+                        createLeaf,
+                        canCreateLeaf,
+                        canCreateLeafSplit,
+                        progressMonitor,
+                        settings);
+
+        /* build hierarchy: root record at depth 1, no allocator yet, flagged as toplevel */
+        BuildRecord record(1,set);
+        const ReductionTy root = builder.recurse(record,nullptr,true);
+        _mm_mfence(); // to allow non-temporal stores during build
+        return root;
+      }
+    };
+
+    /* SAH builder that operates on an array of BuildRecords */
+    struct BVHBuilderBinnedSAH
+    {
+      typedef PrimInfoRange Set;
+      typedef HeuristicArrayBinningSAH<PrimRef,NUM_OBJECT_BINS> Heuristic;
+      typedef GeneralBVHBuilder::BuildRecordT<Set,typename Heuristic::Split> BuildRecord;
+      typedef GeneralBVHBuilder::Settings Settings;
+
+      /*! special builder that propagates reduction over the tree (default leaf-creation checks) */
+      template<
+      typename ReductionTy,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+        /*! forwards to GeneralBVHBuilder::build over the range [0,pinfo.size()) of prims */
+        static ReductionTy build(CreateAllocFunc createAlloc,
+                                 CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+                                 const CreateLeafFunc& createLeaf,
+                                 const ProgressMonitor& progressMonitor,
+                                 PrimRef* prims, const PrimInfo& pinfo,
+                                 const Settings& settings)
+      {
+        Heuristic heuristic(prims);
+        return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>(
+          heuristic,
+          prims,
+          PrimInfoRange(0,pinfo.size(),pinfo),
+          createAlloc,
+          createNode,
+          updateNode,
+          createLeaf,
+          progressMonitor,
+          settings);
+      }
+
+      /*! special builder that propagates reduction over the tree (caller-provided canCreateLeaf/canCreateLeafSplit) */
+      template<
+      typename ReductionTy,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename CanCreateLeafFunc,
+        typename CanCreateLeafSplitFunc,
+        typename ProgressMonitor>
+        /*! forwards to GeneralBVHBuilder::build over the range [0,pinfo.size()) of prims */
+        static ReductionTy build(CreateAllocFunc createAlloc,
+                                 CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+                                 const CreateLeafFunc& createLeaf,
+                                 const CanCreateLeafFunc& canCreateLeaf,
+                                 const CanCreateLeafSplitFunc& canCreateLeafSplit,
+                                 const ProgressMonitor& progressMonitor,
+                                 PrimRef* prims, const PrimInfo& pinfo,
+                                 const Settings& settings)
+      {
+        Heuristic heuristic(prims);
+        return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>(
+          heuristic,
+          prims,
+          PrimInfoRange(0,pinfo.size(),pinfo),
+          createAlloc,
+          createNode,
+          updateNode,
+          createLeaf,
+          canCreateLeaf,
+          canCreateLeafSplit,
+          progressMonitor,
+          settings);
+      }
+    };
+
+    /* Spatial SAH builder that operates on a double-buffered array of BuildRecords */
+    struct BVHBuilderBinnedFastSpatialSAH
+    {
+      typedef PrimInfoExtRange Set;
+      typedef Split2<BinSplit<NUM_OBJECT_BINS>,SpatialBinSplit<NUM_SPATIAL_BINS> > Split;
+      typedef GeneralBVHBuilder::BuildRecordT<Set,Split> BuildRecord;
+      typedef GeneralBVHBuilder::Settings Settings;
+
+      static const unsigned int GEOMID_MASK = 0xFFFFFFFF >>     RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;  // low bits of lower.u, kept when a leaf is created
+      static const unsigned int SPLITS_MASK = 0xFFFFFFFF << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); // high bits of lower.u, hold the per-primitive split count
+      /*! leaf functor that masks lower.u of every primitive in the range with GEOMID_MASK before calling the user's createLeaf */
+      template<typename ReductionTy, typename UserCreateLeaf>
+      struct CreateLeafExt
+      {
+        __forceinline CreateLeafExt (const UserCreateLeaf userCreateLeaf)
+          : userCreateLeaf(userCreateLeaf) {}
+
+        // __noinline is workaround for ICC2016 compiler bug
+        template<typename Allocator>
+        __noinline ReductionTy operator() (PrimRef* prims, const range<size_t>& range, Allocator alloc) const
+        {
+          for (size_t i=range.begin(); i<range.end(); i++)
+            prims[i].lower.u &= GEOMID_MASK;
+
+          return userCreateLeaf(prims,range,alloc);
+        }
+
+        const UserCreateLeaf userCreateLeaf; // stored by value
+      };
+
+      /*! special builder that propagates reduction over the tree */
+      template<
+      typename ReductionTy,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename SplitPrimitiveFunc,
+        typename ProgressMonitor>
+        /*! writes a split count into the high bits of each primitive's lower.u, then forwards to GeneralBVHBuilder::build */
+        static ReductionTy build(CreateAllocFunc createAlloc,
+                                 CreateNodeFunc createNode,
+                                 UpdateNodeFunc updateNode,
+                                 const CreateLeafFunc& createLeaf,
+                                 SplitPrimitiveFunc splitPrimitive,
+                                 ProgressMonitor progressMonitor,
+                                 PrimRef* prims,
+                                 const size_t extSize,
+                                 const PrimInfo& pinfo,
+                                 const Settings& settings)
+        {
+          typedef HeuristicArraySpatialSAH<SplitPrimitiveFunc,PrimRef,NUM_OBJECT_BINS,NUM_SPATIAL_BINS> Heuristic;
+          Heuristic heuristic(splitPrimitive,prims,pinfo);
+
+          /* calculate total surface area */ // FIXME: this sum is not deterministic
+          const float A = (float) parallel_reduce(size_t(0),pinfo.size(),0.0, [&] (const range<size_t>& r) -> double {
+
+              double A = 0.0f;
+              for (size_t i=r.begin(); i<r.end(); i++)
+              {
+                PrimRef& prim = prims[i];
+                A += area(prim.bounds());
+              }
+              return A;
+            },std::plus<double>());
+
+
+          /* calculate maximum number of spatial splits per primitive */
+          const unsigned int maxSplits = ((size_t)1 << RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)-1;
+          const float f = 10.0f; // scale factor of the area-proportional heuristic below
+
+          const float invA = 1.0f / A; // inf when A == 0; the resulting NaN/inf values of nf are handled below
+          parallel_for( size_t(0), pinfo.size(), [&](const range<size_t>& r) {
+
+              for (size_t i=r.begin(); i<r.end(); i++)
+              {
+                PrimRef& prim = prims[i];
+                assert((prim.geomID() & SPLITS_MASK) == 0);
+                // FIXME: is there a better general heuristic ?
+                const float nf = ceilf(f*pinfo.size()*area(prim.bounds()) * invA);
+                unsigned int n = 4+min((int)maxSplits-4, (nf >= 1.0f) ? ((nf < (float)maxSplits) ? (int)nf : (int)maxSplits) : 1); // clamp as float first: casting NaN/inf/huge nf to int is undefined
+                prim.lower.u |= n << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
+              }
+            });
+
+          return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>(
+            heuristic,
+            prims,
+            PrimInfoExtRange(0,pinfo.size(),extSize,pinfo),
+            createAlloc,
+            createNode,
+            updateNode,
+            CreateLeafExt<ReductionTy,CreateLeafFunc>(createLeaf),
+            progressMonitor,
+            settings);
+        }
+    };
+
+    /* Open/Merge SAH builder that operates on an array of BuildRecords */
+    struct BVHBuilderBinnedOpenMergeSAH
+    {
+      static const size_t NUM_OBJECT_BINS_HQ = 32; // bin count used by this builder's split type and heuristic
+      typedef PrimInfoExtRange Set;
+      typedef BinSplit<NUM_OBJECT_BINS_HQ> Split;
+      typedef GeneralBVHBuilder::BuildRecordT<Set,Split> BuildRecord;
+      typedef GeneralBVHBuilder::Settings Settings;
+      
+      /*! special builder that propagates reduction over the tree */
+      template<
+        typename ReductionTy, 
+        typename BuildRef,
+        typename CreateAllocFunc, 
+        typename CreateNodeFunc, 
+        typename UpdateNodeFunc, 
+        typename CreateLeafFunc, 
+        typename NodeOpenerFunc, 
+        typename ProgressMonitor>
+        /*! sets up the open/merge heuristic from nodeOpenerFunc and settings.branchingFactor, then forwards to GeneralBVHBuilder::build */
+        static ReductionTy build(CreateAllocFunc createAlloc, 
+                                 CreateNodeFunc createNode, 
+                                 UpdateNodeFunc updateNode, 
+                                 const CreateLeafFunc& createLeaf, 
+                                 NodeOpenerFunc nodeOpenerFunc,
+                                 ProgressMonitor progressMonitor,
+                                 BuildRef* prims, 
+                                 const size_t extSize,
+                                 const PrimInfo& pinfo, 
+                                 const Settings& settings)
+      {
+        typedef HeuristicArrayOpenMergeSAH<NodeOpenerFunc,BuildRef,NUM_OBJECT_BINS_HQ> Heuristic;
+        Heuristic heuristic(nodeOpenerFunc,prims,settings.branchingFactor);
+
+        return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,BuildRef>(
+          heuristic,
+          prims,
+          PrimInfoExtRange(0,pinfo.size(),extSize,pinfo),
+          createAlloc,
+          createNode,
+          updateNode,
+          createLeaf,
+          progressMonitor,
+          settings);
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h
new file mode 100644
index 0000000000..a4d3b68e46
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h
@@ -0,0 +1,972 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "priminfo.h"
+#include "../../common/algorithms/parallel_reduce.h"
+#include "../../common/algorithms/parallel_partition.h"
+
+namespace embree
+{
+  namespace isa
+  { 
+    /*! mapping into bins: linear per-dimension map (p-ofs)*scale from positions to integer bin IDs */
+    template<size_t BINS>
+      struct BinMapping
+      {
+      public:
+        __forceinline BinMapping() {}
+        
+        /*! calculates the mapping, deriving the number of bins from the primitive count N */
+        __forceinline BinMapping(size_t N, const BBox3fa& centBounds) 
+        {
+          num = min(BINS,size_t(4.0f + 0.05f*N)); // 4 bins plus one per 20 primitives, capped at BINS
+          assert(num >= 1);
+          const vfloat4 eps = 1E-34f;
+          const vfloat4 diag = max(eps, (vfloat4) centBounds.size());
+          scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); // scale is 0 where the extent does not exceed eps
+          ofs  = (vfloat4) centBounds.lower;
+        }
+
+        /*! calculates the mapping, always using all BINS bins */
+        __forceinline BinMapping(const BBox3fa& centBounds) 
+        {
+          num = BINS;
+          const vfloat4 eps = 1E-34f;
+          const vfloat4 diag = max(eps, (vfloat4) centBounds.size());
+          scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); // scale is 0 where the extent does not exceed eps
+          ofs  = (vfloat4) centBounds.lower;
+        }
+
+        /*! calculates the mapping from pinfo.size() and pinfo.centBounds */
+        template<typename PrimInfo>
+        __forceinline BinMapping(const PrimInfo& pinfo) 
+        {
+          const vfloat4 eps = 1E-34f;
+          num = min(BINS,size_t(4.0f + 0.05f*pinfo.size())); // 4 bins plus one per 20 primitives, capped at BINS
+          const vfloat4 diag = max(eps,(vfloat4) pinfo.centBounds.size());
+          scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); // scale is 0 where the extent does not exceed eps
+          ofs  = (vfloat4) pinfo.centBounds.lower;
+        }
+
+        /*! returns number of bins */
+        __forceinline size_t size() const { return num; }
+        
+        /*! slower but safe binning (as configured via '#if 1': range-checked by asserts only; the clamping variant is disabled) */
+        __forceinline Vec3ia bin(const Vec3fa& p) const 
+        {
+          const vint4 i = floori((vfloat4(p)-ofs)*scale);
+#if 1
+          assert(i[0] >= 0 && (size_t)i[0] < num); 
+          assert(i[1] >= 0 && (size_t)i[1] < num);
+          assert(i[2] >= 0 && (size_t)i[2] < num);
+          return Vec3ia(i);
+#else
+          return Vec3ia(clamp(i,vint4(0),vint4(num-1)));
+#endif
+        }
+
+        /*! faster but unsafe binning (no range check at all) */
+        __forceinline Vec3ia bin_unsafe(const Vec3fa& p) const {
+          return Vec3ia(floori((vfloat4(p)-ofs)*scale));
+        }
+
+        /*! faster but unsafe binning of a primitive's binCenter() */
+        template<typename PrimRef>
+        __forceinline Vec3ia bin_unsafe(const PrimRef& p) const {
+          return bin_unsafe(p.binCenter());
+        }
+
+        /*! faster but unsafe binning of the center computed by binBoundsAndCenter.binCenter(p) */
+        template<typename PrimRef, typename BinBoundsAndCenter>
+        __forceinline Vec3ia bin_unsafe(const PrimRef& p, const BinBoundsAndCenter& binBoundsAndCenter) const {
+          return bin_unsafe(binBoundsAndCenter.binCenter(p));
+        }
+        /*! returns true if the bin of center2(ref.bounds()) lies below vSplitPos in any dimension enabled in splitDimMask */
+        template<typename PrimRef>
+        __forceinline bool bin_unsafe(const PrimRef& ref,
+                                      const vint4&   vSplitPos,
+                                      const vbool4&  splitDimMask) const // FIXME: rename to isLeft
+        {
+          return any(((vint4)bin_unsafe(center2(ref.bounds())) < vSplitPos) & splitDimMask);
+        }
+        /*! calculates left spatial position of bin: inverts the mapping, dividing by scale[dim] (zero for invalid dimensions) */
+        __forceinline float pos(const size_t bin, const size_t dim) const {
+          return madd(float(bin),1.0f / scale[dim],ofs[dim]);
+        }
+
+        /*! returns true if the mapping is invalid in dimension dim, i.e. its scale is zero */
+        __forceinline bool invalid(const size_t dim) const {
+          return scale[dim] == 0.0f;
+        }
+        
+        /*! stream output */
+        friend embree_ostream operator<<(embree_ostream cout, const BinMapping& mapping) {
+          return cout << "BinMapping { num = " << mapping.num << ", ofs = " << mapping.ofs << ", scale = " << mapping.scale << "}";
+        }
+        
+      public:
+        size_t num;               //!< number of bins in use (at most BINS)
+        vfloat4 ofs,scale;        //!< linear function that maps to bin ID
+      };
+    
+    /*! stores all information to perform some split */
+    template<size_t BINS>
+      struct BinSplit
+      {
+        enum
+        {
+          SPLIT_OBJECT   = 0,
+          SPLIT_FALLBACK = 1,
+          SPLIT_ENFORCE  = 2, // splits with larger ID are enforced in createLargeLeaf even if we could create a leaf already
+          SPLIT_TEMPORAL = 2, // intentionally the same value as SPLIT_ENFORCE
+          SPLIT_GEOMID   = 3,
+        };
+
+        /*! construct an invalid split by default (dim == -1, sah == inf; mapping is default-constructed) */
+        __forceinline BinSplit()
+          : sah(inf), dim(-1), pos(0), data(0) {}
+        /*! constructs a split from cost, extra data, and optionally dimension and float position; mapping is default-constructed */
+        __forceinline BinSplit(float sah, unsigned data, int dim = 0, float fpos = 0)
+          : sah(sah), dim(dim), fpos(fpos), data(data) {}
+        
+        /*! constructs specified split at bin index pos of the given mapping */
+        __forceinline BinSplit(float sah, int dim, int pos, const BinMapping<BINS>& mapping)
+          : sah(sah), dim(dim), pos(pos), data(0), mapping(mapping) {}
+        
+        /*! tests if this split is valid */
+        __forceinline bool valid() const { return dim != -1; }
+        
+        /*! calculates surface area heuristic for performing the split (returns the stored cost) */
+        __forceinline float splitSAH() const { return sah; }
+        
+        /*! stream output (prints the integer member 'pos' of the pos/fpos union) */
+        friend embree_ostream operator<<(embree_ostream cout, const BinSplit& split) {
+          return cout << "BinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << "}";
+        }
+        
+      public:
+        float sah;                //!< SAH cost of the split
+        int dim;                  //!< split dimension
+        union { int pos; float fpos; };                  //!< bin index (pos) or float position (fpos) for splitting
+        unsigned int data;        //!< extra optional split data
+        BinMapping<BINS> mapping; //!< mapping into bins
+      };
+    
+    /*! stores extended information about the split: a count and a bounding box for each side */
+    template<typename BBox>
+      struct SplitInfoT
+    {
+      /*! default construction leaves counts uninitialized */
+      __forceinline SplitInfoT () {}
+      
+      __forceinline SplitInfoT (size_t leftCount, const BBox& leftBounds, size_t rightCount, const BBox& rightBounds)
+	: leftCount(leftCount), rightCount(rightCount), leftBounds(leftBounds), rightBounds(rightBounds) {}
+      
+    public:
+      size_t leftCount,rightCount; //!< counts for the left and right side of the split
+      BBox leftBounds,rightBounds; //!< bounds for the left and right side of the split
+    };
+
+    typedef SplitInfoT<BBox3fa> SplitInfo;
+    typedef SplitInfoT<LBBox3fa> SplitInfo2;
+    
+    /*! stores all binning information */
+    template<size_t BINS, typename PrimRef, typename BBox>
+      struct __aligned(64) BinInfoT
+    {		  
+      typedef BinSplit<BINS> Split;
+      typedef vbool4 vbool;
+      typedef vint4 vint;
+      typedef vfloat4 vfloat;
+      
+      __forceinline BinInfoT() { // does not call clear()
+      }
+      
+      __forceinline BinInfoT(EmptyTy) { // starts with all bins cleared
+	clear();
+      }
+
+      /*! bin access functions: bounds and count of bin binID in dimension dimID */
+      __forceinline BBox &bounds(const size_t binID, const size_t dimID)             { return _bounds[binID][dimID]; }
+      __forceinline const BBox &bounds(const size_t binID, const size_t dimID) const { return _bounds[binID][dimID]; }
+
+      __forceinline unsigned int &counts(const size_t binID, const size_t dimID)             { return _counts[binID][dimID]; }
+      __forceinline const unsigned int &counts(const size_t binID, const size_t dimID) const { return _counts[binID][dimID]; }
+      /*! counts of bin binID for all dimensions as one vector */
+      __forceinline vuint4 &counts(const size_t binID)             { return _counts[binID]; }
+      __forceinline const vuint4 &counts(const size_t binID) const { return _counts[binID]; }
+
+      /*! clears the bin info: sets the bounds of all BINS bins to empty and their counts to zero */
+      __forceinline void clear() 
+      {
+	for (size_t i=0; i<BINS; i++) {
+	  bounds(i,0) = bounds(i,1) = bounds(i,2) = empty;
+	  counts(i) = vuint4(zero);
+	}
+      }
+      
+      /*! bins an array of N primitives: adds each primitive's size() and bounds to the bin of its center, per dimension */
+      __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<BINS>& mapping)
+      {
+	if (unlikely(N == 0)) return;
+	size_t i; 
+	for (i=0; i<N-1; i+=2) // N != 0 here, so N-1 cannot wrap around; handles two primitives per iteration
+        {
+          /*! map even and odd primitive to bin */
+          BBox prim0; Vec3fa center0;
+          prims[i+0].binBoundsAndCenter(prim0,center0); 
+          const vint4 bin0 = (vint4)mapping.bin(center0); 
+          
+          BBox prim1; Vec3fa center1;
+          prims[i+1].binBoundsAndCenter(prim1,center1); 
+          const vint4 bin1 = (vint4)mapping.bin(center1); 
+          
+          /*! increase bounds and counts of bins for even primitive */
+          const unsigned int b00 = extract<0>(bin0); bounds(b00,0).extend(prim0); 
+          const unsigned int b01 = extract<1>(bin0); bounds(b01,1).extend(prim0); 
+          const unsigned int b02 = extract<2>(bin0); bounds(b02,2).extend(prim0); 
+          const unsigned int s0 = (unsigned int)prims[i+0].size();
+          counts(b00,0)+=s0;
+          counts(b01,1)+=s0;
+          counts(b02,2)+=s0;
+
+          /*! increase bounds and counts of bins for odd primitive */
+          const unsigned int b10 = extract<0>(bin1);  bounds(b10,0).extend(prim1); 
+          const unsigned int b11 = extract<1>(bin1);  bounds(b11,1).extend(prim1); 
+          const unsigned int b12 = extract<2>(bin1);  bounds(b12,2).extend(prim1); 
+          const unsigned int s1 = (unsigned int)prims[i+1].size();
+          counts(b10,0)+=s1;
+          counts(b11,1)+=s1;
+          counts(b12,2)+=s1;
+        }
+	/*! for uneven number of primitives: bin the one remaining primitive */
+	if (i < N)
+        {
+          /*! map primitive to bin */
+          BBox prim0; Vec3fa center0;
+          prims[i].binBoundsAndCenter(prim0,center0); 
+          const vint4 bin0 = (vint4)mapping.bin(center0); 
+          
+          /*! increase bounds and counts of bins */
+          const unsigned int s0 = (unsigned int)prims[i].size();
+          const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0);
+          const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0);
+          const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0);
+        }
+      }
+
+      /*! bins an array of N primitives, taking bounds and center from binBoundsAndCenter instead of the primitive itself */
+      template<typename BinBoundsAndCenter>
+        __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<BINS>& mapping, const BinBoundsAndCenter& binBoundsAndCenter)
+      {
+	if (N == 0) return;
+        
+	size_t i; 
+	for (i=0; i<N-1; i+=2) // N != 0 here, so N-1 cannot wrap around; handles two primitives per iteration
+        {
+          /*! map even and odd primitive to bin */
+          BBox prim0; Vec3fa center0; binBoundsAndCenter.binBoundsAndCenter(prims[i+0],prim0,center0); 
+          const vint4 bin0 = (vint4)mapping.bin(center0); 
+          BBox prim1; Vec3fa center1; binBoundsAndCenter.binBoundsAndCenter(prims[i+1],prim1,center1); 
+          const vint4 bin1 = (vint4)mapping.bin(center1); 
+          
+          /*! increase bounds and counts of bins for even primitive */
+          const unsigned int s0 = prims[i+0].size();
+          const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0);
+          const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0);
+          const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0);
+          
+          /*! increase bounds and counts of bins for odd primitive */
+          const unsigned int s1 = prims[i+1].size();
+          const int b10 = extract<0>(bin1); counts(b10,0)+=s1; bounds(b10,0).extend(prim1);
+          const int b11 = extract<1>(bin1); counts(b11,1)+=s1; bounds(b11,1).extend(prim1);
+          const int b12 = extract<2>(bin1); counts(b12,2)+=s1; bounds(b12,2).extend(prim1);
+        }
+	
+	/*! for uneven number of primitives: bin the one remaining primitive */
+	if (i < N)
+        {
+          /*! map primitive to bin */
+          BBox prim0; Vec3fa center0; binBoundsAndCenter.binBoundsAndCenter(prims[i+0],prim0,center0); 
+          const vint4 bin0 = (vint4)mapping.bin(center0); 
+          
+          /*! increase bounds and counts of bins */
+          const unsigned int s0 = prims[i+0].size();
+          const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0);
+          const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0);
+          const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0);
+        }
+      }
+      
+      __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<BINS>& mapping) {
+        const size_t N = end-begin; bin(prims+begin,N,mapping); // bins prims[begin..end) by handing it on as pointer + count
+      }
+
+      template<typename BinBoundsAndCenter>
+        __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<BINS>& mapping, const BinBoundsAndCenter& binBoundsAndCenter) {
+        const size_t N = end-begin; bin<BinBoundsAndCenter>(prims+begin,N,mapping,binBoundsAndCenter); // bins prims[begin..end) as pointer + count, passing the caller's functor through
+      }
+
+      /*! Folds the statistics of another binner into this one: for each of the first numBins bins the counts are added and the bounds of all three dimensions are extended. */
+      __forceinline void merge (const BinInfoT& other, size_t numBins)
+      {
+        size_t binID = 0;
+        while (binID < numBins)
+        {
+          counts(binID) += other.counts(binID);
+          for (size_t dim=0; dim<3; dim++)
+            bounds(binID,dim).extend(other.bounds(binID,dim));
+          binID++;
+        }
+      }
+
+      /*! Combines two binners into a new one: for each of the first numBins bins the counts are summed and the per-dimension bounds are merged; a and b are left untouched. */
+      static __forceinline const BinInfoT reduce (const BinInfoT& a, const BinInfoT& b, const size_t numBins = BINS)
+      {
+        BinInfoT result;
+        for (size_t binID=0; binID<numBins; ++binID)
+        {
+          result.counts(binID) = a.counts(binID)+b.counts(binID);
+          for (size_t dim=0; dim<3; ++dim) {
+            result.bounds(binID,dim) = embree::merge(a.bounds(binID,dim),b.bounds(binID,dim));
+          }
+        }
+        return result;
+      }
+      
+      /*! Finds the best split by scanning the binning information: the SAH cost of every bin boundary is evaluated for all three dimensions at once and the cheapest boundary is returned as Split(sah,dim,pos,mapping); dim stays -1 when no boundary qualifies. */
+      __forceinline Split best(const BinMapping<BINS>& mapping, const size_t blocks_shift) const
+      {
+	/* sweep from right to left: rCounts[i] and rAreas[i] accumulate the counts and the merged-bounds area of bins i..size()-1 (index 0 is never written) */
+	vfloat4 rAreas[BINS];
+	vuint4 rCounts[BINS];
+	vuint4 count = 0; BBox bx = empty; BBox by = empty; BBox bz = empty;
+	for (size_t i=mapping.size()-1; i>0; i--)
+        {
+          count += counts(i);
+          rCounts[i] = count;
+          bx.extend(bounds(i,0)); rAreas[i][0] = expectedApproxHalfArea(bx);
+          by.extend(bounds(i,1)); rAreas[i][1] = expectedApproxHalfArea(by);
+          bz.extend(bounds(i,2)); rAreas[i][2] = expectedApproxHalfArea(bz);
+          rAreas[i][3] = 0.0f; // lane 3 belongs to no dimension; the final selection only looks at lanes 0..2
+        }
+	/* sweep from left to right and compute SAH: at boundary i the left side is bins 0..i-1, the right side is bins i..size()-1 */
+	vuint4 blocks_add = (1 << blocks_shift)-1; // added before the shift below, so counts are rounded up to whole blocks of 2^blocks_shift
+	vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; 
+	count = 0; bx = empty; by = empty; bz = empty;
+	for (size_t i=1; i<mapping.size(); i++, ii+=1)
+        {
+          count += counts(i-1);
+          bx.extend(bounds(i-1,0)); float Ax = expectedApproxHalfArea(bx);
+          by.extend(bounds(i-1,1)); float Ay = expectedApproxHalfArea(by);
+          bz.extend(bounds(i-1,2)); float Az = expectedApproxHalfArea(bz);
+          const vfloat4 lArea = vfloat4(Ax,Ay,Az,Az);
+          const vfloat4 rArea = rAreas[i];
+          const vuint4 lCount = (count     +blocks_add) >> (unsigned int)(blocks_shift); // if blocks_shift >=1 then lCount < 4B and could be represented with an vint4, which would allow for faster vfloat4 conversions.
+          const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift);
+          const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount));
+          //const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount)));
+
+          vbestPos = select(sah < vbestSAH,ii ,vbestPos); // per lane: remember the boundary index ii whenever its cost beats the best so far
+          vbestSAH = select(sah < vbestSAH,sah,vbestSAH);
+        }
+	
+	/* find best dimension: pick the cheapest of the three per-lane winners */
+	float bestSAH = inf;
+	int   bestDim = -1;
+	int   bestPos = 0;
+	for (int dim=0; dim<3; dim++) 
+        {
+          /* ignore zero sized dimensions */
+          if (unlikely(mapping.invalid(dim)))
+            continue;
+          
+          /* test if this is a better dimension; a position of 0 means no boundary was ever selected in this lane */
+          if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) {
+            bestDim = dim;
+            bestPos = vbestPos[dim];
+            bestSAH = vbestSAH[dim];
+          }
+        }
+	return Split(bestSAH,bestDim,bestPos,mapping);
+      }
+      
+      /*! Fills info with the count and merged bounds on either side of split: bins [0,split.pos) form the left part, bins [split.pos,mapping.size()) the right part; an invalid split (dim == -1) yields two empty parts. */
+      __forceinline void getSplitInfo(const BinMapping<BINS>& mapping, const Split& split, SplitInfoT<BBox>& info) const 
+      {
+        if (split.dim == -1) {
+          new (&info) SplitInfoT<BBox>(0,empty,0,empty);
+          return;
+        }
+
+        /* accumulate everything in front of the split position ... */
+        size_t numLeft = 0; BBox boundsLeft = empty;
+        for (size_t binID=0; binID<(size_t)split.pos; ++binID) {
+          numLeft += counts(binID,split.dim);
+          boundsLeft.extend(bounds(binID,split.dim));
+        }
+        /* ... and everything from the split position onwards */
+        size_t numRight = 0; BBox boundsRight = empty;
+        for (size_t binID=split.pos; binID<mapping.size(); ++binID) {
+          numRight += counts(binID,split.dim);
+          boundsRight.extend(bounds(binID,split.dim));
+        }
+        new (&info) SplitInfoT<BBox>(numLeft,boundsLeft,numRight,boundsRight);
+      }
+
+      /*! Sums the counts of bins [0,split.pos) in the split dimension, i.e. the number of primitives left of the split; an invalid split (dim == -1) returns size_t(-1). */
+      __forceinline size_t getLeftCount(const BinMapping<BINS>& mapping, const Split& split) const
+      {
+        /* no valid split available */
+        if (unlikely(split.dim == -1)) return size_t(-1);
+        const size_t numLeftBins = (size_t)split.pos;
+        size_t total = 0;
+        for (size_t binID = 0; binID < numLeftBins; ++binID)
+          total += counts(binID, split.dim);
+        return total;
+      }
+
+      /*! Sums the counts of bins [split.pos,mapping.size()) in the split dimension, i.e. the number of primitives right of the split; an invalid split (dim == -1) returns size_t(-1). */
+      __forceinline size_t getRightCount(const BinMapping<BINS>& mapping, const Split& split) const
+      {
+        /* no valid split available */
+        if (unlikely(split.dim == -1)) return size_t(-1);
+        const size_t numBins = mapping.size();
+        size_t total = 0;
+        for (size_t binID = (size_t)split.pos; binID < numBins; ++binID)
+          total += counts(binID, split.dim);
+        return total;
+      }
+
+    private:
+      BBox _bounds[BINS][3]; //!< geometry bounds for each bin in each dimension
+      vuint4   _counts[BINS];    //!< counts number of primitives that map into the bins
+    };
+
+#if defined(__AVX512ER__) // KNL
+
+   /*! Bin mapping specialized for exactly 16 bins; keeps the linear mapping both as 4-wide and as 16-wide vectors. */
+   template<>
+     struct BinMapping<16>
+   {
+   public:
+     __forceinline BinMapping() {} //!< performs no initialization of the members
+
+     /*! builds the mapping from pinfo.centBounds: ofs is its lower corner, scale is 0.99*16 divided by its size (0 where the size does not exceed 1E-34) */
+     template<typename PrimInfo>
+     __forceinline BinMapping(const PrimInfo& pinfo)
+     {
+       num = 16;
+       const vfloat4 tiny = 1E-34f;
+       const vfloat4 extent = max(tiny,(vfloat4) pinfo.centBounds.size());
+       ofs   = (vfloat4) pinfo.centBounds.lower;
+       scale = select(extent > tiny,vfloat4(0.99f*num)/extent,vfloat4(0.0f));
+       ofs16 = ofs;
+       scale16 = scale;
+     }
+
+     /*! number of bins this mapping distributes into */
+     __forceinline size_t size() const { return num; }
+     /*! bin indices of point p: floor((p-ofs)*scale), converted to a vint16 */
+     __forceinline vint16 bin16(const Vec3fa& p) const {
+       const vint4 bin4 = vint4(floori((vfloat4(p)-ofs)*scale)); return vint16(bin4);
+     }
+     /*! 16-wide variant: floor((p-ofs16)*scale16) */
+     __forceinline vint16 bin16(const vfloat16& p) const {
+       const vfloat16 rel = (p-ofs16)*scale16; return floori(rel);
+     }
+     /*! bins lower+upper of ref (not halved) and returns lt(splitDimMask,bin,vSplitPos) as an int; presumably the masked "bin < split position" test */
+     __forceinline int bin_unsafe(const PrimRef& ref,
+                                  const vint16&  vSplitPos,
+                                  const vbool16& splitDimMask) const // FIXME: rename to isLeft
+     {
+       const vfloat16 lo(*(vfloat4*)&ref.lower);
+       const vfloat16 hi(*(vfloat4*)&ref.upper);
+       const vfloat16 center2 = lo + hi;
+       const vint16 binID = floori((center2-ofs16)*scale16);
+       return lt(splitDimMask,binID,vSplitPos);
+     }
+
+     /*! true when the scale of dimension dim is exactly zero, i.e. the mapping is invalid in that dimension */
+     __forceinline bool invalid(const size_t dim) const {
+       const float s = scale[dim]; return s == 0.0f;
+     }
+
+    public:
+      size_t num;                //!< number of bins (set to 16 by the PrimInfo constructor)
+      vfloat4 ofs,scale;         //!< linear function that maps to bin ID
+      vfloat16 ofs16,scale16;    //!< the same linear function, converted to 16-wide vectors
+    };
+
+    /*! 16-bin in-register binner: keeps the bounds and counts of all 16 bins of a dimension in 16-wide vectors (lane index = bin index) */
+    template<typename PrimRef>
+      struct __aligned(64) BinInfoT<16,PrimRef,BBox3fa>
+    {
+      typedef BinSplit<16> Split;
+      typedef vbool16 vbool;
+      typedef vint16 vint;
+      typedef vfloat16 vfloat;
+      /*! default constructor; does not clear the bins */
+      __forceinline BinInfoT() {
+      }
+      /*! constructs a cleared binner */
+      __forceinline BinInfoT(EmptyTy) {
+	clear();
+      }
+
+      /*! resets every bin: lower bounds to +inf, upper bounds to -inf, counts to 0 */
+      __forceinline void clear()
+      {
+        lower[0] = lower[1] = lower[2] = pos_inf;
+        upper[0] = upper[1] = upper[2] = neg_inf;
+        count[0] = count[1] = count[2] = 0;
+      }
+
+      /*! per lane: dx*dy+dx*dz+dy*dz (assuming madd(a,b,c) is a*b+c) of the box given by reverse_prefix_min/max of the inputs; presumably the union of the bins from that lane to the last one */
+      static __forceinline vfloat16 prefix_area_rl(const vfloat16 min_x,
+                                                   const vfloat16 min_y,
+                                                   const vfloat16 min_z,
+                                                   const vfloat16 max_x,
+                                                   const vfloat16 max_y,
+                                                   const vfloat16 max_z)
+      {
+        const vfloat16 r_min_x = reverse_prefix_min(min_x);
+        const vfloat16 r_min_y = reverse_prefix_min(min_y);
+        const vfloat16 r_min_z = reverse_prefix_min(min_z);
+        const vfloat16 r_max_x = reverse_prefix_max(max_x);
+        const vfloat16 r_max_y = reverse_prefix_max(max_y);
+        const vfloat16 r_max_z = reverse_prefix_max(max_z);
+        const vfloat16 dx = r_max_x - r_min_x;
+        const vfloat16 dy = r_max_y - r_min_y;
+        const vfloat16 dz = r_max_z - r_min_z;
+        const vfloat16 area_rl = madd(dx,dy,madd(dx,dz,dy*dz));
+        return area_rl;
+      }
+      /*! same area term, but built from prefix_min/max of the inputs; presumably the union of the bins from the first one up to that lane */
+      static __forceinline vfloat16 prefix_area_lr(const vfloat16 min_x,
+                                                   const vfloat16 min_y,
+                                                   const vfloat16 min_z,
+                                                   const vfloat16 max_x,
+                                                   const vfloat16 max_y,
+                                                   const vfloat16 max_z)
+      {
+        const vfloat16 r_min_x = prefix_min(min_x);
+        const vfloat16 r_min_y = prefix_min(min_y);
+        const vfloat16 r_min_z = prefix_min(min_z);
+        const vfloat16 r_max_x = prefix_max(max_x);
+        const vfloat16 r_max_y = prefix_max(max_y);
+        const vfloat16 r_max_z = prefix_max(max_z);
+        const vfloat16 dx = r_max_x - r_min_x;
+        const vfloat16 dy = r_max_y - r_min_y;
+        const vfloat16 dz = r_max_z - r_min_z;
+        const vfloat16 area_lr = madd(dx,dy,madd(dx,dz,dy*dz));
+        return area_lr;
+      }
+
+
+      /*! Bins an array of N primitives: every primitive selects one bin per dimension from lower+upper of its bounds and extends that bin's bounds and count. NOTE: the result is written over lower/upper/count; previous contents are not accumulated. */
+      __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<16>& mapping)
+      {
+        if (unlikely(N == 0)) return;
+
+        const vfloat16 init_min(pos_inf);
+        const vfloat16 init_max(neg_inf);
+
+        vfloat16 min_x0,min_x1,min_x2;
+        vfloat16 min_y0,min_y1,min_y2;
+        vfloat16 min_z0,min_z1,min_z2;
+        vfloat16 max_x0,max_x1,max_x2;
+        vfloat16 max_y0,max_y1,max_y2;
+        vfloat16 max_z0,max_z1,max_z2;
+        vuint16 count0,count1,count2;
+
+        min_x0 = init_min;
+        min_x1 = init_min;
+        min_x2 = init_min;
+        min_y0 = init_min;
+        min_y1 = init_min;
+        min_y2 = init_min;
+        min_z0 = init_min;
+        min_z1 = init_min;
+        min_z2 = init_min;
+
+        max_x0 = init_max;
+        max_x1 = init_max;
+        max_x2 = init_max;
+        max_y0 = init_max;
+        max_y1 = init_max;
+        max_y2 = init_max;
+        max_z0 = init_max;
+        max_z1 = init_max;
+        max_z2 = init_max;
+
+        count0 = zero;
+        count1 = zero;
+        count2 = zero;
+
+        const vint16 step16(step);
+        size_t i;
+	for (i=0; i<N-1; i+=2)
+        {
+          /*! map even and odd primitive to bin; the center is taken as lower+upper (not halved) */
+          const BBox3fa primA = prims[i+0].bounds();
+          const vfloat16 centerA = vfloat16((vfloat4)primA.lower) + vfloat16((vfloat4)primA.upper);
+          const vint16 binA = mapping.bin16(centerA);
+
+          const BBox3fa primB = prims[i+1].bounds();
+          const vfloat16 centerB = vfloat16((vfloat4)primB.lower) + vfloat16((vfloat4)primB.upper); 
+          const vint16 binB = mapping.bin16(centerB);
+
+          /* A: even primitive; suffix 0/1/2 of the accumulators is the binning dimension (x/y/z), each updated under its own lane mask */
+          {
+            const vfloat16 b_min_x = prims[i+0].lower.x;
+            const vfloat16 b_min_y = prims[i+0].lower.y;
+            const vfloat16 b_min_z = prims[i+0].lower.z;
+            const vfloat16 b_max_x = prims[i+0].upper.x;
+            const vfloat16 b_max_y = prims[i+0].upper.y;
+            const vfloat16 b_max_z = prims[i+0].upper.z;
+
+            const vint16 bin0 = shuffle<0>(binA);
+            const vint16 bin1 = shuffle<1>(binA);
+            const vint16 bin2 = shuffle<2>(binA);
+
+            const vbool16 m_update_x = step16 == bin0;
+            const vbool16 m_update_y = step16 == bin1;
+            const vbool16 m_update_z = step16 == bin2;
+
+            assert(popcnt((size_t)m_update_x) == 1);
+            assert(popcnt((size_t)m_update_y) == 1);
+            assert(popcnt((size_t)m_update_z) == 1);
+
+            min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x);
+            min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y);
+            min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z);
+            // ------------------------------------------------------------------------      
+            max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x);
+            max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y);
+            max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z);
+            // ------------------------------------------------------------------------
+            min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x);
+            min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y);
+            min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z);      
+            // ------------------------------------------------------------------------      
+            max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x);
+            max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y);
+            max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z);
+            // ------------------------------------------------------------------------
+            min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x);
+            min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y);
+            min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z);
+            // ------------------------------------------------------------------------      
+            max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x);
+            max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y);
+            max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z);
+            // ------------------------------------------------------------------------
+            count0 = mask_add(m_update_x,count0,count0,vuint16(1));
+            count1 = mask_add(m_update_y,count1,count1,vuint16(1));
+            count2 = mask_add(m_update_z,count2,count2,vuint16(1));      
+          }
+
+
+          /* B: odd primitive, same update pattern as for A */
+          {
+            const vfloat16 b_min_x = prims[i+1].lower.x;
+            const vfloat16 b_min_y = prims[i+1].lower.y;
+            const vfloat16 b_min_z = prims[i+1].lower.z;
+            const vfloat16 b_max_x = prims[i+1].upper.x;
+            const vfloat16 b_max_y = prims[i+1].upper.y;
+            const vfloat16 b_max_z = prims[i+1].upper.z;
+
+            const vint16 bin0 = shuffle<0>(binB);
+            const vint16 bin1 = shuffle<1>(binB);
+            const vint16 bin2 = shuffle<2>(binB);
+
+            const vbool16 m_update_x = step16 == bin0;
+            const vbool16 m_update_y = step16 == bin1;
+            const vbool16 m_update_z = step16 == bin2;
+
+            assert(popcnt((size_t)m_update_x) == 1);
+            assert(popcnt((size_t)m_update_y) == 1);
+            assert(popcnt((size_t)m_update_z) == 1);
+
+            min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x);
+            min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y);
+            min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z);
+            // ------------------------------------------------------------------------      
+            max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x);
+            max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y);
+            max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z);
+            // ------------------------------------------------------------------------
+            min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x);
+            min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y);
+            min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z);      
+            // ------------------------------------------------------------------------      
+            max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x);
+            max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y);
+            max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z);
+            // ------------------------------------------------------------------------
+            min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x);
+            min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y);
+            min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z);
+            // ------------------------------------------------------------------------      
+            max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x);
+            max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y);
+            max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z);
+            // ------------------------------------------------------------------------
+            count0 = mask_add(m_update_x,count0,count0,vuint16(1));
+            count1 = mask_add(m_update_y,count1,count1,vuint16(1));
+            count2 = mask_add(m_update_z,count2,count2,vuint16(1));      
+          }
+
+        }
+        /* odd number of primitives: bin the one that is left over */
+        if (i < N)
+        {
+          const BBox3fa prim0 = prims[i].bounds();
+          const vfloat16 center0 = vfloat16((vfloat4)prim0.lower) + vfloat16((vfloat4)prim0.upper); 
+          const vint16 bin = mapping.bin16(center0);
+
+          const vfloat16 b_min_x = prims[i].lower.x;
+          const vfloat16 b_min_y = prims[i].lower.y;
+          const vfloat16 b_min_z = prims[i].lower.z;
+          const vfloat16 b_max_x = prims[i].upper.x;
+          const vfloat16 b_max_y = prims[i].upper.y;
+          const vfloat16 b_max_z = prims[i].upper.z;
+
+          const vint16 bin0 = shuffle<0>(bin);
+          const vint16 bin1 = shuffle<1>(bin);
+          const vint16 bin2 = shuffle<2>(bin);
+
+          const vbool16 m_update_x = step16 == bin0;
+          const vbool16 m_update_y = step16 == bin1;
+          const vbool16 m_update_z = step16 == bin2;
+
+          assert(popcnt((size_t)m_update_x) == 1);
+          assert(popcnt((size_t)m_update_y) == 1);
+          assert(popcnt((size_t)m_update_z) == 1);
+
+          min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x);
+          min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y);
+          min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z);
+          // ------------------------------------------------------------------------      
+          max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x);
+          max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y);
+          max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z);
+          // ------------------------------------------------------------------------
+          min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x);
+          min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y);
+          min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z);      
+          // ------------------------------------------------------------------------      
+          max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x);
+          max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y);
+          max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z);
+          // ------------------------------------------------------------------------
+          min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x);
+          min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y);
+          min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z);
+          // ------------------------------------------------------------------------      
+          max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x);
+          max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y);
+          max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z);
+          // ------------------------------------------------------------------------
+          count0 = mask_add(m_update_x,count0,count0,vuint16(1));
+          count1 = mask_add(m_update_y,count1,count1,vuint16(1));
+          count2 = mask_add(m_update_z,count2,count2,vuint16(1));      
+        }
+        /* write the register accumulators back to the members (index = binning dimension) */
+        lower[0] = Vec3vf16( min_x0, min_y0, min_z0 );
+        lower[1] = Vec3vf16( min_x1, min_y1, min_z1 );
+        lower[2] = Vec3vf16( min_x2, min_y2, min_z2 );
+
+        upper[0] = Vec3vf16( max_x0, max_y0, max_z0 );
+        upper[1] = Vec3vf16( max_x1, max_y1, max_z1 );
+        upper[2] = Vec3vf16( max_x2, max_y2, max_z2 );
+
+        count[0] = count0;
+        count[1] = count1;
+        count[2] = count2;
+      }
+      /*! bins the sub-range prims[begin..end) by forwarding it as pointer + count */
+      __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<16>& mapping) {
+	bin(prims+begin,end-begin,mapping);
+      }
+
+      /*! merges in other binning information; numBins is not used, all 16 lanes of the three dimensions are always combined */
+      __forceinline void merge (const BinInfoT& other, size_t numBins)
+      {
+        for (size_t i=0; i<3; i++)
+        {
+          lower[i]  = min(lower[i],other.lower[i]);
+          upper[i]  = max(upper[i],other.upper[i]);
+          count[i] += other.count[i];
+        }
+      }
+
+      /*! reduces binning information: returns a new binner holding the combined counts and bounds of a and b */
+      static __forceinline const BinInfoT reduce (const BinInfoT& a, const BinInfoT& b)
+      {
+        BinInfoT c;
+	for (size_t i=0; i<3; i++) 
+        {
+          c.count[i]  = a.count[i] + b.count[i]; // the per-dimension counters are stored in the `count` member
+          c.lower[i]  = min(a.lower[i],b.lower[i]);
+          c.upper[i]  = max(a.upper[i],b.upper[i]);
+        }
+        return c;
+      }
+
+      /*! finds the best split by scanning binning information: per valid dimension the cost of all 15 bin boundaries is computed 16-wide and the overall cheapest one is returned; bestDim stays -1 if none is found */
+      __forceinline Split best(const BinMapping<16>& mapping, const size_t blocks_shift) const
+      {
+	/* find best dimension */
+	float bestSAH = inf;
+	int   bestDim = -1;
+	int   bestPos = 0;
+	const vuint16 blocks_add = (1 << blocks_shift)-1; // added before the shift so counts are rounded up to whole blocks of 2^blocks_shift
+        const vfloat16 inf(pos_inf); // NOTE: shadows the `inf` used to initialize bestSAH above
+	for (size_t dim=0; dim<3; dim++) 
+        {
+          /* ignore zero sized dimensions */
+          if (unlikely(mapping.invalid(dim)))
+            continue;
+
+          const vfloat16 rArea16 = prefix_area_rl(lower[dim].x,lower[dim].y,lower[dim].z, upper[dim].x,upper[dim].y,upper[dim].z);
+          const vfloat16 lArea16 = prefix_area_lr(lower[dim].x,lower[dim].y,lower[dim].z, upper[dim].x,upper[dim].y,upper[dim].z);
+          const vuint16  lCount16 = prefix_sum(count[dim]);
+          const vuint16  rCount16 = reverse_prefix_sum(count[dim]); 
+
+          /* compute best split in this dimension; the right-hand values are shifted by one lane, presumably so that lane i pairs bins 0..i with bins i+1..15 */
+          const vfloat16 leftArea  = lArea16;
+          const vfloat16 rightArea = align_shift_right<1>(zero,rArea16);
+          const vuint16 lC = lCount16;
+          const vuint16 rC = align_shift_right<1>(zero,rCount16);
+          const vuint16 leftCount  = ( lC + blocks_add) >> blocks_shift;
+          const vuint16 rightCount = ( rC + blocks_add) >> blocks_shift;
+          const vbool16 valid = (leftArea < inf) & (rightArea < inf) & vbool16(0x7fff); // handles inf entries
+          const vfloat16 sah = select(valid,madd(leftArea,vfloat16(leftCount),rightArea*vfloat16(rightCount)),vfloat16(pos_inf));
+          /* test if this is a better dimension */
+          if (any(sah < vfloat16(bestSAH))) 
+          {
+            const size_t index = select_min(sah);            
+            assert(index < 15);
+            assert(sah[index] < bestSAH);
+            bestDim = dim;
+            bestPos = index+1;
+            bestSAH = sah[index];
+          }
+        }
+	
+	return Split(bestSAH,bestDim,bestPos,mapping);
+
+      }
+
+      /*! calculates extended split information: counts and merged bounds of bins [0,split.pos) and [split.pos,mapping.size()); an invalid split (dim == -1) yields two empty parts */
+      __forceinline void getSplitInfo(const BinMapping<16>& mapping, const Split& split, SplitInfo& info) const 
+      {
+	if (split.dim == -1) {
+	  new (&info) SplitInfo(0,empty,0,empty);
+	  return;
+	}
+	// FIXME: horizontal reduction!
+
+	size_t leftCount = 0;
+	BBox3fa leftBounds = empty;
+	for (size_t i=0; i<(size_t)split.pos; i++) {
+	  leftCount += count[split.dim][i];
+          Vec3fa bounds_lower(lower[split.dim].x[i],lower[split.dim].y[i],lower[split.dim].z[i]);
+          Vec3fa bounds_upper(upper[split.dim].x[i],upper[split.dim].y[i],upper[split.dim].z[i]);
+	  leftBounds.extend(BBox3fa(bounds_lower,bounds_upper));
+	}
+	size_t rightCount = 0;
+	BBox3fa rightBounds = empty;
+	for (size_t i=split.pos; i<mapping.size(); i++) {
+	  rightCount += count[split.dim][i];
+          Vec3fa bounds_lower(lower[split.dim].x[i],lower[split.dim].y[i],lower[split.dim].z[i]);
+          Vec3fa bounds_upper(upper[split.dim].x[i],upper[split.dim].y[i],upper[split.dim].z[i]);
+	  rightBounds.extend(BBox3fa(bounds_lower,bounds_upper));
+	}
+	new (&info) SplitInfo(leftCount,leftBounds,rightCount,rightBounds);
+      }
+
+      /*! gets the number of primitives left of the split (bins [0,split.pos)); returns size_t(-1) for an invalid split */
+      __forceinline size_t getLeftCount(const BinMapping<16>& mapping, const Split& split) const
+      {
+        if (unlikely(split.dim == -1)) return -1;
+
+        size_t leftCount = 0;
+        for (size_t i = 0; i < (size_t)split.pos; i++) {
+          leftCount += count[split.dim][i];
+        }
+        return leftCount;
+      }
+
+      /*! gets the number of primitives right of the split (bins [split.pos,mapping.size())); returns size_t(-1) for an invalid split */
+      __forceinline size_t getRightCount(const BinMapping<16>& mapping, const Split& split) const
+      {
+        if (unlikely(split.dim == -1)) return -1;
+
+        size_t rightCount = 0;
+        for (size_t i = (size_t)split.pos; i<mapping.size(); i++) {
+          rightCount += count[split.dim][i];
+        }
+        return rightCount;
+      }
+
+    private:
+      Vec3vf16 lower[3]; //!< per binning dimension: lower bounds of the 16 bins
+      Vec3vf16 upper[3]; //!< per binning dimension: upper bounds of the 16 bins
+      vuint16   count[3]; //!< per binning dimension: number of primitives in each of the 16 bins
+    };
+#endif
+  }
+
+  /*! Bins prims[begin..end) into binner: directly when the range is smaller than parallelThreshold, otherwise through parallel_reduce (called with blockSize) using one fresh binner per sub-range and merging the partial results. */
+  template<typename BinInfoT, typename BinMapping, typename PrimRef>
+  __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping)
+  {
+    /* small range: bin in place */
+    if (likely(end-begin < parallelThreshold)) { binner.bin(prims,begin,end,mapping); return; }
+    /* large range: bin every sub-range separately, then combine */
+    const auto binBlock  = [&](const range<size_t>& r) -> BinInfoT { BinInfoT partial(empty); partial.bin(prims + r.begin(), r.size(), mapping); return partial; };
+    const auto mergeBins = [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT merged = b0; merged.merge(b1, mapping.size()); return merged; };
+    binner = parallel_reduce(begin,end,blockSize,binner,binBlock,mergeBins);
+  }
+
+  /*! Bins prims[begin..end) into binner with the caller's binBoundsAndCenter functor: directly when the range is smaller than parallelThreshold, otherwise through parallel_reduce (called with blockSize) using one fresh binner per sub-range and merging the partial results. */
+  template<typename BinBoundsAndCenter, typename BinInfoT, typename BinMapping, typename PrimRef>
+  __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter)
+  {
+    /* small range: bin in place */
+    if (likely(end-begin < parallelThreshold)) { binner.bin(prims,begin,end,mapping,binBoundsAndCenter); return; }
+    /* large range: bin every sub-range separately, then combine */
+    const auto binBlock  = [&](const range<size_t>& r) -> BinInfoT { BinInfoT partial(empty); partial.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return partial; };
+    const auto mergeBins = [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT merged = b0; merged.merge(b1, mapping.size()); return merged; };
+    binner = parallel_reduce(begin,end,blockSize,binner,binBlock,mergeBins);
+  }
+
+  /*! Bins prims[begin..end) into binner; the template flag `parallel` selects between a direct bin() call and parallel_reduce (called with blockSize) with one fresh binner per sub-range whose results are merged. */
+  template<bool parallel, typename BinInfoT, typename BinMapping, typename PrimRef>
+  __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping)
+  {
+    /* serial variant: bin in place */
+    if (!parallel) { binner.bin(prims,begin,end,mapping); return; }
+    /* parallel variant: bin every sub-range separately, then combine */
+    const auto binBlock  = [&](const range<size_t>& r) -> BinInfoT { BinInfoT partial(empty); partial.bin(prims + r.begin(), r.size(), mapping); return partial; };
+    const auto mergeBins = [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT merged = b0; merged.merge(b1, mapping.size()); return merged; };
+    binner = parallel_reduce(begin,end,blockSize,binner,binBlock,mergeBins);
+  }
+
+  /*! Bins prims[begin..end) into binner with the caller's binBoundsAndCenter functor; the template flag `parallel` selects between a direct bin() call and parallel_reduce (called with blockSize) with one fresh binner per sub-range whose results are merged. */
+  template<bool parallel, typename BinBoundsAndCenter, typename BinInfoT, typename BinMapping, typename PrimRef>
+  __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter)
+  {
+    /* serial variant: bin in place */
+    if (!parallel) { binner.bin(prims,begin,end,mapping,binBoundsAndCenter); return; }
+    /* parallel variant: bin every sub-range separately, then combine */
+    const auto binBlock  = [&](const range<size_t>& r) -> BinInfoT { BinInfoT partial(empty); partial.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return partial; };
+    const auto mergeBins = [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT merged = b0; merged.merge(b1, mapping.size()); return merged; };
+    binner = parallel_reduce(begin,end,blockSize,binner,binBlock,mergeBins);
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h
new file mode 100644
index 0000000000..a4c272f015
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h
@@ -0,0 +1,205 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct PrimInfoRange : public CentGeomBBox3fa, public range<size_t> // bounds (CentGeomBBox3fa) combined with an index range (range<size_t>)
+    {
+      __forceinline PrimInfoRange () { // passes no initializers; both bases are default-constructed
+      }
+
+      __forceinline PrimInfoRange(const PrimInfo& pinfo) // bounds base is constructed from pinfo, range is [pinfo.begin,pinfo.end)
+        : CentGeomBBox3fa(pinfo), range<size_t>(pinfo.begin,pinfo.end) {}
+
+      __forceinline PrimInfoRange(EmptyTy) // empty bounds together with the range (0,0)
+        : CentGeomBBox3fa(EmptyTy()), range<size_t>(0,0) {}
+
+      __forceinline PrimInfoRange (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds) // explicit range plus already computed bounds
+        : CentGeomBBox3fa(centGeomBounds), range<size_t>(begin,end) {}
+      
+      __forceinline float leafSAH() const { // expectedApproxHalfArea(geomBounds) times the number of elements in the range
+	return expectedApproxHalfArea(geomBounds)*float(size()); 
+      }
+      
+      __forceinline float leafSAH(size_t block_shift) const { // same, but the element count is rounded up to whole blocks of 2^block_shift
+	return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift);
+      }
+    };
+    
+    /*! Performs standard object binning */
+    template<typename PrimRef, size_t BINS>
+      struct HeuristicArrayBinningSAH
+      {
+        typedef BinSplit<BINS> Split;
+        typedef BinInfoT<BINS,PrimRef,BBox3fa> Binner;
+        typedef range<size_t> Set;
+
+#if defined(__AVX512ER__) // KNL
+        static const size_t PARALLEL_THRESHOLD = 4*768; 
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 768;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 768;
+#else
+        static const size_t PARALLEL_THRESHOLD = 3 * 1024; // find()/split() take the parallel path at or above this many primitives
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; // block size handed to bin_serial_or_parallel
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; // block size handed to parallel_partitioning
+#endif
+        __forceinline HeuristicArrayBinningSAH () // leaves the primitive array unset (nullptr)
+          : prims(nullptr) {}
+
+        /*! remember prim array */
+        __forceinline HeuristicArrayBinningSAH (PrimRef* prims)
+          : prims(prims) {}
+
+        /*! finds the best split; bins serially for fewer than PARALLEL_THRESHOLD primitives, in parallel otherwise */
+        __noinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize)
+        {
+          if (likely(pinfo.size() < PARALLEL_THRESHOLD))
+            return find_template<false>(pinfo,logBlockSize);
+          else
+            return find_template<true>(pinfo,logBlockSize);
+        }
+
+        template<bool parallel> // bins prims[pinfo.begin(),pinfo.end()) with a mapping built from pinfo and returns binner.best(mapping,logBlockSize)
+        __forceinline const Split find_template(const PrimInfoRange& pinfo, const size_t logBlockSize)
+        {
+          Binner binner(empty);
+          const BinMapping<BINS> mapping(pinfo);
+          bin_serial_or_parallel<parallel>(binner,prims,pinfo.begin(),pinfo.end(),PARALLEL_FIND_BLOCK_SIZE,mapping);
+          return binner.best(mapping,logBlockSize);
+        }
+
+        /*! array partitioning; applies the same PARALLEL_THRESHOLD test as find() to pick the serial or parallel variant */
+        __forceinline void split(const Split& split, const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo)
+        {
+          if (likely(pinfo.size() < PARALLEL_THRESHOLD))
+            split_template<false>(split,pinfo,linfo,rinfo);
+          else
+            split_template<true>(split,pinfo,linfo,rinfo);
+        }
+
+        template<bool parallel> // partitions prims[set.begin(),set.end()) and writes the two resulting ranges, with their bounds, to lset/rset
+        __forceinline void split_template(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset)
+        {
+          if (!split.valid()) { // no usable split: sort the range, then cut it at the midpoint
+            deterministic_order(set);
+            return splitFallback(set,lset,rset);
+          }
+          
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          CentGeomBBox3fa local_left(empty);
+          CentGeomBBox3fa local_right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+
+          const typename Binner::vint vSplitPos(splitPos);
+          const typename Binner::vbool vSplitMask(splitDimMask);
+          auto isLeft = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }; // partition predicate, evaluated by the mapping stored in the split
+
+          size_t center = 0;
+          if (!parallel)
+            center = serial_partitioning(prims,begin,end,local_left,local_right,isLeft,
+                                         [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); });
+          else
+            center = parallel_partitioning(
+              prims,begin,end,EmptyTy(),local_left,local_right,isLeft,
+              [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); },
+              [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); },
+              PARALLEL_PARTITION_BLOCK_SIZE);
+          
+          new (&lset) PrimInfoRange(begin,center,local_left); // placement new: the output objects are re-constructed in place
+          new (&rset) PrimInfoRange(center,end,local_right);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+        }
+
+        void deterministic_order(const PrimInfoRange& pinfo) // std::sort over prims[pinfo.begin(),pinfo.end())
+        {
+          /* required as parallel partition destroys original primitive order */
+          std::sort(&prims[pinfo.begin()],&prims[pinfo.end()]);
+        }
+
+        void splitFallback(const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo) // cuts the range at its midpoint and recomputes the bounds of both halves
+        {
+          const size_t begin = pinfo.begin();
+          const size_t end   = pinfo.end();
+          const size_t center = (begin + end)/2;
+
+          CentGeomBBox3fa left(empty);
+          for (size_t i=begin; i<center; i++)
+            left.extend_center2(prims[i]);
+          new (&linfo) PrimInfoRange(begin,center,left);
+
+          CentGeomBBox3fa right(empty);
+          for (size_t i=center; i<end; i++)
+            right.extend_center2(prims[i]);
+          new (&rinfo) PrimInfoRange(center,end,right);
+        }
+
+        void splitByGeometry(const range<size_t>& range, PrimInfoRange& linfo, PrimInfoRange& rinfo) // partitions on "geomID() equals that of the first primitive of the range"
+        {
+          assert(range.size() > 1);
+          CentGeomBBox3fa left(empty);
+          CentGeomBBox3fa right(empty);
+          unsigned int geomID = prims[range.begin()].geomID();
+          size_t center = serial_partitioning(prims,range.begin(),range.end(),left,right,
+                                              [&] ( const PrimRef& prim ) { return prim.geomID() == geomID; },
+                                              [ ] ( CentGeomBBox3fa& a, const PrimRef& ref ) { a.extend_center2(ref); });
+
+          new (&linfo) PrimInfoRange(range.begin(),center,left);
+          new (&rinfo) PrimInfoRange(center,range.end(),right);
+        }
+
+      private:
+        PrimRef* const prims;
+      };
+
+    /*! Performs standard object binning */
+    template<typename PrimRefMB, size_t BINS>
+      struct HeuristicArrayBinningMB
+      {
+        typedef BinSplit<BINS> Split;
+        typedef typename PrimRefMB::BBox BBox;
+        typedef BinInfoT<BINS,PrimRefMB,BBox> ObjectBinner;
+        static const size_t PARALLEL_THRESHOLD = 3 * 1024; // forwarded to bin_parallel / parallel_partitioning together with the block sizes below
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+        /*! finds the best split; the SAH is scaled by set.time_range.size() and an invalid result is tagged Split::SPLIT_FALLBACK */
+        const Split find(const SetMB& set, const size_t logBlockSize)
+        {
+          ObjectBinner binner(empty);
+          const BinMapping<BINS> mapping(set.size(),set.centBounds);
+          bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping);
+          Split osplit = binner.best(mapping,logBlockSize);
+          osplit.sah *= set.time_range.size();
+          if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split
+          return osplit;
+        }
+        
+        /*! array partitioning; lset/rset are re-constructed in place and share set.prims and set.time_range */
+        __forceinline void split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset)
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfoMB left = empty;
+          PrimInfoMB right = empty;
+          const vint4 vSplitPos(split.pos);
+          const vbool4 vSplitMask(1 << split.dim);
+          auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref) < vSplitPos) & vSplitMask); }; // bin indices compared with split.pos, filtered by the mask built from 1 << split.dim
+          auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); };
+          auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); };
+          size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD);
+          new (&lset) SetMB(left, set.prims,range<size_t>(begin,center),set.time_range);
+          new (&rset) SetMB(right,set.prims,range<size_t>(center,end  ),set.time_range);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h
new file mode 100644
index 0000000000..1370244586
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h
@@ -0,0 +1,302 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning.h"
+
+namespace embree
+{
+  namespace isa
+  { 
+    /*! Performs standard object binning */
+    template<typename PrimRef, size_t BINS>
+      struct UnalignedHeuristicArrayBinningSAH
+      {
+        typedef BinSplit<BINS> Split;
+        typedef BinInfoT<BINS,PrimRef,BBox3fa> Binner;
+        typedef range<size_t> Set;
+
+        __forceinline UnalignedHeuristicArrayBinningSAH () // FIXME: required?
+          : scene(nullptr), prims(nullptr) {}
+        
+        /*! remember scene and prim array */
+        __forceinline UnalignedHeuristicArrayBinningSAH (Scene* scene, PrimRef* prims)
+          : scene(scene), prims(prims) {}
+
+        const LinearSpace3fa computeAlignedSpace(const range<size_t>& set) // returns frame(axis).transposed(); axis stays (0,0,1) when no primitive yields a usable direction
+        {
+          Vec3fa axis(0,0,1);
+          uint64_t bestGeomPrimID = -1; // -1 converts to the largest uint64_t value
+
+          /*! find curve with minimum ID that defines valid direction */
+          for (size_t i=set.begin(); i<set.end(); i++)
+          {
+            const unsigned int geomID = prims[i].geomID();
+            const unsigned int primID = prims[i].primID();
+            const uint64_t geomprimID = prims[i].ID64();
+            if (geomprimID >= bestGeomPrimID) continue; // only primitives with a smaller ID than the current best are examined
+            const Vec3fa axis1 = scene->get(geomID)->computeDirection(primID);
+            if (sqr_length(axis1) > 1E-18f) { // directions with a squared length of at most 1E-18f are rejected
+              axis = normalize(axis1);
+              bestGeomPrimID = geomprimID;
+            }
+          }
+          return frame(axis).transposed();
+        }
+        
+        const PrimInfo computePrimInfo(const range<size_t>& set, const LinearSpace3fa& space) // bounds of all primitives of 'set', each taken from Geometry::vbounds(space,primID)
+        {
+          auto computeBounds = [&](const range<size_t>& r) -> CentGeomBBox3fa
+            {
+              CentGeomBBox3fa bounds(empty);
+              for (size_t i=r.begin(); i<r.end(); i++) {
+                Geometry* mesh = scene->get(prims[i].geomID());
+                bounds.extend(mesh->vbounds(space,prims[i].primID()));
+              }
+              return bounds;
+            };
+          
+          const CentGeomBBox3fa bounds = parallel_reduce(set.begin(), set.end(), size_t(1024), size_t(4096), 
+                                                         CentGeomBBox3fa(empty), computeBounds, CentGeomBBox3fa::merge2);
+
+          return PrimInfo(set.begin(),set.end(),bounds);
+        }
+
+        struct BinBoundsAndCenter // functor handed to the binner: evaluates primitive bounds in 'space' instead of using the PrimRef directly
+        {
+          __forceinline BinBoundsAndCenter(Scene* scene, const LinearSpace3fa& space)
+            : scene(scene), space(space) {}
+          
+          /*! returns center for binning */
+          __forceinline Vec3fa binCenter(const PrimRef& ref) const
+          {
+            Geometry* mesh = (Geometry*) scene->get(ref.geomID());
+            BBox3fa bounds = mesh->vbounds(space,ref.primID());
+            return embree::center2(bounds);
+          }
+          
+          /*! returns bounds and centroid used for binning */
+          __forceinline void binBoundsAndCenter(const PrimRef& ref, BBox3fa& bounds_o, Vec3fa& center_o) const
+          {
+            Geometry* mesh = (Geometry*) scene->get(ref.geomID());
+            BBox3fa bounds = mesh->vbounds(space,ref.primID());
+            bounds_o = bounds;
+            center_o = embree::center2(bounds);
+          }
+
+        private:
+          Scene* scene;
+          const LinearSpace3fa space;
+        };
+        
+        /*! finds the best split; bins serially for fewer than 10000 primitives, in parallel otherwise */
+        __forceinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize, const LinearSpace3fa& space)
+        {
+          if (likely(pinfo.size() < 10000))
+            return find_template<false>(pinfo,logBlockSize,space);
+          else
+            return find_template<true>(pinfo,logBlockSize,space);
+        }
+
+        /*! finds the best split; 'parallel' selects the binning variant at compile time */
+        template<bool parallel>
+        const Split find_template(const PrimInfoRange& set, const size_t logBlockSize, const LinearSpace3fa& space)
+        {
+          Binner binner(empty);
+          const BinMapping<BINS> mapping(set);
+          BinBoundsAndCenter binBoundsAndCenter(scene,space);
+          bin_serial_or_parallel<parallel>(binner,prims,set.begin(),set.end(),size_t(4096),mapping,binBoundsAndCenter);
+          return binner.best(mapping,logBlockSize);
+        }
+        
+        /*! array partitioning; serial for fewer than 10000 primitives, parallel otherwise */
+        __forceinline void split(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset)
+        {
+          if (likely(set.size() < 10000))
+            split_template<false>(split,space,set,lset,rset);
+          else
+            split_template<true>(split,space,set,lset,rset);
+        }
+
+        /*! array partitioning; 'parallel' selects the partitioning variant at compile time */
+        template<bool parallel>
+        __forceinline void split_template(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset)
+        {
+          if (!split.valid()) { // no usable split: sort the range, then cut it at the midpoint
+            deterministic_order(set);
+            return splitFallback(set,lset,rset);
+          }
+          
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          CentGeomBBox3fa local_left(empty);
+          CentGeomBBox3fa local_right(empty);
+          const int splitPos = split.pos;
+          const int splitDim = split.dim;
+          BinBoundsAndCenter binBoundsAndCenter(scene,space);
+
+          size_t center = 0;
+          if (!parallel) // honour the template argument that split() already derived from the set size
+            center = serial_partitioning(prims,begin,end,local_left,local_right,
+                                         [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; },
+                                         [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); });
+          else
+            center = parallel_partitioning(prims,begin,end,EmptyTy(),local_left,local_right,
+                                           [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; },
+                                           [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); },
+                                           [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); },
+                                           128);
+          
+          new (&lset) PrimInfoRange(begin,center,local_left); // placement new: the output objects are re-constructed in place
+          new (&rset) PrimInfoRange(center,end,local_right);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+        }
+        
+        void deterministic_order(const range<size_t>& set) 
+        {
+          /* required as parallel partition destroys original primitive order */
+          std::sort(&prims[set.begin()],&prims[set.end()]);
+        }
+        
+        void splitFallback(const range<size_t>& set, PrimInfoRange& lset, PrimInfoRange& rset) // cuts the range at its midpoint and recomputes the bounds of both halves
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          const size_t center = (begin + end)/2;
+          
+          CentGeomBBox3fa left(empty);
+          for (size_t i=begin; i<center; i++)
+            left.extend_center2(prims[i]);
+          new (&lset) PrimInfoRange(begin,center,left);
+          
+          CentGeomBBox3fa right(empty);
+          for (size_t i=center; i<end; i++)
+            right.extend_center2(prims[i]);
+          new (&rset) PrimInfoRange(center,end,right);
+        }
+        
+      private:
+        Scene* const scene;
+        PrimRef* const prims;
+      };
+
+    /*! Performs standard object binning */
+    template<typename PrimRefMB, size_t BINS>
+      struct UnalignedHeuristicArrayBinningMB
+      {
+        typedef BinSplit<BINS> Split;
+        typedef typename PrimRefMB::BBox BBox;
+        typedef BinInfoT<BINS,PrimRefMB,BBox> ObjectBinner;
+        
+        static const size_t PARALLEL_THRESHOLD = 3 * 1024; // forwarded to bin_parallel / parallel_partitioning together with the block sizes below
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+        UnalignedHeuristicArrayBinningMB(Scene* scene)
+        : scene(scene) {}
+
+        const LinearSpace3fa computeAlignedSpaceMB(Scene* scene, const SetMB& set) // NOTE: the 'scene' parameter shadows the member of the same name
+        {
+          Vec3fa axis0(0,0,1);
+          uint64_t bestGeomPrimID = -1; // -1 converts to the largest uint64_t value
+
+          /*! find curve with minimum ID that defines valid direction */
+          for (size_t i=set.begin(); i<set.end(); i++)
+          {
+            const PrimRefMB& prim = (*set.prims)[i];
+            const unsigned int geomID = prim.geomID();
+            const unsigned int primID = prim.primID();
+            const uint64_t geomprimID = prim.ID64();
+            if (geomprimID >= bestGeomPrimID) continue; // only primitives with a smaller ID than the current best are examined
+            
+            const Geometry* mesh = scene->get(geomID);
+            const range<int> tbounds = mesh->timeSegmentRange(set.time_range);
+            if (tbounds.size() == 0) continue; // skip geometry whose time segment range for set.time_range is empty
+
+            const size_t t = (tbounds.begin()+tbounds.end())/2; // middle of the time segment range
+            const Vec3fa axis1 = mesh->computeDirection(primID,t);
+            if (sqr_length(axis1) > 1E-18f) { // directions with a squared length of at most 1E-18f are rejected
+              axis0 = normalize(axis1);
+              bestGeomPrimID = geomprimID;
+            }
+          }
+
+          return frame(axis0).transposed();
+        }
+
+        struct BinBoundsAndCenter // functor handed to the binner: evaluates linear bounds in 'space' over 'time_range'
+        {
+          __forceinline BinBoundsAndCenter(Scene* scene, BBox1f time_range, const LinearSpace3fa& space)
+            : scene(scene), time_range(time_range), space(space) {}
+          
+          /*! returns center for binning */
+          template<typename PrimRef>
+          __forceinline Vec3fa binCenter(const PrimRef& ref) const
+          {
+            Geometry* mesh = scene->get(ref.geomID());
+            LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range);
+            return center2(lbounds.interpolate(0.5f));
+          }
+
+          /*! returns bounds and centroid used for binning; bounds are the linear bounds interpolated at 0.5 */
+          __noinline void binBoundsAndCenter (const PrimRefMB& ref, BBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX
+          {
+            Geometry* mesh = scene->get(ref.geomID());
+            LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range);
+            bounds_o = lbounds.interpolate(0.5f);
+            center_o = center2(bounds_o);
+          }
+
+          /*! returns linear bounds and centroid used for binning; the centroid is taken from the bounds interpolated at 0.5 */
+          __noinline void binBoundsAndCenter (const PrimRefMB& ref, LBBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX
+          {
+            Geometry* mesh = scene->get(ref.geomID());
+            LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range);
+            bounds_o = lbounds;
+            center_o = center2(lbounds.interpolate(0.5f));
+          }
+          
+        private:
+          Scene* scene;
+          BBox1f time_range;
+          const LinearSpace3fa space;
+        };
+
+        /*! finds the best split; the SAH is scaled by set.time_range.size() and an invalid result is tagged Split::SPLIT_FALLBACK */
+        const Split find(const SetMB& set, const size_t logBlockSize, const LinearSpace3fa& space)
+        {
+          BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space);
+          ObjectBinner binner(empty);
+          const BinMapping<BINS> mapping(set.size(),set.centBounds);
+          bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping,binBoundsAndCenter);
+          Split osplit = binner.best(mapping,logBlockSize);
+          osplit.sah *= set.time_range.size();
+          if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split
+          return osplit;
+        }
+        
+        /*! array partitioning; lset/rset are re-constructed in place and share set.prims and set.time_range */
+        __forceinline void split(const Split& split, const LinearSpace3fa& space, const SetMB& set, SetMB& lset, SetMB& rset)
+        {
+          BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space);
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfoMB left = empty;
+          PrimInfoMB right = empty;
+          const vint4 vSplitPos(split.pos);
+          const vbool4 vSplitMask(1 << split.dim);
+          auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref,binBoundsAndCenter) < vSplitPos) & vSplitMask); }; // bin indices compared with split.pos, filtered by the mask built from 1 << split.dim
+          auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); };
+          auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); };
+          size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD);
+          new (&lset) SetMB(left,set.prims,range<size_t>(begin,center),set.time_range);
+          new (&rset) SetMB(right,set.prims,range<size_t>(center,end ),set.time_range);
+        }
+
+      private:
+        Scene* scene;
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h
new file mode 100644
index 0000000000..21f18c0208
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h
@@ -0,0 +1,443 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+// TODO: 
+//       - adjust parallel build thresholds
+//       - openNodesBasedOnExtend should consider max extended size
+  
+#pragma once
+
+#include "heuristic_binning.h"
+#include "heuristic_spatial.h"
+
+/* stop opening if all bref.geomIDs are the same */
+#define EQUAL_GEOMID_STOP_CRITERIA 1
+
+/* 10% spatial extend threshold */
+#define MAX_EXTEND_THRESHOLD   0.1f
+
+/* maximum is 8 children */
+#define MAX_OPENED_CHILD_NODES 8
+
+/* open until all build refs are below threshold size in one step */
+#define USE_LOOP_OPENING 0
+
+namespace embree
+{
+  namespace isa
+  { 
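+    /*! Performs object binning on build references; references selected by OpenHeuristic can first be opened (replaced by their children) into the set's extended range */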
+    template<typename NodeOpenerFunc, typename PrimRef, size_t OBJECT_BINS>
+      struct HeuristicArrayOpenMergeSAH
+      {
+        typedef BinSplit<OBJECT_BINS> Split;
+        typedef BinInfoT<OBJECT_BINS,PrimRef,BBox3fa> Binner;
+        
+        static const size_t PARALLEL_THRESHOLD = 1024;
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 512;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+        static const size_t MOVE_STEP_SIZE = 64;
+        static const size_t CREATE_SPLITS_STEP_SIZE = 128;
+
+        __forceinline HeuristicArrayOpenMergeSAH () // the initializer list sets only prims0; the other members get no explicit value here
+          : prims0(nullptr) {}
+        
+        /*! remember prim array */
+        __forceinline HeuristicArrayOpenMergeSAH (const NodeOpenerFunc& nodeOpenerFunc, PrimRef* prims0, size_t max_open_size) // stores the opener callback, the prim array and the maximum number of children per opened node
+          : prims0(prims0), nodeOpenerFunc(nodeOpenerFunc), max_open_size(max_open_size) 
+        {
+          assert(max_open_size <= MAX_OPENED_CHILD_NODES); // the opening code uses temporaries of MAX_OPENED_CHILD_NODES entries
+        }
+
+        struct OpenHeuristic // predicate deciding whether a build reference should be opened
+        {
+          __forceinline OpenHeuristic( const PrimInfoExtRange& pinfo ) // caches the largest dimension of pinfo.geomBounds and the reciprocal of its extent
+          {
+            const Vec3fa diag = pinfo.geomBounds.size();
+            dim = maxDim(diag);
+            assert(diag[dim] > 0.0f); // a zero extent would make the division below yield infinity
+            inv_max_extend = 1.0f / diag[dim];
+          }
+
+          __forceinline bool operator () ( PrimRef& prim ) const { // true for non-leaf references whose extent along 'dim' exceeds MAX_EXTEND_THRESHOLD of the set's extent
+            return !prim.node.isLeaf() && prim.bounds().size()[dim] * inv_max_extend > MAX_EXTEND_THRESHOLD;
+          }
+
+        private:
+          size_t dim;
+          float inv_max_extend;
+        };
+
+        /*! compute extended ranges */
+        __forceinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight)
+        { /* divides set's extended range between lset and rset in proportion lweight : rweight; the left share is rounded down */
+          assert(set.ext_range_size() > 0);
+          const size_t total_ext = set.ext_range_size();
+          const float  left_share = (float)lweight / (lweight + rweight);
+          const size_t left_ext  = min((size_t)(floorf(left_share * total_ext)),total_ext);
+          const size_t right_ext = total_ext - left_ext;
+          lset.set_ext_range(lset.end() + left_ext);
+          rset.set_ext_range(rset.end() + right_ext);
+        }
+
+        /*! move ranges */
+        __forceinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset) // shifts rset's primitives right by the size of lset's extended range
+        {
+          const size_t left_ext_range_size = lset.ext_range_size();
+          const size_t right_size = rset.size();
+
+          /* does the left child have an extended range? */
+          if (left_ext_range_size > 0)
+          {
+            /* is the left extended range smaller than the right range? */
+            if (left_ext_range_size < right_size)
+            {
+              /* only move a small part of the beginning of the right range to the end */
+              parallel_for( rset.begin(), rset.begin()+left_ext_range_size, MOVE_STEP_SIZE, [&](const range<size_t>& r) {                  
+                  for (size_t i=r.begin(); i<r.end(); i++)
+                    prims0[i+right_size] = prims0[i]; // destination starts at rset.end(), so source and destination do not overlap
+                });
+            }
+            else
+            {
+              /* no overlap, move entire right range to new location, can be made fully parallel */
+              parallel_for( rset.begin(), rset.end(), MOVE_STEP_SIZE,  [&](const range<size_t>& r) {
+                  for (size_t i=r.begin(); i<r.end(); i++)
+                    prims0[i+left_ext_range_size] = prims0[i]; // shift is at least right_size here, so the ranges are disjoint
+                });
+            }
+            /* update right range */
+            assert(rset.ext_end() + left_ext_range_size == set.ext_end());
+            rset.move_right(left_ext_range_size);
+          }
+        }
+
+        /* estimates the extra space required when opening, and checks if all primitives are from same geometry */
+        __noinline std::pair<size_t,bool> getProperties(const PrimInfoExtRange& set)
+        {
+          typedef std::pair<size_t,bool> Props; // (estimated number of extra slots, all geomIDs equal to the first one)
+          const OpenHeuristic shouldOpen(set);
+          const unsigned int firstGeomID = prims0[set.begin()].geomID();
+          auto countBlock = [&] (const range<size_t>& blk) -> Props {
+            Props acc(0,true);
+            for (size_t k=blk.begin(); k<blk.end(); k++) {
+              PrimRef& ref = prims0[k];
+              acc.second &= ref.geomID() == firstGeomID;
+              if (shouldOpen(ref))
+                acc.first += ref.node.getN()-1; // coarse approximation
+            }
+            return acc;
+          };
+          auto combine = [&] (const Props& a, const Props& b) -> Props {
+            return Props(a.first+b.first,a.second && b.second);
+          };
+          return parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,Props(0,true),countBlock,combine);
+        }
+
+        // FIXME: should consider maximum available extended size 
+        __noinline void openNodesBasedOnExtend(PrimInfoExtRange& set) // opens every reference accepted by OpenHeuristic once; extra children are appended behind set.end()
+        {
+          const OpenHeuristic heuristic(set);
+          const size_t ext_range_start = set.end();
+
+          if (false && set.size() < PARALLEL_THRESHOLD) // serial variant, disabled by the constant 'false'
+          {
+            size_t extra_elements = 0;
+            for (size_t i=set.begin(); i<set.end(); i++)
+            {
+              if (heuristic(prims0[i]))
+              {
+                PrimRef tmp[MAX_OPENED_CHILD_NODES];
+                const size_t n = nodeOpenerFunc(prims0[i],tmp);
+                assert(extra_elements + n-1 <= set.ext_range_size());
+                for (size_t j=0; j<n; j++)
+                  set.extend_center2(tmp[j]);
+
+                prims0[i] = tmp[0];
+                for (size_t j=1; j<n; j++)
+                  prims0[ext_range_start+extra_elements+j-1] = tmp[j]; 
+                extra_elements += n-1;
+              }
+            }
+            set._end += extra_elements;
+          }
+          else 
+          {
+            std::atomic<size_t> ext_elements; // number of slots handed out behind ext_range_start, shared by all tasks
+            ext_elements.store(0);
+            PrimInfo info = parallel_reduce( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, PrimInfo(empty), [&](const range<size_t>& r) -> PrimInfo {
+                PrimInfo info(empty);
+                for (size_t i=r.begin(); i<r.end(); i++)
+                  if (heuristic(prims0[i]))
+                  {
+                    PrimRef tmp[MAX_OPENED_CHILD_NODES];
+                    const size_t n = nodeOpenerFunc(prims0[i],tmp);
+                    const size_t ID = ext_elements.fetch_add(n-1); // reserves n-1 consecutive slots for this reference
+                    assert(ID + n-1 <= set.ext_range_size()); // capacity is checked by this assert only
+
+                    for (size_t j=0; j<n; j++)
+                      info.extend_center2(tmp[j]);
+
+                    prims0[i] = tmp[0]; // first child replaces the opened reference, the others go to the reserved slots
+                    for (size_t j=1; j<n; j++)
+                      prims0[ext_range_start+ID+j-1] = tmp[j]; 
+                  }
+                return info;
+              }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); });
+            set.centBounds.extend(info.centBounds); // only the centroid bounds of the set are grown in this branch
+            assert(ext_elements.load() <= set.ext_range_size());
+            set._end += ext_elements.load();
+          }
+        } 
+
+        __noinline void openNodesBasedOnExtendLoop(PrimInfoExtRange& set, const size_t est_new_elements) // repeats serial opening passes while the estimated growth fits into the extended range
+        {
+          const OpenHeuristic heuristic(set);
+          size_t next_iteration_extra_elements = est_new_elements;          
+          
+          while (next_iteration_extra_elements <= set.ext_range_size()) 
+          {
+            next_iteration_extra_elements = 0;
+            size_t extra_elements = 0;
+            const size_t ext_range_start = set.end(); // children created in this pass are appended from here
+
+            for (size_t i=set.begin(); i<set.end(); i++)
+            {
+              if (heuristic(prims0[i]))
+              {
+                PrimRef tmp[MAX_OPENED_CHILD_NODES];
+                const size_t n = nodeOpenerFunc(prims0[i],tmp);
+                assert(extra_elements + n-1 <= set.ext_range_size());
+                for (size_t j=0;j<n;j++)
+                  set.extend_center2(tmp[j]);
+                  
+                prims0[i] = tmp[0]; // first child replaces the opened reference, the others are appended
+                for (size_t j=1;j<n;j++)
+                  prims0[ext_range_start+extra_elements+j-1] = tmp[j]; 
+                extra_elements += n-1;
+
+                for (size_t j=0; j<n; j++)
+                  if (heuristic(tmp[j]))
+                    next_iteration_extra_elements += tmp[j].node.getN()-1; // coarse approximation
+
+              }
+            }
+            assert( extra_elements <= set.ext_range_size());
+            set._end += extra_elements; // the appended children become part of the set for the next pass
+
+            for (size_t i=set.begin();i<set.end();i++)
+              assert(prims0[i].numPrimitives() > 0);
+
+            if (unlikely(next_iteration_extra_elements == 0)) break; // none of the new children qualifies for opening
+          }
+        } 
+
+        __noinline const Split find(PrimInfoExtRange& set, const size_t logBlockSize)
+        { /* returns the best object split for set; beforehand it may open nodes into the extended range (modifies set and prims0) */
+          /* single element */
+          if (set.size() <= 1)
+            return Split();
+
+          /* disable opening if there is no overlap */
+          const size_t D = 4;
+          if (unlikely(set.has_ext_range() && set.size() <= D))
+          {
+            bool disjoint = true;
+            for (size_t j=set.begin(); disjoint && j<set.end()-1; j++) { /* stop as soon as one overlapping pair is found */
+              for (size_t i=j+1; i<set.end(); i++) { /* distinct pairs only: a ref must never be tested against itself */
+                if (conjoint(prims0[j].bounds(),prims0[i].bounds())) { 
+                  disjoint = false; break; 
+                }
+              }
+            }
+            if (disjoint) set.set_ext_range(set.end()); /* disables opening */
+          }
+
+          std::pair<size_t,bool> p(0,false);
+
+          /* disable opening when all primitives are from same geometry */
+          if (unlikely(set.has_ext_range()))
+          {
+            p =  getProperties(set); /* p.first is used below as the estimated number of refs that opening would add */
+#if EQUAL_GEOMID_STOP_CRITERIA == 1
+            if (p.second) set.set_ext_range(set.end()); /* disable opening */
+#endif         
+          }
+
+          /* open nodes when we have sufficient space available */
+          if (unlikely(set.has_ext_range()))
+          {
+#if USE_LOOP_OPENING == 1
+            openNodesBasedOnExtendLoop(set,p.first);
+#else
+            if (p.first <= set.ext_range_size())
+              openNodesBasedOnExtend(set);
+#endif
+
+            /* disable opening when there is insufficient space left for opening one more node */
+            if (set.ext_range_size() < max_open_size-1) 
+              set.set_ext_range(set.end()); /* disable opening */
+          }
+                    
+          /* find best split */
+          return object_find(set,logBlockSize);
+        }
+
+
+        /*! finds the best object split; sets smaller than PARALLEL_THRESHOLD are binned sequentially, larger ones in parallel */
+        __forceinline const Split object_find(const PrimInfoExtRange& set,const size_t logBlockSize)
+        {
+          if (set.size() < PARALLEL_THRESHOLD) return sequential_object_find(set,logBlockSize);
+          else                                 return parallel_object_find  (set,logBlockSize);
+        }
+
+        /*! finds the best object split by binning prims0[set.begin(),set.end()) in one sequential pass */
+        __noinline const Split sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          Binner binner(empty); 
+          const BinMapping<OBJECT_BINS> mapping(set.centBounds); /* bin mapping is derived from the centroid bounds of the set */
+          binner.bin(prims0,set.begin(),set.end(),mapping);
+          return binner.best(mapping,logBlockSize);
+        }
+
+        /*! finds the best object split; sub-ranges are binned into local binners that parallel_reduce merges */
+        __noinline const Split parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          Binner binner(empty);
+          const BinMapping<OBJECT_BINS> mapping(set.centBounds);
+          const BinMapping<OBJECT_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround
+          auto body = [&] (const range<size_t>& r) -> Binner {  /* bins one sub-range into a fresh binner */
+            Binner binner(empty); binner.bin(prims0+r.begin(),r.size(),_mapping); return binner; 
+          };
+          auto reduction = [&] (const Binner& b0, const Binner& b1) -> Binner {  /* merges two partial binners */
+            Binner r = b0; r.merge(b1,_mapping.size()); return r; 
+          };
+          binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner,body,reduction);
+          return binner.best(mapping,logBlockSize);
+        }
+        
+        /*! partitions prims0 over set_i into lset and rset according to split and hands the extended range on to both children */
+        __noinline void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset) 
+        {
+          PrimInfoExtRange set = set_i;
+
+          /* invalid split: sort the range for a deterministic order and fall back to splitting in the middle */
+          if (unlikely(!split.valid())) {
+            deterministic_order(set);
+            splitFallback(set,lset,rset);
+            return;
+          }
+
+          std::pair<size_t,size_t> ext_weights(0,0); /* (left,right) weights returned by the partitioning step */
+
+          /* object split */
+          if (likely(set.size() < PARALLEL_THRESHOLD)) 
+            ext_weights = sequential_object_split(split,set,lset,rset);
+          else
+            ext_weights = parallel_object_split(split,set,lset,rset);
+
+          /* if we have an extended range, set extended child ranges and move right split range */
+          if (unlikely(set.has_ext_range())) 
+          {
+            setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second);
+            moveExtentedRange(set,lset,rset);
+          }
+        }
+
+        /*! sequentially partitions prims0[set.begin(),set.end()) by the split and returns (left.size(),right.size()) */
+        std::pair<size_t,size_t> sequential_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) 
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfo local_left(empty);
+          PrimInfo local_right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim; /* single bit selecting the split dimension */
+
+          const vint4 vSplitPos(splitPos);
+          const vbool4 vSplitMask( (int)splitDimMask );
+
+          size_t center = serial_partitioning(prims0,
+                                              begin,end,local_left,local_right,
+                                              [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); },
+                                              [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); });          
+          /* construct both children in place; each starts with end == ext_end, the caller assigns extended ranges afterwards */
+          new (&lset) PrimInfoExtRange(begin,center,center,local_left);
+          new (&rset) PrimInfoExtRange(center,end,end,local_right);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+          return std::pair<size_t,size_t>(local_left.size(),local_right.size());
+        }
+
+        /*! partitions prims0[set.begin(),set.end()) by the split using parallel_partitioning and returns (left.size(),right.size()) */
+        __noinline std::pair<size_t,size_t> parallel_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfo left(empty);
+          PrimInfo right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim; /* single bit selecting the split dimension */
+
+          const vint4 vSplitPos(splitPos);
+          const vbool4 vSplitMask( (int)splitDimMask );
+          auto isLeft = [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); };
+
+          const size_t center = parallel_partitioning(
+            prims0,begin,end,EmptyTy(),left,right,isLeft,
+            [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); },
+            [] (PrimInfo& pinfo0,const PrimInfo& pinfo1) { pinfo0.merge(pinfo1); },
+            PARALLEL_PARTITION_BLOCK_SIZE);
+          /* construct both children in place; each starts with end == ext_end, the caller assigns extended ranges afterwards */
+          new (&lset) PrimInfoExtRange(begin,center,center,left);
+          new (&rset) PrimInfoExtRange(center,end,end,right);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+
+          return std::pair<size_t,size_t>(left.size(),right.size());
+        }
+
+        void deterministic_order(const extended_range<size_t>& set) /* sorts prims0[set.begin(),set.end()) with PrimRef's operator< */
+        {
+          /* required as parallel partition destroys original primitive order */
+          std::sort(&prims0[set.begin()],&prims0[set.end()]);
+        }
+
+        __forceinline void splitFallback(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        { /* fallback split: cuts the range at its middle index without reordering prims0 */
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          const size_t center = (begin + end)/2;
+
+          PrimInfo left(empty);
+          for (size_t i=begin; i<center; i++)
+            left.add_center2(prims0[i]);
+
+          const size_t lweight = left.end; /* NOTE(review): presumably equals left.size(), which the object-split paths use - confirm */
+          
+          PrimInfo right(empty);
+          for (size_t i=center; i<end; i++)
+            right.add_center2(prims0[i]);	
+
+          const size_t rweight = right.end; /* NOTE(review): see lweight */
+          new (&lset) PrimInfoExtRange(begin,center,center,left);
+          new (&rset) PrimInfoExtRange(center,end,end,right);
+
+          /* if we have an extended range */
+          if (set.has_ext_range()) 
+          {
+            setExtentedRanges(set,lset,rset,lweight,rweight);
+            moveExtentedRange(set,lset,rset);
+          }
+        }
+        
+      private:
+        PrimRef* const prims0;                 //!< primitive reference array that is opened, binned and partitioned in place
+        const NodeOpenerFunc& nodeOpenerFunc;  //!< called as nodeOpenerFunc(ref,tmp); its return value is used as the number of child refs in tmp
+        size_t max_open_size;                  //!< find() disables opening once fewer than max_open_size-1 extended slots remain
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial.h
new file mode 100644
index 0000000000..d8ca6cb92c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial.h
@@ -0,0 +1,414 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/scene.h"
+#include "priminfo.h"
+
+namespace embree
+{
+  static const unsigned int RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS = 5; //!< number of top geomID bits that SpatialBinInfo::bin reads as the "splits" value
+
+  namespace isa
+  {
+
+    /*! linear mapping from positions inside the geometry bounds to one of BINS bin indices per dimension */
+    template<size_t BINS>
+      struct SpatialBinMapping
+      {
+      public:
+        __forceinline SpatialBinMapping() {}
+        
+        /*! calculates the mapping from the geometry bounds stored in pinfo */
+        __forceinline SpatialBinMapping(const CentGeomBBox3fa& pinfo)
+        {
+          const vfloat4 lower = (vfloat4) pinfo.geomBounds.lower;
+          const vfloat4 upper = (vfloat4) pinfo.geomBounds.upper;
+          const vfloat4 eps = 128.0f*vfloat4(ulp)*max(abs(lower),abs(upper)); /* tolerance relative to the magnitude of the bounds */
+          const vfloat4 diag = max(eps,(vfloat4) pinfo.geomBounds.size());
+          scale = select(upper-lower <= eps,vfloat4(0.0f),vfloat4(BINS)/diag); /* scale 0 marks a dimension as invalid, see invalid() */
+          ofs  = (vfloat4) pinfo.geomBounds.lower;
+          inv_scale = 1.0f / scale;  /* NOTE(review): presumably +inf where scale is 0; bin2/binSubTreeRefs/best skip such dimensions via invalid() */
+        }
+
+        /*! slower but safe binning: the bin index is clamped to [0,BINS-1] in every dimension */
+        __forceinline vint4 bin(const Vec3fa& p) const
+        {
+          const vint4 i = floori((vfloat4(p)-ofs)*scale);
+          return clamp(i,vint4(0),vint4(BINS-1));
+        }
+
+        __forceinline std::pair<vint4,vint4> bin(const BBox3fa& b) const /* returns the clamped bins of (b.lower, b.upper) */
+        {
+#if defined(__AVX__)
+          const vfloat8 ofs8(ofs);
+          const vfloat8 scale8(scale);
+          const vint8 lu   = floori((vfloat8::loadu(&b)-ofs8)*scale8); /* assumes b stores lower then upper as 8 contiguous floats - TODO confirm */
+          const vint8 c_lu = clamp(lu,vint8(zero),vint8(BINS-1));
+          return std::pair<vint4,vint4>(extract4<0>(c_lu),extract4<1>(c_lu));
+#else
+          const vint4 lower = floori((vfloat4(b.lower)-ofs)*scale);
+          const vint4 upper = floori((vfloat4(b.upper)-ofs)*scale);
+          const vint4 c_lower = clamp(lower,vint4(0),vint4(BINS-1));
+          const vint4 c_upper = clamp(upper,vint4(0),vint4(BINS-1));
+          return std::pair<vint4,vint4>(c_lower,c_upper);
+#endif
+        }
+
+        
+        /*! calculates left spatial position of bin: bin*inv_scale[dim] + ofs[dim] */
+        __forceinline float pos(const size_t bin, const size_t dim) const {
+          return madd(float(bin),inv_scale[dim],ofs[dim]);
+        }
+
+        /*! N-wide variant of pos(): calculates the left spatial position of N bins at once */
+        template<size_t N>
+        __forceinline vfloat<N> posN(const vfloat<N> bin, const size_t dim) const {
+          return madd(bin,vfloat<N>(inv_scale[dim]),vfloat<N>(ofs[dim]));
+        }
+        
+        /*! returns true if the mapping is invalid in the given dimension (its scale is zero) */
+        __forceinline bool invalid(const size_t dim) const {
+          return scale[dim] == 0.0f;
+        }
+        
+      public:
+        vfloat4 ofs,scale,inv_scale;  //!< linear function that maps to bin ID
+      };
+
+    /*! stores all information required to perform some split */
+    template<size_t BINS>
+      struct SpatialBinSplit
+      {
+        /*! construct an invalid split by default (dim == -1, infinite SAH; mapping is default-constructed) */
+        __forceinline SpatialBinSplit() 
+          : sah(inf), dim(-1), pos(0), left(-1), right(-1), factor(1.0f) {}
+        
+        /*! constructs specified split; left and right are set to -1 */
+        __forceinline SpatialBinSplit(float sah, int dim, int pos, const SpatialBinMapping<BINS>& mapping)
+          : sah(sah), dim(dim), pos(pos), left(-1), right(-1), factor(1.0f), mapping(mapping) {}
+
+        /*! constructs specified split including the left/right element counts and the extended range factor */
+        __forceinline SpatialBinSplit(float sah, int dim, int pos, int left, int right, float factor, const SpatialBinMapping<BINS>& mapping)
+          : sah(sah), dim(dim), pos(pos), left(left), right(right), factor(factor), mapping(mapping) {}
+        
+        /*! tests if this split is valid, i.e. a split dimension has been set */
+        __forceinline bool valid() const { return dim != -1; }
+        
+        /*! returns the stored surface area heuristic cost of the split */
+        __forceinline float splitSAH() const { return sah; }
+        
+        /*! stream output */
+        friend embree_ostream operator<<(embree_ostream cout, const SpatialBinSplit& split) {
+          return cout << "SpatialBinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << ", left = " << split.left << ", right = " << split.right << ", factor = " << split.factor << "}";
+        }
+        
+      public:
+        float sah;                 //!< SAH cost of the split
+        int   dim;                 //!< split dimension, -1 for an invalid split
+        int   pos;                 //!< split position
+        int   left;                //!< number of elements on the left side
+        int   right;               //!< number of elements on the right side
+        float factor;              //!< factor splitting the extended range
+        SpatialBinMapping<BINS> mapping; //!< mapping into bins
+      };    
+    
+    /*! stores all binning information */
+    template<size_t BINS, typename PrimRef>
+      struct __aligned(64) SpatialBinInfo
+    {
+      SpatialBinInfo() { /* performs no explicit initialization; SpatialBinInfo(empty) or clear() resets the bins */
+      }
+
+      __forceinline SpatialBinInfo(EmptyTy) { /* constructs cleared bins (empty bounds, zero counters) */
+	clear();
+      }
+
+      /*! clears the bin info: empty bounds for all three dimensions and zero begin/end counters in every bin */
+      __forceinline void clear() 
+      {
+        for (size_t i=0; i<BINS; i++) { 
+          bounds[i][0] = bounds[i][1] = bounds[i][2] = empty;
+          numBegin[i] = numEnd[i] = 0;
+        }
+      }
+      
+      /*! adds binning data for dimension dim: n begins in bin beginID, n ends in bin endID, and b extends the bounds of bin binID */
+      __forceinline void add(const size_t dim,
+                             const size_t beginID, 
+                             const size_t endID, 
+                             const size_t binID, 
+                             const BBox3fa &b,
+                             const size_t n = 1) 
+      {
+        assert(beginID < BINS);
+        assert(endID < BINS);
+        assert(binID < BINS);
+
+        numBegin[beginID][dim]+=(unsigned int)n;
+        numEnd  [endID][dim]+=(unsigned int)n;
+        bounds  [binID][dim].extend(b);        
+      }
+
+      /*! extends the bounds of bin binID in dimension dim by b; the begin/end counters are left untouched */
+      __forceinline void extend(const size_t dim,
+                                const size_t binID, 
+                                const BBox3fa &b) 
+      {
+        assert(binID < BINS);
+        bounds  [binID][dim].extend(b);        
+      }
+      
+      /*! bins N primitives; a primitive straddling several bins is cut at the bin boundaries using splitPrimitive */
+      template<typename SplitPrimitive>
+        __forceinline void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t N, const SpatialBinMapping<BINS>& mapping)
+      {
+        for (size_t i=0; i<N; i++)
+        {
+          const PrimRef prim = prims[i];
+          unsigned splits = prim.geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); /* value stored in the top geomID bits */
+
+          if (unlikely(splits == 1)) /* such primitives are binned by their center and never cut */
+          {
+            const vint4 bin = mapping.bin(center(prim.bounds()));
+            for (size_t dim=0; dim<3; dim++) 
+            {
+              assert(bin[dim] >= (int)0 && bin[dim] < (int)BINS);
+              numBegin[bin[dim]][dim]++;
+              numEnd  [bin[dim]][dim]++;
+              bounds  [bin[dim]][dim].extend(prim.bounds());
+            }
+          } 
+          else
+          {
+            const vint4 bin0 = mapping.bin(prim.bounds().lower);
+            const vint4 bin1 = mapping.bin(prim.bounds().upper);
+            
+            for (size_t dim=0; dim<3; dim++) 
+            {
+              size_t bin;
+              PrimRef rest = prim; /* part of the primitive that still has to be binned */
+              size_t l = bin0[dim]; /* bin that receives the begin count */
+              size_t r = bin1[dim]; /* bin that receives the end count */
+
+              // same bin optimization
+              if (likely(l == r)) 
+              {
+                numBegin[l][dim]++;
+                numEnd  [l][dim]++;
+                bounds  [l][dim].extend(prim.bounds());
+                continue;
+              }
+
+              for (bin=(size_t)bin0[dim]; bin<(size_t)bin1[dim]; bin++) 
+              {
+                const float pos = mapping.pos(bin+1,dim); /* right boundary of the current bin */
+                
+                PrimRef left,right;
+                splitPrimitive(rest,(int)dim,pos,left,right);
+                if (unlikely(left.bounds().empty())) l++; /* nothing falls into this bin: move the begin bin right */
+                bounds[bin][dim].extend(left.bounds());
+                rest = right;
+              }
+              if (unlikely(rest.bounds().empty())) r--; /* nothing left for the last bin: move the end bin left */
+              numBegin[l][dim]++;
+              numEnd  [r][dim]++;
+              bounds  [bin][dim].extend(rest.bounds());
+            }
+          }
+        }
+      }
+      
+      /*! bins the range [begin,end) of prims by forwarding to the pointer/count overload of bin() */
+      template<typename SplitPrimitive>
+        void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) {
+	bin(splitPrimitive,prims+begin,end-begin,mapping);
+      }
+
+      /*! bins source[begin,end); bounds straddling several bins are cut at the bin boundaries by a splitter obtained from splitterFactory */
+      template<typename PrimitiveSplitterFactory>
+        __forceinline void bin2(const PrimitiveSplitterFactory& splitterFactory, const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping)
+      {
+        for (size_t i=begin; i<end; i++)
+        {
+          const PrimRef &prim = source[i];
+          const vint4 bin0 = mapping.bin(prim.bounds().lower);
+          const vint4 bin1 = mapping.bin(prim.bounds().upper);
+          
+          for (size_t dim=0; dim<3; dim++) 
+          {
+            if (unlikely(mapping.invalid(dim))) /* dimensions with zero scale are not binned */
+              continue;
+            
+            size_t bin;
+            size_t l = bin0[dim]; /* bin that receives the begin count */
+            size_t r = bin1[dim]; /* bin that receives the end count */
+            
+            // same bin optimization
+            if (likely(l == r)) 
+            {
+              add(dim,l,l,l,prim.bounds());
+              continue;
+            }
+            const size_t bin_start = bin0[dim];
+            const size_t bin_end   = bin1[dim];
+            BBox3fa rest = prim.bounds(); /* bounds that still have to be distributed over the remaining bins */
+            const auto splitter = splitterFactory(prim);
+            for (bin=bin_start; bin<bin_end; bin++) 
+            {
+              const float pos = mapping.pos(bin+1,dim); /* right boundary of the current bin */
+              BBox3fa left,right;
+              splitter(rest,dim,pos,left,right);
+              if (unlikely(left.empty())) l++; /* nothing falls into this bin: move the begin bin right */
+              extend(dim,bin,left);
+              rest = right;
+            }
+            if (unlikely(rest.empty())) r--; /* nothing left for the last bin: move the end bin left */
+            add(dim,l,r,bin,rest);
+          }
+        }              
+      }
+
+
+
+      /*! bins source[begin,end) without cutting any bounds; every reference is weighted by its primID() value */
+      __forceinline void binSubTreeRefs(const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping)
+      {
+        for (size_t i=begin; i<end; i++)
+        {
+          const PrimRef &prim = source[i];
+          const vint4 bin0 = mapping.bin(prim.bounds().lower);
+          const vint4 bin1 = mapping.bin(prim.bounds().upper);
+          
+          for (size_t dim=0; dim<3; dim++) 
+          {
+            if (unlikely(mapping.invalid(dim))) /* dimensions with zero scale are not binned */
+              continue;
+            
+            const size_t l = bin0[dim];
+            const size_t r = bin1[dim];
+
+            const unsigned int n  = prim.primID(); /* used as the count added to the begin/end counters */
+            
+            // same bin optimization
+            if (likely(l == r)) 
+            {
+              add(dim,l,l,l,prim.bounds(),n);
+              continue;
+            }
+            const size_t bin_start = bin0[dim];
+            const size_t bin_end   = bin1[dim];
+            for (size_t bin=bin_start; bin<bin_end; bin++) /* NOTE(review): add() runs r-l times, so n is counted r-l times and bin r never gets the bounds - confirm intent */
+              add(dim,l,r,bin,prim.bounds(),n);
+          }
+        }              
+      }
+      
+      /*! merges in other binning information: counters are summed and the bounds of every bin are extended */
+      void merge (const SpatialBinInfo& other)
+      {
+        for (size_t i=0; i<BINS; i++) 
+        {
+          numBegin[i] += other.numBegin[i];
+          numEnd  [i] += other.numEnd  [i];
+          bounds[i][0].extend(other.bounds[i][0]);
+          bounds[i][1].extend(other.bounds[i][1]);
+          bounds[i][2].extend(other.bounds[i][2]);
+        }
+      }
+
+      /*! returns a new bin info that combines a and b (summed counters, merged bounds); neither input is modified */
+      static __forceinline const SpatialBinInfo reduce (const SpatialBinInfo& a, const SpatialBinInfo& b)
+      {
+        SpatialBinInfo c(empty); /* starts cleared, so += below yields exactly a+b */
+        for (size_t i=0; i<BINS; i++) 
+        {
+          c.numBegin[i] += a.numBegin[i]+b.numBegin[i];
+          c.numEnd  [i] += a.numEnd  [i]+b.numEnd  [i];
+          c.bounds[i][0] = embree::merge(a.bounds[i][0],b.bounds[i][0]);
+          c.bounds[i][1] = embree::merge(a.bounds[i][1],b.bounds[i][1]);
+          c.bounds[i][2] = embree::merge(a.bounds[i][2],b.bounds[i][2]);
+        }
+        return c;
+      }
+      
+      /*! finds the best split by scanning binning information; counts are rounded up to blocks of 2^blocks_shift elements */
+      SpatialBinSplit<BINS> best(const SpatialBinMapping<BINS>& mapping, const size_t blocks_shift) const 
+      {
+        /* sweep from right to left and compute parallel prefix of merged bounds */
+        vfloat4 rAreas[BINS];
+        vuint4 rCounts[BINS];
+        vuint4 count = 0; BBox3fa bx = empty; BBox3fa by = empty; BBox3fa bz = empty;
+        for (size_t i=BINS-1; i>0; i--)
+        {
+          count += numEnd[i];
+          rCounts[i] = count; /* number of elements ending in bins [i,BINS) */
+          bx.extend(bounds[i][0]); rAreas[i][0] = halfArea(bx);
+          by.extend(bounds[i][1]); rAreas[i][1] = halfArea(by);
+          bz.extend(bounds[i][2]); rAreas[i][2] = halfArea(bz);
+          rAreas[i][3] = 0.0f;
+        }
+        
+        /* sweep from left to right and compute SAH */
+        vuint4 blocks_add = (1 << blocks_shift)-1; /* added before the shift so that counts round up to whole blocks */
+        vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; vuint4 vbestlCount = 0; vuint4 vbestrCount = 0;
+        count = 0; bx = empty; by = empty; bz = empty;
+        for (size_t i=1; i<BINS; i++, ii+=1)
+        {
+          count += numBegin[i-1]; /* number of elements beginning in bins [0,i) */
+          bx.extend(bounds[i-1][0]); float Ax = halfArea(bx);
+          by.extend(bounds[i-1][1]); float Ay = halfArea(by);
+          bz.extend(bounds[i-1][2]); float Az = halfArea(bz);
+          const vfloat4 lArea = vfloat4(Ax,Ay,Az,Az);
+          const vfloat4 rArea = rAreas[i];
+          const vuint4 lCount = (count     +blocks_add) >> (unsigned int)(blocks_shift);
+          const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift);
+          const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount));
+          // const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount)));
+          const vbool4 mask = sah < vbestSAH; /* per dimension: keep position i if it has the lowest SAH so far */
+          vbestPos      = select(mask,ii ,vbestPos);
+          vbestSAH      = select(mask,sah,vbestSAH);
+          vbestlCount   = select(mask,count,vbestlCount);
+          vbestrCount   = select(mask,rCounts[i],vbestrCount);
+        }
+        
+        /* find best dimension */
+        float bestSAH = inf;
+        int   bestDim = -1;
+        int   bestPos = 0;
+        unsigned int   bestlCount = 0;
+        unsigned int   bestrCount = 0;
+        for (int dim=0; dim<3; dim++) 
+        {
+          /* ignore zero sized dimensions */
+          if (unlikely(mapping.invalid(dim)))
+            continue;
+          
+          /* test if this is a better dimension; position 0 means no split position was selected in this dimension */
+          if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) {
+            bestDim = dim;
+            bestPos = vbestPos[dim];
+            bestSAH = vbestSAH[dim];
+            bestlCount = vbestlCount[dim];
+            bestrCount = vbestrCount[dim];
+          }
+        }
+        assert(bestSAH >= 0.0f);
+        
+        /* return invalid split if no split found */
+        if (bestDim == -1) 
+          return SpatialBinSplit<BINS>(inf,-1,0,mapping);
+        
+        /* return best found split */
+        return SpatialBinSplit<BINS>(bestSAH,bestDim,bestPos,bestlCount,bestrCount,1.0f,mapping);
+      }
+      
+    private:
+      BBox3fa bounds[BINS][3];  //!< geometry bounds for each bin in each dimension
+      vuint4    numBegin[BINS];   //!< number of primitives starting in bin, component dim holds the counter of dimension dim
+      vuint4    numEnd[BINS];     //!< number of primitives ending in bin, component dim holds the counter of dimension dim
+    };
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h
new file mode 100644
index 0000000000..911dcf950c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h
@@ -0,0 +1,552 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning.h"
+#include "heuristic_spatial.h"
+
+namespace embree
+{
+  namespace isa
+  { 
+#if 0 /* alternative threshold set, disabled */
+#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.2f
+#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.95f
+#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.0f
+#else
+#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.1f /* find(): child overlap area must reach this fraction of the set's geometry bounds area */
+#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.99f /* find(): spatial split SAH must be below this fraction of the object split SAH */
+#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.000005f /* find(): child overlap area must reach this fraction of the root geometry bounds area */
+#endif
+
+    struct PrimInfoExtRange : public CentGeomBBox3fa, public extended_range<size_t> /* bounds plus an index range with an extended end */
+    {
+      __forceinline PrimInfoExtRange() { /* performs no explicit initialization */
+      }
+
+      __forceinline PrimInfoExtRange(EmptyTy) /* empty bounds and the range (0,0,0) */
+        : CentGeomBBox3fa(EmptyTy()), extended_range<size_t>(0,0,0) {}
+
+      __forceinline PrimInfoExtRange(size_t begin, size_t end, size_t ext_end, const CentGeomBBox3fa& centGeomBounds) 
+        : CentGeomBBox3fa(centGeomBounds), extended_range<size_t>(begin,end,ext_end) {}
+      
+      __forceinline float leafSAH() const {  /* geometry bounds half area estimate times the number of elements */
+	return expectedApproxHalfArea(geomBounds)*float(size()); 
+      }
+      
+      __forceinline float leafSAH(size_t block_shift) const {  /* as above, with size() rounded up to blocks of 2^block_shift elements */
+	return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift);
+      }
+    };
+
+    template<typename ObjectSplit, typename SpatialSplit>
+      struct Split2 /* holds either an ObjectSplit or a SpatialSplit in the shared buffer `data`; `spatial` tells which one */
+      {
+        __forceinline Split2 () {} /* leaves data, spatial and sah without explicit initialization */
+        
+        __forceinline Split2 (const Split2& other) 
+        {
+          spatial = other.spatial;
+          sah = other.sah;
+          if (spatial) spatialSplit() = other.spatialSplit(); /* NOTE(review): assigns into raw storage, the converting constructors use placement new */
+          else         objectSplit()  = other.objectSplit();
+        }
+        
+        __forceinline Split2& operator= (const Split2& other) 
+        {
+          spatial = other.spatial;
+          sah = other.sah;
+          if (spatial) spatialSplit() = other.spatialSplit();
+          else         objectSplit()  = other.objectSplit();
+          return *this;
+        }
+          
+          __forceinline     ObjectSplit&  objectSplit()        { return *(      ObjectSplit*)data; } /* reinterprets data; no check of `spatial` */
+        __forceinline const ObjectSplit&  objectSplit() const  { return *(const ObjectSplit*)data; }
+        
+        __forceinline       SpatialSplit& spatialSplit()       { return *(      SpatialSplit*)data; } /* reinterprets data; no check of `spatial` */
+        __forceinline const SpatialSplit& spatialSplit() const { return *(const SpatialSplit*)data; }
+        
+        __forceinline Split2 (const ObjectSplit& objectSplit, float sah) /* stores an object split */
+          : spatial(false), sah(sah) 
+        {
+          new (data) ObjectSplit(objectSplit);
+        }
+        
+        __forceinline Split2 (const SpatialSplit& spatialSplit, float sah) /* stores a spatial split */
+          : spatial(true), sah(sah) 
+        {
+          new (data) SpatialSplit(spatialSplit);
+        }
+        
+        __forceinline float splitSAH() const { 
+          return sah; 
+        }
+        
+        __forceinline bool valid() const { /* a split counts as valid when its SAH is finite */
+          return sah < float(inf);
+        }
+        
+      public:
+        __aligned(64) char data[sizeof(ObjectSplit) > sizeof(SpatialSplit) ? sizeof(ObjectSplit) : sizeof(SpatialSplit)]; /* sized for the larger of both split types */
+        bool spatial; /* true if data holds a SpatialSplit, false if it holds an ObjectSplit */
+        float sah; /* SAH cost passed in at construction */
+      };
+    
+    /*! Performs object binning combined with spatial splits; find() selects between an object split and a spatial split */
+    template<typename PrimitiveSplitterFactory, typename PrimRef, size_t OBJECT_BINS, size_t SPATIAL_BINS>
+      struct HeuristicArraySpatialSAH
+      {
+        typedef BinSplit<OBJECT_BINS> ObjectSplit; /* result type of object binning */
+        typedef BinInfoT<OBJECT_BINS,PrimRef,BBox3fa> ObjectBinner;
+
+        typedef SpatialBinSplit<SPATIAL_BINS> SpatialSplit; /* result type of spatial binning */
+        typedef SpatialBinInfo<SPATIAL_BINS,PrimRef> SpatialBinner;
+
+        //typedef extended_range<size_t> Set;
+        typedef Split2<ObjectSplit,SpatialSplit> Split; /* holds either an object split or a spatial split */
+        
+#if defined(__AVX512ER__) // KNL
+        static const size_t PARALLEL_THRESHOLD = 3*1024; 
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 768;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+#else
+        static const size_t PARALLEL_THRESHOLD = 3*1024;
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+#endif
+
+        static const size_t MOVE_STEP_SIZE = 64; /* block size passed to parallel_for in moveExtentedRange */
+        static const size_t CREATE_SPLITS_STEP_SIZE = 64;
+
+        __forceinline HeuristicArraySpatialSAH () /* sets prims0 to nullptr; the other members are not mentioned in the initializer list */
+          : prims0(nullptr) {}
+        
+        /*! remember prim array, the splitter factory and the bounds info of the root */
+        __forceinline HeuristicArraySpatialSAH (const PrimitiveSplitterFactory& splitterFactory, PrimRef* prims0, const CentGeomBBox3fa& root_info)
+          : prims0(prims0), splitterFactory(splitterFactory), root_info(root_info) {}
+
+
+        /*! compute extended ranges: distributes the extended range of set between lset and rset proportionally to lweight and rweight */
+        __noinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight)
+        {
+          assert(set.ext_range_size() > 0);
+          const float left_factor           = (float)lweight / (lweight + rweight); /* NOTE(review): expects lweight+rweight > 0 - confirm at callers */
+          const size_t ext_range_size       = set.ext_range_size();
+          const size_t left_ext_range_size  = min((size_t)(floorf(left_factor * ext_range_size)),ext_range_size); /* clamped to the available size */
+          const size_t right_ext_range_size = ext_range_size - left_ext_range_size; /* the right side gets the remainder */
+          lset.set_ext_range(lset.end() + left_ext_range_size);
+          rset.set_ext_range(rset.end() + right_ext_range_size);
+        }
+
+        /*! move ranges: shifts the right range by the size of the left extended range so that this space follows the left range */
+        __noinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t left_ext_range_size = lset.ext_range_size();
+          const size_t right_size = rset.size();
+
+          /* has the left child an extended range? */
+          if (left_ext_range_size > 0)
+          {
+            /* left extended range smaller than right range ? */
+            if (left_ext_range_size < right_size)
+            {
+              /* only move a small part of the beginning of the right range to the end */
+              parallel_for( rset.begin(), rset.begin()+left_ext_range_size, MOVE_STEP_SIZE, [&](const range<size_t>& r) {                  
+                  for (size_t i=r.begin(); i<r.end(); i++)
+                    prims0[i+right_size] = prims0[i]; /* destination lies behind the current right range, so this changes the element order */
+                });
+            }
+            else
+            {
+              /* no overlap, move entire right range to new location, can be made fully parallel */
+              parallel_for( rset.begin(), rset.end(), MOVE_STEP_SIZE,  [&](const range<size_t>& r) {
+                  for (size_t i=r.begin(); i<r.end(); i++)
+                    prims0[i+left_ext_range_size] = prims0[i];
+                });
+            }
+            /* update right range */
+            assert(rset.ext_end() + left_ext_range_size == set.ext_end());
+            rset.move_right(left_ext_range_size);
+          }
+        }
+
+        /*! Finds the best split for the range.
+         *  An object split is always computed. A spatial split is only tried
+         *  when the range has an extended range and the bounds of the two
+         *  object-split children overlap enough; it is returned when its SAH
+         *  beats the scaled object-split SAH and the additional primitives it
+         *  creates fit into the extended range. */
+        const Split find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          SplitInfo oinfo;
+          const ObjectSplit object_split = object_find(set,logBlockSize,oinfo);
+          const float object_split_sah = object_split.splitSAH();
+
+          /* without an extended range only the object split is possible */
+          if (likely(!set.has_ext_range()))
+            return Split(object_split,object_split_sah);
+
+          /* do only spatial splits if the child bounds overlap */
+          const BBox3fa overlap = intersect(oinfo.leftBounds, oinfo.rightBounds);
+          const bool enough_overlap =
+            safeArea(overlap) >= SPATIAL_ASPLIT_AREA_THRESHOLD*safeArea(root_info.geomBounds) &&
+            safeArea(overlap) >= SPATIAL_ASPLIT_OVERLAP_THRESHOLD*safeArea(set.geomBounds);
+          if (!enough_overlap)
+            return Split(object_split,object_split_sah);
+
+          const SpatialSplit spatial_split = spatial_find(set, logBlockSize);
+          const float spatial_split_sah = spatial_split.splitSAH();
+
+          /* valid spatial split, better SAH and number of splits do not exceed extended range */
+          const bool spatial_wins =
+            spatial_split_sah < SPATIAL_ASPLIT_SAH_THRESHOLD*object_split_sah &&
+            spatial_split.left + spatial_split.right - set.size() <= set.ext_range_size();
+          if (spatial_wins)
+            return Split(spatial_split,spatial_split_sah);
+
+          return Split(object_split,object_split_sah);
+        }
+
+        /*! finds the best object split, dispatching to the sequential version
+         *  for ranges smaller than PARALLEL_THRESHOLD and to the parallel
+         *  version otherwise */
+        __forceinline const ObjectSplit object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info)
+        {
+          const bool small_set = set.size() < PARALLEL_THRESHOLD;
+          return small_set ? sequential_object_find(set,logBlockSize,info)
+                           : parallel_object_find  (set,logBlockSize,info);
+        }
+
+        /*! finds the best object split by binning the whole range on the
+         *  calling thread; `info` receives the split info of the chosen split */
+        __noinline const ObjectSplit sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info)
+        {
+          const BinMapping<OBJECT_BINS> mapping(set);
+
+          ObjectBinner bins(empty);
+          bins.bin(prims0,set.begin(),set.end(),mapping);
+
+          ObjectSplit best_split = bins.best(mapping,logBlockSize);
+          bins.getSplitInfo(mapping, best_split, info);
+          return best_split;
+        }
+
+        /*! finds the best object split, binning the range with parallel_reduce;
+         *  `info` receives the split info of the chosen split */
+        __noinline const ObjectSplit parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info)
+        {
+          ObjectBinner binner(empty);
+          const BinMapping<OBJECT_BINS> mapping(set);
+          const BinMapping<OBJECT_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround
+          /* first lambda: bins one sub-range into a fresh binner;
+             second lambda: merges two partial binners into a new one */
+          binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner,
+                                   [&] (const range<size_t>& r) -> ObjectBinner { ObjectBinner binner(empty); binner.bin(prims0+r.begin(),r.size(),_mapping); return binner; },
+                                   [&] (const ObjectBinner& b0, const ObjectBinner& b1) -> ObjectBinner { ObjectBinner r = b0; r.merge(b1,_mapping.size()); return r; });
+          /* evaluate the merged bins exactly like the sequential version does */
+          ObjectSplit s = binner.best(mapping,logBlockSize);
+          binner.getSplitInfo(mapping, s, info);
+          return s;
+        }
+
+        /*! finds the best spatial split, dispatching to the sequential version
+         *  for ranges smaller than PARALLEL_THRESHOLD and to the parallel
+         *  version otherwise */
+        __forceinline const SpatialSplit spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          const bool small_set = set.size() < PARALLEL_THRESHOLD;
+          return small_set ? sequential_spatial_find(set, logBlockSize)
+                           : parallel_spatial_find  (set, logBlockSize);
+        }
+
+        /*! finds the best spatial split by binning the whole range on the
+         *  calling thread */
+        __noinline const SpatialSplit sequential_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          const SpatialBinMapping<SPATIAL_BINS> mapping(set);
+
+          SpatialBinner bins(empty);
+          bins.bin2(splitterFactory,prims0,set.begin(),set.end(),mapping);
+
+          /* todo: best spatial split not exceeding the extended range does not provide any benefit ?*/
+          const SpatialSplit best_split = bins.best(mapping,logBlockSize); //,set.ext_size());
+          return best_split;
+        }
+
+        /*! finds the best spatial split, binning the range with parallel_reduce */
+        __noinline const SpatialSplit parallel_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          SpatialBinner binner(empty);
+          const SpatialBinMapping<SPATIAL_BINS> mapping(set);
+          const SpatialBinMapping<SPATIAL_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround
+          /* first lambda: bins one sub-range into a fresh binner;
+             second lambda: combines two partial binners via SpatialBinner::reduce */
+          binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner,
+                                   [&] (const range<size_t>& r) -> SpatialBinner { 
+                                     SpatialBinner binner(empty); 
+                                     binner.bin2(splitterFactory,prims0,r.begin(),r.end(),_mapping);
+                                     return binner; },
+                                   [&] (const SpatialBinner& b0, const SpatialBinner& b1) -> SpatialBinner { return SpatialBinner::reduce(b0,b1); });
+          /* todo: best spatial split not exceeding the extended range does not provide any benefit ?*/
+          return binner.best(mapping,logBlockSize); //,set.ext_size());
+        }
+
+
+        /*! subdivides primitives based on a spatial split
+         *
+         *  Every primitive of the range that straddles the split plane and still
+         *  has a split budget greater than one is cut in two: the left part
+         *  replaces the primitive in place, the right part is appended behind
+         *  set.end() inside the extended range. On return set._end has grown by
+         *  the number of appended primitives.
+         *
+         *  \param set      range to subdivide, has to own an extended range
+         *  \param split    spatial split providing dimension and position
+         *  \param mapping  not referenced by the body, split.mapping is used instead */
+        __noinline void create_spatial_splits(PrimInfoExtRange& set, const SpatialSplit& split, const SpatialBinMapping<SPATIAL_BINS> &mapping)
+        {
+          assert(set.has_ext_range());
+          const size_t max_ext_range_size = set.ext_range_size();
+          const size_t ext_range_start = set.end();
+
+          /* atomic counter for number of primref splits */
+          std::atomic<size_t> ext_elements;
+          ext_elements.store(0);
+          
+          /* split plane position along split.dim */
+          const float fpos = split.mapping.pos(split.pos,split.dim);
+        
+          /* selects the bits below the reserved split-count bits */
+          const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;
+
+          parallel_for( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, [&](const range<size_t>& r) {
+              for (size_t i=r.begin();i<r.end();i++)
+              {
+                /* remaining split budget, stored in the topmost bits of the geomID */
+                const unsigned int splits = prims0[i].geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
+
+                if (likely(splits <= 1)) continue; /* todo: does this ever happen ? */
+
+                //int bin0 = split.mapping.bin(prims0[i].lower)[split.dim];
+                //int bin1 = split.mapping.bin(prims0[i].upper)[split.dim];
+                //if (unlikely(bin0 < split.pos && bin1 >= split.pos))
+                /* only primitives strictly straddling the split plane get cut */
+                if (unlikely(prims0[i].lower[split.dim] < fpos && prims0[i].upper[split.dim] > fpos))
+                {
+                  assert(splits > 1);
+
+                  PrimRef left,right;
+                  const auto splitter = splitterFactory(prims0[i]);
+                  splitter(prims0[i],split.dim,fpos,left,right);
+                
+                  // no empty splits
+                  if (unlikely(left.bounds().empty() || right.bounds().empty())) continue;
+                
+                  /* both halves get the budget reduced by one written into the
+                     reserved top bits of lower.u */
+                  left.lower.u  = (left.lower.u  & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));
+                  right.lower.u = (right.lower.u & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));
+
+                  /* reserve a slot in the extended range; the counter may run past
+                     max_ext_range_size, which is clamped after the loop */
+                  const size_t ID = ext_elements.fetch_add(1);
+
+                  /* break if the number of subdivided elements are greater than the maximum allowed size */
+                  /* note: this leaves the loop over the current sub-range r only and
+                     keeps prims0[i] unsplit */
+                  if (unlikely(ID >= max_ext_range_size)) 
+                    break;
+
+                  /* only write within the correct bounds */
+                  assert(ID < max_ext_range_size);
+                  prims0[i] = left;
+                  prims0[ext_range_start+ID] = right;     
+                }
+              }
+            });
+
+          /* number of slots actually written, the counter itself may have overshot */
+          const size_t numExtElements = min(max_ext_range_size,ext_elements.load());          
+          assert(set.end()+numExtElements<=set.ext_end());
+          set._end += numExtElements;
+        }
+        
+        /*! Partitions the range according to the given split.
+         *  An invalid split sorts the range and falls back to splitFallback.
+         *  A spatial split first creates the split primitives inside the
+         *  extended range and then partitions; an object split partitions
+         *  directly. Ranges smaller than PARALLEL_THRESHOLD are partitioned
+         *  sequentially. If an extended range is left, it is distributed to
+         *  the children according to the weights returned by the partitioning. */
+        void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset) 
+        {
+          /* local copy, create_spatial_splits modifies the range */
+          PrimInfoExtRange set = set_i;
+
+          /* invalid split: sort for a deterministic order and use the fallback */
+          if (unlikely(!split.valid())) {
+            deterministic_order(set);
+            splitFallback(set,lset,rset);
+            return;
+          }
+
+          std::pair<size_t,size_t> ext_weights(0,0);
+
+          if (unlikely(split.spatial))
+          {
+            /* spatial split: generate split primitives, then partition */
+            create_spatial_splits(set,split.spatialSplit(), split.spatialSplit().mapping);
+
+            ext_weights = likely(set.size() < PARALLEL_THRESHOLD)
+              ? sequential_spatial_split(split.spatialSplit(),set,lset,rset)
+              : parallel_spatial_split  (split.spatialSplit(),set,lset,rset);
+          }
+          else
+          {
+            /* object split */
+            ext_weights = likely(set.size() < PARALLEL_THRESHOLD)
+              ? sequential_object_split(split.objectSplit(),set,lset,rset)
+              : parallel_object_split  (split.objectSplit(),set,lset,rset);
+          }
+
+          /* without an extended range there is nothing left to distribute */
+          if (likely(!set.has_ext_range()))
+            return;
+
+          /* set extended child ranges and move right split range */
+          setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second);
+          moveExtentedRange(set,lset,rset);
+        }
+
+        /*! Partitions the range sequentially according to an object split.
+         *  lset and rset are re-constructed in place as the two halves, without
+         *  an extended range. Returns the pair (left weight, right weight) taken
+         *  from the `end` fields of the two accumulated PrimInfo objects. */
+        std::pair<size_t,size_t> sequential_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) 
+        {
+          const size_t first = set.begin();
+          const size_t last  = set.end();
+
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+          const typename ObjectBinner::vint vSplitPos(splitPos);
+          const typename ObjectBinner::vbool vSplitMask(splitDimMask);
+
+          /* decides on which side of the split a primitive ends up */
+          auto goesLeft = [&] (const PrimRef& ref) {
+            return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask);
+          };
+          /* accumulates a primitive, weighted by the value in its reserved top bits */
+          auto accumulate = [] (PrimInfo& pinfo,const PrimRef& ref) {
+            pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));
+          };
+
+          PrimInfo left_info(empty);
+          PrimInfo right_info(empty);
+          const size_t center = serial_partitioning(prims0,first,last,left_info,right_info,goesLeft,accumulate);
+
+          const std::pair<size_t,size_t> weights(left_info.end,right_info.end);
+
+          new (&lset) PrimInfoExtRange(first,center,center,left_info);
+          new (&rset) PrimInfoExtRange(center,last,last,right_info);
+
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+          return weights;
+        }
+
+
+        /*! Partitions the range sequentially according to a spatial split.
+         *  A primitive goes left when the bin of its bounds center lies below
+         *  the split position in the split dimension. lset and rset are
+         *  re-constructed in place as the two halves, without an extended range.
+         *  Returns the pair (left weight, right weight) taken from the `end`
+         *  fields of the two accumulated PrimInfo objects. */
+        __noinline std::pair<size_t,size_t> sequential_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) 
+        {
+          const size_t first = set.begin();
+          const size_t last  = set.end();
+
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+
+          /* init spatial mapping */
+          const SpatialBinMapping<SPATIAL_BINS> &mapping = split.mapping;
+          const vint4 vSplitPos(splitPos);
+          const vbool4 vSplitMask( (int)splitDimMask );
+
+          /* classifies a primitive by the bin of its center */
+          auto goesLeft = [&] (const PrimRef& ref) {
+            const Vec3fa c = ref.bounds().center();
+            return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask);
+          };
+          /* accumulates a primitive, weighted by the value in its reserved top bits */
+          auto accumulate = [] (PrimInfo& pinfo,const PrimRef& ref) {
+            pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));
+          };
+
+          PrimInfo left_info(empty);
+          PrimInfo right_info(empty);
+          const size_t center = serial_partitioning(prims0,first,last,left_info,right_info,goesLeft,accumulate);
+
+          const std::pair<size_t,size_t> weights(left_info.end,right_info.end);
+
+          new (&lset) PrimInfoExtRange(first,center,center,left_info);
+          new (&rset) PrimInfoExtRange(center,last,last,right_info);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+          return weights;
+        }
+
+
+        
+        /*! Partitions the range in parallel according to an object split.
+         *  lset and rset are re-constructed in place as the two halves, without
+         *  an extended range. Returns the pair (left weight, right weight) taken
+         *  from the `end` fields of the reduced PrimInfo objects before those
+         *  fields are overwritten with the range bounds. */
+        __noinline std::pair<size_t,size_t> parallel_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t first = set.begin();
+          const size_t last  = set.end();
+
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+          const typename ObjectBinner::vint vSplitPos(splitPos);
+          const typename ObjectBinner::vbool vSplitMask(splitDimMask);
+
+          /* classification, per-primitive accumulation and merge of partial results */
+          auto goesLeft   = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); };
+          auto accumulate = [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); };
+          auto combine    = [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); };
+
+          PrimInfo left(empty);
+          PrimInfo right(empty);
+          const size_t center = parallel_partitioning(
+            prims0,first,last,EmptyTy(),left,right,goesLeft,accumulate,combine,
+            PARALLEL_PARTITION_BLOCK_SIZE);
+
+          /* fetch the weights before the end fields are reused as range bounds */
+          const std::pair<size_t,size_t> weights(left.end,right.end);
+
+          left.begin  = first;
+          left.end    = center;
+          right.begin = center;
+          right.end   = last;
+
+          new (&lset) PrimInfoExtRange(first,center,center,left);
+          new (&rset) PrimInfoExtRange(center,last,last,right);
+
+          assert(area(left.geomBounds) >= 0.0f);
+          assert(area(right.geomBounds) >= 0.0f);
+          return weights;
+        }
+
+        /*! Partitions the range in parallel according to a spatial split.
+         *  A primitive goes left when the bin of its bounds center lies below
+         *  the split position in the split dimension. lset and rset are
+         *  re-constructed in place as the two halves, without an extended range.
+         *  Returns the pair (left weight, right weight) taken from the `end`
+         *  fields of the reduced PrimInfo objects before those fields are
+         *  overwritten with the range bounds. */
+        __noinline std::pair<size_t,size_t> parallel_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t first = set.begin();
+          const size_t last  = set.end();
+
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+
+          /* init spatial mapping */
+          const SpatialBinMapping<SPATIAL_BINS>& mapping = split.mapping;
+          const vint4 vSplitPos(splitPos);
+          const vbool4 vSplitMask( (int)splitDimMask );
+
+          /* classification, per-primitive accumulation and merge of partial results */
+          auto goesLeft = [&] (const PrimRef &ref) {
+            const Vec3fa c = ref.bounds().center();
+            return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask);
+          };
+          auto accumulate = [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); };
+          auto combine    = [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); };
+
+          PrimInfo left(empty);
+          PrimInfo right(empty);
+          const size_t center = parallel_partitioning(
+            prims0,first,last,EmptyTy(),left,right,goesLeft,accumulate,combine,
+            PARALLEL_PARTITION_BLOCK_SIZE);
+
+          /* fetch the weights before the end fields are reused as range bounds */
+          const std::pair<size_t,size_t> weights(left.end,right.end);
+
+          left.begin  = first;
+          left.end    = center;
+          right.begin = center;
+          right.end   = last;
+
+          new (&lset) PrimInfoExtRange(first,center,center,left);
+          new (&rset) PrimInfoExtRange(center,last,last,right);
+
+          assert(area(left.geomBounds) >= 0.0f);
+          assert(area(right.geomBounds) >= 0.0f);
+          return weights;
+        }
+
+        /*! Sorts the primitives of the range with std::sort.
+         *  required as parallel partition destroys original primitive order */
+        void deterministic_order(const PrimInfoExtRange& set) 
+        {
+          PrimRef* const first = prims0 + set.begin();
+          PrimRef* const last  = prims0 + set.end();
+          std::sort(first,last);
+        }
+
+        /*! Fallback partitioning that cuts the range at its middle index.
+         *  The PrimInfo of both halves is accumulated from the primitives,
+         *  lset and rset are re-constructed in place, and a present extended
+         *  range is distributed to the children by the accumulated weights. */
+        void splitFallback(const PrimInfoExtRange& set, 
+                           PrimInfoExtRange& lset, 
+                           PrimInfoExtRange& rset)
+        {
+          const size_t begin  = set.begin();
+          const size_t end    = set.end();
+          const size_t center = (begin + end)/2;
+
+          /* accumulates the primitives in [first,last) */
+          auto gather = [this] (const size_t first, const size_t last) {
+            PrimInfo info(empty);
+            for (size_t i=first; i<last; i++)
+              info.add_center2(prims0[i],prims0[i].lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));
+            return info;
+          };
+
+          PrimInfo left  = gather(begin,center);
+          PrimInfo right = gather(center,end);
+          const size_t lweight = left.end;
+          const size_t rweight = right.end;
+
+          new (&lset) PrimInfoExtRange(begin,center,center,left);
+          new (&rset) PrimInfoExtRange(center,end,end,right);
+
+          /* nothing more to do without an extended range */
+          if (!set.has_ext_range())
+            return;
+
+          setExtentedRanges(set,lset,rset,lweight,rweight);
+          moveExtentedRange(set,lset,rset);
+        }
+        
+      private:
+        PrimRef* const prims0;
+        const PrimitiveSplitterFactory& splitterFactory;
+        const CentGeomBBox3fa& root_info;
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h
new file mode 100644
index 0000000000..ede0d04c78
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h
@@ -0,0 +1,188 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "priminfo.h"
+#include "../../common/algorithms/parallel_reduce.h"
+#include "../../common/algorithms/parallel_partition.h"
+
+namespace embree
+{
+  namespace isa
+  { 
+    /*! Splits a set of primitives into two strands by primitive direction:
+     *  two axes are selected and every primitive is assigned to the axis its
+     *  direction is better aligned with. Directions and bounds are queried
+     *  from the scene geometry. */
+    struct HeuristicStrandSplit
+    {
+      typedef range<size_t> Set;
+  
+      /* note: these constants are not referenced inside this struct */
+      static const size_t PARALLEL_THRESHOLD = 10000;
+      static const size_t PARALLEL_FIND_BLOCK_SIZE = 4096;
+      static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 64;
+
+      /*! stores all information to perform some split */
+      struct Split
+      {    
+	/*! construct an invalid split by default */
+	__forceinline Split()
+	  : sah(inf), axis0(zero), axis1(zero) {}
+	
+	/*! constructs specified split */
+	__forceinline Split(const float sah, const Vec3fa& axis0, const Vec3fa& axis1)
+	  : sah(sah), axis0(axis0), axis1(axis1) {}
+	
+	/*! calculates standard surface area heuristic for the split */
+	__forceinline float splitSAH() const { return sah; }
+
+        /*! test if this split is valid, i.e. its SAH is not infinite */
+        __forceinline bool valid() const { return sah != float(inf); }
+		
+      public:
+	float sah;             //!< SAH cost of the split
+	Vec3fa axis0, axis1;   //!< axis the two strands are aligned into
+      };
+
+      /*! constructs a heuristic without scene and primitive array; direction()
+       *  and bounds() dereference the scene and must not be used on it */
+      __forceinline HeuristicStrandSplit () // FIXME: required?
+        : scene(nullptr), prims(nullptr) {}
+      
+      /*! remember prim array */
+      __forceinline HeuristicStrandSplit (Scene* scene, PrimRef* prims)
+        : scene(scene), prims(prims) {}
+      
+      /*! direction of the primitive as computed by its geometry */
+      __forceinline const Vec3fa direction(const PrimRef& prim) {
+        return scene->get(prim.geomID())->computeDirection(prim.primID());
+      }
+      
+      /*! bounds of the primitive as computed by its geometry */
+      __forceinline const BBox3fa bounds(const PrimRef& prim) {
+        return scene->get(prim.geomID())->vbounds(prim.primID());
+      }
+
+      /*! bounds of the primitive as computed by its geometry for the given space */
+      __forceinline const BBox3fa bounds(const LinearSpace3fa& space, const PrimRef& prim) {
+        return scene->get(prim.geomID())->vbounds(space,prim.primID());
+      }
+
+      /*! finds the best split
+       *
+       *  Selects two axes, assigns each primitive of the set to one of them
+       *  and returns the resulting SAH together with both axes. The returned
+       *  split is invalid if all primitives end up on the same side. No
+       *  primitive is moved here. */
+      const Split find(const range<size_t>& set, size_t logBlockSize)
+      {
+        Vec3fa axis0(0,0,1);
+        /* -1 converts to the largest uint64_t, so any smaller ID wins */
+        uint64_t bestGeomPrimID = -1;
+
+        /* curve with minimum ID determines first axis */
+        for (size_t i=set.begin(); i<set.end(); i++)
+        {
+          const uint64_t geomprimID = prims[i].ID64();
+          if (geomprimID >= bestGeomPrimID) continue;
+          const Vec3fa axis = direction(prims[i]);
+          /* skip (near) zero-length directions; axis0 keeps (0,0,1) if none qualifies */
+          if (sqr_length(axis) > 1E-18f) {
+            axis0 = normalize(axis);
+            bestGeomPrimID = geomprimID;
+          }
+        }
+      
+        /* find 2nd axis that is most misaligned with first axis and has minimum ID */
+        float bestCos = 1.0f;
+        Vec3fa axis1 = axis0;
+        bestGeomPrimID = -1;
+        for (size_t i=set.begin(); i<set.end(); i++) 
+        {
+          const uint64_t geomprimID = prims[i].ID64();
+          Vec3fa axisi = direction(prims[i]);
+          float leni = length(axisi);
+          if (leni == 0.0f) continue;
+          axisi /= leni;
+          float cos = abs(dot(axisi,axis0));
+          /* strictly smaller |cos| wins; equal |cos| is decided by the smaller ID */
+          if ((cos == bestCos && (geomprimID < bestGeomPrimID)) || cos < bestCos) {
+            bestCos = cos; axis1 = axisi;
+            bestGeomPrimID = geomprimID;
+          }
+        }
+      
+        /* partition the two strands */
+        size_t lnum = 0, rnum = 0;
+        BBox3fa lbounds = empty, rbounds = empty;
+        const LinearSpace3fa space0 = frame(axis0).transposed();
+        const LinearSpace3fa space1 = frame(axis1).transposed();
+        
+        /* count and bound both strands; uses the same side test as split() */
+        for (size_t i=set.begin(); i<set.end(); i++)
+        {
+          PrimRef& prim = prims[i];
+          const Vec3fa axisi = normalize(direction(prim));
+          const float cos0 = abs(dot(axisi,axis0));
+          const float cos1 = abs(dot(axisi,axis1));
+          
+          if (cos0 > cos1) { lnum++; lbounds.extend(bounds(space0,prim)); }
+          else             { rnum++; rbounds.extend(bounds(space1,prim)); }
+        }
+      
+        /*! return an invalid split if we do not partition */
+        if (lnum == 0 || rnum == 0) 
+          return Split(inf,axis0,axis1);
+      
+        /*! calculate sah for the split */
+        /* primitive counts are rounded up to blocks of 2^logBlockSize */
+        const size_t lblocks = (lnum+(1ull<<logBlockSize)-1ull) >> logBlockSize;
+        const size_t rblocks = (rnum+(1ull<<logBlockSize)-1ull) >> logBlockSize;
+        const float sah = madd(float(lblocks),halfArea(lbounds),float(rblocks)*halfArea(rbounds));
+        return Split(sah,axis0,axis1);
+      }
+
+      /*! array partitioning
+       *
+       *  Partitions the primitives of the set by the two axes of the split and
+       *  re-constructs lset and rset in place as the two halves. An invalid
+       *  split sorts the set and uses splitFallback instead. */
+      void split(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset) 
+      {
+        if (!split.valid()) {
+          deterministic_order(set);
+          return splitFallback(set,lset,rset);
+        }
+        
+        const size_t begin = set.begin();
+        const size_t end   = set.end();
+        CentGeomBBox3fa local_left(empty);
+        CentGeomBBox3fa local_right(empty);
+
+        /* a primitive goes left if its direction is better aligned with axis0 */
+        auto primOnLeftSide = [&] (const PrimRef& prim) -> bool { 
+          const Vec3fa axisi = normalize(direction(prim));
+          const float cos0 = abs(dot(axisi,split.axis0));
+          const float cos1 = abs(dot(axisi,split.axis1));
+          return cos0 > cos1;
+        };
+
+        /* extends the bounds of a side by the bounds of a primitive */
+        auto mergePrimBounds = [this] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { 
+          pinfo.extend(bounds(ref)); 
+        };
+        
+        size_t center = serial_partitioning(prims,begin,end,local_left,local_right,primOnLeftSide,mergePrimBounds);
+        
+        new (&lset) PrimInfoRange(begin,center,local_left);
+        new (&rset) PrimInfoRange(center,end,local_right);
+        assert(area(lset.geomBounds) >= 0.0f);
+        assert(area(rset.geomBounds) >= 0.0f);
+      }
+
+      /*! sorts the primitives of the set with std::sort */
+      void deterministic_order(const Set& set) 
+      {
+        /* required as parallel partition destroys original primitive order */
+        std::sort(&prims[set.begin()],&prims[set.end()]);
+      }
+      
+      /*! fallback partitioning that cuts the set at its middle index and
+       *  re-constructs lset and rset in place with the bounds of each half */
+      void splitFallback(const Set& set, PrimInfoRange& lset, PrimInfoRange& rset)
+      {
+        const size_t begin = set.begin();
+        const size_t end   = set.end();
+        const size_t center = (begin + end)/2;
+        
+        CentGeomBBox3fa left(empty);
+        for (size_t i=begin; i<center; i++)
+          left.extend(bounds(prims[i]));
+        new (&lset) PrimInfoRange(begin,center,left);
+        
+        CentGeomBBox3fa right(empty);
+        for (size_t i=center; i<end; i++)
+          right.extend(bounds(prims[i]));	
+        new (&rset) PrimInfoRange(center,end,right);
+      }
+      
+    private:
+      Scene* const scene;      //!< scene the geometries are looked up in
+      PrimRef* const prims;    //!< primitive array that gets partitioned
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_timesplit_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_timesplit_array.h
new file mode 100644
index 0000000000..c999941a11
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_timesplit_array.h
@@ -0,0 +1,237 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/primref_mb.h"
+#include "../../common/algorithms/parallel_filter.h"
+
+#define MBLUR_TIME_SPLIT_THRESHOLD 1.25f
+
+namespace embree
+{
+  namespace isa
+  { 
+    /*! Finds and performs temporal splits (splits of the time range) for motion blur primitives */
+    template<typename PrimRefMB, typename RecalculatePrimRef, size_t BINS>
+      struct HeuristicMBlurTemporalSplit
+      {
+        typedef BinSplit<MBLUR_NUM_OBJECT_BINS> Split;
+        typedef mvector<PrimRefMB>* PrimRefVector;
+        typedef typename PrimRefMB::BBox BBox; 
+
+        static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+        /*! Remembers the memory monitor passed to newly allocated primitive
+         *  vectors and the functor used to recalculate primitive bounds. */
+        HeuristicMBlurTemporalSplit (MemoryMonitorInterface* device,
+                                     const RecalculatePrimRef& recalculatePrimRef)
+          : device(device),
+            recalculatePrimRef(recalculatePrimRef)
+        {
+        }
+
+        /*! Bin statistics for BINS-1 candidate split times inside a time range.
+         *  For candidate b, count0/bounds0 describe the primitives overlapping
+         *  the time range before the candidate and count1/bounds1 those
+         *  overlapping the time range after it. */
+        struct TemporalBinInfo
+        {
+          /*! leaves all counters and bounds uninitialized */
+          __forceinline TemporalBinInfo () {
+          }
+
+          /*! clears all counters and bounds */
+          __forceinline TemporalBinInfo (EmptyTy)
+          {
+            for (size_t i=0; i<BINS-1; i++)
+            {
+              count0[i] = count1[i] = 0;
+              bounds0[i] = bounds1[i] = empty;
+            }
+          }
+          
+          /*! Bins the primitives [begin,end) for every candidate split time.
+           *  Candidates are placed at the fractions (b+1)/BINS of time_range and
+           *  aligned through set.align_time; candidates that land on or outside
+           *  the borders of time_range are skipped.
+           *
+           *  \param prims               primitive array
+           *  \param begin,end           index range of primitives to bin
+           *  \param time_range          time range that gets subdivided
+           *  \param set                 set used to align candidate times
+           *  \param recalculatePrimRef  computes linear bounds of a primitive for a time range */
+          void bin(const PrimRefMB* prims, size_t begin, size_t end, BBox1f time_range, const SetMB& set, const RecalculatePrimRef& recalculatePrimRef)
+          {
+            /* size_t index avoids the signed/unsigned comparison with BINS-1 and
+               matches the loops of the constructor and merge() */
+            for (size_t b=0; b<BINS-1; b++)
+            {
+              const float t = float(b+1)/float(BINS);
+              const float ct = lerp(time_range.lower,time_range.upper,t);
+              const float center_time = set.align_time(ct);
+              if (center_time <= time_range.lower) continue;
+              if (center_time >= time_range.upper) continue;
+              const BBox1f dt0(time_range.lower,center_time);
+              const BBox1f dt1(center_time,time_range.upper);
+              
+              /* find linear bounds for both time segments */
+              for (size_t i=begin; i<end; i++) 
+              {
+                if (prims[i].time_range_overlap(dt0))
+                {
+                  const LBBox3fa bn0 = recalculatePrimRef.linearBounds(prims[i],dt0);
+#if MBLUR_BIN_LBBOX
+                  bounds0[b].extend(bn0);
+#else
+                  bounds0[b].extend(bn0.interpolate(0.5f));
+#endif
+                  count0[b] += prims[i].timeSegmentRange(dt0).size();
+                }
+
+                if (prims[i].time_range_overlap(dt1))
+                {
+                  const LBBox3fa bn1 = recalculatePrimRef.linearBounds(prims[i],dt1);
+#if MBLUR_BIN_LBBOX
+                  bounds1[b].extend(bn1);
+#else
+                  bounds1[b].extend(bn1.interpolate(0.5f));
+#endif
+                  count1[b] += prims[i].timeSegmentRange(dt1).size();
+                }
+              }
+            }
+          }
+
+          /*! Bins the primitives [begin,end), sequentially into this object if
+           *  the range is smaller than parallelThreshold, otherwise through
+           *  parallel_reduce, in which case this object is overwritten with
+           *  the reduced result. */
+          __forceinline void bin_parallel(const PrimRefMB* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, BBox1f time_range, const SetMB& set, const RecalculatePrimRef& recalculatePrimRef) 
+          {
+            if (likely(end-begin < parallelThreshold)) {
+              bin(prims,begin,end,time_range,set,recalculatePrimRef);
+            } 
+            else 
+            {
+              /* bins one sub-range into a fresh, cleared binner */
+              auto bin = [&](const range<size_t>& r) -> TemporalBinInfo { 
+                TemporalBinInfo binner(empty); binner.bin(prims, r.begin(), r.end(), time_range, set, recalculatePrimRef); return binner; 
+              };
+              *this = parallel_reduce(begin,end,blockSize,TemporalBinInfo(empty),bin,merge2);
+            }
+          }
+          
+          /*! merges in other binning information */
+          __forceinline void merge (const TemporalBinInfo& other)
+          {
+            for (size_t i=0; i<BINS-1; i++) 
+            {
+              count0[i] += other.count0[i];
+              count1[i] += other.count1[i];
+              bounds0[i].extend(other.bounds0[i]);
+              bounds1[i].extend(other.bounds1[i]);
+            }
+          }
+
+          /*! returns the merge of two binning results, leaving both inputs untouched */
+          static __forceinline const TemporalBinInfo merge2(const TemporalBinInfo& a, const TemporalBinInfo& b) {
+            TemporalBinInfo r = a; r.merge(b); return r;
+          }
+                    
+          /*! Selects the candidate split time with the smallest SAH.
+           *  Candidate times are recomputed exactly as in bin(). The returned
+           *  split stores the best SAH scaled by MBLUR_TIME_SPLIT_THRESHOLD and
+           *  the chosen time; if no candidate was evaluated the SAH stays
+           *  infinite and the time is 0. */
+          Split best(int logBlockSize, BBox1f time_range, const SetMB& set)
+          {
+            float bestSAH = inf;
+            float bestPos = 0.0f;
+            /* size_t index avoids the signed/unsigned comparison with BINS-1 */
+            for (size_t b=0; b<BINS-1; b++)
+            {
+              float t = float(b+1)/float(BINS);
+              float ct = lerp(time_range.lower,time_range.upper,t);
+              const float center_time = set.align_time(ct);
+              if (center_time <= time_range.lower) continue;
+              if (center_time >= time_range.upper) continue;
+              const BBox1f dt0(time_range.lower,center_time);
+              const BBox1f dt1(center_time,time_range.upper);
+              
+              /* calculate sah */
+              /* counts are rounded up to blocks of 2^logBlockSize */
+              const size_t lCount = (count0[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize);
+              const size_t rCount = (count1[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize);
+              float sah0 = expectedApproxHalfArea(bounds0[b])*float(lCount)*dt0.size();
+              float sah1 = expectedApproxHalfArea(bounds1[b])*float(rCount)*dt1.size();
+              if (unlikely(lCount == 0)) sah0 = 0.0f; // happens for initial splits when objects not alive over entire shutter time
+              if (unlikely(rCount == 0)) sah1 = 0.0f;
+              const float sah = sah0+sah1;
+              if (sah < bestSAH) {
+                bestSAH = sah;
+                bestPos = center_time;
+              }
+            }
+            return Split(bestSAH*MBLUR_TIME_SPLIT_THRESHOLD,(unsigned)Split::SPLIT_TEMPORAL,0,bestPos);
+          }
+          
+        public:
+          size_t count0[BINS-1];   //!< per candidate: accumulated time segment count before the split time
+          size_t count1[BINS-1];   //!< per candidate: accumulated time segment count after the split time
+          BBox bounds0[BINS-1];    //!< per candidate: merged bounds before the split time
+          BBox bounds1[BINS-1];    //!< per candidate: merged bounds after the split time
+        };
+        
+        /*! finds the best split in time for the set: bins its primitives over set.time_range and picks the best candidate; an invalid result is marked SPLIT_FALLBACK */
+        const Split find(const SetMB& set, const size_t logBlockSize)
+        {
+          assert(set.size() > 0);
+          TemporalBinInfo binner(empty);
+          binner.bin_parallel(set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,set.time_range,set,recalculatePrimRef);
+          Split tsplit = binner.best((int)logBlockSize,set.time_range,set);
+          if (!tsplit.valid()) tsplit.data = Split::SPLIT_FALLBACK; // use fallback split
+          return tsplit;
+        }
+
+        __forceinline std::unique_ptr<mvector<PrimRefMB>> split(const Split& tsplit, const SetMB& set, SetMB& lset, SetMB& rset) // splits set at time tsplit.fpos: lset points into a newly allocated primref array (returned so the caller owns it), rset reuses set's array
+        {
+          assert(tsplit.sah != float(inf));
+          assert(tsplit.fpos > set.time_range.lower);
+          assert(tsplit.fpos < set.time_range.upper);
+
+          float center_time = tsplit.fpos;
+          const BBox1f time_range0(set.time_range.lower,center_time);
+          const BBox1f time_range1(center_time,set.time_range.upper);
+          mvector<PrimRefMB>& prims = *set.prims;
+          
+          /* calculate primrefs for first time range */
+          std::unique_ptr<mvector<PrimRefMB>> new_vector(new mvector<PrimRefMB>(device, set.size())); // one slot per primitive of set
+          PrimRefVector lprims = new_vector.get(); // raw alias used by the lambda and by lset
+          
+          auto reduction_func0 = [&] (const range<size_t>& r) {
+            PrimInfoMB pinfo = empty;
+            for (size_t i=r.begin(); i<r.end(); i++) 
+            {
+              if (likely(prims[i].time_range_overlap(time_range0)))
+              {
+                const PrimRefMB& prim = recalculatePrimRef(prims[i],time_range0);
+                (*lprims)[i-set.begin()] = prim; // lprims is indexed from 0, prims from set.begin()
+                pinfo.add_primref(prim);
+              }
+              else
+              {
+                (*lprims)[i-set.begin()] = prims[i]; // copied unchanged and not counted in pinfo; the filter step below tests the same overlap predicate
+              }
+            }
+            return pinfo;
+          };        
+          PrimInfoMB linfo = parallel_reduce(set.object_range,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD,PrimInfoMB(empty),reduction_func0,PrimInfoMB::merge2);
+
+          /* primrefs for first time range are in lprims[0 .. set.size()) */
+          /* some primitives may need to be filtered out */
+          if (linfo.size() != set.size()) // fewer overlapping primitives were counted than the set holds
+            linfo.object_range._end = parallel_filter(lprims->data(), size_t(0), set.size(), size_t(1024),
+                                                      [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range0); });
+                      
+          lset = SetMB(linfo,lprims,time_range0);
+
+          /* calculate primrefs for second time range */
+          auto reduction_func1 = [&] (const range<size_t>& r) {
+            PrimInfoMB pinfo = empty;
+            for (size_t i=r.begin(); i<r.end(); i++) 
+            {
+              if (likely(prims[i].time_range_overlap(time_range1)))
+              {
+                const PrimRefMB& prim = recalculatePrimRef(prims[i],time_range1);
+                prims[i] = prim; // overwrites the original array in place
+                pinfo.add_primref(prim);
+              }
+            }
+            return pinfo;
+          };        
+          PrimInfoMB rinfo = parallel_reduce(set.object_range,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD,PrimInfoMB(empty),reduction_func1,PrimInfoMB::merge2);
+          rinfo.object_range = range<size_t>(set.begin(), set.begin() + rinfo.size()); // re-base the counted range so that it starts at set.begin()
+
+          /* primrefs for second time range are in prims[set.begin() .. set.end()) */
+          /* some primitives may need to be filtered out */
+          if (rinfo.size() != set.size())
+            rinfo.object_range._end = parallel_filter(prims.data(), set.begin(), set.end(), size_t(1024),
+                                                      [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range1); });
+        
+          rset = SetMB(rinfo,&prims,time_range1);
+
+          return new_vector; // hands the storage referenced by lset to the caller
+        }
+
+      private:
+        MemoryMonitorInterface* device;              // device to report memory usage to
+        const RecalculatePrimRef recalculatePrimRef;
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/priminfo.h b/thirdparty/embree-aarch64/kernels/builders/priminfo.h
new file mode 100644
index 0000000000..06c1388742
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/priminfo.h
@@ -0,0 +1,362 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/primref.h"
+#include "../common/primref_mb.h"
+
+namespace embree
+{
+  // FIXME: maybe there's a better place for this util fct; returns |d.x|+|d.y|+|d.z| for d = cross(v1-v0,v2-v0), i.e. twice the summed areas of the triangle's projections onto the three axis-aligned planes
+  __forceinline float areaProjectedTriangle(const Vec3fa& v0, const Vec3fa& v1, const Vec3fa& v2)
+  {
+    const Vec3fa e0 = v1-v0; // the two edges leaving v0
+    const Vec3fa e1 = v2-v0;
+    const Vec3fa d = cross(e0,e1);
+    return fabs(d.x) + fabs(d.y) + fabs(d.z);
+  }
+
+  //namespace isa
+  //{
+    template<typename BBox>
+      class CentGeom // pair of bounds for a group of primitives: geometry bounds (type BBox) and centroid bounds (always BBox3fa)
+    {
+    public:
+      __forceinline CentGeom () {} // no member initializers: both bounds are default-constructed
+
+      __forceinline CentGeom (EmptyTy) 
+	: geomBounds(empty), centBounds(empty) {}
+      
+      __forceinline CentGeom (const BBox& geomBounds, const BBox3fa& centBounds) 
+	: geomBounds(geomBounds), centBounds(centBounds) {}
+      
+      template<typename PrimRef> 
+        __forceinline void extend_primref(const PrimRef& prim) // grows both bounds using the bounds/center pair reported by prim.binBoundsAndCenter
+      {
+        BBox bounds; Vec3fa center;
+        prim.binBoundsAndCenter(bounds,center);
+        geomBounds.extend(bounds);
+        centBounds.extend(center);
+      }
+
+       template<typename PrimRef> 
+         __forceinline void extend_center2(const PrimRef& prim) // grows both bounds from prim.bounds(); the centroid term is bounds.center2()
+       {
+         BBox3fa bounds = prim.bounds();
+         geomBounds.extend(bounds);
+         centBounds.extend(bounds.center2());
+       }
+       
+      __forceinline void extend(const BBox& geomBounds_) { // grows both bounds from a bare bounding box
+	geomBounds.extend(geomBounds_);
+	centBounds.extend(center2(geomBounds_));
+      }
+
+      __forceinline void merge(const CentGeom& other) // in-place union with other
+      {
+	geomBounds.extend(other.geomBounds);
+	centBounds.extend(other.centBounds);
+      }
+
+      static __forceinline const CentGeom merge2(const CentGeom& a, const CentGeom& b) { // non-mutating union: copy of a with b merged in
+        CentGeom r = a; r.merge(b); return r;
+      }
+
+    public:
+      BBox geomBounds;   //!< geometry bounds of primitives
+      BBox3fa centBounds;   //!< centroid bounds of primitives
+    };
+
+    typedef CentGeom<BBox3fa> CentGeomBBox3fa;
+
+    /*! stores bounding information for a set of primitives: the CentGeom bounds plus a begin/end pair with size() == end-begin */
+    template<typename BBox>
+      class PrimInfoT : public CentGeom<BBox>
+    {
+    public:
+      using CentGeom<BBox>::geomBounds;
+      using CentGeom<BBox>::centBounds;
+
+      __forceinline PrimInfoT () {} // begin/end are left uninitialized
+
+      __forceinline PrimInfoT (EmptyTy) 
+	: CentGeom<BBox>(empty), begin(0), end(0) {}
+
+      __forceinline PrimInfoT (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds) 
+        : CentGeom<BBox>(centGeomBounds), begin(begin), end(end) {}
+
+      template<typename PrimRef> 
+        __forceinline void add_primref(const PrimRef& prim) // extends the bounds via extend_primref and counts one more primitive
+      {
+        CentGeom<BBox>::extend_primref(prim);
+        end++;
+      }
+
+       template<typename PrimRef> 
+         __forceinline void add_center2(const PrimRef& prim) { // same as add_primref, but the bounds are extended via extend_center2
+         CentGeom<BBox>::extend_center2(prim);
+         end++;
+       }
+
+        template<typename PrimRef> 
+          __forceinline void add_center2(const PrimRef& prim, const size_t i) { // extends the bounds once but counts the primitive i times
+          CentGeom<BBox>::extend_center2(prim);
+          end+=i;
+        }
+
+      /*__forceinline void add(const BBox& geomBounds_) {
+	CentGeom<BBox>::extend(geomBounds_);
+	end++;
+      }
+
+      __forceinline void add(const BBox& geomBounds_, const size_t i) {
+	CentGeom<BBox>::extend(geomBounds_);
+	end+=i;
+        }*/
+
+      __forceinline void merge(const PrimInfoT& other) // in-place merge: begin and end are summed separately, so size() becomes the sum of both sizes
+      {
+	CentGeom<BBox>::merge(other);
+        begin += other.begin;
+	end += other.end;
+      }
+
+      static __forceinline const PrimInfoT merge(const PrimInfoT& a, const PrimInfoT& b) { // non-mutating overload: copy of a with b merged in
+        PrimInfoT r = a; r.merge(b); return r;
+      }
+      
+      /*! returns the number of primitives */
+      __forceinline size_t size() const { 
+	return end-begin; 
+      }
+
+      __forceinline float halfArea() const { // read-only, hence callable on const PrimInfoT objects like size() and leafSAH()
+        return expectedApproxHalfArea(geomBounds);
+      }
+
+      __forceinline float leafSAH() const { // half-area estimate of the geometry bounds times the primitive count
+	return expectedApproxHalfArea(geomBounds)*float(size()); 
+	//return halfArea(geomBounds)*blocks(num); 
+      }
+      
+      __forceinline float leafSAH(size_t block_shift) const { // as above, with the count rounded up to blocks of 2^block_shift primitives
+	return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift);
+	//return halfArea(geomBounds)*float((num+3) >> 2);
+	//return halfArea(geomBounds)*blocks(num); 
+      }
+      
+      /*! stream output */
+      friend embree_ostream operator<<(embree_ostream cout, const PrimInfoT& pinfo) {
+	return cout << "PrimInfo { begin = " << pinfo.begin << ", end = " << pinfo.end << ", geomBounds = " << pinfo.geomBounds << ", centBounds = " << pinfo.centBounds << "}";
+      }
+      
+    public:
+      size_t begin,end;          //!< bounds of the primitive range; the number of primitives is end-begin
+    };
+
+    typedef PrimInfoT<BBox3fa> PrimInfo;
+    //typedef PrimInfoT<LBBox3fa> PrimInfoMB;
+
+    /*! stores bounding information for a set of primitives, extended with time information: an object range, time segment counters and time ranges */
+    template<typename BBox>
+      class PrimInfoMBT : public CentGeom<BBox>
+    {
+    public:
+      using CentGeom<BBox>::geomBounds;
+      using CentGeom<BBox>::centBounds;
+
+      __forceinline PrimInfoMBT () {
+      } 
+
+      __forceinline PrimInfoMBT (EmptyTy)
+        : CentGeom<BBox>(empty), object_range(0,0), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {} // time_range starts as (1,0), i.e. lower > upper, and is grown by extend()
+
+      __forceinline PrimInfoMBT (size_t begin, size_t end)
+        : CentGeom<BBox>(empty), object_range(begin,end), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {}
+
+      template<typename PrimRef> 
+        __forceinline void add_primref(const PrimRef& prim) // accounts for one more primref: bounds, time range, range end and segment counters
+      {
+        CentGeom<BBox>::extend_primref(prim);
+        time_range.extend(prim.time_range);
+        object_range._end++;
+        num_time_segments += prim.size(); // prim.size() is what gets accumulated as this primref's time segment count
+        if (max_num_time_segments < prim.totalTimeSegments()) { // remember the primref with the most total time segments and its time range
+          max_num_time_segments = prim.totalTimeSegments();
+          max_time_range = prim.time_range;
+        }
+      }
+
+      __forceinline void merge(const PrimInfoMBT& other) // in-place merge; range begin and end are summed separately, so size() becomes the sum of both sizes
+      {
+        CentGeom<BBox>::merge(other);
+        time_range.extend(other.time_range);
+        object_range._begin += other.object_range.begin();
+        object_range._end += other.object_range.end();
+        num_time_segments += other.num_time_segments;
+        if (max_num_time_segments < other.max_num_time_segments) {
+          max_num_time_segments = other.max_num_time_segments;
+          max_time_range = other.max_time_range;
+        }
+      }
+
+      static __forceinline const PrimInfoMBT merge2(const PrimInfoMBT& a, const PrimInfoMBT& b) { // non-mutating merge: copy of a with b merged in
+        PrimInfoMBT r = a; r.merge(b); return r;
+      }
+
+      __forceinline size_t begin() const {
+        return object_range.begin();
+      }
+
+      __forceinline size_t end() const {
+        return object_range.end();
+      }
+      
+      /*! returns the number of primitives */
+      __forceinline size_t size() const { 
+	return object_range.size(); 
+      }
+
+      __forceinline float halfArea() const { // half-area estimate of the geometry bounds weighted by the length of time_range
+        return time_range.size()*expectedApproxHalfArea(geomBounds);
+      }
+
+      __forceinline float leafSAH() const { // weighted half-area estimate times the total number of time segments
+	return time_range.size()*expectedApproxHalfArea(geomBounds)*float(num_time_segments); 
+      }
+      
+      __forceinline float leafSAH(size_t block_shift) const { // as above, with the segment count rounded up to blocks of 2^block_shift
+	return time_range.size()*expectedApproxHalfArea(geomBounds)*float((num_time_segments+(size_t(1)<<block_shift)-1) >> block_shift);
+      }
+
+      __forceinline float align_time(float ct) const // snaps ct to the nearest multiple of 1/max_num_time_segments measured inside max_time_range
+      {
+        //return roundf(ct * float(numTimeSegments)) / float(numTimeSegments);
+        float t0 = (ct-max_time_range.lower)/max_time_range.size(); // ct relative to max_time_range
+        float t1 = roundf(t0 * float(max_num_time_segments)) / float(max_num_time_segments); // NOTE(review): divides by max_num_time_segments without a zero check - confirm callers only use this on non-empty sets
+        return t1*max_time_range.size()+max_time_range.lower; // map back to absolute time
+      }
+      
+      /*! stream output */
+      friend embree_ostream operator<<(embree_ostream cout, const PrimInfoMBT& pinfo) 
+      {
+	return cout << "PrimInfo { " << 
+          "object_range = " << pinfo.object_range << 
+          ", time_range = " << pinfo.time_range << 
+          ", time_segments = " << pinfo.num_time_segments << 
+          ", geomBounds = " << pinfo.geomBounds << 
+          ", centBounds = " << pinfo.centBounds << 
+          "}";
+      }
+      
+    public:
+      range<size_t> object_range; //!< primitive range
+      size_t num_time_segments;  //!< total number of time segments of all added primrefs
+      size_t max_num_time_segments; //!< maximum number of time segments of a primitive
+      BBox1f max_time_range; //!< time range of primitive with max_num_time_segments
+      BBox1f time_range; //!< merged time range of primitives when merging prims, or additionally clipped with build time range when used in SetMB
+    };
+
+    typedef PrimInfoMBT<typename PrimRefMB::BBox> PrimInfoMB;
+
+    struct SetMB : public PrimInfoMB // a PrimInfoMB together with a pointer to the primref array that its object_range indexes
+    {
+      static const size_t PARALLEL_THRESHOLD = 3 * 1024; // these three constants are handed to the parallel_reduce calls below and to users of this set
+      static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+      static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+      typedef mvector<PrimRefMB>* PrimRefVector;
+
+      __forceinline SetMB() {} // prims is left uninitialized
+
+       __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims)
+         : PrimInfoMB(pinfo_i), prims(prims) {}
+
+      __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, range<size_t> object_range_in, BBox1f time_range_in)
+        : PrimInfoMB(pinfo_i), prims(prims)
+      {
+        object_range = object_range_in; // replaces the range copied from pinfo_i
+        time_range = intersect(time_range,time_range_in); // clips the time range copied from pinfo_i
+      }
+      
+      __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, BBox1f time_range_in)
+        : PrimInfoMB(pinfo_i), prims(prims)
+      {
+        time_range = intersect(time_range,time_range_in); // clips the time range copied from pinfo_i
+      }
+
+      void deterministic_order() const // sorts the primrefs of object_range in place using PrimRefMB's ordering
+      {
+        /* required as parallel partition destroys original primitive order */
+        PrimRefMB* prim = prims->data();
+        std::sort(&prim[object_range.begin()],&prim[object_range.end()]);
+      }
+
+      template<typename RecalculatePrimRef>
+      __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef) const // merged linear bounds of all primrefs in object_range over time_range
+      {
+        auto reduce = [&](const range<size_t>& r) -> LBBox3fa
+        {
+          LBBox3fa cbounds(empty);
+          for (size_t j = r.begin(); j < r.end(); j++)
+          {
+            PrimRefMB& ref = (*prims)[j];
+            const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range);
+            cbounds.extend(bn);
+          };
+          return cbounds;
+        };
+        
+        return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty),
+                               reduce,
+                               [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); });
+      }
+
+      template<typename RecalculatePrimRef>
+        __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const // same as above, with space forwarded to recalculatePrimRef.linearBounds
+      {
+        auto reduce = [&](const range<size_t>& r) -> LBBox3fa
+        {
+          LBBox3fa cbounds(empty);
+          for (size_t j = r.begin(); j < r.end(); j++)
+          {
+            PrimRefMB& ref = (*prims)[j];
+            const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range, space);
+            cbounds.extend(bn);
+          };
+          return cbounds;
+        };
+        
+        return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty),
+                               reduce,
+                               [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); });
+      }
+
+      template<typename RecalculatePrimRef>
+        const SetMB primInfo(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const // rebuilds the PrimInfoMB from primrefs recalculated for time_range in space; the stored primrefs are not modified
+      {
+        auto computePrimInfo = [&](const range<size_t>& r) -> PrimInfoMB
+        {
+          PrimInfoMB pinfo(empty);
+          for (size_t j=r.begin(); j<r.end(); j++)
+          {
+            PrimRefMB& ref = (*prims)[j];
+            PrimRefMB ref1 = recalculatePrimRef(ref,time_range,space); // temporary copy, only fed into pinfo
+            pinfo.add_primref(ref1);
+          };
+          return pinfo;
+        };
+        
+        const PrimInfoMB pinfo = parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, 
+                                                 PrimInfoMB(empty), computePrimInfo, PrimInfoMB::merge2);
+
+        return SetMB(pinfo,prims,object_range,time_range); // keeps this set's array, object range and time range
+      }
+      
+    public:
+      PrimRefVector prims; // raw pointer to the primref array; nothing in this struct frees it
+    };
+//}
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/primrefgen.cpp b/thirdparty/embree-aarch64/kernels/builders/primrefgen.cpp
new file mode 100644
index 0000000000..e23de3df28
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/primrefgen.cpp
@@ -0,0 +1,244 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "primrefgen.h"
+#include "primrefgen_presplit.h"
+
+#include "../../common/algorithms/parallel_for_for.h"
+#include "../../common/algorithms/parallel_for_for_prefix_sum.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    PrimInfo createPrimRefArray(Geometry* geometry, unsigned int geomID, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) // generates the primrefs of a single geometry into prims and returns their merged PrimInfo
+    {
+      ParallelPrefixSumState<PrimInfo> pstate;
+      
+      /* first try: each block passes r.begin() as its offset (presumably the write position in prims), which only fits when nothing is filtered out */
+      progressMonitor(0);
+      PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo {
+          return geometry->createPrimRefArray(prims,r,r.begin(),geomID);
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+      /* if we need to filter out geometry, run again, this time with base.size() (the count accumulated for the preceding blocks) as offset */
+      if (pinfo.size() != prims.size())
+      {
+        progressMonitor(0);
+        pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo {
+          return geometry->createPrimRefArray(prims,r,base.size(),geomID);
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      }
+      return pinfo;
+    }
+
+    PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) // generates the primrefs of all geometries visited by Scene::Iterator2(scene,types,mblur) and returns their merged PrimInfo
+    {
+      ParallelForForPrefixSumState<PrimInfo> pstate;
+      Scene::Iterator2 iter(scene,types,mblur);
+      
+      /* first try: each block uses the offset k supplied by parallel_for_for_prefix_sum0 */
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+          return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID);
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      
+      /* if we need to filter out geometry, run again, this time with base.size() as offset instead of k */
+      if (pinfo.size() != prims.size())
+      {
+        progressMonitor(0);
+        pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+            return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID);
+          }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      }
+      return pinfo;
+    }
+
+    PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime) // like the scene-wide createPrimRefArray, but iterates with mblur=true and calls createPrimRefArrayMB with itime
+    {
+      ParallelForForPrefixSumState<PrimInfo> pstate;
+      Scene::Iterator2 iter(scene,types,true);
+      
+      /* first try: each block uses the offset k supplied by parallel_for_for_prefix_sum0 */
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+          return mesh->createPrimRefArrayMB(prims,itime,r,k,(unsigned)geomID);
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      
+      /* if we need to filter out geometry, run again, this time with base.size() as offset instead of k */
+      if (pinfo.size() != prims.size())
+      {
+        progressMonitor(0);
+        pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+            return mesh->createPrimRefArrayMB(prims,itime,r,base.size(),(unsigned)geomID);
+          }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      }
+      return pinfo;
+    }
+
+    PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1) // generates PrimRefMB entries for the time range t0t1 and returns a PrimInfoMB whose time_range is forced to t0t1
+    {
+      ParallelForForPrefixSumState<PrimInfoMB> pstate;
+      Scene::Iterator2 iter(scene,types,true);
+      
+      /* first try: each block uses the offset k supplied by parallel_for_for_prefix_sum0 */
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfoMB pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfoMB {
+          return mesh->createPrimRefMBArray(prims,t0t1,r,k,(unsigned)geomID);
+      }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); });
+      
+      /* if we need to filter out geometry, run again, this time with base.size() as offset instead of k */
+      if (pinfo.size() != prims.size())
+      {
+        progressMonitor(0);
+        pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB {
+            return mesh->createPrimRefMBArray(prims,t0t1,r,base.size(),(unsigned)geomID);
+        }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); });
+      }
+
+      /* the BVH starts with that time range, even though primitives might have smaller/larger time range */
+      pinfo.time_range = t0t1;
+      return pinfo;
+    }
+
+    template<typename Mesh>
+    size_t createMortonCodeArray(Mesh* mesh, mvector<BVHBuilderMorton::BuildPrim>& morton, BuildProgressMonitor& progressMonitor) // fills morton via MortonCodeGenerator for the primitives of mesh and returns how many had valid bounds; progressMonitor is not used in this body
+    {
+      size_t numPrimitives = morton.size(); // the number of primitives to visit is taken from the size of the output array
+
+      /* compute scene bounds: counts the primitives whose buildBounds succeeds and accumulates center2() of their bounds */
+      std::pair<size_t,BBox3fa> cb_empty(0,empty);
+      auto cb = parallel_reduce 
+        ( size_t(0), numPrimitives, size_t(1024), cb_empty, [&](const range<size_t>& r) -> std::pair<size_t,BBox3fa>
+          {
+            size_t num = 0;
+            BBox3fa bounds = empty;
+            
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              BBox3fa prim_bounds = empty;
+              if (unlikely(!mesh->buildBounds(j,&prim_bounds))) continue; // primitive rejected: not counted, not part of the bounds
+              bounds.extend(center2(prim_bounds));
+              num++;
+            }
+            return std::make_pair(num,bounds);
+          }, [] (const std::pair<size_t,BBox3fa>& a, const std::pair<size_t,BBox3fa>& b) {
+          return std::make_pair(a.first + b.first,merge(a.second,b.second)); 
+        });
+      
+      
+      size_t numPrimitivesGen = cb.first; // primitives that passed buildBounds
+      const BBox3fa centBounds = cb.second;
+      
+      /* compute morton codes */
+      if (likely(numPrimitivesGen == numPrimitives))
+      {
+        /* fast path if all primitives were valid */
+        BVHBuilderMorton::MortonCodeMapping mapping(centBounds);
+        parallel_for( size_t(0), numPrimitives, size_t(1024), [&](const range<size_t>& r) -> void {
+            BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]); // generator is pointed at the entry for r.begin()
+            for (size_t j=r.begin(); j<r.end(); j++)
+              generator(mesh->bounds(j),unsigned(j));
+          });
+      }
+      else
+      {
+        /* slow path, fallback in case some primitives were invalid */
+        ParallelPrefixSumState<size_t> pstate;
+        BVHBuilderMorton::MortonCodeMapping mapping(centBounds);
+        parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range<size_t>& r, const size_t base) -> size_t { // first pass: generator starts at r.begin(); returns the number of valid primitives of the block
+            size_t num = 0;
+            BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]);
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              BBox3fa bounds = empty;
+              if (unlikely(!mesh->buildBounds(j,&bounds))) continue;
+              generator(bounds,unsigned(j));
+              num++;
+            }
+            return num;
+          }, std::plus<size_t>());
+        
+        parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range<size_t>& r, const size_t base) -> size_t { // second pass: same loop, but the generator starts at base, the count summed for the preceding blocks
+            size_t num = 0;
+            BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[base]);
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              BBox3fa bounds = empty;
+              if (!mesh->buildBounds(j,&bounds)) continue;
+              generator(bounds,unsigned(j));
+              num++;
+            }
+            return num;
+          }, std::plus<size_t>());          
+      }
+      return numPrimitivesGen;
+    }
+
+    // ====================================================================================================
+    // ====================================================================================================
+    // ====================================================================================================
+
+    // template for grid meshes (the specialization below is disabled with #if 0 and is not compiled)
+
+#if 0
+    template<>
+    PrimInfo createPrimRefArray<GridMesh,false>(Scene* scene, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+    {
+      PING;
+      ParallelForForPrefixSumState<PrimInfo> pstate;
+      Scene::Iterator<GridMesh,false> iter(scene);
+      
+      /* first try */
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k) -> PrimInfo
+      {
+        PrimInfo pinfo(empty);
+        for (size_t j=r.begin(); j<r.end(); j++)
+        {
+          BBox3fa bounds = empty;
+          if (!mesh->buildBounds(j,&bounds)) continue;
+          const PrimRef prim(bounds,mesh->geomID,unsigned(j));
+          pinfo.add_center2(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      
+      /* if we need to filter out geometry, run again */
+      if (pinfo.size() != prims.size())
+      {
+        progressMonitor(0);
+        pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, const PrimInfo& base) -> PrimInfo
+        {
+          k = base.size();
+          PrimInfo pinfo(empty);
+          for (size_t j=r.begin(); j<r.end(); j++)
+          {
+            BBox3fa bounds = empty;
+            if (!mesh->buildBounds(j,&bounds)) continue;
+            const PrimRef prim(bounds,mesh->geomID,unsigned(j));
+            pinfo.add_center2(prim);
+            prims[k++] = prim;
+          }
+          return pinfo;
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      }
+      return pinfo;
+    }
+#endif
+
+    // ====================================================================================================
+    // ====================================================================================================
+    // ====================================================================================================
+
+    IF_ENABLED_TRIS (template size_t createMortonCodeArray<TriangleMesh>(TriangleMesh* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+    IF_ENABLED_QUADS(template size_t createMortonCodeArray<QuadMesh>(QuadMesh* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+    IF_ENABLED_USER (template size_t createMortonCodeArray<UserGeometry>(UserGeometry* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+    IF_ENABLED_INSTANCE (template size_t createMortonCodeArray<Instance>(Instance* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/primrefgen.h b/thirdparty/embree-aarch64/kernels/builders/primrefgen.h
new file mode 100644
index 0000000000..9919c945c3
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/primrefgen.h
@@ -0,0 +1,28 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/scene.h"
+#include "../common/primref.h"
+#include "../common/primref_mb.h"
+#include "priminfo.h"
+#include "bvh_builder_morton.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    PrimInfo createPrimRefArray(Geometry* geometry, unsigned int geomID, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor); // primrefs of a single geometry
+   
+    PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor); // primrefs of the scene geometries selected by types and mblur
+   
+    PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime = 0); // primrefs of the selected geometries for itime, which defaults to 0
+
+    PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f)); // PrimRefMB entries for the time range t0t1, which defaults to [0,1]
+
+    template<typename Mesh>
+      size_t createMortonCodeArray(Mesh* mesh, mvector<BVHBuilderMorton::BuildPrim>& morton, BuildProgressMonitor& progressMonitor); // fills morton for one mesh and returns a primitive count
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h b/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h
new file mode 100644
index 0000000000..8bdb38b955
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h
@@ -0,0 +1,371 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../builders/primrefgen.h"
+#include "../builders/heuristic_spatial.h"
+#include "../builders/splitter.h"
+
+#include "../../common/algorithms/parallel_for_for.h"
+#include "../../common/algorithms/parallel_for_for_prefix_sum.h"
+
+#define DBG_PRESPLIT(x)   
+#define CHECK_PRESPLIT(x) 
+
+#define GRID_SIZE 1024
+#define MAX_PRESPLITS_PER_PRIMITIVE_LOG 5
+#define MAX_PRESPLITS_PER_PRIMITIVE (1<<MAX_PRESPLITS_PER_PRIMITIVE_LOG)
+#define PRIORITY_CUTOFF_THRESHOLD 1.0f
+#define PRIORITY_SPLIT_POS_WEIGHT 1.5f
+
+namespace embree
+{  
+  namespace isa
+  {
+
+    struct PresplitItem
+    {
+      union {
+        float priority;    // split priority of the primitive (float view of the 32-bit payload)
+        unsigned int data; // integer view of the same 32 bits
+      };
+      unsigned int index;  // index of the primitive this item refers to
+      /*! returns the raw bit pattern of `priority`; NOTE(review): presumably the key read by radix_sort_u32 -- confirm */
+      __forceinline operator unsigned() const
+      {
+	return reinterpret_cast<const unsigned&>(priority);
+      }
+      __forceinline bool operator < (const PresplitItem& item) const // ascending order by priority
+      {
+	return (priority < item.priority);
+      }
+      /*! priority = ((area_aabb - area_prim) * PRIORITY_SPLIT_POS_WEIGHT^diff)^(1/4), with diff computed from mc.x ^ mc.y */
+      template<typename Mesh>
+      __forceinline static float compute_priority(const PrimRef &ref, Scene *scene, const Vec2i &mc)
+      {
+	const unsigned int geomID = ref.geomID();
+	const unsigned int primID = ref.primID();
+	const float area_aabb  = area(ref.bounds());
+	const float area_prim  = ((Mesh*)scene->get(geomID))->projectedPrimitiveArea(primID);
+        const unsigned int diff = 31 - lzcnt(mc.x^mc.y); // presumably the index of the highest bit in which mc.x and mc.y differ
+        assert(area_prim <= area_aabb);
+        //const float priority = powf((area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff),1.0f/4.0f);   
+        const float priority = sqrtf(sqrtf( (area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff) )); // fourth root written as two sqrtf calls
+        assert(priority >= 0.0f && priority < FLT_LARGE);
+	return priority;      
+      }
+
+    
+    };
+
+    inline std::ostream &operator<<(std::ostream &cout, const PresplitItem& item) { // stream printer; `cout` here is the stream argument, not necessarily std::cout
+      return cout << "index " << item.index << " priority " << item.priority;    
+    };
+
+    template<typename SplitterFactory>    
+      void splitPrimitive(SplitterFactory &Splitter,                      // factory: Splitter(prim) returns the callable used to clip one primitive
+                          const PrimRef &prim,                            // (sub-)primitive bounds to subdivide
+                          const unsigned int geomID,                      // geomID/primID are stamped onto every generated sub-PrimRef
+                          const unsigned int primID,
+                          const unsigned int split_level,                 // remaining recursion depth; 0 appends `prim` unchanged
+                          const Vec3fa &grid_base,                        // world-space origin used for the grid mapping
+                          const float grid_scale,                         // factor applied to (position - grid_base) to get grid coordinates
+                          const float grid_extend,                        // world-space extent used to map a grid position back to world space
+                          PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE],  // output buffer; each call appends one entry or recurses twice
+                          unsigned int& numSubPrims)                      // in/out: number of entries already written to subPrims
+    {
+      assert(split_level <= MAX_PRESPLITS_PER_PRIMITIVE_LOG);
+      if (split_level == 0)
+      {
+        assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE);
+        subPrims[numSubPrims++] = prim;
+      }
+      else
+      {
+        const Vec3fa lower = prim.lower;
+        const Vec3fa upper = prim.upper;
+        const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f); // grid-space bounds, moved inwards by 0.2 grid units per side
+        const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f);
+        Vec3ia ilower(floor(glower));
+        Vec3ia iupper(floor(gupper));
+
+        /* this ignores dimensions that are empty */
+        iupper = (Vec3ia)(select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper)));
+
+        /* compute a morton code for the lower and upper grid coordinates. */
+        const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z);
+        const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z);
+			
+        /* if all bits are equal then we cannot split */
+        if(unlikely(lower_code == upper_code))
+        {
+          assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE);
+          subPrims[numSubPrims++] = prim;
+          return;
+        }
+		    
+        /* compute octree level and dimension to perform the split in */
+        const unsigned int diff = 31 - lzcnt(lower_code^upper_code);
+        const unsigned int level = diff / 3; // three interleaved bits (x,y,z) per level
+        const unsigned int dim   = diff % 3;
+      
+        /* now we compute the grid position of the split */
+        const unsigned int isplit = iupper[dim] & ~((1<<level)-1); // iupper[dim] with its lowest `level` bits cleared
+			    
+        /* compute world space position of split */
+        const float inv_grid_size = 1.0f / GRID_SIZE;
+        const float fsplit = grid_base[dim] + isplit * inv_grid_size * grid_extend;
+
+        assert(prim.lower[dim] <= fsplit &&
+               prim.upper[dim] >= fsplit);
+		
+        /* split primitive */
+        const auto splitter = Splitter(prim);
+        BBox3fa left,right;
+        splitter(prim.bounds(),dim,fsplit,left,right);
+        assert(!left.empty());
+        assert(!right.empty());
+
+        /* recurse into both halves with one split level less; results accumulate in subPrims/numSubPrims */
+        splitPrimitive(Splitter,PrimRef(left ,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+        splitPrimitive(Splitter,PrimRef(right,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+      }
+    }
+    
+    
+    template<typename Mesh, typename SplitterFactory>    
+      PrimInfo createPrimRefArray_presplit(Geometry* geometry, unsigned int geomID, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+    { // NOTE: this single-geometry overload only fills `prims`; its body performs no splitting and does not use Mesh or SplitterFactory
+      ParallelPrefixSumState<PrimInfo> pstate;
+      
+      /* first try */
+      progressMonitor(0);
+      PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo {
+	  return geometry->createPrimRefArray(prims,r,r.begin(),geomID); // first pass passes r.begin() as the third argument
+	}, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+      /* if we need to filter out geometry, run again */
+      if (pinfo.size() != numPrimRefs) // the first pass produced a different count than the caller expected
+	{
+	  progressMonitor(0);
+	  pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo {
+	      return geometry->createPrimRefArray(prims,r,base.size(),geomID); // second pass passes the prefix-sum count base.size() instead
+	    }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+	}
+      return pinfo;	
+    }
+    
+    __forceinline Vec2i computeMC(const Vec3fa &grid_base, const float grid_scale, const PrimRef &ref)
+    { // returns Vec2i(lower_code, upper_code): bit-interleaved grid coordinates of the lower and upper corner of ref's bounds
+      const Vec3fa lower = ref.lower;
+      const Vec3fa upper = ref.upper;
+      const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f); // grid-space bounds, moved inwards by 0.2 grid units per side
+      const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f);
+      Vec3ia ilower(floor(glower));
+      Vec3ia iupper(floor(gupper));
+      
+      /* this ignores dimensions that are empty */
+      iupper = (Vec3ia)select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper));
+
+      /* compute a morton code for the lower and upper grid coordinates. */
+      const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z);
+      const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z);
+      return Vec2i(lower_code,upper_code);
+    }
+
+    template<typename Mesh, typename SplitterFactory>    
+      PrimInfo createPrimRefArray_presplit(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+    {	
+      static const size_t MIN_STEP_SIZE = 128;
+      /* Fills `prims` from the scene, then uses the spare capacity (prims.size() minus the number of refs) to pre-split high-priority primitives; returns the PrimInfo recomputed over all resulting refs. */
+      ParallelForForPrefixSumState<PrimInfo> pstate;
+      Scene::Iterator2 iter(scene,types,mblur);
+
+      /* first try */
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+	  return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID);
+	}, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      
+      /* if we need to filter out geometry, run again */
+      if (pinfo.size() != numPrimRefs)
+	{
+	  progressMonitor(0);
+	  pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+	      return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID);
+	    }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+	}
+
+      /* use correct number of primitives */
+      size_t numPrimitives = pinfo.size();
+      const size_t alloc_numPrimitives = prims.size(); 
+      const size_t numSplitPrimitivesBudget = alloc_numPrimitives - numPrimitives; // free slots at the end of `prims` that sub-primitives may occupy
+
+      /* set up primitive splitter */
+      SplitterFactory Splitter(scene);
+
+      DBG_PRESPLIT(
+        const size_t org_numPrimitives = pinfo.size();
+        PRINT(numPrimitives);		
+        PRINT(alloc_numPrimitives);		
+        PRINT(numSplitPrimitivesBudget);
+        );
+
+      /* allocate double buffer presplit items */
+      const size_t presplit_allocation_size = sizeof(PresplitItem)*alloc_numPrimitives;
+      PresplitItem *presplitItem     = (PresplitItem*)alignedMalloc(presplit_allocation_size,64);
+      PresplitItem *tmp_presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64);
+
+      /* compute grid */
+      const Vec3fa grid_base    = pinfo.geomBounds.lower;
+      const Vec3fa grid_diag    = pinfo.geomBounds.size();
+      const float grid_extend   = max(grid_diag.x,max(grid_diag.y,grid_diag.z));		
+      const float grid_scale    = grid_extend == 0.0f ? 0.0f : GRID_SIZE / grid_extend;
+
+      /* init presplit items and get total sum */
+      const float psum = parallel_reduce( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), 0.0f, [&](const range<size_t>& r) -> float {
+          float sum = 0.0f;
+          for (size_t i=r.begin(); i<r.end(); i++)
+          {		
+            presplitItem[i].index = (unsigned int)i;
+            const Vec2i mc = computeMC(grid_base,grid_scale,prims[i]);
+            /* if all bits are equal then we cannot split */
+            presplitItem[i].priority = (mc.x != mc.y) ? PresplitItem::compute_priority<Mesh>(prims[i],scene,mc) : 0.0f;    
+            /* FIXME: sum undeterministic */
+            sum += presplitItem[i].priority;
+          }
+          return sum;
+        },[](const float& a, const float& b) -> float { return a+b; });
+
+      /* compute number of splits per primitive */
+      const float inv_psum = 1.0f / psum; // only multiplied with priorities > 0, in which case psum > 0 as well
+      parallel_for( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void {
+          for (size_t i=r.begin(); i<r.end(); i++)
+          {
+            if (presplitItem[i].priority > 0.0f)
+            {
+              const float rel_p = (float)numSplitPrimitivesBudget * presplitItem[i].priority * inv_psum;
+              if (rel_p >= PRIORITY_CUTOFF_THRESHOLD) // need at least a split budget that generates two sub-prims
+              {
+                presplitItem[i].priority = max(min(ceilf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG),1.0f); // from here on `priority` holds the split level count
+                assert(presplitItem[i].priority >= 0.0f && presplitItem[i].priority <= (float)MAX_PRESPLITS_PER_PRIMITIVE_LOG);
+              }
+              else
+                presplitItem[i].priority = 0.0f;
+            }
+          }
+        });
+
+      auto isLeft = [&] (const PresplitItem &ref) { return ref.priority < PRIORITY_CUTOFF_THRESHOLD; };        
+      size_t center = parallel_partitioning(presplitItem,0,numPrimitives,isLeft,1024);
+
+      /* anything to split ? */
+      if (center < numPrimitives)
+      {
+        size_t numPrimitivesToSplit = numPrimitives - center; // not const: reduced below if the range has to shrink
+        assert(presplitItem[center].priority >= 1.0f);
+
+        /* sort presplit items in ascending order */
+        radix_sort_u32(presplitItem + center,tmp_presplitItem + center,numPrimitivesToSplit,1024);
+
+        CHECK_PRESPLIT(
+          parallel_for( size_t(center+1), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void {
+              for (size_t i=r.begin(); i<r.end(); i++)
+                assert(presplitItem[i-1].priority <= presplitItem[i].priority);
+            });
+          );
+        /* the sort's temporary buffer is reused from here on as two unsigned int arrays: per-item counts and their prefix sums */
+        unsigned int *primOffset0 = (unsigned int*)tmp_presplitItem;
+        unsigned int *const primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit;
+
+        /* compute actual number of sub-primitives generated within the [center;numPrimitives-1] range */
+        const size_t totalNumSubPrims = parallel_reduce( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), size_t(0), [&](const range<size_t>& t) -> size_t {
+            size_t sum = 0;
+            for (size_t i=t.begin(); i<t.end(); i++)
+            {	
+              PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE];	
+              assert(presplitItem[i].priority >= 1.0f);
+              const unsigned int  primrefID = presplitItem[i].index;	
+              const float prio              = presplitItem[i].priority;
+              const unsigned int   geomID   = prims[primrefID].geomID();
+              const unsigned int   primID   = prims[primrefID].primID();
+              const unsigned int split_levels = (unsigned int)prio;
+              unsigned int numSubPrims = 0;
+              splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+              assert(numSubPrims);
+              numSubPrims--; // can reuse slot 
+              sum+=numSubPrims;
+              presplitItem[i].data = (numSubPrims << MAX_PRESPLITS_PER_PRIMITIVE_LOG) | split_levels; // pack the count above the split level bits
+              primOffset0[i-center] = numSubPrims;
+            }
+            return sum;
+          },[](const size_t& a, const size_t& b) -> size_t { return a+b; });
+        
+        /* if we are over budget, need to shrink the range; the offset array and item count must shrink with it */
+        if (totalNumSubPrims > numSplitPrimitivesBudget) 
+        {
+          size_t new_center = numPrimitives-1;
+          size_t sum = 0;
+          for (;new_center>=center;new_center--) // always leaves via break, because the total over the range exceeds the budget
+          {
+            const unsigned int numSubPrims = presplitItem[new_center].data >> MAX_PRESPLITS_PER_PRIMITIVE_LOG;
+            if (unlikely(sum + numSubPrims >= numSplitPrimitivesBudget)) break;
+            sum += numSubPrims;
+          }
+          new_center++;
+          primOffset0 += new_center - center;          // skip the counts of items that are no longer split
+          numPrimitivesToSplit -= new_center - center; // so the prefix sum below covers exactly [center;numPrimitives-1]
+          center = new_center;
+        }
+
+        /* parallel prefix sum to compute offsets for storing sub-primitives */
+        const unsigned int offset = parallel_prefix_sum(primOffset0,primOffset1,numPrimitivesToSplit,(unsigned int)0,std::plus<unsigned int>());
+
+        /* iterate over range, and split primitives into sub primitives and append them to prims array */		    
+        parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& rn) -> void {
+            for (size_t j=rn.begin(); j<rn.end(); j++)		    
+            {
+              PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE];
+              const unsigned int  primrefID = presplitItem[j].index;	
+              const unsigned int   geomID   = prims[primrefID].geomID();
+              const unsigned int   primID   = prims[primrefID].primID();
+              const unsigned int split_levels = presplitItem[j].data & ((unsigned int)(1 << MAX_PRESPLITS_PER_PRIMITIVE_LOG)-1);
+
+              assert(split_levels);
+              assert(split_levels <= MAX_PRESPLITS_PER_PRIMITIVE_LOG);
+              unsigned int numSubPrims = 0;
+              splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+              const size_t newID = numPrimitives + primOffset1[j-center];              
+              assert(newID+numSubPrims <= alloc_numPrimitives);
+              prims[primrefID] = subPrims[0]; // the first sub-primitive reuses the original slot
+              for (size_t i=1;i<numSubPrims;i++)
+                prims[newID+i-1] = subPrims[i];
+            }
+          });
+
+        numPrimitives += offset;
+        DBG_PRESPLIT(
+          PRINT(pinfo.size());
+          PRINT(numPrimitives);
+          PRINT((float)numPrimitives/org_numPrimitives));                
+      }
+                
+      /* recompute centroid bounding boxes */
+      pinfo = parallel_reduce(size_t(0),numPrimitives,size_t(MIN_STEP_SIZE),PrimInfo(empty),[&] (const range<size_t>& r) -> PrimInfo {
+          PrimInfo p(empty);
+          for (size_t j=r.begin(); j<r.end(); j++)
+            p.add_center2(prims[j]);
+          return p;
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+  
+      assert(pinfo.size() == numPrimitives);
+      
+      /* free double buffer presplit items */
+      alignedFree(tmp_presplitItem);		
+      alignedFree(presplitItem);
+      return pinfo;	
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/splitter.h b/thirdparty/embree-aarch64/kernels/builders/splitter.h
new file mode 100644
index 0000000000..dbd6cf07c7
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/splitter.h
@@ -0,0 +1,169 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/scene.h"
+#include "../common/primref.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<size_t N>
+    __forceinline void splitPolygon(const BBox3fa& bounds,          // box both results are intersected with
+                                    const size_t dim,               // axis index of the splitting plane
+                                    const float pos,                // plane position along `dim`
+                                    const Vec3fa (&v)[N+1],         // N+1 vertices; edge i runs from v[i] to v[i+1]
+                                    const Vec3fa (&inv_length)[N],  // per-edge factors; inv_length[i][dim] scales (pos - v[i][dim])
+                                    BBox3fa& left_o,                // out: bounds of the geometry with coordinate <= pos, clipped to `bounds`
+                                    BBox3fa& right_o)               // out: bounds of the geometry with coordinate >= pos, clipped to `bounds`
+    {
+      BBox3fa left = empty, right = empty;
+      /* clip triangle to left and right box by processing all edges */
+      for (size_t i=0; i<N; i++)
+      {
+        const Vec3fa &v0 = v[i]; 
+        const Vec3fa &v1 = v[i+1]; 
+        const float v0d = v0[dim];
+        const float v1d = v1[dim];
+        
+        if (v0d <= pos) left. extend(v0); // this point is on left side
+        if (v0d >= pos) right.extend(v0); // this point is on right side
+        
+        if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) // the edge crosses the splitting location
+        {
+          assert((v1d-v0d) != 0.0f);
+          const Vec3fa c = madd(Vec3fa((pos-v0d)*inv_length[i][dim]),v1-v0,v0); // point on the edge at the plane; belongs to both sides
+          left.extend(c);
+          right.extend(c);
+        }
+      }
+      
+      /* clip against current bounds */
+      left_o  = intersect(left,bounds);
+      right_o = intersect(right,bounds);
+    }
+    
+    template<size_t N>
+      __forceinline void splitPolygon(const PrimRef& prim,     // supplies the clipping bounds and the geomID/primID of both results
+                                      const size_t dim,        // axis index of the splitting plane
+                                      const float pos,         // plane position along `dim`
+                                      const Vec3fa (&v)[N+1],  // N+1 vertices; edge i runs from v[i] to v[i+1]
+                                      PrimRef& left_o,         // out: constructed in place via placement new
+                                      PrimRef& right_o)        // out: constructed in place via placement new
+    {
+      BBox3fa left = empty, right = empty;
+      for (size_t i=0; i<N; i++)
+      {
+        const Vec3fa &v0 = v[i]; 
+        const Vec3fa &v1 = v[i+1]; 
+        const float v0d = v0[dim];
+        const float v1d = v1[dim];
+        
+        if (v0d <= pos) left. extend(v0); // this point is on left side
+        if (v0d >= pos) right.extend(v0); // this point is on right side
+        
+        if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) // the edge crosses the splitting location
+        {
+          assert((v1d-v0d) != 0.0f);
+          const float inv_length = 1.0f/(v1d-v0d); // computed per edge here; this overload takes no precomputed reciprocals
+          const Vec3fa c = madd(Vec3fa((pos-v0d)*inv_length),v1-v0,v0);
+          left.extend(c);
+          right.extend(c);
+        }
+      }
+      
+      /* clip against current bounds */
+      new (&left_o ) PrimRef(intersect(left ,prim.bounds()),prim.geomID(), prim.primID());
+      new (&right_o) PrimRef(intersect(right,prim.bounds()),prim.geomID(), prim.primID());
+    }
+    
+    struct TriangleSplitter
+    {
+      __forceinline TriangleSplitter(const Scene* scene, const PrimRef& prim) // fetches the triangle's vertices once so both operator() overloads can reuse them
+      {
+        const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; // clears the top RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS bits of the geomID
+        const TriangleMesh* mesh = (const TriangleMesh*) scene->get(prim.geomID() & mask );  
+        TriangleMesh::Triangle tri = mesh->triangle(prim.primID());
+        v[0] = mesh->vertex(tri.v[0]);
+        v[1] = mesh->vertex(tri.v[1]);
+        v[2] = mesh->vertex(tri.v[2]);
+        v[3] = mesh->vertex(tri.v[0]); // first vertex repeated so that edge i is always v[i] -> v[i+1]
+        inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]); // reciprocal of each edge vector, handed to the BBox3fa splitPolygon overload
+        inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]);
+        inv_length[2] = Vec3fa(1.0f) / (v[0]-v[2]);
+      }
+      /* PrimRef variant: forwards to splitPolygon<3> without the precomputed reciprocals */
+      __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const {
+        splitPolygon<3>(prim,dim,pos,v,left_o,right_o);
+      }
+      /* BBox3fa variant: forwards to splitPolygon<3> with the precomputed reciprocals */
+      __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const {
+        splitPolygon<3>(prim,dim,pos,v,inv_length,left_o,right_o);
+      }
+      
+    private:
+      Vec3fa v[4];          // three vertices plus the first one repeated
+      Vec3fa inv_length[3]; // one entry per edge
+    };
+    
+    struct TriangleSplitterFactory
+    {
+      __forceinline TriangleSplitterFactory(const Scene* scene) // stores the scene pointer only; no ownership is taken
+        : scene(scene) {}
+      /* creates a splitter for one primitive; called as Splitter(prim) by splitPrimitive */
+      __forceinline TriangleSplitter operator() (const PrimRef& prim) const {
+        return TriangleSplitter(scene,prim);
+      }
+      
+    private:
+      const Scene* scene;
+    };
+    
+    struct QuadSplitter
+    {
+      __forceinline QuadSplitter(const Scene* scene, const PrimRef& prim) // fetches the quad's vertices once so both operator() overloads can reuse them
+      {
+        const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; // clears the top RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS bits of the geomID
+        const QuadMesh* mesh = (const QuadMesh*) scene->get(prim.geomID() & mask );  
+        QuadMesh::Quad quad = mesh->quad(prim.primID());
+        v[0] = mesh->vertex(quad.v[0]);
+        v[1] = mesh->vertex(quad.v[1]);
+        v[2] = mesh->vertex(quad.v[2]);
+        v[3] = mesh->vertex(quad.v[3]);
+        v[4] = mesh->vertex(quad.v[0]); // first vertex repeated so that edge i is always v[i] -> v[i+1]
+        inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]); // reciprocal of each edge vector, handed to the BBox3fa splitPolygon overload
+        inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]);
+        inv_length[2] = Vec3fa(1.0f) / (v[3]-v[2]);
+        inv_length[3] = Vec3fa(1.0f) / (v[0]-v[3]);
+      }
+      /* PrimRef variant: forwards to splitPolygon<4> without the precomputed reciprocals */
+      __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const {
+        splitPolygon<4>(prim,dim,pos,v,left_o,right_o);
+      }
+      /* BBox3fa variant: forwards to splitPolygon<4> with the precomputed reciprocals */
+      __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const {
+        splitPolygon<4>(prim,dim,pos,v,inv_length,left_o,right_o);
+      }
+      
+    private:
+      Vec3fa v[5];          // four vertices plus the first one repeated
+      Vec3fa inv_length[4]; // one entry per edge
+    };
+    
+    struct QuadSplitterFactory
+    {
+      __forceinline QuadSplitterFactory(const Scene* scene) // stores the scene pointer only; no ownership is taken
+        : scene(scene) {}
+      /* creates a splitter for one primitive; called as Splitter(prim) by splitPrimitive */
+      __forceinline QuadSplitter operator() (const PrimRef& prim) const {
+        return QuadSplitter(scene,prim);
+      }
+      
+    private:
+      const Scene* scene;
+    };
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh.cpp
new file mode 100644
index 0000000000..bd102bd6ef
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh.cpp
@@ -0,0 +1,190 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_statistics.h"
+
+namespace embree
+{
+  template<int N>
+  BVHN<N>::BVHN (const PrimitiveType& primTy, Scene* scene) // `scene` is dereferenced in the initializer list and must not be null
+    : AccelData((N==4) ? AccelData::TY_BVH4 : (N==8) ? AccelData::TY_BVH8 : AccelData::TY_UNKNOWN), // type tag chosen from the branching factor N
+      primTy(&primTy), device(scene->device), scene(scene), // primTy is stored by address, so the referenced object must outlive this BVH
+      root(emptyNode), alloc(scene->device,scene->isStaticAccel()), numPrimitives(0), numVertices(0) // starts with an empty root and zero counts
+  {
+  }
+
+  template<int N>
+  BVHN<N>::~BVHN ()
+  {
+    /* release every entry of `objects` in index order; deleting a null entry is a no-op */
+    for (size_t idx=0, count=objects.size(); idx!=count; ++idx) delete objects[idx];
+  }
+
+  template<int N>
+  void BVHN<N>::clear() // resets the tree to the empty state and clears the node allocator
+  {
+    set(BVHN::emptyNode,empty,0); // root = emptyNode, bounds = empty, numPrimitives = 0
+    alloc.clear();
+  }
+
+  template<int N>
+  void BVHN<N>::set (NodeRef root, const LBBox3fa& bounds, size_t numPrimitives)
+  {
+    /* the parameters shadow the members of the same name, hence the explicit this-> on the left-hand side */
+    this->numPrimitives = numPrimitives;
+    this->bounds = bounds;
+    this->root = root;
+  }	
+
+  template<int N>
+  void BVHN<N>::clearBarrier(NodeRef& node) // clears barrier marks at or below `node`; on each path the descent stops at the first barrier found
+  {
+    if (node.isBarrier())
+      node.clearBarrier(); // children of a barrier node are not visited
+    else if (!node.isLeaf()) {
+      BaseNode* n = node.baseNode(); // FIXME: flags should be stored in BVH
+      for (size_t c=0; c<N; c++)
+        clearBarrier(n->child(c));
+    }
+  }
+
+  template<int N>
+  void BVHN<N>::layoutLargeNodes(size_t num) // body is compiled only for the platforms in the #if below; elsewhere this is a no-op
+  {
+#if defined(__X86_64__) || defined(__aarch64__) // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues
+    struct NodeArea // heap entry: pointer to a node reference plus the area used as heap key
+    {
+      __forceinline NodeArea() {}
+
+      __forceinline NodeArea(NodeRef& node, const BBox3fa& bounds)
+        : node(&node), A(node.isLeaf() ? float(neg_inf) : area(bounds)) {} // leaves get -inf so they compare below every inner node
+
+      __forceinline bool operator< (const NodeArea& other) const {
+        return this->A < other.A;
+      }
+
+      NodeRef* node;
+      float A;
+    };
+    std::vector<NodeArea> lst;
+    lst.reserve(num);
+    lst.push_back(NodeArea(root,empty));
+    /* repeatedly replace the largest-area entry by its non-empty children until `num` entries are collected */
+    while (lst.size() < num)
+    {
+      std::pop_heap(lst.begin(), lst.end());
+      NodeArea n = lst.back(); lst.pop_back();
+      if (!n.node->isAABBNode()) break; // the popped entry is not re-inserted, so it receives no barrier below
+      AABBNode* node = n.node->getAABBNode();
+      for (size_t i=0; i<N; i++) {
+        if (node->child(i) == BVHN::emptyNode) continue;
+        lst.push_back(NodeArea(node->child(i),node->bounds(i)));
+        std::push_heap(lst.begin(), lst.end());
+      }
+    }
+    /* mark the collected frontier; the recursion below copies the nodes above it and stops at these marks */
+    for (size_t i=0; i<lst.size(); i++)
+      lst[i].node->setBarrier();
+      
+    root = layoutLargeNodesRecursion(root,alloc.getCachedAllocator());
+#endif
+  }
+  
+  template<int N>
+  typename BVHN<N>::NodeRef BVHN<N>::layoutLargeNodesRecursion(NodeRef& node, const FastAllocator::CachedAllocator& allocator)
+  { // returns the reference to use in place of `node`: a freshly allocated copy for AABB nodes above a barrier, otherwise `node` itself
+    if (node.isBarrier()) {
+      node.clearBarrier(); // barrier reached: remove the mark and keep the subtree where it is
+      return node;
+    }
+    else if (node.isAABBNode()) 
+    {
+      AABBNode* oldnode = node.getAABBNode();
+      AABBNode* newnode = (BVHN::AABBNode*) allocator.malloc0(sizeof(BVHN::AABBNode),byteNodeAlignment);
+      *newnode = *oldnode; // copy the whole node, then overwrite each child reference with the result of the recursion
+      for (size_t c=0; c<N; c++)
+        newnode->child(c) = layoutLargeNodesRecursion(oldnode->child(c),allocator);
+      return encodeNode(newnode);
+    }
+    else return node; // any other node kind is returned unchanged
+  }
+
+  template<int N>
+  double BVHN<N>::preBuild(const std::string& builderName)
+  {
+    /* an unnamed builder disables reporting; inf is the sentinel postBuild() tests for */
+    if (builderName.empty()) return inf;
+    const bool verbose = device->verbosity(2);
+    if (verbose)
+    {
+      const char* const mblurTag = (builderName.find("MBlur") != std::string::npos) ? "MB" : "";
+      Lock<MutexSys> lock(g_printMutex);
+      std::cout << "building BVH" << N << mblurTag << "<" << primTy->name() << "> using " << builderName << " ..." << std::endl << std::flush;
+    }
+    /* a start time is only taken when benchmark mode or verbose output will consume it */
+    const bool timed = device->benchmark || verbose;
+    return timed ? getSeconds() : 0.0;
+  }
+
+  template<int N>
+  void BVHN<N>::postBuild(double t0) // t0 is the value returned by preBuild()
+  {
+    if (t0 == double(inf)) // inf is what preBuild() returns for an empty builder name: nothing to report
+      return;
+    
+    double dt = 0.0;
+    if (device->benchmark || device->verbosity(2)) 
+      dt = getSeconds()-t0;
+    /* created lazily by whichever of the two report paths below runs first */
+    std::unique_ptr<BVHNStatistics<N>> stat;
+
+    /* print statistics */
+    if (device->verbosity(2))
+    {
+      if (!stat) stat.reset(new BVHNStatistics<N>(this));
+      const size_t usedBytes = alloc.getUsedBytes();
+      Lock<MutexSys> lock(g_printMutex);
+      std::cout << "finished BVH" << N << "<" << primTy->name() << "> : " << 1000.0f*dt << "ms, " << 1E-6*double(numPrimitives)/dt << " Mprim/s, " << 1E-9*double(usedBytes)/dt << " GB/s" << std::endl;
+    
+      if (device->verbosity(2)) // same condition as the enclosing block
+        std::cout << stat->str();
+
+      if (device->verbosity(2)) // same condition as the enclosing block
+      {
+        FastAllocator::AllStatistics stat(&alloc); // NOTE: shadows the outer unique_ptr `stat` inside this scope
+        for (size_t i=0; i<objects.size(); i++)
+          if (objects[i])
+            stat = stat + FastAllocator::AllStatistics(&objects[i]->alloc);
+
+        stat.print(numPrimitives);
+      }
+
+      if (device->verbosity(3))
+      {
+        alloc.print_blocks();
+        for (size_t i=0; i<objects.size(); i++)
+          if (objects[i]) 
+            objects[i]->alloc.print_blocks();
+      }
+
+      std::cout << std::flush;
+    }
+
+    /* benchmark mode */
+    if (device->benchmark)
+    {
+      if (!stat) stat.reset(new BVHNStatistics<N>(this));
+      Lock<MutexSys> lock(g_printMutex);
+      std::cout << "BENCHMARK_BUILD " << dt << " " << double(numPrimitives)/dt << " " << stat->sah() << " " << stat->bytesUsed() << " BVH" << N << "<" << primTy->name() << ">" << std::endl << std::flush;
+    }
+  }
+
+#if defined(__AVX__)
+  template class BVHN<8>; // explicit instantiation of the 8-wide BVH, emitted only when __AVX__ is defined
+#endif
+
+#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) || defined(__aarch64__)
+  template class BVHN<4>; // && binds tighter than ||: (!AVX) || (!EMBREE_TARGET_SSE2 && !EMBREE_TARGET_SSE42) || aarch64
+#endif
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh.h b/thirdparty/embree-aarch64/kernels/bvh/bvh.h
new file mode 100644
index 0000000000..8fdf912e52
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh.h
@@ -0,0 +1,235 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+/* include all node types */
+#include "bvh_node_aabb.h"
+#include "bvh_node_aabb_mb.h"
+#include "bvh_node_aabb_mb4d.h"
+#include "bvh_node_obb.h"
+#include "bvh_node_obb_mb.h"
+#include "bvh_node_qaabb.h"
+
+namespace embree
+{
+  /*! flags used to enable specific node types in intersectors */
+  enum BVHNodeFlags // each BVH_FLAG_* value is a distinct single bit, so flags can be combined with |
+  {
+    BVH_FLAG_ALIGNED_NODE = 0x00001,        //!< bit 0; numerically smallest flag (BVHN::prefetch compares types > BVH_FLAG_ALIGNED_NODE)
+    BVH_FLAG_ALIGNED_NODE_MB = 0x00010,     //!< bit 4
+    BVH_FLAG_UNALIGNED_NODE = 0x00100,      //!< bit 8
+    BVH_FLAG_UNALIGNED_NODE_MB = 0x01000,   //!< bit 12
+    BVH_FLAG_QUANTIZED_NODE = 0x100000,     //!< bit 20
+    BVH_FLAG_ALIGNED_NODE_MB4D = 0x1000000, //!< bit 24
+    
+    /* short aliases and frequently used combinations of the flags above */
+    BVH_AN1 = BVH_FLAG_ALIGNED_NODE,
+    BVH_AN2 = BVH_FLAG_ALIGNED_NODE_MB,
+    BVH_AN2_AN4D = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D,
+    BVH_UN1 = BVH_FLAG_UNALIGNED_NODE,
+    BVH_UN2 = BVH_FLAG_UNALIGNED_NODE_MB,
+    BVH_MB = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_UNALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D, //!< every *_MB and MB4D flag
+    BVH_AN1_UN1 = BVH_FLAG_ALIGNED_NODE | BVH_FLAG_UNALIGNED_NODE,
+    BVH_AN2_UN2 = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_UNALIGNED_NODE_MB,
+    BVH_AN2_AN4D_UN2 = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D | BVH_FLAG_UNALIGNED_NODE_MB, //!< same bits as BVH_MB
+    BVH_QN1 = BVH_FLAG_QUANTIZED_NODE
+  };
+  
+  /*! Multi BVH with N children. Each node stores the bounding boxes of
+   * its N children as well as N child references. */
+  template<int N>
+    class BVHN : public AccelData
+  {
+    ALIGNED_CLASS_(16);
+  public:
+    
+    /*! node reference type and aliases for the node types instantiated for this N */
+    typedef NodeRefPtr<N> NodeRef;
+    typedef BaseNode_t<NodeRef,N> BaseNode;
+    typedef AABBNode_t<NodeRef,N> AABBNode;
+    typedef AABBNodeMB_t<NodeRef,N> AABBNodeMB;
+    typedef AABBNodeMB4D_t<NodeRef,N> AABBNodeMB4D;
+    typedef OBBNode_t<NodeRef,N> OBBNode;
+    typedef OBBNodeMB_t<NodeRef,N> OBBNodeMB;
+    typedef QuantizedBaseNode_t<N> QuantizedBaseNode;
+    typedef QuantizedBaseNodeMB_t<N> QuantizedBaseNodeMB;
+    typedef QuantizedNode_t<NodeRef,N> QuantizedNode;
+    
+    /*! Number of bytes the nodes and primitives are minimally aligned to.*/
+    static const size_t byteAlignment = 16;
+    static const size_t byteNodeAlignment = 4*N;
+    
+    /*! Empty node marker, forwarded from NodeRef */
+    static const size_t emptyNode = NodeRef::emptyNode;
+    
+    /*! Invalid node, used as marker in traversal */
+    static const size_t invalidNode = NodeRef::invalidNode;
+    static const size_t popRay      = NodeRef::popRay;
+    
+    /*! Maximum depth of the BVH. */
+    static const size_t maxBuildDepth = 32;
+    static const size_t maxBuildDepthLeaf = maxBuildDepth+8;
+    static const size_t maxDepth = 2*maxBuildDepthLeaf; // 2x because of two level builder
+    
+    /*! Maximum number of primitive blocks in a leaf. */
+    static const size_t maxLeafBlocks = NodeRef::maxLeafBlocks;
+    
+  public:
+    
+    /*! Builder interface to create allocator */
+    struct CreateAlloc : public FastAllocator::Create {
+      __forceinline CreateAlloc (BVHN* bvh) : FastAllocator::Create(&bvh->alloc) {}
+    };
+
+    typedef BVHNodeRecord<NodeRef>     NodeRecord;
+    typedef BVHNodeRecordMB<NodeRef>   NodeRecordMB;
+    typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+    
+  public:
+    
+    /*! Constructs a BVHN for the given primitive type and scene. */
+    BVHN (const PrimitiveType& primTy, Scene* scene);
+    
+    /*! BVHN destruction */
+    ~BVHN ();
+    
+    /*! clears the acceleration structure */
+    void clear();
+    
+    /*! sets BVH members after build */
+    void set (NodeRef root, const LBBox3fa& bounds, size_t numPrimitives);
+    
+    /*! Clears the barrier bits of a subtree. */
+    void clearBarrier(NodeRef& node);
+    
+    /*! lays out num large nodes of the BVH */
+    void layoutLargeNodes(size_t num);
+    NodeRef layoutLargeNodesRecursion(NodeRef& node, const FastAllocator::CachedAllocator& allocator);
+    
+    /*! called by all builders before build starts */
+    double preBuild(const std::string& builderName);
+    
+    /*! called by all builders after build ended */
+    void postBuild(double t0);
+    
+    /*! allocator functor: requests bytes from the BVH's FastAllocator through its thread-local allocator */
+    struct Allocator {
+      BVHN* bvh;
+      Allocator (BVHN* bvh) : bvh(bvh) {}
+      __forceinline void* operator() (size_t bytes) const { 
+        return bvh->alloc._threadLocal()->malloc(&bvh->alloc,bytes); 
+      }
+    };
+    
+    /*! post build cleanup, forwards to the allocator's cleanup() */
+    void cleanup() {
+      alloc.cleanup();
+    }
+    
+  public:
+    
+    /*! Encodes a node or leaf pointer as a NodeRef (thin forwarders to the NodeRef static functions) */
+    static __forceinline NodeRef encodeNode(AABBNode* node) { return NodeRef::encodeNode(node); }
+    static __forceinline NodeRef encodeNode(AABBNodeMB* node) { return NodeRef::encodeNode(node); }
+    static __forceinline NodeRef encodeNode(AABBNodeMB4D* node) { return NodeRef::encodeNode(node); }
+    static __forceinline NodeRef encodeNode(OBBNode* node) { return NodeRef::encodeNode(node); }
+    static __forceinline NodeRef encodeNode(OBBNodeMB* node) { return NodeRef::encodeNode(node); }
+    static __forceinline NodeRef encodeLeaf(void* tri, size_t num) { return NodeRef::encodeLeaf(tri,num); }
+    static __forceinline NodeRef encodeTypedLeaf(void* ptr, size_t ty) { return NodeRef::encodeTypedLeaf(ptr,ty); }
+    
+  public:
+    
+    /*! Prefetches the node this reference points to; types (BVHNodeFlags values) and N decide how many 64-byte chunks are touched */
+    __forceinline static void prefetch(const NodeRef ref, int types=0)
+    {
+#if defined(__AVX512PF__) // MIC
+      if (types != BVH_FLAG_QUANTIZED_NODE) {
+        prefetchL2(((char*)ref.ptr)+0*64);
+        prefetchL2(((char*)ref.ptr)+1*64);
+        if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) { // wide BVH or any flag beyond plain aligned nodes: two more chunks
+          prefetchL2(((char*)ref.ptr)+2*64);
+          prefetchL2(((char*)ref.ptr)+3*64);
+        }
+        if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) {
+          /* KNL still needs L2 prefetches for large nodes */
+          prefetchL2(((char*)ref.ptr)+4*64);
+          prefetchL2(((char*)ref.ptr)+5*64);
+          prefetchL2(((char*)ref.ptr)+6*64);
+          prefetchL2(((char*)ref.ptr)+7*64);
+        }
+      }
+      else
+      {
+        /* todo: reduce if 32bit offsets are enabled */
+        prefetchL2(((char*)ref.ptr)+0*64);
+        prefetchL2(((char*)ref.ptr)+1*64);
+        prefetchL2(((char*)ref.ptr)+2*64);
+      }
+#else
+      if (types != BVH_FLAG_QUANTIZED_NODE) {
+        prefetchL1(((char*)ref.ptr)+0*64);
+        prefetchL1(((char*)ref.ptr)+1*64);
+        if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) { // same chunk selection as the __AVX512PF__ path, but via prefetchL1
+          prefetchL1(((char*)ref.ptr)+2*64);
+          prefetchL1(((char*)ref.ptr)+3*64);
+        }
+        if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) {
+          /* deactivate for large nodes on Xeon, as it introduces regressions */
+          //prefetchL1(((char*)ref.ptr)+4*64);
+          //prefetchL1(((char*)ref.ptr)+5*64);
+          //prefetchL1(((char*)ref.ptr)+6*64);
+          //prefetchL1(((char*)ref.ptr)+7*64);
+        }
+      }
+      else
+      {
+        /* todo: reduce if 32bit offsets are enabled */
+        prefetchL1(((char*)ref.ptr)+0*64);
+        prefetchL1(((char*)ref.ptr)+1*64);
+        prefetchL1(((char*)ref.ptr)+2*64);
+      }
+#endif
+    }
+    
+    __forceinline static void prefetchW(const NodeRef ref, int types=0) // prefetchEX variant of prefetch(); has no special case for quantized nodes
+    {
+      embree::prefetchEX(((char*)ref.ptr)+0*64);
+      embree::prefetchEX(((char*)ref.ptr)+1*64);
+      if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) {
+        embree::prefetchEX(((char*)ref.ptr)+2*64);
+        embree::prefetchEX(((char*)ref.ptr)+3*64);
+      }
+      if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) {
+        embree::prefetchEX(((char*)ref.ptr)+4*64);
+        embree::prefetchEX(((char*)ref.ptr)+5*64);
+        embree::prefetchEX(((char*)ref.ptr)+6*64);
+        embree::prefetchEX(((char*)ref.ptr)+7*64);
+      }
+    }
+    
+    /*! bvh type information */
+  public:
+    const PrimitiveType* primTy;       //!< primitive type stored in the BVH
+    
+    /*! bvh data */
+  public:
+    Device* device;                    //!< device pointer
+    Scene* scene;                      //!< scene pointer
+    NodeRef root;                      //!< root node
+    FastAllocator alloc;               //!< allocator used to allocate nodes
+    
+    /*! statistics data */
+  public:
+    size_t numPrimitives;              //!< number of primitives the BVH is built over
+    size_t numVertices;                //!< number of vertices the BVH references
+    
+    /*! data arrays for special builders */
+  public:
+    std::vector<BVHN*> objects;        //!< per-object BVHs; entries may be null (postBuild checks each entry before use)
+    vector_t<char,aligned_allocator<char,32>> subdiv_patches; //!< raw byte storage allocated with 32-byte alignment
+  };
+  
+  typedef BVHN<4> BVH4; //!< BVHN with N=4 children per node
+  typedef BVHN<8> BVH8; //!< BVHN with N=8 children per node
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp
new file mode 100644
index 0000000000..23f4f63d45
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp
@@ -0,0 +1,1325 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh4_factory.h"
+#include "../bvh/bvh.h"
+
+#include "../geometry/curveNv.h"
+#include "../geometry/curveNi.h"
+#include "../geometry/curveNi_mb.h"
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/subdivpatch1.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+#include "../common/accelinstance.h"
+
+namespace embree
+{
+  DECLARE_SYMBOL2(Accel::Collider,BVH4ColliderUserGeom); // selected in the BVH4Factory constructor from ifeatures
+
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4i,void); // these six are selected at the top of selectIntersectors(); macro arguments presumably are (return type, name, parameter list) - macro defined elsewhere
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8i,void);
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4v,void);
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8v,void);
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4iMB,void);
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8iMB,void);
+    
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1); // Accel::Intersector1 symbols; each is selected in the "select intersectors1" part of selectIntersectors()
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1MB);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1MB);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4Intersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,QBVH4Triangle4iIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,QBVH4Quad4iIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1Intersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1MBIntersector1);
+  
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4VirtualIntersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4VirtualMBIntersector1);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4Hybrid); // Accel::Intersector4 symbols; selectIntersectors() selects them under "select intersectors4", inside #if defined(EMBREE_RAY_PACKETS)
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4HybridMB);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4HybridMB);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vIntersector4HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1Intersector4);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1MBIntersector4);
+  
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4VirtualIntersector4Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4VirtualMBIntersector4Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8Hybrid); // Accel::Intersector8 symbols; same name set as the Intersector4 group above with 8 in place of 4
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8HybridMB);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8HybridMB);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vIntersector8HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1Intersector8);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1MBIntersector8);
+  
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4VirtualIntersector8Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4VirtualMBIntersector8Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16Hybrid); // Accel::Intersector16 symbols; same name set as the Intersector4 group above with 16 in place of 4
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16HybridMB);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16HybridMB);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vIntersector16HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1Intersector16);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1MBIntersector16);
+  
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4VirtualIntersector16Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4VirtualMBIntersector16Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4IntersectorStreamPacketFallback); // Accel::IntersectorN (stream) symbols; no MB, subdiv or grid stream variants are declared in this group
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4vIntersectorStreamPluecker);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamPluecker);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamPluecker);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamPluecker);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); // builder functions returning Builder*; COMMA separates parameters inside the single macro argument (presumably expands to ',' - macro defined elsewhere)
+  DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4iBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4OBBCurve4iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Curve8iBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+  
+  DECLARE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1BuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1MBBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); // NOTE(review): the five *MeshRefitSAH functions are declared here but not selected in selectBuilders() - confirm where they get assigned
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+
+  BVH4Factory::BVH4Factory(int bfeatures, int ifeatures) // bfeatures: feature set used for builder selection; ifeatures: feature set used for collider and intersector selection
+  {
+    SELECT_SYMBOL_DEFAULT_AVX_AVX2(ifeatures,BVH4ColliderUserGeom); // the collider symbol is selected with the intersector feature set
+
+    selectBuilders(bfeatures);
+    selectIntersectors(ifeatures);
+  }
+
+  void BVH4Factory::selectBuilders(int features) // selects every BVH4 builder function for the given feature set; each selection is wrapped in the IF_ENABLED_* macro of its geometry kind
+  {
+    IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelTriangle4MeshSAH));
+    IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelTriangle4iMeshSAH));
+    IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelTriangle4vMeshSAH));
+    IF_ENABLED_QUADS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelQuadMeshSAH));
+    IF_ENABLED_USER (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelVirtualSAH));
+    IF_ENABLED_INSTANCE (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelInstanceSAH));
+
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4vBuilder_OBB_New));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4iBuilder_OBB_New));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4OBBCurve4iMBBuilder_OBB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH4Curve8iBuilder_OBB_New)); // the two Curve8i builders use the INIT_AVX macro instead of DEFAULT_AVX - presumably no non-AVX default exists for them; confirm against the macro definitions
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH4OBBCurve8iMBBuilder_OBB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Triangle4SceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Triangle4vSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Triangle4iSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iMBSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vMBSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4QuantizedTriangle4iSceneBuilderSAH));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Quad4vSceneBuilderSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Quad4iSceneBuilderSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4iMBSceneBuilderSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4QuantizedQuad4iSceneBuilderSAH));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4SceneBuilderFastSpatialSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vSceneBuilderFastSpatialSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iSceneBuilderFastSpatialSAH));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4vSceneBuilderFastSpatialSAH));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4VirtualSceneBuilderSAH));
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4VirtualMBSceneBuilderSAH));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4InstanceSceneBuilderSAH));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4InstanceMBSceneBuilderSAH));
+    
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridSceneBuilderSAH));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridMBSceneBuilderSAH));
+
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4SubdivPatch1BuilderSAH));
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4SubdivPatch1MBBuilderSAH));
+  }
+
+  void BVH4Factory::selectIntersectors(int features)
+  {
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector4i));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8i));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector4v));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8v));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector4iMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8iMB));
+    
+    /* select intersectors1 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector1));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector1MB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust1));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust1MB));
+    
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4Intersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,BVH4Triangle4iIntersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,BVH4Triangle4vIntersector1Pluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,BVH4Triangle4iIntersector1Pluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector1Pluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector1Pluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector1Moeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector1Moeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector1Pluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector1Pluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector1Pluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector1Moeller));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,QBVH4Triangle4iIntersector1Pluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,QBVH4Quad4iIntersector1Pluecker));
+
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1Intersector1));
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1MBIntersector1));
+    
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualIntersector1));
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualMBIntersector1));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceIntersector1));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceMBIntersector1));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector1Moeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridMBIntersector1Moeller))
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector1Pluecker));
+
+#if defined (EMBREE_RAY_PACKETS)
+
+    /* select intersectors4 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector4Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector4HybridMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust4Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust4HybridMB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector4HybridMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vIntersector4HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector4HybridPluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector4HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector4HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector4HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector4HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector4HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector4HybridPluecker));
+
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1Intersector4));
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1MBIntersector4));
+    
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualIntersector4Chunk));
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualMBIntersector4Chunk));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceIntersector4Chunk));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceMBIntersector4Chunk));
+    
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridMoeller));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector4HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridMBIntersector4HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector4HybridPluecker));
+
+    /* select intersectors8 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector8Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector8HybridMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust8Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust8HybridMB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector8HybridMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vIntersector8HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector8HybridPluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector8HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector8HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector8HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector8HybridMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector8HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector8HybridPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector8HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector8HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector8HybridPluecker));
+
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1Intersector8));
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1MBIntersector8));
+    
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4VirtualIntersector8Chunk));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4VirtualMBIntersector8Chunk));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4InstanceIntersector8Chunk));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4InstanceMBIntersector8Chunk));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector8HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4GridMBIntersector8HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector8HybridPluecker));
+
+    /* select intersectors16 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersector16Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersector16HybridMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust16Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust16HybridMB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4Intersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4Intersector16HybridMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4vIntersector16HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersector16HybridPluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4vMBIntersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iMBIntersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4vMBIntersector16HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iMBIntersector16HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersector16HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersector16HybridMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersector16HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersector16HybridPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersector16HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iMBIntersector16HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iMBIntersector16HybridPluecker));
+
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4SubdivPatch1Intersector16));
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4SubdivPatch1MBIntersector16));
+    
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4VirtualIntersector16Chunk));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4VirtualMBIntersector16Chunk));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4InstanceIntersector16Chunk));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4InstanceMBIntersector16Chunk));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4GridIntersector16HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4GridMBIntersector16HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4GridIntersector16HybridPluecker));
+
+    /* select stream intersectors */
+    SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4IntersectorStreamPacketFallback);
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4IntersectorStreamMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4IntersectorStreamMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersectorStreamMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4vIntersectorStreamPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersectorStreamPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersectorStreamMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersectorStreamMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersectorStreamMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersectorStreamPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersectorStreamPluecker));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4VirtualIntersectorStream));
+    
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4InstanceIntersectorStream));
+
+#endif
+  }
+
+  /*! Builds the intersector table for a BVH4OBB over virtual curves.
+   *  For both supported variants the table records the BVH and the leaf
+   *  intersector; FAST selects the BVH4OBBVirtualCurveIntersector* symbols,
+   *  ROBUST the BVH4OBBVirtualCurveIntersectorRobust* ones. The
+   *  intersector4/8/16/N slots are only assigned when EMBREE_RAY_PACKETS is
+   *  defined. Any other variant hits assert(false) and returns a
+   *  default-constructed table. */
+  Accel::Intersectors BVH4Factory::BVH4OBBVirtualCurveIntersectors(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors table;
+      table.ptr             = bvh;
+      table.leafIntersector = leafIntersector;
+      table.intersector1    = BVH4OBBVirtualCurveIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4    = BVH4OBBVirtualCurveIntersector4Hybrid();
+      table.intersector8    = BVH4OBBVirtualCurveIntersector8Hybrid();
+      table.intersector16   = BVH4OBBVirtualCurveIntersector16Hybrid();
+      table.intersectorN    = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors table;
+      table.ptr             = bvh;
+      table.leafIntersector = leafIntersector;
+      table.intersector1    = BVH4OBBVirtualCurveIntersectorRobust1();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4    = BVH4OBBVirtualCurveIntersectorRobust4Hybrid();
+      table.intersector8    = BVH4OBBVirtualCurveIntersectorRobust8Hybrid();
+      table.intersector16   = BVH4OBBVirtualCurveIntersectorRobust16Hybrid();
+      table.intersectorN    = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    /* neither FAST nor ROBUST */
+    assert(false);
+    return Accel::Intersectors();
+  }
+
+  /*! Builds the intersector table for a BVH4OBB over virtual curves using the
+   *  "MB" intersector symbols. For both supported variants the table records
+   *  the BVH and the leaf intersector; FAST selects the
+   *  BVH4OBBVirtualCurveIntersector*MB symbols, ROBUST the
+   *  BVH4OBBVirtualCurveIntersectorRobust*MB ones. The intersector4/8/16/N
+   *  slots are only assigned when EMBREE_RAY_PACKETS is defined. Any other
+   *  variant hits assert(false) and returns a default-constructed table. */
+  Accel::Intersectors BVH4Factory::BVH4OBBVirtualCurveIntersectorsMB(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors table;
+      table.ptr             = bvh;
+      table.leafIntersector = leafIntersector;
+      table.intersector1    = BVH4OBBVirtualCurveIntersector1MB();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4    = BVH4OBBVirtualCurveIntersector4HybridMB();
+      table.intersector8    = BVH4OBBVirtualCurveIntersector8HybridMB();
+      table.intersector16   = BVH4OBBVirtualCurveIntersector16HybridMB();
+      table.intersectorN    = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors table;
+      table.ptr             = bvh;
+      table.leafIntersector = leafIntersector;
+      table.intersector1    = BVH4OBBVirtualCurveIntersectorRobust1MB();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4    = BVH4OBBVirtualCurveIntersectorRobust4HybridMB();
+      table.intersector8    = BVH4OBBVirtualCurveIntersectorRobust8HybridMB();
+      table.intersector16   = BVH4OBBVirtualCurveIntersectorRobust16HybridMB();
+      table.intersectorN    = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    /* neither FAST nor ROBUST */
+    assert(false);
+    return Accel::Intersectors();
+  }
+
+  /*! Builds the intersector table for a BVH4 over Triangle4 primitives from the
+   *  Moeller intersector symbols. Only the FAST variant is expected here
+   *  (checked by assert). With EMBREE_RAY_PACKETS defined, the separate
+   *  filter/nofilter slots for intersector4/8/16/N are assigned as well. */
+  Accel::Intersectors BVH4Factory::BVH4Triangle4Intersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    assert(ivariant == IntersectVariant::FAST);
+
+    Accel::Intersectors table;
+    table.ptr          = bvh;
+    table.intersector1 = BVH4Triangle4Intersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+    /* each packet slot comes as a filter / nofilter pair */
+    table.intersector4_filter    = BVH4Triangle4Intersector4HybridMoeller();
+    table.intersector4_nofilter  = BVH4Triangle4Intersector4HybridMoellerNoFilter();
+    table.intersector8_filter    = BVH4Triangle4Intersector8HybridMoeller();
+    table.intersector8_nofilter  = BVH4Triangle4Intersector8HybridMoellerNoFilter();
+    table.intersector16_filter   = BVH4Triangle4Intersector16HybridMoeller();
+    table.intersector16_nofilter = BVH4Triangle4Intersector16HybridMoellerNoFilter();
+    table.intersectorN_filter    = BVH4Triangle4IntersectorStreamMoeller();
+    table.intersectorN_nofilter  = BVH4Triangle4IntersectorStreamMoellerNoFilter();
+#endif
+    return table;
+  }
+
+  /*! Builds the intersector table for a BVH4 over Triangle4v primitives from the
+   *  Pluecker intersector symbols. Only the ROBUST variant is expected here
+   *  (checked by assert). The intersector4/8/16/N slots are assigned only when
+   *  EMBREE_RAY_PACKETS is defined. */
+  Accel::Intersectors BVH4Factory::BVH4Triangle4vIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    assert(ivariant == IntersectVariant::ROBUST);
+
+    Accel::Intersectors table;
+    table.ptr           = bvh;
+    table.intersector1  = BVH4Triangle4vIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+    table.intersector4  = BVH4Triangle4vIntersector4HybridPluecker();
+    table.intersector8  = BVH4Triangle4vIntersector8HybridPluecker();
+    table.intersector16 = BVH4Triangle4vIntersector16HybridPluecker();
+    table.intersectorN  = BVH4Triangle4vIntersectorStreamPluecker();
+#endif
+    return table;
+  }
+
+  /*! Builds the intersector table for a BVH4 over Triangle4i primitives.
+   *  FAST selects the Moeller intersector symbols, ROBUST the Pluecker ones.
+   *  The intersector4/8/16/N slots are assigned only when EMBREE_RAY_PACKETS is
+   *  defined. Any other variant returns a default-constructed table. */
+  Accel::Intersectors BVH4Factory::BVH4Triangle4iIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors table;
+      table.ptr           = bvh;
+      table.intersector1  = BVH4Triangle4iIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4  = BVH4Triangle4iIntersector4HybridMoeller();
+      table.intersector8  = BVH4Triangle4iIntersector8HybridMoeller();
+      table.intersector16 = BVH4Triangle4iIntersector16HybridMoeller();
+      table.intersectorN  = BVH4Triangle4iIntersectorStreamMoeller();
+#endif
+      return table;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors table;
+      table.ptr           = bvh;
+      table.intersector1  = BVH4Triangle4iIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4  = BVH4Triangle4iIntersector4HybridPluecker();
+      table.intersector8  = BVH4Triangle4iIntersector8HybridPluecker();
+      table.intersector16 = BVH4Triangle4iIntersector16HybridPluecker();
+      table.intersectorN  = BVH4Triangle4iIntersectorStreamPluecker();
+#endif
+      return table;
+    }
+
+    /* neither FAST nor ROBUST */
+    return Accel::Intersectors();
+  }
+
+  /*! Builds the intersector table for a BVH4 over Triangle4vMB primitives.
+   *  FAST selects the Moeller intersector symbols, ROBUST the Pluecker ones.
+   *  When EMBREE_RAY_PACKETS is defined the intersector4/8/16 slots are
+   *  assigned and intersectorN is set to the stream packet fallback. Any other
+   *  variant returns a default-constructed table. */
+  Accel::Intersectors BVH4Factory::BVH4Triangle4vMBIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors table;
+      table.ptr           = bvh;
+      table.intersector1  = BVH4Triangle4vMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4  = BVH4Triangle4vMBIntersector4HybridMoeller();
+      table.intersector8  = BVH4Triangle4vMBIntersector8HybridMoeller();
+      table.intersector16 = BVH4Triangle4vMBIntersector16HybridMoeller();
+      table.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors table;
+      table.ptr           = bvh;
+      table.intersector1  = BVH4Triangle4vMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4  = BVH4Triangle4vMBIntersector4HybridPluecker();
+      table.intersector8  = BVH4Triangle4vMBIntersector8HybridPluecker();
+      table.intersector16 = BVH4Triangle4vMBIntersector16HybridPluecker();
+      table.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    /* neither FAST nor ROBUST */
+    return Accel::Intersectors();
+  }
+
+  /*! Builds the intersector table for a BVH4 over Triangle4iMB primitives.
+   *  FAST selects the Moeller intersector symbols, ROBUST the Pluecker ones.
+   *  When EMBREE_RAY_PACKETS is defined the intersector4/8/16 slots are
+   *  assigned and intersectorN is set to the stream packet fallback. Any other
+   *  variant returns a default-constructed table. */
+  Accel::Intersectors BVH4Factory::BVH4Triangle4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors table;
+      table.ptr           = bvh;
+      table.intersector1  = BVH4Triangle4iMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4  = BVH4Triangle4iMBIntersector4HybridMoeller();
+      table.intersector8  = BVH4Triangle4iMBIntersector8HybridMoeller();
+      table.intersector16 = BVH4Triangle4iMBIntersector16HybridMoeller();
+      table.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors table;
+      table.ptr           = bvh;
+      table.intersector1  = BVH4Triangle4iMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4  = BVH4Triangle4iMBIntersector4HybridPluecker();
+      table.intersector8  = BVH4Triangle4iMBIntersector8HybridPluecker();
+      table.intersector16 = BVH4Triangle4iMBIntersector16HybridPluecker();
+      table.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    /* neither FAST nor ROBUST */
+    return Accel::Intersectors();
+  }
+
+  /*! Builds the intersector table for a BVH4 over Quad4v primitives.
+   *  FAST selects the Moeller intersector symbols and, with EMBREE_RAY_PACKETS
+   *  defined, assigns the separate filter/nofilter slots for
+   *  intersector4/8/16/N. ROBUST selects the Pluecker symbols and assigns the
+   *  plain intersector4/8/16/N slots instead. Any other variant returns a
+   *  default-constructed table. */
+  Accel::Intersectors BVH4Factory::BVH4Quad4vIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors table;
+      table.ptr          = bvh;
+      table.intersector1 = BVH4Quad4vIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      /* each packet slot comes as a filter / nofilter pair */
+      table.intersector4_filter    = BVH4Quad4vIntersector4HybridMoeller();
+      table.intersector4_nofilter  = BVH4Quad4vIntersector4HybridMoellerNoFilter();
+      table.intersector8_filter    = BVH4Quad4vIntersector8HybridMoeller();
+      table.intersector8_nofilter  = BVH4Quad4vIntersector8HybridMoellerNoFilter();
+      table.intersector16_filter   = BVH4Quad4vIntersector16HybridMoeller();
+      table.intersector16_nofilter = BVH4Quad4vIntersector16HybridMoellerNoFilter();
+      table.intersectorN_filter    = BVH4Quad4vIntersectorStreamMoeller();
+      table.intersectorN_nofilter  = BVH4Quad4vIntersectorStreamMoellerNoFilter();
+#endif
+      return table;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors table;
+      table.ptr           = bvh;
+      table.intersector1  = BVH4Quad4vIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4  = BVH4Quad4vIntersector4HybridPluecker();
+      table.intersector8  = BVH4Quad4vIntersector8HybridPluecker();
+      table.intersector16 = BVH4Quad4vIntersector16HybridPluecker();
+      table.intersectorN  = BVH4Quad4vIntersectorStreamPluecker();
+#endif
+      return table;
+    }
+
+    /* neither FAST nor ROBUST */
+    return Accel::Intersectors();
+  }
+
+  /*! Builds the intersector table for a BVH4 over Quad4i primitives.
+   *  FAST selects the Moeller intersector symbols, ROBUST the Pluecker ones.
+   *  The intersector4/8/16/N slots are assigned only when EMBREE_RAY_PACKETS is
+   *  defined. Any other variant returns a default-constructed table. */
+  Accel::Intersectors BVH4Factory::BVH4Quad4iIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors table;
+      table.ptr           = bvh;
+      table.intersector1  = BVH4Quad4iIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4  = BVH4Quad4iIntersector4HybridMoeller();
+      table.intersector8  = BVH4Quad4iIntersector8HybridMoeller();
+      table.intersector16 = BVH4Quad4iIntersector16HybridMoeller();
+      table.intersectorN  = BVH4Quad4iIntersectorStreamMoeller();
+#endif
+      return table;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors table;
+      table.ptr           = bvh;
+      table.intersector1  = BVH4Quad4iIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4  = BVH4Quad4iIntersector4HybridPluecker();
+      table.intersector8  = BVH4Quad4iIntersector8HybridPluecker();
+      table.intersector16 = BVH4Quad4iIntersector16HybridPluecker();
+      table.intersectorN  = BVH4Quad4iIntersectorStreamPluecker();
+#endif
+      return table;
+    }
+
+    /* neither FAST nor ROBUST */
+    return Accel::Intersectors();
+  }
+
+  /*! Builds the intersector table for a BVH4 over Quad4iMB primitives.
+   *  FAST selects the Moeller intersector symbols, ROBUST the Pluecker ones.
+   *  When EMBREE_RAY_PACKETS is defined the intersector4/8/16 slots are
+   *  assigned and intersectorN is set to the stream packet fallback. Any other
+   *  variant returns a default-constructed table. */
+  Accel::Intersectors BVH4Factory::BVH4Quad4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors table;
+      table.ptr           = bvh;
+      table.intersector1  = BVH4Quad4iMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4  = BVH4Quad4iMBIntersector4HybridMoeller();
+      table.intersector8  = BVH4Quad4iMBIntersector8HybridMoeller();
+      table.intersector16 = BVH4Quad4iMBIntersector16HybridMoeller();
+      table.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors table;
+      table.ptr           = bvh;
+      table.intersector1  = BVH4Quad4iMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4  = BVH4Quad4iMBIntersector4HybridPluecker();
+      table.intersector8  = BVH4Quad4iMBIntersector8HybridPluecker();
+      table.intersector16 = BVH4Quad4iMBIntersector16HybridPluecker();
+      table.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    /* neither FAST nor ROBUST */
+    return Accel::Intersectors();
+  }
+
+  /*! Builds the intersector table for a QBVH4 over Triangle4i primitives.
+   *  Only the BVH pointer and the intersector1 slot (Pluecker) are assigned. */
+  Accel::Intersectors BVH4Factory::QBVH4Triangle4iIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr          = bvh;
+    table.intersector1 = QBVH4Triangle4iIntersector1Pluecker();
+    return table;
+  }
+
+  /*! Builds the intersector table for a QBVH4 over Quad4i primitives.
+   *  Only the BVH pointer and the intersector1 slot (Pluecker) are assigned. */
+  Accel::Intersectors BVH4Factory::QBVH4Quad4iIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr          = bvh;
+    table.intersector1 = QBVH4Quad4iIntersector1Pluecker();
+    return table;
+  }
+
+  /*! Builds the intersector table for a BVH4 over user geometry from the
+   *  BVH4Virtual* intersector symbols. The intersector4/8/16/N slots are
+   *  assigned only when EMBREE_RAY_PACKETS is defined; the collider slot is
+   *  assigned regardless of that define. */
+  Accel::Intersectors BVH4Factory::BVH4UserGeometryIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr           = bvh;
+    table.intersector1  = BVH4VirtualIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    table.intersector4  = BVH4VirtualIntersector4Chunk();
+    table.intersector8  = BVH4VirtualIntersector8Chunk();
+    table.intersector16 = BVH4VirtualIntersector16Chunk();
+    table.intersectorN  = BVH4VirtualIntersectorStream();
+#endif
+    /* set independently of ray-packet support */
+    table.collider      = BVH4ColliderUserGeom();
+    return table;
+  }
+
+  /*! Builds the intersector table for a BVH4 over user geometry from the
+   *  BVH4VirtualMB* intersector symbols. When EMBREE_RAY_PACKETS is defined the
+   *  intersector4/8/16 slots are assigned and intersectorN is set to the stream
+   *  packet fallback. No collider is assigned here. */
+  Accel::Intersectors BVH4Factory::BVH4UserGeometryMBIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr           = bvh;
+    table.intersector1  = BVH4VirtualMBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    table.intersector4  = BVH4VirtualMBIntersector4Chunk();
+    table.intersector8  = BVH4VirtualMBIntersector8Chunk();
+    table.intersector16 = BVH4VirtualMBIntersector16Chunk();
+    table.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+    return table;
+  }
+
+  /*! Builds the intersector table for a BVH4 over instances from the
+   *  BVH4Instance* intersector symbols. The intersector4/8/16/N slots are
+   *  assigned only when EMBREE_RAY_PACKETS is defined. */
+  Accel::Intersectors BVH4Factory::BVH4InstanceIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr           = bvh;
+    table.intersector1  = BVH4InstanceIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    table.intersector4  = BVH4InstanceIntersector4Chunk();
+    table.intersector8  = BVH4InstanceIntersector8Chunk();
+    table.intersector16 = BVH4InstanceIntersector16Chunk();
+    table.intersectorN  = BVH4InstanceIntersectorStream();
+#endif
+    return table;
+  }
+
+  /*! Builds the intersector table for a BVH4 over instances from the
+   *  BVH4InstanceMB* intersector symbols. When EMBREE_RAY_PACKETS is defined the
+   *  intersector4/8/16 slots are assigned and intersectorN is set to the stream
+   *  packet fallback. */
+  Accel::Intersectors BVH4Factory::BVH4InstanceMBIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr           = bvh;
+    table.intersector1  = BVH4InstanceMBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    table.intersector4  = BVH4InstanceMBIntersector4Chunk();
+    table.intersector8  = BVH4InstanceMBIntersector8Chunk();
+    table.intersector16 = BVH4InstanceMBIntersector16Chunk();
+    table.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+    return table;
+  }
+  
+  /*! Builds the intersector table for a BVH4 over SubdivPatch1 primitives.
+   *  When EMBREE_RAY_PACKETS is defined the intersector4/8/16 slots are
+   *  assigned and intersectorN is set to the stream packet fallback. */
+  Accel::Intersectors BVH4Factory::BVH4SubdivPatch1Intersectors(BVH4* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr           = bvh;
+    table.intersector1  = BVH4SubdivPatch1Intersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    table.intersector4  = BVH4SubdivPatch1Intersector4();
+    table.intersector8  = BVH4SubdivPatch1Intersector8();
+    table.intersector16 = BVH4SubdivPatch1Intersector16();
+    table.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+    return table;
+  }
+
+  /*! Builds the intersector table for a BVH4 over SubdivPatch1 primitives from
+   *  the BVH4SubdivPatch1MB* intersector symbols. When EMBREE_RAY_PACKETS is
+   *  defined the intersector4/8/16 slots are assigned and intersectorN is set
+   *  to the stream packet fallback. */
+  Accel::Intersectors BVH4Factory::BVH4SubdivPatch1MBIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr           = bvh;
+    table.intersector1  = BVH4SubdivPatch1MBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    table.intersector4  = BVH4SubdivPatch1MBIntersector4();
+    table.intersector8  = BVH4SubdivPatch1MBIntersector8();
+    table.intersector16 = BVH4SubdivPatch1MBIntersector16();
+    table.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+    return table;
+  }
+
+  /*! Creates the acceleration structure for Curve4i primitives: a BVH4, its
+   *  virtual-curve intersector table for the requested variant, and the OBB
+   *  curve builder, wrapped in a new AccelInstance.
+   *
+   *  scene->device->hair_builder must be "default" or "sah"; both select the
+   *  same builder. Any other value raises RTC_ERROR_INVALID_ARGUMENT. */
+  Accel* BVH4Factory::BVH4OBBVirtualCurve4i(Scene* scene, IntersectVariant ivariant)
+  {
+    /* validate the builder name before allocating anything, so the error path
+       does not abandon a freshly allocated BVH4 */
+    const auto& builderName = scene->device->hair_builder;
+    if (builderName != "default" && builderName != "sah")
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+builderName+" for BVH4OBB<VirtualCurve4i>");
+
+    BVH4* accel = new BVH4(Curve4i::type,scene);
+    Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector4i(),ivariant);
+
+    /* "default" and "sah" map to the same builder */
+    Builder* builder = BVH4Curve4iBuilder_OBB_New(accel,scene,0);
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+#if defined(EMBREE_TARGET_SIMD8)
+  /*! Creates the acceleration structure for Curve8i primitives: a BVH4, its
+   *  virtual-curve intersector table for the requested variant, and the OBB
+   *  curve builder, wrapped in a new AccelInstance. Only compiled when
+   *  EMBREE_TARGET_SIMD8 is defined.
+   *
+   *  scene->device->hair_builder must be "default" or "sah"; both select the
+   *  same builder. Any other value raises RTC_ERROR_INVALID_ARGUMENT. */
+  Accel* BVH4Factory::BVH4OBBVirtualCurve8i(Scene* scene, IntersectVariant ivariant)
+  {
+    /* validate the builder name before allocating anything, so the error path
+       does not abandon a freshly allocated BVH4 */
+    const auto& builderName = scene->device->hair_builder;
+    if (builderName != "default" && builderName != "sah")
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+builderName+" for BVH4OBB<VirtualCurve8i>");
+
+    BVH4* accel = new BVH4(Curve8i::type,scene);
+    Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector8i(),ivariant);
+
+    /* "default" and "sah" map to the same builder */
+    Builder* builder = BVH4Curve8iBuilder_OBB_New(accel,scene,0);
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+#endif
+
+  /*! Creates the acceleration structure for Curve4v primitives: a BVH4, its
+   *  virtual-curve intersector table for the requested variant, and the OBB
+   *  curve builder, wrapped in a new AccelInstance.
+   *
+   *  scene->device->hair_builder must be "default" or "sah"; both select the
+   *  same builder. Any other value raises RTC_ERROR_INVALID_ARGUMENT. */
+  Accel* BVH4Factory::BVH4OBBVirtualCurve4v(Scene* scene, IntersectVariant ivariant)
+  {
+    /* validate the builder name before allocating anything, so the error path
+       does not abandon a freshly allocated BVH4 */
+    const auto& builderName = scene->device->hair_builder;
+    if (builderName != "default" && builderName != "sah")
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+builderName+" for BVH4OBB<VirtualCurve4v>");
+
+    BVH4* accel = new BVH4(Curve4v::type,scene);
+    Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector4v(),ivariant);
+
+    /* "default" and "sah" map to the same builder */
+    Builder* builder = BVH4Curve4vBuilder_OBB_New(accel,scene,0);
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  /*! Creates the acceleration structure for Curve4iMB primitives: a BVH4, its
+   *  "MB" virtual-curve intersector table for the requested variant, and the
+   *  OBB curve MB builder, wrapped in a new AccelInstance.
+   *
+   *  scene->device->hair_builder must be "default" or "sah"; both select the
+   *  same builder. Any other value raises RTC_ERROR_INVALID_ARGUMENT. */
+  Accel* BVH4Factory::BVH4OBBVirtualCurve4iMB(Scene* scene, IntersectVariant ivariant)
+  {
+    /* validate the builder name before allocating anything, so the error path
+       does not abandon a freshly allocated BVH4 */
+    const auto& builderName = scene->device->hair_builder;
+    if (builderName != "default" && builderName != "sah")
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+builderName+" for BVH4OBB<VirtualCurve4iMB>");
+
+    BVH4* accel = new BVH4(Curve4iMB::type,scene);
+    Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector4iMB(),ivariant);
+
+    /* "default" and "sah" map to the same builder */
+    Builder* builder = BVH4OBBCurve4iMBBuilder_OBB(accel,scene,0);
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+#if defined(EMBREE_TARGET_SIMD8)
+  /*! Creates the acceleration structure for Curve8iMB primitives: a BVH4, its
+   *  "MB" virtual-curve intersector table for the requested variant, and the
+   *  OBB curve MB builder, wrapped in a new AccelInstance. Only compiled when
+   *  EMBREE_TARGET_SIMD8 is defined.
+   *
+   *  scene->device->hair_builder must be "default" or "sah"; both select the
+   *  same builder. Any other value raises RTC_ERROR_INVALID_ARGUMENT. */
+  Accel* BVH4Factory::BVH4OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant)
+  {
+    /* validate the builder name before allocating anything, so the error path
+       does not abandon a freshly allocated BVH4 */
+    const auto& builderName = scene->device->hair_builder;
+    if (builderName != "default" && builderName != "sah")
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+builderName+" for BVH4OBB<VirtualCurve8iMB>");
+
+    BVH4* accel = new BVH4(Curve8iMB::type,scene);
+    Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector8iMB(), ivariant);
+
+    /* "default" and "sah" map to the same builder */
+    Builder* builder = BVH4OBBCurve8iMBBuilder_OBB(accel,scene,0);
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+#endif
+  
+  /*! Builds an AccelInstance around a new BVH4 of Triangle4 primitives; the traverser and the builder are looked up by name in the device settings. */
+  Accel* BVH4Factory::BVH4Triangle4(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* bvh = new BVH4(Triangle4::type,scene);
+    const auto& dev = scene->device;
+    Accel::Intersectors isect;
+    if      (dev->tri_traverser == "default") isect = BVH4Triangle4Intersectors(bvh,ivariant);
+    else if (dev->tri_traverser == "fast"   ) isect = BVH4Triangle4Intersectors(bvh,IntersectVariant::FAST);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+dev->tri_traverser+" for BVH4<Triangle4>");
+
+    Builder* builder = nullptr;
+    if (dev->tri_builder == "default") {
+      switch (bvariant) { /* the build variant only matters when no builder is named explicitly */
+      case BuildVariant::STATIC      : builder = BVH4Triangle4SceneBuilderSAH(bvh,scene,0); break;
+      case BuildVariant::DYNAMIC     : builder = BVH4BuilderTwoLevelTriangle4MeshSAH(bvh,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4SceneBuilderFastSpatialSAH(bvh,scene,0); break;
+      }
+    }
+    else if (dev->tri_builder == "sah"         ) builder = BVH4Triangle4SceneBuilderSAH(bvh,scene,0);
+    else if (dev->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4SceneBuilderFastSpatialSAH(bvh,scene,0);
+    else if (dev->tri_builder == "sah_presplit") builder = BVH4Triangle4SceneBuilderSAH(bvh,scene,MODE_HIGH_QUALITY);
+    else if (dev->tri_builder == "dynamic"     ) builder = BVH4BuilderTwoLevelTriangle4MeshSAH(bvh,scene,false);
+    else if (dev->tri_builder == "morton"      ) builder = BVH4BuilderTwoLevelTriangle4MeshSAH(bvh,scene,true);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+dev->tri_builder+" for BVH4<Triangle4>");
+    return new AccelInstance(bvh,builder,isect);
+  }
+
+  /*! Builds an AccelInstance around a new BVH4 of Triangle4v primitives; the traverser and the builder are looked up by name in the device settings, and both error messages name BVH4<Triangle4v>. */
+  Accel* BVH4Factory::BVH4Triangle4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* bvh = new BVH4(Triangle4v::type,scene);
+    const auto& dev = scene->device;
+    Accel::Intersectors isect;
+    if      (dev->tri_traverser == "default") isect = BVH4Triangle4vIntersectors(bvh,ivariant);
+    else if (dev->tri_traverser == "fast"   ) isect = BVH4Triangle4vIntersectors(bvh,IntersectVariant::FAST);
+    else if (dev->tri_traverser == "robust" ) isect = BVH4Triangle4vIntersectors(bvh,IntersectVariant::ROBUST);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+dev->tri_traverser+" for BVH4<Triangle4v>");
+
+    Builder* builder = nullptr;
+    if (dev->tri_builder == "default") {
+      switch (bvariant) { /* the build variant only matters when no builder is named explicitly */
+      case BuildVariant::STATIC      : builder = BVH4Triangle4vSceneBuilderSAH(bvh,scene,0); break;
+      case BuildVariant::DYNAMIC     : builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(bvh,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4vSceneBuilderFastSpatialSAH(bvh,scene,0); break;
+      }
+    }
+    else if (dev->tri_builder == "sah"         ) builder = BVH4Triangle4vSceneBuilderSAH(bvh,scene,0);
+    else if (dev->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4vSceneBuilderFastSpatialSAH(bvh,scene,0);
+    else if (dev->tri_builder == "sah_presplit") builder = BVH4Triangle4vSceneBuilderSAH(bvh,scene,MODE_HIGH_QUALITY);
+    else if (dev->tri_builder == "dynamic"     ) builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(bvh,scene,false);
+    else if (dev->tri_builder == "morton"      ) builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(bvh,scene,true);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+dev->tri_builder+" for BVH4<Triangle4v>");
+    return new AccelInstance(bvh,builder,isect);
+  }
+
+  /*! Builds an AccelInstance around a new BVH4 of Triangle4i primitives; the traverser and the builder are looked up by name in the device settings. */
+  Accel* BVH4Factory::BVH4Triangle4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* bvh = new BVH4(Triangle4i::type,scene);
+    const auto& dev = scene->device;
+    Accel::Intersectors isect;
+    if      (dev->tri_traverser == "default") isect = BVH4Triangle4iIntersectors(bvh,ivariant);
+    else if (dev->tri_traverser == "fast"   ) isect = BVH4Triangle4iIntersectors(bvh,IntersectVariant::FAST);
+    else if (dev->tri_traverser == "robust" ) isect = BVH4Triangle4iIntersectors(bvh,IntersectVariant::ROBUST);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+dev->tri_traverser+" for BVH4<Triangle4i>");
+
+    Builder* builder = nullptr;
+    if (dev->tri_builder == "default") {
+      switch (bvariant) { /* the build variant only matters when no builder is named explicitly */
+      case BuildVariant::STATIC      : builder = BVH4Triangle4iSceneBuilderSAH(bvh,scene,0); break;
+      case BuildVariant::DYNAMIC     : builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(bvh,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4iSceneBuilderFastSpatialSAH(bvh,scene,0); break;
+      }
+    }
+    else if (dev->tri_builder == "sah"         ) builder = BVH4Triangle4iSceneBuilderSAH(bvh,scene,0);
+    else if (dev->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4iSceneBuilderFastSpatialSAH(bvh,scene,0);
+    else if (dev->tri_builder == "sah_presplit") builder = BVH4Triangle4iSceneBuilderSAH(bvh,scene,MODE_HIGH_QUALITY);
+    else if (dev->tri_builder == "dynamic"     ) builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(bvh,scene,false);
+    else if (dev->tri_builder == "morton"      ) builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(bvh,scene,true);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+dev->tri_builder+" for BVH4<Triangle4i>");
+    return new AccelInstance(bvh,builder,isect);
+  }
+
+  /*! Builds an AccelInstance around a new BVH4 tagged with Triangle4i::type, using the *_mb traverser and builder settings. With "default", DYNAMIC and HIGH_QUALITY only hit assert(false), so the builder stays nullptr for them when assertions are compiled out. */
+  Accel* BVH4Factory::BVH4Triangle4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* bvh = new BVH4(Triangle4i::type,scene);
+    const auto& dev = scene->device;
+    Accel::Intersectors isect;
+    if      (dev->tri_traverser_mb == "default") isect = BVH4Triangle4iMBIntersectors(bvh,ivariant);
+    else if (dev->tri_traverser_mb == "fast"   ) isect = BVH4Triangle4iMBIntersectors(bvh,IntersectVariant::FAST);
+    else if (dev->tri_traverser_mb == "robust" ) isect = BVH4Triangle4iMBIntersectors(bvh,IntersectVariant::ROBUST);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+dev->tri_traverser_mb+" for BVH4<Triangle4iMB>");
+
+    Builder* builder = nullptr;
+    if (dev->tri_builder_mb == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH4Triangle4iMBSceneBuilderSAH(bvh,scene,0); break;
+      case BuildVariant::DYNAMIC     : assert(false); break; // FIXME: implement
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    else if (dev->tri_builder_mb == "internal_time_splits") builder = BVH4Triangle4iMBSceneBuilderSAH(bvh,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+dev->tri_builder_mb+" for BVH4<Triangle4iMB>");
+    return new AccelInstance(bvh,builder,isect);
+  }
+
+  /*! Builds an AccelInstance around a new BVH4 of Triangle4vMB primitives, using the *_mb traverser and builder settings. With "default", DYNAMIC and HIGH_QUALITY only hit assert(false), so the builder stays nullptr for them when assertions are compiled out. */
+  Accel* BVH4Factory::BVH4Triangle4vMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* bvh = new BVH4(Triangle4vMB::type,scene);
+    const auto& dev = scene->device;
+    Accel::Intersectors isect;
+    if      (dev->tri_traverser_mb == "default") isect = BVH4Triangle4vMBIntersectors(bvh,ivariant);
+    else if (dev->tri_traverser_mb == "fast"   ) isect = BVH4Triangle4vMBIntersectors(bvh,IntersectVariant::FAST);
+    else if (dev->tri_traverser_mb == "robust" ) isect = BVH4Triangle4vMBIntersectors(bvh,IntersectVariant::ROBUST);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+dev->tri_traverser_mb+" for BVH4<Triangle4vMB>");
+
+    Builder* builder = nullptr;
+    if (dev->tri_builder_mb == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH4Triangle4vMBSceneBuilderSAH(bvh,scene,0); break;
+      case BuildVariant::DYNAMIC     : assert(false); break; // FIXME: implement
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    else if (dev->tri_builder_mb == "internal_time_splits") builder = BVH4Triangle4vMBSceneBuilderSAH(bvh,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+dev->tri_builder_mb+" for BVH4<Triangle4vMB>");
+    return new AccelInstance(bvh,builder,isect);
+  }
+
+  /*! Builds an AccelInstance around a new BVH4 of Quad4v primitives; the builder is looked up by name in the device's quad_builder setting. */
+  Accel* BVH4Factory::BVH4Quad4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* bvh = new BVH4(Quad4v::type,scene);
+    Accel::Intersectors isect = BVH4Quad4vIntersectors(bvh,ivariant);
+    const auto& mode = scene->device->quad_builder;
+    Builder* builder = nullptr;
+    if (mode == "default") {
+      switch (bvariant) { /* the build variant only matters when no builder is named explicitly */
+      case BuildVariant::STATIC      : builder = BVH4Quad4vSceneBuilderSAH(bvh,scene,0); break;
+      case BuildVariant::DYNAMIC     : builder = BVH4BuilderTwoLevelQuadMeshSAH(bvh,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: builder = BVH4Quad4vSceneBuilderFastSpatialSAH(bvh,scene,0); break;
+      }
+    }
+    else if (mode == "sah"              ) builder = BVH4Quad4vSceneBuilderSAH(bvh,scene,0);
+    else if (mode == "sah_fast_spatial" ) builder = BVH4Quad4vSceneBuilderFastSpatialSAH(bvh,scene,0);
+    else if (mode == "dynamic"          ) builder = BVH4BuilderTwoLevelQuadMeshSAH(bvh,scene,false);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+mode+" for BVH4<Quad4v>");
+    return new AccelInstance(bvh,builder,isect);
+  }
+
+  /*! Builds an AccelInstance around a new BVH4 of Quad4i primitives. With "default", DYNAMIC and HIGH_QUALITY only hit assert(false), so the builder stays nullptr for them when assertions are compiled out. */
+  Accel* BVH4Factory::BVH4Quad4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* bvh = new BVH4(Quad4i::type,scene);
+    Accel::Intersectors isect = BVH4Quad4iIntersectors(bvh,ivariant);
+    const auto& mode = scene->device->quad_builder;
+    Builder* builder = nullptr;
+    if (mode == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH4Quad4iSceneBuilderSAH(bvh,scene,0); break;
+      case BuildVariant::DYNAMIC     : assert(false); break; // FIXME: implement
+      case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement
+      }
+    }
+    else if (mode == "sah") builder = BVH4Quad4iSceneBuilderSAH(bvh,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+mode+" for BVH4<Quad4i>");
+    return new AccelInstance(bvh,builder,isect);
+  }
+
+  /*! Builds an AccelInstance around a new BVH4 tagged with Quad4i::type, using the quad_builder_mb setting. With "default", DYNAMIC and HIGH_QUALITY only hit assert(false), so the builder stays nullptr for them when assertions are compiled out. */
+  Accel* BVH4Factory::BVH4Quad4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* bvh = new BVH4(Quad4i::type,scene);
+    Accel::Intersectors isect = BVH4Quad4iMBIntersectors(bvh,ivariant);
+    const auto& mode = scene->device->quad_builder_mb;
+    Builder* builder = nullptr;
+    if (mode == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH4Quad4iMBSceneBuilderSAH(bvh,scene,0); break;
+      case BuildVariant::DYNAMIC     : assert(false); break; // FIXME: implement
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    else if (mode == "sah") builder = BVH4Quad4iMBSceneBuilderSAH(bvh,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+mode+" for BVH4<Quad4iMB>");
+    return new AccelInstance(bvh,builder,isect);
+  }
+
+  Accel* BVH4Factory::BVH4QuantizedQuad4i(Scene* scene)
+  { /* fixed pairing of BVH4QuantizedQuad4iSceneBuilderSAH and QBVH4Quad4iIntersectors; this function reads no device setting */
+    BVH4* bvh = new BVH4(Quad4i::type,scene);
+    Builder* sahBuilder = BVH4QuantizedQuad4iSceneBuilderSAH(bvh,scene,0);
+    Accel::Intersectors isect = QBVH4Quad4iIntersectors(bvh);
+    return new AccelInstance(bvh,sahBuilder,isect);
+  }
+
+  Accel* BVH4Factory::BVH4QuantizedTriangle4i(Scene* scene)
+  { /* fixed pairing of BVH4QuantizedTriangle4iSceneBuilderSAH and QBVH4Triangle4iIntersectors; this function reads no device setting */
+    BVH4* bvh = new BVH4(Triangle4i::type,scene);
+    Builder* sahBuilder = BVH4QuantizedTriangle4iSceneBuilderSAH(bvh,scene,0);
+    Accel::Intersectors isect = QBVH4Triangle4iIntersectors(bvh);
+    return new AccelInstance(bvh,sahBuilder,isect);
+  }
+
+  Accel* BVH4Factory::BVH4SubdivPatch1(Scene* scene)
+  { /* fixed pairing of BVH4SubdivPatch1Intersectors and BVH4SubdivPatch1BuilderSAH; this function reads no device setting */
+    BVH4* bvh = new BVH4(SubdivPatch1::type,scene);
+    Accel::Intersectors isect = BVH4SubdivPatch1Intersectors(bvh);
+    Builder* sahBuilder = BVH4SubdivPatch1BuilderSAH(bvh,scene,0);
+    return new AccelInstance(bvh,sahBuilder,isect);
+  }
+
+  Accel* BVH4Factory::BVH4SubdivPatch1MB(Scene* scene)
+  { /* fixed pairing of BVH4SubdivPatch1MBIntersectors and BVH4SubdivPatch1MBBuilderSAH; the BVH4 is tagged with SubdivPatch1::type */
+    BVH4* bvh = new BVH4(SubdivPatch1::type,scene);
+    Accel::Intersectors isect = BVH4SubdivPatch1MBIntersectors(bvh);
+    Builder* sahBuilder = BVH4SubdivPatch1MBBuilderSAH(bvh,scene,0);
+    return new AccelInstance(bvh,sahBuilder,isect);
+  }
+
+  /*! Builds an AccelInstance around a new BVH4 of Object primitives, using the object_builder setting. With "default", HIGH_QUALITY only hits assert(false), so the builder stays nullptr for it when assertions are compiled out. */
+  Accel* BVH4Factory::BVH4UserGeometry(Scene* scene, BuildVariant bvariant)
+  {
+    BVH4* bvh = new BVH4(Object::type,scene);
+    Accel::Intersectors isect = BVH4UserGeometryIntersectors(bvh);
+    const auto& mode = scene->device->object_builder;
+    Builder* builder = nullptr;
+    if (mode == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH4VirtualSceneBuilderSAH(bvh,scene,0); break;
+      case BuildVariant::DYNAMIC     : builder = BVH4BuilderTwoLevelVirtualSAH(bvh,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    else if (mode == "sah") builder = BVH4VirtualSceneBuilderSAH(bvh,scene,0);
+    else if (mode == "dynamic") builder = BVH4BuilderTwoLevelVirtualSAH(bvh,scene,false);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+mode+" for BVH4<Object>");
+    return new AccelInstance(bvh,builder,isect);
+  }
+
+  Accel* BVH4Factory::BVH4UserGeometryMB(Scene* scene)
+  { /* fixed pairing of BVH4UserGeometryMBIntersectors and BVH4VirtualMBSceneBuilderSAH; this function reads no device setting */
+    BVH4* bvh = new BVH4(Object::type,scene);
+    Accel::Intersectors isect = BVH4UserGeometryMBIntersectors(bvh);
+    Builder* sahBuilder = BVH4VirtualMBSceneBuilderSAH(bvh,scene,0);
+    return new AccelInstance(bvh,sahBuilder,isect);
+  }
+
+  /*! Builds an AccelInstance around a new BVH4 of InstancePrimitive primitives, using the object_builder setting; isExpensive picks the geometry mask handed to the builder. With "default", HIGH_QUALITY only hits assert(false), so the builder stays nullptr for it when assertions are compiled out. */
+  Accel* BVH4Factory::BVH4Instance(Scene* scene, bool isExpensive, BuildVariant bvariant)
+  {
+    BVH4* bvh = new BVH4(InstancePrimitive::type,scene);
+    Accel::Intersectors isect = BVH4InstanceIntersectors(bvh);
+    auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE_CHEAP;
+    const auto& mode = scene->device->object_builder;
+
+    Builder* builder = nullptr;
+    if (mode == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH4InstanceSceneBuilderSAH(bvh,scene,gtype); break;
+      case BuildVariant::DYNAMIC     : builder = BVH4BuilderTwoLevelInstanceSAH(bvh,scene,gtype,false); break;
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    else if (mode == "sah") builder = BVH4InstanceSceneBuilderSAH(bvh,scene,gtype);
+    else if (mode == "dynamic") builder = BVH4BuilderTwoLevelInstanceSAH(bvh,scene,gtype,false);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+mode+" for BVH4<Instance>"); /* was mislabelled BVH4<Object> */
+    return new AccelInstance(bvh,builder,isect);
+  }
+
+  Accel* BVH4Factory::BVH4InstanceMB(Scene* scene, bool isExpensive)
+  { /* fixed pairing of BVH4InstanceMBIntersectors and BVH4InstanceMBSceneBuilderSAH; isExpensive picks the geometry mask handed to the builder */
+    BVH4* bvh = new BVH4(InstancePrimitive::type,scene);
+    Accel::Intersectors isect = BVH4InstanceMBIntersectors(bvh);
+    auto kind = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE_CHEAP;
+    Builder* sahBuilder = BVH4InstanceMBSceneBuilderSAH(bvh,scene,kind);
+    return new AccelInstance(bvh,sahBuilder,isect);
+  }
+
+  /*! Assembles the intersector set for a grid BVH4.
+   *  IntersectVariant::FAST selects the Moeller entries; every other variant
+   *  falls through to the Pluecker entries. The intersector4/8/16/N entries
+   *  are assigned only when EMBREE_RAY_PACKETS is defined. */
+  Accel::Intersectors BVH4Factory::BVH4GridIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    const bool fast = (ivariant == IntersectVariant::FAST);
+
+    Accel::Intersectors result;
+    result.ptr = bvh;
+
+    /* intersector1 entry */
+    result.intersector1  = fast ? BVH4GridIntersector1Moeller() : BVH4GridIntersector1Pluecker();
+
+#if defined (EMBREE_RAY_PACKETS)
+    /* intersector4/8/16 entries follow the same Moeller/Pluecker choice */
+    result.intersector4  = fast ? BVH4GridIntersector4HybridMoeller()  : BVH4GridIntersector4HybridPluecker();
+    result.intersector8  = fast ? BVH4GridIntersector8HybridMoeller()  : BVH4GridIntersector8HybridPluecker();
+    result.intersector16 = fast ? BVH4GridIntersector16HybridMoeller() : BVH4GridIntersector16HybridPluecker();
+
+    /* the intersectorN entry is the same fallback for both choices */
+    result.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+
+    return result;
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4GridMBIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  { /* only Moeller entries are assigned here; ivariant is not consulted */
+    Accel::Intersectors result;
+    result.ptr = bvh;
+    result.intersector1  = BVH4GridMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+    result.intersector4  = BVH4GridMBIntersector4HybridMoeller();
+    result.intersector8  = BVH4GridMBIntersector8HybridMoeller();
+    result.intersector16 = BVH4GridMBIntersector16HybridMoeller();
+    result.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+    return result;
+  }
+
+  /*! Builds an AccelInstance around a new BVH4 of SubGridQBVH4 primitives; only the "default" grid builder is accepted and bvariant is not consulted. */
+  Accel* BVH4Factory::BVH4Grid(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* bvh = new BVH4(SubGridQBVH4::type,scene);
+    Accel::Intersectors isect = BVH4GridIntersectors(bvh,ivariant);
+    const auto& mode = scene->device->grid_builder;
+    Builder* builder = nullptr;
+    /* test the setting the error reports; object_builder was tested before, which made the message misleading */
+    if (mode == "default") builder = BVH4GridSceneBuilderSAH(bvh,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+mode+" for BVH4<GridMesh>");
+
+    return new AccelInstance(bvh,builder,isect);
+  }
+
+  /*! Builds an AccelInstance around a new BVH4 of SubGridQBVH4 primitives with the MB grid intersectors and builder; bvariant is not consulted. NOTE(review): the tested setting now matches the reported one (grid_builder) - confirm whether a dedicated MB grid setting should be used instead. */
+  Accel* BVH4Factory::BVH4GridMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* bvh = new BVH4(SubGridQBVH4::type,scene);
+    Accel::Intersectors isect = BVH4GridMBIntersectors(bvh,ivariant);
+    const auto& mode = scene->device->grid_builder;
+    Builder* builder = nullptr;
+    if (mode == "default") builder = BVH4GridMBSceneBuilderSAH(bvh,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+mode+" for BVH4MB<GridMesh>");
+    return new AccelInstance(bvh,builder,isect);
+  }
+
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h
new file mode 100644
index 0000000000..a68227b41f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h
@@ -0,0 +1,316 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_factory.h"
+
+namespace embree
+{
+  /*! BVH4 instantiations */
+  class BVH4Factory : public BVHFactory
+  {
+  public:
+    BVH4Factory(int bfeatures, int ifeatures);
+
+  public:
+    Accel* BVH4OBBVirtualCurve4i(Scene* scene, IntersectVariant ivariant);
+    Accel* BVH4OBBVirtualCurve4v(Scene* scene, IntersectVariant ivariant);
+    Accel* BVH4OBBVirtualCurve8i(Scene* scene, IntersectVariant ivariant);
+    Accel* BVH4OBBVirtualCurve4iMB(Scene* scene, IntersectVariant ivariant);
+    Accel* BVH4OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4i);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8i);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4v);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8v);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4iMB);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8iMB);
+        
+    Accel* BVH4Triangle4   (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH4Triangle4v  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::ROBUST);
+    Accel* BVH4Triangle4i  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH4Triangle4vMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH4Triangle4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+    Accel* BVH4Quad4v  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH4Quad4i  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH4Quad4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+    Accel* BVH4QuantizedTriangle4i(Scene* scene);
+    Accel* BVH4QuantizedQuad4i(Scene* scene);
+ 
+    Accel* BVH4SubdivPatch1(Scene* scene);
+    Accel* BVH4SubdivPatch1MB(Scene* scene);
+
+    Accel* BVH4UserGeometry(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC);
+    Accel* BVH4UserGeometryMB(Scene* scene);
+
+    Accel* BVH4Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC);
+    Accel* BVH4InstanceMB(Scene* scene, bool isExpensive);
+
+    Accel* BVH4Grid(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH4GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+  private:
+    void selectBuilders(int features);
+    void selectIntersectors(int features);
+    
+  private:
+    Accel::Intersectors BVH4OBBVirtualCurveIntersectors(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant);
+    Accel::Intersectors BVH4OBBVirtualCurveIntersectorsMB(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant);
+    
+    Accel::Intersectors BVH4Triangle4Intersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4Triangle4vIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4Triangle4iIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4Triangle4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4Triangle4vMBIntersectors(BVH4* bvh, IntersectVariant ivariant);
+
+    Accel::Intersectors BVH4Quad4vIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4Quad4iIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4Quad4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant);
+
+    Accel::Intersectors QBVH4Quad4iIntersectors(BVH4* bvh);
+    Accel::Intersectors QBVH4Triangle4iIntersectors(BVH4* bvh);
+
+    Accel::Intersectors BVH4UserGeometryIntersectors(BVH4* bvh);
+    Accel::Intersectors BVH4UserGeometryMBIntersectors(BVH4* bvh);
+
+    Accel::Intersectors BVH4InstanceIntersectors(BVH4* bvh);
+    Accel::Intersectors BVH4InstanceMBIntersectors(BVH4* bvh);
+    
+    Accel::Intersectors BVH4SubdivPatch1Intersectors(BVH4* bvh);
+    Accel::Intersectors BVH4SubdivPatch1MBIntersectors(BVH4* bvh);
+
+    Accel::Intersectors BVH4GridIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4GridMBIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    
+  private:
+
+    DEFINE_SYMBOL2(Accel::Collider,BVH4ColliderUserGeom);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1MB);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1MB);
+    
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4Intersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,QBVH4Triangle4iIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,QBVH4Quad4iIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1Intersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1MBIntersector1);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4VirtualIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4VirtualMBIntersector1);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1);
+        
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4HybridMB);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4HybridMB);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vIntersector4HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1Intersector4);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1MBIntersector4);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4VirtualIntersector4Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4VirtualMBIntersector4Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker);
+
+    // ==============
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8HybridMB);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8HybridMB);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vIntersector8HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1Intersector8);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1MBIntersector8);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4VirtualIntersector8Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4VirtualMBIntersector8Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker);
+
+    // ==============
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16HybridMB);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16HybridMB);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vIntersector16HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1Intersector16);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1MBIntersector16);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4VirtualIntersector16Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4VirtualMBIntersector16Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker);
+
+    // ==============
+
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4IntersectorStreamPacketFallback);
+
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4vIntersectorStreamPluecker);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamPluecker);
+
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamPluecker);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamPluecker);
+
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream);
+    
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream);
+       
+    // SAH scene builders
+  private:
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Curve4iBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4OBBCurve4iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Curve8iBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    
+    DEFINE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1BuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1MBBuilderSAH,void* COMMA Scene* COMMA size_t);
+    
+    DEFINE_ISA_FUNCTION(Builder*,BVH4VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+    DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+
+    DEFINE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+    // spatial scene builder
+  private:
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+    
+    // twolevel scene builders
+  private:
+    DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp
new file mode 100644
index 0000000000..9fe057c392
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp
@@ -0,0 +1,1165 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "../common/isa.h" // to define EMBREE_TARGET_SIMD8
+
+#if defined (EMBREE_TARGET_SIMD8)
+
+#include "bvh8_factory.h"
+#include "../bvh/bvh.h"
+
+#include "../geometry/curveNv.h"
+#include "../geometry/curveNi.h"
+#include "../geometry/curveNi_mb.h"
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/subdivpatch1.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+#include "../common/accelinstance.h"
+
+namespace embree
+{
+  DECLARE_SYMBOL2(Accel::Collider,BVH8ColliderUserGeom); // selected in the BVH8Factory constructor
+  // NOTE(review): DECLARE_ISA_FUNCTION args look like (return type, name, parameter list) -- confirm against the macro definition.
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8v,void);
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8iMB,void);
+  // Accel::Intersector1 symbols for BVH8 and QBVH8; BVH8Factory::selectIntersectors initializes each of them.
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1MB);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1MB);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4Intersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Woop);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4iIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4Intersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Quad4iIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8VirtualIntersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8VirtualMBIntersector1);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker);
+  // Accel::Intersector4 symbols for BVH8; selectIntersectors initializes them inside its EMBREE_RAY_PACKETS section.
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4HybridMB);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4HybridMB);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vIntersector4HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8VirtualIntersector4Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8VirtualMBIntersector4Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker);
+  // Accel::Intersector8 symbols for BVH8; selectIntersectors initializes them inside its EMBREE_RAY_PACKETS section.
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8HybridMB);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8HybridMB);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vIntersector8HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8VirtualIntersector8Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8VirtualMBIntersector8Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker);
+  // Accel::Intersector16 symbols for BVH8; see the "select intersectors16" section of BVH8Factory::selectIntersectors.
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16HybridMB);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16HybridMB);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vIntersector16HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8VirtualIntersector16Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8VirtualMBIntersector16Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker);
+  // Accel::IntersectorN symbols for BVH8 (stream intersectors, going by the names; their selection code is outside this chunk).
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream);
+  // Builder* ISA functions for BVH8; BVH8Factory::selectBuilders initializes each of them.
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Curve8vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  // The instance builders take a Geometry::GTypeMask as their last argument where the others take size_t.
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
+  // Constructs the factory: the collider and the intersectors are selected from ifeatures, the builders from bfeatures.
+  BVH8Factory::BVH8Factory(int bfeatures, int ifeatures)
+  {
+    SELECT_SYMBOL_INIT_AVX(ifeatures,BVH8ColliderUserGeom);
+    // The collider is selected directly above; everything else is delegated to the two select* member functions.
+    selectBuilders(bfeatures);
+    selectIntersectors(ifeatures);
+  }
+  // Initializes every BVH8 builder symbol from `features`, one SELECT_SYMBOL_INIT_* call per builder, each wrapped in an IF_ENABLED_<geometry> macro.
+  void BVH8Factory::selectBuilders(int features)
+  {
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH8Curve8vBuilder_OBB_New));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH8OBBCurve8iMBBuilder_OBB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4SceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4vSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4iSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4iMBSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4vMBSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedTriangle4iSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedTriangle4SceneBuilderSAH));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4vSceneBuilderSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4iSceneBuilderSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4iMBSceneBuilderSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedQuad4iSceneBuilderSAH));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX(features,BVH8VirtualSceneBuilderSAH));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX(features,BVH8VirtualMBSceneBuilderSAH));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceSceneBuilderSAH));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceMBSceneBuilderSAH));
+    
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridSceneBuilderSAH));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridMBSceneBuilderSAH));
+    /* spatial scene builders */
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4SceneBuilderFastSpatialSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4vSceneBuilderFastSpatialSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4vSceneBuilderFastSpatialSAH));
+    /* twolevel scene builders */
+    IF_ENABLED_TRIS  (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelTriangle4MeshSAH));
+    IF_ENABLED_TRIS  (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelTriangle4vMeshSAH));
+    IF_ENABLED_TRIS  (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelTriangle4iMeshSAH));
+    IF_ENABLED_QUADS (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelQuadMeshSAH));
+    IF_ENABLED_USER  (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelVirtualSAH));
+    IF_ENABLED_INSTANCE (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelInstanceSAH));
+  }
+
+  void BVH8Factory::selectIntersectors(int features)
+  {
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8v));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8iMB));
+    
+    /* select intersectors1 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector1));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector1MB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust1));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust1MB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4Intersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersector1Pluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector1Pluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersector1Woop));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector1Pluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector1Pluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector1Moeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector1Moeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector1Pluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector1Pluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector1Moeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector1Pluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,QBVH8Triangle4iIntersector1Pluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,QBVH8Triangle4Intersector1Moeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,QBVH8Quad4iIntersector1Pluecker));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8VirtualIntersector1));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8VirtualMBIntersector1));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8InstanceIntersector1));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8InstanceMBIntersector1));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector1Moeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridMBIntersector1Moeller))
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector1Pluecker));
+
+#if defined (EMBREE_RAY_PACKETS)
+
+    /* select intersectors4 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector4Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector4HybridMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust4Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust4HybridMB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector4HybridMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vIntersector4HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector4HybridPluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector4HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector4HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector4HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector4HybridMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector4HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector4HybridPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector4HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector4HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector4HybridPluecker));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualIntersector4Chunk));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualMBIntersector4Chunk));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceIntersector4Chunk));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceMBIntersector4Chunk));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector4HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector4HybridPluecker));
+
+    /* select intersectors8 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector8Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector8HybridMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust8Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust8HybridMB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector8HybridMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vIntersector8HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector8HybridPluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector8HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector8HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector8HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector8HybridMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector8HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector8HybridPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector8HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector8HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector8HybridPluecker));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualIntersector8Chunk));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualMBIntersector8Chunk));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceIntersector8Chunk));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceMBIntersector8Chunk));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector8HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector8HybridPluecker));
+
+    /* select intersectors16 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector16Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector16HybridMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust16Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust16HybridMB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4Intersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4Intersector16HybridMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersector16HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector16HybridPluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector16HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector16HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector16HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector16HybridMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector16HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector16HybridPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector16HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector16HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector16HybridPluecker));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8VirtualIntersector16Chunk));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8VirtualMBIntersector16Chunk));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8InstanceIntersector16Chunk));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8InstanceMBIntersector16Chunk));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8GridIntersector16HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8GridIntersector16HybridPluecker));
+
+    /* select stream intersectors */
+
+    SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8IntersectorStreamPacketFallback);
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4IntersectorStreamMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4IntersectorStreamMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersectorStreamMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersectorStreamPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersectorStreamPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersectorStreamMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersectorStreamMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersectorStreamMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersectorStreamPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersectorStreamPluecker));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8VirtualIntersectorStream));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8InstanceIntersectorStream));
+
+#endif
+  }
+
+  /* Assembles the intersector table for an OBB virtual-curve BVH8.
+   * `bvh` is stored as the table's ptr and `leafIntersector` as its leaf intersector.
+   * FAST picks the regular curve intersectors, ROBUST the "Robust" ones. The
+   * 4/8/16-wide and N slots are only filled when EMBREE_RAY_PACKETS is defined.
+   * Any other variant trips an assert and yields a default-constructed table. */
+  Accel::Intersectors BVH8Factory::BVH8OBBVirtualCurveIntersectors(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors fast;
+      fast.ptr = bvh;
+      fast.leafIntersector = leafIntersector;
+      fast.intersector1  = BVH8OBBVirtualCurveIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+      fast.intersector4  = BVH8OBBVirtualCurveIntersector4Hybrid();
+      fast.intersector8  = BVH8OBBVirtualCurveIntersector8Hybrid();
+      fast.intersector16 = BVH8OBBVirtualCurveIntersector16Hybrid();
+      fast.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return fast;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors robust;
+      robust.ptr = bvh;
+      robust.leafIntersector = leafIntersector;
+      robust.intersector1  = BVH8OBBVirtualCurveIntersectorRobust1();
+#if defined (EMBREE_RAY_PACKETS)
+      robust.intersector4  = BVH8OBBVirtualCurveIntersectorRobust4Hybrid();
+      robust.intersector8  = BVH8OBBVirtualCurveIntersectorRobust8Hybrid();
+      robust.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16Hybrid();
+      robust.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return robust;
+    }
+
+    /* neither FAST nor ROBUST: not supported here */
+    assert(false);
+    return Accel::Intersectors();
+  }
+
+  /* Same as BVH8OBBVirtualCurveIntersectors, but wires up the "MB"-suffixed
+   * curve intersectors. FAST and ROBUST are handled; any other variant asserts
+   * and returns a default-constructed table. Packet slots require
+   * EMBREE_RAY_PACKETS. */
+  Accel::Intersectors BVH8Factory::BVH8OBBVirtualCurveIntersectorsMB(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors fast;
+      fast.ptr = bvh;
+      fast.leafIntersector = leafIntersector;
+      fast.intersector1  = BVH8OBBVirtualCurveIntersector1MB();
+#if defined (EMBREE_RAY_PACKETS)
+      fast.intersector4  = BVH8OBBVirtualCurveIntersector4HybridMB();
+      fast.intersector8  = BVH8OBBVirtualCurveIntersector8HybridMB();
+      fast.intersector16 = BVH8OBBVirtualCurveIntersector16HybridMB();
+      fast.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return fast;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors robust;
+      robust.ptr = bvh;
+      robust.leafIntersector = leafIntersector;
+      robust.intersector1  = BVH8OBBVirtualCurveIntersectorRobust1MB();
+#if defined (EMBREE_RAY_PACKETS)
+      robust.intersector4  = BVH8OBBVirtualCurveIntersectorRobust4HybridMB();
+      robust.intersector8  = BVH8OBBVirtualCurveIntersectorRobust8HybridMB();
+      robust.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16HybridMB();
+      robust.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return robust;
+    }
+
+    /* neither FAST nor ROBUST: not supported here */
+    assert(false);
+    return Accel::Intersectors();
+  }
+
+  /* Intersector table for a BVH8 of Triangle4 primitives (Moeller intersectors).
+   * Only the FAST variant is accepted (checked by assert). With
+   * EMBREE_RAY_PACKETS the packet/stream slots are filled in separate
+   * filter and no-filter flavours. */
+  Accel::Intersectors BVH8Factory::BVH8Triangle4Intersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    assert(ivariant == IntersectVariant::FAST);
+
+    Accel::Intersectors table;
+    table.ptr = bvh;
+    table.intersector1           = BVH8Triangle4Intersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+    /* 4-wide */
+    table.intersector4_filter    = BVH8Triangle4Intersector4HybridMoeller();
+    table.intersector4_nofilter  = BVH8Triangle4Intersector4HybridMoellerNoFilter();
+    /* 8-wide */
+    table.intersector8_filter    = BVH8Triangle4Intersector8HybridMoeller();
+    table.intersector8_nofilter  = BVH8Triangle4Intersector8HybridMoellerNoFilter();
+    /* 16-wide */
+    table.intersector16_filter   = BVH8Triangle4Intersector16HybridMoeller();
+    table.intersector16_nofilter = BVH8Triangle4Intersector16HybridMoellerNoFilter();
+    /* stream */
+    table.intersectorN_filter    = BVH8Triangle4IntersectorStreamMoeller();
+    table.intersectorN_nofilter  = BVH8Triangle4IntersectorStreamMoellerNoFilter();
+#endif
+    return table;
+  }
+
+  /* Intersector table for a BVH8 of Triangle4v primitives.
+   * `ivariant` is not consulted; the Pluecker intersectors are always used
+   * unless ENABLE_WOOP_TEST is switched to a non-zero value, in which case the
+   * single-ray slot uses the Woop intersector instead. */
+  Accel::Intersectors BVH8Factory::BVH8Triangle4vIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    Accel::Intersectors table;
+    table.ptr = bvh;
+#define ENABLE_WOOP_TEST 0
+#if ENABLE_WOOP_TEST == 0
+    //assert(ivariant == IntersectVariant::ROBUST);
+    table.intersector1    = BVH8Triangle4vIntersector1Pluecker();
+#else
+    table.intersector1    = BVH8Triangle4vIntersector1Woop();
+#endif
+
+#if defined (EMBREE_RAY_PACKETS)
+    /* packet and stream slots */
+    table.intersector4    = BVH8Triangle4vIntersector4HybridPluecker();
+    table.intersector8    = BVH8Triangle4vIntersector8HybridPluecker();
+    table.intersector16   = BVH8Triangle4vIntersector16HybridPluecker();
+    table.intersectorN    = BVH8Triangle4vIntersectorStreamPluecker();
+#endif
+    return table;
+  }
+
+  /* Intersector table for a BVH8 of Triangle4i primitives.
+   * FAST -> Moeller intersectors, ROBUST -> Pluecker intersectors.
+   * Any other variant returns a default-constructed table. Packet and stream
+   * slots are only set when EMBREE_RAY_PACKETS is defined. */
+  Accel::Intersectors BVH8Factory::BVH8Triangle4iIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors fast;
+      fast.ptr = bvh;
+      fast.intersector1  = BVH8Triangle4iIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      fast.intersector4  = BVH8Triangle4iIntersector4HybridMoeller();
+      fast.intersector8  = BVH8Triangle4iIntersector8HybridMoeller();
+      fast.intersector16 = BVH8Triangle4iIntersector16HybridMoeller();
+      fast.intersectorN  = BVH8Triangle4iIntersectorStreamMoeller();
+#endif
+      return fast;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors robust;
+      robust.ptr = bvh;
+      robust.intersector1  = BVH8Triangle4iIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      robust.intersector4  = BVH8Triangle4iIntersector4HybridPluecker();
+      robust.intersector8  = BVH8Triangle4iIntersector8HybridPluecker();
+      robust.intersector16 = BVH8Triangle4iIntersector16HybridPluecker();
+      robust.intersectorN  = BVH8Triangle4iIntersectorStreamPluecker();
+#endif
+      return robust;
+    }
+
+    /* unhandled variant: empty table */
+    return Accel::Intersectors();
+  }
+
+  /* Intersector table for a BVH8 of Triangle4vMB primitives.
+   * FAST -> Moeller, ROBUST -> Pluecker; the N slot always uses
+   * BVH8IntersectorStreamPacketFallback. Any other variant returns a
+   * default-constructed table. */
+  Accel::Intersectors BVH8Factory::BVH8Triangle4vMBIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors fast;
+      fast.ptr = bvh;
+      fast.intersector1  = BVH8Triangle4vMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      fast.intersector4  = BVH8Triangle4vMBIntersector4HybridMoeller();
+      fast.intersector8  = BVH8Triangle4vMBIntersector8HybridMoeller();
+      fast.intersector16 = BVH8Triangle4vMBIntersector16HybridMoeller();
+      fast.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return fast;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors robust;
+      robust.ptr = bvh;
+      robust.intersector1  = BVH8Triangle4vMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      robust.intersector4  = BVH8Triangle4vMBIntersector4HybridPluecker();
+      robust.intersector8  = BVH8Triangle4vMBIntersector8HybridPluecker();
+      robust.intersector16 = BVH8Triangle4vMBIntersector16HybridPluecker();
+      robust.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return robust;
+    }
+
+    /* unhandled variant: empty table */
+    return Accel::Intersectors();
+  }
+
+  /* Intersector table for a BVH8 of Triangle4iMB primitives.
+   * FAST -> Moeller, ROBUST -> Pluecker; the N slot always uses
+   * BVH8IntersectorStreamPacketFallback. Any other variant returns a
+   * default-constructed table. */
+  Accel::Intersectors BVH8Factory::BVH8Triangle4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors fast;
+      fast.ptr = bvh;
+      fast.intersector1  = BVH8Triangle4iMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      fast.intersector4  = BVH8Triangle4iMBIntersector4HybridMoeller();
+      fast.intersector8  = BVH8Triangle4iMBIntersector8HybridMoeller();
+      fast.intersector16 = BVH8Triangle4iMBIntersector16HybridMoeller();
+      fast.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return fast;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors robust;
+      robust.ptr = bvh;
+      robust.intersector1  = BVH8Triangle4iMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      robust.intersector4  = BVH8Triangle4iMBIntersector4HybridPluecker();
+      robust.intersector8  = BVH8Triangle4iMBIntersector8HybridPluecker();
+      robust.intersector16 = BVH8Triangle4iMBIntersector16HybridPluecker();
+      robust.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return robust;
+    }
+
+    /* unhandled variant: empty table */
+    return Accel::Intersectors();
+  }
+
+  /* Intersector table for a BVH8 of Quad4v primitives.
+   * FAST uses the Moeller intersectors and fills the separate filter /
+   * no-filter packet slots; ROBUST uses the Pluecker intersectors and fills
+   * the plain packet slots. Any other variant returns a default-constructed
+   * table. */
+  Accel::Intersectors BVH8Factory::BVH8Quad4vIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors fast;
+      fast.ptr = bvh;
+      fast.intersector1           = BVH8Quad4vIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      fast.intersector4_filter    = BVH8Quad4vIntersector4HybridMoeller();
+      fast.intersector4_nofilter  = BVH8Quad4vIntersector4HybridMoellerNoFilter();
+      fast.intersector8_filter    = BVH8Quad4vIntersector8HybridMoeller();
+      fast.intersector8_nofilter  = BVH8Quad4vIntersector8HybridMoellerNoFilter();
+      fast.intersector16_filter   = BVH8Quad4vIntersector16HybridMoeller();
+      fast.intersector16_nofilter = BVH8Quad4vIntersector16HybridMoellerNoFilter();
+      fast.intersectorN_filter    = BVH8Quad4vIntersectorStreamMoeller();
+      fast.intersectorN_nofilter  = BVH8Quad4vIntersectorStreamMoellerNoFilter();
+#endif
+      return fast;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors robust;
+      robust.ptr = bvh;
+      robust.intersector1  = BVH8Quad4vIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      robust.intersector4  = BVH8Quad4vIntersector4HybridPluecker();
+      robust.intersector8  = BVH8Quad4vIntersector8HybridPluecker();
+      robust.intersector16 = BVH8Quad4vIntersector16HybridPluecker();
+      robust.intersectorN  = BVH8Quad4vIntersectorStreamPluecker();
+#endif
+      return robust;
+    }
+
+    /* unhandled variant: empty table */
+    return Accel::Intersectors();
+  }
+
+  /* Intersector table for a BVH8 of Quad4i primitives.
+   * FAST -> Moeller intersectors, ROBUST -> Pluecker intersectors.
+   * Any other variant returns a default-constructed table. */
+  Accel::Intersectors BVH8Factory::BVH8Quad4iIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors fast;
+      fast.ptr = bvh;
+      fast.intersector1  = BVH8Quad4iIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      fast.intersector4  = BVH8Quad4iIntersector4HybridMoeller();
+      fast.intersector8  = BVH8Quad4iIntersector8HybridMoeller();
+      fast.intersector16 = BVH8Quad4iIntersector16HybridMoeller();
+      fast.intersectorN  = BVH8Quad4iIntersectorStreamMoeller();
+#endif
+      return fast;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors robust;
+      robust.ptr = bvh;
+      robust.intersector1  = BVH8Quad4iIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      robust.intersector4  = BVH8Quad4iIntersector4HybridPluecker();
+      robust.intersector8  = BVH8Quad4iIntersector8HybridPluecker();
+      robust.intersector16 = BVH8Quad4iIntersector16HybridPluecker();
+      robust.intersectorN  = BVH8Quad4iIntersectorStreamPluecker();
+#endif
+      return robust;
+    }
+
+    /* unhandled variant: empty table */
+    return Accel::Intersectors();
+  }
+
+  /* Intersector table for a BVH8 of Quad4iMB primitives.
+   * FAST -> Moeller, ROBUST -> Pluecker; the N slot always uses
+   * BVH8IntersectorStreamPacketFallback. Any other variant returns a
+   * default-constructed table. */
+  Accel::Intersectors BVH8Factory::BVH8Quad4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors fast;
+      fast.ptr = bvh;
+      fast.intersector1  = BVH8Quad4iMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      fast.intersector4  = BVH8Quad4iMBIntersector4HybridMoeller();
+      fast.intersector8  = BVH8Quad4iMBIntersector8HybridMoeller();
+      fast.intersector16 = BVH8Quad4iMBIntersector16HybridMoeller();
+      fast.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return fast;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors robust;
+      robust.ptr = bvh;
+      robust.intersector1  = BVH8Quad4iMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      robust.intersector4  = BVH8Quad4iMBIntersector4HybridPluecker();
+      robust.intersector8  = BVH8Quad4iMBIntersector8HybridPluecker();
+      robust.intersector16 = BVH8Quad4iMBIntersector16HybridPluecker();
+      robust.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return robust;
+    }
+
+    /* unhandled variant: empty table */
+    return Accel::Intersectors();
+  }
+
+  /* Intersector table for the quantized BVH8 of Triangle4i primitives.
+   * Only the single-ray slot is populated (Pluecker). */
+  Accel::Intersectors BVH8Factory::QBVH8Triangle4iIntersectors(BVH8* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr          = bvh;
+    table.intersector1 = QBVH8Triangle4iIntersector1Pluecker();
+    return table;
+  }
+
+  /* Intersector table for the quantized BVH8 of Triangle4 primitives.
+   * Only the single-ray slot is populated (Moeller). */
+  Accel::Intersectors BVH8Factory::QBVH8Triangle4Intersectors(BVH8* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr          = bvh;
+    table.intersector1 = QBVH8Triangle4Intersector1Moeller();
+    return table;
+  }
+
+  /* Intersector table for the quantized BVH8 of Quad4i primitives.
+   * Only the single-ray slot is populated (Pluecker). */
+  Accel::Intersectors BVH8Factory::QBVH8Quad4iIntersectors(BVH8* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr          = bvh;
+    table.intersector1 = QBVH8Quad4iIntersector1Pluecker();
+    return table;
+  }
+
+  /* Intersector table for a BVH8 of user geometry ("Virtual" intersectors).
+   * Packet (Chunk) and stream slots are only set with EMBREE_RAY_PACKETS;
+   * the collider slot is always set from BVH8ColliderUserGeom. */
+  Accel::Intersectors BVH8Factory::BVH8UserGeometryIntersectors(BVH8* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr = bvh;
+    table.intersector1  = BVH8VirtualIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    /* packet and stream slots */
+    table.intersector4  = BVH8VirtualIntersector4Chunk();
+    table.intersector8  = BVH8VirtualIntersector8Chunk();
+    table.intersector16 = BVH8VirtualIntersector16Chunk();
+    table.intersectorN  = BVH8VirtualIntersectorStream();
+#endif
+    table.collider      = BVH8ColliderUserGeom();
+    return table;
+  }
+
+  /* Intersector table for a BVH8 of user geometry using the "VirtualMB"
+   * intersectors. The N slot uses BVH8IntersectorStreamPacketFallback.
+   * Unlike BVH8UserGeometryIntersectors, no collider is set here. */
+  Accel::Intersectors BVH8Factory::BVH8UserGeometryMBIntersectors(BVH8* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr = bvh;
+    table.intersector1  = BVH8VirtualMBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    /* packet and stream slots */
+    table.intersector4  = BVH8VirtualMBIntersector4Chunk();
+    table.intersector8  = BVH8VirtualMBIntersector8Chunk();
+    table.intersector16 = BVH8VirtualMBIntersector16Chunk();
+    table.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+    return table;
+  }
+
+  /* Intersector table for a BVH8 of instance primitives.
+   * Packet (Chunk) and stream slots are only set with EMBREE_RAY_PACKETS. */
+  Accel::Intersectors BVH8Factory::BVH8InstanceIntersectors(BVH8* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr = bvh;
+    table.intersector1  = BVH8InstanceIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    /* packet and stream slots */
+    table.intersector4  = BVH8InstanceIntersector4Chunk();
+    table.intersector8  = BVH8InstanceIntersector8Chunk();
+    table.intersector16 = BVH8InstanceIntersector16Chunk();
+    table.intersectorN  = BVH8InstanceIntersectorStream();
+#endif
+    return table;
+  }
+
+  /* Intersector table for a BVH8 of instance primitives using the
+   * "InstanceMB" intersectors. The N slot uses
+   * BVH8IntersectorStreamPacketFallback. */
+  Accel::Intersectors BVH8Factory::BVH8InstanceMBIntersectors(BVH8* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr = bvh;
+    table.intersector1  = BVH8InstanceMBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    /* packet and stream slots */
+    table.intersector4  = BVH8InstanceMBIntersector4Chunk();
+    table.intersector8  = BVH8InstanceMBIntersector8Chunk();
+    table.intersector16 = BVH8InstanceMBIntersector16Chunk();
+    table.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+    return table;
+  }
+
+  /* Creates a BVH8 for Curve8v primitives over `scene`, pairs it with the OBB
+   * curve intersectors for `ivariant` and the Curve8v OBB builder, and returns
+   * the three wrapped in a newly allocated AccelInstance. */
+  Accel* BVH8Factory::BVH8OBBVirtualCurve8v(Scene* scene, IntersectVariant ivariant)
+  {
+    BVH8* bvh = new BVH8(Curve8v::type,scene);
+    Accel::Intersectors isects = BVH8OBBVirtualCurveIntersectors(bvh,VirtualCurveIntersector8v(),ivariant);
+    Builder* bld = BVH8Curve8vBuilder_OBB_New(bvh,scene,0);
+    return new AccelInstance(bvh,bld,isects);
+  }
+
+  /* Creates a BVH8 for Curve8iMB primitives over `scene`, pairs it with the MB
+   * OBB curve intersectors for `ivariant` and the Curve8iMB OBB builder, and
+   * returns the three wrapped in a newly allocated AccelInstance. */
+  Accel* BVH8Factory::BVH8OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant)
+  {
+    BVH8* bvh = new BVH8(Curve8iMB::type,scene);
+    Accel::Intersectors isects = BVH8OBBVirtualCurveIntersectorsMB(bvh,VirtualCurveIntersector8iMB(),ivariant);
+    Builder* bld = BVH8OBBCurve8iMBBuilder_OBB(bvh,scene,0);
+    return new AccelInstance(bvh,bld,isects);
+  }
+
+  /* Creates a BVH8<Triangle4> acceleration structure for `scene`.
+   * The builder is chosen from the device's tri_builder setting:
+   *   "default"          -> picked by `bvariant` (STATIC / DYNAMIC / HIGH_QUALITY)
+   *   "sah"              -> SAH scene builder
+   *   "sah_fast_spatial" -> fast spatial SAH scene builder
+   *   "sah_presplit"     -> SAH scene builder with MODE_HIGH_QUALITY
+   *   "dynamic"/"morton" -> two-level mesh builder (flag false / true)
+   * Any other name is reported through throw_RTCError. */
+  Accel* BVH8Factory::BVH8Triangle4(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* bvh = new BVH8(Triangle4::type,scene);
+    Accel::Intersectors isects = BVH8Triangle4Intersectors(bvh,ivariant);
+    const auto& requested = scene->device->tri_builder;
+
+    Builder* bld = nullptr;
+    if (requested == "default")
+    {
+      /* an unrecognised bvariant leaves the builder null */
+      if      (bvariant == BuildVariant::STATIC      ) bld = BVH8Triangle4SceneBuilderSAH(bvh,scene,0);
+      else if (bvariant == BuildVariant::DYNAMIC     ) bld = BVH8BuilderTwoLevelTriangle4MeshSAH(bvh,scene,false);
+      else if (bvariant == BuildVariant::HIGH_QUALITY) bld = BVH8Triangle4SceneBuilderFastSpatialSAH(bvh,scene,0);
+    }
+    else if (requested == "sah")              { bld = BVH8Triangle4SceneBuilderSAH(bvh,scene,0); }
+    else if (requested == "sah_fast_spatial") { bld = BVH8Triangle4SceneBuilderFastSpatialSAH(bvh,scene,0); }
+    else if (requested == "sah_presplit")     { bld = BVH8Triangle4SceneBuilderSAH(bvh,scene,MODE_HIGH_QUALITY); }
+    else if (requested == "dynamic")          { bld = BVH8BuilderTwoLevelTriangle4MeshSAH(bvh,scene,false); }
+    else if (requested == "morton")           { bld = BVH8BuilderTwoLevelTriangle4MeshSAH(bvh,scene,true); }
+    else { throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+requested+" for BVH8<Triangle4>"); }
+
+    return new AccelInstance(bvh,bld,isects);
+  }
+
+  /* Creates a BVH8<Triangle4v> acceleration structure for `scene`.
+   * The builder is chosen from the device's tri_builder setting:
+   *   "default"          -> picked by `bvariant` (STATIC / DYNAMIC / HIGH_QUALITY)
+   *   "sah_fast_spatial" -> fast spatial SAH scene builder
+   * Any other name is reported through throw_RTCError. */
+  Accel* BVH8Factory::BVH8Triangle4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* accel = new BVH8(Triangle4v::type,scene);
+    Accel::Intersectors intersectors= BVH8Triangle4vIntersectors(accel,ivariant);
+    Builder* builder = nullptr;
+    if (scene->device->tri_builder == "default")  {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH8Triangle4vSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : builder = BVH8BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: builder = BVH8Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); break;
+      }
+    }
+    /* Must be the Triangle4v builder: this BVH is typed Triangle4v and uses the
+     * Triangle4v intersectors, matching the HIGH_QUALITY default path above.
+     * The Triangle4 builder was previously used here by mistake. */
+    else if (scene->device->tri_builder == "sah_fast_spatial")  builder = BVH8Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8<Triangle4v>");
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  /* Creates a BVH8<Triangle4i> acceleration structure for `scene`.
+   * Only the "default" tri_builder setting is accepted; within it STATIC and
+   * DYNAMIC select a builder, while HIGH_QUALITY is unimplemented (assert,
+   * builder stays null). Any other name is reported through throw_RTCError. */
+  Accel* BVH8Factory::BVH8Triangle4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* bvh = new BVH8(Triangle4i::type,scene);
+    Accel::Intersectors isects = BVH8Triangle4iIntersectors(bvh,ivariant);
+    const auto& requested = scene->device->tri_builder;
+
+    Builder* bld = nullptr;
+    if (requested == "default")
+    {
+      if      (bvariant == BuildVariant::STATIC      ) { bld = BVH8Triangle4iSceneBuilderSAH(bvh,scene,0); }
+      else if (bvariant == BuildVariant::DYNAMIC     ) { bld = BVH8BuilderTwoLevelTriangle4iMeshSAH(bvh,scene,false); }
+      else if (bvariant == BuildVariant::HIGH_QUALITY) { assert(false); } // FIXME: implement
+    }
+    else { throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+requested+" for BVH8<Triangle4i>"); }
+
+    return new AccelInstance(bvh,bld,isects);
+  }
+
+  /* Creates the BVH8 for Triangle4iMB for `scene`, driven by the device's
+   * tri_builder_mb setting: "default" (only STATIC is implemented; DYNAMIC
+   * and HIGH_QUALITY assert and leave the builder null) or
+   * "internal_time_splits". Any other name is reported through throw_RTCError.
+   * NOTE(review): the BVH is created with Triangle4i::type rather than an
+   * MB-specific type - confirm this is intended. */
+  Accel* BVH8Factory::BVH8Triangle4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* bvh = new BVH8(Triangle4i::type,scene);
+    Accel::Intersectors isects = BVH8Triangle4iMBIntersectors(bvh,ivariant);
+    const auto& requested = scene->device->tri_builder_mb;
+
+    Builder* bld = nullptr;
+    if (requested == "default") // FIXME: implement
+    {
+      if      (bvariant == BuildVariant::STATIC      ) { bld = BVH8Triangle4iMBSceneBuilderSAH(bvh,scene,0); }
+      else if (bvariant == BuildVariant::DYNAMIC     ) { assert(false); } // FIXME: implement
+      else if (bvariant == BuildVariant::HIGH_QUALITY) { assert(false); }
+    }
+    else if (requested == "internal_time_splits") { bld = BVH8Triangle4iMBSceneBuilderSAH(bvh,scene,0); }
+    else { throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+requested+" for BVH8<Triangle4iMB>"); }
+
+    return new AccelInstance(bvh,bld,isects);
+  }
+
+  /* Creates a BVH8<Triangle4vMB> acceleration structure for `scene`, driven by
+   * the device's tri_builder_mb setting: "default" (only STATIC is
+   * implemented; DYNAMIC and HIGH_QUALITY assert and leave the builder null)
+   * or "internal_time_splits". Any other name is reported through
+   * throw_RTCError. */
+  Accel* BVH8Factory::BVH8Triangle4vMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* bvh = new BVH8(Triangle4vMB::type,scene);
+    Accel::Intersectors isects = BVH8Triangle4vMBIntersectors(bvh,ivariant);
+    const auto& requested = scene->device->tri_builder_mb;
+
+    Builder* bld = nullptr;
+    if (requested == "default")
+    {
+      if      (bvariant == BuildVariant::STATIC      ) { bld = BVH8Triangle4vMBSceneBuilderSAH(bvh,scene,0); }
+      else if (bvariant == BuildVariant::DYNAMIC     ) { assert(false); } // FIXME: implement
+      else if (bvariant == BuildVariant::HIGH_QUALITY) { assert(false); }
+    }
+    else if (requested == "internal_time_splits") { bld = BVH8Triangle4vMBSceneBuilderSAH(bvh,scene,0); }
+    else { throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+requested+" for BVH8<Triangle4vMB>"); }
+
+    return new AccelInstance(bvh,bld,isects);
+  }
+
+  /* Creates the quantized BVH8 for Triangle4i primitives over `scene`, using
+   * the QBVH8 Triangle4i intersectors and the quantized SAH scene builder. */
+  Accel* BVH8Factory::BVH8QuantizedTriangle4i(Scene* scene)
+  {
+    BVH8* bvh = new BVH8(Triangle4i::type,scene);
+    Accel::Intersectors isects = QBVH8Triangle4iIntersectors(bvh);
+    Builder* bld = BVH8QuantizedTriangle4iSceneBuilderSAH(bvh,scene,0);
+    return new AccelInstance(bvh,bld,isects);
+  }
+
+  /* Creates the quantized BVH8 for Triangle4 primitives over `scene`, using
+   * the QBVH8 Triangle4 intersectors and the quantized SAH scene builder. */
+  Accel* BVH8Factory::BVH8QuantizedTriangle4(Scene* scene)
+  {
+    BVH8* bvh = new BVH8(Triangle4::type,scene);
+    Accel::Intersectors isects = QBVH8Triangle4Intersectors(bvh);
+    Builder* bld = BVH8QuantizedTriangle4SceneBuilderSAH(bvh,scene,0);
+    return new AccelInstance(bvh,bld,isects);
+  }
+
+  /* Creates a BVH8<Quad4v> acceleration structure for `scene`.
+   * The builder is chosen from the device's quad_builder setting:
+   *   "default"          -> picked by `bvariant` (STATIC / DYNAMIC / HIGH_QUALITY)
+   *   "dynamic"/"morton" -> two-level quad mesh builder (flag false / true)
+   *   "sah_fast_spatial" -> fast spatial SAH scene builder
+   * Any other name is reported through throw_RTCError. */
+  Accel* BVH8Factory::BVH8Quad4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* bvh = new BVH8(Quad4v::type,scene);
+    Accel::Intersectors isects = BVH8Quad4vIntersectors(bvh,ivariant);
+    const auto& requested = scene->device->quad_builder;
+
+    Builder* bld = nullptr;
+    if (requested == "default")
+    {
+      /* an unrecognised bvariant leaves the builder null */
+      if      (bvariant == BuildVariant::STATIC      ) bld = BVH8Quad4vSceneBuilderSAH(bvh,scene,0);
+      else if (bvariant == BuildVariant::DYNAMIC     ) bld = BVH8BuilderTwoLevelQuadMeshSAH(bvh,scene,false);
+      else if (bvariant == BuildVariant::HIGH_QUALITY) bld = BVH8Quad4vSceneBuilderFastSpatialSAH(bvh,scene,0);
+    }
+    else if (requested == "dynamic")          { bld = BVH8BuilderTwoLevelQuadMeshSAH(bvh,scene,false); }
+    else if (requested == "morton")           { bld = BVH8BuilderTwoLevelQuadMeshSAH(bvh,scene,true); }
+    else if (requested == "sah_fast_spatial") { bld = BVH8Quad4vSceneBuilderFastSpatialSAH(bvh,scene,0); }
+    else { throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+requested+" for BVH8<Quad4v>"); }
+
+    return new AccelInstance(bvh,bld,isects);
+  }
+
+  /* Creates a BVH8<Quad4i> acceleration structure for `scene`.
+   * Only the "default" quad_builder setting is accepted, and within it only
+   * STATIC selects a builder; DYNAMIC and HIGH_QUALITY are unimplemented
+   * (assert, builder stays null). Any other name is reported through
+   * throw_RTCError. */
+  Accel* BVH8Factory::BVH8Quad4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* bvh = new BVH8(Quad4i::type,scene);
+    Accel::Intersectors isects = BVH8Quad4iIntersectors(bvh,ivariant);
+    const auto& requested = scene->device->quad_builder;
+
+    Builder* bld = nullptr;
+    if (requested == "default")
+    {
+      if      (bvariant == BuildVariant::STATIC      ) { bld = BVH8Quad4iSceneBuilderSAH(bvh,scene,0); }
+      else if (bvariant == BuildVariant::DYNAMIC     ) { assert(false); } // FIXME: implement
+      else if (bvariant == BuildVariant::HIGH_QUALITY) { assert(false); } // FIXME: implement
+    }
+    else { throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+requested+" for BVH8<Quad4i>"); }
+
+    return new AccelInstance(bvh,bld,isects);
+  }
+
+  /* Creates the BVH8 for Quad4iMB for `scene`, driven by the device's
+   * quad_builder_mb setting. Only "default" is accepted, and within it only
+   * STATIC selects a builder; DYNAMIC and HIGH_QUALITY are unimplemented
+   * (assert, builder stays null). Any other name is reported through
+   * throw_RTCError.
+   * NOTE(review): the BVH is created with Quad4i::type rather than an
+   * MB-specific type - confirm this is intended. */
+  Accel* BVH8Factory::BVH8Quad4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* accel = new BVH8(Quad4i::type,scene);
+    Accel::Intersectors intersectors = BVH8Quad4iMBIntersectors(accel,ivariant);
+
+    Builder* builder = nullptr;
+    if (scene->device->quad_builder_mb == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH8Quad4iMBSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : assert(false); break; // FIXME: implement
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    /* name the MB accel in the message so it is distinguishable from BVH8Quad4i's error */
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder_mb+" for BVH8<Quad4iMB>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  /* Creates the quantized BVH8 for Quad4i primitives over `scene`.
+   * Only the "default" quad_builder setting is accepted; any other name is
+   * reported through throw_RTCError. */
+  Accel* BVH8Factory::BVH8QuantizedQuad4i(Scene* scene)
+  {
+    BVH8* bvh = new BVH8(Quad4i::type,scene);
+    Accel::Intersectors isects = QBVH8Quad4iIntersectors(bvh);
+    const auto& requested = scene->device->quad_builder;
+
+    Builder* bld = nullptr;
+    if (requested == "default") { bld = BVH8QuantizedQuad4iSceneBuilderSAH(bvh,scene,0); }
+    else { throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+requested+" for QBVH8<Quad4i>"); }
+
+    return new AccelInstance(bvh,bld,isects);
+  }
+
+  Accel* BVH8Factory::BVH8UserGeometry(Scene* scene, BuildVariant bvariant)
+  { /* creates a BVH8 over Object primitives and picks its builder from the object_builder setting */
+    BVH8* bvh = new BVH8(Object::type,scene);
+    Accel::Intersectors isects = BVH8UserGeometryIntersectors(bvh);
+    /* "sah" and "dynamic" name a builder directly; "default" derives it from bvariant */
+    const auto& builderName = scene->device->object_builder;
+    Builder* chosen = nullptr;
+    if (builderName == "sah") chosen = BVH8VirtualSceneBuilderSAH(bvh,scene,0);
+    else if (builderName == "dynamic") chosen = BVH8BuilderTwoLevelVirtualSAH(bvh,scene,false);
+    else if (builderName == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : chosen = BVH8VirtualSceneBuilderSAH(bvh,scene,0); break;
+      case BuildVariant::DYNAMIC     : chosen = BVH8BuilderTwoLevelVirtualSAH(bvh,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+builderName+" for BVH8<Object>");
+    return new AccelInstance(bvh,chosen,isects);
+  }
+
+  Accel* BVH8Factory::BVH8UserGeometryMB(Scene* scene)
+  { /* MB variant of the Object accel; the builder is always BVH8VirtualMBSceneBuilderSAH, no setting is consulted */
+    BVH8* bvh = new BVH8(Object::type,scene);
+    Accel::Intersectors isects = BVH8UserGeometryMBIntersectors(bvh);
+    Builder* sceneBuilder = BVH8VirtualMBSceneBuilderSAH(bvh,scene,0);
+    return new AccelInstance(bvh,sceneBuilder,isects);
+  }
+
+  Accel* BVH8Factory::BVH8Instance(Scene* scene, bool isExpensive, BuildVariant bvariant)
+  { /* creates a BVH8 over InstancePrimitive and picks its builder from the object_builder setting */
+    BVH8* bvh = new BVH8(InstancePrimitive::type,scene);
+    Accel::Intersectors isects = BVH8InstanceIntersectors(bvh);
+    auto mask = !isExpensive ? Geometry::MTY_INSTANCE : Geometry::MTY_INSTANCE_EXPENSIVE; // passed to whichever builder is selected
+
+    /* "sah" and "dynamic" name a builder directly; "default" derives it from bvariant */
+    const auto& builderName = scene->device->object_builder;
+    Builder* chosen = nullptr;
+    if (builderName == "sah") chosen = BVH8InstanceSceneBuilderSAH(bvh,scene,mask);
+    else if (builderName == "dynamic") chosen = BVH8BuilderTwoLevelInstanceSAH(bvh,scene,mask,false);
+    else if (builderName == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : chosen = BVH8InstanceSceneBuilderSAH(bvh,scene,mask); break;
+      case BuildVariant::DYNAMIC     : chosen = BVH8BuilderTwoLevelInstanceSAH(bvh,scene,mask,false); break;
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+builderName+" for BVH8<Object>");
+    return new AccelInstance(bvh,chosen,isects);
+  }
+
+  Accel* BVH8Factory::BVH8InstanceMB(Scene* scene, bool isExpensive)
+  { /* MB variant of the instance accel; the builder is always BVH8InstanceMBSceneBuilderSAH, no setting is consulted */
+    BVH8* bvh = new BVH8(InstancePrimitive::type,scene);
+    Accel::Intersectors isects = BVH8InstanceMBIntersectors(bvh);
+    auto mask = !isExpensive ? Geometry::MTY_INSTANCE : Geometry::MTY_INSTANCE_EXPENSIVE;
+    Builder* sceneBuilder = BVH8InstanceMBSceneBuilderSAH(bvh,scene,mask);
+    return new AccelInstance(bvh,sceneBuilder,isects);
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    /* fills an Intersectors record for the grid BVH8, choosing the kernel family from ivariant */
+    Accel::Intersectors result;
+    result.ptr = bvh;
+    if (ivariant != IntersectVariant::FAST) {
+      /* every variant other than FAST (i.e. ROBUST) uses the Pluecker kernels */
+      result.intersector1  = BVH8GridIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      result.intersector4  = BVH8GridIntersector4HybridPluecker();
+      result.intersector8  = BVH8GridIntersector8HybridPluecker();
+      result.intersector16 = BVH8GridIntersector16HybridPluecker();
+      result.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+    } else {
+      /* FAST uses the Moeller kernels */
+      result.intersector1  = BVH8GridIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      result.intersector4  = BVH8GridIntersector4HybridMoeller();
+      result.intersector8  = BVH8GridIntersector8HybridMoeller();
+      result.intersector16 = BVH8GridIntersector16HybridMoeller();
+      result.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+    }
+    return result;
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8GridMBIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  { /* ivariant is not consulted: only the Moeller Intersector1 kernel is set, the 4/8/16/N slots are nulled */
+    Accel::Intersectors result;
+    result.intersector1  = BVH8GridMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+    result.intersectorN  = nullptr;
+    result.intersector16 = nullptr;
+    result.intersector8  = nullptr;
+    result.intersector4  = nullptr;
+#endif
+    result.ptr = bvh;
+    return result;
+  }
+
+  Accel* BVH8Factory::BVH8Grid(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  { /* creates a BVH8 over SubGridQBVH8 primitives; bvariant is not consulted, only the "default" grid builder exists */
+    BVH8* accel = new BVH8(SubGridQBVH8::type,scene);
+    Accel::Intersectors intersectors = BVH8GridIntersectors(accel,ivariant);
+    Builder* builder = nullptr;
+    if (scene->device->grid_builder == "default") {
+      builder = BVH8GridSceneBuilderSAH(accel,scene,0);
+    }
+    /* the message names the setting tested above (grid_builder) and the BVH8 layout created here */
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->grid_builder+" for BVH8<GridMesh>");
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH8Factory::BVH8GridMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  { /* MB variant of BVH8Grid; bvariant is not consulted, only the "default" grid_builder_mb name is accepted */
+    BVH8* accel = new BVH8(SubGridQBVH8::type,scene);
+    Accel::Intersectors intersectors = BVH8GridMBIntersectors(accel,ivariant);
+    Builder* builder = nullptr;
+    if (scene->device->grid_builder_mb == "default") {
+      builder = BVH8GridMBSceneBuilderSAH(accel,scene,0);
+    } /* otherwise fail, naming the setting that was tested (grid_builder_mb) */
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->grid_builder_mb+" for BVH8MB<GridMesh>");
+    return new AccelInstance(accel,builder,intersectors);
+  }
+}
+
+#endif
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h
new file mode 100644
index 0000000000..b92188e7d3
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h
@@ -0,0 +1,280 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_factory.h"
+
+namespace embree
+{
+  /*! BVH8 instantiations */
+  class BVH8Factory : public BVHFactory
+  { // Factory for BVH8-based Accel objects; kernel and builder entry points are declared via the DEFINE_SYMBOL2 / DEFINE_ISA_FUNCTION macros below.
+  public:
+    BVH8Factory(int bfeatures, int ifeatures); // NOTE(review): presumably the feature masks handed to selectBuilders/selectIntersectors — confirm in bvh8_factory.cpp
+
+  public: // accel factory methods; where present, bvariant defaults to STATIC and ivariant to FAST
+    Accel* BVH8OBBVirtualCurve8v(Scene* scene, IntersectVariant ivariant);
+    Accel* BVH8OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8v);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8iMB);
+    
+    Accel* BVH8Triangle4   (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8Triangle4v  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8Triangle4i  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8Triangle4vMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8Triangle4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+    Accel* BVH8Quad4v  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8Quad4i  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8Quad4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+    Accel* BVH8QuantizedTriangle4i(Scene* scene);
+    Accel* BVH8QuantizedTriangle4(Scene* scene);
+    Accel* BVH8QuantizedQuad4i(Scene* scene);
+
+    Accel* BVH8UserGeometry(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC);
+    Accel* BVH8UserGeometryMB(Scene* scene);
+
+    Accel* BVH8Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC);
+    Accel* BVH8InstanceMB(Scene* scene, bool isExpensive);
+
+    Accel* BVH8Grid(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+  private: // NOTE(review): presumably these fill the symbol / function members declared below for a feature mask — confirm in bvh8_factory.cpp
+    void selectBuilders(int features);
+    void selectIntersectors(int features);
+
+  private: // helpers that assemble the Accel::Intersectors record handed to AccelInstance by the factory methods
+    Accel::Intersectors BVH8OBBVirtualCurveIntersectors(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant);
+    Accel::Intersectors BVH8OBBVirtualCurveIntersectorsMB(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant);
+    
+    Accel::Intersectors BVH8Triangle4Intersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8Triangle4vIntersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8Triangle4iIntersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8Triangle4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8Triangle4vMBIntersectors(BVH8* bvh, IntersectVariant ivariant);
+
+    Accel::Intersectors BVH8Quad4vIntersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8Quad4iIntersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8Quad4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant);
+
+    Accel::Intersectors QBVH8Triangle4iIntersectors(BVH8* bvh);
+    Accel::Intersectors QBVH8Triangle4Intersectors(BVH8* bvh);
+    Accel::Intersectors QBVH8Quad4iIntersectors(BVH8* bvh);
+
+    Accel::Intersectors BVH8UserGeometryIntersectors(BVH8* bvh);
+    Accel::Intersectors BVH8UserGeometryMBIntersectors(BVH8* bvh);
+
+    Accel::Intersectors BVH8InstanceIntersectors(BVH8* bvh);
+    Accel::Intersectors BVH8InstanceMBIntersectors(BVH8* bvh);
+
+    Accel::Intersectors BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8GridMBIntersectors(BVH8* bvh, IntersectVariant ivariant);
+
+  private: // kernel symbols, grouped by type: Accel::Collider, Intersector1, Intersector4, Intersector8, Intersector16, IntersectorN
+    DEFINE_SYMBOL2(Accel::Collider,BVH8ColliderUserGeom);
+    // Accel::Intersector1 symbols (bvh8_factory.cpp calls them without arguments and stores the result in Intersectors::intersector1)
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1MB);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1MB);
+    
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4Intersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Woop);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4iIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4Intersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Quad4iIntersector1Pluecker);
+    
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8VirtualIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8VirtualMBIntersector1);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker);
+    // Accel::Intersector4 symbols (the grid helpers in bvh8_factory.cpp only assign the 4/8/16/N slots under EMBREE_RAY_PACKETS)
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4HybridMB);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4HybridMB);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vIntersector4HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8VirtualIntersector4Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8VirtualMBIntersector4Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk);
+    
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker);
+    // Accel::Intersector8 symbols
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8HybridMB);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8HybridMB);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vIntersector8HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8VirtualIntersector8Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8VirtualMBIntersector8Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker);
+    // Accel::Intersector16 symbols
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16HybridMB);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16HybridMB);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vIntersector16HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8VirtualIntersector16Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8VirtualMBIntersector16Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk);
+   
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker);
+    // Accel::IntersectorN symbols
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback);
+
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker);
+
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker);
+
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream);
+    
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream);
+
+    // SAH scene builders (bvh8_factory.cpp invokes them as Name(accel,scene,arg) and stores the result in a Builder*)
+  private:
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Curve8vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+ 
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ 
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    
+    DEFINE_ISA_FUNCTION(Builder*,BVH8VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+    DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+
+    DEFINE_ISA_FUNCTION(Builder*,BVH8GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+    // SAH spatial scene builders
+  private:
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+
+    // twolevel scene builders (the Virtual and Instance ones serve BuildVariant::DYNAMIC and the "dynamic" builder name in bvh8_factory.cpp)
+  private:
+    DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp
new file mode 100644
index 0000000000..e832537ec5
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp
@@ -0,0 +1,60 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_builder.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N>
+    typename BVHN<N>::NodeRef BVHNBuilderVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings)
+    {
+      /* branching factor and maximal depth come from the BVH type and overwrite whatever the caller passed in settings */
+      settings.maxDepth = BVH::maxBuildDepthLeaf;
+      settings.branchingFactor = N;
+      /* leaves are produced by the virtual createLeaf of this builder */
+      auto makeLeaf = [this] (const PrimRef* leafPrims, const range<size_t>& leafRange, const Allocator& leafAlloc) -> NodeRef {
+        return this->createLeaf(leafPrims,leafRange,leafAlloc);
+      };
+      return BVHBuilderBinnedSAH::build<NodeRef>(FastAllocator::Create(allocator),typename BVH::AABBNode::Create2(),typename BVH::AABBNode::Set3(allocator,prims),makeLeaf,progressFunc,prims,pinfo,settings);
+    }
+
+
+    template<int N>
+    typename BVHN<N>::NodeRef BVHNBuilderQuantizedVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings)
+    {
+      /* branching factor and maximal depth come from the BVH type and overwrite whatever the caller passed in settings */
+      settings.maxDepth = BVH::maxBuildDepthLeaf;
+      settings.branchingFactor = N;
+      /* leaves are produced by the virtual createLeaf of this builder; inner nodes use the QuantizedNode create/set functors */
+      auto makeLeaf = [this] (const PrimRef* leafPrims, const range<size_t>& leafRange, const Allocator& leafAlloc) -> NodeRef {
+        return this->createLeaf(leafPrims,leafRange,leafAlloc);
+      };
+      return BVHBuilderBinnedSAH::build<NodeRef>(FastAllocator::Create(allocator),typename BVH::QuantizedNode::Create2(),typename BVH::QuantizedNode::Set2(),makeLeaf,progressFunc,prims,pinfo,settings);
+    }
+
+    template<int N>
+    typename BVHN<N>::NodeRecordMB BVHNBuilderMblurVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange)
+    {
+      /* branching factor and maximal depth come from the BVH type and overwrite whatever the caller passed in settings */
+      settings.maxDepth = BVH::maxBuildDepthLeaf;
+      settings.branchingFactor = N;
+      /* leaves are produced by the virtual createLeaf of this builder; timeRange is handed to the AABBNodeMB setter */
+      auto makeLeaf = [this] (const PrimRef* leafPrims, const range<size_t>& leafRange, const Allocator& leafAlloc) -> NodeRecordMB {
+        return this->createLeaf(leafPrims,leafRange,leafAlloc);
+      };
+      return BVHBuilderBinnedSAH::build<NodeRecordMB>(FastAllocator::Create(allocator),typename BVH::AABBNodeMB::Create(),typename BVH::AABBNodeMB::SetTimeRange(timeRange),makeLeaf,progressFunc,prims,pinfo,settings);
+    }
+
+    template struct BVHNBuilderVirtual<4>; // explicit instantiations: the BVHNBuilderV::build bodies are defined in this file
+    template struct BVHNBuilderQuantizedVirtual<4>;
+    template struct BVHNBuilderMblurVirtual<4>;    
+    /* the N=8 builders are only instantiated when __AVX__ is defined */
+#if defined(__AVX__)
+    template struct BVHNBuilderVirtual<8>;
+    template struct BVHNBuilderQuantizedVirtual<8>;
+    template struct BVHNBuilderMblurVirtual<8>;
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h
new file mode 100644
index 0000000000..1b86bb45ad
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h
@@ -0,0 +1,114 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "../builders/bvh_builder_sah.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+    template<int N>
+      struct BVHNBuilderVirtual
+      {
+        typedef BVHN<N> BVH;
+        typedef typename BVH::NodeRef NodeRef;
+        typedef FastAllocator::CachedAllocator Allocator;
+        /* Non-template core: build() is defined in bvh_builder.cpp and obtains leaves through the virtual createLeaf. */
+        struct BVHNBuilderV {
+          NodeRef build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings);
+          virtual NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0;
+        };
+        /* Adapter that implements createLeaf by forwarding to a caller-supplied functor stored by value. */
+        template<typename CreateLeafFunc>
+        struct BVHNBuilderT : public BVHNBuilderV
+        {
+          BVHNBuilderT (CreateLeafFunc createLeafFunc)
+            : createLeafFunc(createLeafFunc) {}
+          /* override: a signature drift from BVHNBuilderV::createLeaf now fails to compile instead of leaving the pure virtual unimplemented */
+          NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) override {
+            return createLeafFunc(prims,set,alloc);
+          }
+
+        private:
+          CreateLeafFunc createLeafFunc;
+        };
+        /* Entry point: wraps createLeaf in a temporary BVHNBuilderT and runs the shared build() on it. */
+        template<typename CreateLeafFunc>
+        static NodeRef build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) {
+          return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings);
+        }
+      };
+
+    template<int N>
+      struct BVHNBuilderQuantizedVirtual
+      {
+        typedef BVHN<N> BVH;
+        typedef typename BVH::NodeRef NodeRef;
+        typedef FastAllocator::CachedAllocator Allocator;
+        /* Non-template core: build() is defined in bvh_builder.cpp (using the QuantizedNode functors) and obtains leaves through the virtual createLeaf. */
+        struct BVHNBuilderV {
+          NodeRef build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings);
+          virtual NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0;
+        };
+        /* Adapter that implements createLeaf by forwarding to a caller-supplied functor stored by value. */
+        template<typename CreateLeafFunc>
+        struct BVHNBuilderT : public BVHNBuilderV
+        {
+          BVHNBuilderT (CreateLeafFunc createLeafFunc)
+            : createLeafFunc(createLeafFunc) {}
+          /* override: a signature drift from BVHNBuilderV::createLeaf now fails to compile instead of leaving the pure virtual unimplemented */
+          NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) override {
+            return createLeafFunc(prims,set,alloc);
+          }
+
+        private:
+          CreateLeafFunc createLeafFunc;
+        };
+        /* Entry point: wraps createLeaf in a temporary BVHNBuilderT and runs the shared build() on it. */
+        template<typename CreateLeafFunc>
+        static NodeRef build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) {
+          return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings);
+        }
+      };
+
+    template<int N>
+      struct BVHNBuilderMblurVirtual
+      {
+        typedef BVHN<N> BVH;
+        typedef typename BVH::AABBNodeMB AABBNodeMB;
+        typedef typename BVH::NodeRef NodeRef;
+        typedef typename BVH::NodeRecordMB NodeRecordMB;
+        typedef FastAllocator::CachedAllocator Allocator;
+        /* Non-template core: build() is defined in bvh_builder.cpp, takes an extra timeRange and obtains leaves through the virtual createLeaf. */
+        struct BVHNBuilderV {
+          NodeRecordMB build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange);
+          virtual NodeRecordMB createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0;
+        };
+        /* Adapter that implements createLeaf by forwarding to a caller-supplied functor stored by value. */
+        template<typename CreateLeafFunc>
+        struct BVHNBuilderT : public BVHNBuilderV
+        {
+          BVHNBuilderT (CreateLeafFunc createLeafFunc)
+            : createLeafFunc(createLeafFunc) {}
+          /* override: a signature drift from BVHNBuilderV::createLeaf now fails to compile instead of leaving the pure virtual unimplemented */
+          NodeRecordMB createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) override {
+            return createLeafFunc(prims,set,alloc);
+          }
+
+        private:
+          CreateLeafFunc createLeafFunc;
+        };
+        /* Entry point: wraps createLeaf in a temporary BVHNBuilderT and runs the shared build() on it. */
+        template<typename CreateLeafFunc>
+        static NodeRecordMB build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange) {
+          return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings,timeRange);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp
new file mode 100644
index 0000000000..64759c1294
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp
@@ -0,0 +1,531 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_statistics.h"
+#include "bvh_rotate.h"
+#include "../common/profile.h"
+#include "../../common/algorithms/parallel_prefix_sum.h"
+
+#include "../builders/primrefgen.h"
+#include "../builders/bvh_builder_morton.h"
+
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+
+#if defined(__X86_64__) || defined(__aarch64__)
+#  define ROTATE_TREE 1 // specifies number of tree rotation rounds to perform; a non-zero value also enables the "#if ROTATE_TREE" code paths in this file
+#else
+#  define ROTATE_TREE 0 // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues
+#endif
+
+namespace embree 
+{
+  namespace isa
+  {
+    template<int N>
+    struct SetBVHNBounds
+    {
+      using BVH = BVHN<N>;
+      using NodeRef = typename BVH::NodeRef;
+      using NodeRecord = typename BVH::NodeRecord;
+      using AABBNode = typename BVH::AABBNode;
+
+      BVH* bvh;
+      __forceinline SetBVHNBounds (BVH* bvh) : bvh(bvh) {}
+
+      __forceinline NodeRecord operator() (NodeRef ref, const NodeRecord* children, size_t num)
+      {
+        AABBNode* const inner = ref.getAABBNode();
+
+        BBox3fa merged = empty; /* union of the bounds of all children */
+        for (size_t c=0; c<num; ++c) {
+          const BBox3fa childBounds = children[c].bounds;
+          inner->setRef(c,children[c].ref);
+          inner->setBounds(c,childBounds);
+          merged.extend(childBounds);
+        }
+
+        BBox3fx result = (BBox3fx&)merged;
+#if ROTATE_TREE
+        if (N == 4)
+        {
+          /* sum the lower.a values of the children (the leaf creators in this file store their primitive count there) */
+          size_t total = 0;
+          for (size_t c=0; c<num; ++c)
+            total += children[c].bounds.lower.a;
+
+          if (total >= 4096) {
+            for (size_t c=0; c<num; ++c) {
+              if (!(children[c].bounds.lower.a < 4096)) continue; /* only children below the threshold get rotated and marked */
+              for (int round=0; round<ROTATE_TREE; ++round)
+                BVHNRotate<N>::rotate(inner->child(c));
+              inner->child(c).setBarrier();
+            }
+          }
+          result.lower.a = unsigned(total);
+        }
+#endif
+
+        return NodeRecord(ref,result);
+      }
+    };
+
+    template<int N, typename Primitive>
+    struct CreateMortonLeaf; // primary template is declared only; each supported Primitive type gets its own partial specialization
+
+    /* Morton leaf creator that packs up to four triangles of one mesh into a single Triangle4 block. */
+    template<int N>
+    struct CreateMortonLeaf<N,Triangle4>
+    {
+      using BVH = BVHN<N>;
+      using NodeRef = typename BVH::NodeRef;
+      using NodeRecord = typename BVH::NodeRecord;
+
+      __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) : mesh(mesh), morton(morton), geomID_(geomID) {}
+
+      __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc)
+      {
+        vfloat4 bmin(pos_inf);
+        vfloat4 bmax(neg_inf);
+        size_t items = current.size();
+        size_t first = current.begin();
+        assert(items<=4);
+
+        /* the leaf consists of exactly one Triangle4 block */
+        Triangle4* block = (Triangle4*) alloc.malloc1(sizeof(Triangle4),BVH::byteAlignment);
+        NodeRef ref = BVH::encodeLeaf((char*)block,1);
+        vuint4 vgeomID = -1, vprimID = -1;
+        Vec3vf4 v0 = zero, v1 = zero, v2 = zero;
+        const TriangleMesh* __restrict__ const geom = this->mesh;
+
+        for (size_t k=0; k<items; ++k)
+        {
+          const unsigned int primID = morton[first+k].index;
+          const TriangleMesh::Triangle& tri = geom->triangle(primID);
+          const Vec3fa& pa = geom->vertex(tri.v[0]);
+          const Vec3fa& pb = geom->vertex(tri.v[1]);
+          const Vec3fa& pc = geom->vertex(tri.v[2]);
+          bmin = min(bmin,(vfloat4)pa,(vfloat4)pb,(vfloat4)pc);
+          bmax = max(bmax,(vfloat4)pa,(vfloat4)pb,(vfloat4)pc);
+          vgeomID [k] = geomID_;
+          vprimID [k] = primID;
+          v0.x[k] = pa.x; v0.y[k] = pa.y; v0.z[k] = pa.z;
+          v1.x[k] = pb.x; v1.y[k] = pb.y; v1.z[k] = pb.z;
+          v2.x[k] = pc.x; v2.y[k] = pc.y; v2.z[k] = pc.z;
+        }
+
+        Triangle4::store_nt(block,Triangle4(v0,v1,v2,vgeomID,vprimID));
+        BBox3fx leafBounds = BBox3fx((Vec3fx)bmin,(Vec3fx)bmax);
+#if ROTATE_TREE
+        if (N == 4)
+          leafBounds.lower.a = unsigned(current.size());
+#endif
+        return NodeRecord(ref,leafBounds);
+      }
+
+    private:
+      TriangleMesh* mesh;
+      BVHBuilderMorton::BuildPrim* morton;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+    };
+    
+    /* Morton leaf creator that packs up to four triangles of one mesh into a single Triangle4v block. */
+    template<int N>
+    struct CreateMortonLeaf<N,Triangle4v>
+    {
+      using BVH = BVHN<N>;
+      using NodeRef = typename BVH::NodeRef;
+      using NodeRecord = typename BVH::NodeRecord;
+
+      __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) : mesh(mesh), morton(morton), geomID_(geomID) {}
+
+      __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc)
+      {
+        vfloat4 bmin(pos_inf);
+        vfloat4 bmax(neg_inf);
+        size_t items = current.size();
+        size_t first = current.begin();
+        assert(items<=4);
+
+        /* the leaf consists of exactly one Triangle4v block */
+        Triangle4v* block = (Triangle4v*) alloc.malloc1(sizeof(Triangle4v),BVH::byteAlignment);
+        NodeRef ref = BVH::encodeLeaf((char*)block,1);
+        vuint4 vgeomID = -1, vprimID = -1;
+        Vec3vf4 v0 = zero, v1 = zero, v2 = zero;
+        const TriangleMesh* __restrict__ geom = this->mesh;
+
+        for (size_t k=0; k<items; ++k)
+        {
+          const unsigned int primID = morton[first+k].index;
+          const TriangleMesh::Triangle& tri = geom->triangle(primID);
+          const Vec3fa& pa = geom->vertex(tri.v[0]);
+          const Vec3fa& pb = geom->vertex(tri.v[1]);
+          const Vec3fa& pc = geom->vertex(tri.v[2]);
+          bmin = min(bmin,(vfloat4)pa,(vfloat4)pb,(vfloat4)pc);
+          bmax = max(bmax,(vfloat4)pa,(vfloat4)pb,(vfloat4)pc);
+          vgeomID [k] = geomID_;
+          vprimID [k] = primID;
+          v0.x[k] = pa.x; v0.y[k] = pa.y; v0.z[k] = pa.z;
+          v1.x[k] = pb.x; v1.y[k] = pb.y; v1.z[k] = pb.z;
+          v2.x[k] = pc.x; v2.y[k] = pc.y; v2.z[k] = pc.z;
+        }
+        Triangle4v::store_nt(block,Triangle4v(v0,v1,v2,vgeomID,vprimID));
+        BBox3fx leafBounds = BBox3fx((Vec3fx)bmin,(Vec3fx)bmax);
+#if ROTATE_TREE
+        if (N == 4)
+          leafBounds.lower.a = current.size();
+#endif
+        return NodeRecord(ref,leafBounds);
+      }
+    private:
+      TriangleMesh* mesh;
+      BVHBuilderMorton::BuildPrim* morton;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+    };
+
+    /* Morton leaf creator that stores up to four triangles of one mesh as vertex offsets in a single Triangle4i block. */
+    template<int N>
+    struct CreateMortonLeaf<N,Triangle4i>
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+
+      __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) : mesh(mesh), morton(morton), geomID_(geomID) {}
+      
+      __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc)
+      {
+        vfloat4 lower(pos_inf);
+        vfloat4 upper(neg_inf);
+        size_t items = current.size();
+        size_t start = current.begin();
+        assert(items<=4);
+        
+        /* allocate leaf node: exactly one Triangle4i block */
+        Triangle4i* accel = (Triangle4i*) alloc.malloc1(sizeof(Triangle4i),BVH::byteAlignment);
+        NodeRef ref = BVH::encodeLeaf((char*)accel,1);
+        
+        vuint4 v0 = zero, v1 = zero, v2 = zero;
+        vuint4 vgeomID = -1, vprimID = -1;
+        const TriangleMesh* __restrict__ const mesh = this->mesh;
+        /* getStride()/4 does not depend on the primitive, so it is computed once (presumably the stride in 32-bit elements -- confirm) */
+        const unsigned int int_stride = mesh->vertices0.getStride()/4;
+        for (size_t i=0; i<items; i++)
+        {
+          const unsigned int primID = morton[start+i].index;
+          const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+          const Vec3fa& p0 = mesh->vertex(tri.v[0]);
+          const Vec3fa& p1 = mesh->vertex(tri.v[1]);
+          const Vec3fa& p2 = mesh->vertex(tri.v[2]);
+          lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+          upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+          vgeomID[i] = geomID_;
+          vprimID[i] = primID;
+          v0[i] = tri.v[0] * int_stride; 
+          v1[i] = tri.v[1] * int_stride;
+          v2[i] = tri.v[2] * int_stride;
+        }
+        /* unused lanes: repeat the geomID of lane 0, mark the primID as -1 and zero the vertex offsets */
+        for (size_t i=items; i<4; i++)
+        {
+          vgeomID[i] = vgeomID[0];
+          vprimID[i] = -1;
+          v0[i] = 0;
+          v1[i] = 0; 
+          v2[i] = 0;
+        }
+        Triangle4i::store_nt(accel,Triangle4i(v0,v1,v2,vgeomID,vprimID));
+        BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper);
+#if ROTATE_TREE
+        if (N == 4)
+          box_o.lower.a = current.size();
+#endif
+        return NodeRecord(ref,box_o);
+      }
+    private:
+      TriangleMesh* mesh;
+      BVHBuilderMorton::BuildPrim* morton;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+    };
+
+    /* Morton leaf creator that packs up to four quads of one mesh into a single Quad4v block. */
+    template<int N>
+    struct CreateMortonLeaf<N,Quad4v>
+    {
+      using BVH = BVHN<N>;
+      using NodeRef = typename BVH::NodeRef;
+      using NodeRecord = typename BVH::NodeRecord;
+
+      __forceinline CreateMortonLeaf (QuadMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) : mesh(mesh), morton(morton), geomID_(geomID) {}
+
+      __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc)
+      {
+        vfloat4 bmin(pos_inf);
+        vfloat4 bmax(neg_inf);
+        size_t items = current.size();
+        size_t first = current.begin();
+        assert(items<=4);
+
+        /* the leaf consists of exactly one Quad4v block */
+        Quad4v* block = (Quad4v*) alloc.malloc1(sizeof(Quad4v),BVH::byteAlignment);
+        NodeRef ref = BVH::encodeLeaf((char*)block,1);
+
+        vuint4 vgeomID = -1, vprimID = -1;
+        Vec3vf4 v0 = zero, v1 = zero, v2 = zero, v3 = zero;
+        const QuadMesh* __restrict__ geom = this->mesh;
+
+        for (size_t k=0; k<items; ++k)
+        {
+          const unsigned int primID = morton[first+k].index;
+          const QuadMesh::Quad& quad = geom->quad(primID);
+          const Vec3fa& pa = geom->vertex(quad.v[0]);
+          const Vec3fa& pb = geom->vertex(quad.v[1]);
+          const Vec3fa& pc = geom->vertex(quad.v[2]);
+          const Vec3fa& pd = geom->vertex(quad.v[3]);
+          bmin = min(bmin,(vfloat4)pa,(vfloat4)pb,(vfloat4)pc,(vfloat4)pd);
+          bmax = max(bmax,(vfloat4)pa,(vfloat4)pb,(vfloat4)pc,(vfloat4)pd);
+          vgeomID [k] = geomID_;
+          vprimID [k] = primID;
+          v0.x[k] = pa.x; v0.y[k] = pa.y; v0.z[k] = pa.z;
+          v1.x[k] = pb.x; v1.y[k] = pb.y; v1.z[k] = pb.z;
+          v2.x[k] = pc.x; v2.y[k] = pc.y; v2.z[k] = pc.z;
+          v3.x[k] = pd.x; v3.y[k] = pd.y; v3.z[k] = pd.z;
+        }
+        Quad4v::store_nt(block,Quad4v(v0,v1,v2,v3,vgeomID,vprimID));
+        BBox3fx leafBounds = BBox3fx((Vec3fx)bmin,(Vec3fx)bmax);
+#if ROTATE_TREE
+        if (N == 4)
+          leafBounds.lower.a = current.size();
+#endif
+        return NodeRecord(ref,leafBounds);
+      }
+    private:
+      QuadMesh* mesh;
+      BVHBuilderMorton::BuildPrim* morton;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+    };
+
+    template<int N>
+    struct CreateMortonLeaf<N,Object>
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+
+      __forceinline CreateMortonLeaf (UserGeometry* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+        : mesh(mesh), morton(morton), geomID_(geomID) {}
+      
+      __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc)
+      {
+        /* Creates one leaf with an Object entry per BuildPrim of `current` and returns it
+           together with the union of the bounds the geometry reports for those primitives. */
+        size_t items = current.size();
+        size_t start = current.begin();
+        
+        /* allocate leaf node: one Object per item */
+        Object* accel = (Object*) alloc.malloc1(items*sizeof(Object),BVH::byteAlignment);
+        NodeRef ref = BVH::encodeLeaf((char*)accel,items);
+        const UserGeometry* mesh = this->mesh;
+        
+        BBox3fa bounds = empty;
+        for (size_t i=0; i<items; i++)
+        {
+          /* the index stored in the BuildPrim is used directly as the primitive ID */
+          const unsigned int primID = morton[start+i].index;
+          bounds.extend(mesh->bounds(primID));
+          new (&accel[i]) Object(geomID_,primID);
+        }
+
+        BBox3fx box_o = (BBox3fx&)bounds;
+#if ROTATE_TREE
+        if (N == 4)
+          box_o.lower.a = current.size();
+#endif
+        return NodeRecord(ref,box_o);
+      }
+    private:
+      UserGeometry* mesh;
+      BVHBuilderMorton::BuildPrim* morton;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+    };
+
+    template<int N>
+    struct CreateMortonLeaf<N,InstancePrimitive>
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+
+      __forceinline CreateMortonLeaf (Instance* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+        : mesh(mesh), morton(morton), geomID_(geomID) {}
+      
+      __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc)
+      {
+        /* Creates a leaf with one InstancePrimitive per BuildPrim of `current` (at most one is expected)
+           and returns it together with the bounds the instance reports for the referenced primitive. */
+        size_t items = current.size();
+        size_t start = current.begin();
+        assert(items <= 1);
+        
+        /* allocate leaf node */
+        InstancePrimitive* accel = (InstancePrimitive*) alloc.malloc1(items*sizeof(InstancePrimitive),BVH::byteAlignment);
+        NodeRef ref = BVH::encodeLeaf((char*)accel,items);
+        const Instance* instance = this->mesh;
+        
+        BBox3fa bounds = empty;
+        for (size_t i=0; i<items; i++)
+        {
+          const unsigned int primID = morton[start+i].index; 
+          bounds.extend(instance->bounds(primID));
+          new (&accel[i]) InstancePrimitive(instance, geomID_);
+        }
+
+        BBox3fx box_o = (BBox3fx&)bounds;
+#if ROTATE_TREE
+        if (N == 4)
+          box_o.lower.a = current.size();
+#endif
+        return NodeRecord(ref,box_o);
+      }
+    private:
+      Instance* mesh;
+      BVHBuilderMorton::BuildPrim* morton;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+    };
+
+    /* functor returning the bounds the mesh reports for the primitive a BuildPrim refers to */
+    template<typename Mesh>
+    struct CalculateMeshBounds
+    {
+      __forceinline CalculateMeshBounds (Mesh* mesh) : mesh(mesh) {}
+
+      __forceinline const BBox3fa operator() (const BVHBuilderMorton::BuildPrim& morton) {
+        return this->mesh->bounds(morton.index);
+      }
+
+    private:
+      Mesh* mesh;
+    };
+
+    template<int N, typename Mesh, typename Primitive>
+    class BVHNMeshBuilderMorton : public Builder // builds the BVH of a single mesh through BVHBuilderMorton
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::AABBNode AABBNode;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+
+    public:
+      /* argument order matters for callers: (bvh, mesh, geomID, minLeafSize, maxLeafSize[, singleThreadThreshold]) */
+      BVHNMeshBuilderMorton (BVH* bvh, Mesh* mesh, unsigned int geomID, const size_t minLeafSize, const size_t maxLeafSize, const size_t singleThreadThreshold = DEFAULT_SINGLE_THREAD_THRESHOLD)
+        : bvh(bvh), mesh(mesh), morton(bvh->device,0), settings(N,BVH::maxBuildDepth,minLeafSize,min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks),singleThreadThreshold), geomID_(geomID) {} // maxLeafSize is clamped to Primitive::max_size()*BVH::maxLeafBlocks
+      
+      /* build function: (re)builds bvh from the current primitives of mesh */
+      void build() 
+      {
+        /* we reset the allocator when the mesh size changed */
+        if (mesh->numPrimitives != numPreviousPrimitives) {
+          bvh->alloc.clear();
+          morton.clear();
+        }
+        size_t numPrimitives = mesh->size();
+        numPreviousPrimitives = numPrimitives; // remembered for the size-change check above; the member is unsigned int, so the size_t value is narrowed
+        
+        /* skip build for empty scene */
+        if (numPrimitives == 0) {
+          bvh->set(BVH::emptyNode,empty,0);
+          return;
+        }
+        
+        /* preallocate arrays */
+        morton.resize(numPrimitives);
+        size_t bytesEstimated = numPrimitives*sizeof(AABBNode)/(4*N) + size_t(1.2f*Primitive::blocks(numPrimitives)*sizeof(Primitive)); // node estimate plus 1.2x the size of the leaf blocks
+        size_t bytesMortonCodes = numPrimitives*sizeof(BVHBuilderMorton::BuildPrim);
+        bytesEstimated = max(bytesEstimated,bytesMortonCodes); // the first allocation block is reused to sort the morton codes
+        bvh->alloc.init(bytesMortonCodes,bytesMortonCodes,bytesEstimated);
+
+        /* create morton code array */
+        BVHBuilderMorton::BuildPrim* dest = (BVHBuilderMorton::BuildPrim*) bvh->alloc.specialAlloc(bytesMortonCodes); // second BuildPrim buffer, handed to the builder next to morton.data()
+        size_t numPrimitivesGen = createMortonCodeArray<Mesh>(mesh,morton,bvh->scene->progressInterface); // count forwarded to the builder; presumably the number of entries actually generated -- confirm against createMortonCodeArray
+
+        /* create BVH */
+        SetBVHNBounds<N> setBounds(bvh);
+        CreateMortonLeaf<N,Primitive> createLeaf(mesh,geomID_,morton.data());
+        CalculateMeshBounds<Mesh> calculateBounds(mesh);
+        auto root = BVHBuilderMorton::build<NodeRecord>(
+          typename BVH::CreateAlloc(bvh), 
+          typename BVH::AABBNode::Create(),
+          setBounds,createLeaf,calculateBounds,bvh->scene->progressInterface,
+          morton.data(),dest,numPrimitivesGen,settings);
+        
+        bvh->set(root.ref,LBBox3fa(root.bounds),numPrimitives); // note: the BVH is registered with numPrimitives, not numPrimitivesGen
+        
+#if ROTATE_TREE
+        if (N == 4)
+        {
+          for (int i=0; i<ROTATE_TREE; i++)
+            BVHNRotate<N>::rotate(bvh->root);
+          bvh->clearBarrier(bvh->root); // done after the root rotations; SetBVHNBounds sets barriers on child references during the build
+        }
+#endif
+
+        /* clear temporary data for static geometry */
+        if (bvh->scene->isStaticAccel()) {
+          morton.clear();
+        }
+        bvh->cleanup();
+      }
+      
+      void clear() { // releases only the morton code array
+        morton.clear();
+      }
+      
+    private:
+      BVH* bvh;
+      Mesh* mesh;
+      mvector<BVHBuilderMorton::BuildPrim> morton; // morton code / primitive index records, kept between builds unless cleared
+      BVHBuilderMorton::Settings settings;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+      unsigned int numPreviousPrimitives = 0; // primitive count seen by the previous build()
+    };
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE) // morton builder factories for triangle meshes: min and max leaf size are both 4, 'mode' is not used
+    Builder* BVH4Triangle4MeshBuilderMortonGeneral  (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4> (static_cast<BVH4*>(bvh),mesh,geomID,4,4); }
+    Builder* BVH4Triangle4vMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4v>(static_cast<BVH4*>(bvh),mesh,geomID,4,4); }
+    Builder* BVH4Triangle4iMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4i>(static_cast<BVH4*>(bvh),mesh,geomID,4,4); }
+#if defined(__AVX__)
+    Builder* BVH8Triangle4MeshBuilderMortonGeneral  (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4> (static_cast<BVH8*>(bvh),mesh,geomID,4,4); }
+    Builder* BVH8Triangle4vMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4v>(static_cast<BVH8*>(bvh),mesh,geomID,4,4); }
+    Builder* BVH8Triangle4iMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4i>(static_cast<BVH8*>(bvh),mesh,geomID,4,4); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD) // morton builder factories for quad meshes: min and max leaf size are both 4, 'mode' is not used
+    Builder* BVH4Quad4vMeshBuilderMortonGeneral (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,QuadMesh,Quad4v>(static_cast<BVH4*>(bvh),mesh,geomID,4,4); }
+#if defined(__AVX__)
+    Builder* BVH8Quad4vMeshBuilderMortonGeneral (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,QuadMesh,Quad4v>(static_cast<BVH8*>(bvh),mesh,geomID,4,4); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER) // morton builder factories for user geometries: leaf sizes from 1 to BVH4::maxLeafBlocks, 'mode' is not used
+    Builder* BVH4VirtualMeshBuilderMortonGeneral (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,UserGeometry,Object>(static_cast<BVH4*>(bvh),mesh,geomID,1,BVH4::maxLeafBlocks); }
+#if defined(__AVX__)
+    Builder* BVH8VirtualMeshBuilderMortonGeneral (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,UserGeometry,Object>(static_cast<BVH8*>(bvh),mesh,geomID,1,BVH4::maxLeafBlocks); }    
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE) // morton builder factories for instances; BVHNMeshBuilderMorton takes (bvh,mesh,geomID,minLeafSize,maxLeafSize), so 'gtype' (like 'mode') is accepted but must not be forwarded
+    Builder* BVH4InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,Instance,InstancePrimitive>((BVH4*)bvh,mesh,geomID,1,1); }
+#if defined(__AVX__) // leaf size is fixed to 1 because CreateMortonLeaf<N,InstancePrimitive> asserts items <= 1
+    Builder* BVH8InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,Instance,InstancePrimitive>((BVH8*)bvh,mesh,geomID,1,1); }    
+#endif
+#endif
+
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp
new file mode 100644
index 0000000000..cf5b2eb47f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp
@@ -0,0 +1,640 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_builder.h"
+#include "../builders/primrefgen.h"
+#include "../builders/splitter.h"
+
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+
+#include "../common/state.h"
+#include "../../common/algorithms/parallel_for_for.h"
+#include "../../common/algorithms/parallel_for_for_prefix_sum.h"
+
+#define PROFILE 0 // when non-zero, the build bodies in this file are wrapped in a profile(...) call
+#define PROFILE_RUNS 20 // passed as the second argument of profile(...) when PROFILE is enabled
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N, typename Primitive>
+    struct CreateLeaf
+    {
+      using BVH = BVHN<N>;
+      using NodeRef = typename BVH::NodeRef;
+
+      __forceinline CreateLeaf (BVH* bvh) : bvh(bvh) {}
+
+      __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+      {
+        size_t cursor = set.begin(); /* handed to every fill() call together with set.end() */
+        const size_t numBlocks = Primitive::blocks(set.size());
+        /* all primitive blocks of the leaf come from one allocation */
+        Primitive* const leafMem = (Primitive*) alloc.malloc1(numBlocks*sizeof(Primitive),BVH::byteAlignment);
+        NodeRef leaf = BVH::encodeLeaf((char*)leafMem,numBlocks);
+        for (size_t b=0; b<numBlocks; ++b) {
+          leafMem[b].fill(prims,cursor,set.end(),bvh->scene);
+        }
+        return leaf;
+      }
+
+      BVH* bvh;
+    };
+
+
+    template<int N, typename Primitive>
+    struct CreateLeafQuantized
+    {
+      using BVH = BVHN<N>;
+      using NodeRef = typename BVH::NodeRef;
+
+      __forceinline CreateLeafQuantized (BVH* bvh) : bvh(bvh) {}
+
+      __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+      {
+        size_t cursor = set.begin(); /* handed to every fill() call together with set.end() */
+        const size_t numBlocks = Primitive::blocks(set.size());
+        /* all primitive blocks of the leaf come from one allocation */
+        Primitive* const leafMem = (Primitive*) alloc.malloc1(numBlocks*sizeof(Primitive),BVH::byteAlignment);
+        NodeRef leaf = BVH::encodeLeaf((char*)leafMem,numBlocks);
+        for (size_t b=0; b<numBlocks; ++b) {
+          leafMem[b].fill(prims,cursor,set.end(),bvh->scene);
+        }
+        return leaf;
+      }
+
+      BVH* bvh;
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+    template<int N, typename Primitive>
+    struct BVHNBuilderSAH : public Builder // builds a BVHN either over a whole scene (filtered by gtype_) or over a single mesh
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVHN<N>::NodeRef NodeRef;
+
+      BVH* bvh;
+      Scene* scene; // non-null only when constructed for a scene build
+      Geometry* mesh; // non-null only when constructed for a single-mesh build
+      mvector<PrimRef> prims;
+      GeneralBVHBuilder::Settings settings;
+      Geometry::GTypeMask gtype_;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max ();
+      bool primrefarrayalloc; // when set, build() may enable settings.primrefarrayalloc for large inputs
+      unsigned int numPreviousPrimitives = 0; // primitive count seen by the previous build()
+      /* scene variant: mesh stays nullptr; the max leaf size is clamped to Primitive::max_size()*BVH::maxLeafBlocks */
+      BVHNBuilderSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize,
+                      const Geometry::GTypeMask gtype, bool primrefarrayalloc = false)
+        : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0),
+          settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), primrefarrayalloc(primrefarrayalloc) {}
+      /* mesh variant: scene stays nullptr and primrefarrayalloc is always disabled */
+      BVHNBuilderSAH (BVH* bvh, Geometry* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype)
+        : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), geomID_(geomID), primrefarrayalloc(false) {}
+
+      // FIXME: shrink bvh->alloc in destructor here and in other builders too
+
+      void build()
+      {
+        /* we reset the allocator when the mesh size changed */
+        if (mesh && mesh->numPrimitives != numPreviousPrimitives) {
+          bvh->alloc.clear();
+        }
+
+        /* if we use the primrefarray for allocations we have to take it back from the BVH */
+        if (settings.primrefarrayalloc != size_t(inf))
+          bvh->alloc.unshare(prims);
+
+	/* skip build for empty scene */
+        const size_t numPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(gtype_,false);
+        numPreviousPrimitives = numPrimitives; // the member is unsigned int, so the size_t value is narrowed
+        if (numPrimitives == 0) {
+          bvh->clear();
+          prims.clear();
+          return;
+        }
+
+        double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + "BuilderSAH"); // mesh builds pass an empty name; t0 is handed to postBuild() at the end
+
+#if PROFILE
+        profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) {
+#endif
+
+            /* decide whether the primref array is used for allocations: numPrimitives/1000, or disabled (inf) when that value is below 1000 */
+            if (primrefarrayalloc) {
+              settings.primrefarrayalloc = numPrimitives/1000;
+              if (settings.primrefarrayalloc < 1000)
+                settings.primrefarrayalloc = inf;
+            }
+
+            /* enable os_malloc for two level build */
+            if (mesh)
+              bvh->alloc.setOSallocation(true);
+
+            /* initialize allocator */
+            const size_t node_bytes = numPrimitives*sizeof(typename BVH::AABBNodeMB)/(4*N); // note: estimate is based on sizeof(AABBNodeMB)
+            const size_t leaf_bytes = size_t(1.2*Primitive::blocks(numPrimitives)*sizeof(Primitive));
+            bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+            settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes);
+            prims.resize(numPrimitives); 
+
+            PrimInfo pinfo = mesh ?
+              createPrimRefArray(mesh,geomID_,prims,bvh->scene->progressInterface) :
+              createPrimRefArray(scene,gtype_,false,prims,bvh->scene->progressInterface);
+
+            /* pinfo might have zero size due to invalid geometry */
+            if (unlikely(pinfo.size() == 0))
+            {
+              bvh->clear();
+              prims.clear();
+              return; // with PROFILE == 0 this leaves build() without reaching bvh->cleanup() and bvh->postBuild(t0)
+            }
+
+            /* call BVH builder */
+            NodeRef root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeaf<N,Primitive>(bvh),bvh->scene->progressInterface,prims.data(),pinfo,settings);
+            bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size());
+            bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f));
+
+#if PROFILE
+          });
+#endif
+
+        /* if we allocated using the primrefarray we have to keep it alive */
+        if (settings.primrefarrayalloc != size_t(inf))
+          bvh->alloc.share(prims);
+
+        /* for static geometries we can do some cleanups (this else-branch belongs to the if above) */
+        else if (scene && scene->isStaticAccel()) {
+          prims.clear();
+        }
+	bvh->cleanup();
+        bvh->postBuild(t0);
+      }
+
+      void clear() { // releases only the primref array
+        prims.clear();
+      }
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+    template<int N, typename Primitive>
+    struct BVHNBuilderSAHQuantized : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVHN<N>::NodeRef NodeRef;
+
+      BVH* bvh;
+      Scene* scene;
+      Geometry* mesh;
+      mvector<PrimRef> prims;
+      GeneralBVHBuilder::Settings settings;
+      Geometry::GTypeMask gtype_;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+      unsigned int numPreviousPrimitives = 0;
+
+      BVHNBuilderSAHQuantized (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype)
+        : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype) {}
+
+      BVHNBuilderSAHQuantized (BVH* bvh, Geometry* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype)
+        : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), geomID_(geomID) {}
+
+      // FIXME: shrink bvh->alloc in destructor here and in other builders too
+
+      void build() // rebuilds the quantized BVH from the attached mesh if set, otherwise from the scene's gtype_ geometry
+      {
+        /* per-mesh mode only: reset the allocator when the mesh's primitive count changed since the last build */
+        if (mesh && mesh->numPrimitives != numPreviousPrimitives) {
+          bvh->alloc.clear();
+        }
+
+	/* count primitives; an empty input clears prims and the BVH and skips the build */
+        const size_t numPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(gtype_,false);
+        numPreviousPrimitives = numPrimitives;
+        if (numPrimitives == 0) {
+          prims.clear();
+          bvh->clear();
+          return;
+        }
+
+        double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::QBVH" + toString(N) + "BuilderSAH"); // per-mesh builds pass an empty name
+
+#if PROFILE
+        profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) {
+#endif
+            /* create primref array (from the single mesh or from all gtype_ geometry of the scene) */
+            prims.resize(numPrimitives);
+            PrimInfo pinfo = mesh ?
+              createPrimRefArray(mesh,geomID_,prims,bvh->scene->progressInterface) :
+              createPrimRefArray(scene,gtype_,false,prims,bvh->scene->progressInterface);
+
+            /* enable os_malloc for two level build */
+            if (mesh)
+              bvh->alloc.setOSallocation(true);
+
+            /* size the allocator from a node/leaf byte estimate, then call the BVH builder */
+            const size_t node_bytes = numPrimitives*sizeof(typename BVH::QuantizedNode)/(4*N);
+            const size_t leaf_bytes = size_t(1.2*Primitive::blocks(numPrimitives)*sizeof(Primitive));
+            bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+            settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes);
+            NodeRef root = BVHNBuilderQuantizedVirtual<N>::build(&bvh->alloc,CreateLeafQuantized<N,Primitive>(bvh),bvh->scene->progressInterface,prims.data(),pinfo,settings);
+            bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size());
+            //bvh->layoutLargeNodes(pinfo.size()*0.005f); // FIXME: COPY LAYOUT FOR LARGE NODES !!!
+#if PROFILE
+          });
+#endif
+
+	/* the primref array is only released for scene builds whose accel is static */
+	if (scene && scene->isStaticAccel()) {
+          prims.clear();
+        }
+	bvh->cleanup();
+        bvh->postBuild(t0);
+      }
+
+      void clear() {
+        this->prims.clear(); // drop the builder's primitive reference array
+      }
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+
+    /* Leaf creation functor for the grid builder: emits one SubGridQBVHN<N> per distinct geomID found in the range. */
+    template<int N, typename Primitive>
+    struct CreateLeafGrid
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+
+      __forceinline CreateLeafGrid (BVH* bvh, const SubGridBuildData * const sgrids) : bvh(bvh),sgrids(sgrids) {}
+
+      __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+      {
+        const size_t count = set.size();
+        const size_t first = set.begin();
+        assert(count <= N);
+
+        /* gather the distinct geomIDs of the range, in order of first appearance */
+        unsigned int uniqueIDs[N];
+        uniqueIDs[0] = prims[first].geomID();
+        unsigned int numUnique = 1;
+        for (size_t i=1; i<count; i++)
+        {
+          const unsigned int id = prims[first+i].geomID();
+          size_t k = 0;
+          while (k < numUnique && uniqueIDs[k] != id) k++;
+          if (k == numUnique) /* id was not recorded yet */
+            uniqueIDs[numUnique++] = id;
+        }
+
+        /* a single allocation holds the leaf entries of all geomIDs */
+        SubGridQBVHN<N>* leaves = (SubGridQBVHN<N>*) alloc.malloc1(numUnique*sizeof(SubGridQBVHN<N>),BVH::byteAlignment);
+        typename BVH::NodeRef leafRef = BVH::encodeLeaf((char*)leaves,numUnique);
+
+        for (size_t g=0; g<numUnique; g++)
+        {
+          /* collect the subgrids of the range that carry geomID number g */
+          unsigned int sx[N];
+          unsigned int sy[N];
+          unsigned int ids[N];
+          BBox3fa boxes[N];
+          unsigned int filled = 0;
+          for (size_t i=0; i<count; i++)
+          {
+            const PrimRef& pr = prims[first+i];
+            if (unlikely(pr.geomID() != uniqueIDs[g])) continue;
+
+            const SubGridBuildData& data = sgrids[pr.primID()];
+            sx[filled] = data.sx;
+            sy[filled] = data.sy;
+            ids[filled] = data.primID;
+            boxes[filled] = pr.bounds();
+            filled++;
+          }
+          assert(filled <= N);
+          new (&leaves[g]) SubGridQBVHN<N>(sx,sy,ids,boxes,uniqueIDs[g],filled);
+        }
+
+        return leafRef;
+      }
+
+      BVH* bvh;
+      const SubGridBuildData * const sgrids;
+    };
+
+
+    template<int N>
+    struct BVHNBuilderSAHGrid : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVHN<N>::NodeRef NodeRef;
+      
+      BVH* bvh;
+      Scene* scene;
+      GridMesh* mesh;
+      mvector<PrimRef> prims;
+      mvector<SubGridBuildData> sgrids;
+      GeneralBVHBuilder::Settings settings;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+      unsigned int numPreviousPrimitives = 0;
+
+      BVHNBuilderSAHGrid (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) // scene-wide builder: mesh stays null; NOTE(review): 'mode' is not referenced by this constructor
+        : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), sgrids(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD) {} // both arrays start empty on scene->device; geomID_ keeps its in-class default
+
+      BVHNBuilderSAHGrid (BVH* bvh, GridMesh* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) // per-mesh builder: scene stays null and geomID tags the primrefs created in build(); 'mode' is not used here
+        : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), sgrids(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), geomID_(geomID) {} // sgrids uses bvh->device like prims: 'scene' here is the member, already initialized to nullptr, so scene->device would dereference null
+
+      void build() // two passes over the grids (count subgrids, then fill prims/sgrids), followed by the SAH build over the subgrid primrefs
+      {
+        /* per-mesh mode only: reset the allocator when the mesh's primitive count changed since the last build */
+        if (mesh && mesh->numPrimitives != numPreviousPrimitives) {
+          bvh->alloc.clear();
+        }
+        
+        /* if we use the primrefarray for allocations we have to take it back from the BVH */
+        if (settings.primrefarrayalloc != size_t(inf))
+          bvh->alloc.unshare(prims);
+
+        const size_t numGridPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(GridMesh::geom_type,false);
+        numPreviousPrimitives = numGridPrimitives;
+               
+        PrimInfo pinfo(empty);
+        size_t numPrimitives = 0; // set from the counting pass; used to size sgrids and prims
+
+        if (!mesh)
+        {
+          /* first pass: accumulate mesh->getNumSubGrids(j) over every valid grid of the scene to size the arrays */
+
+          ParallelForForPrefixSumState<PrimInfo> pstate;
+          Scene::Iterator<GridMesh,false> iter(scene);
+
+          pstate.init(iter,size_t(1024));
+
+          /* iterate over all meshes in the scene */
+          pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+              PrimInfo pinfo(empty);
+              for (size_t j=r.begin(); j<r.end(); j++)
+              {
+                if (!mesh->valid(j)) continue;
+                BBox3fa bounds = empty;
+                const PrimRef prim(bounds,(unsigned)geomID,(unsigned)j);
+                if (!mesh->valid(j)) continue; // NOTE(review): repeats the valid(j) check made three lines above
+                pinfo.add_center2(prim,mesh->getNumSubGrids(j));
+              }
+              return pinfo;
+            }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+          numPrimitives = pinfo.size();
+          
+          /* resize arrays */
+          sgrids.resize(numPrimitives); 
+          prims.resize(numPrimitives); 
+
+          /* second pass: write one PrimRef and one SubGridBuildData for every subgrid whose bounds could be built */
+          pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+              k = base.size(); // presumably the prefix-sum offset at which this range starts writing -- confirm against parallel_for_for_prefix_sum1
+              size_t p_index = k;
+              PrimInfo pinfo(empty);
+              for (size_t j=r.begin(); j<r.end(); j++)
+              {
+                if (!mesh->valid(j)) continue;
+                const GridMesh::Grid &g = mesh->grid(j);
+                for (unsigned int y=0; y<g.resY-1u; y+=2) // the grid is walked in steps of two in y and x
+                  for (unsigned int x=0; x<g.resX-1u; x+=2)
+                  {
+                    BBox3fa bounds = empty;
+                    if (!mesh->buildBounds(g,x,y,bounds)) continue; // get bounds of subgrid
+                    const PrimRef prim(bounds,(unsigned)geomID,(unsigned)p_index); // the primID slot carries the index into sgrids, not the grid index j
+                    pinfo.add_center2(prim);
+                    sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j));
+                    prims[p_index++] = prim;                
+                  }
+              }
+              return pinfo;
+            }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+          assert(pinfo.size() == numPrimitives);
+        }
+        else
+        {
+          ParallelPrefixSumState<PrimInfo> pstate;
+          /* first pass over all grids of the single mesh: accumulate the subgrid counts */
+          pinfo = parallel_prefix_sum( pstate, size_t(0), mesh->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo
+                                       {
+                                         PrimInfo pinfo(empty);
+                                         for (size_t j=r.begin(); j<r.end(); j++)
+                                         {
+                                           if (!mesh->valid(j)) continue;
+                                           BBox3fa bounds = empty;
+                                           const PrimRef prim(bounds,geomID_,unsigned(j));
+                                           pinfo.add_center2(prim,mesh->getNumSubGrids(j));
+                                         }
+                                         return pinfo;
+                                       }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+          numPrimitives = pinfo.size();
+          /* resize arrays */
+          sgrids.resize(numPrimitives); 
+          prims.resize(numPrimitives); 
+
+          /* second pass: write one PrimRef and one SubGridBuildData for every subgrid whose bounds could be built */
+          pinfo = parallel_prefix_sum( pstate, size_t(0), mesh->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo
+                                       {
+
+                                         size_t p_index = base.size();
+                                         PrimInfo pinfo(empty);
+                                         for (size_t j=r.begin(); j<r.end(); j++)
+                                         {
+                                           if (!mesh->valid(j)) continue;
+                                           const GridMesh::Grid &g = mesh->grid(j);
+                                           for (unsigned int y=0; y<g.resY-1u; y+=2)
+                                             for (unsigned int x=0; x<g.resX-1u; x+=2)
+                                             {
+                                               BBox3fa bounds = empty;
+                                               if (!mesh->buildBounds(g,x,y,bounds)) continue; // get bounds of subgrid
+                                               const PrimRef prim(bounds,geomID_,unsigned(p_index)); // the primID slot carries the index into sgrids
+                                               pinfo.add_center2(prim);
+                                               sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j));
+                                               prims[p_index++] = prim;                
+                                             }
+                                         }
+                                         return pinfo;
+                                       }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+        }
+
+        /* no primitives: leave an empty BVH and release both arrays */
+        if (numPrimitives == 0) {
+          bvh->clear();
+          prims.clear();
+          sgrids.clear();
+          return;
+        }
+
+        double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + "BuilderSAH");
+
+        /* primref-array allocation is set to numPrimitives/1000 and disabled (inf) when that value is below 1000 */
+        settings.primrefarrayalloc = numPrimitives/1000;
+        if (settings.primrefarrayalloc < 1000)
+          settings.primrefarrayalloc = inf;
+
+        /* enable os_malloc for two level build */
+        if (mesh)
+          bvh->alloc.setOSallocation(true);
+
+        /* initialize allocator */
+        const size_t node_bytes = numPrimitives*sizeof(typename BVH::AABBNodeMB)/(4*N);
+        const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>));
+
+        bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+        settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes);
+
+        /* pinfo might have zero size due to invalid geometry; NOTE(review): this path returns after preBuild() without reaching cleanup()/postBuild() */
+        if (unlikely(pinfo.size() == 0))
+        {
+          bvh->clear();
+          sgrids.clear();
+          prims.clear();
+          return;
+        }
+
+        /* call BVH builder */
+        NodeRef root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeafGrid<N,SubGridQBVHN<N>>(bvh,sgrids.data()),bvh->scene->progressInterface,prims.data(),pinfo,settings);
+        bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size());
+        bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f));
+
+        /* clear temporary array */
+        sgrids.clear();
+
+        /* if we allocated using the primrefarray we have to keep it alive */
+        if (settings.primrefarrayalloc != size_t(inf))
+          bvh->alloc.share(prims);
+
+        /* for static geometries we can do some cleanups */
+        else if (scene && scene->isStaticAccel()) {
+          prims.clear();
+        }
+	bvh->cleanup();
+        bvh->postBuild(t0);
+      }
+
+      void clear() {
+        this->prims.clear(); // drop the primref array; sgrids is not touched here
+      }
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE) // factories for the triangle SAH builders; each returns a newly allocated Builder and ignores 'mode'
+    Builder* BVH4Triangle4MeshBuilderSAH  (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4>(static_cast<BVH4*>(bvh),mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH4Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4v>(static_cast<BVH4*>(bvh),mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH4Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4i>(static_cast<BVH4*>(bvh),mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    /* scene-level BVH4 variants; the Triangle4i one passes an additional trailing 'true' */
+    Builder* BVH4Triangle4SceneBuilderSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4>(static_cast<BVH4*>(bvh),scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH4Triangle4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4v>(static_cast<BVH4*>(bvh),scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH4Triangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4i>(static_cast<BVH4*>(bvh),scene,4,1.0f,4,inf,TriangleMesh::geom_type,true); }
+
+    /* quantized scene-level BVH4 variant */
+    Builder* BVH4QuantizedTriangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Triangle4i>(static_cast<BVH4*>(bvh),scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+#if defined(__AVX__)
+    Builder* BVH8Triangle4MeshBuilderSAH  (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4>(static_cast<BVH8*>(bvh),mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH8Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4v>(static_cast<BVH8*>(bvh),mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH8Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4i>(static_cast<BVH8*>(bvh),mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    /* scene-level and quantized BVH8 variants */
+    Builder* BVH8Triangle4SceneBuilderSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4>(static_cast<BVH8*>(bvh),scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH8Triangle4vSceneBuilderSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4v>(static_cast<BVH8*>(bvh),scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH8Triangle4iSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4i>(static_cast<BVH8*>(bvh),scene,4,1.0f,4,inf,TriangleMesh::geom_type,true); }
+    Builder* BVH8QuantizedTriangle4iSceneBuilderSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Triangle4i>(static_cast<BVH8*>(bvh),scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH8QuantizedTriangle4SceneBuilderSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Triangle4>(static_cast<BVH8*>(bvh),scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD) // factories for the quad SAH builders; each returns a newly allocated Builder and ignores 'mode'
+    Builder* BVH4Quad4vMeshBuilderSAH     (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode)     { return new BVHNBuilderSAH<4,Quad4v>(static_cast<BVH4*>(bvh),mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH4Quad4iMeshBuilderSAH     (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode)     { return new BVHNBuilderSAH<4,Quad4i>(static_cast<BVH4*>(bvh),mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH4Quad4vSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Quad4v>(static_cast<BVH4*>(bvh),scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH4Quad4iSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Quad4i>(static_cast<BVH4*>(bvh),scene,4,1.0f,4,inf,QuadMesh::geom_type,true); }
+    Builder* BVH4QuantizedQuad4vSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Quad4v>(static_cast<BVH4*>(bvh),scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH4QuantizedQuad4iSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Quad4i>(static_cast<BVH4*>(bvh),scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+
+#if defined(__AVX__)
+    Builder* BVH8Quad4vSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Quad4v>(static_cast<BVH8*>(bvh),scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH8Quad4iSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Quad4i>(static_cast<BVH8*>(bvh),scene,4,1.0f,4,inf,QuadMesh::geom_type,true); }
+    Builder* BVH8QuantizedQuad4vSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Quad4v>(static_cast<BVH8*>(bvh),scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH8QuantizedQuad4iSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Quad4i>(static_cast<BVH8*>(bvh),scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH8Quad4vMeshBuilderSAH     (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode)     { return new BVHNBuilderSAH<8,Quad4v>(static_cast<BVH8*>(bvh),mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); }
+
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER) // factories for user-geometry (Object) SAH builders
+    /* scene variant: leaf size limits are read from the scene's device */
+    Builder* BVH4VirtualSceneBuilderSAH    (void* bvh, Scene* scene, size_t mode) {
+      const int leafMin = scene->device->object_accel_min_leaf_size;
+      const int leafMax = scene->device->object_accel_max_leaf_size;
+      return new BVHNBuilderSAH<4,Object>(static_cast<BVH4*>(bvh),scene,4,1.0f,leafMin,leafMax,UserGeometry::geom_type);
+    }
+
+    Builder* BVH4VirtualMeshBuilderSAH    (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) {
+      return new BVHNBuilderSAH<4,Object>(static_cast<BVH4*>(bvh),mesh,geomID,4,1.0f,1,inf,UserGeometry::geom_type);
+    }
+#if defined(__AVX__)
+    /* BVH8 counterparts, using a SAH block size of 8 */
+    Builder* BVH8VirtualSceneBuilderSAH    (void* bvh, Scene* scene, size_t mode) {
+      const int leafMin = scene->device->object_accel_min_leaf_size;
+      const int leafMax = scene->device->object_accel_max_leaf_size;
+      return new BVHNBuilderSAH<8,Object>(static_cast<BVH8*>(bvh),scene,8,1.0f,leafMin,leafMax,UserGeometry::geom_type);
+    }
+
+    Builder* BVH8VirtualMeshBuilderSAH    (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) {
+      return new BVHNBuilderSAH<8,Object>(static_cast<BVH8*>(bvh),mesh,geomID,8,1.0f,1,inf,UserGeometry::geom_type);
+    }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE) // factories for instance SAH builders; the geometry type mask is supplied by the caller
+    Builder* BVH4InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<4,InstancePrimitive>(static_cast<BVH4*>(bvh),scene,4,1.0f,1,1,gtype); }
+    Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) {
+      return new BVHNBuilderSAH<4,InstancePrimitive>(static_cast<BVH4*>(bvh),mesh,geomID,4,1.0f,1,inf,gtype);
+    }
+#if defined(__AVX__)
+    Builder* BVH8InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<8,InstancePrimitive>(static_cast<BVH8*>(bvh),scene,8,1.0f,1,1,gtype); }
+    Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) {
+      return new BVHNBuilderSAH<8,InstancePrimitive>(static_cast<BVH8*>(bvh),mesh,geomID,8,1.0f,1,inf,gtype);
+    }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_GRID) // factories for the grid SAH builders; 'mode' is forwarded to the builder constructor
+    Builder* BVH4GridMeshBuilderSAH  (void* bvh, GridMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAHGrid<4>(static_cast<BVH4*>(bvh),mesh,geomID,4,1.0f,4,4,mode); }
+    Builder* BVH4GridSceneBuilderSAH (void* bvh, Scene* scene, size_t mode)   { return new BVHNBuilderSAHGrid<4>(static_cast<BVH4*>(bvh),scene,4,1.0f,4,4,mode); } // FIXME: check whether cost factors are correct
+
+#if defined(__AVX__)
+    Builder* BVH8GridMeshBuilderSAH  (void* bvh, GridMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAHGrid<8>(static_cast<BVH8*>(bvh),mesh,geomID,8,1.0f,8,8,mode); }
+    Builder* BVH8GridSceneBuilderSAH (void* bvh, Scene* scene, size_t mode)   { return new BVHNBuilderSAHGrid<8>(static_cast<BVH8*>(bvh),scene,8,1.0f,8,8,mode); } // FIXME: check whether cost factors are correct
+#endif
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp
new file mode 100644
index 0000000000..9c01553ec6
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp
@@ -0,0 +1,705 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_builder.h"
+#include "../builders/bvh_builder_msmblur.h"
+
+#include "../builders/primrefgen.h"
+#include "../builders/splitter.h"
+
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+
+#include "../common/state.h"
+
+// FIXME: remove after removing BVHNBuilderMBlurRootTimeSplitsSAH
+#include "../../common/algorithms/parallel_for_for.h"
+#include "../../common/algorithms/parallel_for_for_prefix_sum.h"
+
+
+namespace embree
+{
+  namespace isa
+  {
+
+#if 0 // disabled single-segment leaf creator, kept for reference; NOTE(review): the assert loop below uses 'end' and 'current', neither of which is declared in this scope
+    template<int N, typename Primitive>
+    struct CreateMBlurLeaf
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecordMB NodeRecordMB;
+
+      __forceinline CreateMBlurLeaf (BVH* bvh, PrimRef* prims, size_t time) : bvh(bvh), prims(prims), time(time) {}
+
+      __forceinline NodeRecordMB operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+      {
+        size_t items = Primitive::blocks(set.size());
+        size_t start = set.begin();
+        for (size_t i=start; i<end; i++) assert((*current.prims.prims)[start].geomID() == (*current.prims.prims)[i].geomID()); // assert that all geomIDs are identical
+        Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment);
+        NodeRef node = bvh->encodeLeaf((char*)accel,items);
+
+        LBBox3fa allBounds = empty; // merged bounds of every block filled below
+        for (size_t i=0; i<items; i++)
+          allBounds.extend(accel[i].fillMB(prims, start, set.end(), bvh->scene, time));
+
+        return NodeRecordMB(node,allBounds);
+      }
+
+      BVH* bvh;
+      PrimRef* prims;
+      size_t time;
+    };
+#endif
+
+    template<int N, typename Mesh, typename Primitive>
+    struct CreateMSMBlurLeaf // leaf functor of the MSMBlur builder: fills Primitive blocks for a build record and returns their merged linear bounds
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecordMB4D NodeRecordMB4D;
+
+      __forceinline CreateMSMBlurLeaf (BVH* bvh) : bvh(bvh) {}
+
+      __forceinline const NodeRecordMB4D operator() (const BVHBuilderMSMBlur::BuildRecord& current, const FastAllocator::CachedAllocator& alloc) const
+      {
+        size_t numBlocks = Primitive::blocks(current.prims.size());
+        size_t cursor = current.prims.begin(); /* handed to every fillMB call below */
+        size_t last = current.prims.end();
+        for (size_t i=cursor; i<last; i++) assert((*current.prims.prims)[cursor].geomID() == (*current.prims.prims)[i].geomID()); /* a leaf must not mix geomIDs */
+        Primitive* leafPrims = (Primitive*) alloc.malloc1(numBlocks*sizeof(Primitive),BVH::byteNodeAlignment);
+        NodeRef leaf = bvh->encodeLeaf((char*)leafPrims,numBlocks);
+        LBBox3fa merged = empty;
+        for (size_t b=0; b<numBlocks; b++)
+          merged.extend(leafPrims[b].fillMB(current.prims.prims->data(), cursor, current.prims.end(), bvh->scene, current.prims.time_range));
+        return NodeRecordMB4D(leaf,merged,current.prims.time_range);
+      }
+
+      BVH* bvh;
+    };
+
+    /* Motion blur BVH with 4D nodes and internal time splits */
+    template<int N, typename Mesh, typename Primitive>
+    struct BVHNBuilderMBlurSAH : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVHN<N>::NodeRef NodeRef;
+      typedef typename BVHN<N>::NodeRecordMB NodeRecordMB;
+      typedef typename BVHN<N>::AABBNodeMB AABBNodeMB;
+
+      BVH* bvh;
+      Scene* scene;
+      const size_t sahBlockSize;
+      const float intCost;
+      const size_t minLeafSize;
+      const size_t maxLeafSize;
+      const Geometry::GTypeMask gtype_;
+
+      BVHNBuilderMBlurSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) // stores the build parameters; gtype selects the scene geometry counted and collected in build()
+        : bvh(bvh), scene(scene), sahBlockSize(sahBlockSize), intCost(intCost), minLeafSize(minLeafSize), maxLeafSize(min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks)), gtype_(gtype) {} // maxLeafSize is capped at Primitive::max_size()*BVH::maxLeafBlocks
+
+      void build() // builds the motion-blur BVH over the scene's gtype_ primitives
+      {
+	/* skip build for empty scene */
+        const size_t numPrimitives = scene->getNumPrimitives(gtype_,true);
+        if (numPrimitives == 0) { bvh->clear(); return; }
+
+        double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderMBlurSAH");
+
+#if PROFILE
+        profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) {
+#endif
+
+            //const size_t numTimeSteps = scene->getNumTimeSteps<typename Mesh::type_t,true>();
+            //const size_t numTimeSegments = numTimeSteps-1; assert(numTimeSteps > 1);
+
+            /*if (numTimeSegments == 1)
+              buildSingleSegment(numPrimitives);
+              else*/
+              buildMultiSegment(numPrimitives); // always taken: the single-segment dispatch above is commented out
+
+#if PROFILE
+          });
+#endif
+
+	/* finish the build; cleanup() is called unconditionally here */
+	bvh->cleanup();
+        bvh->postBuild(t0);
+      }
+
+#if 0 // Disabled: no longer compatible once geometries carry time_ranges. It would sometimes have to create temporal nodes and put only a single geometry into a leaf.
+      void buildSingleSegment(size_t numPrimitives)
+      {
+        /* create primref array */
+        mvector<PrimRef> prims(scene->device,numPrimitives);
+        const PrimInfo pinfo = createPrimRefArrayMBlur(scene,gtype_,prims,bvh->scene->progressInterface,0);
+        /* early out if no valid primitives */
+        if (pinfo.size() == 0) { bvh->clear(); return; }
+        /* estimate acceleration structure size */
+        const size_t node_bytes = pinfo.size()*sizeof(AABBNodeMB)/(4*N);
+        const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive));
+        bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+
+        /* settings for BVH build */
+        GeneralBVHBuilder::Settings settings;
+        settings.branchingFactor = N;
+        settings.maxDepth = BVH::maxBuildDepthLeaf;
+        settings.logBlockSize = bsr(sahBlockSize);
+        settings.minLeafSize = min(minLeafSize,maxLeafSize);
+        settings.maxLeafSize = maxLeafSize;
+        settings.travCost = travCost;
+        settings.intCost = intCost;
+        settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+
+        /* build hierarchy; the leaf functor referenced here (CreateMBlurLeaf) is itself inside an #if 0 block in this file */
+        auto root = BVHBuilderBinnedSAH::build<NodeRecordMB>
+          (typename BVH::CreateAlloc(bvh),typename BVH::AABBNodeMB::Create(),typename BVH::AABBNodeMB::Set(),
+           CreateMBlurLeaf<N,Primitive>(bvh,prims.data(),0),bvh->scene->progressInterface,
+           prims.data(),pinfo,settings);
+
+        bvh->set(root.ref,root.lbounds,pinfo.size());
+      }
+#endif
+
+      void buildMultiSegment(size_t numPrimitives)
+      {
+        /* fill the PrimRefMB array from the scene's gtype_ geometry */
+        mvector<PrimRefMB> primRefs(scene->device,numPrimitives);
+        PrimInfoMB info = createPrimRefArrayMSMBlur(scene,gtype_,primRefs,bvh->scene->progressInterface);
+
+        /* nothing valid was collected: leave an empty BVH */
+        if (info.size() == 0) { bvh->clear(); return; }
+
+        /* size the allocator from the number of time segments */
+        const size_t nodeBytes = info.num_time_segments*sizeof(AABBNodeMB)/(4*N);
+        const size_t leafBytes = size_t(1.2*Primitive::blocks(info.num_time_segments)*sizeof(Primitive));
+        const size_t totalBytes = nodeBytes+leafBytes;
+        bvh->alloc.init_estimate(totalBytes);
+
+        /* builder configuration */
+        BVHBuilderMSMBlur::Settings cfg;
+        cfg.branchingFactor = N;
+        cfg.maxDepth = BVH::maxDepth;
+        cfg.logBlockSize = bsr(sahBlockSize);
+        cfg.minLeafSize = min(minLeafSize,maxLeafSize);
+        cfg.maxLeafSize = maxLeafSize;
+        cfg.travCost = travCost;
+        cfg.intCost = intCost;
+        cfg.singleLeafTimeSegment = Primitive::singleTimeSegment;
+        cfg.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,info.size(),totalBytes);
+
+        /* run the MSMBlur builder and install the resulting root */
+        auto result = BVHBuilderMSMBlur::build<NodeRef>(primRefs,info,scene->device,
+          RecalculatePrimRef<Mesh>(scene),
+          typename BVH::CreateAlloc(bvh),
+          typename BVH::AABBNodeMB4D::Create(),
+          typename BVH::AABBNodeMB4D::Set(),
+          CreateMSMBlurLeaf<N,Mesh,Primitive>(bvh),
+          bvh->scene->progressInterface,
+          cfg);
+
+        bvh->set(result.ref,result.lbounds,info.num_time_segments);
+      }
+
+      void clear() { // intentionally empty: the primref array is a local of buildMultiSegment(), so nothing is held between builds
+      }
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+    struct GridRecalculatePrimRef // recomputes the PrimRefMB or the linear bounds of a subgrid primref for a given time range
+    {
+      Scene* scene;
+      const SubGridBuildData * const sgrids;
+
+      __forceinline GridRecalculatePrimRef (Scene* scene, const SubGridBuildData * const sgrids)
+        : scene(scene), sgrids(sgrids) {}
+
+        /* linear bounds over time_range of the subgrid recorded in sgrids[buildID] */
+        __forceinline LBBox3fa subgridBounds(const GridMesh* gridMesh, const unsigned int buildID, const BBox1f time_range) const
+        {
+          const SubGridBuildData& entry = sgrids[buildID];
+          const unsigned int gridID = entry.primID;
+          const size_t sx = entry.x();
+          const size_t sy = entry.y();
+          return gridMesh->linearBounds(gridMesh->grid(gridID),sx,sy,time_range);
+        }
+
+        __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const // new PrimRefMB for time_range; geomID and build index are carried over
+        {
+          const unsigned int geomID = prim.geomID();
+          const GridMesh* gridMesh = scene->get<GridMesh>(geomID);
+          const unsigned int buildID = prim.primID();
+          const LBBox3fa lbounds = subgridBounds(gridMesh,buildID,time_range);
+          const unsigned segments = gridMesh->numTimeSegments();
+          const range<int> tbounds = gridMesh->timeSegmentRange(time_range);
+          return PrimRefMB (lbounds, tbounds.size(), gridMesh->time_range, segments, geomID, buildID);
+        }
+
+        __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { // bounds only, no PrimRefMB is built
+          const unsigned int geomID = prim.geomID();
+          const unsigned int buildID = prim.primID();
+          return subgridBounds(scene->get<GridMesh>(geomID),buildID,time_range);
+        }
+    };
+
+    /* Leaf creation functor for the multi-segment motion blur grid builder. */
+    template<int N>
+    struct CreateMSMBlurLeafGrid
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecordMB4D NodeRecordMB4D;
+
+      __forceinline CreateMSMBlurLeafGrid (Scene* scene, BVH* bvh, const SubGridBuildData * const sgrids) : scene(scene), bvh(bvh), sgrids(sgrids) {}
+
+      /* emits one SubGridMBQBVHN entry per distinct geomID referenced by the build record */
+      __forceinline const NodeRecordMB4D operator() (const BVHBuilderMSMBlur::BuildRecord& current, const FastAllocator::CachedAllocator& alloc) const
+      {
+        const PrimRefMB* prims = current.prims.prims->data();
+        const size_t first = current.prims.begin();
+        const size_t count = current.prims.size();
+        assert(count <= N);
+
+        /* collect all subsets with unique geomIDs */
+        unsigned int geomIDs[N];
+        geomIDs[0] = prims[first].geomID();
+        unsigned int num_geomIDs = 1;
+        for (size_t i=1;i<count;i++)
+        {
+          const unsigned int candidate = prims[first+i].geomID();
+          size_t k = 0;
+          while (k < num_geomIDs && geomIDs[k] != candidate) k++;
+          /* no match among the IDs gathered so far -> append */
+          if (k == num_geomIDs)
+            geomIDs[num_geomIDs++] = candidate;
+        }
+
+        /* allocate all leaf memory in one single block */
+        SubGridMBQBVHN<N>* accel = (SubGridMBQBVHN<N>*) alloc.malloc1(num_geomIDs*sizeof(SubGridMBQBVHN<N>),BVH::byteAlignment);
+        typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,num_geomIDs);
+
+        LBBox3fa allBounds = empty;
+
+        for (size_t g=0;g<num_geomIDs;g++)
+        {
+          const GridMesh* __restrict__ const mesh = scene->get<GridMesh>(geomIDs[g]);
+          unsigned int subX[N];
+          unsigned int subY[N];
+          unsigned int gridID[N];
+          BBox3fa bounds0[N];
+          BBox3fa bounds1[N];
+          unsigned int pos = 0;
+          for (size_t i=0;i<count;i++)
+          {
+            /* only subgrids of the current geometry go into this entry */
+            if (unlikely(prims[first+i].geomID() != geomIDs[g])) continue;
+            const SubGridBuildData& sg = sgrids[prims[first+i].primID()];
+            subX[pos] = sg.sx;
+            subY[pos] = sg.sy;
+            gridID[pos] = sg.primID;
+            const size_t gx = sg.x();
+            const size_t gy = sg.y();
+            LBBox3fa newBounds = mesh->linearBounds(mesh->grid(sg.primID),gx,gy,current.prims.time_range);
+            allBounds.extend(newBounds);
+            bounds0[pos] = newBounds.bounds0;
+            bounds1[pos] = newBounds.bounds1;
+            pos++;
+          }
+          assert(pos <= N);
+          new (&accel[g]) SubGridMBQBVHN<N>(subX,subY,gridID,bounds0,bounds1,geomIDs[g],current.prims.time_range.lower,1.0f/current.prims.time_range.size(),pos);
+        }
+        return NodeRecordMB4D(node,allBounds,current.prims.time_range);
+      }
+
+      Scene *scene;
+      BVH* bvh;
+      const SubGridBuildData * const sgrids;
+    };
+
+#if 0 // disabled: leaf creator referenced only by the (also disabled) buildSingleSegment
+    template<int N>
+    struct CreateLeafGridMB
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecordMB NodeRecordMB;
+
+      __forceinline CreateLeafGridMB (Scene* scene, BVH* bvh, const SubGridBuildData * const sgrids) 
+		  : scene(scene), bvh(bvh), sgrids(sgrids) {}
+
+      __forceinline NodeRecordMB operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+      {
+        const size_t items = set.size(); 
+        const size_t start = set.begin();
+
+        /* collect all subsets with unique geomIDs */
+        assert(items <= N);
+        unsigned int geomIDs[N];
+        unsigned int num_geomIDs = 1;
+        geomIDs[0] = prims[start].geomID();
+
+        for (size_t i=1;i<items;i++)
+        {
+          bool found = false;
+          const unsigned int new_geomID = prims[start+i].geomID();
+          for (size_t j=0;j<num_geomIDs;j++)
+            if (new_geomID == geomIDs[j])
+            { found = true; break; }
+          if (!found) 
+            geomIDs[num_geomIDs++] = new_geomID;
+        }
+
+        /* allocate all leaf memory in one single block */
+        SubGridMBQBVHN<N>* accel = (SubGridMBQBVHN<N>*) alloc.malloc1(num_geomIDs*sizeof(SubGridMBQBVHN<N>),BVH::byteAlignment);
+        typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,num_geomIDs);
+
+        LBBox3fa allBounds = empty;
+
+        for (size_t g=0;g<num_geomIDs;g++)
+        {
+          const GridMesh* __restrict__ const mesh = scene->get<GridMesh>(geomIDs[g]);
+
+          unsigned int x[N];
+          unsigned int y[N];
+          unsigned int primID[N];
+          BBox3fa bounds0[N];
+          BBox3fa bounds1[N];
+          unsigned int pos = 0;
+          for (size_t i=0;i<items;i++)
+          {
+            if (unlikely(prims[start+i].geomID() != geomIDs[g])) continue;
+
+            const SubGridBuildData  &sgrid_bd = sgrids[prims[start+i].primID()];                      
+            x[pos] = sgrid_bd.sx;
+            y[pos] = sgrid_bd.sy;
+            primID[pos] = sgrid_bd.primID;
+            const size_t x = sgrid_bd.x(); // from here on x/y name the subgrid position and shadow the arrays above
+            const size_t y = sgrid_bd.y();
+            bool MAYBE_UNUSED valid0 = mesh->buildBounds(mesh->grid(sgrid_bd.primID),x,y,0,bounds0[pos]);
+            bool MAYBE_UNUSED valid1 = mesh->buildBounds(mesh->grid(sgrid_bd.primID),x,y,1,bounds1[pos]);
+            assert(valid0);
+            assert(valid1);
+            allBounds.extend(LBBox3fa(bounds0[pos],bounds1[pos]));
+            pos++;
+          }
+          new (&accel[g]) SubGridMBQBVHN<N>(x,y,primID,bounds0,bounds1,geomIDs[g],0.0f,1.0f,pos);
+        }
+        return NodeRecordMB(node,allBounds);
+      }
+
+      Scene *scene;
+      BVH* bvh;
+      const SubGridBuildData * const sgrids;
+    };
+#endif // 0
+
+
+    /* Motion blur BVH with 4D nodes and internal time splits */
+    template<int N>
+    struct BVHNBuilderMBlurSAHGrid : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVHN<N>::NodeRef NodeRef;
+      typedef typename BVHN<N>::NodeRecordMB NodeRecordMB;
+      typedef typename BVHN<N>::AABBNodeMB AABBNodeMB;
+
+      BVH* bvh;
+      Scene* scene;
+      const size_t sahBlockSize;
+      const float intCost;
+      const size_t minLeafSize;
+      const size_t maxLeafSize;
+      mvector<SubGridBuildData> sgrids;
+
+
+      BVHNBuilderMBlurSAHGrid (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize)
+        : bvh(bvh), scene(scene), sahBlockSize(sahBlockSize), intCost(intCost), minLeafSize(minLeafSize), maxLeafSize(min(maxLeafSize,BVH::maxLeafBlocks)), sgrids(scene->device,0) {} // maxLeafSize is clamped to BVH::maxLeafBlocks
+
+
+      PrimInfo createPrimRefArrayMBlurGrid(Scene* scene, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime)
+      {
+        ParallelForForPrefixSumState<PrimInfo> pstate;
+        Scene::Iterator<GridMesh,true> iter(scene);
+        pstate.init(iter,size_t(1024));
+
+        /* reduction shared by both passes */
+        const auto mergeInfo = [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); };
+
+        /* first pass over all meshes in the scene: determine the number of subgrids */
+        PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+            PrimInfo partial(empty);
+            for (size_t gridIdx=r.begin(); gridIdx<r.end(); gridIdx++)
+            {
+              if (!mesh->valid(gridIdx,range<size_t>(0,1))) continue;
+              /* reference with empty bounds, added together with the grid's subgrid count */
+              const BBox3fa noBounds = empty;
+              const PrimRef gridRef(noBounds,unsigned(geomID),unsigned(gridIdx));
+              partial.add_center2(gridRef,mesh->getNumSubGrids(gridIdx));
+            }
+            return partial;
+          }, mergeInfo);
+
+        const size_t numPrimitives = pinfo.size();
+        if (numPrimitives == 0) return pinfo;
+
+        /* make room for one entry per counted subgrid */
+        sgrids.resize(numPrimitives);
+        prims.resize(numPrimitives);
+
+        /* second pass: fill the primrefs and SubGridBuildData arrays */
+        pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+            /* the output position starts at the size of the prefix sum passed in as base */
+            size_t slot = base.size();
+            PrimInfo partial(empty);
+            for (size_t gridIdx=r.begin(); gridIdx<r.end(); gridIdx++)
+            {
+              const GridMesh::Grid& grid = mesh->grid(gridIdx);
+              if (!mesh->valid(gridIdx,range<size_t>(0,1))) continue;
+
+              for (unsigned int y=0; y<grid.resY-1u; y+=2)
+                for (unsigned int x=0; x<grid.resX-1u; x+=2)
+                {
+                  BBox3fa bounds = empty;
+                  if (!mesh->buildBounds(grid,x,y,itime,bounds)) continue; // get bounds of subgrid
+                  const PrimRef subgridRef(bounds,unsigned(geomID),unsigned(slot));
+                  partial.add_center2(subgridRef);
+                  sgrids[slot] = SubGridBuildData(x | grid.get3x3FlagsX(x), y | grid.get3x3FlagsY(y), unsigned(gridIdx));
+                  prims[slot++] = subgridRef;
+                }
+            }
+            return partial;
+          }, mergeInfo);
+
+        assert(pinfo.size() == numPrimitives);
+        return pinfo;
+      }
+
+      PrimInfoMB createPrimRefArrayMSMBlurGrid(Scene* scene, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f))
+      {
+        ParallelForForPrefixSumState<PrimInfoMB> pstate;
+        Scene::Iterator<GridMesh,true> iter(scene);
+        pstate.init(iter,size_t(1024));
+
+        /* reduction shared by both passes */
+        const auto mergeInfo = [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); };
+
+        /* first pass over all meshes in the scene: determine the number of subgrids */
+        PrimInfoMB pinfoMB = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t /*geomID*/) -> PrimInfoMB {
+            PrimInfoMB partial(empty);
+            for (size_t gridIdx=r.begin(); gridIdx<r.end(); gridIdx++)
+            {
+              if (!mesh->valid(gridIdx, mesh->timeSegmentRange(t0t1))) continue;
+              /* merge an entry built from the grid's subgrid count */
+              PrimInfoMB gridMB(0,mesh->getNumSubGrids(gridIdx));
+              partial.merge(gridMB);
+            }
+            return partial;
+          }, mergeInfo);
+
+        const size_t numPrimitives = pinfoMB.size();
+        if (numPrimitives == 0) return pinfoMB;
+
+        /* make room for one entry per counted subgrid */
+        sgrids.resize(numPrimitives);
+        prims.resize(numPrimitives);
+
+        /* second pass: fill the primrefs and SubGridBuildData arrays */
+        pinfoMB = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB {
+            /* the output position starts at the size of the prefix sum passed in as base */
+            size_t slot = base.size();
+            PrimInfoMB partial(empty);
+            for (size_t gridIdx=r.begin(); gridIdx<r.end(); gridIdx++)
+            {
+              if (!mesh->valid(gridIdx, mesh->timeSegmentRange(t0t1))) continue;
+              const GridMesh::Grid& grid = mesh->grid(gridIdx);
+              for (unsigned int y=0; y<grid.resY-1u; y+=2)
+                for (unsigned int x=0; x<grid.resX-1u; x+=2)
+                {
+                  const PrimRefMB subgridRef(mesh->linearBounds(grid,x,y,t0t1),mesh->numTimeSegments(),mesh->time_range,mesh->numTimeSegments(),unsigned(geomID),unsigned(slot));
+                  partial.add_primref(subgridRef);
+                  sgrids[slot] = SubGridBuildData(x | grid.get3x3FlagsX(x), y | grid.get3x3FlagsY(y), unsigned(gridIdx));
+                  prims[slot++] = subgridRef;
+                }
+            }
+            return partial;
+          }, mergeInfo);
+
+        assert(pinfoMB.size() == numPrimitives);
+        pinfoMB.time_range = t0t1;
+        return pinfoMB;
+      }
+
+      void build()
+      {
+        /* skip build for empty scene */
+        const size_t gridCount = scene->getNumPrimitives(GridMesh::geom_type,true);
+        if (gridCount == 0) {
+          bvh->clear();
+          return;
+        }
+
+        const double buildStart = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderMBlurSAHGrid");
+
+        /* the single-segment path (buildSingleSegment) is compiled out, so every
+         * scene goes through the multi-segment builder regardless of its number
+         * of time segments */
+        buildMultiSegment(gridCount);
+        /* clear temporary data for static geometry */
+        bvh->cleanup();
+        bvh->postBuild(buildStart);
+      }
+
+#if 0 // disabled: single-segment build path; build() always calls buildMultiSegment
+      void buildSingleSegment(size_t numPrimitives)
+      {
+        /* create primref array */
+        mvector<PrimRef> prims(scene->device,numPrimitives);
+        const PrimInfo pinfo = createPrimRefArrayMBlurGrid(scene,prims,bvh->scene->progressInterface,0);
+        /* early out if no valid primitives */
+        if (pinfo.size() == 0) { bvh->clear(); return; }
+
+        /* estimate acceleration structure size */
+        const size_t node_bytes = pinfo.size()*sizeof(AABBNodeMB)/(4*N);
+        //TODO: check leaf_bytes
+        const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>));
+        bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+
+        /* settings for BVH build */
+        GeneralBVHBuilder::Settings settings;
+        settings.branchingFactor = N;
+        settings.maxDepth = BVH::maxBuildDepthLeaf;
+        settings.logBlockSize = bsr(sahBlockSize);
+        settings.minLeafSize = min(minLeafSize,maxLeafSize);
+        settings.maxLeafSize = maxLeafSize;
+        settings.travCost = travCost;
+        settings.intCost = intCost;
+        settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+
+        /* build hierarchy */
+        auto root = BVHBuilderBinnedSAH::build<NodeRecordMB>
+          (typename BVH::CreateAlloc(bvh),
+           typename BVH::AABBNodeMB::Create(),
+           typename BVH::AABBNodeMB::Set(),
+           CreateLeafGridMB<N>(scene,bvh,sgrids.data()),
+           bvh->scene->progressInterface,
+           prims.data(),pinfo,settings);
+
+        bvh->set(root.ref,root.lbounds,pinfo.size());
+      }
+#endif // 0
+      
+      void buildMultiSegment(size_t numPrimitives)
+      {
+        /* gather one PrimRefMB per subgrid; this call also fills the sgrids array */
+        mvector<PrimRefMB> prims(scene->device,numPrimitives);
+        PrimInfoMB pinfo = createPrimRefArrayMSMBlurGrid(scene,prims,bvh->scene->progressInterface);
+
+        /* early out if no valid primitives */
+        if (pinfo.size() == 0) {
+          bvh->clear();
+          return;
+        }
+
+        /* estimate acceleration structure size: nodes plus leaves */
+        //FIXME: check leaf_bytes
+        const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>));
+        const size_t node_bytes = pinfo.num_time_segments*sizeof(AABBNodeMB)/(4*N);
+        const size_t total_bytes = node_bytes+leaf_bytes;
+        bvh->alloc.init_estimate(total_bytes);
+
+        /* settings for BVH build */
+        BVHBuilderMSMBlur::Settings settings;
+        settings.branchingFactor = N;
+        settings.maxDepth = BVH::maxDepth;
+        settings.logBlockSize = bsr(sahBlockSize);
+        settings.minLeafSize = min(minLeafSize,maxLeafSize);
+        settings.maxLeafSize = maxLeafSize;
+        settings.travCost = travCost;
+        settings.intCost = intCost;
+        settings.singleLeafTimeSegment = false;
+        settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),total_bytes);
+
+        /* build hierarchy; sgrids.data() is taken only after the array was resized above */
+        GridRecalculatePrimRef recalculatePrimRef(scene,sgrids.data());
+        auto root =
+          BVHBuilderMSMBlur::build<NodeRef>(prims,pinfo,scene->device,
+                                            recalculatePrimRef,
+                                            typename BVH::CreateAlloc(bvh),
+                                            typename BVH::AABBNodeMB4D::Create(),
+                                            typename BVH::AABBNodeMB4D::Set(),
+                                            CreateMSMBlurLeafGrid<N>(scene,bvh,sgrids.data()),
+                                            bvh->scene->progressInterface,
+                                            settings);
+
+        bvh->set(root.ref,root.lbounds,pinfo.num_time_segments);
+      }
+
+      void clear() { // intentionally empty: this implementation performs no work
+      }
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE) // motion blur SAH builder factories for triangle meshes
+    Builder* BVH4Triangle4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,TriangleMesh,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); }
+    Builder* BVH4Triangle4vMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,TriangleMesh,Triangle4vMB>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); }
+#if defined(__AVX__) // BVH8 variants
+    Builder* BVH8Triangle4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,TriangleMesh,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); }
+    Builder* BVH8Triangle4vMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,TriangleMesh,Triangle4vMB>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); }
+#endif // __AVX__
+#endif // EMBREE_GEOMETRY_TRIANGLE
+
+#if defined(EMBREE_GEOMETRY_QUAD) // motion blur SAH builder factories for quad meshes
+    Builder* BVH4Quad4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,QuadMesh,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_QUAD_MESH); }
+#if defined(__AVX__) // BVH8 variant
+    Builder* BVH8Quad4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,QuadMesh,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_QUAD_MESH); }
+#endif // __AVX__
+#endif // EMBREE_GEOMETRY_QUAD
+
+#if defined(EMBREE_GEOMETRY_USER) // user geometry: leaf size limits are read from the device
+    Builder* BVH4VirtualMBSceneBuilderSAH    (void* bvh, Scene* scene, size_t mode) {
+      int minLeafSize = scene->device->object_accel_mb_min_leaf_size;
+      int maxLeafSize = scene->device->object_accel_mb_max_leaf_size;
+      return new BVHNBuilderMBlurSAH<4,UserGeometry,Object>((BVH4*)bvh,scene,4,1.0f,minLeafSize,maxLeafSize,Geometry::MTY_USER_GEOMETRY);
+    }
+#if defined(__AVX__) // BVH8 variant
+    Builder* BVH8VirtualMBSceneBuilderSAH    (void* bvh, Scene* scene, size_t mode) {
+      int minLeafSize = scene->device->object_accel_mb_min_leaf_size;
+      int maxLeafSize = scene->device->object_accel_mb_max_leaf_size;
+      return new BVHNBuilderMBlurSAH<8,UserGeometry,Object>((BVH8*)bvh,scene,8,1.0f,minLeafSize,maxLeafSize,Geometry::MTY_USER_GEOMETRY);
+    }
+#endif // __AVX__
+#endif // EMBREE_GEOMETRY_USER
+
+#if defined(EMBREE_GEOMETRY_INSTANCE) // instances: the geometry type mask is supplied by the caller
+    Builder* BVH4InstanceMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderMBlurSAH<4,Instance,InstancePrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype); }
+#if defined(__AVX__) // BVH8 variant
+    Builder* BVH8InstanceMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderMBlurSAH<8,Instance,InstancePrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype); }
+#endif // __AVX__
+#endif // EMBREE_GEOMETRY_INSTANCE
+
+#if defined(EMBREE_GEOMETRY_GRID) // grids: arguments after scene are sahBlockSize, intCost, minLeafSize, maxLeafSize
+    Builder* BVH4GridMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAHGrid<4>((BVH4*)bvh,scene,4,1.0f,4,4); }
+#if defined(__AVX__) // BVH8 variant
+    Builder* BVH8GridMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAHGrid<8>((BVH8*)bvh,scene,8,1.0f,8,8); }
+#endif // __AVX__
+#endif // EMBREE_GEOMETRY_GRID
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp
new file mode 100644
index 0000000000..285b38c39d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp
@@ -0,0 +1,201 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_builder.h"
+
+#include "../builders/primrefgen.h"
+#include "../builders/primrefgen_presplit.h"
+#include "../builders/splitter.h"
+
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+
+#include "../common/state.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /* leaf creation functor: packs a range of primitive references into Primitive blocks */
+    template<int N, typename Primitive>
+    struct CreateLeafSpatial
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+
+      __forceinline CreateLeafSpatial (BVH* bvh) : bvh(bvh) {}
+
+      __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+      {
+        const size_t numBlocks = Primitive::blocks(set.size());
+        Primitive* const leaf = (Primitive*) alloc.malloc1(numBlocks*sizeof(Primitive),BVH::byteAlignment);
+        const NodeRef node = BVH::encodeLeaf((char*)leaf,numBlocks);
+        /* the cursor is handed to fill() together with the range end for every block */
+        size_t cursor = set.begin();
+        for (size_t b=0; b<numBlocks; b++)
+          leaf[b].fill(prims,cursor,set.end(),bvh->scene);
+        return node;
+      }
+
+      BVH* bvh;
+    };
+
+    /* Spatial split SAH builder; it is constructed either for a whole scene or for a single mesh. */
+    template<int N, typename Mesh, typename Primitive, typename Splitter>
+    struct BVHNBuilderFastSpatialSAH : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      BVH* bvh;
+      Scene* scene;  // nullptr when the builder was created for a single mesh
+      Mesh* mesh;    // nullptr when the builder was created for a whole scene
+      mvector<PrimRef> prims0;
+      GeneralBVHBuilder::Settings settings;
+      const float splitFactor;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+      unsigned int numPreviousPrimitives = 0;
+      /* scene mode: builds over the primitives of type Mesh::geom_type in the scene */
+      BVHNBuilderFastSpatialSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode)
+        : bvh(bvh), scene(scene), mesh(nullptr), prims0(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD),
+          splitFactor(scene->device->max_spatial_split_replications) {}
+      /* mesh mode: the scene member stays nullptr, so the device must be taken from bvh */
+      BVHNBuilderFastSpatialSAH (BVH* bvh, Mesh* mesh, const unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode)
+        : bvh(bvh), scene(nullptr), mesh(mesh), prims0(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD),
+          splitFactor(bvh->device->max_spatial_split_replications), geomID_(geomID) {}
+
+      // FIXME: shrink bvh->alloc in destructor here and in other builders too
+
+      void build()
+      {
+        /* we reset the allocator when the mesh size changed */
+        if (mesh && mesh->numPrimitives != numPreviousPrimitives) {
+          bvh->alloc.clear();
+        }
+
+        /* skip build for empty scene */
+        const size_t numOriginalPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(Mesh::geom_type,false);
+        numPreviousPrimitives = numOriginalPrimitives;
+        if (numOriginalPrimitives == 0) {
+          prims0.clear();
+          bvh->clear();
+          return;
+        }
+        /* scene may be nullptr here (mesh mode): read the device settings through bvh */
+        const unsigned int maxGeomID = mesh ? geomID_ : scene->getMaxGeomID<Mesh,false>();
+        const bool usePreSplits = bvh->device->useSpatialPreSplits || (maxGeomID >= ((unsigned int)1 << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)));
+        double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + (usePreSplits ? "BuilderFastSpatialPresplitSAH" : "BuilderFastSpatialSAH"));
+
+        /* create primref array */
+        const size_t numSplitPrimitives = max(numOriginalPrimitives,size_t(splitFactor*numOriginalPrimitives));
+        prims0.resize(numSplitPrimitives);
+
+        /* enable os_malloc for two level build */
+        if (mesh)
+          bvh->alloc.setOSallocation(true);
+
+        NodeRef root(0);
+        PrimInfo pinfo;
+
+        if (likely(usePreSplits))
+          {
+            /* spatial presplit SAH BVH builder */
+            pinfo = mesh ?
+              createPrimRefArray_presplit<Mesh,Splitter>(mesh,maxGeomID,numOriginalPrimitives,prims0,bvh->scene->progressInterface) :
+              createPrimRefArray_presplit<Mesh,Splitter>(scene,Mesh::geom_type,false,numOriginalPrimitives,prims0,bvh->scene->progressInterface);
+
+            const size_t node_bytes = pinfo.size()*sizeof(typename BVH::AABBNode)/(4*N);
+            const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive));
+            bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+            settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+
+            settings.branchingFactor = N;
+            settings.maxDepth = BVH::maxBuildDepthLeaf;
+
+            /* call BVH builder */
+            root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeafSpatial<N,Primitive>(bvh),bvh->scene->progressInterface,prims0.data(),pinfo,settings);
+          }
+        else
+          {
+            /* standard spatial split SAH BVH builder */
+            pinfo = mesh ?
+              createPrimRefArray(mesh,geomID_,/*numSplitPrimitives,*/prims0,bvh->scene->progressInterface) :
+              createPrimRefArray(scene,Mesh::geom_type,false,/*numSplitPrimitives,*/prims0,bvh->scene->progressInterface);
+            /* mesh mode has no scene member; NOTE(review): assumes bvh->scene owns the mesh - confirm */
+            Splitter splitter(scene ? scene : bvh->scene);
+
+            const size_t node_bytes = pinfo.size()*sizeof(typename BVH::AABBNode)/(4*N);
+            const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive));
+            bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+            settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+
+            settings.branchingFactor = N;
+            settings.maxDepth = BVH::maxBuildDepthLeaf;
+
+            /* call BVH builder */
+            root = BVHBuilderBinnedFastSpatialSAH::build<NodeRef>(
+                                                                  typename BVH::CreateAlloc(bvh),
+                                                                  typename BVH::AABBNode::Create2(),
+                                                                  typename BVH::AABBNode::Set2(),
+                                                                  CreateLeafSpatial<N,Primitive>(bvh),
+                                                                  splitter,
+                                                                  bvh->scene->progressInterface,
+                                                                  prims0.data(),
+                                                                  numSplitPrimitives,
+                                                                  pinfo,settings);
+
+            /* ==================== */
+          }
+
+        bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size());
+        bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f));
+
+        /* clear temporary data for static geometry */
+        if (scene && scene->isStaticAccel()) {
+          prims0.clear();
+        }
+        bvh->cleanup();
+        bvh->postBuild(t0);
+      }
+
+      void clear() {
+        prims0.clear();
+      }
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE) // arguments after scene: sahBlockSize, intCost, minLeafSize, maxLeafSize, mode
+
+    Builder* BVH4Triangle4SceneBuilderFastSpatialSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); }
+    Builder* BVH4Triangle4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4v,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); }
+    Builder* BVH4Triangle4iSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4i,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); }
+
+#if defined(__AVX__) // BVH8 variants
+    Builder* BVH8Triangle4SceneBuilderFastSpatialSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,TriangleMesh,Triangle4,TriangleSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); }
+    Builder* BVH8Triangle4vSceneBuilderFastSpatialSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,TriangleMesh,Triangle4v,TriangleSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); }
+#endif // __AVX__
+#endif // EMBREE_GEOMETRY_TRIANGLE
+
+#if defined(EMBREE_GEOMETRY_QUAD) // same argument order as the triangle factories
+    Builder* BVH4Quad4vSceneBuilderFastSpatialSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,QuadMesh,Quad4v,QuadSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); }
+
+#if defined(__AVX__) // BVH8 variant
+    Builder* BVH8Quad4vSceneBuilderFastSpatialSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,QuadMesh,Quad4v,QuadSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); }
+#endif // __AVX__
+
+#endif // EMBREE_GEOMETRY_QUAD
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp
new file mode 100644
index 0000000000..1a78f347ac
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp
@@ -0,0 +1,377 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_builder_twolevel.h"
+#include "bvh_statistics.h"
+#include "../builders/bvh_builder_sah.h"
+#include "../common/scene_line_segments.h"
+#include "../common/scene_triangle_mesh.h"
+#include "../common/scene_quad_mesh.h"
+
+#define PROFILE 0
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N, typename Mesh, typename Primitive>
+    BVHNBuilderTwoLevel<N,Mesh,Primitive>::BVHNBuilderTwoLevel (BVH* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder, const size_t singleThreadThreshold)
+      : bvh(bvh), scene(scene), refs(scene->device,0), prims(scene->device,0), singleThreadThreshold(singleThreadThreshold), gtype(gtype), useMortonBuilder_(useMortonBuilder) {}
+    /* The constructor above only stores its arguments and creates empty refs/prims arrays; the destructor body below is empty. */
+    template<int N, typename Mesh, typename Primitive>
+    BVHNBuilderTwoLevel<N,Mesh,Primitive>::~BVHNBuilderTwoLevel () {
+    }
+
+    // ===========================================================================
+    // ===========================================================================
+    // ===========================================================================
+
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNBuilderTwoLevel<N,Mesh,Primitive>::build()
+    {
+      /* drop builders and per-object BVHs in slots beyond the current scene size */
+      size_t num = scene->size();
+      if (num < bvh->objects.size()) {
+        parallel_for(num, bvh->objects.size(), [&] (const range<size_t>& r) {
+            for (size_t i=r.begin(); i<r.end(); i++) {
+              builders[i].reset();
+              delete bvh->objects[i]; bvh->objects[i] = nullptr;
+            }
+          });
+      }
+      /* main build: set up a reference builder per mesh, collect their BuildRefs in refs, then build the top level over them */
+#if PROFILE
+      while(1) // with PROFILE set, the block below repeats until it returns
+#endif
+      {
+      /* reset memory allocator */
+      bvh->alloc.reset();
+      
+      /* skip build for empty scene */
+      const size_t numPrimitives = scene->getNumPrimitives(gtype,false);
+
+      if (numPrimitives == 0) {
+        prims.resize(0);
+        bvh->set(BVH::emptyNode,empty,0);
+        return;
+      }
+
+      /* estimate the bytes needed for nodes and leaves and hand the sum to the allocator */
+      const size_t numLeafBlocks = Primitive::blocks(numPrimitives);
+      const size_t node_bytes = 2*numLeafBlocks*sizeof(typename BVH::AABBNode)/N;
+      const size_t leaf_bytes = size_t(1.2*numLeafBlocks*sizeof(Primitive));
+      bvh->alloc.init_estimate(node_bytes+leaf_bytes); 
+
+      double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderTwoLevel");
+
+      /* grow the object and builder arrays if the scene got larger */
+      if (bvh->objects.size()  < num) bvh->objects.resize(num);
+      if (builders.size() < num) builders.resize(num);
+      resizeRefsList ();
+      nextRef.store(0); // attachBuildRefs fills refs starting at index 0
+      
+      /* choose a small or a large reference builder for every supported mesh */
+      parallel_for(size_t(0), num, [&] (const range<size_t>& r)
+      {
+        for (size_t objectID=r.begin(); objectID<r.end(); objectID++)
+        {
+          Mesh* mesh = scene->getSafe<Mesh>(objectID);
+      
+          /* ignore meshes we do not support */
+          if (mesh == nullptr || mesh->numTimeSteps != 1)
+            continue;
+          
+          if (isSmallGeometry(mesh)) {
+             setupSmallBuildRefBuilder (objectID, mesh);
+          } else {
+            setupLargeBuildRefBuilder (objectID, mesh);
+          }
+        }
+      });
+
+      /* let the builder of every enabled mesh append its build reference(s) to refs */
+      parallel_for(size_t(0), num, [&] (const range<size_t>& r)
+      {
+        for (size_t objectID=r.begin(); objectID<r.end(); objectID++)
+        {
+          /* ignore slots without a mesh of this type, disabled meshes and meshes with more than one time step */
+          Mesh* mesh = scene->getSafe<Mesh>(objectID);
+          if (mesh == nullptr || !mesh->isEnabled() || mesh->numTimeSteps != 1) 
+            continue;
+
+          builders[objectID]->attachBuildRefs (this);
+        }
+      });
+
+
+#if PROFILE
+      double d0 = getSeconds();
+#endif
+      /* fast path: exactly one build reference was produced, its node becomes the root */
+      if (nextRef == 1) { 
+        bvh->set(refs[0].node,LBBox3fa(refs[0].bounds()),numPrimitives);
+      }
+
+      else
+      {     
+        /* trim refs to the number of references actually produced */
+        refs.resize(nextRef);
+
+        /* this probably needs some more tuning */
+        const size_t extSize = max(max((size_t)SPLIT_MIN_EXT_SPACE,refs.size()*SPLIT_MEMORY_RESERVE_SCALE),size_t((float)numPrimitives / SPLIT_MEMORY_RESERVE_FACTOR));
+ 
+#if !ENABLE_DIRECT_SAH_MERGE_BUILDER
+
+#if ENABLE_OPEN_SEQUENTIAL
+        open_sequential(extSize); 
+#endif
+        /* compute PrimRefs */
+        prims.resize(refs.size());
+#endif
+        
+#if defined(TASKING_TBB) && defined(__AVX512ER__) && USE_TASK_ARENA // KNL
+        tbb::task_arena limited(min(32,(int)TaskScheduler::threadCount()));
+        limited.execute([&]
+#endif
+        {
+#if ENABLE_DIRECT_SAH_MERGE_BUILDER
+
+          const PrimInfo pinfo = parallel_reduce(size_t(0), refs.size(),  PrimInfo(empty), [&] (const range<size_t>& r) -> PrimInfo {
+
+              PrimInfo pinfo(empty);
+              for (size_t i=r.begin(); i<r.end(); i++) {
+                pinfo.add_center2(refs[i]);
+              }
+              return pinfo;
+            }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); });
+          
+#else
+          const PrimInfo pinfo = parallel_reduce(size_t(0), refs.size(),  PrimInfo(empty), [&] (const range<size_t>& r) -> PrimInfo {
+
+              PrimInfo pinfo(empty);
+              for (size_t i=r.begin(); i<r.end(); i++) {
+                pinfo.add_center2(refs[i]);
+                prims[i] = PrimRef(refs[i].bounds(),(size_t)refs[i].node);
+              }
+              return pinfo;
+            }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); });
+#endif   
+       
+          /* skip if all objects were empty */
+          if (pinfo.size() == 0)
+            bvh->set(BVH::emptyNode,empty,0);
+        
+          /* otherwise build toplevel hierarchy */
+          else
+          {
+            /* settings for BVH build */
+            GeneralBVHBuilder::Settings settings;
+            settings.branchingFactor = N;
+            settings.maxDepth = BVH::maxBuildDepthLeaf;
+            settings.logBlockSize = bsr(N);
+            settings.minLeafSize = 1;
+            settings.maxLeafSize = 1;
+            settings.travCost = 1.0f;
+            settings.intCost = 1.0f;
+            settings.singleThreadThreshold = singleThreadThreshold;
+      
+#if ENABLE_DIRECT_SAH_MERGE_BUILDER
+            /* give the open/merge builder extSize entries to work with */
+            refs.resize(extSize); 
+         
+            NodeRef root = BVHBuilderBinnedOpenMergeSAH::build<NodeRef,BuildRef>(
+              typename BVH::CreateAlloc(bvh),
+              typename BVH::AABBNode::Create2(),
+              typename BVH::AABBNode::Set2(),
+              
+              [&] (const BuildRef* refs, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> NodeRef  {
+                assert(range.size() == 1);
+                return (NodeRef) refs[range.begin()].node;
+              },
+              [&] (BuildRef &bref, BuildRef *refs) -> size_t { 
+                return openBuildRef(bref,refs);
+              },              
+              [&] (size_t dn) { bvh->scene->progressMonitor(0); },
+              refs.data(),extSize,pinfo,settings);
+#else
+            NodeRef root = BVHBuilderBinnedSAH::build<NodeRef>(
+              typename BVH::CreateAlloc(bvh),
+              typename BVH::AABBNode::Create2(),
+              typename BVH::AABBNode::Set2(),
+              
+              [&] (const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> NodeRef {
+                assert(range.size() == 1);
+                return (NodeRef) prims[range.begin()].ID();
+              },
+              [&] (size_t dn) { bvh->scene->progressMonitor(0); },
+              prims.data(),pinfo,settings);
+#endif
+
+            
+            bvh->set(root,LBBox3fa(pinfo.geomBounds),numPrimitives);
+          }
+        }
+#if defined(TASKING_TBB) && defined(__AVX512ER__) && USE_TASK_ARENA // KNL
+          );
+#endif
+
+      }  
+        
+      bvh->alloc.cleanup();
+      bvh->postBuild(t0);
+#if PROFILE
+      double d1 = getSeconds();
+      std::cout << "TOP_LEVEL OPENING/REBUILD TIME " << 1000.0*(d1-d0) << " ms" << std::endl;
+#endif
+      }
+
+    }
+    
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNBuilderTwoLevel<N,Mesh,Primitive>::deleteGeometry(size_t geomID)
+    {
+      if (!(geomID < bvh->objects.size())) return;   // ids beyond the object array: nothing to release
+      builders[geomID].reset();                      // reset() on an empty slot does nothing
+      auto& object = bvh->objects[geomID]; delete object; object = nullptr;
+    }
+
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNBuilderTwoLevel<N,Mesh,Primitive>::clear()
+    {
+      /* clear each per-object BVH that exists; the BVH objects themselves are not deleted here */
+      for (size_t objectID=0; objectID<bvh->objects.size(); objectID++) {
+        if (bvh->objects[objectID] != nullptr) bvh->objects[objectID]->clear();
+      }
+      /* drop every reference builder (reset() on an empty slot does nothing) and forget all build references */
+      for (auto& builder : builders) builder.reset();
+      refs.clear();
+    }
+
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNBuilderTwoLevel<N,Mesh,Primitive>::open_sequential(const size_t extSize)
+    {
+      if (refs.size() == 0)
+	return;
+      /* the loop below never lets refs grow past extSize entries, so reserve that much up front */
+      refs.reserve(extSize);
+      /* prefetch every inner node that is currently referenced */
+#if 1
+      for (size_t i=0;i<refs.size();i++)
+      {
+        NodeRef ref = refs[i].node;
+        if (ref.isAABBNode())
+          BVH::prefetch(ref);
+      }
+#endif
+      /* heap ordered by BuildRef::operator< (bounds_area): repeatedly take the largest reference and replace it by its children */
+      std::make_heap(refs.begin(),refs.end());
+      while (refs.size()+N-1 <= extSize)
+      {
+        std::pop_heap (refs.begin(),refs.end()); 
+        NodeRef ref = refs.back().node;
+        if (ref.isLeaf()) break;
+        refs.pop_back();    
+        /* the popped reference is not a leaf: push one reference per non-empty child */
+        AABBNode* node = ref.getAABBNode();
+        for (size_t i=0; i<N; i++) {
+          if (node->child(i) == BVH::emptyNode) continue;
+          refs.push_back(BuildRef(node->bounds(i),node->child(i)));
+         
+#if 1
+          NodeRef ref_pre = node->child(i);
+          if (ref_pre.isAABBNode())
+            ref_pre.prefetch();
+#endif
+          std::push_heap (refs.begin(),refs.end()); 
+        }
+      }
+    }
+
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNBuilderTwoLevel<N,Mesh,Primitive>::setupSmallBuildRefBuilder (size_t objectID, Mesh const * const /*mesh*/)
+    {
+      /* keep an existing RefBuilderSmall; a missing builder (new mesh) or one of another kind (large->small change) is replaced */
+      auto* const current = builders[objectID].get();
+      const bool reusable = current != nullptr && dynamic_cast<RefBuilderSmall*>(current) != nullptr;
+      if (!reusable)
+        builders[objectID].reset (new RefBuilderSmall(objectID));
+    }
+
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNBuilderTwoLevel<N,Mesh,Primitive>::setupLargeBuildRefBuilder (size_t objectID, Mesh const * const mesh)
+    {
+      /* (re)creates the per-object BVH and its RefBuilderLarge when needed; clear() empties builders but keeps bvh->objects, so test the builder before using it */
+      if (bvh->objects[objectID] == nullptr || builders[objectID] == nullptr || // new mesh, or builder dropped by clear()
+          builders[objectID]->meshQualityChanged (mesh->quality) ||             // changed build quality
+          dynamic_cast<RefBuilderLarge*>(builders[objectID].get()) == nullptr)  // size change resulted in small->large change
+      {
+        Builder* builder = nullptr;
+        delete bvh->objects[objectID]; createMeshAccel(objectID, builder);      // createMeshAccel stores a fresh BVH in the slot just freed
+        builders[objectID].reset (new RefBuilderLarge(objectID, builder, mesh->quality));
+      }
+    }
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE) // factories: each returns a newly allocated BVHNBuilderTwoLevel over a BVH4 for triangle meshes
+    Builder* BVH4BuilderTwoLevelTriangle4MeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+    }
+    Builder* BVH4BuilderTwoLevelTriangle4vMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4v>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+    }
+    Builder* BVH4BuilderTwoLevelTriangle4iMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4i>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+    }
+#endif
+    /* BVH4 two-level builder for quad meshes with Quad4v primitives */
+#if defined(EMBREE_GEOMETRY_QUAD)
+    Builder* BVH4BuilderTwoLevelQuadMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+    return new BVHNBuilderTwoLevel<4,QuadMesh,Quad4v>((BVH4*)bvh,scene,QuadMesh::geom_type,useMortonBuilder);
+    }
+#endif
+    /* BVH4 two-level builder for user geometries with Object primitives */
+#if defined(EMBREE_GEOMETRY_USER)
+    Builder* BVH4BuilderTwoLevelVirtualSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+    return new BVHNBuilderTwoLevel<4,UserGeometry,Object>((BVH4*)bvh,scene,UserGeometry::geom_type,useMortonBuilder);
+    }
+#endif
+    /* BVH4 two-level builder for instances; unlike the factories above, the geometry type mask is supplied by the caller */
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    Builder* BVH4BuilderTwoLevelInstanceSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<4,Instance,InstancePrimitive>((BVH4*)bvh,scene,gtype,useMortonBuilder);
+    }
+#endif
+
+#if defined(__AVX__) // BVH8 counterparts of the BVH4 two-level factories; only compiled when __AVX__ is defined
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    Builder* BVH8BuilderTwoLevelTriangle4MeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+    }
+    Builder* BVH8BuilderTwoLevelTriangle4vMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4v>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+    }
+    Builder* BVH8BuilderTwoLevelTriangle4iMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4i>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+    }
+#endif
+    /* BVH8 two-level builder for quad meshes with Quad4v primitives */
+#if defined(EMBREE_GEOMETRY_QUAD)
+    Builder* BVH8BuilderTwoLevelQuadMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<8,QuadMesh,Quad4v>((BVH8*)bvh,scene,QuadMesh::geom_type,useMortonBuilder);
+    }
+#endif
+    /* BVH8 two-level builder for user geometries with Object primitives */
+#if defined(EMBREE_GEOMETRY_USER)
+    Builder* BVH8BuilderTwoLevelVirtualSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<8,UserGeometry,Object>((BVH8*)bvh,scene,UserGeometry::geom_type,useMortonBuilder);
+    }
+#endif
+    /* BVH8 two-level builder for instances; the geometry type mask is supplied by the caller */
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    Builder* BVH8BuilderTwoLevelInstanceSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<8,Instance,InstancePrimitive>((BVH8*)bvh,scene,gtype,useMortonBuilder);
+    }
+#endif
+
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h
new file mode 100644
index 0000000000..8f57c3b406
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h
@@ -0,0 +1,263 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <type_traits>
+
+#include "bvh_builder_twolevel_internal.h"
+#include "bvh.h"
+#include "../common/primref.h"
+#include "../builders/priminfo.h"
+#include "../builders/primrefgen.h"
+
+/* new open/merge builder */
+#define ENABLE_DIRECT_SAH_MERGE_BUILDER 1
+#define ENABLE_OPEN_SEQUENTIAL 0
+#define SPLIT_MEMORY_RESERVE_FACTOR 1000
+#define SPLIT_MEMORY_RESERVE_SCALE 2
+#define SPLIT_MIN_EXT_SPACE 1000
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N, typename Mesh, typename Primitive>
+    class BVHNBuilderTwoLevel : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::AABBNode AABBNode;
+      typedef typename BVH::NodeRef NodeRef;
+
+      __forceinline static bool isSmallGeometry(Mesh* mesh) { // decides between RefBuilderSmall and RefBuilderLarge in build()
+        return mesh->size() <= 4; // "small" means mesh->size() is at most 4
+      }
+
+    public:
+
+      typedef void (*createMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder);
+
+      struct BuildRef : public PrimRef
+      {
+      public:
+        __forceinline BuildRef () {}
+
+        /* reference built from bounds and the node only; the node value is also handed to the PrimRef base */
+        __forceinline BuildRef (const BBox3fa& bounds, NodeRef node)
+          : PrimRef(bounds,(size_t)node), node(node)
+        {
+          /* leaves get the sort key 0, every other node the area of its bounds */
+          bounds_area = node.isLeaf() ? 0.0f : area(this->bounds());
+        }
+
+        /* used by the open/merge bvh builder: the PrimRef base carries geomID and a primitive count */
+        __forceinline BuildRef (const BBox3fa& bounds, NodeRef node, const unsigned int geomID, const unsigned int numPrimitives)
+          : PrimRef(bounds,geomID,numPrimitives), node(node)
+        {
+          /* important for relative buildref ordering: leaves get the sort key 0 */
+          bounds_area = node.isLeaf() ? 0.0f : area(this->bounds());
+        }
+
+        /* primitive count, read back through primID() */
+        __forceinline unsigned int numPrimitives() const { return primID(); }
+
+        /* returns primID() as well, as size_t */
+        __forceinline size_t size() const { return primID(); }
+
+        /* orders references by bounds_area only */
+        friend bool operator< (const BuildRef& lhs, const BuildRef& rhs) {
+          return lhs.bounds_area < rhs.bounds_area;
+        }
+
+        friend __forceinline embree_ostream operator<<(embree_ostream cout, const BuildRef& ref) {
+          return cout << "{ lower = " << ref.lower << ", upper = " << ref.upper
+                      << ", center2 = " << ref.center2() << ", geomID = " << ref.geomID()
+                      << ", numPrimitives = " << ref.numPrimitives()
+                      << ", bounds_area = " << ref.bounds_area << " }";
+        }
+
+      public:
+        NodeRef node;
+        float bounds_area;
+      };
+
+
+      /*! Opens bref: writes a copy of a leaf, or one BuildRef per non-empty child, to refs[0..n) and returns n; refs needs room for N entries. */
+      __forceinline size_t openBuildRef(BuildRef &bref, BuildRef *const refs) {
+        if (bref.node.isLeaf()) {
+          refs[0] = bref;
+          return 1;
+        }
+        NodeRef ref = bref.node;
+        unsigned int geomID   = bref.geomID();
+        unsigned int numPrims = max((unsigned int)bref.numPrimitives() / N,(unsigned int)1); // each child gets 1/N of the parent's count, at least 1
+        AABBNode* node = ref.getAABBNode();
+        size_t n = 0;
+        for (size_t i=0; i<N; i++) {
+          if (node->child(i) == BVH::emptyNode) continue;
+          /* index by n, not i: the first n output entries stay valid even if an empty child precedes a used one */
+          refs[n++] = BuildRef(node->bounds(i),node->child(i),geomID,numPrims);
+        }
+        assert(n > 1);
+        return n;        
+      }
+      
+      /*! Constructor. */
+      BVHNBuilderTwoLevel (BVH* bvh, Scene* scene, Geometry::GTypeMask gtype = Mesh::geom_type, bool useMortonBuilder = false, const size_t singleThreadThreshold = DEFAULT_SINGLE_THREAD_THRESHOLD);
+      
+      /*! Destructor */
+      ~BVHNBuilderTwoLevel ();
+      
+      /*! builder entry point */
+      void build();
+      void deleteGeometry(size_t geomID);
+      void clear();
+
+      void open_sequential(const size_t extSize);
+      
+    private:
+
+      class RefBuilderBase { // interface of the per-geometry reference builders kept in `builders`
+      public:
+        virtual ~RefBuilderBase () {}
+        virtual void attachBuildRefs (BVHNBuilderTwoLevel* builder) = 0;    // writes this geometry's BuildRef(s) into builder->refs
+        virtual bool meshQualityChanged (RTCBuildQuality currQuality) = 0;  // a true result makes setupLargeBuildRefBuilder recreate the builder
+      };
+
+      class RefBuilderSmall : public RefBuilderBase {
+      public:
+        /* reference builder chosen by build() for meshes that pass isSmallGeometry() */
+        RefBuilderSmall (size_t objectID)
+          : objectID_ (objectID) {}
+
+        void attachBuildRefs (BVHNBuilderTwoLevel* topBuilder) {
+          /* creates leaf primitives for the mesh in the top-level allocator and appends one BuildRef per leaf to topBuilder->refs */
+          Mesh* mesh = topBuilder->scene->template getSafe<Mesh>(objectID_);
+          size_t meshSize = mesh->size();
+          assert(isSmallGeometry(mesh));
+          
+          mvector<PrimRef> prefs(topBuilder->scene->device, meshSize);
+          auto pinfo = createPrimRefArray(mesh,objectID_,prefs,topBuilder->bvh->scene->progressInterface);
+          /* NOTE(review): `begin` is only passed to accel->fill below, so fill presumably advances it -- confirm */
+          size_t begin=0;
+          while (begin < pinfo.size())
+          {
+            Primitive* accel = (Primitive*) topBuilder->bvh->alloc.getCachedAllocator().malloc1(sizeof(Primitive),BVH::byteAlignment);
+            typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,1);
+            accel->fill(prefs.data(),begin,pinfo.size(),topBuilder->bvh->scene);
+            
+            /* create build primitive; every leaf of this mesh is registered with pinfo.geomBounds as its bounds */
+#if ENABLE_DIRECT_SAH_MERGE_BUILDER
+            topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(pinfo.geomBounds,node,(unsigned int)objectID_,1);
+#else
+            topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(pinfo.geomBounds,node);
+#endif
+          }
+          assert(begin == pinfo.size());
+        }
+        /* always false: this builder does not record a build quality */
+        bool meshQualityChanged (RTCBuildQuality /*currQuality*/) {
+          return false;
+        }
+        
+        size_t  objectID_;
+      };
+
+      class RefBuilderLarge : public RefBuilderBase {
+      public:
+        /* reference builder for meshes with their own per-object BVH; keeps the mesh builder and the quality it was created with */
+        RefBuilderLarge (size_t objectID, const Ref<Builder>& builder, RTCBuildQuality quality)
+        : objectID_ (objectID), builder_ (builder), quality_ (quality) {}
+
+        void attachBuildRefs (BVHNBuilderTwoLevel* topBuilder)
+        {
+          BVH* object  = topBuilder->getBVH(objectID_); assert(object);
+          
+          /* build object if it got modified */
+          if (topBuilder->isGeometryModified(objectID_))
+            builder_->build();
+
+          /* create build primitive; objects with empty bounds contribute no reference */
+          if (!object->getBounds().empty())
+          {
+#if ENABLE_DIRECT_SAH_MERGE_BUILDER
+            Mesh* mesh = topBuilder->getMesh(objectID_);
+            topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(object->getBounds(),object->root,(unsigned int)objectID_,(unsigned int)mesh->size());
+#else
+            topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(object->getBounds(),object->root);
+#endif
+          }
+        }
+        /* true when currQuality differs from the quality passed to the constructor */
+        bool meshQualityChanged (RTCBuildQuality currQuality) {
+          return currQuality != quality_;
+        }
+
+      private:
+        size_t          objectID_;
+        Ref<Builder>    builder_;
+        RTCBuildQuality quality_;
+      };
+
+      void setupLargeBuildRefBuilder (size_t objectID, Mesh const * const mesh);
+      void setupSmallBuildRefBuilder (size_t objectID, Mesh const * const mesh);
+
+      BVH*  getBVH (size_t objectID) { // per-object BVH stored in slot objectID (no bounds check)
+        return this->bvh->objects[objectID];
+      }
+      Mesh* getMesh (size_t objectID) { // result of scene->getSafe<Mesh>; build() treats nullptr as "no mesh of this type"
+        return this->scene->template getSafe<Mesh>(objectID);
+      }
+      bool  isGeometryModified (size_t objectID) { // forwards to the scene
+        return this->scene->isGeometryModified(objectID);
+      }
+
+      void resizeRefsList ()
+      {
+        /* slots needed: Primitive::blocks(size) per small mesh, one per large mesh, none for meshes build() skips */
+        auto countSlots = [this](const range<size_t>& r)->size_t {
+          size_t slots = 0;
+          for (auto geomID=r.begin(); geomID<r.end(); ++geomID) {
+            Mesh* mesh = scene->getSafe<Mesh>(geomID);
+            if (mesh != nullptr && mesh->numTimeSteps == 1) {
+              size_t meshSize = mesh->size();
+              if (isSmallGeometry(mesh)) slots += Primitive::blocks(meshSize);
+              else                       slots += 1;
+            }
+          }
+          return slots;
+        };
+        size_t required = parallel_reduce (size_t(0), scene->size(), size_t(0), countSlots, std::plus<size_t>());
+
+        /* the list is only ever grown here, never shrunk */
+        if (refs.size() < required)
+          refs.resize(required);
+      }
+
+      void createMeshAccel (size_t geomID, Builder*& builder) // allocates the per-object BVH for geomID and asks MeshBuilder for its builder
+      {
+        bvh->objects[geomID] = new BVH(Primitive::type,scene); // stored before the mesh lookup, so the slot holds the new BVH even on the error path
+        BVH* accel = bvh->objects[geomID];
+        auto mesh = scene->getSafe<Mesh>(geomID);
+        if (nullptr == mesh) {
+          throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"geomID does not return correct type");
+          return; // NOTE(review): presumably unreachable if throw_RTCError always throws -- confirm
+        }
+        /* NOTE(review): MeshBuilder presumably returns the new mesh builder through `builder` -- confirm in bvh_builder_twolevel_internal.h */
+        __internal_two_level_builder__::MeshBuilder<N,Mesh,Primitive>()(accel, mesh, geomID, this->gtype, this->useMortonBuilder_, builder);
+      }      
+
+      using BuilderList = std::vector<std::unique_ptr<RefBuilderBase>>;
+
+      BuilderList         builders;
+      BVH*                bvh;
+      Scene*              scene;      
+      mvector<BuildRef>   refs;
+      mvector<PrimRef>    prims;
+      std::atomic<int>    nextRef;
+      const size_t        singleThreadThreshold;
+      Geometry::GTypeMask gtype;
+      bool                useMortonBuilder_ = false;
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h
new file mode 100644
index 0000000000..1c1ae8d6a7
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h
@@ -0,0 +1,267 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+
+namespace embree
+{
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); // BVH4 per-mesh builder entry points; invoked by the N=4 functors in isa::__internal_two_level_builder__ below
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshBuilderMortonGeneral,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshBuilderSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshBuilderMortonGeneral,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshBuilderSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); // Instance variants take an extra Geometry::GTypeMask argument
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t) // NOTE(review): no trailing ';' unlike the siblings -- presumably tolerated by the macro expansion; confirm
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); // BVH8 counterparts of the declarations above
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshBuilderMortonGeneral,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshBuilderSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshBuilderMortonGeneral,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshBuilderSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); 
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t) // NOTE(review): no trailing ';' unlike the siblings -- presumably tolerated by the macro expansion; confirm
+  
+  namespace isa
+  {
+
+    namespace __internal_two_level_builder__ {
+
+      template<int N, typename Mesh, typename Primitive>
+      struct MortonBuilder {}; // empty primary template: only the (N, Mesh, Primitive) combinations specialized below provide operator()
+      template<>
+      struct MortonBuilder<4,TriangleMesh,Triangle4> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshBuilderMortonGeneral(bvh,mesh,geomID,0);} // geomID is size_t here but the DECLARE_ISA_FUNCTION list names unsigned int for that slot; last argument is always 0
+      };
+      template<>
+      struct MortonBuilder<4,TriangleMesh,Triangle4v> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<4,TriangleMesh,Triangle4i> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<4,QuadMesh,Quad4v> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<4,UserGeometry,Object> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<4,Instance,InstancePrimitive> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);} // only the Instance specializations forward gtype; the others ignore it
+      };
+      template<>
+      struct MortonBuilder<8,TriangleMesh,Triangle4> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<8,TriangleMesh,Triangle4v> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<8,TriangleMesh,Triangle4i> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<8,QuadMesh,Quad4v> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<8,UserGeometry,Object> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<8,Instance,InstancePrimitive> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);}
+      };
+
+      template<int N, typename Mesh, typename Primitive>
+      struct SAHBuilder {}; // empty primary template: only the (N, Mesh, Primitive) combinations specialized below provide operator()
+      template<>
+      struct SAHBuilder<4,TriangleMesh,Triangle4> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshBuilderSAH(bvh,mesh,geomID,0);} // forwards to the matching *MeshBuilderSAH entry point; last argument is always 0
+      };
+      template<>
+      struct SAHBuilder<4,TriangleMesh,Triangle4v> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<4,TriangleMesh,Triangle4i> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<4,QuadMesh,Quad4v> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<4,UserGeometry,Object> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<4,Instance,InstancePrimitive> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);} // only the Instance specializations forward gtype; the others ignore it
+      };
+      template<>
+      struct SAHBuilder<8,TriangleMesh,Triangle4> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<8,TriangleMesh,Triangle4v> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<8,TriangleMesh,Triangle4i> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<8,QuadMesh,Quad4v> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<8,UserGeometry,Object> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<8,Instance,InstancePrimitive> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);}
+      };
+
+      template<int N, typename Mesh, typename Primitive>
+      struct RefitBuilder {}; // empty primary template: only the (N, Mesh, Primitive) combinations specialized below provide operator()
+      template<>
+      struct RefitBuilder<4,TriangleMesh,Triangle4> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshRefitSAH(bvh,mesh,geomID,0);} // forwards to the matching *MeshRefitSAH entry point; last argument is always 0
+      };
+      template<>
+      struct RefitBuilder<4,TriangleMesh,Triangle4v> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<4,TriangleMesh,Triangle4i> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<4,QuadMesh,Quad4v> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<4,UserGeometry,Object> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<4,Instance,InstancePrimitive> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);} // only the Instance specializations forward gtype; the others ignore it
+      };
+      template<>
+      struct RefitBuilder<8,TriangleMesh,Triangle4> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<8,TriangleMesh,Triangle4v> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<8,TriangleMesh,Triangle4i> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<8,QuadMesh,Quad4v> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<8,UserGeometry,Object> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<8,Instance,InstancePrimitive> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);}
+      };
+      
+      template<int N, typename Mesh, typename Primitive>
+      struct MeshBuilder { // functor that decides which builder kind (Morton, SAH or refit) to create for one mesh
+        MeshBuilder () {}
+        void operator () (void* bvh, Mesh* mesh, size_t geomID, Geometry::GTypeMask gtype, bool useMortonBuilder, Builder*& builder) { // the chosen builder is written to `builder`
+          if(useMortonBuilder) { // explicit override: Morton builder regardless of mesh->quality
+            builder = MortonBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype);
+            return;
+          }
+          switch (mesh->quality) {
+            case RTC_BUILD_QUALITY_LOW:    builder = MortonBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); break;
+            case RTC_BUILD_QUALITY_MEDIUM: // falls through: MEDIUM and HIGH both use the SAH builder
+            case RTC_BUILD_QUALITY_HIGH:   builder = SAHBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); break;
+            case RTC_BUILD_QUALITY_REFIT:  builder = RefitBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); break;
+            default: throw_RTCError(RTC_ERROR_UNKNOWN,"invalid build quality"); // `builder` is not assigned on this path
+          }
+        }
+      };
+    }
+  }
+}
+\ No newline at end of file
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp
new file mode 100644
index 0000000000..a27be8bae8
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp
@@ -0,0 +1,375 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_collider.h"
+#include "../geometry/triangle_triangle_intersector.h"
+
+namespace embree
+{ 
+  namespace isa
+  {
+#define CSTAT(x)
+
+    size_t parallel_depth_threshold = 3; // within this file only read inside the '#if 0' parallel branches of collide_recurse
+    CSTAT(std::atomic<size_t> bvh_collide_traversal_steps(0)); // CSTAT(x) is defined empty above, so none of these statistics counters exist in the build
+    CSTAT(std::atomic<size_t> bvh_collide_leaf_pairs(0));
+    CSTAT(std::atomic<size_t> bvh_collide_leaf_iterations(0));
+    CSTAT(std::atomic<size_t> bvh_collide_prim_intersections1(0));
+    CSTAT(std::atomic<size_t> bvh_collide_prim_intersections2(0));
+    CSTAT(std::atomic<size_t> bvh_collide_prim_intersections3(0));
+    CSTAT(std::atomic<size_t> bvh_collide_prim_intersections4(0));
+    CSTAT(std::atomic<size_t> bvh_collide_prim_intersections5(0));
+    CSTAT(std::atomic<size_t> bvh_collide_prim_intersections(0));
+
+    struct Collision // one reported primitive pair: (geomID0, primID0) against (geomID1, primID1)
+    {
+      __forceinline Collision() {} // leaves all four IDs uninitialized
+
+      __forceinline Collision (unsigned geomID0, unsigned primID0, unsigned geomID1, unsigned primID1)
+        : geomID0(geomID0), primID0(primID0), geomID1(geomID1), primID1(primID1) {}
+
+      unsigned geomID0; // processLeaf() casts arrays of this struct to RTCCollision*, so the member order presumably has to match that type -- confirm
+      unsigned primID0;
+      unsigned geomID1;
+      unsigned primID1;
+    };
+    
+    template<int N>
+    __forceinline size_t overlap(const BBox3fa& box0, const typename BVHN<N>::AABBNode& node1) // tests box0 against the N child boxes stored in node1
+    {
+      const vfloat<N> lower_x = max(vfloat<N>(box0.lower.x),node1.lower_x); // per-lane lower corner of (box0 intersect child box)
+      const vfloat<N> lower_y = max(vfloat<N>(box0.lower.y),node1.lower_y);
+      const vfloat<N> lower_z = max(vfloat<N>(box0.lower.z),node1.lower_z);
+      const vfloat<N> upper_x = min(vfloat<N>(box0.upper.x),node1.upper_x); // per-lane upper corner of the same intersection
+      const vfloat<N> upper_y = min(vfloat<N>(box0.upper.y),node1.upper_y);
+      const vfloat<N> upper_z = min(vfloat<N>(box0.upper.z),node1.upper_z);
+      return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z)); // lanes where lower <= upper on all three axes (touching boxes count); callers iterate the result as a per-child bitmask
+    }
+
+    template<int N>
+    __forceinline size_t overlap(const BBox3fa& box0, const BBox<Vec3<vfloat<N>>>& box1) // same test as the AABBNode overload, with the N boxes supplied as a BBox of vector components
+    {
+      const vfloat<N> lower_x = max(vfloat<N>(box0.lower.x),box1.lower.x); // per-lane lower corner of the intersection
+      const vfloat<N> lower_y = max(vfloat<N>(box0.lower.y),box1.lower.y);
+      const vfloat<N> lower_z = max(vfloat<N>(box0.lower.z),box1.lower.z);
+      const vfloat<N> upper_x = min(vfloat<N>(box0.upper.x),box1.upper.x); // per-lane upper corner of the intersection
+      const vfloat<N> upper_y = min(vfloat<N>(box0.upper.y),box1.upper.y);
+      const vfloat<N> upper_z = min(vfloat<N>(box0.upper.z),box1.upper.z);
+      return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z)); // lanes where lower <= upper on all three axes
+    }
+
+    template<int N>
+    __forceinline size_t overlap(const BBox<Vec3<vfloat<N>>>& box0, size_t i, const BBox<Vec3<vfloat<N>>>& box1) // tests lane i of box0 against all N boxes of box1
+    {
+      const vfloat<N> lower_x = max(vfloat<N>(box0.lower.x[i]),box1.lower.x); // lane i of box0 is broadcast to all lanes before the compare
+      const vfloat<N> lower_y = max(vfloat<N>(box0.lower.y[i]),box1.lower.y);
+      const vfloat<N> lower_z = max(vfloat<N>(box0.lower.z[i]),box1.lower.z);
+      const vfloat<N> upper_x = min(vfloat<N>(box0.upper.x[i]),box1.upper.x);
+      const vfloat<N> upper_y = min(vfloat<N>(box0.upper.y[i]),box1.upper.y);
+      const vfloat<N> upper_z = min(vfloat<N>(box0.upper.z[i]),box1.upper.z);
+      return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z)); // lanes where lower <= upper on all three axes
+    }
+
+    bool intersect_triangle_triangle (Scene* scene0, unsigned geomID0, unsigned primID0, Scene* scene1, unsigned geomID1, unsigned primID1) // true when the two referenced triangles intersect, after culling trivial self-collision cases
+    {
+      CSTAT(bvh_collide_prim_intersections1++);
+      const TriangleMesh* mesh0 = scene0->get<TriangleMesh>(geomID0);
+      const TriangleMesh* mesh1 = scene1->get<TriangleMesh>(geomID1);
+      const TriangleMesh::Triangle& tri0 = mesh0->triangle(primID0);
+      const TriangleMesh::Triangle& tri1 = mesh1->triangle(primID1);
+      
+      /* special culling when a geometry is tested against itself (same scene and same geomID) */
+      if (scene0 == scene1 && geomID0 == geomID1)
+      {
+        /* a triangle is never reported as colliding with itself */
+        if (primID0 == primID1)
+          return false;
+      }
+      CSTAT(bvh_collide_prim_intersections2++);
+      
+      if (scene0 == scene1 && geomID0 == geomID1)
+      {
+        /* ignore topological neighbors: triangles of the same mesh that share a vertex index */
+        const vint4 t0(tri0.v[0],tri0.v[1],tri0.v[2],tri0.v[2]); // fourth lane repeats v[2] as padding
+        if (any(vint4(tri1.v[0]) == t0)) return false;
+        if (any(vint4(tri1.v[1]) == t0)) return false;
+        if (any(vint4(tri1.v[2]) == t0)) return false;
+      }
+      CSTAT(bvh_collide_prim_intersections3++);
+      
+      const Vec3fa a0 = mesh0->vertex(tri0.v[0]); // a0..a2: vertices of the first triangle
+      const Vec3fa a1 = mesh0->vertex(tri0.v[1]);
+      const Vec3fa a2 = mesh0->vertex(tri0.v[2]);
+      const Vec3fa b0 = mesh1->vertex(tri1.v[0]); // b0..b2: vertices of the second triangle
+      const Vec3fa b1 = mesh1->vertex(tri1.v[1]);
+      const Vec3fa b2 = mesh1->vertex(tri1.v[2]);
+      
+      return TriangleTriangleIntersector::intersect_triangle_triangle(a0,a1,a2,b0,b1,b2);
+    }
+    
+    template<int N>
+    __forceinline void BVHNColliderUserGeom<N>::processLeaf(NodeRef node0, NodeRef node1) // hands every primitive pair of the two leaves to the user callback in batches of up to 16; no geometric test is done here
+    {
+      Collision collisions[16]; // batch buffer passed to the callback
+      size_t num_collisions = 0;
+
+      size_t N0; Object* leaf0 = (Object*) node0.leaf(N0); // N0 is used below as the number of Objects in leaf0 (presumably filled in by leaf())
+      size_t N1; Object* leaf1 = (Object*) node1.leaf(N1);
+      for (size_t i=0; i<N0; i++) {
+        for (size_t j=0; j<N1; j++) {
+          const unsigned geomID0 = leaf0[i].geomID();
+          const unsigned primID0 = leaf0[i].primID();
+          const unsigned geomID1 = leaf1[j].geomID();
+          const unsigned primID1 = leaf1[j].primID();
+          if (this->scene0 == this->scene1 && geomID0 == geomID1 && primID0 == primID1) continue; // never pair a primitive with itself
+          collisions[num_collisions++] = Collision(geomID0,primID0,geomID1,primID1);
+          if (num_collisions == 16) { // buffer full: flush it and start a new batch
+            this->callback(this->userPtr,(RTCCollision*)&collisions,num_collisions); // cast assumes Collision is layout-compatible with RTCCollision -- TODO confirm
+            num_collisions = 0;
+          }
+        }
+      }
+      if (num_collisions) // flush the partially filled final batch
+        this->callback(this->userPtr,(RTCCollision*)&collisions,num_collisions);
+    }
+
+    template<int N>
+    void BVHNCollider<N>::collide_recurse(NodeRef ref0, const BBox3fa& bounds0, NodeRef ref1, const BBox3fa& bounds1, size_t depth0, size_t depth1) // recursive pairwise descent of two BVH subtrees; leaf/leaf pairs go to processLeaf()
+    {
+      CSTAT(bvh_collide_traversal_steps++);
+      if (unlikely(ref0.isLeaf())) {
+        if (unlikely(ref1.isLeaf())) {
+          CSTAT(bvh_collide_leaf_pairs++);
+          processLeaf(ref0,ref1);
+          return;
+        } else goto recurse_node1; // ref0 is a leaf, so only ref1 can be opened
+        
+      } else {
+        if (unlikely(ref1.isLeaf())) {
+          goto recurse_node0; // ref1 is a leaf, so only ref0 can be opened
+        } else {
+          if (area(bounds0) > area(bounds1)) { // both are inner nodes: open the one whose bounds have the larger area()
+            goto recurse_node0;
+          }
+          else {
+            goto recurse_node1;
+          }
+        }
+      }
+
+      {
+      recurse_node0: // open ref0: recurse into its children selected by overlap() against bounds1
+        AABBNode* node0 = ref0.getAABBNode();
+        size_t mask = overlap<N>(bounds1,*node0);
+        //for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) {
+        //for (size_t i=0; i<N; i++) {
+#if 0
+        if (depth0 < parallel_depth_threshold) 
+        {
+          parallel_for(size_t(N), [&] ( size_t i ) {
+              if (mask & ( 1 << i)) {
+                BVHN<N>::prefetch(node0->child(i),BVH_FLAG_ALIGNED_NODE);
+                collide_recurse(node0->child(i),node0->bounds(i),ref1,bounds1,depth0+1,depth1);
+              }
+            });
+        } 
+        else
+#endif
+        {
+          for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { // presumably walks the set bits of mask (bsf finds a set bit, btc clears it) -- helpers are defined elsewhere
+            BVHN<N>::prefetch(node0->child(i),BVH_FLAG_ALIGNED_NODE);
+            collide_recurse(node0->child(i),node0->bounds(i),ref1,bounds1,depth0+1,depth1); // ref1 side is unchanged; only depth0 advances
+          }
+        }
+        return;
+      }
+      
+      {
+      recurse_node1: // open ref1: mirror image of the recurse_node0 block
+        AABBNode* node1 = ref1.getAABBNode();
+        size_t mask = overlap<N>(bounds0,*node1);
+        //for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) {
+        //for (size_t i=0; i<N; i++) {
+#if 0
+        if (depth1 < parallel_depth_threshold) 
+        {
+          parallel_for(size_t(N), [&] ( size_t i ) {
+              if (mask & ( 1 << i)) {
+                BVHN<N>::prefetch(node1->child(i),BVH_FLAG_ALIGNED_NODE);
+                collide_recurse(ref0,bounds0,node1->child(i),node1->bounds(i),depth0,depth1+1);
+              }
+            });
+        }
+        else
+#endif
+        {
+          for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) {
+            BVHN<N>::prefetch(node1->child(i),BVH_FLAG_ALIGNED_NODE);
+            collide_recurse(ref0,bounds0,node1->child(i),node1->bounds(i),depth0,depth1+1); // ref0 side is unchanged; only depth1 advances
+          }
+        }
+        return;
+      }
+    }
+
+    template<int N>
+    void BVHNCollider<N>::split(const CollideJob& job, jobvector& jobs) // expands `job` by one level: appends one child job per overlapping child to `jobs` instead of recursing
+    {
+      if (unlikely(job.ref0.isLeaf())) {
+        if (unlikely(job.ref1.isLeaf())) {
+          jobs.push_back(job); // leaf/leaf pair cannot be split further: carried over unchanged
+          return;
+        } else goto recurse_node1; // ref0 is a leaf, so only ref1 can be opened
+      } else {
+        if (unlikely(job.ref1.isLeaf())) {
+          goto recurse_node0; // ref1 is a leaf, so only ref0 can be opened
+        } else {
+          if (area(job.bounds0) > area(job.bounds1)) { // both are inner nodes: open the one whose bounds have the larger area()
+            goto recurse_node0;
+          }
+          else {
+            goto recurse_node1;
+          }
+        }
+      }
+      
+      {
+      recurse_node0: // open ref0: one job per child selected by overlap() against bounds1
+        const AABBNode* node0 = job.ref0.getAABBNode();
+        size_t mask = overlap<N>(job.bounds1,*node0);
+        for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { // an empty mask appends nothing, i.e. the job is dropped
+          jobs.push_back(CollideJob(node0->child(i),node0->bounds(i),job.depth0+1,job.ref1,job.bounds1,job.depth1));
+        }
+        return;
+      }
+      
+      {
+      recurse_node1: // open ref1: mirror image of the recurse_node0 block
+        const AABBNode* node1 = job.ref1.getAABBNode();
+        size_t mask = overlap<N>(job.bounds0,*node1);
+        for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) {
+          jobs.push_back(CollideJob(job.ref0,job.bounds0,job.depth0,node1->child(i),node1->bounds(i),job.depth1+1));
+        }
+        return;
+      }
+    }
+    
+    template<int N>
+    void BVHNCollider<N>::collide_recurse_entry(NodeRef ref0, const BBox3fa& bounds0, NodeRef ref1, const BBox3fa& bounds1) // top-level driver: splits the root pair into a job list, then runs collide_recurse on each job via parallel_for
+    {
+      CSTAT(bvh_collide_traversal_steps = 0);
+      CSTAT(bvh_collide_leaf_pairs = 0);
+      CSTAT(bvh_collide_leaf_iterations = 0);
+      CSTAT(bvh_collide_prim_intersections1 = 0);
+      CSTAT(bvh_collide_prim_intersections2 = 0);
+      CSTAT(bvh_collide_prim_intersections3 = 0);
+      CSTAT(bvh_collide_prim_intersections4 = 0);
+      CSTAT(bvh_collide_prim_intersections5 = 0);
+      CSTAT(bvh_collide_prim_intersections = 0);
+#if 0
+      collide_recurse(ref0,bounds0,ref1,bounds1,0,0);
+#else
+      const int M = 2048; // size budget for each job list
+      jobvector jobs[2]; // two lists used alternately as split source and split target
+      jobs[0].reserve(M);
+      jobs[1].reserve(M);
+      jobs[0].push_back(CollideJob(ref0,bounds0,0,ref1,bounds1,0));
+      int source = 0;
+      int target = 1;
+
+      /* try to split job until job list is full */
+      while (jobs[source].size()+8 <= M) // the +8 headroom presumably covers the most children one split can add (N <= 8) -- confirm
+      {
+        for (size_t i=0; i<jobs[source].size(); i++)
+        {
+          const CollideJob& job = jobs[source][i];
+          size_t remaining = jobs[source].size()-i; // source jobs not yet moved to target, including this one
+          if (jobs[target].size()+remaining+8 > M) { // not enough headroom left to split: carry the job over unchanged
+            jobs[target].push_back(job);
+          } else {
+            split(job,jobs[target]);
+          }
+        }
+
+        /* stop splitting jobs if we reached only leaves and cannot make progress anymore */
+        if (jobs[target].size() == jobs[source].size())
+          break; // `source` still holds the list that is executed below; `target` is not used again
+
+        jobs[source].resize(0);
+        std::swap(source,target);
+      }
+
+      /* parallel processing of all jobs */
+      parallel_for(size_t(jobs[source].size()), [&] ( size_t i ) {
+          CollideJob& j = jobs[source][i];
+          collide_recurse(j.ref0,j.bounds0,j.ref1,j.bounds1,j.depth0,j.depth1); // each job is finished with the sequential recursion
+        });
+      
+      
+#endif
+      CSTAT(PRINT(bvh_collide_traversal_steps));
+      CSTAT(PRINT(bvh_collide_leaf_pairs));
+      CSTAT(PRINT(bvh_collide_leaf_iterations));
+      CSTAT(PRINT(bvh_collide_prim_intersections1));
+      CSTAT(PRINT(bvh_collide_prim_intersections2));
+      CSTAT(PRINT(bvh_collide_prim_intersections3));
+      CSTAT(PRINT(bvh_collide_prim_intersections4));
+      CSTAT(PRINT(bvh_collide_prim_intersections5));
+      CSTAT(PRINT(bvh_collide_prim_intersections));
+    }
+   
+    template<int N>
+    void BVHNColliderUserGeom<N>::collide(BVH* __restrict__ bvh0, BVH* __restrict__ bvh1, RTCCollideFunc callback, void* userPtr) // entry point: collides bvh0 against bvh1 starting from their roots and root bounds
+    { 
+      BVHNColliderUserGeom<N>(bvh0->scene,bvh1->scene,callback,userPtr). // temporary collider bound to both scenes and the user callback
+        collide_recurse_entry(bvh0->root,bvh0->bounds.bounds(),bvh1->root,bvh1->bounds.bounds());
+    }
+
+#if defined (EMBREE_LOWEST_ISA)
+    struct collision_regression_test : public RegressionTest // fixed-input checks of TriangleTriangleIntersector::intersect_triangle_triangle
+    {
+      collision_regression_test(const char* name) : RegressionTest(name) {
+        registerRegressionTest(this); // registers itself on construction
+      }
+    
+      bool run () // returns true only if every case below yields the expected result
+      {
+        bool passed = true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(-0.008815f, 0.041848f, -2.49875e-06f), Vec3fa(-0.008276f, 0.053318f, -2.49875e-06f), Vec3fa(0.003023f, 0.048969f, -2.49875e-06f), // both triangles share z = -2.49875e-06f; expected: no intersection
+                                                                            Vec3fa(0.00245f, 0.037612f, -2.49875e-06f), Vec3fa(0.01434f, 0.042634f, -2.49875e-06f), Vec3fa(0.013499f, 0.031309f, -2.49875e-06f)) == false;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true; // identical triangles
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,1),Vec3fa(1,0,1),Vec3fa(0,1,1)) == false; // same triangle shifted by 1 in z
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,1),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,-0.1f),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true; // repeats the identical-triangle case above
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(0.5f,0,0),Vec3fa(0,0.5f,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(0.5f,0,0),Vec3fa(0,0.5f,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,-0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(-0.1f,0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), 
+                                               Vec3fa(-1,1,0) + Vec3fa(0,0,0),Vec3fa(-1,1,0) + Vec3fa(0.1f,0,0),Vec3fa(-1,1,0) + Vec3fa(0,0.1f,0)) == false; // small z=0 triangle translated by (-1,1,0)
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), 
+                                               Vec3fa( 2,0.5f,0) + Vec3fa(0,0,0),Vec3fa( 2,0.5f,0) + Vec3fa(0.1f,0,0),Vec3fa( 2,0.5f,0) + Vec3fa(0,0.1f,0)) == false; // same small triangle translated by (2,0.5,0)
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), 
+                                               Vec3fa(0.5f,-2.0f,0) + Vec3fa(0,0,0),Vec3fa(0.5f,-2.0f,0) + Vec3fa(0.1f,0,0),Vec3fa(0.5f,-2.0f,0) + Vec3fa(0,0.1f,0)) == false; // same small triangle translated by (0.5,-2,0)
+        return passed;
+      }
+    };
+
+    collision_regression_test collision_regression("collision_regression_test"); // namespace-scope instance, so the constructor (and its registration call) runs during static initialization
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Collider Definitions
+    ////////////////////////////////////////////////////////////////////////////////
+
+    DEFINE_COLLIDER(BVH4ColliderUserGeom,BVHNColliderUserGeom<4>); // 4-wide user-geometry collider, defined unconditionally
+
+#if defined(__AVX__)
+    DEFINE_COLLIDER(BVH8ColliderUserGeom,BVHNColliderUserGeom<8>); // 8-wide variant is only defined when __AVX__ is set
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h
new file mode 100644
index 0000000000..ac4f99c96a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h
@@ -0,0 +1,72 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/object.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Holds two scenes, a collide callback and a user pointer, and declares the
+     *  collide_recurse entry points; processLeaf() is pure virtual (see derived classes). */
+    template<int N>
+      class BVHNCollider
+    {
+      using BVH = BVHN<N>;
+      using NodeRef = typename BVH::NodeRef;
+      using AABBNode = typename BVH::AABBNode;
+
+      struct CollideJob
+      {
+        CollideJob () {}
+        
+        CollideJob (NodeRef r0, const BBox3fa& b0, size_t d0,
+                    NodeRef r1, const BBox3fa& b1, size_t d1)
+        : ref0(r0), bounds0(b0), depth0(d0), ref1(r1), bounds1(b1), depth1(d1) {}
+        
+        NodeRef ref0;
+        BBox3fa bounds0;
+        size_t depth0;
+        NodeRef ref1;
+        BBox3fa bounds1;
+        size_t depth1;
+      };
+
+      using jobvector = vector_t<CollideJob, aligned_allocator<CollideJob,16>>;
+
+      void split(const CollideJob& job, jobvector& jobs);
+      
+    public:
+      __forceinline BVHNCollider (Scene* scene0, Scene* scene1, RTCCollideFunc callback, void* userPtr)
+        : scene0(scene0), scene1(scene1), callback(callback), userPtr(userPtr) {}
+      virtual void processLeaf(NodeRef leaf0, NodeRef leaf1) = 0; // leaf-pair handler, implemented by derived classes
+      void collide_recurse(NodeRef node0, const BBox3fa& bounds0, NodeRef node1, const BBox3fa& bounds1, size_t depth0, size_t depth1);
+      void collide_recurse_entry(NodeRef node0, const BBox3fa& bounds0, NodeRef node1, const BBox3fa& bounds1);
+    
+    protected:
+      Scene* scene0;
+      Scene* scene1;
+      RTCCollideFunc callback;
+      void* userPtr;
+    };
+
+    template<int N>
+      class BVHNColliderUserGeom : public BVHNCollider<N>
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::AABBNode AABBNode;
+      /* constructor and leaf handler are private; the static collide() is the only public member */
+      __forceinline BVHNColliderUserGeom (Scene* scene0, Scene* scene1, RTCCollideFunc callback, void* userPtr)
+        : BVHNCollider<N>(scene0,scene1,callback,userPtr) {}
+
+      virtual void processLeaf(NodeRef leaf0, NodeRef leaf1) override; // implements the pure virtual of BVHNCollider<N>; 'override' makes the compiler verify the signature
+    public:
+      static void collide(BVH* __restrict__ bvh0, BVH* __restrict__ bvh1, RTCCollideFunc callback, void* userPtr);
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h
new file mode 100644
index 0000000000..54021ca6eb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h
@@ -0,0 +1,21 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../bvh/bvh.h"
+#include "../common/isa.h"
+#include "../common/accel.h"
+#include "../common/scene.h"
+#include "../geometry/curve_intersector_virtual.h"
+
+namespace embree
+{
+  /*! BVH instantiations; in this header the class only declares the two variant enums below */
+  class BVHFactory
+  {
+  public:
+    enum class BuildVariant     { STATIC, DYNAMIC, HIGH_QUALITY }; // build variant selector; meaning of each value is defined by the code that consumes it
+    enum class IntersectVariant { FAST, ROBUST };                  // intersect variant selector; presumably maps to the 'robust' template flag of the intersectors — TODO confirm
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp
new file mode 100644
index 0000000000..ea6adc2717
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp
@@ -0,0 +1,330 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_intersector1.h"
+#include "node_intersector1.h"
+#include "bvh_traverser1.h"
+
+#include "../geometry/intersector_iterators.h"
+#include "../geometry/triangle_intersector.h"
+#include "../geometry/trianglev_intersector.h"
+#include "../geometry/trianglev_mb_intersector.h"
+#include "../geometry/trianglei_intersector.h"
+#include "../geometry/quadv_intersector.h"
+#include "../geometry/quadi_intersector.h"
+#include "../geometry/curveNv_intersector.h"
+#include "../geometry/curveNi_intersector.h"
+#include "../geometry/curveNi_mb_intersector.h"
+#include "../geometry/linei_intersector.h"
+#include "../geometry/subdivpatch1_intersector.h"
+#include "../geometry/object_intersector.h"
+#include "../geometry/instance_intersector.h"
+#include "../geometry/subgrid_intersector.h"
+#include "../geometry/subgrid_mb_intersector.h"
+#include "../geometry/curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Single-ray traversal for intersect queries: walks the BVH with an explicit
+     *  stack of (node, distance) items, hands the primitives of each reached leaf to
+     *  PrimitiveIntersector1::intersect and re-reads ray.tfar afterwards for culling. */
+    template<int N, int types, bool robust, typename PrimitiveIntersector1>
+    void BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::intersect(const Accel::Intersectors* __restrict__ This,
+                                                                              RayHit& __restrict__ ray,
+                                                                              IntersectContext* __restrict__ context)
+    {
+      const BVH* __restrict__ bvh = (const BVH*)This->ptr;
+      
+      /* we may traverse an empty BVH in case all geometry was invalid */
+      if (bvh->root == BVH::emptyNode)
+        return;
+      
+      /* perform per ray precalculations required by the primitive intersector */
+      Precalculations pre(ray, bvh);
+
+      /* stack state; the root pushed here is non-empty thanks to the early return above */
+      StackItemT<NodeRef> stack[stackSize];    // stack of nodes
+      StackItemT<NodeRef>* stackPtr = stack+1; // current stack pointer
+      StackItemT<NodeRef>* stackEnd = stack+stackSize;
+      stack[0].ptr  = bvh->root;
+      stack[0].dist = neg_inf;
+      
+      /* filter out invalid rays */
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+      if (!ray.valid()) return;
+#endif
+      /* verify correct input */
+      assert(ray.valid());
+      assert(ray.tnear() >= 0.0f);
+      assert(!(types & BVH_MB) || (ray.time() >= 0.0f && ray.time() <= 1.0f));
+
+      /* load the ray into SIMD registers */
+      TravRay<N,Nx,robust> tray(ray.org, ray.dir, max(ray.tnear(), 0.0f), max(ray.tfar, 0.0f));
+
+      /* initialize the node traverser */
+      BVHNNodeTraverser1Hit<N, Nx, types> nodeTraverser;
+
+      /* pop loop; 'pop' labels the loop body so the inner loop can restart it via 'goto pop' */
+      while (true) pop:
+      {
+        /* pop next node; an empty stack ends the traversal */
+        if (unlikely(stackPtr == stack)) break;
+        stackPtr--;
+        NodeRef cur = NodeRef(stackPtr->ptr);
+
+        /* if popped node is too far, pop next one */
+#if defined(__AVX512ER__)
+        /* much faster on KNL */
+        if (unlikely(any(vfloat<Nx>(*(float*)&stackPtr->dist) > tray.tfar)))
+          continue;
+#else
+        if (unlikely(*(float*)&stackPtr->dist > ray.tfar))
+          continue;
+#endif
+
+        /* downtraversal loop */
+        while (true)
+        {
+          /* intersect node; on a false result the statistic is rolled back and cur is handled as a leaf below */
+          size_t mask; vfloat<Nx> tNear;
+          STAT3(normal.trav_nodes,1,1,1);
+          bool nodeIntersected = BVHNNodeIntersector1<N, Nx, types, robust>::intersect(cur, tray, ray.time(), tNear, mask);
+          if (unlikely(!nodeIntersected)) { STAT3(normal.trav_nodes,-1,-1,-1); break; }
+
+          /* if no child is hit, pop next node */
+          if (unlikely(mask == 0))
+            goto pop;
+
+          /* select next child and push other children */
+          nodeTraverser.traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd);
+        }
+
+        /* this is a leaf node */
+        assert(cur != BVH::emptyNode);
+        STAT3(normal.trav_leaves,1,1,1);
+        size_t num; Primitive* prim = (Primitive*)cur.leaf(num);
+        size_t lazy_node = 0;
+        PrimitiveIntersector1::intersect(This, pre, ray, context, prim, num, tray, lazy_node);
+        tray.tfar = ray.tfar; // keep the traversal ray's far distance in sync with the ray after the primitive test
+
+        /* push lazy node onto stack */
+        if (unlikely(lazy_node)) {
+          stackPtr->ptr = lazy_node;
+          stackPtr->dist = neg_inf;
+          stackPtr++;
+        }
+      }
+    }
+    /*! Single-ray traversal for occlusion queries: stops at the first leaf whose primitives report occlusion and then sets ray.tfar to neg_inf. */
+    template<int N, int types, bool robust, typename PrimitiveIntersector1>
+    void BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::occluded(const Accel::Intersectors* __restrict__ This,
+                                                                             Ray& __restrict__ ray,
+                                                                             IntersectContext* __restrict__ context)
+    {
+      const BVH* __restrict__ bvh = (const BVH*)This->ptr;
+      
+      /* we may traverse an empty BVH in case all geometry was invalid */
+      if (bvh->root == BVH::emptyNode)
+        return;
+       
+      /* early out for already occluded rays (a negative tfar is also how the leaf handling below marks occlusion) */
+      if (unlikely(ray.tfar < 0.0f))
+        return;
+
+      /* perform per ray precalculations required by the primitive intersector */
+      Precalculations pre(ray, bvh);
+
+      /* stack state; unlike intersect() the entries are plain node references without a distance */
+      NodeRef stack[stackSize];    // stack of nodes that still need to get traversed
+      NodeRef* stackPtr = stack+1; // current stack pointer
+      NodeRef* stackEnd = stack+stackSize;
+      stack[0] = bvh->root;
+
+      /* filter out invalid rays */
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+      if (!ray.valid()) return;
+#endif
+
+      /* verify correct input */
+      assert(ray.valid());
+      assert(ray.tnear() >= 0.0f);
+      assert(!(types & BVH_MB) || (ray.time() >= 0.0f && ray.time() <= 1.0f));
+
+      /* load the ray into SIMD registers */
+      TravRay<N,Nx,robust> tray(ray.org, ray.dir, max(ray.tnear(), 0.0f), max(ray.tfar, 0.0f));
+
+      /* initialize the node traverser */
+      BVHNNodeTraverser1Hit<N, Nx, types> nodeTraverser;
+
+      /* pop loop; 'pop' labels the loop body so the inner loop can restart it via 'goto pop' */
+      while (true) pop:
+      {
+        /* pop next node; an empty stack ends the traversal */
+        if (unlikely(stackPtr == stack)) break;
+        stackPtr--;
+        NodeRef cur = (NodeRef)*stackPtr;
+
+        /* downtraversal loop */
+        while (true)
+        {
+          /* intersect node; on a false result the statistic is rolled back and cur is handled as a leaf below */
+          size_t mask; vfloat<Nx> tNear;
+          STAT3(shadow.trav_nodes,1,1,1);
+          bool nodeIntersected = BVHNNodeIntersector1<N, Nx, types, robust>::intersect(cur, tray, ray.time(), tNear, mask);
+          if (unlikely(!nodeIntersected)) { STAT3(shadow.trav_nodes,-1,-1,-1); break; }
+
+          /* if no child is hit, pop next node */
+          if (unlikely(mask == 0))
+            goto pop;
+
+          /* select next child and push other children */
+          nodeTraverser.traverseAnyHit(cur, mask, tNear, stackPtr, stackEnd);
+        }
+
+        /* this is a leaf node */
+        assert(cur != BVH::emptyNode);
+        STAT3(shadow.trav_leaves,1,1,1);
+        size_t num; Primitive* prim = (Primitive*)cur.leaf(num);
+        size_t lazy_node = 0;
+        if (PrimitiveIntersector1::occluded(This, pre, ray, context, prim, num, tray, lazy_node)) {
+          ray.tfar = neg_inf; // mark the ray as occluded
+          break;              // leaves the pop loop: no further nodes are visited
+        }
+
+        /* push lazy node onto stack */
+        if (unlikely(lazy_node)) {
+          *stackPtr = (NodeRef)lazy_node;
+          stackPtr++;
+        }
+      }
+    }
+    /*! Point-query traversal, wrapped in a struct so that it can be partially specialized on the primitive intersector type. */
+    template<int N, int types, bool robust, typename PrimitiveIntersector1>
+    struct PointQueryDispatch
+    {
+      typedef typename PrimitiveIntersector1::Precalculations Precalculations;
+      typedef typename PrimitiveIntersector1::Primitive Primitive;
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::AABBNode AABBNode;
+      typedef typename BVH::AABBNodeMB4D AABBNodeMB4D;
+
+      static const size_t stackSize = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store
+
+      /* right now AVX512KNL SIMD extension only for standard node types */
+      static const size_t Nx = (types == BVH_AN1 || types == BVH_QN1) ? vextend<N>::size : N; // NOTE(review): not referenced by pointQuery below, which uses N throughout
+
+      /*! Runs the query against the BVH in This->ptr; returns true if any leaf's PrimitiveIntersector1::pointQuery call returned true. */
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context)
+      {
+        const BVH* __restrict__ bvh = (const BVH*)This->ptr;
+        
+        /* we may traverse an empty BVH in case all geometry was invalid */
+        if (bvh->root == BVH::emptyNode)
+          return false;
+        
+        /* stack state */
+        StackItemT<NodeRef> stack[stackSize];    // stack of nodes
+        StackItemT<NodeRef>* stackPtr = stack+1; // current stack pointer
+        StackItemT<NodeRef>* stackEnd = stack+stackSize;
+        stack[0].ptr  = bvh->root;
+        stack[0].dist = neg_inf;
+        
+        /* verify correct input */
+        assert(!(types & BVH_MB) || (query->time >= 0.0f && query->time <= 1.0f));
+
+        /* load the point query into SIMD registers */
+        TravPointQuery<N> tquery(query->p, context->query_radius);
+
+        /* initialize the node traverser */
+        BVHNNodeTraverser1Hit<N, N, types> nodeTraverser;
+
+        bool changed = false; // becomes true once a leaf's pointQuery call returns true
+        float cull_radius = context->query_type == POINT_QUERY_TYPE_SPHERE // squared radius for sphere queries,
+                          ? query->radius * query->radius                  // otherwise the dot product of
+                          : dot(context->query_radius, context->query_radius); // context->query_radius with itself
+
+        /* pop loop; 'pop' labels the loop body so the inner loop can restart it via 'goto pop' */
+        while (true) pop:
+        {
+          /* pop next node; an empty stack ends the traversal */
+          if (unlikely(stackPtr == stack)) break;
+          stackPtr--;
+          NodeRef cur = NodeRef(stackPtr->ptr);
+
+          /* if popped node is too far, pop next one */
+          if (unlikely(*(float*)&stackPtr->dist > cull_radius))
+            continue;
+
+          /* downtraversal loop */
+          while (true)
+          {
+            /* intersect node with the sphere or AABB test, chosen by context->query_type */
+            size_t mask; vfloat<N> tNear;
+            STAT3(point_query.trav_nodes,1,1,1);
+            bool nodeIntersected;
+            if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) {
+              nodeIntersected = BVHNNodePointQuerySphere1<N, types>::pointQuery(cur, tquery, query->time, tNear, mask);
+            } else {
+              nodeIntersected = BVHNNodePointQueryAABB1  <N, types>::pointQuery(cur, tquery, query->time, tNear, mask);
+            }
+            if (unlikely(!nodeIntersected)) { STAT3(point_query.trav_nodes,-1,-1,-1); break; }
+
+            /* if no child is hit, pop next node */
+            if (unlikely(mask == 0))
+              goto pop;
+
+            /* select next child and push other children */
+            nodeTraverser.traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd);
+          }
+
+          /* this is a leaf node */
+          assert(cur != BVH::emptyNode);
+          STAT3(point_query.trav_leaves,1,1,1);
+          size_t num; Primitive* prim = (Primitive*)cur.leaf(num);
+          size_t lazy_node = 0;
+          if (PrimitiveIntersector1::pointQuery(This, query, context, prim, num, tquery, lazy_node))
+          { // reload the cached radii from query/context — presumably the primitive call may have changed them; TODO confirm
+            changed = true;
+            tquery.rad = context->query_radius;
+            cull_radius = context->query_type == POINT_QUERY_TYPE_SPHERE
+                        ? query->radius * query->radius
+                        : dot(context->query_radius, context->query_radius);
+          }
+
+          /* push lazy node onto stack */
+          if (unlikely(lazy_node)) {
+            stackPtr->ptr = lazy_node;
+            stackPtr->dist = neg_inf;
+            stackPtr++;
+          }
+        }
+        return changed;
+      }
+    };
+
+    /* disable point queries for not yet supported geometry types: each partial specialization below replaces the traversal with a stub that returns false */
+    template<int N, int types, bool robust>
+    struct PointQueryDispatch<N, types, robust, VirtualCurveIntersector1> {
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; }
+    };
+    
+    template<int N, int types, bool robust>
+    struct PointQueryDispatch<N, types, robust, SubdivPatch1Intersector1> {
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; }
+    };
+    
+    template<int N, int types, bool robust>
+    struct PointQueryDispatch<N, types, robust, SubdivPatch1MBIntersector1> {
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; }
+    };
+
+    template<int N, int types, bool robust, typename PrimitiveIntersector1>
+    bool BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context)
+    {
+      using Dispatch = PointQueryDispatch<N, types, robust, PrimitiveIntersector1>; // dispatch struct chosen by the primitive intersector type
+      return Dispatch::pointQuery(This, query, context);
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h
new file mode 100644
index 0000000000..1a269c319a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h
@@ -0,0 +1,37 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../common/ray.h"
+#include "../common/point_query.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! BVH single ray intersector: static intersect / occluded / pointQuery entry points. */
+    template<int N, int types, bool robust, typename PrimitiveIntersector1>
+    class BVHNIntersector1
+    {
+      /* type aliases shared by the member definitions */
+      using Precalculations = typename PrimitiveIntersector1::Precalculations;
+      using Primitive = typename PrimitiveIntersector1::Primitive;
+      using BVH = BVHN<N>;
+      using NodeRef = typename BVH::NodeRef;
+      using AABBNode = typename BVH::AABBNode;
+      using AABBNodeMB4D = typename BVH::AABBNodeMB4D;
+
+      /* capacity of the traversal stack; the trailing +3 is due to the 16-wide store */
+      static const size_t stackSize = 1+(N-1)*BVH::maxDepth+3;
+      /* right now AVX512KNL SIMD extension only for standard node types */
+      static const size_t Nx = (types == BVH_AN1 || types == BVH_QN1) ? vextend<N>::size : N;
+
+    public:
+      static void intersect (const Accel::Intersectors* This, RayHit& ray, IntersectContext* context);
+      static void occluded  (const Accel::Intersectors* This, Ray& ray, IntersectContext* context);
+      static bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context);
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp
new file mode 100644
index 0000000000..989f7354fd
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp
@@ -0,0 +1,61 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_intersector1.cpp"
+
+namespace embree
+{
+  namespace isa
+  {
+    /* forwards to VerifyMultiTargetLinking::getISA() */
+    int getISA()
+    { return VerifyMultiTargetLinking::getISA(); }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// BVH4Intersector1 Definitions (BVHNIntersector1 arguments: N, node types, robust flag, primitive intersector)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersector1,BVHNIntersector1<4 COMMA BVH_AN1_UN1 COMMA false COMMA VirtualCurveIntersector1 >));
+    IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersector1MB,BVHNIntersector1<4 COMMA BVH_AN2_AN4D_UN2 COMMA false COMMA VirtualCurveIntersector1 >));
+
+    IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersectorRobust1,BVHNIntersector1<4 COMMA BVH_AN1_UN1 COMMA true COMMA VirtualCurveIntersector1 >));
+    IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersectorRobust1MB,BVHNIntersector1<4 COMMA BVH_AN2_AN4D_UN2 COMMA true COMMA VirtualCurveIntersector1 >));
+
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4Intersector1Moeller,  BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<TriangleMIntersector1Moeller  <SIMD_MODE(4) COMMA true> > >));
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<TriangleMiIntersector1Moeller <SIMD_MODE(4) COMMA true> > >));
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true  COMMA ArrayIntersector1<TriangleMvIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >));
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true  COMMA ArrayIntersector1<TriangleMiIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >));
+
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<TriangleMvMBIntersector1Moeller <SIMD_MODE(4) COMMA true> > >));
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<TriangleMiMBIntersector1Moeller <SIMD_MODE(4) COMMA true> > >));
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true  COMMA ArrayIntersector1<TriangleMvMBIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >));
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true  COMMA ArrayIntersector1<TriangleMiMBIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >));
+
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4vIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<QuadMvIntersector1Moeller <4 COMMA true> > >));
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<QuadMiIntersector1Moeller <4 COMMA true> > >));
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4vIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true  COMMA ArrayIntersector1<QuadMvIntersector1Pluecker<4 COMMA true> > >));
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true  COMMA ArrayIntersector1<QuadMiIntersector1Pluecker<4 COMMA true> > >));
+
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<QuadMiMBIntersector1Moeller <4 COMMA true> > >));
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true  COMMA ArrayIntersector1<QuadMiMBIntersector1Pluecker<4 COMMA true> > >));
+
+    IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR1(BVH4SubdivPatch1Intersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA SubdivPatch1Intersector1>));
+    IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR1(BVH4SubdivPatch1MBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA SubdivPatch1MBIntersector1>));
+    
+    IF_ENABLED_USER(DEFINE_INTERSECTOR1(BVH4VirtualIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<ObjectIntersector1<false>> >));
+    IF_ENABLED_USER(DEFINE_INTERSECTOR1(BVH4VirtualMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<ObjectIntersector1<true>> >));
+
+    IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<InstanceIntersector1> >));
+    IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<InstanceIntersector1MB> >));
+
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(QBVH4Triangle4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1<TriangleMiIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >));
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(QBVH4Quad4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1<QuadMiIntersector1Pluecker<4 COMMA true> > >));
+
+    IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridIntersector1Moeller,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersector1Moeller<4 COMMA true> >));
+    IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridMBIntersector1Moeller,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA SubGridMBIntersector1Pluecker<4 COMMA true> >)); // NOTE(review): registered under a Moeller name but uses robust=true with the Pluecker MB intersector — confirm intended
+
+    IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA SubGridIntersector1Pluecker<4 COMMA true> >));
+    //IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA SubGridMBIntersector1Pluecker<4 COMMA true> >));
+
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h
new file mode 100644
index 0000000000..d764cc928d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h
@@ -0,0 +1,61 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../common/ray.h"
+#include "../common/stack_item.h"
+#include "node_intersector_frustum.h"
+
+namespace embree
+{
+  namespace isa 
+  {
+    template<int K, bool robust>
+    struct TravRayK;
+
+    /*! Hybrid BVH packet intersector; the optional 'single' flag allows switching from packet to single ray traversal. */
+    template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single = true>
+    class BVHNIntersectorKHybrid
+    {
+      /* right now AVX512KNL SIMD extension only for standard node types */
+      static const size_t Nx = (types == BVH_AN1) ? vextend<N>::size : N;
+
+      /* type aliases shared by the member definitions */
+      using Precalculations = typename PrimitiveIntersectorK::Precalculations;
+      using Primitive = typename PrimitiveIntersectorK::Primitive;
+      using BVH = BVHN<N>;
+      using NodeRef = typename BVH::NodeRef;
+      using BaseNode = typename BVH::BaseNode;
+      using AABBNode = typename BVH::AABBNode;
+      
+      static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store
+      static const size_t stackSizeChunk = 1+(N-1)*BVH::maxDepth;
+
+      static const size_t switchThresholdIncoherent =
+        (K==4)  ? 3 :
+        (K==8)  ? ((N==4) ? 5 : 7) :
+        (K==16) ? 14 : // 14 seems to work best for KNL due to better ordered chunk traversal
+        0;
+
+    private:
+      static void intersect1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre,
+                             RayHitK<K>& ray, const TravRayK<K, robust>& tray, IntersectContext* context);
+      static bool occluded1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre,
+                            RayK<K>& ray, const TravRayK<K, robust>& tray, IntersectContext* context);
+
+    public:
+      static void intersect(vint<K>* valid, Accel::Intersectors* This, RayHitK<K>& ray, IntersectContext* context);
+      static void occluded (vint<K>* valid, Accel::Intersectors* This, RayK<K>& ray, IntersectContext* context);
+
+      static void intersectCoherent(vint<K>* valid, Accel::Intersectors* This, RayHitK<K>& ray, IntersectContext* context);
+      static void occludedCoherent (vint<K>* valid, Accel::Intersectors* This, RayK<K>& ray, IntersectContext* context);
+
+    };
+
+    /*! BVH packet intersector: BVHNIntersectorKHybrid with its 'single' template argument fixed to false. */
+    template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK>
+    class BVHNIntersectorKChunk : public BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, false> {};
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h
new file mode 100644
index 0000000000..83d1fb4d3d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h
@@ -0,0 +1,295 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector_packet_stream.h"
+#include "node_intersector_frustum.h"
+#include "bvh_traverser_stream.h"
+
+namespace embree
+{
+  namespace isa 
+  {
+    /*! BVH ray stream intersector. */
+    template<int N, int Nx, int types, bool robust, typename PrimitiveIntersector>
+    class BVHNIntersectorStream
+    {
+      static const int Nxd = (Nx == N) ? N : Nx/2;
+
+      /* shortcuts for frequently used types */
+      template<int K> using PrimitiveIntersectorK = typename PrimitiveIntersector::template Type<K>;
+      template<int K> using PrimitiveK = typename PrimitiveIntersectorK<K>::PrimitiveK;
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::BaseNode BaseNode;
+      typedef typename BVH::AABBNode AABBNode;
+      typedef typename BVH::AABBNodeMB AABBNodeMB;
+
+      template<int K>
+      __forceinline static size_t initPacketsAndFrustum(RayK<K>** inputPackets, size_t numOctantRays,
+                                                        TravRayKStream<K, robust>* packets, Frustum<robust>& frustum, bool& commonOctant)
+      {
+        const size_t numPackets = (numOctantRays+K-1)/K;
+
+        Vec3vf<K> tmp_min_rdir(pos_inf);
+        Vec3vf<K> tmp_max_rdir(neg_inf);
+        Vec3vf<K> tmp_min_org(pos_inf);
+        Vec3vf<K> tmp_max_org(neg_inf);
+        vfloat<K> tmp_min_dist(pos_inf);
+        vfloat<K> tmp_max_dist(neg_inf);
+
+        size_t m_active = 0;
+        for (size_t i = 0; i < numPackets; i++)
+        {
+          const vfloat<K> tnear = inputPackets[i]->tnear();
+          const vfloat<K> tfar  = inputPackets[i]->tfar;
+          vbool<K> m_valid = (tnear <= tfar) & (tnear >= 0.0f);
+
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+          m_valid &= inputPackets[i]->valid();
+#endif
+
+          m_active |= (size_t)movemask(m_valid) << (i*K);
+
+          vfloat<K> packet_min_dist = max(tnear, 0.0f);
+          vfloat<K> packet_max_dist = select(m_valid, tfar, neg_inf);
+          tmp_min_dist = min(tmp_min_dist, packet_min_dist);
+          tmp_max_dist = max(tmp_max_dist, packet_max_dist);
+
+          const Vec3vf<K>& org = inputPackets[i]->org;
+          const Vec3vf<K>& dir = inputPackets[i]->dir;
+
+          new (&packets[i]) TravRayKStream<K, robust>(org, dir, packet_min_dist, packet_max_dist);
+
+          tmp_min_rdir = min(tmp_min_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(pos_inf)));
+          tmp_max_rdir = max(tmp_max_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(neg_inf)));
+          tmp_min_org  = min(tmp_min_org , select(m_valid,org , Vec3vf<K>(pos_inf)));
+          tmp_max_org  = max(tmp_max_org , select(m_valid,org , Vec3vf<K>(neg_inf)));
+        }
+
+        m_active &= (numOctantRays == (8 * sizeof(size_t))) ? (size_t)-1 : (((size_t)1 << numOctantRays)-1);
+
+        
+        const Vec3fa reduced_min_rdir(reduce_min(tmp_min_rdir.x),
+                                      reduce_min(tmp_min_rdir.y),
+                                      reduce_min(tmp_min_rdir.z));
+
+        const Vec3fa reduced_max_rdir(reduce_max(tmp_max_rdir.x),
+                                      reduce_max(tmp_max_rdir.y),
+                                      reduce_max(tmp_max_rdir.z));
+
+        const Vec3fa reduced_min_origin(reduce_min(tmp_min_org.x),
+                                        reduce_min(tmp_min_org.y),
+                                        reduce_min(tmp_min_org.z));
+
+        const Vec3fa reduced_max_origin(reduce_max(tmp_max_org.x),
+                                        reduce_max(tmp_max_org.y),
+                                        reduce_max(tmp_max_org.z));
+
+        commonOctant =
+          (reduced_max_rdir.x < 0.0f || reduced_min_rdir.x >= 0.0f) &&
+          (reduced_max_rdir.y < 0.0f || reduced_min_rdir.y >= 0.0f) &&
+          (reduced_max_rdir.z < 0.0f || reduced_min_rdir.z >= 0.0f);
+        
+        const float frustum_min_dist = reduce_min(tmp_min_dist);
+        const float frustum_max_dist = reduce_max(tmp_max_dist);
+
+        frustum.init(reduced_min_origin, reduced_max_origin,
+                     reduced_min_rdir, reduced_max_rdir,
+                     frustum_min_dist, frustum_max_dist,
+                     N);
+        
+        return m_active;
+      }
+
+      template<int K>
+      __forceinline static size_t intersectAABBNodePacket(size_t m_active,
+                                                             const TravRayKStream<K,robust>* packets,
+                                                             const AABBNode* __restrict__ node,
+                                                             size_t boxID,
+                                                             const NearFarPrecalculations& nf)
+      {
+        assert(m_active);
+        const size_t startPacketID = bsf(m_active) / K;
+        const size_t endPacketID   = bsr(m_active) / K;
+        size_t m_trav_active = 0;
+        for (size_t i = startPacketID; i <= endPacketID; i++)
+        {
+          const size_t m_hit = intersectNodeK<N>(node, boxID, packets[i], nf);
+          m_trav_active |= m_hit << (i*K);
+        } 
+        return m_trav_active;
+      }
+      
+      template<int K>
+      __forceinline static size_t traverseCoherentStream(size_t m_active,
+                                                         TravRayKStream<K, robust>* packets,
+                                                         const AABBNode* __restrict__ node,
+                                                         const Frustum<robust>& frustum,
+                                                         size_t* maskK,
+                                                         vfloat<Nx>& dist)
+      {
+        size_t m_node_hit = intersectNodeFrustum<N,Nx>(node, frustum, dist);
+        const size_t first_index    = bsf(m_active);
+        const size_t first_packetID = first_index / K;
+        const size_t first_rayID    = first_index % K;
+        size_t m_first_hit = intersectNode1<N,Nx>(node, packets[first_packetID], first_rayID, frustum.nf);
+
+        /* this makes traversal independent of the ordering of rays */
+        size_t m_node = m_node_hit ^ m_first_hit;
+        while (unlikely(m_node))
+        {
+          const size_t boxID = bscf(m_node);
+          const size_t m_current = m_active & intersectAABBNodePacket(m_active, packets, node, boxID, frustum.nf);
+          m_node_hit ^= m_current ? (size_t)0 : ((size_t)1 << boxID);
+          maskK[boxID] = m_current;
+        }
+        return m_node_hit;
+      }
+      
+      // TODO: explicit 16-wide path for KNL
+      template<int K>
+      __forceinline static vint<Nx> traverseIncoherentStream(size_t m_active,
+                                                             TravRayKStreamFast<K>* __restrict__ packets,
+                                                             const AABBNode* __restrict__ node,
+                                                             const NearFarPrecalculations& nf,
+                                                             const int shiftTable[32])
+      {
+        const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
+        const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
+        const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
+        const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
+        const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
+        const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
+        assert(m_active);
+        vint<Nx> vmask(zero);
+        do
+        {   
+          STAT3(shadow.trav_nodes,1,1,1);
+          const size_t rayID = bscf(m_active);
+          assert(rayID < MAX_INTERNAL_STREAM_SIZE);
+          TravRayKStream<K,robust> &p = packets[rayID / K];
+          const size_t i = rayID % K;
+          const vint<Nx> bitmask(shiftTable[rayID]);
+
+#if defined (__aarch64__)
+          const vfloat<Nx> tNearX = madd(bminX, p.rdir.x[i], p.neg_org_rdir.x[i]);
+          const vfloat<Nx> tNearY = madd(bminY, p.rdir.y[i], p.neg_org_rdir.y[i]);
+          const vfloat<Nx> tNearZ = madd(bminZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
+          const vfloat<Nx> tFarX  = madd(bmaxX, p.rdir.x[i], p.neg_org_rdir.x[i]);
+          const vfloat<Nx> tFarY  = madd(bmaxY, p.rdir.y[i], p.neg_org_rdir.y[i]);
+          const vfloat<Nx> tFarZ  = madd(bmaxZ, p.rdir.z[i], p.neg_org_rdir.z[i]); 
+#else
+          const vfloat<Nx> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]);
+          const vfloat<Nx> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]);
+          const vfloat<Nx> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]);
+          const vfloat<Nx> tFarX  = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]);
+          const vfloat<Nx> tFarY  = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]);
+          const vfloat<Nx> tFarZ  = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]); 
+#endif
+
+          const vfloat<Nx> tNear  = maxi(tNearX, tNearY, tNearZ, vfloat<Nx>(p.tnear[i]));
+          const vfloat<Nx> tFar   = mini(tFarX , tFarY , tFarZ,  vfloat<Nx>(p.tfar[i]));      
+
+#if defined(__AVX512ER__)
+          const vboolx m_node((1 << N)-1);
+          const vbool<Nx> hit_mask = le(m_node, tNear, tFar);
+          vmask = mask_or(hit_mask, vmask, vmask, bitmask);
+#else
+          const vbool<Nx> hit_mask = tNear <= tFar;
+#if defined(__AVX2__)
+          vmask = vmask | (bitmask & vint<Nx>(hit_mask));
+#else
+          vmask = select(hit_mask, vmask | bitmask, vmask);
+#endif
+#endif
+        } while(m_active);
+        return vmask;        
+      }
+
+      template<int K>
+      __forceinline static vint<Nx> traverseIncoherentStream(size_t m_active,
+                                                             TravRayKStreamRobust<K>* __restrict__ packets,
+                                                             const AABBNode* __restrict__ node,
+                                                             const NearFarPrecalculations& nf,
+                                                             const int shiftTable[32])
+      {
+        const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
+        const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
+        const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
+        const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
+        const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
+        const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
+        assert(m_active);
+        vint<Nx> vmask(zero);
+        do
+        {   
+          STAT3(shadow.trav_nodes,1,1,1);
+          const size_t rayID = bscf(m_active);
+          assert(rayID < MAX_INTERNAL_STREAM_SIZE);
+          TravRayKStream<K,robust> &p = packets[rayID / K];
+          const size_t i = rayID % K;
+          const vint<Nx> bitmask(shiftTable[rayID]);
+          const vfloat<Nx> tNearX = (bminX - p.org.x[i]) * p.rdir.x[i];
+          const vfloat<Nx> tNearY = (bminY - p.org.y[i]) * p.rdir.y[i];
+          const vfloat<Nx> tNearZ = (bminZ - p.org.z[i]) * p.rdir.z[i];
+          const vfloat<Nx> tFarX  = (bmaxX - p.org.x[i]) * p.rdir.x[i];
+          const vfloat<Nx> tFarY  = (bmaxY - p.org.y[i]) * p.rdir.y[i];
+          const vfloat<Nx> tFarZ  = (bmaxZ - p.org.z[i]) * p.rdir.z[i];
+          const vfloat<Nx> tNear  = maxi(tNearX, tNearY, tNearZ, vfloat<Nx>(p.tnear[i]));
+          const vfloat<Nx> tFar   = mini(tFarX , tFarY , tFarZ,  vfloat<Nx>(p.tfar[i]));
+          const float round_down  = 1.0f-2.0f*float(ulp);
+          const float round_up    = 1.0f+2.0f*float(ulp);
+#if defined(__AVX512ER__)
+          const vboolx m_node((1 << N)-1);
+          const vbool<Nx> hit_mask = le(m_node, round_down*tNear, round_up*tFar);
+          vmask = mask_or(hit_mask, vmask, vmask, bitmask);
+#else
+          const vbool<Nx> hit_mask = round_down*tNear <= round_up*tFar;
+#if defined(__AVX2__)
+          vmask = vmask | (bitmask & vint<Nx>(hit_mask));
+#else
+          vmask = select(hit_mask, vmask | bitmask, vmask);
+#endif
+#endif
+        } while(m_active);
+        return vmask;
+      }
+                                                         
+
+      static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth;
+
+    public:
+      static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context);
+      static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context);
+
+    private:
+      template<int K>
+      static void intersectCoherent(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context);
+
+      template<int K>
+      static void occludedCoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
+
+      template<int K>
+      static void occludedIncoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
+    };
+
+
+    /*! BVH ray stream intersector with direct fallback to packets. */
+    template<int N, int Nx>
+    class BVHNIntersectorStreamPacketFallback
+    {
+    public:
+      static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context);
+      static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context);
+
+    private:
+      template<int K>
+      static void intersectK(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context);
+
+      template<int K>
+      static void occludedK(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h
new file mode 100644
index 0000000000..cdeb923637
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h
@@ -0,0 +1,41 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/ray.h"
+#include "../common/scene.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    class RayStreamFilter
+    {
+    public:
+      static void intersectAOS(Scene* scene, RTCRayHit* rays, size_t N, size_t stride, IntersectContext* context);
+      static void intersectAOP(Scene* scene, RTCRayHit** rays, size_t N, IntersectContext* context);
+      static void intersectSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context);
+      static void intersectSOP(Scene* scene, const RTCRayHitNp* rays, size_t N, IntersectContext* context);
+
+      static void occludedAOS(Scene* scene, RTCRay* rays, size_t N, size_t stride, IntersectContext* context);
+      static void occludedAOP(Scene* scene, RTCRay** rays, size_t N, IntersectContext* context);
+      static void occludedSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context);
+      static void occludedSOP(Scene* scene, const RTCRayNp* rays, size_t N, IntersectContext* context);
+
+    private:
+      template<int K, bool intersect>
+      static void filterAOS(Scene* scene, void* rays, size_t N, size_t stride, IntersectContext* context);
+
+      template<int K, bool intersect>
+      static void filterAOP(Scene* scene, void** rays, size_t N, IntersectContext* context);
+
+      template<int K, bool intersect>
+      static void filterSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context);
+
+      template<int K, bool intersect>
+      static void filterSOP(Scene* scene, const void* rays, size_t N, IntersectContext* context);
+    };
+  }
+};
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h
new file mode 100644
index 0000000000..baa4a8d805
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h
@@ -0,0 +1,213 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+  /*! BVHN AABBNode */
+  template<typename NodeRef, int N>
+    struct AABBNode_t : public BaseNode_t<NodeRef, N>
+  {
+    using BaseNode_t<NodeRef,N>::children;
+    
+    struct Create
+    {
+      __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc, size_t numChildren = 0) const
+      {
+        AABBNode_t* node = (AABBNode_t*) alloc.malloc0(sizeof(AABBNode_t),NodeRef::byteNodeAlignment); node->clear();
+        return NodeRef::encodeNode(node);
+      }
+    };
+    
+    struct Set
+    {
+      __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const BBox3fa& bounds) const {
+        node.getAABBNode()->setRef(i,child);
+        node.getAABBNode()->setBounds(i,bounds);
+      }
+    };
+    
+    struct Create2
+    {
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (BuildRecord* children, const size_t num, const FastAllocator::CachedAllocator& alloc) const
+      {
+        AABBNode_t* node = (AABBNode_t*) alloc.malloc0(sizeof(AABBNode_t), NodeRef::byteNodeAlignment); node->clear();
+        for (size_t i=0; i<num; i++) node->setBounds(i,children[i].bounds());
+        return NodeRef::encodeNode(node);
+      }
+    };
+    
+    struct Set2
+    {
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
+      {
+        AABBNode_t* node = ref.getAABBNode();
+        for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
+        return ref;
+      }
+    };
+    
+    struct Set3
+    {
+      Set3 (FastAllocator* allocator, PrimRef* prims)
+      : allocator(allocator), prims(prims) {}
+      
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
+      {
+        AABBNode_t* node = ref.getAABBNode();
+        for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
+        
+        if (unlikely(precord.alloc_barrier))
+        {
+          PrimRef* begin = &prims[precord.prims.begin()];
+          PrimRef* end   = &prims[precord.prims.end()]; // FIXME: extended end for spatial split builder!!!!!
+          size_t bytes = (size_t)end - (size_t)begin;
+          allocator->addBlock(begin,bytes);
+        }
+        
+        return ref;
+      }
+      
+      FastAllocator* const allocator;
+      PrimRef* const prims;
+    };
+    
+    /*! Clears the node. */
+    __forceinline void clear() {
+      lower_x = lower_y = lower_z = pos_inf;
+      upper_x = upper_y = upper_z = neg_inf;
+      BaseNode_t<NodeRef,N>::clear();
+    }
+    
+    /*! Sets ID of child. */
+    __forceinline void setRef(size_t i, const NodeRef& ref) {
+      assert(i < N);
+      children[i] = ref;
+    }
+    
+    /*! Sets bounding box of child. */
+    __forceinline void setBounds(size_t i, const BBox3fa& bounds)
+    {
+      assert(i < N);
+      lower_x[i] = bounds.lower.x; lower_y[i] = bounds.lower.y; lower_z[i] = bounds.lower.z;
+      upper_x[i] = bounds.upper.x; upper_y[i] = bounds.upper.y; upper_z[i] = bounds.upper.z;
+    }
+    
+    /*! Sets bounding box and ID of child. */
+    __forceinline void set(size_t i, const NodeRef& ref, const BBox3fa& bounds) {
+      setBounds(i,bounds);
+      children[i] = ref;
+    }
+    
+    /*! Returns bounds of node. */
+    __forceinline BBox3fa bounds() const {
+      const Vec3fa lower(reduce_min(lower_x),reduce_min(lower_y),reduce_min(lower_z));
+      const Vec3fa upper(reduce_max(upper_x),reduce_max(upper_y),reduce_max(upper_z));
+      return BBox3fa(lower,upper);
+    }
+    
+    /*! Returns bounds of specified child. */
+    __forceinline BBox3fa bounds(size_t i) const
+    {
+      assert(i < N);
+      const Vec3fa lower(lower_x[i],lower_y[i],lower_z[i]);
+      const Vec3fa upper(upper_x[i],upper_y[i],upper_z[i]);
+      return BBox3fa(lower,upper);
+    }
+    
+    /*! Returns extent of bounds of specified child. */
+    __forceinline Vec3fa extend(size_t i) const {
+      return bounds(i).size();
+    }
+    
+    /*! Returns bounds of all children (implemented later as specializations) */
+    __forceinline void bounds(BBox<vfloat4>& bounds0, BBox<vfloat4>& bounds1, BBox<vfloat4>& bounds2, BBox<vfloat4>& bounds3) const;
+    
+    /*! swap two children of the node */
+    __forceinline void swap(size_t i, size_t j)
+    {
+      assert(i<N && j<N);
+      std::swap(children[i],children[j]);
+      std::swap(lower_x[i],lower_x[j]);
+      std::swap(lower_y[i],lower_y[j]);
+      std::swap(lower_z[i],lower_z[j]);
+      std::swap(upper_x[i],upper_x[j]);
+      std::swap(upper_y[i],upper_y[j]);
+      std::swap(upper_z[i],upper_z[j]);
+    }
+
+    /*! swap the children of two nodes */
+    __forceinline static void swap(AABBNode_t* a, size_t i, AABBNode_t* b, size_t j)
+    {
+      assert(i<N && j<N);
+      std::swap(a->children[i],b->children[j]);
+      std::swap(a->lower_x[i],b->lower_x[j]);
+      std::swap(a->lower_y[i],b->lower_y[j]);
+      std::swap(a->lower_z[i],b->lower_z[j]);
+      std::swap(a->upper_x[i],b->upper_x[j]);
+      std::swap(a->upper_y[i],b->upper_y[j]);
+      std::swap(a->upper_z[i],b->upper_z[j]);
+    }
+
+    /*! compacts a node (moves empty children to the end) */
+    __forceinline static void compact(AABBNode_t* a)
+    {
+      /* find right most filled node */
+      ssize_t j=N;
+      for (j=j-1; j>=0; j--)
+        if (a->child(j) != NodeRef::emptyNode)
+          break;
+
+      /* replace empty nodes with filled nodes */
+      for (ssize_t i=0; i<j; i++) {
+        if (a->child(i) == NodeRef::emptyNode) {
+          a->swap(i,j);
+          for (j=j-1; j>i; j--)
+            if (a->child(j) != NodeRef::emptyNode)
+              break;
+        }
+      }
+    }
+    
+    /*! Returns reference to specified child */
+    __forceinline       NodeRef& child(size_t i)       { assert(i<N); return children[i]; }
+    __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; }
+    
+    /*! output operator */
+    friend embree_ostream operator<<(embree_ostream o, const AABBNode_t& n)
+    {
+      o << "AABBNode { " << embree_endl;
+      o << "  lower_x " << n.lower_x << embree_endl;
+      o << "  upper_x " << n.upper_x << embree_endl;
+      o << "  lower_y " << n.lower_y << embree_endl;
+      o << "  upper_y " << n.upper_y << embree_endl;
+      o << "  lower_z " << n.lower_z << embree_endl;
+      o << "  upper_z " << n.upper_z << embree_endl;
+      o << "  children = ";
+      for (size_t i=0; i<N; i++) o << n.children[i] << " ";
+      o << embree_endl;
+      o << "}" << embree_endl;
+      return o;
+    }
+    
+  public:
+    vfloat<N> lower_x;           //!< X dimension of lower bounds of all N children.
+    vfloat<N> upper_x;           //!< X dimension of upper bounds of all N children.
+    vfloat<N> lower_y;           //!< Y dimension of lower bounds of all N children.
+    vfloat<N> upper_y;           //!< Y dimension of upper bounds of all N children.
+    vfloat<N> lower_z;           //!< Z dimension of lower bounds of all N children.
+    vfloat<N> upper_z;           //!< Z dimension of upper bounds of all N children.
+  };
+
+  template<>
+    __forceinline void AABBNode_t<NodeRefPtr<4>,4>::bounds(BBox<vfloat4>& bounds0, BBox<vfloat4>& bounds1, BBox<vfloat4>& bounds2, BBox<vfloat4>& bounds3) const {
+    transpose(lower_x,lower_y,lower_z,vfloat4(zero),bounds0.lower,bounds1.lower,bounds2.lower,bounds3.lower);
+    transpose(upper_x,upper_y,upper_z,vfloat4(zero),bounds0.upper,bounds1.upper,bounds2.upper,bounds3.upper);
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h
new file mode 100644
index 0000000000..501f4bce5b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h
@@ -0,0 +1,247 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+  /*! Motion Blur AABBNode */
+  template<typename NodeRef, int N>
+    struct AABBNodeMB_t : public BaseNode_t<NodeRef, N>
+  {
+    using BaseNode_t<NodeRef,N>::children;
+    typedef BVHNodeRecord<NodeRef>     NodeRecord;
+    typedef BVHNodeRecordMB<NodeRef>   NodeRecordMB;
+    typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+    
+    struct Create
+    {
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (BuildRecord* children, const size_t num, const FastAllocator::CachedAllocator& alloc) const
+      {
+        AABBNodeMB_t* node = (AABBNodeMB_t*) alloc.malloc0(sizeof(AABBNodeMB_t),NodeRef::byteNodeAlignment); node->clear();
+        return NodeRef::encodeNode(node);
+      }
+    };
+    
+    struct Set
+    { 
+      template<typename BuildRecord>
+      __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const
+      {
+        AABBNodeMB_t* node = ref.getAABBNodeMB();
+        
+        LBBox3fa bounds = empty;
+        for (size_t i=0; i<num; i++) {
+          node->setRef(i,children[i].ref);
+          node->setBounds(i,children[i].lbounds);
+          bounds.extend(children[i].lbounds);
+        }
+        return NodeRecordMB(ref,bounds);
+      }
+    };
+    
+    struct SetTimeRange
+    {
+      __forceinline SetTimeRange(BBox1f tbounds) : tbounds(tbounds) {}
+      
+      template<typename BuildRecord>
+      __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const
+      {
+        AABBNodeMB_t* node = ref.getAABBNodeMB();
+        
+        LBBox3fa bounds = empty;
+        for (size_t i=0; i<num; i++) {
+          node->setRef(i, children[i].ref);
+          node->setBounds(i, children[i].lbounds, tbounds);
+          bounds.extend(children[i].lbounds);
+        }
+        return NodeRecordMB(ref,bounds);
+      }
+      
+      BBox1f tbounds;
+    };
+    
+    /*! Clears the node. */
+    __forceinline void clear()  {
+      lower_x = lower_y = lower_z = vfloat<N>(pos_inf);
+      upper_x = upper_y = upper_z = vfloat<N>(neg_inf);
+      lower_dx = lower_dy = lower_dz = vfloat<N>(0.0f);
+      upper_dx = upper_dy = upper_dz = vfloat<N>(0.0f);
+      BaseNode_t<NodeRef,N>::clear();
+    }
+    
+    /*! Sets ID of child. */
+    __forceinline void setRef(size_t i, NodeRef ref) {
+      children[i] = ref;
+    }
+    
+    /*! Sets bounding box of child. */
+    __forceinline void setBounds(size_t i, const BBox3fa& bounds0_i, const BBox3fa& bounds1_i)
+    {
+      /*! for empty bounds we have to avoid inf-inf=nan */
+      BBox3fa bounds0(min(bounds0_i.lower,Vec3fa(+FLT_MAX)),max(bounds0_i.upper,Vec3fa(-FLT_MAX)));
+      BBox3fa bounds1(min(bounds1_i.lower,Vec3fa(+FLT_MAX)),max(bounds1_i.upper,Vec3fa(-FLT_MAX)));
+      bounds0 = bounds0.enlarge_by(4.0f*float(ulp));
+      bounds1 = bounds1.enlarge_by(4.0f*float(ulp));
+      Vec3fa dlower = bounds1.lower-bounds0.lower;
+      Vec3fa dupper = bounds1.upper-bounds0.upper;
+      
+      lower_x[i] = bounds0.lower.x; lower_y[i] = bounds0.lower.y; lower_z[i] = bounds0.lower.z;
+      upper_x[i] = bounds0.upper.x; upper_y[i] = bounds0.upper.y; upper_z[i] = bounds0.upper.z;
+      
+      lower_dx[i] = dlower.x; lower_dy[i] = dlower.y; lower_dz[i] = dlower.z;
+      upper_dx[i] = dupper.x; upper_dy[i] = dupper.y; upper_dz[i] = dupper.z;
+    }
+    
+    /*! Sets bounding box of child. */
+    __forceinline void setBounds(size_t i, const LBBox3fa& bounds) {
+      setBounds(i, bounds.bounds0, bounds.bounds1);
+    }
+    
+    /*! Sets bounding box of child. */
+    __forceinline void setBounds(size_t i, const LBBox3fa& bounds, const BBox1f& tbounds) {
+      setBounds(i, bounds.global(tbounds));
+    }
+    
+    /*! Sets time-0 bounding box and ID of child (the lower_d*/upper_d* deltas are not modified). */
+    __forceinline void set(size_t i, NodeRef ref, const BBox3fa& bounds) {
+      lower_x[i] = bounds.lower.x; lower_y[i] = bounds.lower.y; lower_z[i] = bounds.lower.z;
+      upper_x[i] = bounds.upper.x; upper_y[i] = bounds.upper.y; upper_z[i] = bounds.upper.z;
+      children[i] = ref;
+    }
+    
+    /*! Sets bounding box and ID of child. */
+    __forceinline void set(size_t i, const NodeRecordMB4D& child)
+    {
+      setRef(i, child.ref);
+      setBounds(i, child.lbounds, child.dt);
+    }
+    
+    /*! Return bounding box for time 0 */
+    __forceinline BBox3fa bounds0(size_t i) const {
+      return BBox3fa(Vec3fa(lower_x[i],lower_y[i],lower_z[i]),
+                     Vec3fa(upper_x[i],upper_y[i],upper_z[i]));
+    }
+    
+    /*! Return bounding box for time 1 */
+    __forceinline BBox3fa bounds1(size_t i) const {
+      return BBox3fa(Vec3fa(lower_x[i]+lower_dx[i],lower_y[i]+lower_dy[i],lower_z[i]+lower_dz[i]),
+                     Vec3fa(upper_x[i]+upper_dx[i],upper_y[i]+upper_dy[i],upper_z[i]+upper_dz[i]));
+    }
+    
+    /*! Returns bounds of node. */
+    __forceinline BBox3fa bounds() const {
+      return BBox3fa(Vec3fa(reduce_min(min(lower_x,lower_x+lower_dx)),
+                            reduce_min(min(lower_y,lower_y+lower_dy)),
+                            reduce_min(min(lower_z,lower_z+lower_dz))),
+                     Vec3fa(reduce_max(max(upper_x,upper_x+upper_dx)),
+                            reduce_max(max(upper_y,upper_y+upper_dy)),
+                            reduce_max(max(upper_z,upper_z+upper_dz))));
+    }
+    
+    /*! Return bounding box of child i */
+    __forceinline BBox3fa bounds(size_t i) const {
+      return merge(bounds0(i),bounds1(i));
+    }
+    
+    /*! Return linear bounding box of child i */
+    __forceinline LBBox3fa lbounds(size_t i) const {
+      return LBBox3fa(bounds0(i),bounds1(i));
+    }
+    
+    /*! Return bounding box of child i at specified time */
+    __forceinline BBox3fa bounds(size_t i, float time) const {
+      return lerp(bounds0(i),bounds1(i),time);
+    }
+    
+    /*! Returns the expected half surface area of child i when randomly sampling the time. */
+    __forceinline float expectedHalfArea(size_t i) const {
+      return lbounds(i).expectedHalfArea();
+    }
+    
+    /*! Returns the expected half surface area of child i; t0t1 is forwarded to LBBox3fa::expectedHalfArea. */
+    __forceinline float expectedHalfArea(size_t i, const BBox1f& t0t1) const {
+      return lbounds(i).expectedHalfArea(t0t1); 
+    }
+    
+    /*! swap two children of the node */
+    __forceinline void swap(size_t i, size_t j)
+    {
+      assert(i<N && j<N);
+      std::swap(children[i],children[j]);
+      
+      std::swap(lower_x[i],lower_x[j]);
+      std::swap(upper_x[i],upper_x[j]);
+      std::swap(lower_y[i],lower_y[j]);
+      std::swap(upper_y[i],upper_y[j]);
+      std::swap(lower_z[i],lower_z[j]);
+      std::swap(upper_z[i],upper_z[j]);
+      
+      std::swap(lower_dx[i],lower_dx[j]);
+      std::swap(upper_dx[i],upper_dx[j]);
+      std::swap(lower_dy[i],lower_dy[j]);
+      std::swap(upper_dy[i],upper_dy[j]);
+      std::swap(lower_dz[i],lower_dz[j]);
+      std::swap(upper_dz[i],upper_dz[j]);
+    }
+
+    /*! compacts a node (moves empty children to the end) */
+    __forceinline static void compact(AABBNodeMB_t* a)
+    {
+      /* find right most filled node */
+      ssize_t j=N;
+      for (j=j-1; j>=0; j--)
+        if (a->child(j) != NodeRef::emptyNode)
+          break;
+
+      /* replace empty nodes with filled nodes */
+      for (ssize_t i=0; i<j; i++) {
+        if (a->child(i) == NodeRef::emptyNode) {
+          a->swap(i,j);
+          for (j=j-1; j>i; j--)
+            if (a->child(j) != NodeRef::emptyNode)
+              break;
+        }
+      }
+    }
+    
+    /*! Returns reference to specified child */
+    __forceinline       NodeRef& child(size_t i)       { assert(i<N); return children[i]; }
+    __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; }
+    
+    /*! stream output operator */
+    friend embree_ostream operator<<(embree_ostream cout, const AABBNodeMB_t& n) 
+    {
+      cout << "AABBNodeMB {" << embree_endl;
+      for (size_t i=0; i<N; i++) 
+      {
+        const BBox3fa b0 = n.bounds0(i);
+        const BBox3fa b1 = n.bounds1(i);
+        cout << "  child" << i << " { " << embree_endl;
+        cout << "    bounds0 = " << b0 << ", " << embree_endl;
+        cout << "    bounds1 = " << b1 << ", " << embree_endl;
+        cout << "  }";
+      }
+      cout << "}";
+      return cout;
+    }
+    
+  public:
+    vfloat<N> lower_x;        //!< X dimension of lower bounds of all N children.
+    vfloat<N> upper_x;        //!< X dimension of upper bounds of all N children.
+    vfloat<N> lower_y;        //!< Y dimension of lower bounds of all N children.
+    vfloat<N> upper_y;        //!< Y dimension of upper bounds of all N children.
+    vfloat<N> lower_z;        //!< Z dimension of lower bounds of all N children.
+    vfloat<N> upper_z;        //!< Z dimension of upper bounds of all N children.
+    
+    vfloat<N> lower_dx;        //!< X dimension of lower bounds delta (time 1 minus time 0) of all N children.
+    vfloat<N> upper_dx;        //!< X dimension of upper bounds delta (time 1 minus time 0) of all N children.
+    vfloat<N> lower_dy;        //!< Y dimension of lower bounds delta (time 1 minus time 0) of all N children.
+    vfloat<N> upper_dy;        //!< Y dimension of upper bounds delta (time 1 minus time 0) of all N children.
+    vfloat<N> lower_dz;        //!< Z dimension of lower bounds delta (time 1 minus time 0) of all N children.
+    vfloat<N> upper_dz;        //!< Z dimension of upper bounds delta (time 1 minus time 0) of all N children.
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h
new file mode 100644
index 0000000000..e968bbbc39
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h
@@ -0,0 +1,107 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_aabb_mb.h"
+
+namespace embree
+{
+  /*! Aligned 4D Motion Blur Node: an AABBNodeMB_t that additionally stores a time range (lower_t/upper_t) for each of its N children. */
+  template<typename NodeRef, int N>
+    struct AABBNodeMB4D_t : public AABBNodeMB_t<NodeRef, N>
+  {
+    using BaseNode_t<NodeRef,N>::children;
+    using AABBNodeMB_t<NodeRef,N>::set;
+
+    typedef BVHNodeRecord<NodeRef>     NodeRecord;
+    typedef BVHNodeRecordMB<NodeRef>   NodeRecordMB;
+    typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+    
+    struct Create // functor: allocates a cleared node from alloc and returns its encoded reference
+    {
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (BuildRecord*, const size_t, const FastAllocator::CachedAllocator& alloc, bool hasTimeSplits = true) const
+      {
+        if (hasTimeSplits) // time splits present: allocate the 4D node that carries lower_t/upper_t
+        {
+          AABBNodeMB4D_t* node = (AABBNodeMB4D_t*) alloc.malloc0(sizeof(AABBNodeMB4D_t),NodeRef::byteNodeAlignment); node->clear();
+          return NodeRef::encodeNode(node);
+        }
+        else // no time splits: allocate the base AABBNodeMB_t, which has no time-range members
+        {
+          AABBNodeMB_t<NodeRef,N>* node = (AABBNodeMB_t<NodeRef,N>*) alloc.malloc0(sizeof(AABBNodeMB_t<NodeRef,N>),NodeRef::byteNodeAlignment); node->clear();
+          return NodeRef::encodeNode(node);
+        }
+      }
+    };
+
+    struct Set // functor: writes the first num child records into the node referenced by ref
+    {
+      template<typename BuildRecord>
+      __forceinline void operator() (const BuildRecord&, const BuildRecord*, NodeRef ref, NodeRecordMB4D* children, const size_t num) const
+      {
+        if (likely(ref.isAABBNodeMB())) { // Create may have produced a plain AABBNodeMB_t (hasTimeSplits == false)
+          for (size_t i=0; i<num; i++)
+            ref.getAABBNodeMB()->set(i, children[i]);
+        } else { // otherwise the reference is treated as a 4D node (getAABBNodeMB4D asserts the type)
+          for (size_t i=0; i<num; i++)
+            ref.getAABBNodeMB4D()->set(i, children[i]);
+        }
+      }
+    };
+
+    /*! Clears the node: every child's time range becomes [+inf,-inf], then the base class state is cleared. */
+    __forceinline void clear()  {
+      lower_t = vfloat<N>(pos_inf);
+      upper_t = vfloat<N>(neg_inf);
+      AABBNodeMB_t<NodeRef,N>::clear();
+    }
+    
+    /*! Sets bounding box of child i: the base class receives bounds.global(tbounds), and tbounds is stored as the child's time range. */
+    __forceinline void setBounds(size_t i, const LBBox3fa& bounds, const BBox1f& tbounds)
+    {
+      AABBNodeMB_t<NodeRef,N>::setBounds(i, bounds.global(tbounds));
+      lower_t[i] = tbounds.lower;
+      upper_t[i] = tbounds.upper == 1.0f ? 1.0f+float(ulp) : tbounds.upper; // an upper bound of exactly 1.0f is stored slightly larger (presumably so time 1 passes a strict upper test - TODO confirm in the traverser)
+    }
+    
+    /*! Sets bounding box and ID of child i from a 4D build record (ref, lbounds and time range dt). */
+    __forceinline void set(size_t i, const NodeRecordMB4D& child) {
+      AABBNodeMB_t<NodeRef,N>::setRef(i,child.ref);
+      setBounds(i, child.lbounds, child.dt);
+    }
+    
+    /*! Returns the expected surface area when randomly sampling the time, evaluated over the time range of child i. */
+    __forceinline float expectedHalfArea(size_t i) const {
+      return AABBNodeMB_t<NodeRef,N>::lbounds(i).expectedHalfArea(timeRange(i));
+    }
+    
+    /*! returns time range [lower_t[i], upper_t[i]] for specified child */
+    __forceinline BBox1f timeRange(size_t i) const {
+      return BBox1f(lower_t[i],upper_t[i]);
+    }
+    
+    /*! stream output operator: prints, per child, bounds0/bounds1 interpolated at the child's lower_t/upper_t and the time range itself */
+    friend embree_ostream operator<<(embree_ostream cout, const AABBNodeMB4D_t& n) 
+    {
+      cout << "AABBNodeMB4D {" << embree_endl;
+      for (size_t i=0; i<N; i++) 
+      {
+        const BBox3fa b0 = n.bounds0(i);
+        const BBox3fa b1 = n.bounds1(i);
+        cout << "  child" << i << " { " << embree_endl;
+        cout << "    bounds0 = " << lerp(b0,b1,n.lower_t[i]) << ", " << embree_endl;
+        cout << "    bounds1 = " << lerp(b0,b1,n.upper_t[i]) << ", " << embree_endl;
+        cout << "    time_bounds = " << n.lower_t[i] << ", " << n.upper_t[i] << embree_endl;
+        cout << "  }"; // no embree_endl here, so the next child's header continues on the same output line
+      }
+      cout << "}";
+      return cout;
+    }
+    
+  public:
+    vfloat<N> lower_t;        //!< time dimension of lower bounds of all N children
+    vfloat<N> upper_t;        //!< time dimension of upper bounds of all N children
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h
new file mode 100644
index 0000000000..8268f3b932
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h
@@ -0,0 +1,43 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_ref.h"
+
+namespace embree
+{
+  
+  /*! BVHN Base Node: holds the N child references shared by all node types derived from it. */
+  template<typename NodeRef, int N>
+    struct BaseNode_t
+  {
+    /*! Clears the node by setting all N child references to NodeRef::emptyNode. */
+    __forceinline void clear()
+    {
+      for (size_t i=0; i<N; i++)
+        children[i] = NodeRef::emptyNode;
+    }
+    
+    /*! Returns reference to specified child (the index is only checked by assert) */
+    __forceinline       NodeRef& child(size_t i)       { assert(i<N); return children[i]; }
+    __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; }
+    
+    /*! verifies the node: returns false if any non-empty child follows an empty one, i.e. used slots must be packed at the front */
+    __forceinline bool verify() const
+    {
+      for (size_t i=0; i<N; i++) {
+        if (child(i) == NodeRef::emptyNode) { // first empty slot found
+          for (; i<N; i++) { // every remaining slot must be empty as well
+            if (child(i) != NodeRef::emptyNode)
+              return false;
+          }
+          break;
+        }
+      }
+      return true;
+    }
+    
+    NodeRef children[N];    //!< Pointer to the N children (can be a node or leaf)
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb.h
new file mode 100644
index 0000000000..fa7cc08211
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb.h
@@ -0,0 +1,98 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+  /*! Node with unaligned bounds: stores one affine space per child (SoA layout in naabb) in which the child's box is the unit cube. */
+  template<typename NodeRef, int N>
+    struct OBBNode_t : public BaseNode_t<NodeRef, N>
+  {
+    using BaseNode_t<NodeRef,N>::children;
+    
+    struct Create // functor: allocates a cleared OBBNode_t from alloc and returns its encoded reference
+    {
+      __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc) const
+      {
+        OBBNode_t* node = (OBBNode_t*) alloc.malloc0(sizeof(OBBNode_t),NodeRef::byteNodeAlignment); node->clear();
+        return NodeRef::encodeNode(node);
+      }
+    };
+    
+    struct Set // functor: stores child reference and oriented bounds in slot i of the given node
+    {
+      __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const OBBox3fa& bounds) const {
+        node.ungetAABBNode()->setRef(i,child); // ungetAABBNode() is the OBBNode_t accessor of the node reference
+        node.ungetAABBNode()->setBounds(i,bounds);
+      }
+    };
+    
+    /*! Clears the node: all spaces are filled with NaN and all child references are reset by the base class. */
+    __forceinline void clear()
+    {
+      naabb.l.vx = Vec3fa(nan);
+      naabb.l.vy = Vec3fa(nan);
+      naabb.l.vz = Vec3fa(nan);
+      naabb.p    = Vec3fa(nan);
+      BaseNode_t<NodeRef,N>::clear();
+    }
+    
+    /*! Sets bounding box of child i: b.space is shifted by -b.bounds.lower and scaled by the reciprocal box size before being stored. */
+    __forceinline void setBounds(size_t i, const OBBox3fa& b)
+    {
+      assert(i < N);
+      
+      AffineSpace3fa space = b.space;
+      space.p -= b.bounds.lower;
+      space = AffineSpace3fa::scale(1.0f/max(Vec3fa(1E-19f),b.bounds.upper-b.bounds.lower))*space; // size clamped to at least 1E-19 per axis before taking the reciprocal
+      
+      naabb.l.vx.x[i] = space.l.vx.x;
+      naabb.l.vx.y[i] = space.l.vx.y;
+      naabb.l.vx.z[i] = space.l.vx.z;
+      
+      naabb.l.vy.x[i] = space.l.vy.x;
+      naabb.l.vy.y[i] = space.l.vy.y;
+      naabb.l.vy.z[i] = space.l.vy.z;
+      
+      naabb.l.vz.x[i] = space.l.vz.x;
+      naabb.l.vz.y[i] = space.l.vz.y;
+      naabb.l.vz.z[i] = space.l.vz.z;
+      
+      naabb.p.x[i] = space.p.x;
+      naabb.p.y[i] = space.p.y;
+      naabb.p.z[i] = space.p.z;
+    }
+    
+    /*! Sets ID of child i (the index is only checked by assert). */
+    __forceinline void setRef(size_t i, const NodeRef& ref) {
+      assert(i < N);
+      children[i] = ref;
+    }
+    
+    /*! Returns the extent of the bounds of the ith child, computed as rsqrt of the summed squares of the stored vx, vy and vz vectors */
+    __forceinline Vec3fa extent(size_t i) const {
+      assert(i<N);
+      const Vec3fa vx(naabb.l.vx.x[i],naabb.l.vx.y[i],naabb.l.vx.z[i]);
+      const Vec3fa vy(naabb.l.vy.x[i],naabb.l.vy.y[i],naabb.l.vy.z[i]);
+      const Vec3fa vz(naabb.l.vz.x[i],naabb.l.vz.y[i],naabb.l.vz.z[i]);
+      return rsqrt(vx*vx + vy*vy + vz*vz);
+    }
+    
+    /*! Returns reference to specified child (the index is only checked by assert) */
+    __forceinline       NodeRef& child(size_t i)       { assert(i<N); return children[i]; }
+    __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; }
+    
+    /*! output operator: prints the stored spaces (naabb) only, not the child references */
+    friend embree_ostream operator<<(embree_ostream o, const OBBNode_t& n)
+    {
+      o << "UnAABBNode { " << n.naabb << " } " << embree_endl;
+      return o;
+    }
+    
+  public:
+    AffineSpace3vf<N> naabb;   //!< non-axis aligned bounding boxes (bounds are [0,1] in specified space)
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h
new file mode 100644
index 0000000000..834cf5ec28
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h
@@ -0,0 +1,90 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+  template<typename NodeRef, int N> // node storing, per child, an affine space (space0) and a second box (b1) expressed in that space
+    struct OBBNodeMB_t : public BaseNode_t<NodeRef, N>
+  {
+    using BaseNode_t<NodeRef,N>::children;
+    
+    struct Create // functor: allocates a cleared OBBNodeMB_t from alloc and returns its encoded reference
+    {
+      __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc) const
+      {
+        OBBNodeMB_t* node = (OBBNodeMB_t*) alloc.malloc0(sizeof(OBBNodeMB_t),NodeRef::byteNodeAlignment); node->clear();
+        return NodeRef::encodeNode(node);
+      }
+    };
+    
+    struct Set // functor: stores child reference, space and linear bounds (taken as lbounds.global(dt)) in slot i
+    {
+      __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const LinearSpace3fa& space, const LBBox3fa& lbounds, const BBox1f dt) const {
+        node.ungetAABBNodeMB()->setRef(i,child); // ungetAABBNodeMB() is the OBBNodeMB_t accessor of the node reference
+        node.ungetAABBNodeMB()->setBounds(i,space,lbounds.global(dt));
+      }
+    };
+    
+    /*! Clears the node: space0 is set to one, b1 is filled with NaN and the child references are reset by the base class. */
+    __forceinline void clear()
+    {
+      space0 = one;
+      //b0.lower = b0.upper = Vec3fa(nan);
+      b1.lower = b1.upper = Vec3fa(nan);
+      BaseNode_t<NodeRef,N>::clear();
+    }
+    
+    /*! Sets space and bounding boxes of child i from linear bounds (forwards bounds0 and bounds1 to the overload below). */
+    __forceinline void setBounds(size_t i, const AffineSpace3fa& space, const LBBox3fa& lbounds) {
+      setBounds(i,space,lbounds.bounds0,lbounds.bounds1);
+    }
+    
+    /*! Sets space and bounding boxes of child i: s0 is shifted and scaled by box a before being stored, and box c is stored relative to a as b1. */
+    __forceinline void setBounds(size_t i, const AffineSpace3fa& s0, const BBox3fa& a, const BBox3fa& c)
+    {
+      assert(i < N);
+      
+      AffineSpace3fa space = s0;
+      space.p -= a.lower;
+      Vec3fa scale = 1.0f/max(Vec3fa(1E-19f),a.upper-a.lower); // size clamped to at least 1E-19 per axis before taking the reciprocal
+      space = AffineSpace3fa::scale(scale)*space;
+      BBox3fa a1((a.lower-a.lower)*scale,(a.upper-a.lower)*scale); // only referenced by the commented-out b0 stores below
+      BBox3fa c1((c.lower-a.lower)*scale,(c.upper-a.lower)*scale); // box c shifted and scaled by the same a.lower/scale as the space
+      
+      space0.l.vx.x[i] = space.l.vx.x; space0.l.vx.y[i] = space.l.vx.y; space0.l.vx.z[i] = space.l.vx.z;
+      space0.l.vy.x[i] = space.l.vy.x; space0.l.vy.y[i] = space.l.vy.y; space0.l.vy.z[i] = space.l.vy.z;
+      space0.l.vz.x[i] = space.l.vz.x; space0.l.vz.y[i] = space.l.vz.y; space0.l.vz.z[i] = space.l.vz.z;
+      space0.p   .x[i] = space.p   .x; space0.p   .y[i] = space.p   .y; space0.p   .z[i] = space.p   .z;
+      
+      /*b0.lower.x[i] = a1.lower.x; b0.lower.y[i] = a1.lower.y; b0.lower.z[i] = a1.lower.z;
+        b0.upper.x[i] = a1.upper.x; b0.upper.y[i] = a1.upper.y; b0.upper.z[i] = a1.upper.z;*/
+      
+      b1.lower.x[i] = c1.lower.x; b1.lower.y[i] = c1.lower.y; b1.lower.z[i] = c1.lower.z;
+      b1.upper.x[i] = c1.upper.x; b1.upper.y[i] = c1.upper.y; b1.upper.z[i] = c1.upper.z;
+    }
+    
+    /*! Sets ID of child i (the index is only checked by assert). */
+    __forceinline void setRef(size_t i, const NodeRef& ref) {
+      assert(i < N);
+      children[i] = ref;
+    }
+    
+    /*! Returns the extent of the bounds of the ith child, computed as rsqrt of the summed squares of the stored space0 vectors */
+    __forceinline Vec3fa extent0(size_t i) const {
+      assert(i < N);
+      const Vec3fa vx(space0.l.vx.x[i],space0.l.vx.y[i],space0.l.vx.z[i]);
+      const Vec3fa vy(space0.l.vy.x[i],space0.l.vy.y[i],space0.l.vy.z[i]);
+      const Vec3fa vz(space0.l.vz.x[i],space0.l.vz.y[i],space0.l.vz.z[i]);
+      return rsqrt(vx*vx + vy*vy + vz*vz);
+    }
+    
+  public:
+    AffineSpace3vf<N> space0; // per-child space written by setBounds (already shifted and scaled by box a)
+    //BBox3vf<N> b0; // these are the unit bounds
+    BBox3vf<N> b1; // per-child box c expressed relative to box a (see setBounds)
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h
new file mode 100644
index 0000000000..5212821f3f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h
@@ -0,0 +1,265 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+  /*! BVHN Quantized Node: stores the bounds of N children as 8-bit values per plane, plus one float start/scale pair per axis used to decode them. */
+  template<int N>
+    struct __aligned(8) QuantizedBaseNode_t
+  {
+    typedef unsigned char T;
+    static const T MIN_QUAN = 0;
+    static const T MAX_QUAN = 255;
+    
+    /*! Clears the node: every child gets lower = MAX_QUAN and upper = MIN_QUAN, which validMask() (lower_x <= upper_x) reports as invalid. */
+    __forceinline void clear() {
+      for (size_t i=0; i<N; i++) lower_x[i] = lower_y[i] = lower_z[i] = MAX_QUAN;
+      for (size_t i=0; i<N; i++) upper_x[i] = upper_y[i] = upper_z[i] = MIN_QUAN;
+    }
+    
+    /*! Returns bounds of specified child, decoded per axis as scale*quantized + start. */
+    __forceinline BBox3fa bounds(size_t i) const
+    {
+      assert(i < N);
+      const Vec3fa lower(madd(scale.x,(float)lower_x[i],start.x),
+                         madd(scale.y,(float)lower_y[i],start.y),
+                         madd(scale.z,(float)lower_z[i],start.z));
+      const Vec3fa upper(madd(scale.x,(float)upper_x[i],start.x),
+                         madd(scale.y,(float)upper_y[i],start.y),
+                         madd(scale.z,(float)upper_z[i],start.z));
+      return BBox3fa(lower,upper);
+    }
+    
+    /*! Returns extent of bounds of specified child (size of the decoded box). */
+    __forceinline Vec3fa extent(size_t i) const {
+      return bounds(i).size();
+    }
+    
+    static __forceinline void init_dim(const vfloat<N> &lower, // quantizes one axis: writes N lower/upper bytes and the start/scale needed to decode them
+                                       const vfloat<N> &upper,
+                                       T lower_quant[N],
+                                       T upper_quant[N],
+                                       float &start,
+                                       float &scale)
+    {
+      /* quantize bounds: map [minF,maxF] of this axis onto the integer range [MIN_QUAN,MAX_QUAN] */
+      const vbool<N> m_valid = lower != vfloat<N>(pos_inf); // lanes whose lower bound is +inf are treated as empty
+      const float minF = reduce_min(lower);
+      const float maxF = reduce_max(upper);
+      float diff = (1.0f+2.0f*float(ulp))*(maxF - minF); // range slightly enlarged before dividing
+      float decode_scale = diff / float(MAX_QUAN);
+      if (decode_scale == 0.0f) decode_scale = 2.0f*FLT_MIN; // result may have been flushed to zero
+      assert(madd(decode_scale,float(MAX_QUAN),minF) >= maxF);
+      const float encode_scale = diff > 0 ? (float(MAX_QUAN) / diff) : 0.0f;
+      vint<N> ilower = max(vint<N>(floor((lower - vfloat<N>(minF))*vfloat<N>(encode_scale))),MIN_QUAN); // lower bounds round down
+      vint<N> iupper = min(vint<N>(ceil ((upper - vfloat<N>(minF))*vfloat<N>(encode_scale))),MAX_QUAN); // upper bounds round up
+      
+      /* lower/upper correction: move one step outwards wherever the decoded value would not enclose the input bound */
+      vbool<N> m_lower_correction = (madd(vfloat<N>(ilower),decode_scale,minF)) > lower;
+      vbool<N> m_upper_correction = (madd(vfloat<N>(iupper),decode_scale,minF)) < upper;
+      ilower = max(select(m_lower_correction,ilower-1,ilower),MIN_QUAN);
+      iupper = min(select(m_upper_correction,iupper+1,iupper),MAX_QUAN);
+      
+      /* disable invalid lanes: same lower = MAX_QUAN / upper = MIN_QUAN encoding that clear() writes */
+      ilower = select(m_valid,ilower,MAX_QUAN);
+      iupper = select(m_valid,iupper,MIN_QUAN);
+      
+      /* store as uchar to memory */
+      vint<N>::store(lower_quant,ilower);
+      vint<N>::store(upper_quant,iupper);
+      start = minF;
+      scale = decode_scale;
+      
+#if defined(DEBUG)
+      vfloat<N> extract_lower( vint<N>::loadu(lower_quant) ); // debug only: decode again and check that valid lanes enclose the input
+      vfloat<N> extract_upper( vint<N>::loadu(upper_quant) );
+      vfloat<N> final_extract_lower = madd(extract_lower,decode_scale,minF);
+      vfloat<N> final_extract_upper = madd(extract_upper,decode_scale,minF);
+      assert( (movemask(final_extract_lower <= lower ) & movemask(m_valid)) == movemask(m_valid));
+      assert( (movemask(final_extract_upper >= upper ) & movemask(m_valid)) == movemask(m_valid));
+#endif
+    }
+    
+    __forceinline void init_dim(AABBNode_t<NodeRefPtr<N>,N>& node) // quantizes all three axes of an uncompressed node into this node
+    {
+      init_dim(node.lower_x,node.upper_x,lower_x,upper_x,start.x,scale.x);
+      init_dim(node.lower_y,node.upper_y,lower_y,upper_y,start.y,scale.y);
+      init_dim(node.lower_z,node.upper_z,lower_z,upper_z,start.z,scale.z);
+    }
+    
+    __forceinline vbool<N> validMask() const { return vint<N>::loadu(lower_x) <= vint<N>::loadu(upper_x); } // a child counts as valid when its quantized X range is not inverted
+    
+#if defined(__AVX512F__) // KNL
+    __forceinline vbool16 validMask16() const { return le(0xff,vint<16>::loadu(lower_x),vint<16>::loadu(upper_x)); }
+#endif
+    __forceinline vfloat<N> dequantizeLowerX() const { return madd(vfloat<N>(vint<N>::loadu(lower_x)),scale.x,vfloat<N>(start.x)); } // decoded lower X of all N children
+    
+    __forceinline vfloat<N> dequantizeUpperX() const { return madd(vfloat<N>(vint<N>::loadu(upper_x)),scale.x,vfloat<N>(start.x)); } // decoded upper X of all N children
+    
+    __forceinline vfloat<N> dequantizeLowerY() const { return madd(vfloat<N>(vint<N>::loadu(lower_y)),scale.y,vfloat<N>(start.y)); } // decoded lower Y of all N children
+    
+    __forceinline vfloat<N> dequantizeUpperY() const { return madd(vfloat<N>(vint<N>::loadu(upper_y)),scale.y,vfloat<N>(start.y)); } // decoded upper Y of all N children
+    
+    __forceinline vfloat<N> dequantizeLowerZ() const { return madd(vfloat<N>(vint<N>::loadu(lower_z)),scale.z,vfloat<N>(start.z)); } // decoded lower Z of all N children
+    
+    __forceinline vfloat<N> dequantizeUpperZ() const { return madd(vfloat<N>(vint<N>::loadu(upper_z)),scale.z,vfloat<N>(start.z)); } // decoded upper Z of all N children
+    
+    template <int M>
+      __forceinline vfloat<M> dequantize(const size_t offset) const { return vfloat<M>(vint<M>::loadu(all_planes+offset)); } // raw conversion of M bytes at all_planes+offset; start/scale are NOT applied here
+    
+#if defined(__AVX512F__)
+    __forceinline vfloat16 dequantizeLowerUpperX(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_x),p)),scale.x,vfloat16(start.x)); }
+    __forceinline vfloat16 dequantizeLowerUpperY(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_y),p)),scale.y,vfloat16(start.y)); }
+    __forceinline vfloat16 dequantizeLowerUpperZ(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_z),p)),scale.z,vfloat16(start.z)); }      
+#endif
+    
+    union {
+      struct {
+        T lower_x[N]; //!< 8bit discretized X dimension of lower bounds of all N children
+        T upper_x[N]; //!< 8bit discretized X dimension of upper bounds of all N children
+        T lower_y[N]; //!< 8bit discretized Y dimension of lower bounds of all N children
+        T upper_y[N]; //!< 8bit discretized Y dimension of upper bounds of all N children
+        T lower_z[N]; //!< 8bit discretized Z dimension of lower bounds of all N children
+        T upper_z[N]; //!< 8bit discretized Z dimension of upper bounds of all N children
+      };
+      T all_planes[6*N]; //!< the same six planes viewed as one contiguous byte array (used by dequantize<M>)
+    };
+    
+    Vec3f start; //!< per-axis decode offset (the minF written by init_dim)
+    Vec3f scale; //!< per-axis decode factor (the decode_scale written by init_dim)
+    
+    friend embree_ostream operator<<(embree_ostream o, const QuantizedBaseNode_t& n) // prints start, scale and the six raw quantized planes
+    {
+      o << "QuantizedBaseNode { " << embree_endl;
+      o << "  start   " << n.start << embree_endl;
+      o << "  scale   " << n.scale << embree_endl;
+      o << "  lower_x " << vuint<N>::loadu(n.lower_x) << embree_endl;
+      o << "  upper_x " << vuint<N>::loadu(n.upper_x) << embree_endl;
+      o << "  lower_y " << vuint<N>::loadu(n.lower_y) << embree_endl;
+      o << "  upper_y " << vuint<N>::loadu(n.upper_y) << embree_endl;
+      o << "  lower_z " << vuint<N>::loadu(n.lower_z) << embree_endl;
+      o << "  upper_z " << vuint<N>::loadu(n.upper_z) << embree_endl;
+      o << "}" << embree_endl;
+      return o;
+    }
+    
+  };
+
+  template<typename NodeRef, int N> // quantized node with children: combines the child references of BaseNode_t with the 8-bit bounds of QuantizedBaseNode_t
+    struct __aligned(8) QuantizedNode_t : public BaseNode_t<NodeRef, N>, QuantizedBaseNode_t<N>
+  {
+    using BaseNode_t<NodeRef,N>::children;
+    using QuantizedBaseNode_t<N>::lower_x;
+    using QuantizedBaseNode_t<N>::upper_x;
+    using QuantizedBaseNode_t<N>::lower_y;
+    using QuantizedBaseNode_t<N>::upper_y;
+    using QuantizedBaseNode_t<N>::lower_z;
+    using QuantizedBaseNode_t<N>::upper_z;
+    using QuantizedBaseNode_t<N>::start;
+    using QuantizedBaseNode_t<N>::scale;
+    using QuantizedBaseNode_t<N>::init_dim;
+    
+    __forceinline void setRef(size_t i, const NodeRef& ref) { // stores the reference of child i (the index is only checked by assert)
+      assert(i < N);
+      children[i] = ref;
+    }
+    
+    struct Create2 // functor: builds a quantized node from the bounds of n build records
+    {
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (BuildRecord* children, const size_t n, const FastAllocator::CachedAllocator& alloc) const
+      {
+        __aligned(64) AABBNode_t<NodeRef,N> node; // temporary full-precision node used only as quantization input
+        node.clear();
+        for (size_t i=0; i<n; i++) {
+          node.setBounds(i,children[i].bounds());
+        }
+        QuantizedNode_t *qnode = (QuantizedNode_t*) alloc.malloc0(sizeof(QuantizedNode_t), NodeRef::byteAlignment);
+        qnode->init(node);
+        
+        return (size_t)qnode | NodeRef::tyQuantizedNode; // type tag is OR-ed into the address here rather than via encodeNode
+      }
+    };
+    
+    struct Set2 // functor: writes num child references into the quantized node behind ref and returns ref
+    {
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
+      {
+        QuantizedNode_t* node = ref.quantizedNode();
+        for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
+        return ref;
+      }
+    };
+    
+    __forceinline void init(AABBNode_t<NodeRef,N>& node) // resets all child references to emptyNode and quantizes the bounds of node
+    {
+      for (size_t i=0;i<N;i++) children[i] = NodeRef::emptyNode;
+      init_dim(node);
+    }
+    
+  }; 
+  
+  /*! BVHN Quantized Motion Blur Node: a pair of quantized nodes (node0, node1) whose decoded bounds are interpolated by a time parameter. */
+  template<int N>
+    struct __aligned(8) QuantizedBaseNodeMB_t
+  {
+    QuantizedBaseNode_t<N> node0; // first operand of every lerp below
+    QuantizedBaseNode_t<N> node1; // second operand of every lerp below
+    
+    /*! Clears the node (both node0 and node1). */
+    __forceinline void clear() {
+      node0.clear();
+      node1.clear();
+    }
+    
+    /*! Returns bounds of specified child: the box of node0 extended by the box of node1. */
+    __forceinline BBox3fa bounds(size_t i) const
+    {
+      assert(i < N);
+      BBox3fa bounds0 = node0.bounds(i);
+      BBox3fa bounds1 = node1.bounds(i);
+      bounds0.extend(bounds1);
+      return bounds0;
+    }
+    
+    /*! Returns extent of bounds of specified child (size of the merged box). */
+    __forceinline Vec3fa extent(size_t i) const {
+      return bounds(i).size();
+    }
+    
+    __forceinline vbool<N> validMask() const { return node0.validMask(); } // validity is taken from node0 only
+    
+    template<typename T> // the six overloads below decode one plane of all N children, interpolated between node0 and node1 at t
+      __forceinline vfloat<N> dequantizeLowerX(const T t) const { return lerp(node0.dequantizeLowerX(),node1.dequantizeLowerX(),t); }
+    template<typename T>
+      __forceinline vfloat<N> dequantizeUpperX(const T t) const { return lerp(node0.dequantizeUpperX(),node1.dequantizeUpperX(),t); }
+    template<typename T>
+      __forceinline vfloat<N> dequantizeLowerY(const T t) const { return lerp(node0.dequantizeLowerY(),node1.dequantizeLowerY(),t); }
+    template<typename T>
+      __forceinline vfloat<N> dequantizeUpperY(const T t) const { return lerp(node0.dequantizeUpperY(),node1.dequantizeUpperY(),t); }
+    template<typename T>
+      __forceinline vfloat<N> dequantizeLowerZ(const T t) const { return lerp(node0.dequantizeLowerZ(),node1.dequantizeLowerZ(),t); }
+    template<typename T>
+      __forceinline vfloat<N> dequantizeUpperZ(const T t) const { return lerp(node0.dequantizeUpperZ(),node1.dequantizeUpperZ(),t); }
+    
+    
+    template<int M> // the six overloads below broadcast the decoded plane value of child i to M lanes and interpolate it at the M values in t
+      __forceinline vfloat<M> dequantizeLowerX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerX()[i]),vfloat<M>(node1.dequantizeLowerX()[i]),t); }
+    template<int M>
+      __forceinline vfloat<M> dequantizeUpperX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperX()[i]),vfloat<M>(node1.dequantizeUpperX()[i]),t); }
+    template<int M>
+      __forceinline vfloat<M> dequantizeLowerY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerY()[i]),vfloat<M>(node1.dequantizeLowerY()[i]),t); }
+    template<int M>
+      __forceinline vfloat<M> dequantizeUpperY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperY()[i]),vfloat<M>(node1.dequantizeUpperY()[i]),t); }
+    template<int M>
+      __forceinline vfloat<M> dequantizeLowerZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerZ()[i]),vfloat<M>(node1.dequantizeLowerZ()[i]),t); }
+    template<int M>
+      __forceinline vfloat<M> dequantizeUpperZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperZ()[i]),vfloat<M>(node1.dequantizeUpperZ()[i]),t); }
+    
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h
new file mode 100644
index 0000000000..0f6d4dac7e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h
@@ -0,0 +1,242 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/alloc.h"
+#include "../common/accel.h"
+#include "../common/device.h"
+#include "../common/scene.h"
+#include "../geometry/primitive.h"
+#include "../common/ray.h"
+
+namespace embree
+{
+  /* BVH node reference with bounds: pairs a NodeRef with a BBox3fx (a BBox3fa argument is converted on construction) */
+  template<typename NodeRef>
+  struct BVHNodeRecord
+  {
+    __forceinline BVHNodeRecord() {} // leaves ref and bounds uninitialized
+    __forceinline BVHNodeRecord(NodeRef ref, const BBox3fa& bounds) : ref(ref), bounds((BBox3fx)bounds) {}
+    __forceinline BVHNodeRecord(NodeRef ref, const BBox3fx& bounds) : ref(ref), bounds(bounds) {}
+
+    NodeRef ref;
+    BBox3fx bounds;
+  };
+
+  template<typename NodeRef> // BVH node reference paired with linear bounds (LBBox3fa)
+  struct BVHNodeRecordMB
+  {
+    __forceinline BVHNodeRecordMB() {} // leaves ref and lbounds uninitialized
+    __forceinline BVHNodeRecordMB(NodeRef ref, const LBBox3fa& lbounds) : ref(ref), lbounds(lbounds) {}
+
+    NodeRef ref;
+    LBBox3fa lbounds;
+  };
+
+  template<typename NodeRef> // BVH node reference paired with linear bounds (LBBox3fa) and a time range dt (BBox1f)
+  struct BVHNodeRecordMB4D
+  {
+    __forceinline BVHNodeRecordMB4D() {} // leaves all members uninitialized
+    __forceinline BVHNodeRecordMB4D(NodeRef ref, const LBBox3fa& lbounds, const BBox1f& dt) : ref(ref), lbounds(lbounds), dt(dt) {}
+
+    NodeRef ref;
+    LBBox3fa lbounds;
+    BBox1f dt;
+  };
+
+  template<typename NodeRef, int N> struct BaseNode_t;
+  template<typename NodeRef, int N> struct AABBNode_t;
+  template<typename NodeRef, int N> struct AABBNodeMB_t;
+  template<typename NodeRef, int N> struct AABBNodeMB4D_t;
+  template<typename NodeRef, int N> struct OBBNode_t;
+  template<typename NodeRef, int N> struct OBBNodeMB_t;
+  template<typename NodeRef, int N> struct QuantizedNode_t;
+  template<typename NodeRef, int N> struct QuantizedNodeMB_t;
+  
+  /*! Pointer that points to a node or a list of primitives; the low 4 bits (align_mask) carry the node type or leaf item count, the highest bit an optional barrier flag. */
+  template<int N>
+    struct NodeRefPtr
+  {
+    //template<int NN> friend class BVHN;
+
+    /*! Number of bytes the nodes and primitives are minimally aligned to.*/
+    static const size_t byteAlignment = 16;
+    static const size_t byteNodeAlignment = 4*N;
+
+    /*! highest address bit is used as barrier for some algorithms */
+    static const size_t barrier_mask = (1LL << (8*sizeof(size_t)-1));
+
+    /*! Masks the bits that store the number of items per leaf. */
+    static const size_t align_mask = byteAlignment-1;
+    static const size_t items_mask = byteAlignment-1;
+
+    /*! different supported node types (stored in the bits selected by align_mask; tyLeaf is the single bit tested by isLeaf) */
+    static const size_t tyAABBNode = 0;
+    static const size_t tyAABBNodeMB = 1;
+    static const size_t tyAABBNodeMB4D = 6;
+    static const size_t tyOBBNode = 2;
+    static const size_t tyOBBNodeMB = 3;
+    static const size_t tyQuantizedNode = 5;
+    static const size_t tyLeaf = 8;
+
+    /*! Empty node: a leaf tag with no address bits and zero items */
+    static const size_t emptyNode = tyLeaf;
+
+    /*! Invalid node, used as marker in traversal */
+    static const size_t invalidNode = (((size_t)-1) & (~items_mask)) | (tyLeaf+0);
+    static const size_t popRay      = (((size_t)-1) & (~items_mask)) | (tyLeaf+1);
+
+    /*! Maximum number of primitive blocks in a leaf (items_mask - tyLeaf = 7). */
+    static const size_t maxLeafBlocks = items_mask-tyLeaf;
+        
+    /*! Default constructor (leaves ptr uninitialized) */
+    __forceinline NodeRefPtr () {}
+    
+    /*! Construction from integer (implicit, not explicit) */
+    __forceinline NodeRefPtr (size_t ptr) : ptr(ptr) {}
+    
+    /*! Cast to size_t: yields the raw encoded value including type bits and barrier bit */
+    __forceinline operator size_t() const { return ptr; }
+    
+    /*! Sets the barrier bit (only on x86_64/aarch64 builds; other targets hit assert(false)). */
+    __forceinline void setBarrier() {
+#if defined(__X86_64__) || defined(__aarch64__)
+      assert(!isBarrier());
+      ptr |= barrier_mask;
+#else
+      assert(false);
+#endif
+    }
+    
+    /*! Clears the barrier bit (only on x86_64/aarch64 builds; other targets hit assert(false)). */
+    __forceinline void clearBarrier() {
+#if defined(__X86_64__) || defined(__aarch64__)
+      ptr &= ~barrier_mask;
+#else
+      assert(false);
+#endif
+    }
+    
+    /*! Checks if this is an barrier. A barrier tells the top level tree rotations how deep to enter the tree. */
+    __forceinline bool isBarrier() const { return (ptr & barrier_mask) != 0; }
+    
+    /*! checks if this is a leaf; returns the masked tyLeaf bit (0 or 8), not a bool */
+    __forceinline size_t isLeaf() const { return ptr & tyLeaf; }
+    
+    /*! returns node type: the low 4 bits, which for leaves also include the item count */
+    __forceinline int type() const { return ptr & (size_t)align_mask; }
+    
+    /*! checks if this is a node */
+    __forceinline int isAABBNode() const { return (ptr & (size_t)align_mask) == tyAABBNode; }
+    
+    /*! checks if this is a motion blur node */
+    __forceinline int isAABBNodeMB() const { return (ptr & (size_t)align_mask) == tyAABBNodeMB; }
+    
+    /*! checks if this is a 4D motion blur node */
+    __forceinline int isAABBNodeMB4D() const { return (ptr & (size_t)align_mask) == tyAABBNodeMB4D; }
+    
+    /*! checks if this is a node with unaligned bounding boxes */
+    __forceinline int isOBBNode() const { return (ptr & (size_t)align_mask) == tyOBBNode; }
+    
+    /*! checks if this is a motion blur node with unaligned bounding boxes */
+    __forceinline int isOBBNodeMB() const { return (ptr & (size_t)align_mask) == tyOBBNodeMB; }
+    
+    /*! checks if this is a quantized node */
+    __forceinline int isQuantizedNode() const { return (ptr & (size_t)align_mask) == tyQuantizedNode; }
+
+    /*! Encodes a node (tyAABBNode is 0, so the address is stored unchanged) */
+    static __forceinline NodeRefPtr encodeNode(AABBNode_t<NodeRefPtr,N>* node) {
+      assert(!((size_t)node & align_mask));
+      return NodeRefPtr((size_t) node);
+    }
+
+    static __forceinline NodeRefPtr encodeNode(AABBNodeMB_t<NodeRefPtr,N>* node) { // tags the address with tyAABBNodeMB
+      assert(!((size_t)node & align_mask));
+      return NodeRefPtr((size_t) node | tyAABBNodeMB);
+    }
+
+    static __forceinline NodeRefPtr encodeNode(AABBNodeMB4D_t<NodeRefPtr,N>* node) { // tags the address with tyAABBNodeMB4D
+      assert(!((size_t)node & align_mask));
+      return NodeRefPtr((size_t) node | tyAABBNodeMB4D);
+    }
+
+    /*! Encodes an unaligned node (no alignment assert here, unlike the AABB overloads) */
+    static __forceinline NodeRefPtr encodeNode(OBBNode_t<NodeRefPtr,N>* node) {
+      return NodeRefPtr((size_t) node | tyOBBNode);
+    }
+
+    /*! Encodes an unaligned motion blur node (no alignment assert here, unlike the AABB overloads) */
+    static __forceinline NodeRefPtr encodeNode(OBBNodeMB_t<NodeRefPtr,N>* node) {
+      return NodeRefPtr((size_t) node | tyOBBNodeMB);
+    }
+
+    /*! Encodes a leaf: address OR-ed with tyLeaf plus the item count, which is clamped to maxLeafBlocks */
+    static __forceinline NodeRefPtr encodeLeaf(void* tri, size_t num) {
+      assert(!((size_t)tri & align_mask));
+      assert(num <= maxLeafBlocks);
+      return NodeRefPtr((size_t)tri | (tyLeaf+min(num,(size_t)maxLeafBlocks)));
+    }
+
+    /*! Encodes a leaf whose low bits carry a caller-supplied type value ty instead of an item count (ty is not range checked) */
+    static __forceinline NodeRefPtr encodeTypedLeaf(void* ptr, size_t ty) {
+      assert(!((size_t)ptr & align_mask));
+      return NodeRefPtr((size_t)ptr | (tyLeaf+ty));
+    }
+    
+    /*! returns base node pointer (type bits masked off; asserts that this is not a leaf) */
+    __forceinline BaseNode_t<NodeRefPtr,N>* baseNode()
+    {
+      assert(!isLeaf());
+      return (BaseNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask);
+    }
+    __forceinline const BaseNode_t<NodeRefPtr,N>* baseNode() const
+    {
+      assert(!isLeaf());
+      return (const BaseNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask);
+    }
+    
+    /*! returns node pointer (ptr is cast without masking, which relies on the asserted type tag being 0) */
+    __forceinline       AABBNode_t<NodeRefPtr,N>* getAABBNode()       { assert(isAABBNode()); return (      AABBNode_t<NodeRefPtr,N>*)ptr; }
+    __forceinline const AABBNode_t<NodeRefPtr,N>* getAABBNode() const { assert(isAABBNode()); return (const AABBNode_t<NodeRefPtr,N>*)ptr; }
+    
+    /*! returns motion blur node pointer (also accepted for 4D motion blur nodes, which derive from AABBNodeMB_t) */
+    __forceinline       AABBNodeMB_t<NodeRefPtr,N>* getAABBNodeMB()       { assert(isAABBNodeMB() || isAABBNodeMB4D()); return (      AABBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    __forceinline const AABBNodeMB_t<NodeRefPtr,N>* getAABBNodeMB() const { assert(isAABBNodeMB() || isAABBNodeMB4D()); return (const AABBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    
+    /*! returns 4D motion blur node pointer */
+    __forceinline       AABBNodeMB4D_t<NodeRefPtr,N>* getAABBNodeMB4D()       { assert(isAABBNodeMB4D()); return (      AABBNodeMB4D_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    __forceinline const AABBNodeMB4D_t<NodeRefPtr,N>* getAABBNodeMB4D() const { assert(isAABBNodeMB4D()); return (const AABBNodeMB4D_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    
+    /*! returns unaligned node pointer (despite the name, this is the OBBNode_t accessor) */
+    __forceinline       OBBNode_t<NodeRefPtr,N>* ungetAABBNode()       { assert(isOBBNode()); return (      OBBNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    __forceinline const OBBNode_t<NodeRefPtr,N>* ungetAABBNode() const { assert(isOBBNode()); return (const OBBNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    
+    /*! returns unaligned motion blur node pointer (despite the name, this is the OBBNodeMB_t accessor) */
+    __forceinline       OBBNodeMB_t<NodeRefPtr,N>* ungetAABBNodeMB()       { assert(isOBBNodeMB()); return (      OBBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    __forceinline const OBBNodeMB_t<NodeRefPtr,N>* ungetAABBNodeMB() const { assert(isOBBNodeMB()); return (const OBBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    
+    /*! returns quantized node pointer */
+    __forceinline       QuantizedNode_t<NodeRefPtr,N>* quantizedNode()       { assert(isQuantizedNode()); return (      QuantizedNode_t<NodeRefPtr,N>*)(ptr  & ~(size_t)align_mask ); }
+    __forceinline const QuantizedNode_t<NodeRefPtr,N>* quantizedNode() const { assert(isQuantizedNode()); return (const QuantizedNode_t<NodeRefPtr,N>*)(ptr  & ~(size_t)align_mask ); }
+    
+    /*! returns leaf pointer and writes the item count (low bits minus tyLeaf) to num */
+    __forceinline char* leaf(size_t& num) const {
+      assert(isLeaf());
+      num = (ptr & (size_t)items_mask)-tyLeaf;
+      return (char*)(ptr & ~(size_t)align_mask);
+    }
+    
+    /*! clear all bit flags in the low align_mask bits (the barrier bit is not touched) */
+    __forceinline void clearFlags() {
+      ptr &= ~(size_t)align_mask;
+    }
+    
+     /*! returns the wideness, i.e. the template parameter N */
+    __forceinline size_t getN() const { return N; }
+    
+  public:
+    size_t ptr; // encoded value: address bits plus type/item-count bits and optional barrier bit
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp
new file mode 100644
index 0000000000..a273c21e8b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp
@@ -0,0 +1,247 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_refit.h"
+#include "bvh_statistics.h"
+
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    static const size_t SINGLE_THREAD_THRESHOLD = 4*1024; // refit() takes its sequential path while bvh->numPrimitives <= this value
+    
+    template<int N>
+    __forceinline bool compare(const typename BVHN<N>::NodeRef* a, const typename BVHN<N>::NodeRef* b)
+    {
+      const size_t keyA = *(size_t*)&a->node()->lower_x; // leading bytes of a's lower_x, read as an integer key
+      const size_t keyB = *(size_t*)&b->node()->lower_x; // same key for b
+      return keyA < keyB;
+    }
+
+    template<int N>
+    BVHNRefitter<N>::BVHNRefitter (BVH* bvh, const LeafBoundsInterface& leafBounds)
+      : bvh(bvh),               // tree whose bounds refit() recomputes
+        leafBounds(leafBounds), // callback that supplies the bounds of a leaf reference
+        numSubTrees(0) {}       // no subtree roots recorded yet
+
+    template<int N>
+    void BVHNRefitter<N>::refit()
+    {
+      /* small trees: a single sequential bottom-up pass over the whole tree */
+      if (bvh->numPrimitives <= SINGLE_THREAD_THRESHOLD) {
+        bvh->bounds = LBBox3fa(recurse_bottom(bvh->root));
+        return;
+      }
+      /* step 1: record the subtree roots found at the extraction depth */
+      BBox3fa subTreeBounds[MAX_NUM_SUB_TREES];
+      numSubTrees = 0;
+      gather_subtree_refs(bvh->root,numSubTrees,0);
+      /* step 2: refit every recorded subtree via parallel_for, one box per subtree */
+      if (numSubTrees != 0)
+        parallel_for(size_t(0), numSubTrees, size_t(1), [&](const range<size_t>& r) {
+            for (size_t idx=r.begin(); idx<r.end(); idx++)
+              subTreeBounds[idx] = recurse_bottom(subTrees[idx]);
+          });
+
+      /* step 3: refit the nodes above the subtrees; the counter is reset so the boxes are consumed from index 0 */
+      numSubTrees = 0;
+      bvh->bounds = LBBox3fa(refit_toplevel(bvh->root,numSubTrees,subTreeBounds,0));
+    }
+
+    template<int N>
+    void BVHNRefitter<N>::gather_subtree_refs(NodeRef& ref,
+                                              size_t &subtrees,
+                                              const size_t depth)
+    {
+      /* extraction depth reached: record this reference as a subtree root and stop */
+      if (depth >= MAX_SUB_TREE_EXTRACTION_DEPTH)
+      {
+        assert(subtrees < MAX_NUM_SUB_TREES);
+        subTrees[subtrees] = ref;
+        subtrees++;
+        return;
+      }
+      /* only AABB nodes are descended into; other references above that depth are not recorded */
+      if (!ref.isAABBNode()) return;
+      AABBNode* const inner = ref.getAABBNode();
+      for (size_t slot=0; slot<N; slot++) {
+        NodeRef& kid = inner->child(slot);
+        if (unlikely(kid == BVH::emptyNode)) continue;
+        gather_subtree_refs(kid,subtrees,depth+1);
+      }
+    }
+
+    /* Refits the nodes above the extraction depth, taking the boxes of deeper subtrees from subTreeBounds. */
+    template<int N>
+    BBox3fa BVHNRefitter<N>::refit_toplevel(NodeRef& ref,
+                                            size_t &subtrees,
+                                            const BBox3fa *const subTreeBounds,
+                                            const size_t depth)
+    {
+      /* extraction depth reached: hand back the next precomputed subtree box */
+      if (depth >= MAX_SUB_TREE_EXTRACTION_DEPTH)
+      {
+        assert(subtrees < MAX_NUM_SUB_TREES);
+        assert(subTrees[subtrees] == ref);
+        const size_t slot = subtrees++;
+        return subTreeBounds[slot];
+      }
+
+      /* references that are not AABB nodes are resolved by the leaf-bounds callback */
+      if (!ref.isAABBNode())
+        return leafBounds.leafBounds(ref);
+
+      /* refit every child slot first; empty slots contribute an empty box */
+      AABBNode* const inner = ref.getAABBNode();
+      BBox3fa childBounds[N];
+      for (size_t c=0; c<N; c++)
+      {
+        NodeRef& kid = inner->child(c);
+        if (unlikely(kid == BVH::emptyNode)) { childBounds[c] = BBox3fa(empty); continue; }
+        childBounds[c] = refit_toplevel(kid,subtrees,subTreeBounds,depth+1);
+      }
+
+      /* AOS to SOA transform */
+      BBox3vf<N> soa = transpose<N>(childBounds);
+
+      /* write the per-child bounds back into the node */
+      inner->lower_x = soa.lower.x;
+      inner->lower_y = soa.lower.y;
+      inner->lower_z = soa.lower.z;
+      inner->upper_x = soa.upper.x;
+      inner->upper_y = soa.upper.y;
+      inner->upper_z = soa.upper.z;
+
+      return merge<N>(childBounds);
+    }
+
+    // =========================================================
+    // =========================================================
+    // =========================================================
+
+    
+    /* Refits one complete subtree bottom-up and returns the merged bounds of its children. */
+    template<int N>
+    BBox3fa BVHNRefitter<N>::recurse_bottom(NodeRef& ref)
+    {
+      /* leaves are resolved by the leaf-bounds callback */
+      if (unlikely(ref.isLeaf()))
+        return leafBounds.leafBounds(ref);
+
+      /* every other reference is treated as an AABB node */
+      AABBNode* const inner = ref.getAABBNode();
+
+      /* enable exclusive prefetch for >= AVX platforms */
+#if defined(__AVX__)
+      BVH::prefetchW(ref);
+#endif
+      BBox3fa childBounds[N];
+
+      for (size_t c=0; c<N; c++)
+      {
+        NodeRef& kid = inner->child(c);
+        if (unlikely(kid == BVH::emptyNode)) { childBounds[c] = BBox3fa(empty); continue; }
+        childBounds[c] = recurse_bottom(kid);
+      }
+
+      /* AOS to SOA transform */
+      BBox3vf<N> soa = transpose<N>(childBounds);
+
+      /* write the per-child bounds back into the node */
+      inner->lower_x = soa.lower.x;
+      inner->lower_y = soa.lower.y;
+      inner->lower_z = soa.lower.z;
+      inner->upper_x = soa.upper.x;
+      inner->upper_y = soa.upper.y;
+      inner->upper_z = soa.upper.z;
+
+      return merge<N>(childBounds);
+    }
+
+    template<int N, typename Mesh, typename Primitive>
+    BVHNRefitT<N,Mesh,Primitive>::BVHNRefitT (BVH* bvh, Builder* builder, Mesh* mesh, size_t mode)
+      : bvh(bvh), builder(builder), refitter(new BVHNRefitter<N>(bvh,*static_cast<typename BVHNRefitter<N>::LeafBoundsInterface*>(this))), mesh(mesh), topologyVersion(0) {} // mode is accepted but not used here
+
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNRefitT<N,Mesh,Primitive>::clear()
+    {
+      if (!builder) return; // no wrapped builder, nothing to forward to
+      builder->clear();
+    }
+    
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNRefitT<N,Mesh,Primitive>::build()
+    {
+      if (!mesh->topologyChanged(topologyVersion)) {
+        refitter->refit(); // topology unchanged: keep the tree and only refit its bounds
+        return;
+      }
+      topologyVersion = mesh->getTopologyVersion(); // remember the version this rebuild corresponds to
+      builder->build();
+    }
+
+    template class BVHNRefitter<4>; // explicit instantiation of the 4-wide refitter
+#if defined(__AVX__)
+    template class BVHNRefitter<8>; // 8-wide refitter, only when compiled with __AVX__
+#endif
+    
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    Builder* BVH4Triangle4MeshBuilderSAH  (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH4Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH4Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+    /* each factory wraps the matching SAH builder (declared above) in a BVHNRefitT */
+    Builder* BVH4Triangle4MeshRefitSAH  (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { Builder* const sah = BVH4Triangle4MeshBuilderSAH (accel,mesh,geomID,mode); return new BVHNRefitT<4,TriangleMesh,Triangle4> (static_cast<BVH4*>(accel),sah,mesh,mode); }
+    Builder* BVH4Triangle4vMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { Builder* const sah = BVH4Triangle4vMeshBuilderSAH(accel,mesh,geomID,mode); return new BVHNRefitT<4,TriangleMesh,Triangle4v>(static_cast<BVH4*>(accel),sah,mesh,mode); }
+    Builder* BVH4Triangle4iMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { Builder* const sah = BVH4Triangle4iMeshBuilderSAH(accel,mesh,geomID,mode); return new BVHNRefitT<4,TriangleMesh,Triangle4i>(static_cast<BVH4*>(accel),sah,mesh,mode); }
+#if  defined(__AVX__)
+    Builder* BVH8Triangle4MeshBuilderSAH  (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH8Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH8Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+    /* 8-wide counterparts of the factories above */
+    Builder* BVH8Triangle4MeshRefitSAH  (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { Builder* const sah = BVH8Triangle4MeshBuilderSAH (accel,mesh,geomID,mode); return new BVHNRefitT<8,TriangleMesh,Triangle4> (static_cast<BVH8*>(accel),sah,mesh,mode); }
+    Builder* BVH8Triangle4vMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { Builder* const sah = BVH8Triangle4vMeshBuilderSAH(accel,mesh,geomID,mode); return new BVHNRefitT<8,TriangleMesh,Triangle4v>(static_cast<BVH8*>(accel),sah,mesh,mode); }
+    Builder* BVH8Triangle4iMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { Builder* const sah = BVH8Triangle4iMeshBuilderSAH(accel,mesh,geomID,mode); return new BVHNRefitT<8,TriangleMesh,Triangle4i>(static_cast<BVH8*>(accel),sah,mesh,mode); }
+#endif // __AVX__
+#endif // EMBREE_GEOMETRY_TRIANGLE
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+    Builder* BVH4Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH4Quad4vMeshRefitSAH (void* accel, QuadMesh* mesh, unsigned int geomID, size_t mode) { Builder* const sah = BVH4Quad4vMeshBuilderSAH(accel,mesh,geomID,mode); return new BVHNRefitT<4,QuadMesh,Quad4v>(static_cast<BVH4*>(accel),sah,mesh,mode); }
+    /* 8-wide counterpart of the factory above */
+#if  defined(__AVX__)
+    Builder* BVH8Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH8Quad4vMeshRefitSAH (void* accel, QuadMesh* mesh, unsigned int geomID, size_t mode) { Builder* const sah = BVH8Quad4vMeshBuilderSAH(accel,mesh,geomID,mode); return new BVHNRefitT<8,QuadMesh,Quad4v>(static_cast<BVH8*>(accel),sah,mesh,mode); }
+#endif // __AVX__
+
+#endif // EMBREE_GEOMETRY_QUAD
+
+#if defined(EMBREE_GEOMETRY_USER)
+    Builder* BVH4VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH4VirtualMeshRefitSAH (void* accel, UserGeometry* mesh, unsigned int geomID, size_t mode) { Builder* const sah = BVH4VirtualMeshBuilderSAH(accel,mesh,geomID,mode); return new BVHNRefitT<4,UserGeometry,Object>(static_cast<BVH4*>(accel),sah,mesh,mode); }
+    /* 8-wide counterpart of the factory above */
+#if  defined(__AVX__)
+    Builder* BVH8VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH8VirtualMeshRefitSAH (void* accel, UserGeometry* mesh, unsigned int geomID, size_t mode) { Builder* const sah = BVH8VirtualMeshBuilderSAH(accel,mesh,geomID,mode); return new BVHNRefitT<8,UserGeometry,Object>(static_cast<BVH8*>(accel),sah,mesh,mode); }
+#endif // __AVX__
+#endif // EMBREE_GEOMETRY_USER
+
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode);
+    Builder* BVH4InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { Builder* const sah = BVH4InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode); return new BVHNRefitT<4,Instance,InstancePrimitive>(static_cast<BVH4*>(accel),sah,mesh,mode); }
+    /* 8-wide counterpart of the factory above */
+#if  defined(__AVX__)
+    Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode);
+    Builder* BVH8InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { Builder* const sah = BVH8InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode); return new BVHNRefitT<8,Instance,InstancePrimitive>(static_cast<BVH8*>(accel),sah,mesh,mode); }
+#endif // __AVX__
+#endif // EMBREE_GEOMETRY_INSTANCE
+
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h
new file mode 100644
index 0000000000..4aa9bdd7cc
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h
@@ -0,0 +1,95 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../bvh/bvh.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N>
+    class BVHNRefitter
+    {
+    public:
+
+      /*! Shorthand for the BVH type being refitted and the node types used here */
+      using BVH = BVHN<N>;
+      using AABBNode = typename BVH::AABBNode;
+      using NodeRef = typename BVH::NodeRef;
+
+      /*! Callback through which the refitter obtains the bounds of a leaf reference */
+      struct LeafBoundsInterface {
+        virtual const BBox3fa leafBounds(NodeRef& ref) const = 0;
+      };
+
+    public:
+
+      /*! Stores the BVH pointer and the leaf-bounds callback; no other work is done */
+      BVHNRefitter (BVH* bvh, const LeafBoundsInterface& leafBounds);
+
+      /*! Recomputes the node bounds of the BVH and updates bvh->bounds */
+      void refit();
+
+    private:
+      /* records the references found at MAX_SUB_TREE_EXTRACTION_DEPTH into subTrees;
+         subtrees counts the entries written so far */
+      void gather_subtree_refs(NodeRef& ref, size_t &subtrees, const size_t depth = 0);
+
+      /* refits the nodes above the extraction depth; boxes of deeper subtrees are
+         read from subTreeBounds in the order the subtrees were recorded */
+      BBox3fa refit_toplevel(NodeRef& ref, size_t &subtrees, const BBox3fa *const subTreeBounds, const size_t depth = 0);
+
+      /* refits one complete subtree bottom-up and returns its merged bounds */
+      BBox3fa recurse_bottom(NodeRef& ref);
+
+    public:
+      BVH* bvh;                              //!< BVH to refit
+      const LeafBoundsInterface& leafBounds; //!< calculates bounds of leaves
+
+      /* depth at which subtree roots are extracted, and the capacity of subTrees */
+      static const size_t MAX_SUB_TREE_EXTRACTION_DEPTH = (N==4) ? 4   : (N==8) ? 3    : 3;
+      static const size_t MAX_NUM_SUB_TREES             = (N==4) ? 256 : (N==8) ? 512 : N*N*N; // N ^ MAX_SUB_TREE_EXTRACTION_DEPTH
+
+      size_t numSubTrees;                    //!< number of valid entries in subTrees
+      NodeRef subTrees[MAX_NUM_SUB_TREES];   //!< subtree roots recorded by gather_subtree_refs
+    };
+
+    template<int N, typename Mesh, typename Primitive>
+    class BVHNRefitT : public Builder, public BVHNRefitter<N>::LeafBoundsInterface
+    {
+    public:
+
+      /*! Shorthand for the BVH type and its node types */
+      using BVH = BVHN<N>;
+      using AABBNode = typename BVH::AABBNode;
+      using NodeRef = typename BVH::NodeRef;
+
+    public:
+      BVHNRefitT (BVH* bvh, Builder* builder, Mesh* mesh, size_t mode);
+      /*! runs the wrapped builder when the mesh topology changed, otherwise only refits */
+      virtual void build();
+      /*! forwards to the wrapped builder, if there is one */
+      virtual void clear();
+      /*! bounds of one leaf: calls update(mesh) on each primitive block and merges the returned boxes */
+      virtual const BBox3fa leafBounds (NodeRef& ref) const
+      {
+        size_t count; char* const blocks = ref.leaf(count);
+        if (unlikely(ref == BVH::emptyNode)) return empty;
+
+        Primitive* const prims = (Primitive*)blocks;
+        BBox3fa box = empty;
+        for (size_t k=0; k<count; k++) box.extend(prims[k].update(mesh));
+        return box;
+      }
+
+    private:
+      BVH* bvh;
+      std::unique_ptr<Builder> builder;
+      std::unique_ptr<BVHNRefitter<N>> refitter;
+      Mesh* mesh;
+      unsigned int topologyVersion;
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp
new file mode 100644
index 0000000000..2bb431bf0e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp
@@ -0,0 +1,127 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_rotate.h"
+
+namespace embree
+{
+  namespace isa 
+  {
+    /*! Half surface area of a box: sum of the three pairwise products of its x, y and z extents. */
+    __forceinline float halfArea3f(const BBox<vfloat4>& box) {
+      const vfloat4 extent = box.size();
+      const vfloat4 faces = extent*shuffle<1,2,0,3>(extent); // lanes 0..2: x*y, y*z, z*x
+      return faces[0]+faces[1]+faces[2];
+    }
+    /*! Rotates the subtree below parentRef; returns 0 for barrier/leaf references, otherwise 1 + the largest per-child value in cdepth. */
+    size_t BVHNRotate<4>::rotate(NodeRef parentRef, size_t depth)
+    {
+      /*! nothing to rotate if we reached a barrier or a leaf node. */
+      if (parentRef.isBarrier()) return 0;
+      if (parentRef.isLeaf()) return 0;
+      AABBNode* parent = parentRef.getAABBNode();
+      
+      /*! rotate all children first and keep the value each recursion returns */
+      vint4 cdepth;
+      for (size_t c=0; c<4; c++)
+	cdepth[c] = (int)rotate(parent->child(c),depth+1);
+      
+      /* compute current areas of all children (one lane per child) */
+      vfloat4 sizeX = parent->upper_x-parent->lower_x;
+      vfloat4 sizeY = parent->upper_y-parent->lower_y;
+      vfloat4 sizeZ = parent->upper_z-parent->lower_z;
+      vfloat4 childArea = madd(sizeX,(sizeY + sizeZ),sizeY*sizeZ);
+      
+      /*! get node bounds */
+      BBox<vfloat4> child1_0,child1_1,child1_2,child1_3;
+      parent->bounds(child1_0,child1_1,child1_2,child1_3);
+      
+      /*! Find best rotation. We pick a first child (child1) and a sub-child 
+	(child2child) of a different second child (child2), and swap child1 
+	and child2child. We perform the best such swap. */
+      float bestArea = 0; // a candidate is only accepted when its area change is below this, i.e. negative
+      size_t bestChild1 = -1, bestChild2 = -1, bestChild2Child = -1; // size_t(-1) marks "no swap found"
+      for (size_t c2=0; c2<4; c2++)
+      {
+	/*! ignore barrier and leaf nodes as we cannot descend into them */
+	if (parent->child(c2).isBarrier()) continue;
+	if (parent->child(c2).isLeaf()) continue;
+	AABBNode* child2 = parent->child(c2).getAABBNode();
+	
+	/*! transpose child bounds */
+	BBox<vfloat4> child2c0,child2c1,child2c2,child2c3;
+	child2->bounds(child2c0,child2c1,child2c2,child2c3);
+	
+	/*! put child1_0 at each child2 position */
+	float cost00 = halfArea3f(merge(child1_0,child2c1,child2c2,child2c3));
+	float cost01 = halfArea3f(merge(child2c0,child1_0,child2c2,child2c3));
+	float cost02 = halfArea3f(merge(child2c0,child2c1,child1_0,child2c3));
+	float cost03 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_0));
+	vfloat4 cost0 = vfloat4(cost00,cost01,cost02,cost03);
+	vfloat4 min0 = vreduce_min(cost0);
+	int pos0 = (int)bsf(movemask(min0 == cost0)); // first position whose cost equals the minimum
+	
+	/*! put child1_1 at each child2 position */
+	float cost10 = halfArea3f(merge(child1_1,child2c1,child2c2,child2c3));
+	float cost11 = halfArea3f(merge(child2c0,child1_1,child2c2,child2c3));
+	float cost12 = halfArea3f(merge(child2c0,child2c1,child1_1,child2c3));
+	float cost13 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_1));
+	vfloat4 cost1 = vfloat4(cost10,cost11,cost12,cost13);
+	vfloat4 min1 = vreduce_min(cost1);
+	int pos1 = (int)bsf(movemask(min1 == cost1));
+	
+	/*! put child1_2 at each child2 position */
+	float cost20 = halfArea3f(merge(child1_2,child2c1,child2c2,child2c3));
+	float cost21 = halfArea3f(merge(child2c0,child1_2,child2c2,child2c3));
+	float cost22 = halfArea3f(merge(child2c0,child2c1,child1_2,child2c3));
+	float cost23 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_2));
+	vfloat4 cost2 = vfloat4(cost20,cost21,cost22,cost23);
+	vfloat4 min2 = vreduce_min(cost2);
+	int pos2 = (int)bsf(movemask(min2 == cost2));
+	
+	/*! put child1_3 at each child2 position */
+	float cost30 = halfArea3f(merge(child1_3,child2c1,child2c2,child2c3));
+	float cost31 = halfArea3f(merge(child2c0,child1_3,child2c2,child2c3));
+	float cost32 = halfArea3f(merge(child2c0,child2c1,child1_3,child2c3));
+	float cost33 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_3));
+	vfloat4 cost3 = vfloat4(cost30,cost31,cost32,cost33);
+	vfloat4 min3 = vreduce_min(cost3);
+	int pos3 = (int)bsf(movemask(min3 == cost3));
+	
+	/*! find best other child: per child1 candidate, the best new area of child2 minus child2's current area */
+	vfloat4 area0123 = vfloat4(extract<0>(min0),extract<0>(min1),extract<0>(min2),extract<0>(min3)) - vfloat4(childArea[c2]);
+	int pos[4] = { pos0,pos1,pos2,pos3 };
+	const size_t mbd = BVH4::maxBuildDepth;
+	vbool4 valid = vint4(int(depth+1))+cdepth <= vint4(mbd); // only select swaps that fulfill depth constraints
+	valid &= vint4(int(c2)) != vint4(step); // presumably masks out the lane of c2 itself (step = 0,1,2,3) - TODO confirm
+	if (none(valid)) continue;
+	size_t c1 = select_min(valid,area0123);
+	float area = area0123[c1]; 
+        if (c1 == c2) continue; // can happen if bounds are NANs
+	
+	/*! accept a swap when it reduces cost and is not swapping a node with itself */
+	if (area < bestArea) {
+	  bestArea = area;
+	  bestChild1 = c1;
+	  bestChild2 = c2;
+	  bestChild2Child = pos[c1];
+	}
+      }
+      
+      /*! if we did not find a swap that improves the SAH then do nothing */
+      if (bestChild1 == size_t(-1)) return 1+reduce_max(cdepth);
+      
+      /*! perform the best found tree rotation */
+      AABBNode* child2 = parent->child(bestChild2).getAABBNode();
+      AABBNode::swap(parent,bestChild1,child2,bestChild2Child);
+      parent->setBounds(bestChild2,child2->bounds()); // child2's contents changed, so refresh its box in the parent
+      AABBNode::compact(parent);
+      AABBNode::compact(child2);
+      
+      /*! This returned depth is conservative as the child that was
+       *  pulled up in the tree could have been on the critical path. */
+      cdepth[bestChild1]++; // bestChild1 was pushed down one level
+      return 1+reduce_max(cdepth); 
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h
new file mode 100644
index 0000000000..009bef339e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h
@@ -0,0 +1,37 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+
+namespace embree
+{
+  namespace isa 
+  { 
+    /* Generic case: rotations are disabled and both entry points do nothing */
+    template<int N>
+    class BVHNRotate
+    {
+      using NodeRef = typename BVHN<N>::NodeRef;
+    public:
+      static const bool enabled = false;
+
+      static __forceinline size_t rotate(NodeRef parentRef, size_t depth = 1) { return 0; }
+      static __forceinline void restructure(NodeRef ref, size_t depth = 1) {}
+    };
+
+    /* Specialization for N == 4, which enables tree rotations */
+    template<>
+    class BVHNRotate<4>
+    {
+      using AABBNode = BVH4::AABBNode;
+      using NodeRef = BVH4::NodeRef;
+    public:
+      static const bool enabled = true;
+
+      /* declared here only; the definition is not inline in this header */
+      static size_t rotate(NodeRef parentRef, size_t depth = 1);
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp
new file mode 100644
index 0000000000..aa56035026
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp
@@ -0,0 +1,168 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_statistics.h"
+#include "../../common/algorithms/parallel_reduce.h"
+
+namespace embree
+{
+  template<int N>
+  BVHNStatistics<N>::BVHNStatistics (BVH* bvh) : bvh(bvh)
+  {
+    const double rootHalfArea = max(0.0f,bvh->getLinearBounds().expectedHalfArea()); // clamped at zero
+    stat = statistics(bvh->root,rootHalfArea,BBox1f(0.0f,1.0f));                    // whole tree over the time range [0,1]
+  }
+  /*! Formats the gathered statistics as a multi-line report; a node-type row is printed only when that node type occurs. */
+  template<int N>
+  std::string BVHNStatistics<N>::str()
+  {
+    std::ostringstream stream;
+    stream.setf(std::ios::fixed, std::ios::floatfield);
+    stream << "  primitives = " << bvh->numPrimitives << ", vertices = " << bvh->numVertices << ", depth = " << stat.depth << std::endl;
+    size_t totalBytes = stat.bytes(bvh);
+    double totalSAH = stat.sah(bvh);
+    stream << "  total            : sah = "  << std::setw(7) << std::setprecision(3) << totalSAH << " (100.00%), ";
+    stream << "#bytes = " << std::setw(7) << std::setprecision(2) << totalBytes/1E6 << " MB (100.00%), ";
+    stream << "#nodes = " << std::setw(7) << stat.size() << " (" << std::setw(6) << std::setprecision(2) << 100.0*stat.fillRate(bvh) << "% filled), ";
+    stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(totalBytes)/double(bvh->numPrimitives) << std::endl;
+    if (stat.statAABBNodes.numNodes     ) stream << "  AABBNodes        : "  << stat.statAABBNodes.toString(bvh,totalSAH,totalBytes) << std::endl; // each row label names the node type its counter tracks
+    if (stat.statOBBNodes.numNodes      ) stream << "  OBBNodes         : "  << stat.statOBBNodes.toString(bvh,totalSAH,totalBytes) << std::endl;
+    if (stat.statAABBNodesMB.numNodes   ) stream << "  AABBNodesMB      : "  << stat.statAABBNodesMB.toString(bvh,totalSAH,totalBytes) << std::endl;
+    if (stat.statAABBNodesMB4D.numNodes ) stream << "  AABBNodesMB4D    : "  << stat.statAABBNodesMB4D.toString(bvh,totalSAH,totalBytes) << std::endl;
+    if (stat.statOBBNodesMB.numNodes    ) stream << "  OBBNodesMB       : "  << stat.statOBBNodesMB.toString(bvh,totalSAH,totalBytes) << std::endl;
+    if (stat.statQuantizedNodes.numNodes) stream << "  quantizedNodes   : "  << stat.statQuantizedNodes.toString(bvh,totalSAH,totalBytes) << std::endl;
+    stream << "  leaves           : "  << stat.statLeaf.toString(bvh,totalSAH,totalBytes) << std::endl; // leaf rows are always printed
+    stream << "    histogram      : "  << stat.statLeaf.histToString() << std::endl;
+    return stream.str();
+  }
+  /*! Recursively gathers statistics for the subtree below node; A weights this node's SAH term and t0t1 is the time range considered. */
+  template<int N>
+  typename BVHNStatistics<N>::Statistics BVHNStatistics<N>::statistics(NodeRef node, const double A, const BBox1f t0t1)
+  {
+    Statistics s;
+    assert(t0t1.size() > 0.0f);
+    double dt = max(0.0f,t0t1.size()); // length of the time range, clamped at zero
+    if (node.isAABBNode())
+    {
+      AABBNode* n = node.getAABBNode();
+      s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { // combine the statistics of all non-empty children
+          if (n->child(i) == BVH::emptyNode) return Statistics();
+          const double Ai = max(0.0f,halfArea(n->extend(i)));
+          Statistics s = statistics(n->child(i),Ai,t0t1); // local s shadows the outer accumulator
+          s.statAABBNodes.numChildren++; // child i occupies a slot of this node
+          return s;
+        }, Statistics::add);
+      s.statAABBNodes.numNodes++;
+      s.statAABBNodes.nodeSAH += dt*A;
+      s.depth++;
+    }
+    else if (node.isOBBNode())
+    {
+      OBBNode* n = node.ungetAABBNode();
+      s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+          if (n->child(i) == BVH::emptyNode) return Statistics();
+          const double Ai = max(0.0f,halfArea(n->extent(i)));
+          Statistics s = statistics(n->child(i),Ai,t0t1); 
+          s.statOBBNodes.numChildren++;
+          return s;
+        }, Statistics::add);
+      s.statOBBNodes.numNodes++;
+      s.statOBBNodes.nodeSAH += dt*A;
+      s.depth++;
+    }
+    else if (node.isAABBNodeMB())
+    {
+      AABBNodeMB* n = node.getAABBNodeMB();
+      s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+          if (n->child(i) == BVH::emptyNode) return Statistics();
+          const double Ai = max(0.0f,n->expectedHalfArea(i,t0t1));
+          Statistics s = statistics(n->child(i),Ai,t0t1);
+          s.statAABBNodesMB.numChildren++;
+          return s;
+        }, Statistics::add);
+      s.statAABBNodesMB.numNodes++;
+      s.statAABBNodesMB.nodeSAH += dt*A;
+      s.depth++;
+    }
+    else if (node.isAABBNodeMB4D())
+    {
+      AABBNodeMB4D* n = node.getAABBNodeMB4D();
+      s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+          if (n->child(i) == BVH::emptyNode) return Statistics();
+          const BBox1f t0t1i = intersect(t0t1,n->timeRange(i)); // narrow the time range to the child's own time range
+          assert(!t0t1i.empty());
+          const double Ai = n->AABBNodeMB::expectedHalfArea(i,t0t1i); // NOTE(review): unlike the other branches this is not clamped with max(0.0f,...) - confirm intended
+          Statistics s =  statistics(n->child(i),Ai,t0t1i);
+          s.statAABBNodesMB4D.numChildren++;
+          return s;
+        }, Statistics::add);
+      s.statAABBNodesMB4D.numNodes++;
+      s.statAABBNodesMB4D.nodeSAH += dt*A;
+      s.depth++;
+    }
+    else if (node.isOBBNodeMB())
+    {
+      OBBNodeMB* n = node.ungetAABBNodeMB();
+      s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+          if (n->child(i) == BVH::emptyNode) return Statistics();
+          const double Ai = max(0.0f,halfArea(n->extent0(i)));
+          Statistics s = statistics(n->child(i),Ai,t0t1); 
+          s.statOBBNodesMB.numChildren++;
+          return s;
+        }, Statistics::add);
+      s.statOBBNodesMB.numNodes++;
+      s.statOBBNodesMB.nodeSAH += dt*A;
+      s.depth++;
+    }
+    else if (node.isQuantizedNode())
+    {
+      QuantizedNode* n = node.quantizedNode();
+      s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+          if (n->child(i) == BVH::emptyNode) return Statistics();
+          const double Ai = max(0.0f,halfArea(n->extent(i)));
+          Statistics s = statistics(n->child(i),Ai,t0t1); 
+          s.statQuantizedNodes.numChildren++;
+          return s;
+        }, Statistics::add);
+      s.statQuantizedNodes.numNodes++;
+      s.statQuantizedNodes.nodeSAH += dt*A;
+      s.depth++;
+    }
+    else if (node.isLeaf())
+    {
+      size_t num; const char* tri = node.leaf(num);
+      if (num) // leaves with num == 0 contribute nothing
+      {
+        for (size_t i=0; i<num; i++)
+        {
+          const size_t bytes = bvh->primTy->getBytes(tri); // size of the current block, also the stride to the next one
+          s.statLeaf.numPrimsActive += bvh->primTy->sizeActive(tri);
+          s.statLeaf.numPrimsTotal += bvh->primTy->sizeTotal(tri);
+          s.statLeaf.numBytes += bytes;
+          tri+=bytes;
+        }
+        s.statLeaf.numLeaves++;
+        s.statLeaf.numPrimBlocks += num;
+        s.statLeaf.leafSAH += dt*A*num;
+        if (num-1 < Statistics::LeafStat::NHIST) { // histogram bucket num-1; leaves beyond NHIST blocks are not recorded
+          s.statLeaf.numPrimBlocksHistogram[num-1]++;
+        }
+      }
+    }
+    else {
+      // -- GODOT start --
+      // throw std::runtime_error("not supported node type in bvh_statistics");
+      abort(); // the throw above is replaced by abort() inside the GODOT markers
+      // -- GODOT end --
+    }
+    return s;
+  } 
+
+#if defined(__AVX__)
+  template class BVHNStatistics<8>; // 8-wide statistics, only when compiled with __AVX__
+#endif
+
+#if !defined(__AVX__) || (!defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)) || defined(__aarch64__)
+  template class BVHNStatistics<4>; // 4-wide statistics, emitted only in builds that satisfy the guard above
+#endif
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h
new file mode 100644
index 0000000000..73dfc6fbcc
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h
@@ -0,0 +1,285 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include <sstream>
+
+namespace embree
+{
+  template<int N>
+  class BVHNStatistics // collects SAH, memory and fill-rate statistics of a BVHN<N>
+  {
+    typedef BVHN<N> BVH;
+    typedef typename BVH::AABBNode AABBNode;
+    typedef typename BVH::OBBNode OBBNode;
+    typedef typename BVH::AABBNodeMB AABBNodeMB;
+    typedef typename BVH::AABBNodeMB4D AABBNodeMB4D;
+    typedef typename BVH::OBBNodeMB OBBNodeMB;
+    typedef typename BVH::QuantizedNode QuantizedNode;
+
+    typedef typename BVH::NodeRef NodeRef;
+
+    struct Statistics // tree depth plus one counter set per node type and one for the leaves
+    {
+      template<typename Node>
+        struct NodeStat // counters for one inner-node type; Node is only used for sizeof()
+      {
+        NodeStat ( double nodeSAH = 0,
+                   size_t numNodes = 0,
+                   size_t numChildren = 0)
+        : nodeSAH(nodeSAH),
+          numNodes(numNodes),
+          numChildren(numChildren) {}
+        /*! accumulated node SAH divided by the expected half area of the BVH's linear bounds */
+        double sah(BVH* bvh) const {
+          return nodeSAH/bvh->getLinearBounds().expectedHalfArea();
+        }
+        /*! memory occupied by the counted nodes */
+        size_t bytes() const {
+          return numNodes*sizeof(Node);
+        }
+        /*! number of counted nodes */
+        size_t size() const {
+          return numNodes;
+        }
+        /*! fill rate = used child slots / available child slots (N per node); NaN if no node was counted */
+        double fillRateNom () const { return double(numChildren);  }
+        double fillRateDen () const { return double(numNodes*N);  }
+        double fillRate    () const { return fillRateNom()/fillRateDen(); }
+
+        __forceinline friend NodeStat operator+ ( const NodeStat& a, const NodeStat& b)
+        {
+          return NodeStat(a.nodeSAH + b.nodeSAH,
+                          a.numNodes+b.numNodes,
+                          a.numChildren+b.numChildren);
+        }
+        /*! one report line; sahTotal and bytesTotal are the denominators of the printed percentages */
+        std::string toString(BVH* bvh, double sahTotal, size_t bytesTotal) const
+        {
+          std::ostringstream stream;
+          stream.setf(std::ios::fixed, std::ios::floatfield);
+          stream << "sah = " << std::setw(7) << std::setprecision(3) << sah(bvh);
+          stream << " (" << std::setw(6) << std::setprecision(2) << 100.0*sah(bvh)/sahTotal << "%), ";
+          stream << "#bytes = " << std::setw(7) << std::setprecision(2) << bytes()/1E6  << " MB ";
+          stream << "(" << std::setw(6) << std::setprecision(2) << 100.0*double(bytes())/double(bytesTotal) << "%), ";
+          stream << "#nodes = " << std::setw(7) << numNodes << " (" << std::setw(6) << std::setprecision(2) << 100.0*fillRate() << "% filled), ";
+          stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytes())/double(bvh->numPrimitives);
+          return stream.str();
+        }
+
+      public:
+        double nodeSAH;     //!< Sum of the SAH contributions of the counted nodes.
+        size_t numNodes;    //!< Number of counted nodes.
+        size_t numChildren; //!< Number of non-empty children of the counted nodes.
+      };
+
+      struct LeafStat // counters for the leaves
+      {
+        static const int NHIST = 8; //!< Number of buckets of numPrimBlocksHistogram.
+
+        LeafStat ( double leafSAH = 0.0,
+                   size_t numLeaves = 0,
+                   size_t numPrimsActive = 0,
+                   size_t numPrimsTotal = 0,
+                   size_t numPrimBlocks = 0,
+                   size_t numBytes = 0)
+        : leafSAH(leafSAH),
+          numLeaves(numLeaves),
+          numPrimsActive(numPrimsActive),
+          numPrimsTotal(numPrimsTotal),
+          numPrimBlocks(numPrimBlocks),
+          numBytes(numBytes)
+        {
+          for (size_t i=0; i<NHIST; i++)
+            numPrimBlocksHistogram[i] = 0;
+        }
+        /*! accumulated leaf SAH divided by the expected half area of the BVH's linear bounds */
+        double sah(BVH* bvh) const {
+          return leafSAH/bvh->getLinearBounds().expectedHalfArea();
+        }
+        /*! bytes occupied by the leaves; bvh is unused and only kept for a uniform call signature */
+        size_t bytes(BVH* bvh) const {
+          return numBytes;
+        }
+        /*! number of leaves */
+        size_t size() const {
+          return numLeaves;
+        }
+        /*! fill rate = active primitives / total primitive slots; bvh is unused by all three functions */
+        double fillRateNom (BVH* bvh) const { return double(numPrimsActive);  }
+        double fillRateDen (BVH* bvh) const { return double(numPrimsTotal);  }
+        double fillRate    (BVH* bvh) const { return fillRateNom(bvh)/fillRateDen(bvh); }
+
+        __forceinline friend LeafStat operator+ ( const LeafStat& a, const LeafStat& b)
+        {
+          LeafStat stat(a.leafSAH + b.leafSAH,
+                        a.numLeaves+b.numLeaves,
+                        a.numPrimsActive+b.numPrimsActive,
+                        a.numPrimsTotal+b.numPrimsTotal,
+                        a.numPrimBlocks+b.numPrimBlocks,
+                        a.numBytes+b.numBytes);
+          for (size_t i=0; i<NHIST; i++) { // the constructor zeroed the histogram, so add both operands
+            stat.numPrimBlocksHistogram[i] += a.numPrimBlocksHistogram[i];
+            stat.numPrimBlocksHistogram[i] += b.numPrimBlocksHistogram[i];
+          }
+          return stat;
+        }
+        /*! one report line; sahTotal and bytesTotal are the denominators of the printed percentages */
+        std::string toString(BVH* bvh, double sahTotal, size_t bytesTotal) const
+        {
+          std::ostringstream stream;
+          stream.setf(std::ios::fixed, std::ios::floatfield);
+          stream << "sah = " << std::setw(7) << std::setprecision(3) << sah(bvh);
+          stream << " (" << std::setw(6) << std::setprecision(2) << 100.0*sah(bvh)/sahTotal << "%), ";
+          stream << "#bytes = " << std::setw(7) << std::setprecision(2) << double(bytes(bvh))/1E6  << " MB ";
+          stream << "(" << std::setw(6) << std::setprecision(2) << 100.0*double(bytes(bvh))/double(bytesTotal) << "%), ";
+          stream << "#nodes = " << std::setw(7) << numLeaves << " (" << std::setw(6) << std::setprecision(2) << 100.0*fillRate(bvh) << "% filled), ";
+          stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytes(bvh))/double(bvh->numPrimitives);
+          return stream.str();
+        }
+        /*! histogram buckets as percentages of the leaf count */
+        std::string histToString() const
+        {
+          std::ostringstream stream;
+          stream.setf(std::ios::fixed, std::ios::floatfield);
+          for (size_t i=0; i<NHIST; i++) // divide by 1 when there are no leaves so an empty tree prints 0% instead of NaN
+            stream << std::setw(6) << std::setprecision(2) << 100.0f*float(numPrimBlocksHistogram[i])/float(numLeaves ? numLeaves : 1) << "% ";
+          return stream.str();
+        }
+
+      public:
+        double leafSAH;                    //!< SAH of the leaves only
+        size_t numLeaves;                  //!< Number of leaf nodes.
+        size_t numPrimsActive;             //!< Number of active primitives.
+        size_t numPrimsTotal;              //!< Number of active and inactive primitives
+        size_t numPrimBlocks;              //!< Number of primitive blocks.
+        size_t numBytes;                   //!< Number of bytes of leaves.
+        size_t numPrimBlocksHistogram[NHIST]; //!< Bucket i counts the leaves that hold i+1 primitive blocks.
+      };
+
+    public:
+      Statistics (size_t depth = 0,
+                  LeafStat statLeaf = LeafStat(),
+                  NodeStat<AABBNode> statAABBNodes = NodeStat<AABBNode>(),
+                  NodeStat<OBBNode> statOBBNodes = NodeStat<OBBNode>(),
+                  NodeStat<AABBNodeMB> statAABBNodesMB = NodeStat<AABBNodeMB>(),
+                  NodeStat<AABBNodeMB4D> statAABBNodesMB4D = NodeStat<AABBNodeMB4D>(),
+                  NodeStat<OBBNodeMB> statOBBNodesMB = NodeStat<OBBNodeMB>(),
+                  NodeStat<QuantizedNode> statQuantizedNodes = NodeStat<QuantizedNode>())
+
+      : depth(depth),
+        statLeaf(statLeaf),
+        statAABBNodes(statAABBNodes),
+        statOBBNodes(statOBBNodes),
+        statAABBNodesMB(statAABBNodesMB),
+        statAABBNodesMB4D(statAABBNodesMB4D),
+        statOBBNodesMB(statOBBNodesMB),
+        statQuantizedNodes(statQuantizedNodes) {}
+      /*! sum of the normalized SAH of the leaves and of every node type */
+      double sah(BVH* bvh) const
+      {
+        return statLeaf.sah(bvh) +
+          statAABBNodes.sah(bvh) +
+          statOBBNodes.sah(bvh) +
+          statAABBNodesMB.sah(bvh) +
+          statAABBNodesMB4D.sah(bvh) +
+          statOBBNodesMB.sah(bvh) +
+          statQuantizedNodes.sah(bvh);
+      }
+      /*! sum of the bytes of the leaves and of every node type */
+      size_t bytes(BVH* bvh) const {
+        return statLeaf.bytes(bvh) +
+          statAABBNodes.bytes() +
+          statOBBNodes.bytes() +
+          statAABBNodesMB.bytes() +
+          statAABBNodesMB4D.bytes() +
+          statOBBNodesMB.bytes() +
+          statQuantizedNodes.bytes();
+      }
+      /*! total number of leaves and nodes */
+      size_t size() const
+      {
+        return statLeaf.size() +
+          statAABBNodes.size() +
+          statOBBNodes.size() +
+          statAABBNodesMB.size() +
+          statAABBNodesMB4D.size() +
+          statOBBNodesMB.size() +
+          statQuantizedNodes.size();
+      }
+      /*! combined fill rate: summed numerators over summed denominators */
+      double fillRate (BVH* bvh) const
+      {
+        double nom = statLeaf.fillRateNom(bvh) +
+          statAABBNodes.fillRateNom() +
+          statOBBNodes.fillRateNom() +
+          statAABBNodesMB.fillRateNom() +
+          statAABBNodesMB4D.fillRateNom() +
+          statOBBNodesMB.fillRateNom() +
+          statQuantizedNodes.fillRateNom();
+        double den = statLeaf.fillRateDen(bvh) +
+          statAABBNodes.fillRateDen() +
+          statOBBNodes.fillRateDen() +
+          statAABBNodesMB.fillRateDen() +
+          statAABBNodesMB4D.fillRateDen() +
+          statOBBNodesMB.fillRateDen() +
+          statQuantizedNodes.fillRateDen();
+        return nom/den;
+      }
+      /*! merges two statistics: depth by maximum, every counter set by its own operator+ */
+      friend Statistics operator+ ( const Statistics& a, const Statistics& b )
+      {
+        return Statistics(max(a.depth,b.depth),
+                          a.statLeaf + b.statLeaf,
+                          a.statAABBNodes + b.statAABBNodes,
+                          a.statOBBNodes + b.statOBBNodes,
+                          a.statAABBNodesMB + b.statAABBNodesMB,
+                          a.statAABBNodesMB4D + b.statAABBNodesMB4D,
+                          a.statOBBNodesMB + b.statOBBNodesMB,
+                          a.statQuantizedNodes + b.statQuantizedNodes);
+      }
+      /*! named form of operator+, passed as the reduction function to parallel_reduce in bvh_statistics.cpp */
+      static Statistics add ( const Statistics& a, const Statistics& b ) {
+        return a+b;
+      }
+
+    public:
+      size_t depth;                               //!< Tree depth (merged by max in operator+).
+      LeafStat statLeaf;
+      NodeStat<AABBNode> statAABBNodes;
+      NodeStat<OBBNode> statOBBNodes;
+      NodeStat<AABBNodeMB> statAABBNodesMB;
+      NodeStat<AABBNodeMB4D> statAABBNodesMB4D;
+      NodeStat<OBBNodeMB> statOBBNodesMB;
+      NodeStat<QuantizedNode> statQuantizedNodes;
+    };
+
+  public:
+
+    /* Constructor gathers statistics. */
+    BVHNStatistics (BVH* bvh);
+
+    /*! Convert statistics into a string */
+    std::string str();
+
+    double sah() const { // normalized SAH of the whole tree
+      return stat.sah(bvh);
+    }
+
+    size_t bytesUsed() const { // bytes of all counted nodes and leaves
+      return stat.bytes(bvh);
+    }
+
+  private:
+    Statistics statistics(NodeRef node, const double A, const BBox1f dt); // recursive gathering, defined in bvh_statistics.cpp
+
+  private:
+    BVH* bvh;        // tree the statistics refer to
+    Statistics stat; // gathered statistics
+  };
+
+  typedef BVHNStatistics<4> BVH4Statistics;
+  typedef BVHNStatistics<8> BVH8Statistics;
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h
new file mode 100644
index 0000000000..7f17084b81
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h
@@ -0,0 +1,676 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "node_intersector1.h"
+#include "../common/stack_item.h"
+
+#define NEW_SORTING_CODE 1
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! BVH regular node traversal for single rays. */
+    template<int N, int Nx, int types>
+    class BVHNNodeTraverser1Hit;
+
+    /*! Helper functions for fast sorting using AVX512 instructions. */
+#if defined(__AVX512ER__)
+
+    /* KNL code path */
+    __forceinline void isort_update(vfloat16 &dist, vllong8 &ptr, const vfloat16 &d, const vllong8 &p)
+    {
+      /* lanes where the new key d is >= the stored key take the new element; lanes selected by that
+       * mask shifted up by one take the align_shift_right copy of the previous contents instead */
+      const vbool16 take_new     = d >= dist;
+      const vbool16 take_rotated = take_new << 1;
+      const vfloat16 rotated_dist = align_shift_right<15>(dist,dist);
+      const vllong8  rotated_ptr  = align_shift_right<7>(ptr,ptr);
+      dist = select(take_rotated,rotated_dist,select(take_new,d,dist));
+      ptr  = select(vboold8(take_rotated),rotated_ptr,select(vboold8(take_new),p,ptr));
+    }
+
+    __forceinline void isort_quick_update(vfloat16 &dist, vllong8 &ptr, const vfloat16 &d, const vllong8 &p)
+    { /* comparison-free insert: d and p are permuted with an all-zero index (presumably a lane-0 broadcast) and shifted in */
+      const vfloat16 d_permuted = permute(d,vint16(zero));
+      const vllong8  p_permuted = permute(p,vllong8(zero));
+      dist = align_shift_right<15>(dist,d_permuted);
+      ptr  = align_shift_right<7>(ptr,p_permuted);
+    }
+
+    template<int N, int Nx, int types, class NodeRef, class BaseNode>
+    __forceinline void traverseClosestHitAVX512(NodeRef& cur,                        // in: node being traversed; out: child to continue with
+                                                size_t mask,                         // bit mask of the hit children, must be non-zero
+                                                const vfloat<Nx>& tNear,             // per-child distances, used as sort keys
+                                                StackItemT<NodeRef>*& stackPtr,      // advanced past the entries written here
+                                                StackItemT<NodeRef>* stackEnd)       // NOTE(review): not referenced in this function body
+    {
+      assert(mask != 0);
+      const BaseNode* node = cur.baseNode();
+
+      vllong8 children( vllong<N>::loadu((void*)node->children) );
+      children = vllong8::compact((int)mask,children); // presumably packs the lanes selected by mask to the front — confirm in the vllong8 helpers
+      vfloat16 distance = tNear;
+      distance = vfloat16::compact((int)mask,distance,tNear);
+
+      cur = toScalar(children);
+      BVHN<N>::prefetch(cur,types);
+
+      mask &= mask-1; // clear the lowest set bit: one hit accounted for
+      if (likely(mask == 0)) return; // single hit: continue with it, nothing pushed
+
+      /* 2 hits: order A0 B0 */
+      const vllong8 c0(children);
+      const vfloat16 d0(distance);
+      children = align_shift_right<1>(children,children);
+      distance = align_shift_right<1>(distance,distance);
+      const vllong8 c1(children);
+      const vfloat16 d1(distance);
+
+      cur = toScalar(children);
+      BVHN<N>::prefetch(cur,types);
+
+      /* a '<' keeps the order for equal distances, scenes like powerplant largely benefit from it */
+      const vboolf16 m_dist  = d0 < d1;
+      const vfloat16 dist_A0 = select(m_dist, d0, d1);
+      const vfloat16 dist_B0 = select(m_dist, d1, d0);
+      const vllong8 ptr_A0   = select(vboold8(m_dist), c0, c1);
+      const vllong8 ptr_B0   = select(vboold8(m_dist), c1, c0);
+
+      mask &= mask-1;
+      if (likely(mask == 0)) { // exactly two hits: continue with A0, push B0
+        cur = toScalar(ptr_A0);
+        stackPtr[0].ptr            = toScalar(ptr_B0);
+        *(float*)&stackPtr[0].dist = toScalar(dist_B0); // the dist field is written through a float pointer
+        stackPtr++;
+        return;
+      }
+
+      /* 3 hits: order A1 B1 C1 */
+
+      children = align_shift_right<1>(children,children);
+      distance = align_shift_right<1>(distance,distance);
+
+      const vllong8 c2(children);
+      const vfloat16 d2(distance);
+
+      cur = toScalar(children);
+      BVHN<N>::prefetch(cur,types);
+
+      const vboolf16 m_dist1     = dist_A0 <= d2;
+      const vfloat16 dist_tmp_B1 = select(m_dist1, d2, dist_A0);
+      const vllong8  ptr_A1      = select(vboold8(m_dist1), ptr_A0, c2);
+      const vllong8  ptr_tmp_B1  = select(vboold8(m_dist1), c2, ptr_A0);
+
+      const vboolf16 m_dist2     = dist_B0 <= dist_tmp_B1;
+      const vfloat16 dist_B1     = select(m_dist2, dist_B0 , dist_tmp_B1);
+      const vfloat16 dist_C1     = select(m_dist2, dist_tmp_B1, dist_B0);
+      const vllong8  ptr_B1      = select(vboold8(m_dist2), ptr_B0, ptr_tmp_B1);
+      const vllong8  ptr_C1      = select(vboold8(m_dist2), ptr_tmp_B1, ptr_B0);
+
+      mask &= mask-1;
+      if (likely(mask == 0)) { // exactly three hits: continue with A1, push C1 then B1
+        cur = toScalar(ptr_A1);
+        stackPtr[0].ptr  = toScalar(ptr_C1);
+        *(float*)&stackPtr[0].dist = toScalar(dist_C1);
+        stackPtr[1].ptr  = toScalar(ptr_B1);
+        *(float*)&stackPtr[1].dist = toScalar(dist_B1);
+        stackPtr+=2;
+        return;
+      }
+
+      /* 4 hits: order A2 B2 C2 D2 */
+
+      const vfloat16 dist_A1  = select(m_dist1, dist_A0, d2); // only needed from four hits on, hence computed after the 3-hit exit
+
+      children = align_shift_right<1>(children,children);
+      distance = align_shift_right<1>(distance,distance);
+
+      const vllong8 c3(children);
+      const vfloat16 d3(distance);
+
+      cur = toScalar(children);
+      BVHN<N>::prefetch(cur,types);
+
+      const vboolf16 m_dist3     = dist_A1 <= d3;
+      const vfloat16 dist_tmp_B2 = select(m_dist3, d3, dist_A1);
+      const vllong8  ptr_A2      = select(vboold8(m_dist3), ptr_A1, c3);
+      const vllong8  ptr_tmp_B2  = select(vboold8(m_dist3), c3, ptr_A1);
+
+      const vboolf16 m_dist4     = dist_B1 <= dist_tmp_B2;
+      const vfloat16 dist_B2     = select(m_dist4, dist_B1 , dist_tmp_B2);
+      const vfloat16 dist_tmp_C2 = select(m_dist4, dist_tmp_B2, dist_B1);
+      const vllong8  ptr_B2      = select(vboold8(m_dist4), ptr_B1, ptr_tmp_B2);
+      const vllong8  ptr_tmp_C2  = select(vboold8(m_dist4), ptr_tmp_B2, ptr_B1);
+
+      const vboolf16 m_dist5     = dist_C1 <= dist_tmp_C2;
+      const vfloat16 dist_C2     = select(m_dist5, dist_C1 , dist_tmp_C2);
+      const vfloat16 dist_D2     = select(m_dist5, dist_tmp_C2, dist_C1);
+      const vllong8  ptr_C2      = select(vboold8(m_dist5), ptr_C1, ptr_tmp_C2);
+      const vllong8  ptr_D2      = select(vboold8(m_dist5), ptr_tmp_C2, ptr_C1);
+
+      mask &= mask-1;
+      if (likely(mask == 0)) { // exactly four hits: continue with A2, push D2, C2, B2
+        cur = toScalar(ptr_A2);
+        stackPtr[0].ptr  = toScalar(ptr_D2);
+        *(float*)&stackPtr[0].dist = toScalar(dist_D2);
+        stackPtr[1].ptr  = toScalar(ptr_C2);
+        *(float*)&stackPtr[1].dist = toScalar(dist_C2);
+        stackPtr[2].ptr  = toScalar(ptr_B2);
+        *(float*)&stackPtr[2].dist = toScalar(dist_B2);
+        stackPtr+=3;
+        return;
+      }
+
+      /* >=5 hits: reverse to descending order for writing to stack */
+
+      const size_t hits = 4 + popcnt(mask); // four handled above plus the bits still set in mask
+      const vfloat16 dist_A2  = select(m_dist3, dist_A1, d3);
+      vfloat16 dist(neg_inf);
+      vllong8 ptr(zero);
+
+      /* seed the sort vectors with the four hits ordered above, without comparisons */
+      isort_quick_update(dist,ptr,dist_A2,ptr_A2);
+      isort_quick_update(dist,ptr,dist_B2,ptr_B2);
+      isort_quick_update(dist,ptr,dist_C2,ptr_C2);
+      isort_quick_update(dist,ptr,dist_D2,ptr_D2);
+
+      do { // insert every remaining hit with a compare-and-shift step
+
+        children = align_shift_right<1>(children,children);
+        distance = align_shift_right<1>(distance,distance);
+
+        cur = toScalar(children);
+        BVHN<N>::prefetch(cur,types);
+
+        const vfloat16 new_dist(permute(distance,vint16(zero)));
+        const vllong8 new_ptr(permute(children,vllong8(zero)));
+
+        mask &= mask-1;
+        isort_update(dist,ptr,new_dist,new_ptr);
+
+      } while(mask);
+
+      const vboold8 m_stack_ptr(0x55);  // 10101010 (lsb -> msb)
+      const vboolf16 m_stack_dist(0x4444); // 0010001000100010 (lsb -> msb)
+
+      /* extract current noderef */
+      cur = toScalar(permute(ptr,vllong8(hits-1)));
+      /* rearrange pointers to beginning of 16 bytes block */
+      vllong8 stackElementA0; // left uninitialized; passed to expand() as the third argument
+      stackElementA0 = vllong8::expand(m_stack_ptr,ptr,stackElementA0);
+      /* put distances in between */
+      vuint16 stackElementA1((__m512i)stackElementA0);
+      stackElementA1 = vuint16::expand(m_stack_dist,asUInt(dist),stackElementA1);
+      /* write out first 4 x 16 bytes block to stack */
+      vuint16::storeu(stackPtr,stackElementA1);
+      /* get upper half of dist and ptr */
+      dist = align_shift_right<4>(dist,dist);
+      ptr  = align_shift_right<4>(ptr,ptr);
+      /* assemble and write out second block */
+      vllong8 stackElementB0;
+      stackElementB0 = vllong8::expand(m_stack_ptr,ptr,stackElementB0);
+      vuint16 stackElementB1((__m512i)stackElementB0);
+      stackElementB1 = vuint16::expand(m_stack_dist,asUInt(dist),stackElementB1);
+      vuint16::storeu(stackPtr + 4,stackElementB1); // NOTE(review): both stores are unconditional, so 8 stack slots are written regardless of hits
+      /* increase stack pointer */
+      stackPtr += hits-1; // one hit continues in cur, the other hits-1 stay on the stack
+    }
+#endif
+
+#if defined(__AVX512VL__) // SKX
+
+    template<int N>
+    __forceinline void isort_update(vint<N> &dist, const vint<N> &d)
+    { /* one compare-and-shift insertion step on integer sort keys */
+      const vboolf<N> take_new     = d >= dist;
+      const vboolf<N> take_rotated = take_new << 1;
+      const vint<N> rotated = align_shift_right<N-1>(dist,dist);
+      /* new key where d >= stored key; lanes of the shifted mask take the rotated previous contents */
+      dist = select(take_rotated,rotated,select(take_new,d,dist));
+    }
+
+    template<int N> // comparison-free counterpart of isort_update<N>
+    __forceinline void isort_quick_update(vint<N> &dist, const vint<N> &d) {
+      dist = align_shift_right<N-1>(dist,permute(d,vint<N>(zero))); // d permuted with an all-zero index (presumably a lane-0 broadcast) is shifted into dist
+    }
+
+    __forceinline size_t permuteExtract(const vint8& index, const vllong4& n0, const vllong4& n1) { // two-source permute of n0/n1 by index, lane 0 returned as size_t
+      return toScalar(permutex2var((__m256i)index,n0,n1)); // index is reinterpreted as __m256i; presumably only its low bits select the element — confirm against permutex2var
+    }
+
+    __forceinline float permuteExtract(const vint8& index, const vfloat8& n) { // float overload: permute n by index, return lane 0
+      return toScalar(permute(n,index));
+    }
+
+#endif
+
+    /* Specialization for BVH4. */
+    template<int Nx, int types>
+    class BVHNNodeTraverser1Hit<4, Nx, types>
+    {
+      typedef BVH4 BVH;
+      typedef BVH4::NodeRef NodeRef;
+      typedef BVH4::BaseNode BaseNode;
+
+
+    public:
+      /* Traverses a node with at least one hit child. Optimized for finding the closest hit (intersection). */
+      static __forceinline void traverseClosestHit(NodeRef& cur,                      // in: node being traversed; out: child to continue with
+                                                   size_t mask,                       // bit mask of the hit children, must be non-zero
+                                                   const vfloat<Nx>& tNear,           // per-child distances, read as raw 32-bit patterns
+                                                   StackItemT<NodeRef>*& stackPtr,    // advanced past the children pushed here
+                                                   StackItemT<NodeRef>* stackEnd)     // only used in asserts
+      {
+        assert(mask != 0);
+#if defined(__AVX512ER__)
+        traverseClosestHitAVX512<4,Nx,types,NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd);
+#else
+        const BaseNode* node = cur.baseNode();
+
+        /*! one child is hit, continue with that child */
+        size_t r = bscf(mask); // presumably returns the index of the lowest set bit and clears it (mask is re-tested right after) — confirm in the bit-scan helpers
+        cur = node->child(r);
+        BVH::prefetch(cur,types);
+        if (likely(mask == 0)) {
+          assert(cur != BVH::emptyNode);
+          return;
+        }
+
+        /*! two children are hit, push far child, and continue with closer child */
+        NodeRef c0 = cur;
+        const unsigned int d0 = ((unsigned int*)&tNear)[r]; // raw bit pattern of the distance; compared as unsigned int below
+        r = bscf(mask);
+        NodeRef c1 = node->child(r);
+        BVH::prefetch(c1,types);
+        const unsigned int d1 = ((unsigned int*)&tNear)[r];
+        assert(c0 != BVH::emptyNode);
+        assert(c1 != BVH::emptyNode);
+        if (likely(mask == 0)) {
+          assert(stackPtr < stackEnd);
+          if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; }
+          else         { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; }
+        }
+
+#if NEW_SORTING_CODE == 1 // NOTE(review): unlike the #else path, this path has no assert(stackPtr < stackEnd)
+        vint4 s0((size_t)c0,(size_t)d0);
+        vint4 s1((size_t)c1,(size_t)d1);
+        r = bscf(mask);
+        NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r]; 
+        vint4 s2((size_t)c2,(size_t)d2);
+        /* 3 hits */
+        if (likely(mask == 0)) {
+          StackItemT<NodeRef>::sort3(s0,s1,s2);
+          *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; // each packed item is stored as one whole stack entry
+          cur = toSizeT(s2); // after sort3 the last item continues in cur
+          stackPtr+=2;
+          return;
+        }
+        r = bscf(mask);
+        NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r]; 
+        vint4 s3((size_t)c3,(size_t)d3);
+        /* 4 hits (no further mask test in this 4-wide specialization) */
+        StackItemT<NodeRef>::sort4(s0,s1,s2,s3);
+        *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2;
+        cur = toSizeT(s3);
+        stackPtr+=3;
+#else
+        /*! Here starts the slow path for 3 or 4 hit children. We push
+         *  all nodes onto the stack to sort them there. */
+        assert(stackPtr < stackEnd);
+        stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;
+        assert(stackPtr < stackEnd);
+        stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;
+
+        /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */
+        assert(stackPtr < stackEnd);
+        r = bscf(mask);
+        NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+        assert(c != BVH::emptyNode);
+        if (likely(mask == 0)) {
+          sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]);
+          cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+          return;
+        }
+
+        /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */
+        assert(stackPtr < stackEnd);
+        r = bscf(mask);
+        c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+        assert(c != BVH::emptyNode);
+        sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]);
+        cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+#endif // NEW_SORTING_CODE == 1
+#endif // __AVX512ER__
+      }
+
+      /* Traverses a node with at least one hit child. Optimized for finding any hit (occlusion). */
+      static __forceinline void traverseAnyHit(NodeRef& cur,               // in: node being traversed; out: child to continue with
+                                               size_t mask,                // bit mask of the hit children
+                                               const vfloat<Nx>& tNear,    // NOTE(review): not referenced in this function body
+                                               NodeRef*& stackPtr,         // advanced past the children pushed here
+                                               NodeRef* stackEnd)          // only used in asserts
+      {
+        const BaseNode* node = cur.baseNode();
+
+        /*! one child is hit, continue with that child */
+        size_t r = bscf(mask);
+        cur = node->child(r); 
+        BVH::prefetch(cur,types);
+
+        /* simpler in sequence traversal order */
+        assert(cur != BVH::emptyNode);
+        if (likely(mask == 0)) return;
+        assert(stackPtr < stackEnd);
+        *stackPtr = cur; stackPtr++;
+
+        for (; ;) // leaves via the return below once mask is exhausted: every hit but the last is pushed, the last continues in cur
+        {
+          r = bscf(mask);
+          cur = node->child(r); BVH::prefetch(cur,types);
+          assert(cur != BVH::emptyNode);
+          if (likely(mask == 0)) return;
+          assert(stackPtr < stackEnd);
+          *stackPtr = cur; stackPtr++;
+        }
+      }
+    };
+
+    /* Specialization for BVH8. */
+    template<int Nx, int types>
+    class BVHNNodeTraverser1Hit<8, Nx, types>
+    {
+      typedef BVH8 BVH;
+      typedef BVH8::NodeRef NodeRef;
+      typedef BVH8::BaseNode BaseNode;
+      
+#if defined(__AVX512VL__)
+      template<class NodeRef, class BaseNode>
+        static __forceinline void traverseClosestHitAVX512VL8(NodeRef& cur,                     // in: node being traversed; out: child to continue with
+                                                              size_t mask,                      // bit mask of the hit children, must be non-zero
+                                                              const vfloat8& tNear,             // per-child distances
+                                                              StackItemT<NodeRef>*& stackPtr,   // advanced past the children pushed here
+                                                              StackItemT<NodeRef>* stackEnd)    // NOTE(review): not referenced in this function body
+      {
+        assert(mask != 0);
+        const BaseNode* node = cur.baseNode();
+        const vllong4 n0 = vllong4::loadu((vllong4*)&node->children[0]);
+        const vllong4 n1 = vllong4::loadu((vllong4*)&node->children[4]);
+        vint8 distance_i = (asInt(tNear) & 0xfffffff8) | vint8(step); // low 3 bits of each distance replaced by vint8(step), presumably the lane index 0..7, so a key doubles as permute index
+        distance_i = vint8::compact((int)mask,distance_i,distance_i);
+        cur = permuteExtract(distance_i,n0,n1);
+        BVH::prefetch(cur,types);
+
+        mask &= mask-1; // clear the lowest set bit: one hit accounted for
+        if (likely(mask == 0)) return;
+
+        /* 2 hits: order A0 B0 */
+        const vint8 d0(distance_i);
+        const vint8 d1(shuffle<1>(distance_i));
+        cur = permuteExtract(d1,n0,n1);
+        BVH::prefetch(cur,types);
+
+        const vint8 dist_A0 = min(d0, d1);
+        const vint8 dist_B0 = max(d0, d1);
+        assert(dist_A0[0] < dist_B0[0]); // strict: presumably guaranteed by the distinct lane indices in the low bits
+
+        mask &= mask-1;
+        if (likely(mask == 0)) { // exactly two hits: continue with A0, push B0
+          cur                        = permuteExtract(dist_A0,n0,n1);
+          stackPtr[0].ptr            = permuteExtract(dist_B0,n0,n1);
+          *(float*)&stackPtr[0].dist = permuteExtract(dist_B0,tNear); // the pushed distance is fetched from the unmodified tNear
+          stackPtr++;
+          return;
+        }
+
+        /* 3 hits: order A1 B1 C1 */
+
+        const vint8 d2(shuffle<2>(distance_i));
+        cur = permuteExtract(d2,n0,n1);
+        BVH::prefetch(cur,types);
+
+        const vint8 dist_A1     = min(dist_A0,d2);
+        const vint8 dist_tmp_B1 = max(dist_A0,d2);
+        const vint8 dist_B1     = min(dist_B0,dist_tmp_B1);
+        const vint8 dist_C1     = max(dist_B0,dist_tmp_B1);
+        assert(dist_A1[0] < dist_B1[0]);
+        assert(dist_B1[0] < dist_C1[0]);
+
+        mask &= mask-1;
+        if (likely(mask == 0)) { // exactly three hits: continue with A1, push C1 then B1
+          cur                        = permuteExtract(dist_A1,n0,n1);
+          stackPtr[0].ptr            = permuteExtract(dist_C1,n0,n1);
+          *(float*)&stackPtr[0].dist = permuteExtract(dist_C1,tNear);
+          stackPtr[1].ptr            = permuteExtract(dist_B1,n0,n1);
+          *(float*)&stackPtr[1].dist = permuteExtract(dist_B1,tNear);
+          stackPtr+=2;
+          return;
+        }
+
+        /* 4 hits: order A2 B2 C2 D2 */
+
+        const vint8 d3(shuffle<3>(distance_i));
+        cur = permuteExtract(d3,n0,n1);
+        BVH::prefetch(cur,types);
+
+        const vint8 dist_A2     = min(dist_A1,d3);
+        const vint8 dist_tmp_B2 = max(dist_A1,d3);
+        const vint8 dist_B2     = min(dist_B1,dist_tmp_B2);
+        const vint8 dist_tmp_C2 = max(dist_B1,dist_tmp_B2);
+        const vint8 dist_C2     = min(dist_C1,dist_tmp_C2);
+        const vint8 dist_D2     = max(dist_C1,dist_tmp_C2);
+        assert(dist_A2[0] < dist_B2[0]);
+        assert(dist_B2[0] < dist_C2[0]);
+        assert(dist_C2[0] < dist_D2[0]);
+        
+        mask &= mask-1;
+        if (likely(mask == 0)) { // exactly four hits: continue with A2, push D2, C2, B2
+          cur                        = permuteExtract(dist_A2,n0,n1);
+          stackPtr[0].ptr            = permuteExtract(dist_D2,n0,n1);
+          *(float*)&stackPtr[0].dist = permuteExtract(dist_D2,tNear);
+          stackPtr[1].ptr            = permuteExtract(dist_C2,n0,n1);
+          *(float*)&stackPtr[1].dist = permuteExtract(dist_C2,tNear);
+          stackPtr[2].ptr            = permuteExtract(dist_B2,n0,n1);
+          *(float*)&stackPtr[2].dist = permuteExtract(dist_B2,tNear);
+          stackPtr+=3;
+          return;
+        }
+
+        /* >=5 hits: reverse to descending order for writing to stack */
+
+        distance_i = align_shift_right<3>(distance_i,distance_i);
+        const size_t hits = 4 + popcnt(mask); // four handled above plus the bits still set in mask
+        vint8 dist(INT_MIN); // this will work with -0.0f (0x80000000) as distance, isort_update uses >= to insert
+        /* seed the key vector with the four hits ordered above, without comparisons */
+        isort_quick_update(dist,dist_A2);
+        isort_quick_update(dist,dist_B2);
+        isort_quick_update(dist,dist_C2);
+        isort_quick_update(dist,dist_D2);
+
+        do { // insert every remaining hit with a compare-and-shift step
+
+          distance_i = align_shift_right<1>(distance_i,distance_i);
+          cur = permuteExtract(distance_i,n0,n1);
+          BVH::prefetch(cur,types);
+          const vint8 new_dist(permute(distance_i,vint8(zero)));
+          mask &= mask-1;
+          isort_update(dist,new_dist);
+
+        } while(mask);
+
+        for (size_t i=0; i<7; i++) // debug-only check that the keys are in non-increasing order
+          assert(dist[i+0]>=dist[i+1]);
+
+        for (size_t i=0;i<hits-1;i++) // push hits-1 entries in the sorted (descending) key order
+        {
+          stackPtr->ptr            = permuteExtract(dist,n0,n1);
+          *(float*)&stackPtr->dist = permuteExtract(dist,tNear);
+          dist = align_shift_right<1>(dist,dist);
+          stackPtr++;
+        }
+        cur = permuteExtract(dist,n0,n1); // the remaining key selects the child to continue with
+      }
+#endif
+
+    public:
+      static __forceinline void traverseClosestHit(NodeRef& cur,                    // in: inner node whose children were hit; out: child to traverse next
+                                                   size_t mask,                     // bit set of hit children, must be non-zero
+                                                   const vfloat<Nx>& tNear,         // per-child distances; the raw lane bits are compared as unsigned ints
+                                                   StackItemT<NodeRef>*& stackPtr,  // stack top, advanced once for every deferred child
+                                                   StackItemT<NodeRef>* stackEnd)   // stack limit: used by the overflow asserts and forwarded to the AVX-512 helpers
+      {
+        assert(mask != 0);
+#if defined(__AVX512ER__)
+        traverseClosestHitAVX512<8,Nx,types,NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd);
+#elif defined(__AVX512VL__)
+        traverseClosestHitAVX512VL8<NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd);
+#else
+        /* generic path: handles 1, 2, 3, 4 and more than 4 hit children with increasingly expensive code */
+        const BaseNode* node = cur.baseNode();
+
+        /*! one child is hit, continue with that child */
+        size_t r = bscf(mask); // index of the next hit child; mask is tested for exhaustion right after, so bscf presumably also clears that bit
+        cur = node->child(r);
+        BVH::prefetch(cur,types);
+        if (likely(mask == 0)) {
+          assert(cur != BVH::emptyNode);
+          return;
+        }
+
+        /*! two children are hit, push far child, and continue with closer child */
+        NodeRef c0 = cur;
+        const unsigned int d0 = ((unsigned int*)&tNear)[r]; // distance of child r read as its raw 32-bit pattern
+        r = bscf(mask);
+        NodeRef c1 = node->child(r);
+        BVH::prefetch(c1,types);
+        const unsigned int d1 = ((unsigned int*)&tNear)[r];
+
+        assert(c0 != BVH::emptyNode);
+        assert(c1 != BVH::emptyNode);
+        if (likely(mask == 0)) {
+          assert(stackPtr < stackEnd);
+          if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; }
+          else         { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; }
+        }
+#if NEW_SORTING_CODE == 1
+        vint4 s0((size_t)c0,(size_t)d0); // (node, distance) packed into one vector so it can be written as a whole stack item below
+        vint4 s1((size_t)c1,(size_t)d1);
+
+        r = bscf(mask);
+        NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r]; 
+        vint4 s2((size_t)c2,(size_t)d2);
+        /* 3 hits */
+        if (likely(mask == 0)) {
+          StackItemT<NodeRef>::sort3(s0,s1,s2); // s2 is continued with afterwards, so sort3 presumably leaves the closest item in s2 - TODO confirm
+          *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1;
+          cur = toSizeT(s2);
+          stackPtr+=2;
+          return;
+        }
+        r = bscf(mask);
+        NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r]; 
+        vint4 s3((size_t)c3,(size_t)d3);
+        /* 4 hits */
+        if (likely(mask == 0)) {
+          StackItemT<NodeRef>::sort4(s0,s1,s2,s3); // s3 is continued with afterwards; same assumption as for sort3 above
+          *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2;
+          cur = toSizeT(s3);
+          stackPtr+=3;
+          return;
+        }
+        *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; *(vint4*)&stackPtr[3] = s3; // NOTE(review): these four stores are not guarded by an assert(stackPtr < stackEnd)
+        /*! fallback case if more than 4 children are hit */
+        StackItemT<NodeRef>* stackFirst = stackPtr; // first of the items pushed for this node; start of the range sorted below
+        stackPtr+=4;      
+        while (1)
+        {
+          assert(stackPtr < stackEnd);
+          r = bscf(mask);
+          NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = *(unsigned int*)&tNear[r]; 
+          const vint4 s((size_t)c,(size_t)d);
+          *(vint4*)stackPtr++ = s;
+          assert(c != BVH::emptyNode);
+          if (unlikely(mask == 0)) break;
+        }
+        sort(stackFirst,stackPtr);
+        cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; // pop the top item of the sorted range and continue with it
+#else
+        /*! Here starts the slow path for 3 or 4 hit children. We push
+         *  all nodes onto the stack to sort them there. */
+        assert(stackPtr < stackEnd);
+        stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;
+        assert(stackPtr < stackEnd);
+        stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;
+
+        /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */
+        assert(stackPtr < stackEnd);
+        r = bscf(mask);
+        NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+        assert(c != BVH::emptyNode);
+        if (likely(mask == 0)) {
+          sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]);
+          cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; // pop the top item again: it becomes the node to traverse next
+          return;
+        }
+
+        /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */
+        assert(stackPtr < stackEnd);
+        r = bscf(mask);
+        c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+        assert(c != BVH::emptyNode);
+        if (likely(mask == 0)) {
+          sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]);
+          cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+          return;
+        }
+        /*! fallback case if more than 4 children are hit */
+        StackItemT<NodeRef>* stackFirst = stackPtr-4; // the four items pushed above belong to the range that gets sorted
+        while (1)
+        {
+          assert(stackPtr < stackEnd);
+          r = bscf(mask);
+          c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+          assert(c != BVH::emptyNode);
+          if (unlikely(mask == 0)) break;
+        }
+        sort(stackFirst,stackPtr);
+        cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+#endif
+#endif
+      }
+
+      static __forceinline void traverseAnyHit(NodeRef& cur,           // in: inner node whose children were hit; out: child to traverse next
+                                               size_t mask,            // bit set of hit children; assumed non-zero by the caller (not asserted here)
+                                               const vfloat<Nx>& tNear, // not referenced in the body: children are taken in mask order, unsorted
+                                               NodeRef*& stackPtr,     // stack top, advanced for every deferred child
+                                               NodeRef* stackEnd)      // stack limit, only used by the overflow asserts
+      {
+        const BaseNode* node = cur.baseNode();
+
+        /*! one child is hit, continue with that child */
+        size_t r = bscf(mask);
+        cur = node->child(r);
+        BVH::prefetch(cur,types);
+
+        /* simpler in sequence traversal order */
+        assert(cur != BVH::emptyNode);
+        if (likely(mask == 0)) return;
+        assert(stackPtr < stackEnd);
+        *stackPtr = cur; stackPtr++; // more hits follow: defer this child and fetch the next one
+
+        for (; ;)
+        {
+          r = bscf(mask);
+          cur = node->child(r); BVH::prefetch(cur,types);
+          assert(cur != BVH::emptyNode);
+          if (likely(mask == 0)) return; // the last extracted child is the one traversal continues with
+          assert(stackPtr < stackEnd);
+          *stackPtr = cur; stackPtr++;
+        }
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h
new file mode 100644
index 0000000000..9c603babf0
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h
@@ -0,0 +1,154 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../common/ray.h"
+#include "../common/stack_item.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N, int Nx, int types>
+    class BVHNNodeTraverserStreamHitCoherent // picks the next child of an inner node and pushes the other hit children together with their per-child mask
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::BaseNode BaseNode;
+
+    public:
+      template<class T>
+      static __forceinline void traverseClosestHit(NodeRef& cur,                      // in: inner node; out: child to traverse next
+                                                   size_t& m_trav_active,             // out: tMask entry of the child stored in cur
+                                                   const vbool<Nx>& vmask,            // lanes of the children that were hit (must not be empty)
+                                                   const vfloat<Nx>& tNear,           // per-child distances, compared through their raw bit patterns
+                                                   const T* const tMask,              // per-child mask values, indexed by child slot
+                                                   StackItemMaskCoherent*& stackPtr)  // stack top, advanced for every deferred child (no bounds check here)
+      {
+        const NodeRef parent = cur;
+        size_t mask = movemask(vmask);
+        assert(mask != 0);
+        const BaseNode* node = cur.baseNode();
+
+        /*! one child is hit, continue with that child */
+        const size_t r0 = bscf(mask);
+        assert(r0 < 8);
+        cur = node->child(r0);
+        BVHN<N>::prefetch(cur,types);
+        m_trav_active = tMask[r0];
+        assert(cur != BVH::emptyNode);
+        if (unlikely(mask == 0)) return;
+
+        const unsigned int* const tNear_i = (unsigned int*)&tNear; // view the distances as unsigned ints for the comparison below
+
+        /*! two children are hit, push far child, and continue with closer child */
+        NodeRef c0 = cur;
+        unsigned int d0 = tNear_i[r0];
+        const size_t r1 = bscf(mask);
+        assert(r1 < 8);
+        NodeRef c1 = node->child(r1);
+        BVHN<N>::prefetch(c1,types);
+        unsigned int d1 = tNear_i[r1];
+
+        assert(c0 != BVH::emptyNode);
+        assert(c1 != BVH::emptyNode);
+        if (likely(mask == 0)) {
+          if (d0 < d1) {
+            assert(tNear[r1] >= 0.0f);
+            stackPtr->mask    = tMask[r1];
+            stackPtr->parent  = parent;
+            stackPtr->child   = c1;
+            stackPtr++;
+            cur = c0;
+            m_trav_active = tMask[r0]; // same value that was already assigned after the first bscf above
+            return;
+          }
+          else {
+            assert(tNear[r0] >= 0.0f);
+            stackPtr->mask    = tMask[r0];
+            stackPtr->parent  = parent;
+            stackPtr->child   = c0;
+            stackPtr++;
+            cur = c1;
+            m_trav_active = tMask[r1];
+            return;
+          }
+        }
+
+        /*! slow path for more than two hits */
+        size_t hits = movemask(vmask); // start again from the full hit mask; below it only serves as a countdown of remaining children
+        const vint<Nx> dist_i = select(vmask, (asInt(tNear) & 0xfffffff8) | vint<Nx>(step), 0); // low 3 bits of each distance are replaced by vint<Nx>(step), presumably the lane index; non-hit lanes become 0
+  #if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+        const vint<N> tmp = extractN<N,0>(dist_i);
+        const vint<Nx> dist_i_sorted = usort_descending(tmp);
+  #else
+        const vint<Nx> dist_i_sorted = usort_descending(dist_i);
+  #endif
+        const vint<Nx> sorted_index = dist_i_sorted & 7; // recover the child slot stored in the low 3 bits
+
+        size_t i = 0;
+        for (;;)
+        {
+          const unsigned int index = sorted_index[i];
+          assert(index < 8);
+          cur = node->child(index);
+          m_trav_active = tMask[index];
+          assert(m_trav_active);
+          BVHN<N>::prefetch(cur,types);
+          bscf(hits); // result unused: just consumes one bit per processed child
+          if (unlikely(hits==0)) break; // the last child in sorted order is not pushed, it stays in cur
+          i++;
+          assert(cur != BVH::emptyNode);
+          assert(tNear[index] >= 0.0f);
+          stackPtr->mask    = m_trav_active;
+          stackPtr->parent  = parent;
+          stackPtr->child   = cur;
+          stackPtr++;
+        }
+      }
+
+      template<class T>
+      static __forceinline void traverseAnyHit(NodeRef& cur,                      // in: inner node; out: child to traverse next
+                                               size_t& m_trav_active,             // out: tMask entry of the child stored in cur
+                                               const vbool<Nx>& vmask,            // lanes of the children that were hit (must not be empty)
+                                               const T* const tMask,              // per-child mask values, indexed by child slot
+                                               StackItemMaskCoherent*& stackPtr)  // stack top, advanced for every deferred child (no bounds check here)
+      {
+        const NodeRef parent = cur;
+        size_t mask = movemask(vmask);
+        assert(mask != 0);
+        const BaseNode* node = cur.baseNode();
+
+        /*! one child is hit, continue with that child */
+        size_t r = bscf(mask);
+        cur = node->child(r);
+        BVHN<N>::prefetch(cur,types);
+        m_trav_active = tMask[r];
+
+        /* simple in order sequence */
+        assert(cur != BVH::emptyNode);
+        if (likely(mask == 0)) return;
+        stackPtr->mask    = m_trav_active;
+        stackPtr->parent  = parent;
+        stackPtr->child   = cur;
+        stackPtr++;
+
+        for (; ;)
+        {
+          r = bscf(mask);
+          cur = node->child(r);
+          BVHN<N>::prefetch(cur,types);
+          m_trav_active = tMask[r];
+          assert(cur != BVH::emptyNode);
+          if (likely(mask == 0)) return; // the last extracted child is the one traversal continues with
+          stackPtr->mask    = m_trav_active;
+          stackPtr->parent  = parent;
+          stackPtr->child   = cur;
+          stackPtr++;
+        }
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector.h
new file mode 100644
index 0000000000..a978c0c459
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector.h
@@ -0,0 +1,31 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct NearFarPrecalculations // per-axis near/far byte offsets selected from the sign of a direction vector
+    {
+      size_t nearX, nearY, nearZ;
+      size_t farX, farY, farZ;
+
+      __forceinline NearFarPrecalculations() {} // leaves all offsets uninitialized
+
+      __forceinline NearFarPrecalculations(const Vec3fa& dir, size_t N)
+      {
+        const size_t size = sizeof(float)*N; // bytes occupied by N floats
+        nearX = (dir.x < 0.0f) ? 1*size : 0*size; // slots 0/1 belong to x, 2/3 to y, 4/5 to z; a negative direction picks the odd slot
+        nearY = (dir.y < 0.0f) ? 3*size : 2*size;
+        nearZ = (dir.z < 0.0f) ? 5*size : 4*size;
+        farX  = nearX ^ size; // toggles between the two slots of an axis; only equals the other slot when size is a power of two
+        farY  = nearY ^ size;
+        farZ  = nearZ ^ size;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h
new file mode 100644
index 0000000000..aa0d4ba4d7
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h
@@ -0,0 +1,1788 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector.h"
+
+#if defined(__AVX2__)
+#define __FMA_X4__
+#endif
+
+#if defined(__aarch64__)
+#define __FMA_X4__
+#endif
+
+
+namespace embree
+{
+  namespace isa
+  {
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Ray structure used in single-ray traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int Nx, bool robust>
+      struct TravRayBase;
+      
+    /* Base (without tnear and tfar) */
+    template<int N, int Nx>
+      struct TravRayBase<N,Nx,false> // non-robust variant: a single reciprocal direction, plus a precomputed org*rdir term when __FMA_X4__ is defined
+    {
+      __forceinline TravRayBase() {}
+
+      __forceinline TravRayBase(const Vec3fa& ray_org, const Vec3fa& ray_dir) // broadcasts one scalar ray into all lanes
+        : org_xyz(ray_org), dir_xyz(ray_dir) 
+      {
+        const Vec3fa ray_rdir = rcp_safe(ray_dir);
+        org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z); // NOTE(review): built as Vec3vf<N> while the member is Vec3vf<Nx> - identical only when N == Nx
+        dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z);
+        rdir = Vec3vf<N>(ray_rdir.x,ray_rdir.y,ray_rdir.z);
+#if defined(__FMA_X4__)
+        const Vec3fa ray_org_rdir = ray_org*ray_rdir;
+#if !defined(__aarch64__)
+        org_rdir = Vec3vf<N>(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z);
+#else
+          // aarch64 has no instruction equivalent to msub, so the negated org*rdir is stored and madd is used instead;
+          // x86 keeps org*rdir and uses msub
+        neg_org_rdir = Vec3vf<N>(-ray_org_rdir.x,-ray_org_rdir.y,-ray_org_rdir.z);
+#endif
+#endif
+        nearX = ray_rdir.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>); // byte offsets in units of sizeof(vfloat<N>): slots 0/1 for x, 2/3 for y, 4/5 for z
+        nearY = ray_rdir.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>);
+        nearZ = ray_rdir.z >= 0.0f ? 4*sizeof(vfloat<N>) : 5*sizeof(vfloat<N>);
+        farX  = nearX ^ sizeof(vfloat<N>); // far offset is the other slot of the same axis
+        farY  = nearY ^ sizeof(vfloat<N>);
+        farZ  = nearZ ^ sizeof(vfloat<N>);
+
+#if defined(__AVX512ER__) // KNL+
+        /* optimization works only for 8-wide BVHs with 16-wide SIMD */
+        const vint<16> id(step);
+        const vint<16> id2 = align_shift_right<16/2>(id, id);
+        permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2);
+        permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2);
+        permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2);
+#endif
+
+      }
+
+      template<int K>
+      __forceinline TravRayBase(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, // takes lane k of a K-wide ray packet and broadcasts it
+                                const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ,          // reciprocal direction and near offsets are supplied by the caller
+                                size_t flip = sizeof(vfloat<N>))                              // value XORed onto a near offset to obtain the far offset
+      {
+        org  = Vec3vf<Nx>(ray_org.x[k], ray_org.y[k], ray_org.z[k]); // NOTE(review): org_xyz and dir_xyz are not set by this constructor
+        dir  = Vec3vf<Nx>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]);
+        rdir = Vec3vf<Nx>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]);
+#if defined(__FMA_X4__)
+#if !defined(__aarch64__)
+        org_rdir = org*rdir;
+#else
+        neg_org_rdir = -(org*rdir);
+#endif
+#endif
+	nearX = nearXYZ.x[k];
+	nearY = nearXYZ.y[k];
+	nearZ = nearXYZ.z[k];
+        farX  = nearX ^ flip;
+        farY  = nearY ^ flip;
+        farZ  = nearZ ^ flip;
+
+#if defined(__AVX512ER__) // KNL+
+        /* optimization works only for 8-wide BVHs with 16-wide SIMD */
+        const vint<16> id(step);
+        const vint<16> id2 = align_shift_right<16/2>(id, id);
+        permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2);
+        permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2);
+        permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2);
+#endif
+      }
+
+      Vec3fa org_xyz, dir_xyz; // scalar copy of the ray, only filled by the Vec3fa constructor
+      Vec3vf<Nx> org, dir, rdir; // ray origin, direction and reciprocal direction replicated across lanes
+#if defined(__FMA_X4__)
+#if !defined(__aarch64__)
+      Vec3vf<Nx> org_rdir;
+#else
+        // the aarch64 version keeps the negation of org_rdir and uses madd;
+        // x86 uses msub
+      Vec3vf<Nx> neg_org_rdir;
+#endif
+#endif
+#if defined(__AVX512ER__) // KNL+
+      vint16 permX, permY, permZ;
+#endif
+
+      size_t nearX, nearY, nearZ; // near/far offsets per axis, see the constructors
+      size_t farX, farY, farZ;
+    };
+
+    /* Base (without tnear and tfar) */
+    template<int N, int Nx>
+      struct TravRayBase<N,Nx,true> // robust variant: keeps two reciprocal directions, scaled slightly down (near) and up (far)
+    {
+      __forceinline TravRayBase() {}
+
+      __forceinline TravRayBase(const Vec3fa& ray_org, const Vec3fa& ray_dir) // broadcasts one scalar ray into all lanes
+        : org_xyz(ray_org), dir_xyz(ray_dir) 
+      {
+        const float round_down = 1.0f-3.0f*float(ulp); // scale factors 3 ulp below and above 1
+        const float round_up   = 1.0f+3.0f*float(ulp);
+        const Vec3fa ray_rdir = 1.0f/zero_fix(ray_dir); // zero_fix presumably replaces (near-)zero components before the division - TODO confirm
+        const Vec3fa ray_rdir_near = round_down*ray_rdir;
+        const Vec3fa ray_rdir_far  = round_up  *ray_rdir;
+        org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z); // NOTE(review): built as Vec3vf<N> while the member is Vec3vf<Nx> - identical only when N == Nx
+        dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z);
+        rdir_near = Vec3vf<N>(ray_rdir_near.x,ray_rdir_near.y,ray_rdir_near.z);
+        rdir_far  = Vec3vf<N>(ray_rdir_far .x,ray_rdir_far .y,ray_rdir_far .z);
+        nearX = ray_rdir_near.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>); // byte offsets in units of sizeof(vfloat<N>): slots 0/1 for x, 2/3 for y, 4/5 for z
+        nearY = ray_rdir_near.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>);
+        nearZ = ray_rdir_near.z >= 0.0f ? 4*sizeof(vfloat<N>) : 5*sizeof(vfloat<N>);
+        farX  = nearX ^ sizeof(vfloat<N>); // far offset is the other slot of the same axis
+        farY  = nearY ^ sizeof(vfloat<N>);
+        farZ  = nearZ ^ sizeof(vfloat<N>);
+
+#if defined(__AVX512ER__) // KNL+
+        /* optimization works only for 8-wide BVHs with 16-wide SIMD */
+        const vint<16> id(step);
+        const vint<16> id2 = align_shift_right<16/2>(id, id);
+        permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2);
+        permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2);
+        permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2);
+#endif
+      }
+
+      template<int K>
+      __forceinline TravRayBase(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, // takes lane k of a K-wide ray packet and broadcasts it
+                                const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ,          // reciprocal direction and near offsets are supplied by the caller
+                                size_t flip = sizeof(vfloat<N>))                              // value XORed onto a near offset to obtain the far offset
+      {
+        const vfloat<Nx> round_down = 1.0f-3.0f*float(ulp);
+        const vfloat<Nx> round_up   = 1.0f+3.0f*float(ulp);
+        org  = Vec3vf<Nx>(ray_org.x[k], ray_org.y[k], ray_org.z[k]); // NOTE(review): org_xyz and dir_xyz are not set by this constructor
+        dir  = Vec3vf<Nx>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]);
+        rdir_near = round_down*Vec3vf<Nx>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]);
+        rdir_far  = round_up  *Vec3vf<Nx>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]);
+
+	nearX = nearXYZ.x[k];
+	nearY = nearXYZ.y[k];
+	nearZ = nearXYZ.z[k];
+        farX  = nearX ^ flip;
+        farY  = nearY ^ flip;
+        farZ  = nearZ ^ flip;
+
+#if defined(__AVX512ER__) // KNL+
+        /* optimization works only for 8-wide BVHs with 16-wide SIMD */
+        const vint<16> id(step);
+        const vint<16> id2 = align_shift_right<16/2>(id, id);
+        permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2);
+        permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2);
+        permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2);
+#endif
+      }
+
+      Vec3fa org_xyz, dir_xyz; // scalar copy of the ray, only filled by the Vec3fa constructor
+      Vec3vf<Nx> org, dir, rdir_near, rdir_far; // lane-replicated origin, direction and the two scaled reciprocal directions
+#if defined(__AVX512ER__) // KNL+
+      vint16 permX, permY, permZ;
+#endif
+
+      size_t nearX, nearY, nearZ; // near/far offsets per axis, see the constructors
+      size_t farX, farY, farZ;
+    };
+
+    /* Full (with tnear and tfar) */
+    template<int N, int Nx, bool robust>
+      struct TravRay : TravRayBase<N,Nx,robust> // TravRayBase extended by the ray interval [tnear, tfar]
+    {
+      __forceinline TravRay() {} // leaves tnear and tfar unset
+
+      __forceinline TravRay(const Vec3fa& ray_org, const Vec3fa& ray_dir, float ray_tnear, float ray_tfar) // scalar ray: forwards org/dir to the base, stores the interval
+        : TravRayBase<N,Nx,robust>(ray_org, ray_dir),
+          tnear(ray_tnear), tfar(ray_tfar) {}
+
+      template<int K>
+      __forceinline TravRay(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, // lane k of a K-wide packet, forwarded to the matching base constructor
+                            const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ,
+                            float ray_tnear, float ray_tfar,                               // interval is passed as scalars, not taken from the packet
+                            size_t flip = sizeof(vfloat<N>))
+        : TravRayBase<N,Nx,robust>(k, ray_org, ray_dir, ray_rdir, nearXYZ, flip),
+          tnear(ray_tnear), tfar(ray_tfar) {}
+
+      vfloat<Nx> tnear; // ray_tnear stored as a vfloat<Nx>
+      vfloat<Nx> tfar;  // ray_tfar stored as a vfloat<Nx>
+    };
+    
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Point Query structure used in single-ray traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N>
+    struct TravPointQuery // query origin and per-axis radius stored as N-wide vectors
+    {
+      __forceinline TravPointQuery() {} // leaves org and rad unset
+
+      __forceinline TravPointQuery(const Vec3fa& query_org, const Vec3fa& query_rad)
+      {
+        org = Vec3vf<N>(query_org.x, query_org.y, query_org.z);
+        rad = Vec3vf<N>(query_rad.x, query_rad.y, query_rad.z);
+      }
+
+      __forceinline vfloat<N> const& tfar() const { // the x component of the radius serves as the sphere radius
+        return rad.x;
+      }
+
+      Vec3vf<N> org, rad;
+    };
+    
+    //////////////////////////////////////////////////////////////////////////////////////
+    // point query
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N>
+    __forceinline size_t pointQuerySphereDistAndMask( // sphere test against N boxes: writes squared distances to dist, returns the bit mask of boxes within the radius
+      const TravPointQuery<N>& query, vfloat<N>& dist, vfloat<N> const& minX, vfloat<N> const& maxX, 
+      vfloat<N> const& minY, vfloat<N> const& maxY, vfloat<N> const& minZ, vfloat<N> const& maxZ)
+    {
+      const vfloat<N> vX = min(max(query.org.x, minX), maxX) - query.org.x; // query origin clamped to the box, minus the origin: offset to the clamped point
+      const vfloat<N> vY = min(max(query.org.y, minY), maxY) - query.org.y;
+      const vfloat<N> vZ = min(max(query.org.z, minZ), maxZ) - query.org.z;
+      dist = vX * vX + vY * vY + vZ * vZ; // squared length of that offset
+      const vbool<N> vmask = dist <= query.tfar()*query.tfar(); // compare against the squared radius
+      const vbool<N> valid = minX <= maxX; // boxes with minX > maxX are rejected (presumably empty child slots)
+      return movemask(vmask) & movemask(valid);
+    }
+
+    template<int N>
+    __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::AABBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) // sphere test against the child bounds of an AABBNode
+    {
+      const vfloat<N> minX = vfloat<N>::load((float*)((const char*)&node->lower_x)); // load the six bound vectors of the node
+      const vfloat<N> minY = vfloat<N>::load((float*)((const char*)&node->lower_y));
+      const vfloat<N> minZ = vfloat<N>::load((float*)((const char*)&node->lower_z));
+      const vfloat<N> maxX = vfloat<N>::load((float*)((const char*)&node->upper_x));
+      const vfloat<N> maxY = vfloat<N>::load((float*)((const char*)&node->upper_y));
+      const vfloat<N> maxZ = vfloat<N>::load((float*)((const char*)&node->upper_z));
+      return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ);
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::AABBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) // sphere test against AABBNodeMB bounds evaluated at the given time
+    {
+      const vfloat<N>* pMinX = (const vfloat<N>*)((const char*)&node->lower_x); // treat each bound as the start of an array of vfloat<N>
+      const vfloat<N>* pMinY = (const vfloat<N>*)((const char*)&node->lower_y);
+      const vfloat<N>* pMinZ = (const vfloat<N>*)((const char*)&node->lower_z);
+      const vfloat<N>* pMaxX = (const vfloat<N>*)((const char*)&node->upper_x);
+      const vfloat<N>* pMaxY = (const vfloat<N>*)((const char*)&node->upper_y);
+      const vfloat<N>* pMaxZ = (const vfloat<N>*)((const char*)&node->upper_z);
+      const vfloat<N> minX = madd(time,pMinX[6],vfloat<N>(pMinX[0])); // element 6 is presumably the per-time delta of the same bound - TODO confirm against the AABBNodeMB layout
+      const vfloat<N> minY = madd(time,pMinY[6],vfloat<N>(pMinY[0]));
+      const vfloat<N> minZ = madd(time,pMinZ[6],vfloat<N>(pMinZ[0]));
+      const vfloat<N> maxX = madd(time,pMaxX[6],vfloat<N>(pMaxX[0]));
+      const vfloat<N> maxY = madd(time,pMaxY[6],vfloat<N>(pMaxY[0]));
+      const vfloat<N> maxZ = madd(time,pMaxZ[6],vfloat<N>(pMaxZ[0]));
+      return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ);
+    }
+    
+    template<int N>
+      __forceinline size_t pointQueryNodeSphereMB4D(const typename BVHN<N>::NodeRef ref, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) // sphere test for a motion-blur node reference, with an extra time-range filter for 4D nodes
+    {
+      const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+      size_t mask = pointQueryNodeSphere(node, query, time, dist);
+
+      if (unlikely(ref.isAABBNodeMB4D())) {
+        const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node;
+        const vbool<N> vmask = (node1->lower_t <= time) & (time < node1->upper_t); // keep only children whose [lower_t, upper_t) range contains time
+        mask &= movemask(vmask);
+      }
+
+      return mask;
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) // sphere test against the dequantized bounds of a quantized node
+    {
+      const vfloat<N> start_x(node->start.x);
+      const vfloat<N> scale_x(node->scale.x);
+      const vfloat<N> minX = madd(node->template dequantize<N>((0*sizeof(vfloat<N>)) >> 2),scale_x,start_x); // bound is built from dequantize(...), scale and start; the argument is a byte offset divided by 4
+      const vfloat<N> maxX = madd(node->template dequantize<N>((1*sizeof(vfloat<N>)) >> 2),scale_x,start_x);
+      const vfloat<N> start_y(node->start.y);
+      const vfloat<N> scale_y(node->scale.y);
+      const vfloat<N> minY = madd(node->template dequantize<N>((2*sizeof(vfloat<N>)) >> 2),scale_y,start_y);
+      const vfloat<N> maxY = madd(node->template dequantize<N>((3*sizeof(vfloat<N>)) >> 2),scale_y,start_y);
+      const vfloat<N> start_z(node->start.z);
+      const vfloat<N> scale_z(node->scale.z);
+      const vfloat<N> minZ = madd(node->template dequantize<N>((4*sizeof(vfloat<N>)) >> 2),scale_z,start_z);
+      const vfloat<N> maxZ = madd(node->template dequantize<N>((5*sizeof(vfloat<N>)) >> 2),scale_z,start_z);
+      return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & movemask(node->validMask()); // additionally restricted to the node's validMask()
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) // sphere test against quantized motion-blur bounds dequantized at the given time
+    {
+      const vfloat<N> minX = node->dequantizeLowerX(time);
+      const vfloat<N> maxX = node->dequantizeUpperX(time);
+      const vfloat<N> minY = node->dequantizeLowerY(time);
+      const vfloat<N> maxY = node->dequantizeUpperY(time);
+      const vfloat<N> minZ = node->dequantizeLowerZ(time);
+      const vfloat<N> maxZ = node->dequantizeUpperZ(time);     
+      return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & movemask(node->validMask()); // additionally restricted to the node's validMask()
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::OBBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) // placeholder for OBB nodes: node and query are ignored
+    {
+      // TODO: point query - implement
+      const vbool<N> vmask = vbool<N>(true);
+      const size_t mask = movemask(vmask) & ((1<<N)-1); // reports all N children as hit
+      dist = vfloat<N>(0.0f); // every child gets distance 0
+      return mask;
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::OBBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) // placeholder for motion-blur OBB nodes: node, query and time are ignored
+    {
+      // TODO: point query - implement
+      const vbool<N> vmask = vbool<N>(true);
+      const size_t mask = movemask(vmask) & ((1<<N)-1); // reports all N children as hit
+      dist = vfloat<N>(0.0f); // every child gets distance 0
+      return mask;
+    }
+
+    template<int N>
+    __forceinline size_t pointQueryAABBDistAndMask( // box test against N boxes: writes squared distances to dist, returns the bit mask of boxes overlapping [org-rad, org+rad]
+      const TravPointQuery<N>& query, vfloat<N>& dist, vfloat<N> const& minX, vfloat<N> const& maxX, 
+      vfloat<N> const& minY, vfloat<N> const& maxY, vfloat<N> const& minZ, vfloat<N> const& maxZ)
+    {
+      const vfloat<N> vX = min(max(query.org.x, minX), maxX) - query.org.x; // query origin clamped to the box, minus the origin: offset to the clamped point
+      const vfloat<N> vY = min(max(query.org.y, minY), maxY) - query.org.y;
+      const vfloat<N> vZ = min(max(query.org.z, minZ), maxZ) - query.org.z;
+      dist = vX * vX + vY * vY + vZ * vZ; // squared length of that offset; not used for the mask below
+      const vbool<N> valid = minX <= maxX; // boxes with minX > maxX are rejected (presumably empty child slots)
+      const vbool<N> vmask = !((maxX < query.org.x - query.rad.x) | (minX > query.org.x + query.rad.x) | // a box is culled when it lies entirely outside the query box on any axis
+                               (maxY < query.org.y - query.rad.y) | (minY > query.org.y + query.rad.y) |
+                               (maxZ < query.org.z - query.rad.z) | (minZ > query.org.z + query.rad.z));
+      return movemask(vmask) & movemask(valid);
+    }
+
+    template<int N>
+    __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::AABBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) // box test against the child bounds of an AABBNode
+    {
+      const vfloat<N> minX = vfloat<N>::load((float*)((const char*)&node->lower_x)); // load the six bound vectors of the node
+      const vfloat<N> minY = vfloat<N>::load((float*)((const char*)&node->lower_y));
+      const vfloat<N> minZ = vfloat<N>::load((float*)((const char*)&node->lower_z));
+      const vfloat<N> maxX = vfloat<N>::load((float*)((const char*)&node->upper_x));
+      const vfloat<N> maxY = vfloat<N>::load((float*)((const char*)&node->upper_y));
+      const vfloat<N> maxZ = vfloat<N>::load((float*)((const char*)&node->upper_z));
+      return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ);
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::AABBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) // box test against AABBNodeMB bounds evaluated at the given time
+    {
+      const vfloat<N>* pMinX = (const vfloat<N>*)((const char*)&node->lower_x); // treat each bound as the start of an array of vfloat<N>
+      const vfloat<N>* pMinY = (const vfloat<N>*)((const char*)&node->lower_y);
+      const vfloat<N>* pMinZ = (const vfloat<N>*)((const char*)&node->lower_z);
+      const vfloat<N>* pMaxX = (const vfloat<N>*)((const char*)&node->upper_x);
+      const vfloat<N>* pMaxY = (const vfloat<N>*)((const char*)&node->upper_y);
+      const vfloat<N>* pMaxZ = (const vfloat<N>*)((const char*)&node->upper_z);
+      const vfloat<N> minX = madd(time,pMinX[6],vfloat<N>(pMinX[0])); // element 6 is presumably the per-time delta of the same bound - TODO confirm against the AABBNodeMB layout
+      const vfloat<N> minY = madd(time,pMinY[6],vfloat<N>(pMinY[0]));
+      const vfloat<N> minZ = madd(time,pMinZ[6],vfloat<N>(pMinZ[0]));
+      const vfloat<N> maxX = madd(time,pMaxX[6],vfloat<N>(pMaxX[0]));
+      const vfloat<N> maxY = madd(time,pMaxY[6],vfloat<N>(pMaxY[0]));
+      const vfloat<N> maxZ = madd(time,pMaxZ[6],vfloat<N>(pMaxZ[0]));
+      return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ);
+    }
+    
+    template<int N>
+      __forceinline size_t pointQueryNodeAABBMB4D(const typename BVHN<N>::NodeRef ref, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) // box test for a motion-blur node reference, with an extra time-range filter for 4D nodes
+    {
+      const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+      size_t mask = pointQueryNodeAABB(node, query, time, dist);
+
+      if (unlikely(ref.isAABBNodeMB4D())) {
+        const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node;
+        const vbool<N> vmask = (node1->lower_t <= time) & (time < node1->upper_t); // keep only children whose [lower_t, upper_t) range contains time
+        mask &= movemask(vmask);
+      }
+
+      return mask;
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) // box test against the dequantized bounds of a quantized node
+    {
+      const size_t mvalid  = movemask(node->validMask()); // the result is restricted to these children
+      const vfloat<N> start_x(node->start.x);
+      const vfloat<N> scale_x(node->scale.x);
+      const vfloat<N> minX = madd(node->template dequantize<N>((0*sizeof(vfloat<N>)) >> 2),scale_x,start_x); // bound is built from dequantize(...), scale and start; the argument is a byte offset divided by 4
+      const vfloat<N> maxX = madd(node->template dequantize<N>((1*sizeof(vfloat<N>)) >> 2),scale_x,start_x);
+      const vfloat<N> start_y(node->start.y);
+      const vfloat<N> scale_y(node->scale.y);
+      const vfloat<N> minY = madd(node->template dequantize<N>((2*sizeof(vfloat<N>)) >> 2),scale_y,start_y);
+      const vfloat<N> maxY = madd(node->template dequantize<N>((3*sizeof(vfloat<N>)) >> 2),scale_y,start_y);
+      const vfloat<N> start_z(node->start.z);
+      const vfloat<N> scale_z(node->scale.z);
+      const vfloat<N> minZ = madd(node->template dequantize<N>((4*sizeof(vfloat<N>)) >> 2),scale_z,start_z);
+      const vfloat<N> maxZ = madd(node->template dequantize<N>((5*sizeof(vfloat<N>)) >> 2),scale_z,start_z);
+      return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & mvalid;
+    }
+    
+    template<int N> // Point query on a quantized motion-blur node: bounds come from the node's dequantize*(time) accessors.
+    __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+    {
+      const size_t mvalid  = movemask(node->validMask()); // bitmask from the node's validMask(); ANDed into the result below
+      const vfloat<N> minX = node->dequantizeLowerX(time);
+      const vfloat<N> maxX = node->dequantizeUpperX(time);
+      const vfloat<N> minY = node->dequantizeLowerY(time);
+      const vfloat<N> maxY = node->dequantizeUpperY(time);
+      const vfloat<N> minZ = node->dequantizeLowerZ(time);
+      const vfloat<N> maxZ = node->dequantizeUpperZ(time);     
+      return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & mvalid; // `dist` is passed on to the helper
+    }
+    
+    template<int N> // Placeholder point query for OBB nodes: `node` and `query` are not read.
+    __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::OBBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+    {
+      // TODO: point query - implement (until then: every child is reported, with distance 0)
+      const vbool<N> vmask = vbool<N>(true);
+      const size_t mask = movemask(vmask) & ((1<<N)-1); // restrict to the low N bits, one per child
+      dist = vfloat<N>(0.0f);
+      return mask;
+    }
+    
+    template<int N> // Placeholder point query for motion-blur OBB nodes: `node`, `query` and `time` are not read.
+    __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::OBBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+    {
+      // TODO: point query - implement (until then: every child is reported, with distance 0)
+      const vbool<N> vmask = vbool<N>(true);
+      const size_t mask = movemask(vmask) & ((1<<N)-1); // restrict to the low N bits, one per child
+      dist = vfloat<N>(0.0f);
+      return mask;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int Nx, bool robust> // Declaration only; the <4,4>, <8,8>, <4,16> and <8,16> specializations follow.
+      __forceinline size_t intersectNode(const typename BVHN<N>::AABBNode* node, const TravRay<N,Nx,robust>& ray, vfloat<Nx>& dist);
+
+    template<> // 4-wide node vs. single ray: returns a bitmask of children where tNear <= tFar and writes tNear to `dist`.
+      __forceinline size_t intersectNode<4,4>(const typename BVH4::AABBNode* node, const TravRay<4,4,false>& ray, vfloat4& dist)
+    {
+#if defined(__FMA_X4__) // fused path; ray.nearX..farZ are byte offsets added to the address of node->lower_x
+#if defined(__aarch64__)
+      const vfloat4 tNearX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat4 tNearY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat4 tNearZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat4 tFarX  = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat4 tFarY  = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat4 tFarZ  = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z);
+#else // not aarch64: msub with org_rdir instead of madd with neg_org_rdir
+      const vfloat4 tNearX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
+      const vfloat4 tNearY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
+      const vfloat4 tNearZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
+      const vfloat4 tFarX  = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
+      const vfloat4 tFarY  = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
+      const vfloat4 tFarZ  = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
+#endif
+#else // no __FMA_X4__: explicit (bound - org) * rdir
+      const vfloat4 tNearX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x;
+      const vfloat4 tNearY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y;
+      const vfloat4 tNearZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir.z;
+      const vfloat4 tFarX  = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir.x;
+      const vfloat4 tFarY  = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y;
+      const vfloat4 tFarZ  = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z;
+#endif
+
+#if defined(__aarch64__)
+      const vfloat4 tNear = maxi(tNearX, tNearY, tNearZ, ray.tnear);
+      const vfloat4 tFar = mini(tFarX, tFarY, tFarZ, ray.tfar);
+      const vbool4 vmask = asInt(tNear) <= asInt(tFar); // compares via asInt() - presumably the raw float bits as integers; confirm
+      const size_t mask = movemask(vmask);
+#elif defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW
+      const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = asInt(tNear) > asInt(tFar); // vmask marks the misses here...
+      const size_t mask = movemask(vmask) ^ ((1<<4)-1); // ...so flip the low 4 bits to obtain the hit mask
+#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = asInt(tNear) <= asInt(tFar);
+      const size_t mask = movemask(vmask);
+#else // remaining targets: plain float max/min and float compare
+      const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+#endif
+      dist = tNear; // per-child entry value handed back to the caller
+      return mask;
+    }
+
+#if defined(__AVX__)
+
+    template<> // 8-wide node vs. single ray: returns a bitmask of children where tNear <= tFar and writes tNear to `dist`.
+      __forceinline size_t intersectNode<8,8>(const typename BVH8::AABBNode* node, const TravRay<8,8,false>& ray, vfloat8& dist)
+    {
+#if defined(__AVX2__) // fused path; ray.nearX..farZ are byte offsets added to the address of node->lower_x
+#if defined(__aarch64__)
+      const vfloat8 tNearX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat8 tNearY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat8 tNearZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat8 tFarX  = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat8 tFarY  = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat8 tFarZ  = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z);
+#else // not aarch64: msub with org_rdir instead of madd with neg_org_rdir
+      const vfloat8 tNearX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
+      const vfloat8 tNearY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
+      const vfloat8 tNearZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
+      const vfloat8 tFarX  = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
+      const vfloat8 tFarY  = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
+      const vfloat8 tFarZ  = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
+#endif
+        
+#else // no __AVX2__: explicit (bound - org) * rdir
+      const vfloat8 tNearX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x;
+      const vfloat8 tNearY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y;
+      const vfloat8 tNearZ = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir.z;
+      const vfloat8 tFarX  = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir.x;
+      const vfloat8 tFarY  = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y;
+      const vfloat8 tFarZ  = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z;
+#endif
+      
+#if defined(__AVX2__) && !defined(__AVX512F__) // HSW
+      const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = asInt(tNear) > asInt(tFar); // vmask marks the misses here...
+      const size_t mask = movemask(vmask) ^ ((1<<8)-1); // ...so flip the low 8 bits to obtain the hit mask
+#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = asInt(tNear) <= asInt(tFar);
+      const size_t mask = movemask(vmask);
+#else // remaining targets: plain float max/min and float compare
+      const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+#endif
+      dist = tNear; // per-child entry value handed back to the caller
+      return mask;
+    }
+
+#endif
+
+#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+
+    template<> // 4-wide node evaluated in 16-wide registers: each 4-float bound is read as a vfloat4 and converted to vfloat16.
+      __forceinline size_t intersectNode<4,16>(const typename BVH4::AABBNode* node, const TravRay<4,16,false>& ray, vfloat16& dist)
+    {
+      const vfloat16 tNearX = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
+      const vfloat16 tNearY = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
+      const vfloat16 tNearZ = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
+      const vfloat16 tFarX  = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
+      const vfloat16 tFarY  = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
+      const vfloat16 tFarZ  = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);      
+      const vfloat16 tNear  = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat16 tFar   = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool16 vmask   = le(vbool16(0xf),tNear,tFar); // tNear <= tFar under input mask 0xf, i.e. the low 4 lanes
+      const size_t mask     = movemask(vmask);
+      dist = tNear; // per-lane entry value handed back to the caller
+      return mask;
+    }
+
+    template<> // 8-wide node in 16-wide registers: one vector per axis carries both near and far values for the children.
+      __forceinline size_t intersectNode<8,16>(const typename BVH8::AABBNode* node, const TravRay<8,16,false>& ray, vfloat16& dist)
+    {
+      const vllong8 invalid((size_t)BVH8::emptyNode);
+      const vboold8 m_valid(invalid != vllong8::loadu(node->children)); // children whose reference differs from BVH8::emptyNode
+      const vfloat16 bminmaxX  = permute(vfloat16::load((const float*)&node->lower_x), ray.permX); // 16 floats starting at lower_x - presumably lower and upper halves adjacent; node layout not visible here
+      const vfloat16 bminmaxY  = permute(vfloat16::load((const float*)&node->lower_y), ray.permY);
+      const vfloat16 bminmaxZ  = permute(vfloat16::load((const float*)&node->lower_z), ray.permZ);
+      const vfloat16 tNearFarX = msub(bminmaxX, ray.rdir.x, ray.org_rdir.x);
+      const vfloat16 tNearFarY = msub(bminmaxY, ray.rdir.y, ray.org_rdir.y);
+      const vfloat16 tNearFarZ = msub(bminmaxZ, ray.rdir.z, ray.org_rdir.z);
+      const vfloat16 tNear     = max(tNearFarX, tNearFarY, tNearFarZ, ray.tnear);
+      const vfloat16 tFar      = min(tNearFarX, tNearFarY, tNearFarZ, ray.tfar);
+      const vbool16 vmask      = le(vboolf16(m_valid),tNear,align_shift_right<8>(tFar, tFar)); // NOTE(review): the shift by 8 presumably lines the far half up with the near half - confirm
+      const size_t mask        = movemask(vmask);
+      dist = tNear; // per-lane entry value handed back to the caller
+      return mask;
+    }
+    
+#endif
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int Nx> // Robust variant: no fused multiply; near planes use ray.rdir_near, far planes use ray.rdir_far.
+      __forceinline size_t intersectNodeRobust(const typename BVHN<N>::AABBNode* node, const TravRay<N,Nx,true>& ray, vfloat<Nx>& dist)
+    {
+      const vfloat<N> tNearX = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir_near.x; // ray.nearX..farZ are byte offsets from &node->lower_x
+      const vfloat<N> tNearY = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir_near.y;
+      const vfloat<N> tNearZ = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir_near.z;
+      const vfloat<N> tFarX  = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir_far.x;
+      const vfloat<N> tFarY  = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir_far.y;
+      const vfloat<N> tFarZ  = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir_far.z;
+      const vfloat<N> tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat<N> tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool<N> vmask = tNear <= tFar; // a child is reported when its entry value does not exceed its exit value
+      const size_t mask = movemask(vmask);
+      dist = tNear; // per-child entry value handed back to the caller
+      return mask;
+    }
+
+#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+
+    template<> // Robust 4-wide node test evaluated in 16-wide registers.
+      __forceinline size_t intersectNodeRobust<4,16>(const typename BVHN<4>::AABBNode* node, const TravRay<4,16,true>& ray, vfloat<16>& dist)
+    {      
+      const vfloat16 tNearX = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir_near.x; // bounds are read as vfloat<4> and converted to vfloat16
+      const vfloat16 tNearY = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir_near.y;
+      const vfloat16 tNearZ = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir_near.z;
+      const vfloat16 tFarX  = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir_far.x;
+      const vfloat16 tFarY  = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir_far.y;
+      const vfloat16 tFarZ  = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir_far.z;
+      const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat16 tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool16 vmask = le((1 << 4)-1,tNear,tFar); // tNear <= tFar under input mask 0xf, i.e. the low 4 lanes
+      const size_t mask = movemask(vmask);
+      dist = tNear; // per-lane entry value handed back to the caller
+      return mask;
+    }
+
+    template<> // Robust 8-wide node test evaluated in 16-wide registers.
+      __forceinline size_t intersectNodeRobust<8,16>(const typename BVHN<8>::AABBNode* node, const TravRay<8,16,true>& ray, vfloat<16>& dist)
+    {      
+      const vfloat16 tNearX = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir_near.x; // bounds are read as vfloat<8> and converted to vfloat16
+      const vfloat16 tNearY = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir_near.y;
+      const vfloat16 tNearZ = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir_near.z;
+      const vfloat16 tFarX  = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir_far.x;
+      const vfloat16 tFarY  = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir_far.y;
+      const vfloat16 tFarZ  = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir_far.z;
+      const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat16 tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool16 vmask = le((1 << 8)-1,tNear,tFar); // tNear <= tFar under input mask 0xff, i.e. the low 8 lanes
+      const size_t mask = movemask(vmask);
+      dist = tNear; // per-lane entry value handed back to the caller
+      return mask;
+    }
+
+#endif
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNodeMB intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N> // Motion-blur node vs. single ray at `time`: returns a bitmask of children where tNear <= tFar, writes tNear to `dist`.
+      __forceinline size_t intersectNode(const typename BVHN<N>::AABBNodeMB* node, const TravRay<N,N,false>& ray, const float time, vfloat<N>& dist)
+    {
+      const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX); // ray.nearX..farZ are byte offsets from &node->lower_x
+      const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY);
+      const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ);
+      const vfloat<N>* pFarX  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
+      const vfloat<N>* pFarY  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
+      const vfloat<N>* pFarZ  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
+#if defined(__FMA_X4__) // bound at `time` is madd(time,p[6],p[0]); p[6] is presumably the matching delta vector - node layout not visible here
+#if defined(__aarch64__)
+      const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<N> tFarX  = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tFarY  = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tFarZ  = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z);
+#else // not aarch64: msub with org_rdir instead of madd with neg_org_rdir
+      const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x);
+      const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y);
+      const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z);
+      const vfloat<N> tFarX  = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x);
+      const vfloat<N> tFarY  = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y);
+      const vfloat<N> tFarZ  = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z);
+#endif
+#else // no __FMA_X4__: explicit (bound - org) * rdir
+      const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x;
+      const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y;
+      const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir.z;
+      const vfloat<N> tFarX  = (madd(time,pFarX [6],vfloat<N>(pFarX [0])) - ray.org.x) * ray.rdir.x;
+      const vfloat<N> tFarY  = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y;
+      const vfloat<N> tFarZ  = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z;
+#endif
+#if defined(__FMA_X4__) && !defined(__AVX512F__) // HSW
+      const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat<N> tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool<N> vmask = asInt(tNear) > asInt(tFar); // vmask marks the misses here...
+      const size_t mask = movemask(vmask) ^ ((1<<N)-1); // ...so flip the low N bits to obtain the hit mask
+#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat<N> tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool<N> vmask = asInt(tNear) <= asInt(tFar);
+      const size_t mask = movemask(vmask);
+#else // remaining targets: plain float max/min and float compare
+      const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ);
+      const vfloat<N> tFar  = min(ray.tfar, tFarX ,tFarY ,tFarZ );
+      const vbool<N> vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+#endif
+      dist = tNear; // per-child entry value handed back to the caller
+      return mask;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNodeMB intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N> // Robust motion-blur node test at `time`: no fused multiply; near planes use rdir_near, far planes use rdir_far.
+      __forceinline size_t intersectNodeRobust(const typename BVHN<N>::AABBNodeMB* node, const TravRay<N,N,true>& ray, const float time, vfloat<N>& dist)
+    {
+      const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX); // ray.nearX..farZ are byte offsets from &node->lower_x
+      const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY);
+      const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ);
+      const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir_near.x; // bound at `time` = madd(time,p[6],p[0]); p[6] presumably the delta - confirm
+      const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir_near.y;
+      const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir_near.z;
+      const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ);
+      const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
+      const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
+      const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
+      const vfloat<N> tFarX = (madd(time,pFarX[6],vfloat<N>(pFarX[0])) - ray.org.x) * ray.rdir_far.x;
+      const vfloat<N> tFarY = (madd(time,pFarY[6],vfloat<N>(pFarY[0])) - ray.org.y) * ray.rdir_far.y;
+      const vfloat<N> tFarZ = (madd(time,pFarZ[6],vfloat<N>(pFarZ[0])) - ray.org.z) * ray.rdir_far.z;
+      const vfloat<N> tFar = min(ray.tfar,tFarX,tFarY,tFarZ);
+      const size_t mask = movemask(tNear <= tFar); // a child is reported when its entry value does not exceed its exit value
+      dist = tNear; // per-child entry value handed back to the caller
+      return mask;
+    }
+    
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNodeMB4D intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N> // Motion-blur node behind `ref` vs. single ray at `time`; 4D nodes additionally filter children by time range.
+      __forceinline size_t intersectNodeMB4D(const typename BVHN<N>::NodeRef ref, const TravRay<N,N,false>& ray, const float time, vfloat<N>& dist)
+    {
+      const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+        
+      const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX); // ray.nearX..farZ are byte offsets from &node->lower_x
+      const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY);
+      const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ);
+      const vfloat<N>* pFarX  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
+      const vfloat<N>* pFarY  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
+      const vfloat<N>* pFarZ  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
+#if defined (__FMA_X4__) // bound at `time` is madd(time,p[6],p[0]); p[6] is presumably the matching delta vector - node layout not visible here
+#if defined(__aarch64__)
+      const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<N> tFarX  = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tFarY  = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tFarZ  = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z);
+#else // not aarch64: msub with org_rdir instead of madd with neg_org_rdir
+      const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x);
+      const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y);
+      const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z);
+      const vfloat<N> tFarX  = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x);
+      const vfloat<N> tFarY  = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y);
+      const vfloat<N> tFarZ  = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z);
+#endif
+#else // no __FMA_X4__: explicit (bound - org) * rdir
+      const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x;
+      const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y;
+      const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir.z;
+      const vfloat<N> tFarX  = (madd(time,pFarX [6],vfloat<N>(pFarX [0])) - ray.org.x) * ray.rdir.x;
+      const vfloat<N> tFarY  = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y;
+      const vfloat<N> tFarZ  = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z;
+#endif
+#if defined(__FMA_X4__) && !defined(__AVX512F__) // maxi/mini reduction on this path; the compare below is a float compare on every path
+      const vfloat<N> tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray.tnear));
+      const vfloat<N> tFar  = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray.tfar ));
+#else
+      const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ);
+      const vfloat<N> tFar  = min(ray.tfar, tFarX ,tFarY ,tFarZ );
+#endif
+      vbool<N> vmask = tNear <= tFar;
+      if (unlikely(ref.isAABBNodeMB4D())) { // only 4D nodes get the extra time-range filter
+        const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; // same pointer, cast to read lower_t/upper_t
+        vmask &= (node1->lower_t <= time) & (time < node1->upper_t); // keep children with lower_t <= time < upper_t
+      }
+      const size_t mask = movemask(vmask);
+      dist = tNear; // per-child entry value handed back to the caller
+      return mask;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNodeMB4D intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N> // Robust motion-blur node test behind `ref` at `time`; 4D nodes additionally filter children by time range.
+      __forceinline size_t intersectNodeMB4DRobust(const typename BVHN<N>::NodeRef ref, const TravRay<N,N,true>& ray, const float time, vfloat<N>& dist)
+    {
+      const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+
+      const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX); // ray.nearX..farZ are byte offsets from &node->lower_x
+      const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY);
+      const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ);
+      const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir_near.x; // bound at `time` = madd(time,p[6],p[0]); p[6] presumably the delta - confirm
+      const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir_near.y;
+      const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir_near.z;
+      const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ);
+      const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
+      const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
+      const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
+      const vfloat<N> tFarX = (madd(time,pFarX[6],vfloat<N>(pFarX[0])) - ray.org.x) * ray.rdir_far.x;
+      const vfloat<N> tFarY = (madd(time,pFarY[6],vfloat<N>(pFarY[0])) - ray.org.y) * ray.rdir_far.y;
+      const vfloat<N> tFarZ = (madd(time,pFarZ[6],vfloat<N>(pFarZ[0])) - ray.org.z) * ray.rdir_far.z;
+      const vfloat<N> tFar = min(ray.tfar,tFarX,tFarY,tFarZ);
+      vbool<N> vmask = tNear <= tFar;
+      if (unlikely(ref.isAABBNodeMB4D())) { // only 4D nodes get the extra time-range filter
+        const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; // same pointer, cast to read lower_t/upper_t
+        vmask &= (node1->lower_t <= time) & (time < node1->upper_t); // keep children with lower_t <= time < upper_t
+      }
+      const size_t mask = movemask(vmask);
+      dist = tNear; // per-child entry value handed back to the caller
+      return mask;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast QuantizedBaseNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int Nx, bool robust> // Declaration only; specializations for quantized nodes follow.
+      __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,Nx,robust>& ray, vfloat<Nx>& dist);
+
+    template<> // Fast 4-wide quantized node test: rebuilds the bounds, intersects, then masks the result by the node's valid mask.
+      __forceinline size_t intersectNode<4,4>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,4,false>& ray, vfloat4& dist)
+    {
+      const size_t mvalid  = movemask(node->validMask()); // bitmask from the node's validMask(); ANDed into the result below
+      const vfloat4 start_x(node->start.x);
+      const vfloat4 scale_x(node->scale.x);
+      const vfloat4 lower_x = madd(node->dequantize<4>(ray.nearX >> 2),scale_x,start_x); // NOTE(review): >> 2 presumably converts a float byte offset into an index for the quantized data - confirm
+      const vfloat4 upper_x = madd(node->dequantize<4>(ray.farX  >> 2),scale_x,start_x);
+      const vfloat4 start_y(node->start.y);
+      const vfloat4 scale_y(node->scale.y);
+      const vfloat4 lower_y = madd(node->dequantize<4>(ray.nearY >> 2),scale_y,start_y);
+      const vfloat4 upper_y = madd(node->dequantize<4>(ray.farY  >> 2),scale_y,start_y);
+      const vfloat4 start_z(node->start.z);
+      const vfloat4 scale_z(node->scale.z);
+      const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z);
+      const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ  >> 2),scale_z,start_z);
+
+#if defined(__FMA_X4__) // fused path; lower_* are the near-side bounds and upper_* the far-side bounds selected above
+#if defined(__aarch64__)
+      const vfloat4 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat4 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat4 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat4 tFarX  = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat4 tFarY  = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat4 tFarZ  = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#else // not aarch64: msub with org_rdir instead of madd with neg_org_rdir
+      const vfloat4 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat4 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat4 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+      const vfloat4 tFarX  = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat4 tFarY  = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat4 tFarZ  = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#endif
+#else // no __FMA_X4__: explicit (bound - org) * rdir
+      const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir.x;
+      const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir.y;
+      const vfloat4 tNearZ = (lower_z - ray.org.z) * ray.rdir.z;
+      const vfloat4 tFarX  = (upper_x - ray.org.x) * ray.rdir.x;
+      const vfloat4 tFarY  = (upper_y - ray.org.y) * ray.rdir.y;
+      const vfloat4 tFarZ  = (upper_z - ray.org.z) * ray.rdir.z;
+#endif
+      
+#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW
+      const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = asInt(tNear) > asInt(tFar); // vmask marks the misses here...
+      const size_t mask = movemask(vmask) ^ ((1<<4)-1); // ...so flip the low 4 bits to obtain the hit mask
+#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = asInt(tNear) <= asInt(tFar);
+      const size_t mask = movemask(vmask);
+#else // remaining targets: plain float max/min and float compare
+      const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+#endif
+      dist = tNear; // per-child entry value handed back to the caller
+      return mask & mvalid; // drop children not flagged in the valid mask
+    }
+
+    template<> // Robust 4-wide quantized node test: no fused multiply; near planes use rdir_near, far planes use rdir_far.
+      __forceinline size_t intersectNode<4,4>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,4,true>& ray, vfloat4& dist)
+    {
+      const size_t mvalid  = movemask(node->validMask()); // bitmask from the node's validMask(); ANDed into the result below
+      const vfloat4 start_x(node->start.x);
+      const vfloat4 scale_x(node->scale.x);
+      const vfloat4 lower_x = madd(node->dequantize<4>(ray.nearX >> 2),scale_x,start_x); // NOTE(review): >> 2 presumably converts a float byte offset into an index for the quantized data - confirm
+      const vfloat4 upper_x = madd(node->dequantize<4>(ray.farX  >> 2),scale_x,start_x);
+      const vfloat4 start_y(node->start.y);
+      const vfloat4 scale_y(node->scale.y);
+      const vfloat4 lower_y = madd(node->dequantize<4>(ray.nearY >> 2),scale_y,start_y);
+      const vfloat4 upper_y = madd(node->dequantize<4>(ray.farY  >> 2),scale_y,start_y);
+      const vfloat4 start_z(node->start.z);
+      const vfloat4 scale_z(node->scale.z);
+      const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z);
+      const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ  >> 2),scale_z,start_z);
+
+      const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x;
+      const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y;
+      const vfloat4 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z;
+      const vfloat4 tFarX  = (upper_x - ray.org.x) * ray.rdir_far.x;
+      const vfloat4 tFarY  = (upper_y - ray.org.y) * ray.rdir_far.y;
+      const vfloat4 tFarZ  = (upper_z - ray.org.z) * ray.rdir_far.z;
+      
+      const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = tNear <= tFar; // a child is reported when its entry value does not exceed its exit value
+      const size_t mask = movemask(vmask);
+      dist = tNear; // per-child entry value handed back to the caller
+      return mask & mvalid; // drop children not flagged in the valid mask
+    }
+
+
+#if defined(__AVX__)
+
+    template<> // 8-wide quantized node vs. a single ray, non-robust variant (TravRay<8,8,false>)
+      __forceinline size_t intersectNode<8,8>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,8,false>& ray, vfloat8& dist)
+    {
+      const size_t mvalid  = movemask(node->validMask()); // bitmask of the node's valid child slots
+      const vfloat8 start_x(node->start.x);
+      const vfloat8 scale_x(node->scale.x);
+      const vfloat8 lower_x = madd(node->dequantize<8>(ray.nearX >> 2),scale_x,start_x); // near-side x planes rescaled with scale/start (madd presumably a*b+c)
+      const vfloat8 upper_x = madd(node->dequantize<8>(ray.farX  >> 2),scale_x,start_x); // far-side x planes
+      const vfloat8 start_y(node->start.y);
+      const vfloat8 scale_y(node->scale.y);
+      const vfloat8 lower_y = madd(node->dequantize<8>(ray.nearY >> 2),scale_y,start_y);
+      const vfloat8 upper_y = madd(node->dequantize<8>(ray.farY  >> 2),scale_y,start_y);
+      const vfloat8 start_z(node->start.z);
+      const vfloat8 scale_z(node->scale.z);
+      const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z);
+      const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ  >> 2),scale_z,start_z);
+      // Per-axis slab distances. NOTE(review): the three variants look like equivalent forms of (plane - org) * rdir, assuming org_rdir == org*rdir and neg_org_rdir == -org*rdir - confirm in TravRay.
+#if defined(__AVX2__)
+#if defined(__aarch64__)
+      const vfloat8 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat8 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat8 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat8 tFarX  = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat8 tFarY  = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat8 tFarZ  = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#else
+      const vfloat8 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat8 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat8 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+      const vfloat8 tFarX  = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat8 tFarY  = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat8 tFarZ  = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#endif
+#else
+      const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir.x;
+      const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir.y;
+      const vfloat8 tNearZ = (lower_z - ray.org.z) * ray.rdir.z;
+      const vfloat8 tFarX  = (upper_x - ray.org.x) * ray.rdir.x;
+      const vfloat8 tFarY  = (upper_y - ray.org.y) * ray.rdir.y;
+      const vfloat8 tFarZ  = (upper_z - ray.org.z) * ray.rdir.z;
+#endif
+      // Clip against [ray.tnear, ray.tfar] and build the hit mask; each ISA branch yields a set bit per hit lane.
+#if defined(__AVX2__) && !defined(__AVX512F__) // HSW
+      const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = asInt(tNear) > asInt(tFar); // miss mask, compared through asInt
+      const size_t mask = movemask(vmask) ^ ((1<<8)-1); // flip the low 8 bits: miss mask -> hit mask
+#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = asInt(tNear) <= asInt(tFar);
+      const size_t mask = movemask(vmask);
+#else
+      const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+#endif
+      dist = tNear; // out: per-child entry distance (also written for lanes that miss)
+      return mask & mvalid; // keep only lanes that are valid children
+    }
+
+    template<> // 8-wide quantized node vs. a single ray, robust variant (TravRay<8,8,true>)
+      __forceinline size_t intersectNode<8,8>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,8,true>& ray, vfloat8& dist)
+    {
+      const size_t mvalid  = movemask(node->validMask()); // bitmask of the node's valid child slots
+      const vfloat8 start_x(node->start.x);
+      const vfloat8 scale_x(node->scale.x);
+      const vfloat8 lower_x = madd(node->dequantize<8>(ray.nearX >> 2),scale_x,start_x); // near-side x planes rescaled with scale/start (madd presumably a*b+c)
+      const vfloat8 upper_x = madd(node->dequantize<8>(ray.farX  >> 2),scale_x,start_x); // far-side x planes
+      const vfloat8 start_y(node->start.y);
+      const vfloat8 scale_y(node->scale.y);
+      const vfloat8 lower_y = madd(node->dequantize<8>(ray.nearY >> 2),scale_y,start_y);
+      const vfloat8 upper_y = madd(node->dequantize<8>(ray.farY  >> 2),scale_y,start_y);
+      const vfloat8 start_z(node->start.z);
+      const vfloat8 scale_z(node->scale.z);
+      const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z);
+      const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ  >> 2),scale_z,start_z);
+      // Per-axis slab distances; the robust ray carries separate reciprocals for the near and far planes.
+      const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x;
+      const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y;
+      const vfloat8 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z;
+      const vfloat8 tFarX  = (upper_x - ray.org.x) * ray.rdir_far.x;
+      const vfloat8 tFarY  = (upper_y - ray.org.y) * ray.rdir_far.y;
+      const vfloat8 tFarZ  = (upper_z - ray.org.z) * ray.rdir_far.z;
+      // Clip the slab interval against [ray.tnear, ray.tfar]; a lane hits when the interval is non-empty.
+      const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+      // Unlike the non-robust overload, no ISA-specific integer-compare path is used here.
+      dist = tNear; // out: per-child entry distance (also written for lanes that miss)
+      return mask & mvalid; // keep only lanes that are valid children
+    }
+
+
+#endif
+
+#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+
+    template<> // 4-wide quantized node evaluated in 16-wide registers, non-robust variant (TravRay<4,16,false>)
+      __forceinline size_t intersectNode<4,16>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,16,false>& ray, vfloat16& dist)
+    {
+      const size_t mvalid  = movemask(node->validMask()); // bitmask of the node's valid child slots
+      const vfloat16 start_x(node->start.x);
+      const vfloat16 scale_x(node->scale.x);
+      const vfloat16 lower_x = madd(vfloat16(node->dequantize<4>(ray.nearX >> 2)),scale_x,start_x); // 4 dequantized values converted to vfloat16, then rescaled with scale/start
+      const vfloat16 upper_x = madd(vfloat16(node->dequantize<4>(ray.farX  >> 2)),scale_x,start_x);
+      const vfloat16 start_y(node->start.y);
+      const vfloat16 scale_y(node->scale.y);
+      const vfloat16 lower_y = madd(vfloat16(node->dequantize<4>(ray.nearY >> 2)),scale_y,start_y);
+      const vfloat16 upper_y = madd(vfloat16(node->dequantize<4>(ray.farY  >> 2)),scale_y,start_y);
+      const vfloat16 start_z(node->start.z);
+      const vfloat16 scale_z(node->scale.z);
+      const vfloat16 lower_z = madd(vfloat16(node->dequantize<4>(ray.nearZ >> 2)),scale_z,start_z);
+      const vfloat16 upper_z = madd(vfloat16(node->dequantize<4>(ray.farZ  >> 2)),scale_z,start_z);
+      // Per-axis slab distances; presumably plane*rdir - org_rdir with org_rdir == org*rdir - confirm in TravRay.
+      const vfloat16 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat16 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat16 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+      const vfloat16 tFarX  = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat16 tFarY  = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat16 tFarZ  = msub(upper_z, ray.rdir.z, ray.org_rdir.z);      
+      const vfloat16 tNear  = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat16 tFar   = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool16 vmask   = le(vbool16(0xf),tNear,tFar); // compare under mask 0xf (low 4 lanes); le(mask,a,b) presumably evaluates a<=b only on enabled lanes
+      const size_t mask     = movemask(vmask) & mvalid; // keep only lanes that are valid children
+      dist = tNear; // out: entry distances, written as a full 16-wide vector
+      return mask;
+    }
+
+    template<> // 4-wide quantized node evaluated in 16-wide registers, robust variant (TravRay<4,16,true>)
+      __forceinline size_t intersectNode<4,16>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,16,true>& ray, vfloat16& dist)
+    {
+      const size_t mvalid  = movemask(node->validMask()); // bitmask of the node's valid child slots
+      const vfloat16 start_x(node->start.x);
+      const vfloat16 scale_x(node->scale.x);
+      const vfloat16 lower_x = madd(vfloat16(node->dequantize<4>(ray.nearX >> 2)),scale_x,start_x); // 4 dequantized values converted to vfloat16, then rescaled with scale/start
+      const vfloat16 upper_x = madd(vfloat16(node->dequantize<4>(ray.farX  >> 2)),scale_x,start_x);
+      const vfloat16 start_y(node->start.y);
+      const vfloat16 scale_y(node->scale.y);
+      const vfloat16 lower_y = madd(vfloat16(node->dequantize<4>(ray.nearY >> 2)),scale_y,start_y);
+      const vfloat16 upper_y = madd(vfloat16(node->dequantize<4>(ray.farY  >> 2)),scale_y,start_y);
+      const vfloat16 start_z(node->start.z);
+      const vfloat16 scale_z(node->scale.z);
+      const vfloat16 lower_z = madd(vfloat16(node->dequantize<4>(ray.nearZ >> 2)),scale_z,start_z);
+      const vfloat16 upper_z = madd(vfloat16(node->dequantize<4>(ray.farZ  >> 2)),scale_z,start_z);
+      // Per-axis slab distances; the robust ray carries separate reciprocals for the near and far planes.
+      const vfloat16 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x;
+      const vfloat16 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y;
+      const vfloat16 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z;
+      const vfloat16 tFarX  = (upper_x - ray.org.x) * ray.rdir_far.x;
+      const vfloat16 tFarY  = (upper_y - ray.org.y) * ray.rdir_far.y;
+      const vfloat16 tFarZ  = (upper_z - ray.org.z) * ray.rdir_far.z;
+      // Clip the slab interval against [ray.tnear, ray.tfar].
+      const vfloat16 tNear  = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat16 tFar   = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool16 vmask   = le(vbool16(0xf),tNear,tFar); // compare under mask 0xf (low 4 lanes); le(mask,a,b) presumably evaluates a<=b only on enabled lanes
+      const size_t mask     = movemask(vmask) & mvalid; // keep only lanes that are valid children
+      dist = tNear; // out: entry distances, written as a full 16-wide vector
+      return mask;
+    }
+
+    template<> // 8-wide quantized node evaluated in 16-wide registers, non-robust variant (TravRay<8,16,false>)
+      __forceinline size_t intersectNode<8,16>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,16,false>& ray, vfloat16& dist)
+    {
+      const vbool16 m_valid(node->validMask16()); // validity mask, applied inside the masked compare below
+      const vfloat16 bminmaxX  = node->dequantizeLowerUpperX(ray.permX); // NOTE(review): presumably lower and upper x bounds packed in one vector, ordered via ray.permX - confirm
+      const vfloat16 bminmaxY  = node->dequantizeLowerUpperY(ray.permY);
+      const vfloat16 bminmaxZ  = node->dequantizeLowerUpperZ(ray.permZ);
+      const vfloat16 tNearFarX = msub(bminmaxX, ray.rdir.x, ray.org_rdir.x); // slab distances for the packed bounds
+      const vfloat16 tNearFarY = msub(bminmaxY, ray.rdir.y, ray.org_rdir.y);
+      const vfloat16 tNearFarZ = msub(bminmaxZ, ray.rdir.z, ray.org_rdir.z);
+      const vfloat16 tNear     = max(tNearFarX, tNearFarY, tNearFarZ, ray.tnear); // computed on all 16 lanes
+      const vfloat16 tFar      = min(tNearFarX, tNearFarY, tNearFarZ, ray.tfar);
+      const vbool16 vmask      = le(m_valid,tNear,align_shift_right<8>(tFar, tFar)); // tFar is shifted by 8 lanes before the compare, presumably to line the far half up with the near half
+      const size_t mask        = movemask(vmask);
+      dist = tNear; // out: entry distances, written as a full 16-wide vector
+      return mask;
+    }
+
+    template<> // 8-wide quantized node evaluated in 16-wide registers, robust variant (TravRay<8,16,true>)
+      __forceinline size_t intersectNode<8,16>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,16,true>& ray, vfloat16& dist)
+    {
+      const vbool16 m_valid(node->validMask16()); // validity mask, applied inside the masked compare below
+      const vfloat16 bminmaxX  = node->dequantizeLowerUpperX(ray.permX); // NOTE(review): presumably lower and upper x bounds packed in one vector, ordered via ray.permX - confirm
+      const vfloat16 bminmaxY  = node->dequantizeLowerUpperY(ray.permY);
+      const vfloat16 bminmaxZ  = node->dequantizeLowerUpperZ(ray.permZ);
+      const vfloat16 tNearFarX = (bminmaxX - ray.org.x) * ray.rdir_far.x; // FIXME: this is not conservative !!!!!!!!! (rdir_far is applied to every lane; the other robust overloads use rdir_near for the near planes)
+      const vfloat16 tNearFarY = (bminmaxY - ray.org.y) * ray.rdir_far.y;
+      const vfloat16 tNearFarZ = (bminmaxZ - ray.org.z) * ray.rdir_far.z;
+      const vfloat16 tNear     = max(tNearFarX, tNearFarY, tNearFarZ, ray.tnear); // computed on all 16 lanes
+      const vfloat16 tFar      = min(tNearFarX, tNearFarY, tNearFarZ, ray.tfar);
+      const vbool16 vmask      = le(m_valid,tNear,align_shift_right<8>(tFar, tFar)); // tFar is shifted by 8 lanes before the compare, presumably to line the far half up with the near half
+      const size_t mask        = movemask(vmask);
+      dist = tNear; // out: entry distances, written as a full 16-wide vector
+      return mask;
+    }
+
+    
+#endif
+
+
+    template<int N, int Nx> // motion-blur quantized node vs. a single ray at a given time, non-robust variant
+      __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,Nx,false>& ray, const float time, vfloat<N>& dist)
+    {
+      const vboolf<N> mvalid    = node->validMask(); // valid child lanes
+      const vfloat<N> lower_x   = node->dequantizeLowerX(time); // bounds obtained from the node for the requested time
+      const vfloat<N> upper_x   = node->dequantizeUpperX(time);
+      const vfloat<N> lower_y   = node->dequantizeLowerY(time);
+      const vfloat<N> upper_y   = node->dequantizeUpperY(time);
+      const vfloat<N> lower_z   = node->dequantizeLowerZ(time);
+      const vfloat<N> upper_z   = node->dequantizeUpperZ(time);     
+#if defined(__FMA_X4__)
+#if defined(__aarch64__)
+      const vfloat<N> tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<N> tFarX  = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tFarY  = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tFarZ  = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#else
+      const vfloat<N> tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<N> tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<N> tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+      const vfloat<N> tFarX  = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<N> tFarY  = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<N> tFarZ  = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#endif
+#else
+      const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir.x;
+      const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir.y;
+      const vfloat<N> tNearZ = (lower_z - ray.org.z) * ray.rdir.z;
+      const vfloat<N> tFarX  = (upper_x - ray.org.x) * ray.rdir.x;
+      const vfloat<N> tFarY  = (upper_y - ray.org.y) * ray.rdir.y;
+      const vfloat<N> tFarZ  = (upper_z - ray.org.z) * ray.rdir.z;
+#endif      
+      // Lower/upper are not pre-sorted by ray direction here, so order each axis pair explicitly before clipping.
+      const vfloat<N> tminX = mini(tNearX,tFarX);
+      const vfloat<N> tmaxX = maxi(tNearX,tFarX);
+      const vfloat<N> tminY = mini(tNearY,tFarY);
+      const vfloat<N> tmaxY = maxi(tNearY,tFarY);
+      const vfloat<N> tminZ = mini(tNearZ,tFarZ);
+      const vfloat<N> tmaxZ = maxi(tNearZ,tFarZ);
+      const vfloat<N> tNear = maxi(tminX,tminY,tminZ,ray.tnear); // clip to [ray.tnear, ray.tfar]
+      const vfloat<N> tFar  = mini(tmaxX,tmaxY,tmaxZ,ray.tfar);
+#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      const vbool<N> vmask =  le(mvalid,asInt(tNear),asInt(tFar)); // masked compare: validity folded into the compare
+#else
+      const vbool<N> vmask = (asInt(tNear) <= asInt(tFar)) & mvalid; // compare through asInt, then drop invalid lanes
+#endif
+      const size_t mask = movemask(vmask);
+      dist = tNear; // out: per-child entry distance (also written for lanes that miss)
+      return mask;      
+    }
+
+    template<int N, int Nx> // motion-blur quantized node vs. a single ray at a given time, robust variant
+      __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,Nx,true>& ray, const float time, vfloat<N>& dist)
+    {
+      const vboolf<N> mvalid    = node->validMask(); // valid child lanes
+      const vfloat<N> lower_x   = node->dequantizeLowerX(time); // bounds obtained from the node for the requested time
+      const vfloat<N> upper_x   = node->dequantizeUpperX(time);
+      const vfloat<N> lower_y   = node->dequantizeLowerY(time);
+      const vfloat<N> upper_y   = node->dequantizeUpperY(time);
+      const vfloat<N> lower_z   = node->dequantizeLowerZ(time);
+      const vfloat<N> upper_z   = node->dequantizeUpperZ(time);     
+      const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; // robust ray: separate reciprocals for lower and upper planes
+      const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir_near.y;
+      const vfloat<N> tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z;
+      const vfloat<N> tFarX  = (upper_x - ray.org.x) * ray.rdir_far.x;
+      const vfloat<N> tFarY  = (upper_y - ray.org.y) * ray.rdir_far.y;
+      const vfloat<N> tFarZ  = (upper_z - ray.org.z) * ray.rdir_far.z;
+      // Order each axis pair explicitly before clipping.
+      const vfloat<N> tminX = mini(tNearX,tFarX);
+      const vfloat<N> tmaxX = maxi(tNearX,tFarX);
+      const vfloat<N> tminY = mini(tNearY,tFarY);
+      const vfloat<N> tmaxY = maxi(tNearY,tFarY);
+      const vfloat<N> tminZ = mini(tNearZ,tFarZ);
+      const vfloat<N> tmaxZ = maxi(tNearZ,tFarZ);
+      const vfloat<N> tNear = maxi(tminX,tminY,tminZ,ray.tnear); // clip to [ray.tnear, ray.tfar]
+      const vfloat<N> tFar  = mini(tmaxX,tmaxY,tmaxZ,ray.tfar);
+#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      const vbool<N> vmask =  le(mvalid,asInt(tNear),asInt(tFar)); // masked compare: validity folded into the compare
+#else
+      const vbool<N> vmask = (asInt(tNear) <= asInt(tFar)) & mvalid; // compare through asInt, then drop invalid lanes
+#endif
+      const size_t mask = movemask(vmask);
+      dist = tNear; // out: per-child entry distance (also written for lanes that miss)
+      return mask;      
+    }
+
+
+#if defined(__AVX512ER__)
+    // for KNL
+    template<> // KNL specialization: 4-wide motion-blur quantized node computed in 16-wide registers, non-robust
+      __forceinline size_t intersectNode<4,16>(const typename BVHN<4>::QuantizedBaseNodeMB* node, const TravRay<4,16,false>& ray, const float time, vfloat<4>& dist)
+    {
+      const size_t  mvalid    = movemask(node->validMask()); // bitmask of the node's valid child slots
+      const vfloat16 lower_x  = node->dequantizeLowerX(time); // bounds for the requested time, held in vfloat16
+      const vfloat16 upper_x  = node->dequantizeUpperX(time);
+      const vfloat16 lower_y  = node->dequantizeLowerY(time);
+      const vfloat16 upper_y  = node->dequantizeUpperY(time);
+      const vfloat16 lower_z  = node->dequantizeLowerZ(time);
+      const vfloat16 upper_z  = node->dequantizeUpperZ(time);     
+      // Per-axis slab distances; presumably plane*rdir - org_rdir with org_rdir == org*rdir - confirm in TravRay.
+      const vfloat16 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat16 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat16 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+      const vfloat16 tFarX  = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat16 tFarY  = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat16 tFarZ  = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+      // Order each axis pair explicitly, then clip to [ray.tnear, ray.tfar].
+      const vfloat16 tminX = min(tNearX,tFarX);
+      const vfloat16 tmaxX = max(tNearX,tFarX);
+      const vfloat16 tminY = min(tNearY,tFarY);
+      const vfloat16 tmaxY = max(tNearY,tFarY);
+      const vfloat16 tminZ = min(tNearZ,tFarZ);
+      const vfloat16 tmaxZ = max(tNearZ,tFarZ);
+      const vfloat16 tNear = max(tminX,tminY,tminZ,ray.tnear);
+      const vfloat16 tFar  = min(tmaxX,tmaxY,tmaxZ,ray.tfar );
+      const vbool16 vmask =  tNear <= tFar;
+      const size_t mask = movemask(vmask) & mvalid; // the AND with mvalid also discards lanes that are not valid children
+      dist = extractN<4,0>(tNear); // out: narrowed to 4 lanes (presumably the lowest four - confirm extractN)
+      return mask;      
+    }
+
+
+    // for KNL
+    template<> // KNL specialization: 4-wide motion-blur quantized node computed in 16-wide registers, robust
+      __forceinline size_t intersectNode<4,16>(const typename BVHN<4>::QuantizedBaseNodeMB* node, const TravRay<4,16,true>& ray, const float time, vfloat<4>& dist)
+    {
+      const size_t  mvalid    = movemask(node->validMask()); // bitmask of the node's valid child slots
+      const vfloat16 lower_x  = node->dequantizeLowerX(time); // bounds for the requested time, held in vfloat16
+      const vfloat16 upper_x  = node->dequantizeUpperX(time);
+      const vfloat16 lower_y  = node->dequantizeLowerY(time);
+      const vfloat16 upper_y  = node->dequantizeUpperY(time);
+      const vfloat16 lower_z  = node->dequantizeLowerZ(time);
+      const vfloat16 upper_z  = node->dequantizeUpperZ(time);     
+      // Per-axis slab distances; the robust ray carries separate reciprocals for lower and upper planes.
+      const vfloat16 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x;
+      const vfloat16 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y;
+      const vfloat16 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z;
+      const vfloat16 tFarX  = (upper_x - ray.org.x) * ray.rdir_far.x;
+      const vfloat16 tFarY  = (upper_y - ray.org.y) * ray.rdir_far.y;
+      const vfloat16 tFarZ  = (upper_z - ray.org.z) * ray.rdir_far.z;
+      // Order each axis pair explicitly, then clip to [ray.tnear, ray.tfar].
+      const vfloat16 tminX = min(tNearX,tFarX);
+      const vfloat16 tmaxX = max(tNearX,tFarX);
+      const vfloat16 tminY = min(tNearY,tFarY);
+      const vfloat16 tmaxY = max(tNearY,tFarY);
+      const vfloat16 tminZ = min(tNearZ,tFarZ);
+      const vfloat16 tmaxZ = max(tNearZ,tFarZ);
+      const vfloat16 tNear = max(tminX,tminY,tminZ,ray.tnear);
+      const vfloat16 tFar  = min(tmaxX,tmaxY,tmaxZ,ray.tfar );
+      const vbool16 vmask =  tNear <= tFar;
+      const size_t mask = movemask(vmask) & mvalid; // the AND with mvalid also discards lanes that are not valid children
+      dist = extractN<4,0>(tNear); // out: narrowed to 4 lanes (presumably the lowest four - confirm extractN)
+      return mask;      
+    }
+
+#endif
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast OBBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, bool robust> // oriented-box node vs. a single ray; returns the hit lane mask, writes entry distances to dist
+      __forceinline size_t intersectNode(const typename BVHN<N>::OBBNode* node, const TravRay<N,N,robust>& ray, vfloat<N>& dist)
+    {
+      const Vec3vf<N> dir = xfmVector(node->naabb,ray.dir); // ray direction transformed by each child's naabb
+      //const Vec3vf<N> nrdir = Vec3vf<N>(vfloat<N>(-1.0f))/dir;
+      const Vec3vf<N> nrdir = Vec3vf<N>(vfloat<N>(-1.0f))*rcp_safe(dir); // negated reciprocal direction; rcp_safe is used in place of the plain division kept above
+      const Vec3vf<N> org = xfmPoint(node->naabb,ray.org); // ray origin transformed by each child's naabb
+      const Vec3vf<N> tLowerXYZ = org * nrdir;       // (Vec3fa(zero) - org) * rdir;
+      const Vec3vf<N> tUpperXYZ = tLowerXYZ - nrdir; // (Vec3fa(one ) - org) * rdir;
+      // In the transformed space the box planes sit at 0 and 1 (see the two comments above); order each pair by distance.
+      const vfloat<N> tNearX = mini(tLowerXYZ.x,tUpperXYZ.x);
+      const vfloat<N> tNearY = mini(tLowerXYZ.y,tUpperXYZ.y);
+      const vfloat<N> tNearZ = mini(tLowerXYZ.z,tUpperXYZ.z);
+      const vfloat<N> tFarX  = maxi(tLowerXYZ.x,tUpperXYZ.x);
+      const vfloat<N> tFarY  = maxi(tLowerXYZ.y,tUpperXYZ.y);
+      const vfloat<N> tFarZ  = maxi(tLowerXYZ.z,tUpperXYZ.z);
+      vfloat<N> tNear  = max(ray.tnear, tNearX,tNearY,tNearZ);
+      vfloat<N> tFar   = min(ray.tfar,  tFarX ,tFarY ,tFarZ );
+      if (robust) { // robust mode scales both ends of the interval by a factor 3 ulp away from 1
+        tNear = tNear*vfloat<N>(1.0f-3.0f*float(ulp));
+        tFar  = tFar *vfloat<N>(1.0f+3.0f*float(ulp));
+      }
+      const vbool<N> vmask = tNear <= tFar;
+      dist = tNear; // out: per-child entry distance
+      return movemask(vmask); // note: no valid-child mask is applied in this overload
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast OBBNodeMB intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, bool robust> // motion-blur oriented-box node vs. a single ray at a given time
+      __forceinline size_t intersectNode(const typename BVHN<N>::OBBNodeMB* node, const TravRay<N,N,robust>& ray, const float time, vfloat<N>& dist)
+    {
+      const AffineSpace3vf<N> xfm = node->space0; // transform applied to the ray below
+      const Vec3vf<N> b0_lower = zero; // first lerp endpoint is the fixed box [0,1]^3
+      const Vec3vf<N> b0_upper = one;
+      const Vec3vf<N> lower = lerp(b0_lower,node->b1.lower,vfloat<N>(time)); // bounds blended between [0,1]^3 and node->b1 by time
+      const Vec3vf<N> upper = lerp(b0_upper,node->b1.upper,vfloat<N>(time));
+      // Transform the ray with space0 and run a slab test against the blended bounds.
+      const BBox3vf<N> bounds(lower,upper);
+      const Vec3vf<N> dir = xfmVector(xfm,ray.dir);
+      const Vec3vf<N> rdir = rcp_safe(dir);
+      const Vec3vf<N> org = xfmPoint(xfm,ray.org);
+      // Plane distances per axis.
+      const Vec3vf<N> tLowerXYZ = (bounds.lower - org) * rdir;
+      const Vec3vf<N> tUpperXYZ = (bounds.upper - org) * rdir;
+      // Order each axis pair by distance, then clip to [ray.tnear, ray.tfar].
+      const vfloat<N> tNearX = mini(tLowerXYZ.x,tUpperXYZ.x);
+      const vfloat<N> tNearY = mini(tLowerXYZ.y,tUpperXYZ.y);
+      const vfloat<N> tNearZ = mini(tLowerXYZ.z,tUpperXYZ.z);
+      const vfloat<N> tFarX  = maxi(tLowerXYZ.x,tUpperXYZ.x);
+      const vfloat<N> tFarY  = maxi(tLowerXYZ.y,tUpperXYZ.y);
+      const vfloat<N> tFarZ  = maxi(tLowerXYZ.z,tUpperXYZ.z);
+      vfloat<N> tNear  = max(ray.tnear, tNearX,tNearY,tNearZ);
+      vfloat<N> tFar   = min(ray.tfar,  tFarX ,tFarY ,tFarZ );
+      if (robust) { // robust mode scales both ends of the interval by a factor 3 ulp away from 1
+        tNear = tNear*vfloat<N>(1.0f-3.0f*float(ulp));
+        tFar  = tFar *vfloat<N>(1.0f+3.0f*float(ulp));
+      }
+      const vbool<N> vmask = tNear <= tFar;
+      dist = tNear; // out: per-child entry distance
+      return movemask(vmask); // note: no valid-child mask is applied in this overload
+    }
+    
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Node intersectors used in point query traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+    
+    /*! Sphere point-query test of an N-wide node against 1 point query; specialized below per set of node types (types) the BVH may contain */
+    template<int N, int types>
+    struct BVHNNodePointQuerySphere1;
+
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_AN1> // sphere query, node set: AABBNode
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        const bool isInner = !node.isLeaf(); // a leaf yields false and leaves mask/dist untouched
+        if (likely(isInner)) mask = pointQueryNodeSphere(node.getAABBNode(), query, dist); // time is not forwarded for this node type
+        return isInner;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_AN2> // sphere query, node set: AABBNodeMB
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        const bool isInner = !node.isLeaf(); // a leaf yields false and leaves mask/dist untouched
+        if (likely(isInner)) mask = pointQueryNodeSphere(node.getAABBNodeMB(), query, time, dist);
+        return isInner;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_AN2_AN4D> // sphere query, node set: AN2 + AN4D, both handled by the MB4D helper
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        const bool isInner = !node.isLeaf(); // a leaf yields false and leaves mask/dist untouched
+        if (likely(isInner)) mask = pointQueryNodeSphereMB4D<N>(node, query, time, dist); // helper receives the NodeRef itself
+        return isInner;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_AN1_UN1> // sphere query, node set: AABBNode + OBBNode
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (likely(node.isAABBNode()))  { mask = pointQueryNodeSphere(node.getAABBNode(), query, dist);   return true; }
+        if (unlikely(node.isOBBNode())) { mask = pointQueryNodeSphere(node.ungetAABBNode(), query, dist); return true; }
+        // Neither inner-node type matched: report false without touching mask/dist.
+        return false;
+      }
+    };
+    
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_AN2_UN2> // sphere query, node set: AABBNodeMB + OBBNodeMB
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (likely(node.isAABBNodeMB()))  { mask = pointQueryNodeSphere(node.getAABBNodeMB(), query, time, dist);   return true; }
+        if (unlikely(node.isOBBNodeMB())) { mask = pointQueryNodeSphere(node.ungetAABBNodeMB(), query, time, dist); return true; }
+        // Neither inner-node type matched: report false without touching mask/dist.
+        return false;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_AN2_AN4D_UN2> // sphere query, node set: AN2 + AN4D + OBBNodeMB
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false; // leaves: mask/dist stay untouched
+        mask = unlikely(node.isOBBNodeMB()) ? pointQueryNodeSphere(node.ungetAABBNodeMB(), query, time, dist)
+                                            : pointQueryNodeSphereMB4D(node, query, time, dist);
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_QN1> // sphere query, node set: QuantizedNode
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        const bool isInner = !node.isLeaf(); // a leaf yields false and leaves mask/dist untouched
+        if (likely(isInner)) mask = pointQueryNodeSphere((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), query, dist);
+        return isInner;
+      }
+    };
+    
+    template<int N> // sphere point-query entry points that take a quantized base node pointer directly (no NodeRef dispatch)
+    struct BVHNQuantizedBaseNodePointQuerySphere1
+    {
+      static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+      {
+        return pointQueryNodeSphere(node,query,dist); // forwards unchanged; returns the helper's mask
+      }
+      // Overload for motion-blur nodes: additionally forwards the query time.
+      static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+      {
+        return pointQueryNodeSphere(node,query,time,dist);
+      }
+    };
+
+    /*! AABB point-query test of an N-wide node against 1 point query; specialized below per set of node types (types) the BVH may contain */
+    template<int N, int types>
+    struct BVHNNodePointQueryAABB1;
+
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_AN1> // AABB query, node set: AABBNode
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        const bool isInner = !node.isLeaf(); // a leaf yields false and leaves mask/dist untouched
+        if (likely(isInner)) mask = pointQueryNodeAABB(node.getAABBNode(), query, dist); // time is not forwarded for this node type
+        return isInner;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_AN2> // AABB query, node set: AABBNodeMB
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        const bool isInner = !node.isLeaf(); // a leaf yields false and leaves mask/dist untouched
+        if (likely(isInner)) mask = pointQueryNodeAABB(node.getAABBNodeMB(), query, time, dist);
+        return isInner;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_AN2_AN4D> // AABB query, node set: AN2 + AN4D, both handled by the MB4D helper
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        const bool isInner = !node.isLeaf(); // a leaf yields false and leaves mask/dist untouched
+        if (likely(isInner)) mask = pointQueryNodeAABBMB4D<N>(node, query, time, dist); // helper receives the NodeRef itself
+        return isInner;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_AN1_UN1> // AABB query, node set: AABBNode + OBBNode
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (likely(node.isAABBNode()))  { mask = pointQueryNodeAABB(node.getAABBNode(), query, dist);   return true; }
+        if (unlikely(node.isOBBNode())) { mask = pointQueryNodeAABB(node.ungetAABBNode(), query, dist); return true; }
+        // Neither inner-node type matched: report false without touching mask/dist.
+        return false;
+      }
+    };
+    
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_AN2_UN2> // AABB query, node set: AABBNodeMB + OBBNodeMB
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (likely(node.isAABBNodeMB()))  { mask = pointQueryNodeAABB(node.getAABBNodeMB(), query, time, dist);   return true; }
+        if (unlikely(node.isOBBNodeMB())) { mask = pointQueryNodeAABB(node.ungetAABBNodeMB(), query, time, dist); return true; }
+        // Neither inner-node type matched: report false without touching mask/dist.
+        return false;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_AN2_AN4D_UN2> // AABB query, node set: AN2 + AN4D + OBBNodeMB
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false; // leaves: mask/dist stay untouched
+        mask = unlikely(node.isOBBNodeMB()) ? pointQueryNodeAABB(node.ungetAABBNodeMB(), query, time, dist)
+                                            : pointQueryNodeAABBMB4D(node, query, time, dist);
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_QN1> // AABB query, node set: QuantizedNode
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        const bool isInner = !node.isLeaf(); // a leaf yields false and leaves mask/dist untouched
+        if (likely(isInner)) mask = pointQueryNodeAABB((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), query, dist);
+        return isInner;
+      }
+    };
+    
+    template<int N> // AABB point-query entry points that take a quantized base node pointer directly (no NodeRef dispatch)
+    struct BVHNQuantizedBaseNodePointQueryAABB1
+    {
+      static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+      {
+        return pointQueryNodeAABB(node,query,dist); // forwards unchanged; returns the helper's mask
+      }
+      // Overload for motion-blur nodes: additionally forwards the query time.
+      static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+      {
+        return pointQueryNodeAABB(node,query,time,dist);
+      }
+    };
+
+    
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Node intersectors used in ray traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    /*! Intersects N nodes with 1 ray; specialized below per node-type set (types) and per robust flag. Nx is the width of the TravRay and of the distance vector. */
+    template<int N, int Nx, int types, bool robust>
+    struct BVHNNodeIntersector1;
+
+    template<int N, int Nx>
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN1, false> // node set: AABBNode, fast mode
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+      {
+        const bool isInner = !node.isLeaf(); // a leaf yields false and leaves mask/dist untouched
+        if (likely(isInner)) mask = intersectNode(node.getAABBNode(), ray, dist); // time is not forwarded for this node type
+        return isInner;
+      }
+    };
+
+    template<int N, int Nx>
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN1, true> // BVH_AN1 node set, robust (robust == true)
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+      { // time is not used by this specialization
+        if (unlikely(node.isLeaf())) return false; // leaf reference: mask/dist are not written
+        mask = intersectNodeRobust(node.getAABBNode(), ray, dist); // same flow as the fast variant, but calls the *Robust helper
+        return true; // mask holds intersectNodeRobust's return value
+      }
+    };
+
+    template<int N, int Nx>
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN2, false> // BVH_AN2 node set (AABBNodeMB), non-robust
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+      { // unlike the BVH_AN1 variants, time is forwarded to the node test
+        if (unlikely(node.isLeaf())) return false; // leaf reference: mask/dist are not written
+        mask = intersectNode(node.getAABBNodeMB(), ray, time, dist); // any non-leaf reference is read as an AABBNodeMB
+        return true; // mask holds intersectNode's return value
+      }
+    };
+
+    template<int N, int Nx>
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN2, true> // BVH_AN2 node set (AABBNodeMB), robust
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+      { // time is forwarded to the node test
+        if (unlikely(node.isLeaf())) return false; // leaf reference: mask/dist are not written
+        mask = intersectNodeRobust(node.getAABBNodeMB(), ray, time, dist); // robust helper on the AABBNodeMB
+        return true; // mask holds intersectNodeRobust's return value
+      }
+    };
+
+    template<int N, int Nx>
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN2_AN4D, false> // BVH_AN2_AN4D node set, non-robust
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+      { // passes the NodeRef itself to the helper instead of extracting a concrete node pointer
+        if (unlikely(node.isLeaf())) return false; // leaf reference: mask/dist are not written
+        mask = intersectNodeMB4D<N>(node, ray, time, dist); // N is given explicitly as a template argument
+        return true; // mask holds intersectNodeMB4D's return value
+      }
+    };
+
+    template<int N, int Nx>
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN2_AN4D, true> // BVH_AN2_AN4D node set, robust
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+      { // passes the NodeRef itself to the helper instead of extracting a concrete node pointer
+        if (unlikely(node.isLeaf())) return false; // leaf reference: mask/dist are not written
+        mask = intersectNodeMB4DRobust<N>(node, ray, time, dist); // robust counterpart of intersectNodeMB4D
+        return true; // mask holds intersectNodeMB4DRobust's return value
+      }
+    };
+
+    template<int N, int Nx>
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN1_UN1, false> // BVH_AN1_UN1 node set (AABB + OBB nodes), non-robust
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+      { // dispatches on the node kind instead of testing isLeaf(); time is not used
+        if (likely(node.isAABBNode()))          mask = intersectNode(node.getAABBNode(), ray, dist); // AABB node, hinted as the common case
+        else if (unlikely(node.isOBBNode())) mask = intersectNode(node.ungetAABBNode(), ray, dist); // OBB node, obtained via ungetAABBNode()
+        else return false; // neither kind matched: mask/dist are not written
+        return true; // one of the two node tests ran and filled mask
+      }
+    };
+
+    template<int N, int Nx>
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN1_UN1, true> // BVH_AN1_UN1 node set (AABB + OBB nodes), robust
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+      { // dispatches on the node kind instead of testing isLeaf(); time is not used
+        if (likely(node.isAABBNode()))          mask = intersectNodeRobust(node.getAABBNode(), ray, dist); // AABB node, robust helper
+        else if (unlikely(node.isOBBNode())) mask = intersectNode(node.ungetAABBNode(), ray, dist); // NOTE(review): OBB path calls intersectNode, not a *Robust helper - confirm intended
+        else return false; // neither kind matched: mask/dist are not written
+        return true; // one of the two node tests ran and filled mask
+      }
+    };
+
+    template<int N, int Nx>
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN2_UN2, false> // BVH_AN2_UN2 node set (AABBNodeMB + OBBNodeMB), non-robust
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+      { // dispatches on the node kind instead of testing isLeaf(); time is forwarded to both tests
+        if (likely(node.isAABBNodeMB()))           mask = intersectNode(node.getAABBNodeMB(), ray, time, dist); // AABB motion-blur node, hinted as the common case
+        else if (unlikely(node.isOBBNodeMB()))  mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); // OBB motion-blur node
+        else return false; // neither kind matched: mask/dist are not written
+        return true; // one of the two node tests ran and filled mask
+      }
+    };
+
+    template<int N, int Nx>
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN2_UN2, true> // BVH_AN2_UN2 node set (AABBNodeMB + OBBNodeMB), robust
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+      { // dispatches on the node kind instead of testing isLeaf(); time is forwarded to both tests
+        if (likely(node.isAABBNodeMB()))           mask = intersectNodeRobust(node.getAABBNodeMB(), ray, time, dist); // AABB motion-blur node, robust helper
+        else if (unlikely(node.isOBBNodeMB()))  mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); // NOTE(review): OBB path calls intersectNode, not a *Robust helper - confirm intended
+        else return false; // neither kind matched: mask/dist are not written
+        return true; // one of the two node tests ran and filled mask
+      }
+    };
+
+    template<int N, int Nx>
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN2_AN4D_UN2, false> // BVH_AN2_AN4D_UN2 node set, non-robust
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+      { // leaf check first, then OBB motion-blur nodes are split off from the MB4D path
+        if (unlikely(node.isLeaf())) return false; // leaf reference: mask/dist are not written
+        if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); // OBB motion-blur node
+        else                                    mask = intersectNodeMB4D(node, ray, time, dist); // all remaining inner nodes; template arguments are deduced here
+        return true; // mask holds the selected helper's return value
+      }
+    };
+
+    template<int N, int Nx>
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN2_AN4D_UN2, true> // BVH_AN2_AN4D_UN2 node set, robust
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+      { // leaf check first, then OBB motion-blur nodes are split off from the MB4D path
+        if (unlikely(node.isLeaf())) return false; // leaf reference: mask/dist are not written
+        if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); // NOTE(review): OBB path calls intersectNode, not a *Robust helper - confirm intended
+        else                                    mask = intersectNodeMB4DRobust(node, ray, time, dist); // all remaining inner nodes, robust helper
+        return true; // mask holds the selected helper's return value
+      }
+    };
+
+    template<int N, int Nx>
+    struct BVHNNodeIntersector1<N, Nx, BVH_QN1, false> // BVH_QN1 node set (quantized nodes), non-robust
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+      { // time is not used by this specialization
+        if (unlikely(node.isLeaf())) return false; // leaf reference: mask/dist are not written
+        mask = intersectNode((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), ray, dist); // result of quantizedNode() is cast to QuantizedNode* before the test
+        return true; // mask holds intersectNode's return value
+      }
+    };
+
+    template<int N, int Nx>
+    struct BVHNNodeIntersector1<N, Nx, BVH_QN1, true> // BVH_QN1 node set (quantized nodes), robust
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+      { // time is not used by this specialization
+        if (unlikely(node.isLeaf())) return false; // leaf reference: mask/dist are not written
+        mask = intersectNodeRobust((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), ray, dist); // result of quantizedNode() is cast to QuantizedNode* before the robust test
+        return true; // mask holds intersectNodeRobust's return value
+      }
+    };
+
+    /*! Intersects N quantized base nodes with 1 ray */
+    template<int N, int Nx, bool robust>
+      struct BVHNQuantizedBaseNodeIntersector1;
+
+    template<int N, int Nx>
+      struct BVHNQuantizedBaseNodeIntersector1<N, Nx, false> // non-robust forwarders to intersectNode for quantized base nodes
+    {
+      static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,Nx,false>& ray, vfloat<Nx>& dist)
+      { // QuantizedBaseNode overload: no time argument; returns the helper's result unchanged
+        return intersectNode(node,ray,dist);
+      }
+
+      static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,Nx,false>& ray, const float time, vfloat<N>& dist)
+      { // QuantizedBaseNodeMB overload: forwards time. NOTE(review): dist is vfloat<N> here but vfloat<Nx> in the overload above - confirm intended
+        return intersectNode(node,ray,time,dist);
+      }
+
+    };
+
+    template<int N, int Nx>
+      struct BVHNQuantizedBaseNodeIntersector1<N, Nx, true> // robust flavour: same calls as the fast one, but with TravRay<N,Nx,true>
+    {
+      static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,Nx,true>& ray, vfloat<Nx>& dist)
+      { // calls intersectNode (no *Robust name); presumably overload resolution on the robust ray type picks the implementation - TODO confirm
+        return intersectNode(node,ray,dist); 
+      }
+
+      static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,Nx,true>& ray, const float time, vfloat<N>& dist)
+      { // QuantizedBaseNodeMB overload: forwards time. NOTE(review): dist is vfloat<N> here but vfloat<Nx> in the overload above - confirm intended
+        return intersectNode(node,ray,time,dist);
+      }
+
+    };
+
+
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h
new file mode 100644
index 0000000000..800ac8b478
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h
@@ -0,0 +1,269 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Frustum structure used in hybrid and stream traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    /*
+       Optimized frustum test. We calculate t=(p-org)/dir in ray/box
+       intersection. We assume the rays are split by octant, thus
+       dir intervals are either positive or negative in each
+       dimension.
+
+       Case 1: dir.min >= 0 && dir.max >= 0:
+         t_min = (p_min - org_max) / dir_max = (p_min - org_max)*rdir_min = p_min*rdir_min - org_max*rdir_min
+         t_max = (p_max - org_min) / dir_min = (p_max - org_min)*rdir_max = p_max*rdir_max - org_min*rdir_max
+
+       Case 2: dir.min < 0 && dir.max < 0:
+         t_min = (p_max - org_min) / dir_min = (p_max - org_min)*rdir_max = p_max*rdir_max - org_min*rdir_max
+         t_max = (p_min - org_max) / dir_max = (p_min - org_max)*rdir_min = p_min*rdir_min - org_max*rdir_min
+    */
+
+    template<bool robust>
+    struct Frustum;
+    
+    /* Fast variant */
+    template<>
+    struct Frustum<false> // fast variant: keeps pre-multiplied org*rdir terms for the node test
+    {
+      __forceinline Frustum() {} // empty body: no member is assigned here, init() must be called separately
+
+      template<int K>
+      __forceinline Frustum(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N)
+      { // convenience constructor: forwards everything to the packet init() below
+        init(valid, org, rdir, ray_tnear, ray_tfar, N);
+      }
+
+      template<int K>
+      __forceinline void init(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N)
+      { // Reduces a K-wide packet to scalar min/max bounds; lanes not set in valid are replaced by +inf (for min) or -inf (for max).
+        const Vec3fa reduced_min_org(reduce_min(select(valid, org.x, pos_inf)), // per-axis minimum origin over valid lanes
+                                     reduce_min(select(valid, org.y, pos_inf)),
+                                     reduce_min(select(valid, org.z, pos_inf)));
+
+        const Vec3fa reduced_max_org(reduce_max(select(valid, org.x, neg_inf)), // per-axis maximum origin over valid lanes
+                                     reduce_max(select(valid, org.y, neg_inf)),
+                                     reduce_max(select(valid, org.z, neg_inf)));
+
+        const Vec3fa reduced_min_rdir(reduce_min(select(valid, rdir.x, pos_inf)), // per-axis minimum of rdir over valid lanes
+                                      reduce_min(select(valid, rdir.y, pos_inf)),
+                                      reduce_min(select(valid, rdir.z, pos_inf)));
+
+        const Vec3fa reduced_max_rdir(reduce_max(select(valid, rdir.x, neg_inf)), // per-axis maximum of rdir over valid lanes
+                                      reduce_max(select(valid, rdir.y, neg_inf)),
+                                      reduce_max(select(valid, rdir.z, neg_inf)));
+
+        const float reduced_min_dist = reduce_min(select(valid, ray_tnear, vfloat<K>(pos_inf))); // smallest tnear over valid lanes
+        const float reduced_max_dist = reduce_max(select(valid, ray_tfar , vfloat<K>(neg_inf))); // largest tfar over valid lanes
+
+        init(reduced_min_org, reduced_max_org, reduced_min_rdir, reduced_max_rdir, reduced_min_dist, reduced_max_dist, N); // hand the scalar bounds to the overload below
+      }
+
+      __forceinline void init(const Vec3fa& reduced_min_org,
+                              const Vec3fa& reduced_max_org,
+                              const Vec3fa& reduced_min_rdir,
+                              const Vec3fa& reduced_max_rdir,
+                              float reduced_min_dist,
+                              float reduced_max_dist,
+                              int N)
+      { // Fills all members from already-reduced scalar bounds.
+        const Vec3ba pos_rdir = ge_mask(reduced_min_rdir, Vec3fa(zero)); // per-axis mask built from reduced_min_rdir vs. zero (presumably >=, going by the name)
+
+        min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir); // min/max rdir are swapped on axes where the mask is not set
+        max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir);
+
+#if defined (__aarch64__)
+        neg_min_org_rdir = -(min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org)); // stored negated so the node test can use madd
+        neg_max_org_rdir = -(max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org));
+#else
+        min_org_rdir = min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org); // stored un-negated; the node test subtracts it with msub
+        max_org_rdir = max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org);
+#endif
+        min_dist = reduced_min_dist;
+        max_dist = reduced_max_dist;
+
+        nf = NearFarPrecalculations(min_rdir, N); // near/far offsets are derived from min_rdir and N
+      }
+
+      template<int K>
+      __forceinline void updateMaxDist(const vfloat<K>& ray_tfar)
+      { // overwrites max_dist with the maximum over all K lanes; no valid mask is applied here
+        max_dist = reduce_max(ray_tfar);
+      }
+
+      NearFarPrecalculations nf; // nearX/Y/Z and farX/Y/Z are used as byte offsets by intersectNodeFrustum
+
+      Vec3fa min_rdir;
+      Vec3fa max_rdir;
+
+#if defined (__aarch64__)
+      Vec3fa neg_min_org_rdir; // -(min_rdir * selected org), see init()
+      Vec3fa neg_max_org_rdir; // -(max_rdir * selected org), see init()
+#else
+      Vec3fa min_org_rdir; // min_rdir * selected org, see init()
+      Vec3fa max_org_rdir; // max_rdir * selected org, see init()
+#endif
+      float min_dist; // reduced tnear
+      float max_dist; // reduced tfar; can be refreshed through updateMaxDist()
+    };
+
+    typedef Frustum<false> FrustumFast;
+
+    /* Robust variant */
+    template<>
+    struct Frustum<true> // robust variant: keeps origins and rdir separately instead of their product
+    {
+      __forceinline Frustum() {} // empty body: no member is assigned here, init() must be called separately
+
+      template<int K>
+      __forceinline Frustum(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N)
+      { // convenience constructor: forwards everything to the packet init() below
+        init(valid, org, rdir, ray_tnear, ray_tfar, N);
+      }
+
+      template<int K>
+      __forceinline void init(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N)
+      { // Reduces a K-wide packet to scalar min/max bounds; lanes not set in valid are replaced by +inf (for min) or -inf (for max).
+        const Vec3fa reduced_min_org(reduce_min(select(valid, org.x, pos_inf)), // per-axis minimum origin over valid lanes
+                                     reduce_min(select(valid, org.y, pos_inf)),
+                                     reduce_min(select(valid, org.z, pos_inf)));
+
+        const Vec3fa reduced_max_org(reduce_max(select(valid, org.x, neg_inf)), // per-axis maximum origin over valid lanes
+                                     reduce_max(select(valid, org.y, neg_inf)),
+                                     reduce_max(select(valid, org.z, neg_inf)));
+
+        const Vec3fa reduced_min_rdir(reduce_min(select(valid, rdir.x, pos_inf)), // per-axis minimum of rdir over valid lanes
+                                      reduce_min(select(valid, rdir.y, pos_inf)),
+                                      reduce_min(select(valid, rdir.z, pos_inf)));
+
+        const Vec3fa reduced_max_rdir(reduce_max(select(valid, rdir.x, neg_inf)), // per-axis maximum of rdir over valid lanes
+                                      reduce_max(select(valid, rdir.y, neg_inf)),
+                                      reduce_max(select(valid, rdir.z, neg_inf)));
+
+        const float reduced_min_dist = reduce_min(select(valid, ray_tnear, vfloat<K>(pos_inf))); // smallest tnear over valid lanes
+        const float reduced_max_dist = reduce_max(select(valid, ray_tfar , vfloat<K>(neg_inf))); // largest tfar over valid lanes
+
+        init(reduced_min_org, reduced_max_org, reduced_min_rdir, reduced_max_rdir, reduced_min_dist, reduced_max_dist, N); // hand the scalar bounds to the overload below
+      }
+
+      __forceinline void init(const Vec3fa& reduced_min_org,
+                              const Vec3fa& reduced_max_org,
+                              const Vec3fa& reduced_min_rdir,
+                              const Vec3fa& reduced_max_rdir,
+                              float reduced_min_dist,
+                              float reduced_max_dist,
+                              int N)
+      { // Fills all members from already-reduced scalar bounds.
+        const Vec3ba pos_rdir = ge_mask(reduced_min_rdir, Vec3fa(zero)); // per-axis mask built from reduced_min_rdir vs. zero (presumably >=, going by the name)
+        min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir); // min/max rdir are swapped on axes where the mask is not set
+        max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir);
+
+        min_org = select(pos_rdir, reduced_max_org, reduced_min_org); // note the crossing: min_org takes the max origin where the mask is set
+        max_org = select(pos_rdir, reduced_min_org, reduced_max_org);
+
+        min_dist = reduced_min_dist;
+        max_dist = reduced_max_dist;
+
+        nf = NearFarPrecalculations(min_rdir, N); // near/far offsets are derived from min_rdir and N
+      }
+
+      template<int K>
+      __forceinline void updateMaxDist(const vfloat<K>& ray_tfar)
+      { // overwrites max_dist with the maximum over all K lanes; no valid mask is applied here
+        max_dist = reduce_max(ray_tfar);
+      }
+
+      NearFarPrecalculations nf; // nearX/Y/Z and farX/Y/Z are used as byte offsets by intersectNodeFrustum
+
+      Vec3fa min_rdir;
+      Vec3fa max_rdir;
+
+      Vec3fa min_org; // origin paired with min_rdir in the node test
+      Vec3fa max_org; // origin paired with max_rdir in the node test
+
+      float min_dist; // reduced tnear
+      float max_dist; // reduced tfar; can be refreshed through updateMaxDist()
+    };
+
+    typedef Frustum<true> FrustumRobust;
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int Nx>
+    __forceinline size_t intersectNodeFrustum(const typename BVHN<N>::AABBNode* __restrict__ node,
+                                       const FrustumFast& frustum, vfloat<Nx>& dist)
+    { // Tests the node's child boxes against the fast frustum; returns a hit bit mask and writes the entry distances to dist.
+      const vfloat<Nx> bminX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearX); // bounds are fetched at byte offsets from &node->lower_x
+      const vfloat<Nx> bminY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearY);
+      const vfloat<Nx> bminZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearZ);
+      const vfloat<Nx> bmaxX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farX);
+      const vfloat<Nx> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY);
+      const vfloat<Nx> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ);
+
+#if defined (__aarch64__)
+      const vfloat<Nx> fminX = madd(bminX, vfloat<Nx>(frustum.min_rdir.x), vfloat<Nx>(frustum.neg_min_org_rdir.x)); // madd with the pre-negated org*rdir term
+      const vfloat<Nx> fminY = madd(bminY, vfloat<Nx>(frustum.min_rdir.y), vfloat<Nx>(frustum.neg_min_org_rdir.y));
+      const vfloat<Nx> fminZ = madd(bminZ, vfloat<Nx>(frustum.min_rdir.z), vfloat<Nx>(frustum.neg_min_org_rdir.z));
+      const vfloat<Nx> fmaxX = madd(bmaxX, vfloat<Nx>(frustum.max_rdir.x), vfloat<Nx>(frustum.neg_max_org_rdir.x));
+      const vfloat<Nx> fmaxY = madd(bmaxY, vfloat<Nx>(frustum.max_rdir.y), vfloat<Nx>(frustum.neg_max_org_rdir.y));
+      const vfloat<Nx> fmaxZ = madd(bmaxZ, vfloat<Nx>(frustum.max_rdir.z), vfloat<Nx>(frustum.neg_max_org_rdir.z));
+#else
+      const vfloat<Nx> fminX = msub(bminX, vfloat<Nx>(frustum.min_rdir.x), vfloat<Nx>(frustum.min_org_rdir.x)); // msub with the un-negated org*rdir term
+      const vfloat<Nx> fminY = msub(bminY, vfloat<Nx>(frustum.min_rdir.y), vfloat<Nx>(frustum.min_org_rdir.y));
+      const vfloat<Nx> fminZ = msub(bminZ, vfloat<Nx>(frustum.min_rdir.z), vfloat<Nx>(frustum.min_org_rdir.z));
+      const vfloat<Nx> fmaxX = msub(bmaxX, vfloat<Nx>(frustum.max_rdir.x), vfloat<Nx>(frustum.max_org_rdir.x));
+      const vfloat<Nx> fmaxY = msub(bmaxY, vfloat<Nx>(frustum.max_rdir.y), vfloat<Nx>(frustum.max_org_rdir.y));
+      const vfloat<Nx> fmaxZ = msub(bmaxZ, vfloat<Nx>(frustum.max_rdir.z), vfloat<Nx>(frustum.max_org_rdir.z));
+#endif
+      const vfloat<Nx> fmin  = maxi(fminX, fminY, fminZ, vfloat<Nx>(frustum.min_dist)); // entry distance, clamped from below by min_dist
+      dist = fmin; // reported to the caller even for lanes that end up missing
+      const vfloat<Nx> fmax  = mini(fmaxX, fmaxY, fmaxZ, vfloat<Nx>(frustum.max_dist)); // exit distance, clamped from above by max_dist
+      const vbool<Nx> vmask_node_hit = fmin <= fmax; // hit when the clamped interval is non-empty
+      size_t m_node = movemask(vmask_node_hit) & (((size_t)1 << N)-1); // keep only the low N bits (Nx lanes were computed)
+      return m_node;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int Nx>
+    __forceinline size_t intersectNodeFrustum(const typename BVHN<N>::AABBNode* __restrict__ node,
+                                       const FrustumRobust& frustum, vfloat<Nx>& dist)
+    { // Robust frustum test: subtract-then-multiply per axis, with the final comparison relaxed by a few ulps.
+      const vfloat<Nx> bminX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearX); // bounds are fetched at byte offsets from &node->lower_x
+      const vfloat<Nx> bminY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearY);
+      const vfloat<Nx> bminZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearZ);
+      const vfloat<Nx> bmaxX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farX);
+      const vfloat<Nx> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY);
+      const vfloat<Nx> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ);
+
+      const vfloat<Nx> fminX = (bminX - vfloat<Nx>(frustum.min_org.x)) * vfloat<Nx>(frustum.min_rdir.x); // (p - org) * rdir, no fused multiply-add
+      const vfloat<Nx> fminY = (bminY - vfloat<Nx>(frustum.min_org.y)) * vfloat<Nx>(frustum.min_rdir.y);
+      const vfloat<Nx> fminZ = (bminZ - vfloat<Nx>(frustum.min_org.z)) * vfloat<Nx>(frustum.min_rdir.z);
+      const vfloat<Nx> fmaxX = (bmaxX - vfloat<Nx>(frustum.max_org.x)) * vfloat<Nx>(frustum.max_rdir.x);
+      const vfloat<Nx> fmaxY = (bmaxY - vfloat<Nx>(frustum.max_org.y)) * vfloat<Nx>(frustum.max_rdir.y);
+      const vfloat<Nx> fmaxZ = (bmaxZ - vfloat<Nx>(frustum.max_org.z)) * vfloat<Nx>(frustum.max_rdir.z);
+
+      const float round_down = 1.0f-2.0f*float(ulp); // FIXME: use per instruction rounding for AVX512
+      const float round_up   = 1.0f+2.0f*float(ulp); // scale factors slightly below/above 1 applied in the comparison below
+      const vfloat<Nx> fmin  = max(fminX, fminY, fminZ, vfloat<Nx>(frustum.min_dist)); // entry distance, clamped from below by min_dist
+      dist = fmin; // dist receives the unscaled entry distance
+      const vfloat<Nx> fmax  = min(fmaxX, fmaxY, fmaxZ, vfloat<Nx>(frustum.max_dist)); // exit distance, clamped from above by max_dist
+      const vbool<Nx> vmask_node_hit = (round_down*fmin <= round_up*fmax); // only the comparison uses the scaled values
+      size_t m_node = movemask(vmask_node_hit) & (((size_t)1 << N)-1); // keep only the low N bits (Nx lanes were computed)
+      return m_node;
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h
new file mode 100644
index 0000000000..0543e56f8e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h
@@ -0,0 +1,843 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Ray packet structure used in hybrid traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int K, bool robust>
+    struct TravRayK;
+
+    /* Fast variant */
+    template<int K>
+    struct TravRayK<K, false> // fast variant: K-wide ray data with optional pre-multiplied org*rdir
+    {
+      __forceinline TravRayK() {} // empty body: no member is assigned here
+
+      __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N)
+      { // this overload does not assign tnear/tfar
+        init(ray_org, ray_dir, N);
+      }
+
+      __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N)
+      { // same as above, then copies the ray interval
+        init(ray_org, ray_dir, N);
+        tnear = ray_tnear;
+        tfar = ray_tfar;
+      }
+
+      __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N)
+      { // Stores origin/direction and derives rdir (plus the product term on aarch64/AVX2); tnear/tfar are not touched.
+        org = ray_org;
+        dir = ray_dir;
+        rdir = rcp_safe(ray_dir); // reciprocal direction via rcp_safe
+#if defined(__aarch64__)
+        neg_org_rdir = -(org * rdir); // negated product, consumed by madd in intersectNodeK
+#elif defined(__AVX2__)
+        org_rdir = org * rdir; // product, consumed by msub in intersectNodeK
+#endif
+        if (N) // nearXYZ is only filled for a non-zero N
+        {
+          const int size = sizeof(float)*N; // byte size of N floats
+          nearXYZ.x = select(rdir.x >= 0.0f, vint<K>(0*size), vint<K>(1*size)); // per lane: 0*size when rdir.x >= 0, else 1*size
+          nearXYZ.y = select(rdir.y >= 0.0f, vint<K>(2*size), vint<K>(3*size)); // per lane: 2*size or 3*size
+          nearXYZ.z = select(rdir.z >= 0.0f, vint<K>(4*size), vint<K>(5*size)); // per lane: 4*size or 5*size
+        }
+      }
+
+      Vec3vf<K> org;
+      Vec3vf<K> dir;
+      Vec3vf<K> rdir;
+#if defined(__aarch64__)
+      Vec3vf<K> neg_org_rdir; // -(org * rdir); exists only on aarch64 builds
+#elif defined(__AVX2__)
+      Vec3vf<K> org_rdir; // org * rdir; exists only on non-aarch64 AVX2 builds
+#endif
+      Vec3vi<K> nearXYZ; // per-lane offsets chosen by the sign of rdir, see init()
+      vfloat<K> tnear; // assigned only by the five-argument constructor
+      vfloat<K> tfar; // assigned only by the five-argument constructor
+    };
+
+    template<int K>
+    using TravRayKFast = TravRayK<K, false>;
+
+    /* Robust variant */
+    template<int K>
+    struct TravRayK<K, true> // robust variant: no pre-multiplied org*rdir member
+    {
+      __forceinline TravRayK() {} // empty body: no member is assigned here
+
+      __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N)
+      { // this overload does not assign tnear/tfar
+        init(ray_org, ray_dir, N);
+      }
+
+      __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N)
+      { // same as above, then copies the ray interval
+        init(ray_org, ray_dir, N);
+        tnear = ray_tnear;
+        tfar = ray_tfar;
+      }
+
+      __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N)
+      { // Stores origin/direction and derives rdir; tnear/tfar are not touched.
+        org = ray_org;
+        dir = ray_dir;
+        rdir = vfloat<K>(1.0f)/(zero_fix(ray_dir)); // true division on the zero_fix'ed direction (the fast variant uses rcp_safe instead)
+
+        if (N) // nearXYZ is only filled for a non-zero N
+        {
+          const int size = sizeof(float)*N; // byte size of N floats
+          nearXYZ.x = select(rdir.x >= 0.0f, vint<K>(0*size), vint<K>(1*size)); // per lane: 0*size when rdir.x >= 0, else 1*size
+          nearXYZ.y = select(rdir.y >= 0.0f, vint<K>(2*size), vint<K>(3*size)); // per lane: 2*size or 3*size
+          nearXYZ.z = select(rdir.z >= 0.0f, vint<K>(4*size), vint<K>(5*size)); // per lane: 4*size or 5*size
+        }
+      }
+
+      Vec3vf<K> org;
+      Vec3vf<K> dir;
+      Vec3vf<K> rdir;
+      Vec3vi<K> nearXYZ; // per-lane offsets chosen by the sign of rdir, see init()
+      vfloat<K> tnear; // assigned only by the five-argument constructor
+      vfloat<K> tfar; // assigned only by the five-argument constructor
+    };
+
+    template<int K>
+    using TravRayKRobust = TravRayK<K, true>;
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K>
+    __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::AABBNode* node, size_t i, // tests child box i of node against K rays
+                                         const TravRayKFast<K>& ray, vfloat<K>& dist) // dist receives the per-ray entry distance
+
+    {
+#if defined(__aarch64__)
+      const vfloat<K> lclipMinX = madd(node->lower_x[i], ray.rdir.x, ray.neg_org_rdir.x); // madd with the pre-negated org*rdir
+      const vfloat<K> lclipMinY = madd(node->lower_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMinZ = madd(node->lower_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> lclipMaxX = madd(node->upper_x[i], ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMaxY = madd(node->upper_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMaxZ = madd(node->upper_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__)
+      const vfloat<K> lclipMinX = msub(node->lower_x[i], ray.rdir.x, ray.org_rdir.x); // msub with the stored org*rdir
+      const vfloat<K> lclipMinY = msub(node->lower_y[i], ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMinZ = msub(node->lower_z[i], ray.rdir.z, ray.org_rdir.z);
+      const vfloat<K> lclipMaxX = msub(node->upper_x[i], ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMaxY = msub(node->upper_y[i], ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMaxZ = msub(node->upper_z[i], ray.rdir.z, ray.org_rdir.z);
+  #else
+      const vfloat<K> lclipMinX = (node->lower_x[i] - ray.org.x) * ray.rdir.x; // fallback: subtract, then multiply
+      const vfloat<K> lclipMinY = (node->lower_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (node->lower_z[i] - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (node->upper_x[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (node->upper_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (node->upper_z[i] - ray.org.z) * ray.rdir.z;
+  #endif
+
+  #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      if (K == 16)
+      {
+        /* use mixed float/int min/max */
+        const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); // largest of the per-axis near values
+        const vfloat<K> lfarP  = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); // smallest of the per-axis far values
+        const vbool<K> lhit    = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); // comparison is done on the asInt() views
+        dist = lnearP;
+        return lhit;
+      }
+      else
+  #endif
+      { // generic path; also the K != 16 path on SKX builds
+        const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); // largest of the per-axis near values
+        const vfloat<K> lfarP  = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); // smallest of the per-axis far values
+  #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+        const vbool<K> lhit    = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar));
+  #else
+        const vbool<K> lhit    = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); // hit when [max(near,tnear), min(far,tfar)] is non-empty
+  #endif
+        dist = lnearP; // entry distance before clamping against ray.tnear
+        return lhit;
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K>
+    __forceinline vbool<K> intersectNodeKRobust(const typename BVHN<N>::AABBNode* node, size_t i, // tests child box i of node against K rays
+                                               const TravRayKRobust<K>& ray, vfloat<K>& dist) // dist receives the scaled entry distance
+    {
+      // FIXME: use per instruction rounding for AVX512
+      const vfloat<K> lclipMinX = (node->lower_x[i] - ray.org.x) * ray.rdir.x; // (bound - org) * rdir on every build, no ISA-specific path
+      const vfloat<K> lclipMinY = (node->lower_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (node->lower_z[i] - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (node->upper_x[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (node->upper_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (node->upper_z[i] - ray.org.z) * ray.rdir.z;
+      const float round_up   = 1.0f+3.0f*float(ulp); // factor slightly above 1, applied to the far distance
+      const float round_down = 1.0f-3.0f*float(ulp); // factor slightly below 1, applied to the near distance
+      const vfloat<K> lnearP = round_down*max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ)); // largest per-axis near value, scaled by round_down
+      const vfloat<K> lfarP  = round_up  *min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ)); // smallest per-axis far value, scaled by round_up
+      const vbool<K> lhit   = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); // hit when the clamped interval is non-empty
+      dist = lnearP; // unlike the frustum test, the scaled value is what gets reported
+      return lhit;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNodeMB intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K>
+    __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::AABBNodeMB* node, const size_t i, // motion-blur overload: tests child box i at per-ray time
+                                         const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist) // dist receives the per-ray entry distance
+    {
+      const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i])); // bound at time t: base + t * delta, per ray
+      const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i]));
+      const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i]));
+      const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i]));
+      const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
+      const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
+
+#if defined(__aarch64__)
+      const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x); // madd with the pre-negated org*rdir
+      const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__)
+      const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x); // msub with the stored org*rdir
+      const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z);
+      const vfloat<K> lclipMaxX = msub(vupper_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMaxY = msub(vupper_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMaxZ = msub(vupper_z, ray.rdir.z, ray.org_rdir.z);
+#else
+      const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; // fallback: subtract, then multiply
+      const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z;
+#endif
+
+#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      if (K == 16)
+      {
+        /* use mixed float/int min/max */
+        const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); // largest of the per-axis near values
+        const vfloat<K> lfarP  = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); // smallest of the per-axis far values
+        const vbool<K> lhit    = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); // comparison is done on the asInt() views
+        dist = lnearP;
+        return lhit;
+      }
+      else
+#endif
+      { // generic path; also the K != 16 path on SKX builds
+        const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); // largest of the per-axis near values
+        const vfloat<K> lfarP  = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); // smallest of the per-axis far values
+#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+        const vbool<K> lhit    = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar));
+#else
+        const vbool<K> lhit    = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); // hit when [max(near,tnear), min(far,tfar)] is non-empty
+#endif
+        dist = lnearP; // entry distance before clamping against ray.tnear
+        return lhit;
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNodeMB intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K>
+    __forceinline vbool<K> intersectNodeKRobust(const typename BVHN<N>::AABBNodeMB* node, const size_t i,
+                                               const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist)
+    { /* conservative slab test of K rays against child i of a motion-blur AABB node; dist receives the entry distance */
+      const vfloat<K> boxLoX = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i]));
+      const vfloat<K> boxLoY = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i]));
+      const vfloat<K> boxLoZ = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i]));
+      const vfloat<K> boxHiX = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i]));
+      const vfloat<K> boxHiY = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
+      const vfloat<K> boxHiZ = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
+      /* per-axis slab parameters: (bound - ray.org) * ray.rdir */
+      const vfloat<K> tLoX = (boxLoX - ray.org.x) * ray.rdir.x;
+      const vfloat<K> tLoY = (boxLoY - ray.org.y) * ray.rdir.y;
+      const vfloat<K> tLoZ = (boxLoZ - ray.org.z) * ray.rdir.z;
+      const vfloat<K> tHiX = (boxHiX - ray.org.x) * ray.rdir.x;
+      const vfloat<K> tHiY = (boxHiY - ray.org.y) * ray.rdir.y;
+      const vfloat<K> tHiZ = (boxHiZ - ray.org.z) * ray.rdir.z;
+      /* the entry distance is scaled by (1 - 3 ulp) and the exit distance by (1 + 3 ulp) */
+      const float widenFar  = 1.0f+3.0f*float(ulp);
+      const float widenNear = 1.0f-3.0f*float(ulp);
+      /* for K == 16 on SKX the inner reductions use float min/max and the outer ones maxi/mini */
+#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      if (K == 16)
+      {
+        const vfloat<K> tEntry  = widenNear*maxi(min(tLoX, tHiX), min(tLoY, tHiY), min(tLoZ, tHiZ));
+        const vfloat<K> tExit   = widenFar *mini(max(tLoX, tHiX), max(tLoY, tHiY), max(tLoZ, tHiZ));
+        const vbool<K>  overlap = maxi(tEntry, ray.tnear) <= mini(tExit, ray.tfar);
+        dist = tEntry;
+        return overlap;
+      }
+      else
+#endif
+      {
+        const vfloat<K> tEntry  = widenNear*maxi(mini(tLoX, tHiX), mini(tLoY, tHiY), mini(tLoZ, tHiZ));
+        const vfloat<K> tExit   = widenFar *mini(maxi(tLoX, tHiX), maxi(tLoY, tHiY), maxi(tLoZ, tHiZ));
+        const vbool<K>  overlap = maxi(tEntry, ray.tnear) <= mini(tExit, ray.tfar);
+        dist = tEntry;
+        return overlap;
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNodeMB4D intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K>
+    __forceinline vbool<K> intersectNodeKMB4D(const typename BVHN<N>::NodeRef ref, const size_t i,
+                                             const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist)
+    {
+      const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+      /* bounds of child i at each ray's time, evaluated as time * delta + base */
+      const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i]));
+      const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i]));
+      const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i]));
+      const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i]));
+      const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
+      const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
+      /* slab parameters per axis; NOTE(review): the fused branches presumably equal (bound - org) * rdir - neg_org_rdir / org_rdir are defined outside this chunk */
+#if defined(__aarch64__)
+      const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__)
+      const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z);
+      const vfloat<K> lclipMaxX = msub(vupper_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMaxY = msub(vupper_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMaxZ = msub(vupper_z, ray.rdir.z, ray.org_rdir.z);
+#else
+      const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z;
+#endif
+      /* entry = largest per-axis minimum, exit = smallest per-axis maximum; hit when [entry, exit] overlaps [tnear, tfar] */
+      const vfloat<K> lnearP = maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ));
+      const vfloat<K> lfarP  = mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ));
+      vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+      if (unlikely(ref.isAABBNodeMB4D())) { // 4D nodes also require lower_t[i] <= time < upper_t[i]
+        const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node;
+        lhit = lhit & (vfloat<K>(node1->lower_t[i]) <= time) & (time < vfloat<K>(node1->upper_t[i]));
+      }
+      dist = lnearP; // entry distance is reported for every lane, hit or not
+      return lhit;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNodeMB4D intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K>
+    __forceinline vbool<K> intersectNodeKMB4DRobust(const typename BVHN<N>::NodeRef ref, const size_t i,
+                                                    const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist)
+    {
+      const typename BVHN<N>::AABBNodeMB* mbNode = ref.getAABBNodeMB();
+      /* box of child i at each ray's time: time * delta + base */
+      const vfloat<K> boxLoX = madd(time, vfloat<K>(mbNode->lower_dx[i]), vfloat<K>(mbNode->lower_x[i]));
+      const vfloat<K> boxLoY = madd(time, vfloat<K>(mbNode->lower_dy[i]), vfloat<K>(mbNode->lower_y[i]));
+      const vfloat<K> boxLoZ = madd(time, vfloat<K>(mbNode->lower_dz[i]), vfloat<K>(mbNode->lower_z[i]));
+      const vfloat<K> boxHiX = madd(time, vfloat<K>(mbNode->upper_dx[i]), vfloat<K>(mbNode->upper_x[i]));
+      const vfloat<K> boxHiY = madd(time, vfloat<K>(mbNode->upper_dy[i]), vfloat<K>(mbNode->upper_y[i]));
+      const vfloat<K> boxHiZ = madd(time, vfloat<K>(mbNode->upper_dz[i]), vfloat<K>(mbNode->upper_z[i]));
+      /* per-axis slab parameters: (bound - ray.org) * ray.rdir */
+      const vfloat<K> tLoX = (boxLoX - ray.org.x) * ray.rdir.x;
+      const vfloat<K> tLoY = (boxLoY - ray.org.y) * ray.rdir.y;
+      const vfloat<K> tLoZ = (boxLoZ - ray.org.z) * ray.rdir.z;
+      const vfloat<K> tHiX = (boxHiX - ray.org.x) * ray.rdir.x;
+      const vfloat<K> tHiY = (boxHiY - ray.org.y) * ray.rdir.y;
+      const vfloat<K> tHiZ = (boxHiZ - ray.org.z) * ray.rdir.z;
+      /* entry distance is scaled by (1 - 3 ulp), exit distance by (1 + 3 ulp) */
+      const float widenFar  = 1.0f+3.0f*float(ulp);
+      const float widenNear = 1.0f-3.0f*float(ulp);
+      const vfloat<K> tEntry = widenNear*maxi(maxi(mini(tLoX, tHiX), mini(tLoY, tHiY)), mini(tLoZ, tHiZ));
+      const vfloat<K> tExit  = widenFar *mini(mini(maxi(tLoX, tHiX), maxi(tLoY, tHiY)), maxi(tLoZ, tHiZ));
+      vbool<K> overlap = maxi(tEntry, ray.tnear) <= mini(tExit, ray.tfar);
+      /* 4D nodes additionally require lower_t[i] <= time < upper_t[i] */
+      if (unlikely(ref.isAABBNodeMB4D())) {
+        const typename BVHN<N>::AABBNodeMB4D* node4D = (const typename BVHN<N>::AABBNodeMB4D*) mbNode;
+        overlap = overlap & (vfloat<K>(node4D->lower_t[i]) <= time) & (time < vfloat<K>(node4D->upper_t[i]));
+      }
+      dist = tEntry;
+      return overlap;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // OBBNode intersection (fast and robust, selected by the 'robust' template flag)
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K, bool robust>
+    __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::OBBNode* node, const size_t i,
+                                          const TravRayK<K,robust>& ray, vfloat<K>& dist)
+    {
+      const AffineSpace3vf<K> boxSpace(Vec3f(node->naabb.l.vx.x[i], node->naabb.l.vx.y[i], node->naabb.l.vx.z[i]),
+                                       Vec3f(node->naabb.l.vy.x[i], node->naabb.l.vy.y[i], node->naabb.l.vy.z[i]),
+                                       Vec3f(node->naabb.l.vz.x[i], node->naabb.l.vz.y[i], node->naabb.l.vz.z[i]),
+                                       Vec3f(node->naabb.p   .x[i], node->naabb.p   .y[i], node->naabb.p   .z[i]));
+      /* move the rays into the space of child i, where the box spans 0..1 on each axis */
+      const Vec3vf<K> localDir = xfmVector(boxSpace, ray.dir);
+      const Vec3vf<K> negInvDir = Vec3vf<K>(vfloat<K>(-1.0f)) * rcp_safe(localDir); // FIXME: negate instead of mul with -1?
+      const Vec3vf<K> localOrg = xfmPoint(boxSpace, ray.org);
+      /* slab parameters against the planes at 0 and at 1 */
+      const vfloat<K> tLoX = localOrg.x * negInvDir.x; // == (0 - org) * rdir
+      const vfloat<K> tLoY = localOrg.y * negInvDir.y;
+      const vfloat<K> tLoZ = localOrg.z * negInvDir.z;
+      const vfloat<K> tHiX = tLoX - negInvDir.x;       // == (1 - org) * rdir
+      const vfloat<K> tHiY = tLoY - negInvDir.y;
+      const vfloat<K> tHiZ = tLoZ - negInvDir.z;
+      /* entry/exit distances; the robust flavour rescales them by (1 -/+ 3 ulp) */
+      vfloat<K> tEntry = maxi(mini(tLoX, tHiX), mini(tLoY, tHiY), mini(tLoZ, tHiZ));
+      vfloat<K> tExit  = mini(maxi(tLoX, tHiX), maxi(tLoY, tHiY), maxi(tLoZ, tHiZ));
+      if (robust) {
+        tEntry = tEntry*vfloat<K>(1.0f-3.0f*float(ulp));
+        tExit  = tExit *vfloat<K>(1.0f+3.0f*float(ulp));
+      }
+      const vbool<K> overlap = maxi(tEntry, ray.tnear) <= mini(tExit, ray.tfar);
+      dist = tEntry;
+      return overlap;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // OBBNodeMB intersection (fast and robust, selected by the 'robust' template flag)
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K, bool robust>
+    __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::OBBNodeMB* node, const size_t i,
+                                          const TravRayK<K,robust>& ray, const vfloat<K>& time, vfloat<K>& dist)
+    {
+      const AffineSpace3vf<K> space(Vec3f(node->space0.l.vx.x[i], node->space0.l.vx.y[i], node->space0.l.vx.z[i]),
+                                    Vec3f(node->space0.l.vy.x[i], node->space0.l.vy.y[i], node->space0.l.vy.z[i]),
+                                    Vec3f(node->space0.l.vz.x[i], node->space0.l.vz.y[i], node->space0.l.vz.z[i]),
+                                    Vec3f(node->space0.p   .x[i], node->space0.p   .y[i], node->space0.p   .z[i]));
+      /* box in the space0 frame: lerp by time between the fixed box (zero..one) and the node's b1 box */
+      const Vec3vf<K> unitLo = zero;
+      const Vec3vf<K> unitHi = one;
+      const Vec3vf<K> endLo(node->b1.lower.x[i], node->b1.lower.y[i], node->b1.lower.z[i]);
+      const Vec3vf<K> endHi(node->b1.upper.x[i], node->b1.upper.y[i], node->b1.upper.z[i]);
+      const Vec3vf<K> boxLo = lerp(unitLo, endLo, time);
+      const Vec3vf<K> boxHi = lerp(unitHi, endHi, time);
+      /* rays expressed in the same frame */
+      const Vec3vf<K> localDir = xfmVector(space, ray.dir);
+      const Vec3vf<K> invDir = rcp_safe(localDir);
+      const Vec3vf<K> localOrg = xfmPoint(space, ray.org);
+      /* per-axis slab parameters: (bound - origin) * reciprocal direction */
+      const vfloat<K> tLoX = (boxLo.x - localOrg.x) * invDir.x;
+      const vfloat<K> tLoY = (boxLo.y - localOrg.y) * invDir.y;
+      const vfloat<K> tLoZ = (boxLo.z - localOrg.z) * invDir.z;
+      const vfloat<K> tHiX = (boxHi.x - localOrg.x) * invDir.x;
+      const vfloat<K> tHiY = (boxHi.y - localOrg.y) * invDir.y;
+      const vfloat<K> tHiZ = (boxHi.z - localOrg.z) * invDir.z;
+      /* entry/exit distances; the robust flavour rescales them by (1 -/+ 3 ulp) */
+      vfloat<K> tEntry = maxi(mini(tLoX, tHiX), mini(tLoY, tHiY), mini(tLoZ, tHiZ));
+      vfloat<K> tExit  = mini(maxi(tLoX, tHiX), maxi(tLoY, tHiY), maxi(tLoZ, tHiZ));
+      if (robust) {
+        tEntry = tEntry*vfloat<K>(1.0f-3.0f*float(ulp));
+        tExit  = tExit *vfloat<K>(1.0f+3.0f*float(ulp));
+      }
+      /* hit when [tEntry, tExit] overlaps the ray interval [tnear, tfar] */
+      const vbool<K> overlap = maxi(tEntry, ray.tnear) <= mini(tExit, ray.tfar);
+      dist = tEntry;
+      return overlap;
+    }
+
+
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // QuantizedBaseNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K>
+    __forceinline vbool<K> intersectQuantizedNodeK(const typename BVHN<N>::QuantizedBaseNode* node, size_t i,
+                                                   const TravRayK<K,false>& ray, vfloat<K>& dist)
+    /* fast (robust == false) test of K rays against child i of a quantized node; dist receives the entry distance */
+    {
+      assert(movemask(node->validMask()) & ((size_t)1 << i)); // child i must be set in the node's valid mask
+      const vfloat<N> lower_x = node->dequantizeLowerX();
+      const vfloat<N> upper_x = node->dequantizeUpperX();
+      const vfloat<N> lower_y = node->dequantizeLowerY();
+      const vfloat<N> upper_y = node->dequantizeUpperY();
+      const vfloat<N> lower_z = node->dequantizeLowerZ();
+      const vfloat<N> upper_z = node->dequantizeUpperZ();
+      /* bounds are dequantized N-wide above; only element i is used below */
+  #if defined(__aarch64__)
+      const vfloat<K> lclipMinX = madd(lower_x[i], ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMinY = madd(lower_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMinZ = madd(lower_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> lclipMaxX = madd(upper_x[i], ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMaxY = madd(upper_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMaxZ = madd(upper_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+  #elif defined(__AVX2__)
+      const vfloat<K> lclipMinX = msub(lower_x[i], ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMinY = msub(lower_y[i], ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMinZ = msub(lower_z[i], ray.rdir.z, ray.org_rdir.z);
+      const vfloat<K> lclipMaxX = msub(upper_x[i], ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMaxY = msub(upper_y[i], ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMaxZ = msub(upper_z[i], ray.rdir.z, ray.org_rdir.z);
+  #else
+      const vfloat<K> lclipMinX = (lower_x[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMinY = (lower_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (lower_z[i] - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (upper_x[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (upper_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (upper_z[i] - ray.org.z) * ray.rdir.z;
+  #endif
+      /* on SKX the final interval comparison is done on the values reinterpreted via asInt */
+  #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      if (K == 16)
+      {
+        /* use mixed float/int min/max */
+        const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+        const vfloat<K> lfarP  = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+        const vbool<K> lhit    = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar));
+        dist = lnearP;
+        return lhit;
+      }
+      else
+  #endif
+      {
+        const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ));
+        const vfloat<K> lfarP  = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ));
+  #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+        const vbool<K> lhit    = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar));
+  #else
+        const vbool<K> lhit    = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+  #endif
+        dist = lnearP; // entry distance is written for every lane, hit or not
+        return lhit;
+      }
+    }
+
+    template<int N, int K>
+    __forceinline vbool<K> intersectQuantizedNodeK(const typename BVHN<N>::QuantizedBaseNode* node, size_t i,
+                                                   const TravRayK<K,true>& ray, vfloat<K>& dist)
+    /* robust (robust == true) test of K rays against child i of a quantized node; dist receives the entry distance */
+    {
+      assert(movemask(node->validMask()) & ((size_t)1 << i));
+      const vfloat<N> allLoX = node->dequantizeLowerX();
+      const vfloat<N> allHiX = node->dequantizeUpperX();
+      const vfloat<N> allLoY = node->dequantizeLowerY();
+      const vfloat<N> allHiY = node->dequantizeUpperY();
+      const vfloat<N> allLoZ = node->dequantizeLowerZ();
+      const vfloat<N> allHiZ = node->dequantizeUpperZ();
+      /* only element i of the N-wide dequantized bounds is used */
+      const vfloat<K> tLoX = (allLoX[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> tLoY = (allLoY[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> tLoZ = (allLoZ[i] - ray.org.z) * ray.rdir.z;
+      const vfloat<K> tHiX = (allHiX[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> tHiY = (allHiY[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> tHiZ = (allHiZ[i] - ray.org.z) * ray.rdir.z;
+      /* entry distance is scaled by (1 - 3 ulp), exit distance by (1 + 3 ulp) */
+      const float widenFar  = 1.0f+3.0f*float(ulp);
+      const float widenNear = 1.0f-3.0f*float(ulp);
+      /* this variant uses the float min/max reductions throughout */
+      const vfloat<K> tEntry  = widenNear*max(min(tLoX, tHiX), min(tLoY, tHiY), min(tLoZ, tHiZ));
+      const vfloat<K> tExit   = widenFar *min(max(tLoX, tHiX), max(tLoY, tHiY), max(tLoZ, tHiZ));
+      const vbool<K>  overlap = max(tEntry, ray.tnear) <= min(tExit, ray.tfar);
+      dist = tEntry;
+      return overlap;
+    }
+
+    template<int N, int K>
+      __forceinline vbool<K> intersectQuantizedNodeMBK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i,
+          const TravRayK<K,false>& ray, const vfloat<K>& time, vfloat<K>& dist)
+    /* fast (robust == false) test of K rays against child i of a quantized motion-blur node; dist receives the entry distance */
+    {
+        assert(movemask(node->validMask()) & ((size_t)1 << i)); // child i must be set in the node's valid mask
+        /* bounds of child i dequantized per ray, using each ray's time */
+        const vfloat<K> lower_x = node->dequantizeLowerX(i,time);
+        const vfloat<K> upper_x = node->dequantizeUpperX(i,time);
+        const vfloat<K> lower_y = node->dequantizeLowerY(i,time);
+        const vfloat<K> upper_y = node->dequantizeUpperY(i,time);
+        const vfloat<K> lower_z = node->dequantizeLowerZ(i,time);
+        const vfloat<K> upper_z = node->dequantizeUpperZ(i,time);
+        /* per-axis slab parameters; fused multiply-add/sub forms are used on aarch64 and AVX2 */
+#if defined(__aarch64__)
+        const vfloat<K> lclipMinX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+        const vfloat<K> lclipMinY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+        const vfloat<K> lclipMinZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+        const vfloat<K> lclipMaxX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+        const vfloat<K> lclipMaxY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+        const vfloat<K> lclipMaxZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__)
+        const vfloat<K> lclipMinX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+        const vfloat<K> lclipMinY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+        const vfloat<K> lclipMinZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+        const vfloat<K> lclipMaxX = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+        const vfloat<K> lclipMaxY = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+        const vfloat<K> lclipMaxZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#else
+        const vfloat<K> lclipMinX = (lower_x - ray.org.x) * ray.rdir.x;
+        const vfloat<K> lclipMinY = (lower_y - ray.org.y) * ray.rdir.y;
+        const vfloat<K> lclipMinZ = (lower_z - ray.org.z) * ray.rdir.z;
+        const vfloat<K> lclipMaxX = (upper_x - ray.org.x) * ray.rdir.x;
+        const vfloat<K> lclipMaxY = (upper_y - ray.org.y) * ray.rdir.y;
+        const vfloat<K> lclipMaxZ = (upper_z - ray.org.z) * ray.rdir.z;
+  #endif
+        const vfloat<K> lnearP = max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); // latest entry over the three axes
+        const vfloat<K> lfarP  = min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); // earliest exit over the three axes
+        const vbool<K> lhit    = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); // hit when [entry, exit] overlaps [tnear, tfar]
+        dist = lnearP;
+        return lhit;
+      }
+
+
+    template<int N, int K>
+    __forceinline vbool<K> intersectQuantizedNodeMBK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i,
+                                                     const TravRayK<K,true>& ray, const vfloat<K>& time, vfloat<K>& dist)
+    /* robust (robust == true) test of K rays against child i of a quantized motion-blur node; dist receives the entry distance */
+    {
+      assert(movemask(node->validMask()) & ((size_t)1 << i));
+      /* bounds of child i dequantized per ray, using each ray's time */
+      const vfloat<K> boxLoX = node->dequantizeLowerX(i,time);
+      const vfloat<K> boxHiX = node->dequantizeUpperX(i,time);
+      const vfloat<K> boxLoY = node->dequantizeLowerY(i,time);
+      const vfloat<K> boxHiY = node->dequantizeUpperY(i,time);
+      const vfloat<K> boxLoZ = node->dequantizeLowerZ(i,time);
+      const vfloat<K> boxHiZ = node->dequantizeUpperZ(i,time);
+      /* per-axis slab parameters: (bound - ray.org) * ray.rdir */
+      const vfloat<K> tLoX = (boxLoX - ray.org.x) * ray.rdir.x;
+      const vfloat<K> tLoY = (boxLoY - ray.org.y) * ray.rdir.y;
+      const vfloat<K> tLoZ = (boxLoZ - ray.org.z) * ray.rdir.z;
+      const vfloat<K> tHiX = (boxHiX - ray.org.x) * ray.rdir.x;
+      const vfloat<K> tHiY = (boxHiY - ray.org.y) * ray.rdir.y;
+      const vfloat<K> tHiZ = (boxHiZ - ray.org.z) * ray.rdir.z;
+      /* entry distance is scaled by (1 - 3 ulp), exit distance by (1 + 3 ulp) */
+      const float widenFar  = 1.0f+3.0f*float(ulp);
+      const float widenNear = 1.0f-3.0f*float(ulp);
+      /* this variant uses the float min/max reductions throughout */
+      const vfloat<K> tEntry  = widenNear*max(min(tLoX, tHiX), min(tLoY, tHiY), min(tLoZ, tHiZ));
+      const vfloat<K> tExit   = widenFar *min(max(tLoX, tHiX), max(tLoY, tHiY), max(tLoZ, tHiZ));
+      const vbool<K>  overlap = max(tEntry, ray.tnear) <= min(tExit, ray.tfar);
+      dist = tEntry;
+      return overlap;
+    }
+
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Node intersectors used in hybrid traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    /*! Intersects child i of an N-wide node with K rays; specialized below per node-type mask ('types') and per 'robust' flag */
+    template<int N, int K, int types, bool robust>
+    struct BVHNNodeIntersectorK;
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN1, false> // fast variant; every node is read as an AABBNode
+    {
+      /* vmask is both an input and an output parameter! Its initial value should be the parent node
+         hit mask, which is used for correctly computing the current hit mask. The parent hit mask
+         is actually required only for motion blur node intersections (because different rays may
+         have different times), so for regular nodes vmask is simply overwritten. */
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = intersectNodeK<N,K>(node.getAABBNode(), i, ray, dist); // 'time' is not used here
+        return true; // always reports true; the per-ray result is in vmask
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN1, true> // robust variant; every node is read as an AABBNode
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = intersectNodeKRobust<N,K>(node.getAABBNode(), i, ray, dist); // overwrites vmask; 'time' is not used here
+        return true; // always reports true; the per-ray result is in vmask
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2, false> // fast variant; every node is read as an AABBNodeMB
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = intersectNodeK<N,K>(node.getAABBNodeMB(), i, ray, time, dist); // overwrites vmask; bounds are evaluated at each ray's time
+        return true; // always reports true; the per-ray result is in vmask
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2, true> // robust variant; every node is read as an AABBNodeMB
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = intersectNodeKRobust<N,K>(node.getAABBNodeMB(), i, ray, time, dist); // overwrites vmask; bounds are evaluated at each ray's time
+        return true; // always reports true; the per-ray result is in vmask
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN1_UN1, false> // fast variant for AABBNode / OBBNode mixes
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = likely(node.isAABBNode()) ? intersectNodeK<N,K>(node.getAABBNode(), i, ray, dist)    /* common case: aligned node */
+                                          : intersectNodeK<N,K>(node.ungetAABBNode(), i, ray, dist); /* anything else is assumed to be an OBBNode */
+        return true; // vmask is overwritten; 'time' is not used
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN1_UN1, true> // robust variant for AABBNode / OBBNode mixes
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = likely(node.isAABBNode()) ? intersectNodeKRobust<N,K>(node.getAABBNode(), i, ray, dist) /* common case: aligned node */
+                                          : intersectNodeK<N,K>(node.ungetAABBNode(), i, ray, dist);    /* anything else is assumed to be an OBBNode */
+        return true; // vmask is overwritten; 'time' is not used
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2_UN2, false> // fast variant for AABBNodeMB / OBBNodeMB mixes
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = likely(node.isAABBNodeMB()) ? intersectNodeK<N,K>(node.getAABBNodeMB(), i, ray, time, dist)    /* common case: aligned MB node */
+                                            : intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist); /* anything else is assumed to be an OBBNodeMB */
+        return true; // vmask is overwritten
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2_UN2, true> // robust variant for AABBNodeMB / OBBNodeMB mixes
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = likely(node.isAABBNodeMB()) ? intersectNodeKRobust<N,K>(node.getAABBNodeMB(), i, ray, time, dist) /* common case: aligned MB node */
+                                            : intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist);    /* anything else is assumed to be an OBBNodeMB */
+        return true; // vmask is overwritten
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D, false> // fast variant; forwards the NodeRef to intersectNodeKMB4D
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask &= intersectNodeKMB4D<N,K>(node, i, ray, time, dist); // result is ANDed into the incoming vmask, not assigned
+        return true; // always reports true; the per-ray result is in vmask
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D, true> // robust variant; forwards the NodeRef to intersectNodeKMB4DRobust
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask &= intersectNodeKMB4DRobust<N,K>(node, i, ray, time, dist); // result is ANDed into the incoming vmask, not assigned
+        return true; // always reports true; the per-ray result is in vmask
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D_UN2, false> // fast variant for AABBNodeMB / AABBNodeMB4D / OBBNodeMB mixes
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        if (likely(node.isAABBNodeMB() || node.isAABBNodeMB4D())) { // aligned MB nodes, with or without a time range
+          vmask &= intersectNodeKMB4D<N,K>(node, i, ray, time, dist);
+          return true;
+        }
+        assert(node.isOBBNodeMB()); // the only remaining node kind expected here
+        vmask &= intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist);
+        return true;
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D_UN2, true> // robust variant for AABBNodeMB / AABBNodeMB4D / OBBNodeMB mixes
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        if (likely(node.isAABBNodeMB() || node.isAABBNodeMB4D())) { // aligned MB nodes, with or without a time range
+          vmask &= intersectNodeKMB4DRobust<N,K>(node, i, ray, time, dist);
+          return true;
+        }
+        assert(node.isOBBNodeMB()); // the only remaining node kind expected here
+        vmask &= intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist);
+        return true;
+      }
+    };
+
+
+    /*! Intersects child i of an N-wide quantized node with K rays; specialized below for robust == false and robust == true */
+    template<int N, int K, bool robust>
+    struct BVHNQuantizedBaseNodeIntersectorK;
+
+    template<int N, int K>
+    struct BVHNQuantizedBaseNodeIntersectorK<N, K, false> // fast variant: both overloads forward to the TravRayK<K,false> free functions
+    {
+      static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNode* node, const size_t i,
+                                              const TravRayK<K,false>& ray, vfloat<K>& dist)
+      {
+        return intersectQuantizedNodeK<N,K>(node,i,ray,dist);
+      }
+      /* motion-blur overload: additionally takes the per-ray time */
+      static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i,
+                                               const TravRayK<K,false>& ray, const vfloat<K>& time, vfloat<K>& dist)
+      {
+        return intersectQuantizedNodeMBK<N,K>(node,i,ray,time,dist);
+      }
+
+    };
+
+    template<int N, int K>
+    struct BVHNQuantizedBaseNodeIntersectorK<N, K, true> // robust variant: both overloads forward to the TravRayK<K,true> free functions
+    {
+      static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNode* node, const size_t i,
+                                               const TravRayK<K,true>& ray, vfloat<K>& dist)
+      {
+        return intersectQuantizedNodeK<N,K>(node,i,ray,dist);
+      }
+      /* motion-blur overload: additionally takes the per-ray time */
+      static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i,
+          const TravRayK<K,true>& ray, const vfloat<K>& time, vfloat<K>& dist)
+      {
+        return intersectQuantizedNodeMBK<N,K>(node,i,ray,time,dist);
+      }
+    };
+
+
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h
new file mode 100644
index 0000000000..f379b57aea
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h
@@ -0,0 +1,215 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Ray packet structure used in stream traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int K, bool robust>
+    struct TravRayKStream;
+
+    /* Fast variant */
+    template<int K> // robust == false: stores the ray as reciprocal direction plus origin*rdir
+    struct TravRayKStream<K, false>
+    {
+      __forceinline TravRayKStream() {} // assigns no member; callers fill the packet via init() and tnear/tfar
+
+      __forceinline TravRayKStream(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar)
+      {
+        init(ray_org, ray_dir);
+        tnear = ray_tnear;
+        tfar = ray_tfar;
+      }
+
+      __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir) // sets only the direction/origin terms; tnear and tfar are left untouched
+      {
+        rdir = rcp_safe(ray_dir);
+#if defined(__aarch64__)
+        neg_org_rdir = -(ray_org * rdir); // kept negated: the aarch64 paths of intersectNode1/intersectNodeK combine it with madd
+#else
+        org_rdir = ray_org * rdir; // the non-aarch64 paths of intersectNode1/intersectNodeK combine it with msub
+#endif
+      }
+
+      Vec3vf<K> rdir; // rcp_safe(ray_dir), see init()
+#if defined(__aarch64__)
+      Vec3vf<K> neg_org_rdir; // -(ray_org * rdir)
+#else
+      Vec3vf<K> org_rdir; // ray_org * rdir
+#endif
+      vfloat<K> tnear; // lower clamp applied to the slab entry distance in the node tests
+      vfloat<K> tfar; // upper clamp applied to the slab exit distance in the node tests
+    };
+
+    template<int K>
+    using TravRayKStreamFast = TravRayKStream<K, false>;
+
+    /* Robust variant */
+    template<int K> // robust == true: keeps the origin itself instead of a precomputed origin*rdir product
+    struct TravRayKStream<K, true>
+    {
+      __forceinline TravRayKStream() {} // assigns no member; callers fill the packet via init() and tnear/tfar
+
+      __forceinline TravRayKStream(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar)
+      {
+        init(ray_org, ray_dir);
+        tnear = ray_tnear;
+        tfar = ray_tfar;
+      }
+
+      __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir) // sets rdir and org only; tnear and tfar are left untouched
+      {
+        rdir = vfloat<K>(1.0f)/(zero_fix(ray_dir)); // direction is passed through zero_fix (defined elsewhere) before the division
+        org = ray_org;
+      }
+
+      Vec3vf<K> rdir; // 1 / zero_fix(ray_dir), see init()
+      Vec3vf<K> org; // ray origin, subtracted from the box planes in the robust node tests
+      vfloat<K> tnear; // lower clamp applied to the slab entry distance in the node tests
+      vfloat<K> tfar; // upper clamp applied to the slab exit distance in the node tests
+    };
+
+    template<int K>
+    using TravRayKStreamRobust = TravRayKStream<K, true>;
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int Nx, int K> // fast variant: tests ray k of the packet against the children of one AABBNode
+    __forceinline size_t intersectNode1(const typename BVHN<N>::AABBNode* __restrict__ node,
+                                        const TravRayKStreamFast<K>& ray, size_t k, const NearFarPrecalculations& nf)
+    {
+      const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); // nf.* are used as byte offsets relative to node->lower_x
+      const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
+      const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
+      const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
+      const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
+      const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
+      // per-axis slab distances for ray k; presumably madd(a,b,c) = a*b+c and msub(a,b,c) = a*b-c (helpers defined elsewhere)
+#if defined (__aarch64__)
+      const vfloat<Nx> rminX = madd(bminX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.neg_org_rdir.x[k]));
+      const vfloat<Nx> rminY = madd(bminY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.neg_org_rdir.y[k]));
+      const vfloat<Nx> rminZ = madd(bminZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.neg_org_rdir.z[k]));
+      const vfloat<Nx> rmaxX = madd(bmaxX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.neg_org_rdir.x[k]));
+      const vfloat<Nx> rmaxY = madd(bmaxY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.neg_org_rdir.y[k]));
+      const vfloat<Nx> rmaxZ = madd(bmaxZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.neg_org_rdir.z[k]));
+#else
+      const vfloat<Nx> rminX = msub(bminX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.org_rdir.x[k]));
+      const vfloat<Nx> rminY = msub(bminY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.org_rdir.y[k]));
+      const vfloat<Nx> rminZ = msub(bminZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.org_rdir.z[k]));
+      const vfloat<Nx> rmaxX = msub(bmaxX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.org_rdir.x[k]));
+      const vfloat<Nx> rmaxY = msub(bmaxY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.org_rdir.y[k]));
+      const vfloat<Nx> rmaxZ = msub(bmaxZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.org_rdir.z[k]));
+#endif
+      const vfloat<Nx> rmin  = maxi(rminX, rminY, rminZ, vfloat<Nx>(ray.tnear[k])); // entry distance, bounded below by the ray's tnear
+      const vfloat<Nx> rmax  = mini(rmaxX, rmaxY, rmaxZ, vfloat<Nx>(ray.tfar[k])); // exit distance, bounded above by the ray's tfar
+
+      const vbool<Nx> vmask_first_hit = rmin <= rmax;
+      // returned as an integer mask of the lanes with rmin <= rmax; only the low N bits are kept
+      return movemask(vmask_first_hit) & (((size_t)1 << N)-1);
+    }
+
+    template<int N, int K> // fast variant: tests all K rays of the packet against child i of one AABBNode
+    __forceinline size_t intersectNodeK(const typename BVHN<N>::AABBNode* __restrict__ node, size_t i,
+                                        const TravRayKStreamFast<K>& ray, const NearFarPrecalculations& nf)
+    {
+      const char* ptr = (const char*)&node->lower_x + i*sizeof(float); // float slot of child i; kept const because node is const
+      const vfloat<K> bminX = *(const float*)(ptr + nf.nearX); // nf.* are used as byte offsets relative to node->lower_x
+      const vfloat<K> bminY = *(const float*)(ptr + nf.nearY);
+      const vfloat<K> bminZ = *(const float*)(ptr + nf.nearZ);
+      const vfloat<K> bmaxX = *(const float*)(ptr + nf.farX);
+      const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY);
+      const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ);
+      // per-axis slab distances for every ray; presumably madd(a,b,c) = a*b+c and msub(a,b,c) = a*b-c (helpers defined elsewhere)
+#if defined (__aarch64__)
+      const vfloat<K> rminX = madd(bminX, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> rminY = madd(bminY, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> rminZ = madd(bminZ, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> rmaxX = madd(bmaxX, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> rmaxY = madd(bmaxY, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> rmaxZ = madd(bmaxZ, ray.rdir.z, ray.neg_org_rdir.z);
+#else
+      const vfloat<K> rminX = msub(bminX, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> rminY = msub(bminY, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> rminZ = msub(bminZ, ray.rdir.z, ray.org_rdir.z);
+      const vfloat<K> rmaxX = msub(bmaxX, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> rmaxY = msub(bmaxY, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> rmaxZ = msub(bmaxZ, ray.rdir.z, ray.org_rdir.z);
+#endif
+
+      const vfloat<K> rmin  = maxi(rminX, rminY, rminZ, ray.tnear); // entry distance, bounded below by tnear
+      const vfloat<K> rmax  = mini(rmaxX, rmaxY, rmaxZ, ray.tfar); // exit distance, bounded above by tfar
+
+      const vbool<K> vmask_first_hit = rmin <= rmax;
+      // returned as an integer mask of the rays with rmin <= rmax
+      return movemask(vmask_first_hit);
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int Nx, int K> // robust variant: tests ray k of the packet against the children of one AABBNode
+    __forceinline size_t intersectNode1(const typename BVHN<N>::AABBNode* __restrict__ node,
+                                        const TravRayKStreamRobust<K>& ray, size_t k, const NearFarPrecalculations& nf)
+    {
+      const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); // nf.* are used as byte offsets relative to node->lower_x
+      const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
+      const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
+      const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
+      const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
+      const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
+      // slab distances written as (plane - org) * rdir, i.e. without a precomputed org*rdir product
+      const vfloat<Nx> rminX = (bminX - vfloat<Nx>(ray.org.x[k])) * vfloat<Nx>(ray.rdir.x[k]);
+      const vfloat<Nx> rminY = (bminY - vfloat<Nx>(ray.org.y[k])) * vfloat<Nx>(ray.rdir.y[k]);
+      const vfloat<Nx> rminZ = (bminZ - vfloat<Nx>(ray.org.z[k])) * vfloat<Nx>(ray.rdir.z[k]);
+      const vfloat<Nx> rmaxX = (bmaxX - vfloat<Nx>(ray.org.x[k])) * vfloat<Nx>(ray.rdir.x[k]);
+      const vfloat<Nx> rmaxY = (bmaxY - vfloat<Nx>(ray.org.y[k])) * vfloat<Nx>(ray.rdir.y[k]);
+      const vfloat<Nx> rmaxZ = (bmaxZ - vfloat<Nx>(ray.org.z[k])) * vfloat<Nx>(ray.rdir.z[k]);
+      const float round_up = 1.0f+3.0f*float(ulp); // FIXME: use per instruction rounding for AVX512
+      const vfloat<Nx> rmin  =            max(rminX, rminY, rminZ, vfloat<Nx>(ray.tnear[k])); // entry distance, bounded below by the ray's tnear
+      const vfloat<Nx> rmax  = round_up  *min(rmaxX, rmaxY, rmaxZ, vfloat<Nx>(ray.tfar[k])); // exit distance, bounded above by tfar, then scaled by round_up
+
+      const vbool<Nx> vmask_first_hit = rmin <= rmax;
+      // returned as an integer mask of the lanes with rmin <= rmax; only the low N bits are kept
+      return movemask(vmask_first_hit) & (((size_t)1 << N)-1);
+    }
+
+    template<int N, int K> // robust variant: tests all K rays of the packet against child i of one AABBNode
+    __forceinline size_t intersectNodeK(const typename BVHN<N>::AABBNode* __restrict__ node, size_t i,
+                                        const TravRayKStreamRobust<K>& ray, const NearFarPrecalculations& nf)
+    {
+      const char* ptr = (const char*)&node->lower_x + i*sizeof(float); // float slot of child i; kept const because node is const
+      const vfloat<K> bminX = *(const float*)(ptr + nf.nearX); // nf.* are used as byte offsets relative to node->lower_x
+      const vfloat<K> bminY = *(const float*)(ptr + nf.nearY);
+      const vfloat<K> bminZ = *(const float*)(ptr + nf.nearZ);
+      const vfloat<K> bmaxX = *(const float*)(ptr + nf.farX);
+      const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY);
+      const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ);
+      // slab distances written as (plane - org) * rdir, i.e. without a precomputed org*rdir product
+      const vfloat<K> rminX = (bminX - ray.org.x) * ray.rdir.x;
+      const vfloat<K> rminY = (bminY - ray.org.y) * ray.rdir.y;
+      const vfloat<K> rminZ = (bminZ - ray.org.z) * ray.rdir.z;
+      const vfloat<K> rmaxX = (bmaxX - ray.org.x) * ray.rdir.x;
+      const vfloat<K> rmaxY = (bmaxY - ray.org.y) * ray.rdir.y;
+      const vfloat<K> rmaxZ = (bmaxZ - ray.org.z) * ray.rdir.z;
+
+      const float round_up  = 1.0f+3.0f*float(ulp); // scale factor applied to the exit distance below
+      const vfloat<K> rmin  =            max(rminX, rminY, rminZ, vfloat<K>(ray.tnear)); // entry distance, bounded below by tnear
+      const vfloat<K> rmax  = round_up * min(rmaxX, rmaxY, rmaxZ, vfloat<K>(ray.tfar)); // exit distance, bounded above by tfar, then scaled
+
+      const vbool<K> vmask_first_hit = rmin <= rmax;
+      // returned as an integer mask of the rays with rmin <= rmax
+      return movemask(vmask_first_hit);
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/accel.h b/thirdparty/embree-aarch64/kernels/common/accel.h
new file mode 100644
index 0000000000..c038d3cf21
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/accel.h
@@ -0,0 +1,556 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "ray.h"
+#include "point_query.h"
+#include "context.h"
+
+namespace embree
+{
+  class Scene;
+
+  /*! Base class for the acceleration structure data: holds linear bounds and a type tag. */
+  class AccelData : public RefCount
+  {
+    ALIGNED_CLASS_(16);
+  public:
+    enum Type { TY_UNKNOWN = 0, TY_ACCELN = 1, TY_ACCEL_INSTANCE = 2, TY_BVH4 = 3, TY_BVH8 = 4 };
+
+  public:
+    AccelData (const Type type) // bounds start out as `empty`
+      : bounds(empty), type(type) {}
+
+    /*! notifies the acceleration structure about the deletion of some geometry (does nothing unless overridden) */
+    virtual void deleteGeometry(size_t geomID) {}
+
+    /*! clears the acceleration structure data */
+    virtual void clear() = 0;
+
+    /*! returns normal bounds */
+    __forceinline BBox3fa getBounds() const {
+      return bounds.bounds();
+    }
+
+    /*! returns bounds for some time */
+    __forceinline BBox3fa getBounds(float t) const {
+      return bounds.interpolate(t);
+    }
+
+    /*! returns linear bounds */
+    __forceinline LBBox3fa getLinearBounds() const {
+      return bounds;
+    }
+
+    /*! checks if acceleration structure is empty (lower.x of bounds0 equals pos_inf) */
+    __forceinline bool isEmpty() const {
+      return bounds.bounds0.lower.x == float(pos_inf);
+    }
+
+  public:
+    LBBox3fa bounds; // linear bounds
+    Type type;       // tag passed to the constructor
+  };
+
+  /*! Base class for all intersectable and buildable acceleration structures. */
+  class Accel : public AccelData
+  {
+     ALIGNED_CLASS_(16);
+  public:
+
+    struct Intersectors;
+
+    /*! Type of collide function */
+    typedef void (*CollideFunc)(void* bvh0, void* bvh1, RTCCollideFunc callback, void* userPtr);
+
+    /*! Type of point query function */
+    typedef bool(*PointQueryFunc)(Intersectors* This,          /*!< this pointer to accel */
+                                  PointQuery* query,        /*!< point query for lookup */
+                                  PointQueryContext* context); /*!< point query context */
+
+    /*! Type of intersect function pointer for single rays. */
+    typedef void (*IntersectFunc)(Intersectors* This,  /*!< this pointer to accel */
+                                  RTCRayHit& ray,      /*!< ray to intersect */
+                                  IntersectContext* context);
+    
+    /*! Type of intersect function pointer for ray packets of size 4. */
+    typedef void (*IntersectFunc4)(const void* valid,  /*!< pointer to valid mask */
+                                   Intersectors* This, /*!< this pointer to accel */
+                                   RTCRayHit4& ray,    /*!< ray packet to intersect */
+                                   IntersectContext* context);
+    
+    /*! Type of intersect function pointer for ray packets of size 8. */
+    typedef void (*IntersectFunc8)(const void* valid,  /*!< pointer to valid mask */
+                                   Intersectors* This, /*!< this pointer to accel */
+                                   RTCRayHit8& ray,    /*!< ray packet to intersect */
+                                   IntersectContext* context);
+    
+    /*! Type of intersect function pointer for ray packets of size 16. */
+    typedef void (*IntersectFunc16)(const void* valid,  /*!< pointer to valid mask */
+                                    Intersectors* This, /*!< this pointer to accel */
+                                    RTCRayHit16& ray,   /*!< ray packet to intersect */
+                                    IntersectContext* context);
+
+    /*! Type of intersect function pointer for ray packets of size N. */
+    typedef void (*IntersectFuncN)(Intersectors* This, /*!< this pointer to accel */
+                                   RTCRayHitN** ray,   /*!< ray stream to intersect */
+                                   const size_t N,     /*!< number of rays in stream */
+                                   IntersectContext* context /*!< layout flags */);
+    
+    
+    /*! Type of occlusion function pointer for single rays. */
+    typedef void (*OccludedFunc) (Intersectors* This, /*!< this pointer to accel */
+                                  RTCRay& ray,        /*!< ray to test occlusion */
+                                  IntersectContext* context);
+    
+    /*! Type of occlusion function pointer for ray packets of size 4. */
+    typedef void (*OccludedFunc4) (const void* valid,  /*!< pointer to valid mask */
+                                   Intersectors* This, /*!< this pointer to accel */
+                                   RTCRay4& ray,       /*!< ray packet to test occlusion. */
+                                   IntersectContext* context);
+    
+    /*! Type of occlusion function pointer for ray packets of size 8. */
+    typedef void (*OccludedFunc8) (const void* valid,  /*!< pointer to valid mask */
+                                   Intersectors* This, /*!< this pointer to accel */
+                                   RTCRay8& ray,       /*!< ray packet to test occlusion. */
+                                   IntersectContext* context);
+    
+    /*! Type of occlusion function pointer for ray packets of size 16. */
+    typedef void (*OccludedFunc16) (const void* valid,  /*!< pointer to valid mask */
+                                    Intersectors* This, /*!< this pointer to accel */
+                                    RTCRay16& ray,      /*!< ray packet to test occlusion. */
+                                    IntersectContext* context);
+
+    /*! Type of occlusion function pointer for ray packets of size N. */
+    typedef void (*OccludedFuncN)(Intersectors* This, /*!< this pointer to accel */
+                                  RTCRayN** ray,      /*!< ray stream to test occlusion */
+                                  const size_t N,     /*!< number of rays in stream */
+                                  IntersectContext* context /*!< layout flags */);
+    typedef void (*ErrorFunc) ();
+
+    struct Collider // a collide function pointer together with a name
+    {
+      Collider (ErrorFunc error = nullptr) // unnamed entry: collide is `error` cast to CollideFunc, name stays null
+      : collide((CollideFunc)error), name(nullptr) {}
+
+      Collider (CollideFunc collide, const char* name) // named entry; the name pointer is stored as-is, not copied
+      : collide(collide), name(name) {}
+
+      operator bool() const { return name; } // true only when a name is set; the function pointer is not inspected
+
+    public:
+      CollideFunc collide;  
+      const char* name;
+    };
+    
+    struct Intersector1 // single-ray intersect/occluded/pointQuery entry points together with a name
+    {
+      Intersector1 (ErrorFunc error = nullptr) // unnamed entry: intersect/occluded are `error` cast to their types, pointQuery is null
+      : intersect((IntersectFunc)error), occluded((OccludedFunc)error), pointQuery(nullptr), name(nullptr) {}
+      
+      Intersector1 (IntersectFunc intersect, OccludedFunc occluded, const char* name) // named entry without point query support
+      : intersect(intersect), occluded(occluded), pointQuery(nullptr), name(name) {}
+      
+      Intersector1 (IntersectFunc intersect, OccludedFunc occluded, PointQueryFunc pointQuery, const char* name) // named entry with point query
+      : intersect(intersect), occluded(occluded), pointQuery(pointQuery), name(name) {}
+
+      operator bool() const { return name; } // true only when a name is set; the function pointers are not inspected
+
+    public:
+      static const char* type; // declared here only; the definition lives elsewhere
+      IntersectFunc intersect;
+      OccludedFunc occluded;
+      PointQueryFunc pointQuery; // may be null; Intersectors::pointQuery asserts on it before calling
+      const char* name;
+    };
+    
+    struct Intersector4 // intersect/occluded entry points for ray packets of size 4, together with a name
+    {
+      Intersector4 (ErrorFunc error = nullptr) // unnamed entry: both pointers are `error` cast to their types
+      : intersect((IntersectFunc4)error), occluded((OccludedFunc4)error), name(nullptr) {}
+
+      Intersector4 (IntersectFunc4 intersect, OccludedFunc4 occluded, const char* name) // named entry; the name pointer is stored as-is
+      : intersect(intersect), occluded(occluded), name(name) {}
+
+      operator bool() const { return name; } // true only when a name is set; the function pointers are not inspected
+      
+    public:
+      static const char* type; // declared here only; the definition lives elsewhere
+      IntersectFunc4 intersect;
+      OccludedFunc4 occluded;
+      const char* name;
+    };
+    
+    struct Intersector8 // intersect/occluded entry points for ray packets of size 8, together with a name
+    {
+      Intersector8 (ErrorFunc error = nullptr) // unnamed entry: both pointers are `error` cast to their types
+      : intersect((IntersectFunc8)error), occluded((OccludedFunc8)error), name(nullptr) {}
+
+      Intersector8 (IntersectFunc8 intersect, OccludedFunc8 occluded, const char* name) // named entry; the name pointer is stored as-is
+      : intersect(intersect), occluded(occluded), name(name) {}
+
+      operator bool() const { return name; } // true only when a name is set; the function pointers are not inspected
+      
+    public:
+      static const char* type; // declared here only; the definition lives elsewhere
+      IntersectFunc8 intersect;
+      OccludedFunc8 occluded;
+      const char* name;
+    };
+    
+    struct Intersector16 // intersect/occluded entry points for ray packets of size 16, together with a name
+    {
+      Intersector16 (ErrorFunc error = nullptr) // unnamed entry: both pointers are `error` cast to their types
+      : intersect((IntersectFunc16)error), occluded((OccludedFunc16)error), name(nullptr) {}
+
+      Intersector16 (IntersectFunc16 intersect, OccludedFunc16 occluded, const char* name) // named entry; the name pointer is stored as-is
+      : intersect(intersect), occluded(occluded), name(name) {}
+
+      operator bool() const { return name; } // true only when a name is set; the function pointers are not inspected
+      
+    public:
+      static const char* type; // declared here only; the definition lives elsewhere
+      IntersectFunc16 intersect;
+      OccludedFunc16 occluded;
+      const char* name;
+    };
+
+    struct IntersectorN // intersect/occluded entry points for ray streams of N rays, together with a name
+    {
+      IntersectorN (ErrorFunc error = nullptr) // unnamed entry: both pointers are `error` cast to their types
+      : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {}
+
+      IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name) // named entry; the name pointer is stored as-is
+      : intersect(intersect), occluded(occluded), name(name) {}
+
+      operator bool() const { return name; } // true only when a name is set; the function pointers are not inspected
+      
+    public:
+      static const char* type; // declared here only; the definition lives elsewhere
+      IntersectFuncN intersect;
+      OccludedFuncN occluded;
+      const char* name;
+    };
+   
+    struct Intersectors 
+    {
+      Intersectors() 
+      : ptr(nullptr), leafIntersector(nullptr), collider(nullptr), intersector1(nullptr), intersector4(nullptr), intersector8(nullptr), intersector16(nullptr), intersectorN(nullptr) {}
+
+      Intersectors (ErrorFunc error) 
+      : ptr(nullptr), leafIntersector(nullptr), collider(error), intersector1(error), intersector4(error), intersector8(error), intersector16(error), intersectorN(error) {}
+
+      /*! Writes one line per named entry to std::cout.
+       *
+       *  Each line consists of \p ident spaces followed by "<label> = <name>".
+       *  Entries whose name pointer is null produce no output. */
+      void print(size_t ident)
+      {
+        /* prints a single entry, or nothing when it has no name */
+        const auto emit = [ident](const char* label, const char* name)
+        {
+          if (name == nullptr)
+            return;
+
+          for (size_t n = 0; n != ident; ++n)
+            std::cout << " ";
+
+          std::cout << label << name << std::endl;
+        };
+
+        /* the label literals carry their own padding and must stay verbatim */
+        emit("collider  = ",     collider.name);
+        emit("intersector1  = ", intersector1.name);
+        emit("intersector4  = ", intersector4.name);
+        emit("intersector8  = ", intersector8.name);
+        emit("intersector16 = ", intersector16.name);
+        emit("intersectorN = ",  intersectorN.name);
+
+      }
+
+      /*! Chooses the active packet/stream intersectors.
+       *
+       *  For each of the 4, 8, 16 and N variants: when the _filter entry is
+       *  named (its operator bool is true), the active entry becomes the
+       *  _filter one if \p filter is set and the _nofilter one otherwise.
+       *  Variants whose _filter entry is unnamed keep their current value. */
+      void select(bool filter)
+      {
+        if (intersector4_filter)
+          intersector4 = filter ? intersector4_filter : intersector4_nofilter;
+
+        if (intersector8_filter)
+          intersector8 = filter ? intersector8_filter : intersector8_nofilter;
+
+        if (intersector16_filter)
+          intersector16 = filter ? intersector16_filter : intersector16_nofilter;
+        if (intersectorN_filter)
+          intersectorN = filter ? intersectorN_filter : intersectorN_nofilter;
+      }
+
+      __forceinline bool pointQuery (PointQuery* query, PointQueryContext* context) {
+        assert(intersector1.pointQuery);
+        return intersector1.pointQuery(this,query,context);
+      }
+
+      /*! collides two scenes */
+      __forceinline void collide (Accel* scene0, Accel* scene1, RTCCollideFunc callback, void* userPtr) {
+        assert(collider.collide);
+        collider.collide(scene0->intersectors.ptr,scene1->intersectors.ptr,callback,userPtr);
+      }
+
+      /*! Intersects a single ray with the scene. */
+      __forceinline void intersect (RTCRayHit& ray, IntersectContext* context) {
+        assert(intersector1.intersect);
+        intersector1.intersect(this,ray,context);
+      }
+
+      /*! Intersects a packet of 4 rays with the scene. */
+      __forceinline void intersect4 (const void* valid, RTCRayHit4& ray, IntersectContext* context) {
+        assert(intersector4.intersect);
+        intersector4.intersect(valid,this,ray,context);
+      }
+      
+      /*! Intersects a packet of 8 rays with the scene. */
+      __forceinline void intersect8 (const void* valid, RTCRayHit8& ray, IntersectContext* context) {
+        assert(intersector8.intersect);
+        intersector8.intersect(valid,this,ray,context);
+      }
+      
+      /*! Intersects a packet of 16 rays with the scene. */
+      __forceinline void intersect16 (const void* valid, RTCRayHit16& ray, IntersectContext* context) {
+        assert(intersector16.intersect);
+        intersector16.intersect(valid,this,ray,context);
+      }
+      
+      /*! Intersects a stream of N rays in SOA layout with the scene. */
+      __forceinline void intersectN (RTCRayHitN** rayN, const size_t N, IntersectContext* context)
+      {
+        assert(intersectorN.intersect);
+        intersectorN.intersect(this,rayN,N,context);
+      }
+      
+#if defined(__SSE__) || defined(__ARM_NEON)
+      __forceinline void intersect(const vbool4& valid, RayHitK<4>& ray, IntersectContext* context) {
+        const vint<4> mask = valid.mask32();
+        intersect4(&mask,(RTCRayHit4&)ray,context);
+      }
+#endif
+#if defined(__AVX__)
+      __forceinline void intersect(const vbool8& valid, RayHitK<8>& ray, IntersectContext* context) {
+        const vint<8> mask = valid.mask32();
+        intersect8(&mask,(RTCRayHit8&)ray,context);
+      }
+#endif
+#if defined(__AVX512F__)
+      __forceinline void intersect(const vbool16& valid, RayHitK<16>& ray, IntersectContext* context) {
+        const vint<16> mask = valid.mask32();
+        intersect16(&mask,(RTCRayHit16&)ray,context);
+      }
+#endif
+      
+      template<int K>
+      __forceinline void intersectN (RayHitK<K>** rayN, const size_t N, IntersectContext* context)
+      {
+        intersectN((RTCRayHitN**)rayN,N,context);
+      }
+
+      /*! Tests if single ray is occluded by the scene. */
+      __forceinline void occluded (RTCRay& ray, IntersectContext* context) {
+        assert(intersector1.occluded);
+        intersector1.occluded(this,ray,context);
+      }
+      
+      /*! Tests if a packet of 4 rays is occluded by the scene. */
+      __forceinline void occluded4 (const void* valid, RTCRay4& ray, IntersectContext* context) {
+        assert(intersector4.occluded);
+        intersector4.occluded(valid,this,ray,context);
+      }
+      
+      /*! Tests if a packet of 8 rays is occluded by the scene. */
+      __forceinline void occluded8 (const void* valid, RTCRay8& ray, IntersectContext* context) {
+        assert(intersector8.occluded);
+        intersector8.occluded(valid,this,ray,context);
+      }
+      
+      /*! Tests if a packet of 16 rays is occluded by the scene. */
+      __forceinline void occluded16 (const void* valid, RTCRay16& ray, IntersectContext* context) {
+        assert(intersector16.occluded);
+        intersector16.occluded(valid,this,ray,context);
+      }
+      
+      /*! Tests if a stream of N rays in SOA layout is occluded by the scene. */
+      __forceinline void occludedN (RTCRayN** rayN, const size_t N, IntersectContext* context)
+      {
+        assert(intersectorN.occluded);
+        intersectorN.occluded(this,rayN,N,context);
+      }
+      
+#if defined(__SSE__) || defined(__ARM_NEON)
+      __forceinline void occluded(const vbool4& valid, RayK<4>& ray, IntersectContext* context) {
+        const vint<4> mask = valid.mask32();
+        occluded4(&mask,(RTCRay4&)ray,context);
+      }
+#endif
+#if defined(__AVX__)
+      __forceinline void occluded(const vbool8& valid, RayK<8>& ray, IntersectContext* context) {
+        const vint<8> mask = valid.mask32();
+        occluded8(&mask,(RTCRay8&)ray,context);
+      }
+#endif
+#if defined(__AVX512F__)
+      __forceinline void occluded(const vbool16& valid, RayK<16>& ray, IntersectContext* context) {
+        const vint<16> mask = valid.mask32();
+        occluded16(&mask,(RTCRay16&)ray,context);
+      }
+#endif
+
+      template<int K>
+      __forceinline void occludedN (RayK<K>** rayN, const size_t N, IntersectContext* context)
+      {
+        occludedN((RTCRayN**)rayN,N,context);
+      }
+
+      /*! Tests if single ray is occluded by the scene. */
+      __forceinline void intersect(RTCRay& ray, IntersectContext* context) {
+        occluded(ray, context);
+      }
+
+      /*! Tests if a packet of K rays is occluded by the scene. */
+      template<int K>
+      __forceinline void intersect(const vbool<K>& valid, RayK<K>& ray, IntersectContext* context) {
+        occluded(valid, ray, context);
+      }
+
+      /*! Tests if a packet of N rays in SOA layout is occluded by the scene. */
+      template<int K>
+      __forceinline void intersectN(RayK<K>** rayN, const size_t N, IntersectContext* context) {
+        occludedN(rayN, N, context);
+      }
+      
+    public:
+      AccelData* ptr;
+      void* leafIntersector;
+      Collider collider;
+      Intersector1 intersector1;
+      Intersector4 intersector4;
+      Intersector4 intersector4_filter;
+      Intersector4 intersector4_nofilter;
+      Intersector8 intersector8;
+      Intersector8 intersector8_filter;
+      Intersector8 intersector8_nofilter;
+      Intersector16 intersector16;
+      Intersector16 intersector16_filter;
+      Intersector16 intersector16_nofilter;
+      IntersectorN intersectorN;
+      IntersectorN intersectorN_filter;
+      IntersectorN intersectorN_nofilter;      
+    };
+  
+  public:
+
+    /*! Construction */
+    Accel (const AccelData::Type type) 
+      : AccelData(type) {}
+    
+    /*! Construction */
+    Accel (const AccelData::Type type, const Intersectors& intersectors) 
+      : AccelData(type), intersectors(intersectors) {}
+
+    /*! Virtual destructor */
+    virtual ~Accel() {}
+
+    /*! makes the acceleration structure immutable */
+    virtual void immutable () {}
+    
+    /*! build acceleration structure */
+    virtual void build () = 0;
+
+  public:
+    Intersectors intersectors;
+  };
+
+#define DEFINE_COLLIDER(symbol,collider)                                \
+  Accel::Collider symbol() {                                            \
+    return Accel::Collider((Accel::CollideFunc)collider::collide,       \
+                           TOSTRING(isa) "::" TOSTRING(symbol));        \
+  }
+
+#define DEFINE_INTERSECTOR1(symbol,intersector)                               \
+  Accel::Intersector1 symbol() {                                              \
+    return Accel::Intersector1((Accel::IntersectFunc )intersector::intersect, \
+                               (Accel::OccludedFunc  )intersector::occluded,  \
+                               (Accel::PointQueryFunc)intersector::pointQuery,\
+                               TOSTRING(isa) "::" TOSTRING(symbol));          \
+  }
+  
+#define DEFINE_INTERSECTOR4(symbol,intersector)                               \
+  Accel::Intersector4 symbol() {                                              \
+    return Accel::Intersector4((Accel::IntersectFunc4)intersector::intersect, \
+                               (Accel::OccludedFunc4)intersector::occluded,   \
+                               TOSTRING(isa) "::" TOSTRING(symbol));          \
+  }
+  
+#define DEFINE_INTERSECTOR8(symbol,intersector)                               \
+  Accel::Intersector8 symbol() {                                              \
+    return Accel::Intersector8((Accel::IntersectFunc8)intersector::intersect, \
+                               (Accel::OccludedFunc8)intersector::occluded,   \
+                               TOSTRING(isa) "::" TOSTRING(symbol));          \
+  }
+
+#define DEFINE_INTERSECTOR16(symbol,intersector)                                \
+  Accel::Intersector16 symbol() {                                               \
+    return Accel::Intersector16((Accel::IntersectFunc16)intersector::intersect, \
+                                (Accel::OccludedFunc16)intersector::occluded,   \
+                                TOSTRING(isa) "::" TOSTRING(symbol));           \
+  }
+
+#define DEFINE_INTERSECTORN(symbol,intersector)                               \
+  Accel::IntersectorN symbol() {                                              \
+    return Accel::IntersectorN((Accel::IntersectFuncN)intersector::intersect, \
+                               (Accel::OccludedFuncN)intersector::occluded,   \
+                               TOSTRING(isa) "::" TOSTRING(symbol));          \
+  }
+
+  /* ray stream filter interface */
+  typedef void (*intersectStreamAOS_func)(Scene* scene, RTCRayHit*  _rayN, const size_t N, const size_t stride, IntersectContext* context);
+  typedef void (*intersectStreamAOP_func)(Scene* scene, RTCRayHit** _rayN, const size_t N, IntersectContext* context);
+  typedef void (*intersectStreamSOA_func)(Scene* scene, char* rayN, const size_t N, const size_t streams, const size_t stream_offset, IntersectContext* context);
+  typedef void (*intersectStreamSOP_func)(Scene* scene, const RTCRayHitNp* rayN, const size_t N, IntersectContext* context);
+
+  typedef void (*occludedStreamAOS_func)(Scene* scene, RTCRay*  _rayN, const size_t N, const size_t stride, IntersectContext* context);
+  typedef void (*occludedStreamAOP_func)(Scene* scene, RTCRay** _rayN, const size_t N, IntersectContext* context);
+  typedef void (*occludedStreamSOA_func)(Scene* scene, char* rayN, const size_t N, const size_t streams, const size_t stream_offset, IntersectContext* context);
+  typedef void (*occludedStreamSOP_func)(Scene* scene, const RTCRayNp* rayN, const size_t N, IntersectContext* context);
+
+  struct RayStreamFilterFuncs // table of the four intersect and four occluded stream entry points (AOS, AOP, SOA, SOP)
+  {
+    RayStreamFilterFuncs() // every entry point is null
+    : intersectAOS(nullptr), intersectAOP(nullptr), intersectSOA(nullptr), intersectSOP(nullptr),
+      occludedAOS(nullptr),  occludedAOP(nullptr),  occludedSOA(nullptr),  occludedSOP(nullptr) {}
+
+    RayStreamFilterFuncs(void (*ptr) ()) // every entry point is the same `ptr`, cast to the respective signature
+    : intersectAOS((intersectStreamAOS_func) ptr), intersectAOP((intersectStreamAOP_func) ptr), intersectSOA((intersectStreamSOA_func) ptr), intersectSOP((intersectStreamSOP_func) ptr),
+      occludedAOS((occludedStreamAOS_func) ptr),   occludedAOP((occludedStreamAOP_func) ptr),   occludedSOA((occludedStreamSOA_func) ptr),   occludedSOP((occludedStreamSOP_func) ptr) {}
+
+    RayStreamFilterFuncs(intersectStreamAOS_func intersectAOS, intersectStreamAOP_func intersectAOP, intersectStreamSOA_func intersectSOA, intersectStreamSOP_func intersectSOP,
+                         occludedStreamAOS_func  occludedAOS,  occludedStreamAOP_func  occludedAOP,  occludedStreamSOA_func  occludedSOA,  occludedStreamSOP_func  occludedSOP)
+    : intersectAOS(intersectAOS), intersectAOP(intersectAOP), intersectSOA(intersectSOA), intersectSOP(intersectSOP),
+      occludedAOS(occludedAOS),   occludedAOP(occludedAOP),   occludedSOA(occludedSOA),   occludedSOP(occludedSOP) {} // each entry point given explicitly
+
+  public:
+    intersectStreamAOS_func intersectAOS; // takes RTCRayHit* plus a stride
+    intersectStreamAOP_func intersectAOP; // takes RTCRayHit**
+    intersectStreamSOA_func intersectSOA; // takes a raw char* plus stream count and stream offset
+    intersectStreamSOP_func intersectSOP; // takes const RTCRayHitNp*
+
+    occludedStreamAOS_func occludedAOS; // takes RTCRay* plus a stride
+    occludedStreamAOP_func occludedAOP; // takes RTCRay**
+    occludedStreamSOA_func occludedSOA; // takes a raw char* plus stream count and stream offset
+    occludedStreamSOP_func occludedSOP; // takes const RTCRayNp*
+  }; 
+
+  typedef RayStreamFilterFuncs (*RayStreamFilterFuncsType)();
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/accelinstance.h b/thirdparty/embree-aarch64/kernels/common/accelinstance.h
new file mode 100644
index 0000000000..d74b96df3f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/accelinstance.h
@@ -0,0 +1,41 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "accel.h"
+#include "builder.h"
+
+namespace embree
+{
+  class AccelInstance : public Accel // Accel that holds one AccelData plus the Builder used to build it
+  {
+  public:
+    AccelInstance (AccelData* accel, Builder* builder, Intersectors& intersectors)
+      : Accel(AccelData::TY_ACCEL_INSTANCE,intersectors), accel(accel), builder(builder) {}
+    // both raw pointers end up in std::unique_ptr members, so this object deletes them
+    void immutable () {
+      builder.reset(nullptr); // drops the builder; build() afterwards only refreshes bounds
+    }
+
+  public:
+    void build () { // runs the builder if one is still attached, then copies the data's bounds
+      if (builder) builder->build();
+      bounds = accel->bounds; // NOTE(review): accel is dereferenced unguarded here, unlike deleteGeometry()/clear() - confirm it can never be null
+    }
+    // forwards the deletion to whichever of accel/builder is present
+    void deleteGeometry(size_t geomID) {
+      if (accel  ) accel->deleteGeometry(geomID);
+      if (builder) builder->deleteGeometry(geomID);
+    }
+    // forwards clear() to whichever of accel/builder is present
+    void clear() {
+      if (accel) accel->clear();
+      if (builder) builder->clear();
+    }
+
+  private:
+    std::unique_ptr<AccelData> accel; // owned acceleration data
+    std::unique_ptr<Builder> builder; // owned builder; null after immutable()
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/acceln.cpp b/thirdparty/embree-aarch64/kernels/common/acceln.cpp
new file mode 100644
index 0000000000..aadb4a64ef
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/acceln.cpp
@@ -0,0 +1,232 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "acceln.h"
+#include "ray.h"
+#include "../../include/embree3/rtcore_ray.h"
+#include "../../common/algorithms/parallel_for.h"
+
+namespace embree
+{
+  AccelN::AccelN() // starts as a TY_ACCELN accel with an empty list of sub-structures
+    : Accel(AccelData::TY_ACCELN), accels() {}
+
+  /*! deletes every acceleration structure stored in the list */
+  AccelN::~AccelN()
+  {
+    for (Accel* owned : accels) delete owned;
+  }
+
+  void AccelN::accels_add(Accel* accel) // appends accel to the list; the destructor and accels_init() later delete it
+  {
+    assert(accel); // a null entry would be dereferenced by the traversal loops
+    accels.push_back(accel);
+  }
+
+  /*! deletes all stored acceleration structures and empties the list */
+  void AccelN::accels_init()
+  {
+    for (Accel* owned : accels)
+      delete owned;
+    accels.clear();
+  }
+
+  /*! runs the point query on each non-empty sub-structure and ORs their results together */
+  bool AccelN::pointQuery (Accel::Intersectors* This_in, PointQuery* query, PointQueryContext* context)
+  {
+    AccelN* const self = (AccelN*)This_in->ptr;
+    bool anyChanged = false;
+    for (Accel* sub : self->accels)
+      if (!sub->isEmpty()) anyChanged |= sub->intersectors.pointQuery(query,context);
+    return anyChanged;
+  }
+
+  /*! forwards a single ray to every non-empty sub-structure, in list order */
+  void AccelN::intersect (Accel::Intersectors* This_in, RTCRayHit& ray, IntersectContext* context)
+  {
+    AccelN* const self = (AccelN*)This_in->ptr;
+    for (Accel* sub : self->accels)
+      if (!sub->isEmpty()) sub->intersectors.intersect(ray,context);
+  }
+
+  /*! forwards a 4-wide ray packet to every non-empty sub-structure, in list order */
+  void AccelN::intersect4 (const void* valid, Accel::Intersectors* This_in, RTCRayHit4& ray, IntersectContext* context)
+  {
+    AccelN* const self = (AccelN*)This_in->ptr;
+    for (Accel* sub : self->accels)
+      if (!sub->isEmpty()) sub->intersectors.intersect4(valid,ray,context);
+  }
+
+  /*! forwards an 8-wide ray packet to every non-empty sub-structure, in list order */
+  void AccelN::intersect8 (const void* valid, Accel::Intersectors* This_in, RTCRayHit8& ray, IntersectContext* context)
+  {
+    AccelN* const self = (AccelN*)This_in->ptr;
+    for (Accel* sub : self->accels)
+      if (!sub->isEmpty()) sub->intersectors.intersect8(valid,ray,context);
+  }
+
+  /*! forwards a 16-wide ray packet to every non-empty sub-structure, in list order */
+  void AccelN::intersect16 (const void* valid, Accel::Intersectors* This_in, RTCRayHit16& ray, IntersectContext* context)
+  {
+    AccelN* const self = (AccelN*)This_in->ptr;
+    for (Accel* sub : self->accels)
+      if (!sub->isEmpty()) sub->intersectors.intersect16(valid,ray,context);
+  }
+
+  /*! forwards a stream of N ray pointers to every non-empty sub-structure, in list order */
+  void AccelN::intersectN (Accel::Intersectors* This_in, RTCRayHitN** ray, const size_t N, IntersectContext* context)
+  {
+    AccelN* const self = (AccelN*)This_in->ptr;
+    for (Accel* sub : self->accels)
+      if (!sub->isEmpty()) sub->intersectors.intersectN(ray,N,context);
+  }
+
+  void AccelN::occluded (Accel::Intersectors* This_in, RTCRay& ray, IntersectContext* context) // occlusion test over the non-empty sub-structures
+  {
+    AccelN* const self = (AccelN*)This_in->ptr;
+    for (Accel* sub : self->accels)
+    {
+      if (sub->isEmpty()) continue;
+      sub->intersectors.occluded(ray,context);
+      if (ray.tfar < 0.0f) return; /* negative tfar ends the traversal, remaining sub-structures are skipped */
+    }
+  }
+
+  void AccelN::occluded4 (const void* valid, Accel::Intersectors* This_in, RTCRay4& ray, IntersectContext* context) // 4-wide occlusion test over the non-empty sub-structures
+  {
+    AccelN* This = (AccelN*)This_in->ptr;
+    for (size_t i=0; i<This->accels.size(); i++) {
+      if (This->accels[i]->isEmpty()) continue;
+      This->accels[i]->intersectors.occluded4(valid,ray,context);
+#if defined(__SSE2__) || defined(__ARM_NEON) // early exit is only compiled in on these targets
+      vbool4 valid0 = asBool(((vint4*)valid)[0]); // lanes the caller marked active
+      vbool4 hit0   = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); // despite the name: lanes whose tfar is still >= 0
+      if (unlikely(none(valid0 & hit0))) break; // stop once no active lane has tfar >= 0
+#endif
+    }
+  }
+
+  void AccelN::occluded8 (const void* valid, Accel::Intersectors* This_in, RTCRay8& ray, IntersectContext* context) // 8-wide occlusion test over the non-empty sub-structures
+  {
+    AccelN* This = (AccelN*)This_in->ptr;
+    for (size_t i=0; i<This->accels.size(); i++) {
+      if (This->accels[i]->isEmpty()) continue;
+      This->accels[i]->intersectors.occluded8(valid,ray,context);
+#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA
+      vbool4 valid0 = asBool(((vint4*)valid)[0]); // the 8 lanes are examined as two groups of 4
+      vbool4 hit0   = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); // hitK: lanes of group K whose tfar is still >= 0
+      vbool4 valid1 = asBool(((vint4*)valid)[1]);
+      vbool4 hit1   = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero);
+      if (unlikely((none((valid0 & hit0) | (valid1 & hit1))))) break; // stop once no active lane has tfar >= 0
+#endif
+    }
+  }
+
+  void AccelN::occluded16 (const void* valid, Accel::Intersectors* This_in, RTCRay16& ray, IntersectContext* context) // 16-wide occlusion test over the non-empty sub-structures
+  {
+    AccelN* This = (AccelN*)This_in->ptr;
+    for (size_t i=0; i<This->accels.size(); i++) {
+      if (This->accels[i]->isEmpty()) continue;
+      This->accels[i]->intersectors.occluded16(valid,ray,context);
+#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA
+      vbool4 valid0 = asBool(((vint4*)valid)[0]); // the 16 lanes are examined as four groups of 4
+      vbool4 hit0   = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); // hitK: lanes of group K whose tfar is still >= 0
+      vbool4 valid1 = asBool(((vint4*)valid)[1]);
+      vbool4 hit1   = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero);
+      vbool4 valid2 = asBool(((vint4*)valid)[2]);
+      vbool4 hit2   = ((vfloat4*)ray.tfar)[2] >= vfloat4(zero);
+      vbool4 valid3 = asBool(((vint4*)valid)[3]);
+      vbool4 hit3   = ((vfloat4*)ray.tfar)[3] >= vfloat4(zero);
+      if (unlikely((none((valid0 & hit0) | (valid1 & hit1) | (valid2 & hit2) | (valid3 & hit3))))) break; // stop once no active lane has tfar >= 0
+#endif
+    }
+  }
+
+  /*! forwards the N-ray occlusion query to every non-empty sub-structure; there is no early exit */
+  void AccelN::occludedN (Accel::Intersectors* This_in, RTCRayN** ray, const size_t N, IntersectContext* context)
+  {
+    AccelN* const self = (AccelN*)This_in->ptr;
+    size_t count = N;
+    for (Accel* sub : self->accels)
+      if (!sub->isEmpty()) sub->intersectors.occludedN(ray,count,context);
+  }
+
+  void AccelN::accels_print(size_t ident) // prints an indented "accels[i]" line per entry, then that entry's intersectors
+  {
+    size_t index = 0;
+    for (Accel* sub : accels) {
+      for (size_t pad=ident; pad>0; pad--) std::cout << " ";
+      std::cout << "accels[" << index++ << "]" << std::endl;
+      sub->intersectors.print(ident+2);
+    }
+  }
+
+  /*! calls immutable() on every stored acceleration structure */
+  void AccelN::accels_immutable()
+  {
+    for (Accel* sub : accels) sub->immutable();
+  }
+  
+  void AccelN::accels_build () // builds all sub-structures, then sets this object's type, bounds and intersectors
+  {
+    /* reduce memory consumption */
+    accels.shrink_to_fit();
+    
+    /* build all acceleration structures in parallel */
+    parallel_for (accels.size(), [&] (size_t i) { 
+        accels[i]->build();
+      });
+
+    /* for each packet width, record whether every sub-structure provides that intersector */
+    bool valid1 = true;
+    bool valid4 = true;
+    bool valid8 = true;
+    bool valid16 = true;
+    for (size_t i=0; i<accels.size(); i++) {
+      valid1 &= (bool) accels[i]->intersectors.intersector1;
+      valid4 &= (bool) accels[i]->intersectors.intersector4;
+      valid8 &= (bool) accels[i]->intersectors.intersector8;
+      valid16 &= (bool) accels[i]->intersectors.intersector16;
+    }
+
+    if (accels.size() == 1) { // exactly one entry: take over its type, bounds and intersectors
+      type = accels[0]->type; // FIXME: should just assign entire Accel
+      bounds = accels[0]->bounds;
+      intersectors = accels[0]->intersectors;
+    }
+    else // zero or several entries: dispatch through the static AccelN callbacks
+    {
+      type = AccelData::TY_ACCELN;
+      intersectors.ptr = this; // the static callbacks recover this object from Intersectors::ptr
+      intersectors.intersector1  = Intersector1(&intersect,&occluded,&pointQuery,valid1 ? "AccelN::intersector1": nullptr); // name is nullptr when some entry lacks this width; presumably that marks it invalid - confirm in accel.h
+      intersectors.intersector4  = Intersector4(&intersect4,&occluded4,valid4 ? "AccelN::intersector4" : nullptr);
+      intersectors.intersector8  = Intersector8(&intersect8,&occluded8,valid8 ? "AccelN::intersector8" : nullptr);
+      intersectors.intersector16 = Intersector16(&intersect16,&occluded16,valid16 ? "AccelN::intersector16": nullptr);
+      intersectors.intersectorN  = IntersectorN(&intersectN,&occludedN,"AccelN::intersectorN");
+
+      /*! bounds are the union of all sub-structure bounds */
+      bounds = empty;
+      for (size_t i=0; i<accels.size(); i++) 
+        bounds.extend(accels[i]->bounds);
+    }
+  }
+
+  /*! passes the filter flag to intersectors.select() of every stored acceleration structure */
+  void AccelN::accels_select(bool filter)
+  {
+    for (Accel* sub : accels) sub->intersectors.select(filter);
+  }
+
+  /*! asks every stored acceleration structure to delete the geometry with the given ID */
+  void AccelN::accels_deleteGeometry(size_t geomID)
+  {
+    for (Accel* sub : accels) sub->deleteGeometry(geomID);
+  }
+
+  /*! calls clear() on every stored acceleration structure; the list itself is kept */
+  void AccelN::accels_clear()
+  {
+    for (Accel* sub : accels)
+      sub->clear();
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/common/acceln.h b/thirdparty/embree-aarch64/kernels/common/acceln.h
new file mode 100644
index 0000000000..2edd98f647
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/acceln.h
@@ -0,0 +1,49 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "accel.h"
+
+namespace embree
+{
+  /*! merges N acceleration structures together, by processing them in order */
+  class AccelN : public Accel
+  {
+  public:
+    AccelN ();
+    ~AccelN(); // deletes every entry of accels
+
+  public:
+    void accels_add(Accel* accel); // appends a non-null accel to the list
+    void accels_init(); // deletes all entries and empties the list
+
+  public:
+    static bool pointQuery (Accel::Intersectors* This, PointQuery* query, PointQueryContext* context);
+    // the static callbacks below recover the AccelN from This->ptr and loop over its non-empty entries
+  public:
+    static void intersect (Accel::Intersectors* This, RTCRayHit& ray, IntersectContext* context);
+    static void intersect4 (const void* valid, Accel::Intersectors* This, RTCRayHit4& ray, IntersectContext* context);
+    static void intersect8 (const void* valid, Accel::Intersectors* This, RTCRayHit8& ray, IntersectContext* context);
+    static void intersect16 (const void* valid, Accel::Intersectors* This, RTCRayHit16& ray, IntersectContext* context);
+    static void intersectN (Accel::Intersectors* This, RTCRayHitN** ray, const size_t N, IntersectContext* context);
+
+  public:
+    static void occluded (Accel::Intersectors* This, RTCRay& ray, IntersectContext* context);
+    static void occluded4 (const void* valid, Accel::Intersectors* This, RTCRay4& ray, IntersectContext* context);
+    static void occluded8 (const void* valid, Accel::Intersectors* This, RTCRay8& ray, IntersectContext* context);
+    static void occluded16 (const void* valid, Accel::Intersectors* This, RTCRay16& ray, IntersectContext* context);
+    static void occludedN (Accel::Intersectors* This, RTCRayN** ray, const size_t N, IntersectContext* context);
+
+  public:
+    void accels_print(size_t ident); // prints each entry's intersectors, indented by ident spaces
+    void accels_immutable(); // calls immutable() on every entry
+    void accels_build (); // builds every entry, then sets type, bounds and intersectors of this object
+    void accels_select(bool filter); // calls intersectors.select(filter) on every entry
+    void accels_deleteGeometry(size_t geomID); // forwards the deletion to every entry
+    void accels_clear (); // calls clear() on every entry
+
+  public:
+    std::vector<Accel*> accels; // raw pointers deleted by ~AccelN() and accels_init()
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/accelset.cpp b/thirdparty/embree-aarch64/kernels/common/accelset.cpp
new file mode 100644
index 0000000000..79be1c4301
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/accelset.cpp
@@ -0,0 +1,17 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "accelset.h"
+#include "scene.h"
+
+namespace embree
+{
+  AccelSet::AccelSet (Device* device, Geometry::GType gtype, size_t numItems, size_t numTimeSteps) // counts are narrowed to unsigned int for Geometry
+    : Geometry(device,gtype,(unsigned int)numItems,(unsigned int)numTimeSteps), boundsFunc(nullptr) {}
+
+  AccelSet::IntersectorN::IntersectorN (ErrorFunc error) // both callbacks are set to the cast error function; name stays null
+    : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {}
+  
+  AccelSet::IntersectorN::IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name) // stores the two callbacks and the name pointer as given
+    : intersect(intersect), occluded(occluded), name(name) {}
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/accelset.h b/thirdparty/embree-aarch64/kernels/common/accelset.h
new file mode 100644
index 0000000000..3774b2accb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/accelset.h
@@ -0,0 +1,248 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "builder.h"
+#include "geometry.h"
+#include "ray.h"
+#include "hit.h"
+
+namespace embree
+{
+  struct IntersectFunctionNArguments;
+  struct OccludedFunctionNArguments;
+  
+  typedef void (*ReportIntersectionFunc) (IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args);
+  typedef void (*ReportOcclusionFunc) (OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args);
+  
+  struct IntersectFunctionNArguments : public RTCIntersectFunctionNArguments // public argument struct extended with internal fields
+  {
+    IntersectContext* internal_context; // internal context of the current query
+    Geometry* geometry; // geometry the callback is invoked for
+    ReportIntersectionFunc report; // report callback supplied by the caller of AccelSet::intersect
+  };
+
+  struct OccludedFunctionNArguments : public RTCOccludedFunctionNArguments // public argument struct extended with internal fields
+  {
+    IntersectContext* internal_context; // internal context of the current query
+    Geometry* geometry; // geometry the callback is invoked for
+    ReportOcclusionFunc report; // report callback supplied by the caller of AccelSet::occluded
+  };
+
+  /*! Base class for set of acceleration structures. */
+  class AccelSet : public Geometry
+  {
+  public:
+    typedef RTCIntersectFunctionN IntersectFuncN;  
+    typedef RTCOccludedFunctionN OccludedFuncN;
+    typedef void (*ErrorFunc) ();
+
+      struct IntersectorN // pair of N-wide intersect/occluded callbacks plus a name
+      {
+        IntersectorN (ErrorFunc error = nullptr) ; // both callbacks become the cast error function, name is null
+        IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name);
+        
+        operator bool() const { return name; } // true exactly when a name was supplied
+        
+      public:
+        static const char* type;
+        IntersectFuncN intersect;
+        OccludedFuncN occluded; 
+        const char* name;
+      };
+      
+    public:
+      
+      /*! construction */
+      AccelSet (Device* device, Geometry::GType gtype, size_t items, size_t numTimeSteps);
+      
+      /*! makes the acceleration structure immutable; the default implementation does nothing */
+      virtual void immutable () {}
+      
+      /*! build accel */
+      virtual void build () = 0;
+
+      /*! true only if the i'th primitive has valid, non-empty bounds at every time step from itime_range.begin() to itime_range.end(), end included */
+      __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+      {
+        for (size_t step = itime_range.begin(); step <= itime_range.end(); step++) {
+          if (!isvalid_non_empty(bounds(i,step))) return false;
+        }
+        return true;
+      }
+
+      /*! Calculates the bounds of item i at time step itime by calling the user-supplied boundsFunc */
+      __forceinline BBox3fa bounds(size_t i, size_t itime = 0) const
+      {
+        BBox3fa box;
+        assert(i < size());
+        RTCBoundsFunctionArguments args;
+        args.geometryUserPtr = userPtr;
+        args.primID = (unsigned int)i;
+        args.timeStep = (unsigned int)itime;
+        args.bounds_o = (RTCBounds*)&box; // the callback writes its result straight into box
+        boundsFunc(&args); // boundsFunc is initialised to nullptr by the constructor, so it must be set before this is called
+        return box;
+      }
+
+      /*! calculates the linear bounds of the i'th item at the itime'th time segment, from the bounds at time steps itime and itime+1 */
+      __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const
+      {
+        BBox3fa box[2];
+        assert(i < size());
+        RTCBoundsFunctionArguments args;
+        args.geometryUserPtr = userPtr;
+        args.primID = (unsigned int)i;
+        args.timeStep = (unsigned int)(itime+0); // first call: start of the segment
+        args.bounds_o = (RTCBounds*)&box[0];
+        boundsFunc(&args);
+        args.timeStep = (unsigned int)(itime+1); // second call reuses args for the end of the segment
+        args.bounds_o = (RTCBounds*)&box[1];
+        boundsFunc(&args);
+        return LBBox3fa(box[0],box[1]);
+      }
+
+      /*! computes the bounds of the i'th item at time step 0, stores them in *bbox when bbox is given, and reports whether they are valid and non-empty */
+      __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+      {
+        const BBox3fa itemBounds = bounds(i);
+        if (bbox != nullptr) *bbox = itemBounds;
+        return isvalid_non_empty(itemBounds);
+      }
+
+      /*! writes the first-time-step box of the i'th item's linear bounds for segment itime into bbox; returns whether the linear bounds are valid and non-empty */
+      __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+      {
+        const LBBox3fa segmentBounds = linearBounds(i,itime);
+        bbox = segmentBounds.bounds0; // the BVH is built from the bounding box of the first time step
+        return isvalid_non_empty(segmentBounds);
+      }
+
+      /*! calculates the linear bounds of primitive primID for the time range dt; per-time-step boxes are fetched through bounds() */
+      __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+        return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+      }
+      
+      /*! writes the linear bounds of the i'th primitive for time_range into bbox; returns false and leaves bbox untouched if the primitive is not valid over that range */
+      __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const  {
+        const bool usable = valid(i, timeSegmentRange(time_range));
+        if (usable) bbox = linearBounds(i, time_range);
+        return usable;
+      }
+
+      /* gets version info of topology: the primitive count is used as the version */
+      unsigned int getTopologyVersion() const {
+        return numPrimitives;
+      }
+    
+      /* returns true if the current primitive count differs from otherVersion */
+      bool topologyChanged(unsigned int otherVersion) const {
+        return numPrimitives != otherVersion;
+      }
+
+  public:
+
+      /*! Intersects a single ray with primitive primID by calling the user intersect callback with N = 1. */
+      __forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) 
+      {
+        assert(primID < size());
+        assert(intersectorN.intersect);
+        
+        int mask = -1; // one-element valid mask; -1 presumably marks the ray as active - confirm against the RTC API
+        IntersectFunctionNArguments args;
+        args.valid = &mask;
+        args.geometryUserPtr = userPtr;
+        args.context = context->user;
+        args.rayhit = (RTCRayHitN*)&ray;
+        args.N = 1;
+        args.geomID = geomID;
+        args.primID = primID;
+        args.internal_context = context;
+        args.geometry = this;
+        args.report = report;
+        
+        intersectorN.intersect(&args);
+      }
+
+      /*! Tests a single ray against primitive primID by calling the user occluded callback with N = 1. */
+      __forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report)
+      {
+        assert(primID < size());
+        assert(intersectorN.occluded);
+        
+        int mask = -1; // one-element valid mask; -1 presumably marks the ray as active - confirm against the RTC API
+        OccludedFunctionNArguments args;
+        args.valid = &mask;
+        args.geometryUserPtr = userPtr;
+        args.context = context->user;
+        args.ray = (RTCRayN*)&ray;
+        args.N = 1;
+        args.geomID = geomID;
+        args.primID = primID;
+        args.internal_context = context;
+        args.geometry = this;
+        args.report = report;
+        
+        intersectorN.occluded(&args);
+      }
+   
+      /*! Intersects a packet of K rays with primitive primID by calling the user intersect callback with N = K. */
+      template<int K>
+        __forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) 
+      {
+        assert(primID < size());
+        assert(intersectorN.intersect);
+        
+        vint<K> mask = valid.mask32(); // per-lane integer mask handed to the callback through args.valid
+        IntersectFunctionNArguments args;
+        args.valid = (int*)&mask;
+        args.geometryUserPtr = userPtr;
+        args.context = context->user;
+        args.rayhit = (RTCRayHitN*)&ray;
+        args.N = K;
+        args.geomID = geomID;
+        args.primID = primID;
+        args.internal_context = context;
+        args.geometry = this;
+        args.report = report;
+         
+        intersectorN.intersect(&args);
+      }
+
+      /*! Tests a packet of K rays against primitive primID by calling the user occluded callback with N = K. */
+      template<int K>
+        __forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report)
+      {
+        assert(primID < size());
+        assert(intersectorN.occluded);
+        
+        vint<K> mask = valid.mask32(); // per-lane integer mask handed to the callback through args.valid
+        OccludedFunctionNArguments args;
+        args.valid = (int*)&mask;
+        args.geometryUserPtr = userPtr;
+        args.context = context->user;
+        args.ray = (RTCRayN*)&ray;
+        args.N = K;
+        args.geomID = geomID;
+        args.primID = primID;
+        args.internal_context = context;
+        args.geometry = this;
+        args.report = report;
+        
+        intersectorN.occluded(&args);
+      }
+
+    public:
+      RTCBoundsFunction boundsFunc;
+      IntersectorN intersectorN;
+  };
+  
+#define DEFINE_SET_INTERSECTORN(symbol,intersector) /* defines symbol() returning an IntersectorN built from intersector's static intersect/occluded */ \
+  AccelSet::IntersectorN symbol() {                                     \
+    return AccelSet::IntersectorN(intersector::intersect, \
+                                  intersector::occluded, \
+                                  TOSTRING(isa) "::" TOSTRING(symbol)); \
+  } /* NOTE(review): isa is not a macro parameter; presumably defined by the including translation unit - confirm */
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/alloc.cpp b/thirdparty/embree-aarch64/kernels/common/alloc.cpp
new file mode 100644
index 0000000000..6fa406f03a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/alloc.cpp
@@ -0,0 +1,82 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "alloc.h"
+#include "../../common/sys/thread.h"
+#if defined(__aarch64__) && defined(BUILD_IOS)
+#include "../../common/sys/barrier.h"
+#endif
+
+namespace embree
+{
+  __thread FastAllocator::ThreadLocal2* FastAllocator::thread_local_allocator2 = nullptr; // per-thread pointer, filled lazily by threadLocal2()
+  SpinLock FastAllocator::s_thread_local_allocators_lock; // taken by threadLocal2() while appending to the vector below
+  std::vector<std::unique_ptr<FastAllocator::ThreadLocal2>> FastAllocator::s_thread_local_allocators; // owns every ThreadLocal2 created by threadLocal2()
+   
+  struct fast_allocator_regression_test : public RegressionTest // multi-threaded allocate/write/verify test of FastAllocator
+  {
+    BarrierSys barrier; // shared by the worker threads and the thread calling run()
+    std::atomic<size_t> numFailed; // number of read-back mismatches seen by any worker
+    std::unique_ptr<FastAllocator> alloc; // allocator under test, created in run()
+
+    fast_allocator_regression_test() 
+      : RegressionTest("fast_allocator_regression_test"), numFailed(0)
+    {
+      registerRegressionTest(this); // constructing an instance is enough to register the test
+    }
+    // worker body: 1000 rounds, each bracketed by two barrier waits
+    static void thread_alloc(fast_allocator_regression_test* This)
+    {
+      FastAllocator::CachedAllocator threadalloc = This->alloc->getCachedAllocator();
+
+      size_t* ptrs[1000];
+      for (size_t j=0; j<1000; j++)
+      {
+        This->barrier.wait();
+        for (size_t i=0; i<1000; i++) {
+          ptrs[i] = (size_t*) threadalloc.malloc0(sizeof(size_t)+(i%32)); // sizes vary from sizeof(size_t) to sizeof(size_t)+31 bytes
+          *ptrs[i] = size_t(threadalloc.talloc0) + i; // tag built from the talloc0 address plus the index
+        }
+        for (size_t i=0; i<1000; i++) {
+          if (*ptrs[i] != size_t(threadalloc.talloc0) + i) // a changed tag means the block was overwritten
+            This->numFailed++;
+        }
+        This->barrier.wait();
+      }
+    }
+    // runs one worker per logical thread and returns true when no mismatch was counted
+    bool run ()
+    {
+      alloc = make_unique(new FastAllocator(nullptr,false));
+      numFailed.store(0);
+
+      size_t numThreads = getNumberOfLogicalThreads();
+      barrier.init(numThreads+1); // +1 because this thread also waits on the barrier
+
+      /* create threads */
+      std::vector<thread_t> threads;
+      for (size_t i=0; i<numThreads; i++)
+        threads.push_back(createThread((thread_func)thread_alloc,this));
+
+      /* run test: reset the allocator, then wait twice per round to match the two waits of each worker round */ 
+      for (size_t i=0; i<1000; i++)
+      {
+        alloc->reset();
+        barrier.wait();
+        barrier.wait();
+      }
+     
+      /* destroy threads */
+      for (size_t i=0; i<numThreads; i++)
+        join(threads[i]);
+
+      alloc = nullptr; // releases the allocator under test
+
+      return numFailed == 0;
+    }
+  };
+
+  fast_allocator_regression_test fast_allocator_regression; // global instance; its constructor registers the test
+}
+
+
diff --git a/thirdparty/embree-aarch64/kernels/common/alloc.h b/thirdparty/embree-aarch64/kernels/common/alloc.h
new file mode 100644
index 0000000000..488fa707ef
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/alloc.h
@@ -0,0 +1,1006 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "device.h"
+#include "scene.h"
+#include "primref.h"
+
+#if defined(__aarch64__) && defined(BUILD_IOS)
+#include <mutex>
+#endif
+
+namespace embree
+{
+  class FastAllocator
+  {
+    /*! maximum supported alignment */
+    static const size_t maxAlignment = 64;
+
+    /*! maximum allocation size */
+
+    /* default settings */
+    //static const size_t defaultBlockSize = 4096;
+#define maxAllocationSize size_t(2*1024*1024-maxAlignment)
+
+    static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8;
+
+  public:
+
+    struct ThreadLocal2;
+    enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE };
+
+    /*! Per thread structure holding the current memory block; malloc() hands out consecutive aligned pieces of it. */
+    struct __aligned(64) ThreadLocal
+    {
+      ALIGNED_CLASS_(64);
+    public:
+
+      /*! Constructor for usage with ThreadLocalData */
+      __forceinline ThreadLocal (ThreadLocal2* parent) 
+	: parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}
+
+      /*! initialize allocator: drops the current block and zeroes the counters; alloc may be null */
+      void init(FastAllocator* alloc) 
+      {
+        ptr = nullptr;
+	cur = end = 0;
+        bytesUsed = 0;
+        bytesWasted = 0;
+        allocBlockSize = 0;
+        if (alloc) allocBlockSize = alloc->defaultBlockSize;
+      }
+
+      /* Allocate aligned memory from the threads memory block. */
+      __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16) 
+      {
+        /* bind the thread local allocator to the proper FastAllocator*/
+        parent->bind(alloc);
+
+        assert(align <= maxAlignment);
+	bytesUsed += bytes;
+
+        /* try to allocate in local block */
+	size_t ofs = (align - cur) & (align-1); // padding that rounds cur up to a multiple of align (assumes align is a power of two - confirm)
+        cur += bytes + ofs;
+        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
+	cur -= bytes + ofs; // did not fit: undo the advance
+        
+        /* if allocation is too large allocate with parent allocator */
+        if (4*bytes > allocBlockSize) {
+          return alloc->malloc(bytes,maxAlignment,false);
+	}
+
+        /* get new partial block if allocation failed */
+        size_t blockSize = allocBlockSize;
+        ptr = (char*) alloc->malloc(blockSize,maxAlignment,true);
+ 	bytesWasted += end-cur; // cur/end still describe the previous block: its unused tail counts as wasted
+	cur = 0; end = blockSize; // NOTE(review): blockSize is re-read here, presumably because malloc may adjust it - confirm against FastAllocator::malloc
+
+        /* retry allocation */
+	ofs = (align - cur) & (align-1);
+        cur += bytes + ofs;
+        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
+	cur -= bytes + ofs;
+
+        /* get new full block if allocation failed */
+        blockSize = allocBlockSize;
+        ptr = (char*) alloc->malloc(blockSize,maxAlignment,false);
+	bytesWasted += end-cur;
+	cur = 0; end = blockSize;
+
+        /* retry allocation */
+	ofs = (align - cur) & (align-1);
+        cur += bytes + ofs;
+        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
+	cur -= bytes + ofs;
+
+        /* should never happen as large allocations get handled specially above */
+        assert(false);
+        return nullptr;
+      }
+
+      
+      /*! returns amount of used bytes */
+      __forceinline size_t getUsedBytes() const { return bytesUsed; }
+  
+      /*! returns amount of free bytes */
+      __forceinline size_t getFreeBytes() const { return end-cur; }
+      
+      /*! returns amount of wasted bytes */
+      __forceinline size_t getWastedBytes() const { return bytesWasted; }
+  
+    private:
+      ThreadLocal2* parent;
+      char*  ptr;            //!< pointer to memory block
+      size_t cur;            //!< current location of the allocator
+      size_t end;            //!< end of the memory block
+      size_t allocBlockSize; //!< block size for allocations
+      size_t bytesUsed;      //!< number of total bytes allocated
+      size_t bytesWasted;    //!< number of bytes wasted
+    };
+
+    /*! Two thread local structures (alloc0, alloc1) that are bound to one FastAllocator at a time. */
+    struct __aligned(64) ThreadLocal2
+    {
+      ALIGNED_CLASS_(64);
+    public:
+
+      __forceinline ThreadLocal2()
+        : alloc(nullptr), alloc0(this), alloc1(this) {}
+
+      /*! bind to fast allocator: returns immediately if already bound to alloc_i */
+      __forceinline void bind(FastAllocator* alloc_i) 
+      {
+        assert(alloc_i);
+        if (alloc.load() == alloc_i) return;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+        std::scoped_lock lock(mutex);
+#else
+        Lock<SpinLock> lock(mutex);
+#endif
+        //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
+        if (alloc.load()) { // previously bound elsewhere: add our counters to that allocator's statistics first
+          alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
+          alloc.load()->bytesFree   += alloc0.getFreeBytes()   + alloc1.getFreeBytes();
+          alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
+        }
+        alloc0.init(alloc_i);
+        alloc1.init(alloc_i);
+        alloc.store(alloc_i);
+        alloc_i->join(this); // adds this object to alloc_i's thread_local_allocators list
+      }
+
+      /*! unbind from fast allocator: no effect unless currently bound to alloc_i */
+      void unbind(FastAllocator* alloc_i) 
+      {
+        assert(alloc_i);
+        if (alloc.load() != alloc_i) return;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+        std::scoped_lock lock(mutex);
+#else
+        Lock<SpinLock> lock(mutex);
+#endif
+        if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
+        alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
+        alloc.load()->bytesFree   += alloc0.getFreeBytes()   + alloc1.getFreeBytes();
+        alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
+        alloc0.init(nullptr); // init(nullptr) leaves allocBlockSize at 0
+        alloc1.init(nullptr);
+        alloc.store(nullptr);
+      }
+
+    public:
+#if defined(__aarch64__) && defined(BUILD_IOS)
+      std::mutex mutex;
+#else
+      SpinLock mutex;        //!< required as unbind is called from other threads
+#endif
+      std::atomic<FastAllocator*> alloc;  //!< parent allocator
+      ThreadLocal alloc0;
+      ThreadLocal alloc1;
+    };
+
+    FastAllocator (Device* device, bool osAllocation) // osAllocation selects EMBREE_OS_MALLOC, otherwise ALIGNED_MALLOC
+      : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0),
+        growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC),
+        primrefarray(device,0)
+    {
+      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) // every per-slot block pointer starts out null
+      {
+        threadUsedBlocks[i] = nullptr;
+        threadBlocks[i] = nullptr;
+        assert(!slotMutex[i].isLocked());
+      }
+    }
+
+    ~FastAllocator () { // all cleanup is delegated to clear(), which is defined outside this excerpt
+      clear();
+    }
+
+    /*! returns the device pointer passed to the constructor */
+    Device* getDevice() {
+      return device;
+    }
+
+    void share(mvector<PrimRef>& primrefarray_i) { // moves the caller's array into this allocator, leaving primrefarray_i moved-from
+      primrefarray = std::move(primrefarray_i);
+    }
+
+    void unshare(mvector<PrimRef>& primrefarray_o) // counterpart of share(): moves the stored array into primrefarray_o
+    {
+      reset(); // this removes blocks that are allocated inside the shared primref array
+      primrefarray_o = std::move(primrefarray);
+    }
+
+    /*! returns first fast thread local allocator (alloc0 of the calling thread's ThreadLocal2) */
+    __forceinline ThreadLocal* _threadLocal() {
+      return &threadLocal2()->alloc0;
+    }
+
+    void setOSallocation(bool flag) // sets the allocation type: EMBREE_OS_MALLOC when flag is true, else ALIGNED_MALLOC
+    {
+      atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC;
+    }
+
+  private:
+
+    /*! returns both fast thread local allocators of the calling thread, creating them on first use */
+    __forceinline ThreadLocal2* threadLocal2() 
+    {
+      ThreadLocal2* alloc = thread_local_allocator2;
+      if (alloc == nullptr) {
+        thread_local_allocator2 = alloc = new ThreadLocal2;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+        std::scoped_lock lock(s_thread_local_allocators_lock);
+#else
+        Lock<SpinLock> lock(s_thread_local_allocators_lock);
+#endif
+        s_thread_local_allocators.push_back(make_unique(alloc)); // the static vector takes ownership of the new object
+      }
+      return alloc;
+    }
+
+  public:
+
+    __forceinline void join(ThreadLocal2* alloc) //!< appends alloc to this allocator's thread_local_allocators list
+    {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+      std::scoped_lock lock(thread_local_allocators_lock); // member lock declared for thread_local_allocators, same one the non-iOS branch takes
+#else
+      Lock<SpinLock> lock(thread_local_allocators_lock);
+#endif
+      thread_local_allocators.push_back(alloc);
+    }
+
+  public:
+
+    struct CachedAllocator
+    {
+      __forceinline CachedAllocator(void* ptr) // detached allocator, ptr has to be nullptr
+        : alloc(nullptr), talloc0(nullptr), talloc1(nullptr) { assert(ptr == nullptr); }
+
+      /*! caches both thread local allocators; in single mode malloc1 is routed to alloc0 as well */
+      __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc)
+        : alloc(alloc), talloc0(&talloc->alloc0), talloc1(&talloc->alloc1)
+      {
+        if (alloc->use_single_mode) talloc1 = talloc0;
+      }
+
+      __forceinline operator bool () const { return alloc != nullptr; }
+
+      /*! allocates from the first thread local allocator */
+      __forceinline void* malloc0 (size_t bytes, size_t align = 16) const {
+        return talloc0->malloc(alloc,bytes,align);
+      }
+
+      /*! allocates from the second thread local allocator */
+      __forceinline void* malloc1 (size_t bytes, size_t align = 16) const {
+        return talloc1->malloc(alloc,bytes,align);
+      }
+
+      /*! call operator, forwards to malloc0 */
+      __forceinline void* operator() (size_t bytes, size_t align = 16) const { return malloc0(bytes,align); }
+
+    public:
+      FastAllocator* alloc;
+      ThreadLocal* talloc0;
+      ThreadLocal* talloc1;
+    };
+
+    __forceinline CachedAllocator getCachedAllocator() {
+      return CachedAllocator(this,threadLocal2());
+    }
+
+    /*! Builder interface to create thread local allocator */
+    struct Create
+    {
+    public:
+      __forceinline Create (FastAllocator* allocator) : allocator(allocator) {}
+      __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator();  }
+
+    private:
+      FastAllocator* allocator;
+    };
+
+    void internal_fix_used_blocks()
+    {
+      /* splice every per-slot block list onto the front of the global used block list */
+      for (size_t slot = 0; slot < MAX_THREAD_USED_BLOCK_SLOTS; slot++)
+      {
+        while (Block* block = threadBlocks[slot].load()) {
+          Block* const rest = block->next;
+          block->next = usedBlocks.load();
+          usedBlocks = block;
+          threadBlocks[slot] = rest;
+        }
+        threadBlocks[slot] = nullptr;
+      }
+    }
+
+    static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks
+#if defined(__AVX512ER__) // KNL
+    static const size_t mainAllocOverheadStatic  = 15;  //! 15 means ~6.7% allocation overhead through unfilled main alloc blocks
+#else
+    static const size_t mainAllocOverheadStatic  = 20;  //! 20 means 5% allocation overhead through unfilled main alloc blocks
+#endif
+    static const size_t mainAllocOverheadDynamic = 8;  //! 8 means 12.5% allocation overhead through unfilled main alloc blocks
+
+    /* calculates a single threaded threshold for the builders such
+     * that for small scenes the overhead of partly allocated blocks
+     * per thread is low (defaultThreshold is kept if either size input is 0) */
+    size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated)
+    {
+      if (numPrimitives == 0 || bytesEstimated == 0) 
+        return defaultThreshold;
+
+      /* calculate the number of bytes a single thread should handle to fulfill the threadLocalAllocOverhead constraint */
+      const size_t single_mode_factor = use_single_mode ? 1 : 2;
+      const size_t threadCount = TaskScheduler::threadCount();
+      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize;
+
+      /* if we do not have to limit number of threads use optimal threshold */
+      if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount) // left side is bytesEstimated/singleThreadBytes rounded up
+        return defaultThreshold;
+
+      /* otherwise limit number of threads by calculating proper single thread threshold */
+      else {
+        double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives);
+        return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive)); 
+      }
+    }
+
+    __forceinline size_t alignSize(size_t i) {
+      return (i+127) & ~size_t(127); // rounds up to the next multiple of 128
+    }
+
+    /*! derives growSize, maxGrowSize, slotMask and defaultBlockSize from the size estimate; fast selects mainAllocOverheadDynamic instead of mainAllocOverheadStatic */
+    __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast) 
+    {
+      /* we do not need single thread local allocator mode */
+      use_single_mode = false;
+     
+      /* calculate growSize such that at most 1/mainAllocOverhead of the estimate gets wasted when a block stays unused */
+      size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic;
+      size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead);
+      growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize);
+
+      /* if we reached the maxAllocationSize for growSize, we can
+       * increase the number of allocation slots by still guaranteeing
+       * the mainAllocOverhead constraint */
+      slotMask = 0x0;
+
+      if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1;
+      if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3;
+      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7;
+      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */
+
+      /* set the thread local alloc block size */
+      size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment;
+      
+      /* for sufficiently large scene we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */
+#if 0 // we do not do this as a block size of 4160 is for some reason best for KNL
+      const size_t threadCount = TaskScheduler::threadCount();
+      const size_t single_mode_factor = use_single_mode ? 1 : 2;
+      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch;
+      if (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
+        defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize);
+
+      /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */
+        else
+#endif
+        defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch);
+
+      if (bytesEstimated == 0) {
+        maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size
+        defaultBlockSize = defaultBlockSizeSwitch;
+      }
+      log2_grow_size_scale = 0;
+      
+      if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size; // from here on: device settings, when set, override the computed values
+      if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0;
+      if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1;
+      if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3;
+      if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7;
+      if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size;
+      if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc; // -1 keeps the default chosen above
+    }
+
+    /*! initializes the allocator with a first free block of bytesAllocate bytes (bytesReserve reserved); an allocator that already owns blocks is only reset */
+    void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate)
+    {
+      internal_fix_used_blocks();
+      /* distribute the allocation to multiple thread block slots */
+      slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove
+      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
+      if (bytesReserve == 0) bytesReserve = bytesAllocate; // no reserve size given: reserve exactly what gets allocated
+      freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype);
+      estimatedSize = bytesEstimate;
+      initGrowSizeAndNumSlots(bytesEstimate,true);
+    }
+
+    /*! initializes the allocator from a size estimate only, without creating a block; an allocator that already owns blocks is only reset */
+    void init_estimate(size_t bytesEstimate)
+    {
+      internal_fix_used_blocks();
+      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
+      /* remember the estimate and derive grow size, block size and slot count from it */
+      estimatedSize = bytesEstimate;
+      // fast=false selects mainAllocOverheadStatic
+      initGrowSizeAndNumSlots(bytesEstimate,false);
+
+    }
+
+    /*! frees state not required after build */
+    __forceinline void cleanup()
+    {
+      internal_fix_used_blocks();
+
+      /* unbind all thread local allocators */
+      for (auto alloc : thread_local_allocators) alloc->unbind(this);
+      thread_local_allocators.clear();
+    }
+
+    /*! resets the allocator, memory blocks get reused */
+    void reset ()
+    {
+      internal_fix_used_blocks();
+
+      bytesUsed.store(0);
+      bytesFree.store(0);
+      bytesWasted.store(0);
+
+      /* reset all used blocks and move them to begin of free block list */
+      while (usedBlocks.load() != nullptr) {
+        usedBlocks.load()->reset_block();
+        Block* nextUsedBlock = usedBlocks.load()->next;
+        usedBlocks.load()->next = freeBlocks.load();
+        freeBlocks = usedBlocks.load();
+        usedBlocks = nextUsedBlock;
+      }
+
+      /* remove all shared blocks as they are re-added during build */
+      freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load()));
+
+      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
+      {
+        threadUsedBlocks[i] = nullptr;
+        threadBlocks[i] = nullptr;
+      }
+      
+      /* unbind all thread local allocators */
+      for (auto alloc : thread_local_allocators) alloc->unbind(this);
+      thread_local_allocators.clear();
+    }
+
+    /*! releases every block and returns the allocator to its empty state */
+    __forceinline void clear()
+    {
+      cleanup();
+      bytesUsed = 0;
+      bytesFree = 0;
+      bytesWasted = 0;
+      if (Block* used = usedBlocks.exchange(nullptr)) used->clear_list(device);     // list head is always reset to nullptr
+      if (Block* unused = freeBlocks.exchange(nullptr)) unused->clear_list(device); // list head is always reset to nullptr
+      for (size_t slot=0; slot<MAX_THREAD_USED_BLOCK_SLOTS; slot++) {
+        threadBlocks[slot] = nullptr;
+        threadUsedBlocks[slot] = nullptr;
+      }
+      primrefarray.clear();
+    }
+
+    __forceinline size_t incGrowSizeScale()
+    {
+      const size_t shift = min(size_t(16),++log2_grow_size_scale); // atomic pre-increment yields the updated counter; shift is capped at 16
+      return size_t(1) << shift;
+    }
+
+    /*! thread safe allocation of memory from the block of the calling thread's slot; loops until a block satisfies the request (bytes is passed on to Block::malloc by reference) */
+    void* malloc(size_t& bytes, size_t align, bool partial)
+    {
+      assert(align <= maxAlignment);
+
+      while (true)
+      {
+        /* allocate using current block */
+        size_t threadID = TaskScheduler::threadID();
+        size_t slot = threadID & slotMask;
+	Block* myUsedBlocks = threadUsedBlocks[slot];
+        if (myUsedBlocks) {
+          void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
+          if (ptr) return ptr;
+        }
+
+        /* throw error if allocation is too large */
+        if (bytes > maxAllocationSize)
+          throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large");
+
+        /* parallel block creation in case of no freeBlocks, avoids single global mutex */
+        if (likely(freeBlocks.load() == nullptr))
+        {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+          std::scoped_lock lock(slotMutex[slot]);
+#else
+          Lock<SpinLock> lock(slotMutex[slot]);
+#endif
+          if (myUsedBlocks == threadUsedBlocks[slot]) { // only act if the slot still holds the block observed above
+            const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
+            const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
+            assert(allocSize >= bytes);
+            threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
+            // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail.
+          }
+          continue;
+        }
+
+        /* otherwise take the next block from the free list, or create a new one, under the global mutex */
+        {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+            std::scoped_lock lock(mutex);
+#else
+            Lock<SpinLock> lock(mutex);
+#endif
+	  if (myUsedBlocks == threadUsedBlocks[slot]) // only act if the slot still holds the block observed above
+	  {
+            if (freeBlocks.load() != nullptr) {
+	      Block* nextFreeBlock = freeBlocks.load()->next;
+	      freeBlocks.load()->next = usedBlocks;
+	      __memory_barrier();
+	      usedBlocks = freeBlocks.load();
+              threadUsedBlocks[slot] = freeBlocks.load();
+	      freeBlocks = nextFreeBlock;
+	    } else {
+              const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize);
+	      usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
+	    }
+          }
+        }
+      }
+    }
+
+    /*! adds the caller provided memory region [ptr,ptr+bytes) as a SHARED block at the front of the free block list */
+    void addBlock(void* ptr, ssize_t bytes)
+    {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+      std::scoped_lock lock(mutex);
+#else
+      Lock<SpinLock> lock(mutex);
+#endif
+      const size_t sizeof_Header = offsetof(Block,data[0]);
+      void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1)); // round ptr up to maxAlignment
+      size_t ofs = (size_t) aptr - (size_t) ptr; // bytes skipped for alignment, stored as the block's wasted amount
+      bytes -= ofs;
+      if (bytes < 4096) return; // ignore empty or very small blocks
+      freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs);
+    }
+
+    /* special allocation only used from morton builder only a single time for each build */
+    void* specialAlloc(size_t bytes)
+    {
+      assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes);
+      return freeBlocks.load()->ptr();
+    }
+
+    struct Statistics
+    {
+      Statistics ()
+      : bytesUsed(0), bytesFree(0), bytesWasted(0) {}
+
+      Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted)
+      : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {}
+
+      Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false)
+      : bytesUsed(0), bytesFree(0), bytesWasted(0)
+      {
+        Block* usedBlocks = alloc->usedBlocks.load();
+        Block* freeBlocks = alloc->freeBlocks.load();
+        if (usedBlocks) bytesUsed += usedBlocks->getUsedBytes(atype,huge_pages);
+        if (freeBlocks) bytesFree += freeBlocks->getAllocatedBytes(atype,huge_pages);
+        if (usedBlocks) bytesFree += usedBlocks->getFreeBytes(atype,huge_pages);
+        if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages);
+        if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages);
+      }
+
+      std::string str(size_t numPrimitives)
+      {
+        std::stringstream str;
+        str.setf(std::ios::fixed, std::ios::floatfield);
+        str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
+            << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
+            << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "            
+            << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, "
+            << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives);
+        return str.str();
+      }
+
+      friend Statistics operator+ ( const Statistics& a, const Statistics& b)
+      {
+        return Statistics(a.bytesUsed+b.bytesUsed,
+                          a.bytesFree+b.bytesFree,
+                          a.bytesWasted+b.bytesWasted);
+      }
+
+      size_t bytesAllocatedTotal() const {
+        return bytesUsed + bytesFree + bytesWasted;
+      }
+
+    public:
+      size_t bytesUsed;
+      size_t bytesFree;
+      size_t bytesWasted;
+    };
+
+    Statistics getStatistics(AllocationType atype, bool huge_pages = false) {
+      return Statistics(this,atype,huge_pages);
+    }
+
+    size_t getUsedBytes() {
+      return bytesUsed;
+    }
+
+    size_t getWastedBytes() {
+      return bytesWasted;
+    }
+
+    struct AllStatistics
+    {
+      AllStatistics (FastAllocator* alloc)
+
+      : bytesUsed(alloc->bytesUsed),
+        bytesFree(alloc->bytesFree),
+        bytesWasted(alloc->bytesWasted),
+        stat_all(alloc,ANY_TYPE),
+        stat_malloc(alloc,ALIGNED_MALLOC),
+        stat_4K(alloc,EMBREE_OS_MALLOC,false),
+        stat_2M(alloc,EMBREE_OS_MALLOC,true),
+        stat_shared(alloc,SHARED) {}
+
+      AllStatistics (size_t bytesUsed,
+                     size_t bytesFree,
+                     size_t bytesWasted,
+                     Statistics stat_all,
+                     Statistics stat_malloc,
+                     Statistics stat_4K,
+                     Statistics stat_2M,
+                     Statistics stat_shared)
+
+      : bytesUsed(bytesUsed),
+        bytesFree(bytesFree),
+        bytesWasted(bytesWasted),
+        stat_all(stat_all),
+        stat_malloc(stat_malloc),
+        stat_4K(stat_4K),
+        stat_2M(stat_2M),
+        stat_shared(stat_shared) {}
+
+      friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b)
+      {
+        return AllStatistics(a.bytesUsed+b.bytesUsed,
+                             a.bytesFree+b.bytesFree,
+                             a.bytesWasted+b.bytesWasted,
+                             a.stat_all + b.stat_all,
+                             a.stat_malloc + b.stat_malloc,
+                             a.stat_4K + b.stat_4K,
+                             a.stat_2M + b.stat_2M,
+                             a.stat_shared + b.stat_shared);
+      }
+
+      void print(size_t numPrimitives)
+      {
+        std::stringstream str0;
+        str0.setf(std::ios::fixed, std::ios::floatfield);
+        str0 << "  alloc : " 
+             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
+             << "                                                            " 
+             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives);
+        std::cout << str0.str() << std::endl;
+      
+        std::stringstream str1;
+        str1.setf(std::ios::fixed, std::ios::floatfield);
+        str1 << "  alloc : " 
+             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
+             << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "            
+             << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "            
+             << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, "
+             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives);
+        std::cout << str1.str() << std::endl;
+     
+        std::cout << "  total : " << stat_all.str(numPrimitives) << std::endl;
+        std::cout << "  4K    : " << stat_4K.str(numPrimitives) << std::endl;
+        std::cout << "  2M    : " << stat_2M.str(numPrimitives) << std::endl;
+        std::cout << "  malloc: " << stat_malloc.str(numPrimitives) << std::endl;
+        std::cout << "  shared: " << stat_shared.str(numPrimitives) << std::endl;
+      }
+
+    private:
+      size_t bytesUsed;
+      size_t bytesFree;
+      size_t bytesWasted;
+      Statistics stat_all;
+      Statistics stat_malloc;
+      Statistics stat_4K;
+      Statistics stat_2M;
+      Statistics stat_shared;
+    };
+
+    void print_blocks()
+    {
+      std::cout << "  estimatedSize = " << estimatedSize << ", slotMask = " << slotMask << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize << ", defaultBlockSize = " << defaultBlockSize << std::endl;
+
+      std::cout << "  used blocks = ";
+      if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list();
+      std::cout << "[END]" << std::endl;
+
+      std::cout << "  free blocks = ";
+      if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list();
+      std::cout << "[END]" << std::endl;
+    }
+
+  private:
+
+    struct Block
+    {
+      static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
+      {
+        /* We avoid using os_malloc for small blocks as this could
+         * cause a risk of fragmenting the virtual address space and
+         * reach the limit of vm.max_map_count = 65k under Linux. */
+        if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize)
+          atype = ALIGNED_MALLOC;
+
+        /* we need to additionally allocate some header */
+        const size_t sizeof_Header = offsetof(Block,data[0]);
+        bytesAllocate = sizeof_Header+bytesAllocate;
+        bytesReserve  = sizeof_Header+bytesReserve;
+
+        /* consume full 4k pages with using os_malloc */
+        if (atype == EMBREE_OS_MALLOC) {
+          bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
+          bytesReserve  = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
+        }
+
+        /* either use alignedMalloc or os_malloc */
+        void *ptr = nullptr;
+        if (atype == ALIGNED_MALLOC)
+        {
+          /* special handling for default block size */
+          if (bytesAllocate == (2*PAGE_SIZE_2M))
+          {
+            const size_t alignment = maxAlignment;
+            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
+            ptr = alignedMalloc(bytesAllocate,alignment);
+
+            /* give hint to transparently convert these pages to 2MB pages */
+            const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1);
+            os_advise((void*)(ptr_aligned_begin +              0),PAGE_SIZE_2M); // may fail if no memory mapped before block
+            os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M);
+            os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block
+
+            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
+          }
+          else
+          {
+            const size_t alignment = maxAlignment;
+            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
+            ptr = alignedMalloc(bytesAllocate,alignment);
+            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
+          }
+        }
+        else if (atype == EMBREE_OS_MALLOC)
+        {
+          if (device) device->memoryMonitor(bytesAllocate,false);
+          bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
+          return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
+        }
+        else
+          assert(false);
+
+        return NULL;
+      }
+
+      Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false)
+      : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages)
+      {
+        assert((((size_t)&data[0]) & (maxAlignment-1)) == 0);
+      }
+
+      static Block* remove_shared_blocks(Block* head)
+      {
+        Block** prev_next = &head;
+        for (Block* block = head; block; block = block->next) {
+          if (block->atype == SHARED) *prev_next = block->next;
+          else                         prev_next = &block->next;
+        }
+        return head;
+      }
+
+      void clear_list(MemoryMonitorInterface* device)
+      {
+        Block* block = this;
+        while (block) {
+          Block* next = block->next;
+          block->clear_block(device);
+          block = next;
+        }
+      }
+
+      void clear_block (MemoryMonitorInterface* device)
+      {
+        const size_t sizeof_Header = offsetof(Block,data[0]);
+        const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes();
+
+        if (atype == ALIGNED_MALLOC) {
+          alignedFree(this);
+          if (device) device->memoryMonitor(-sizeof_Alloced,true);
+        }
+
+        else if (atype == EMBREE_OS_MALLOC) {
+         size_t sizeof_This = sizeof_Header+reserveEnd;
+         os_free(this,sizeof_This,huge_pages);
+         if (device) device->memoryMonitor(-sizeof_Alloced,true);
+        }
+
+        else /* if (atype == SHARED) */ {
+        }
+      }
+
+      void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial)
+      {
+        size_t bytes = bytes_in;
+        assert(align <= maxAlignment);
+        bytes = (bytes+(align-1)) & ~(align-1);
+	if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr;
+	const size_t i = cur.fetch_add(bytes);
+        if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr;
+        if (unlikely(i > reserveEnd)) return nullptr;
+        bytes_in = bytes = min(bytes,reserveEnd-i);
+        
+	if (i+bytes > allocEnd) {
+          if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true);
+        }
+	return &data[i];
+      }
+
+      void* ptr() {
+        return &data[cur];
+      }
+
+      void reset_block ()
+      {
+        allocEnd = max(allocEnd,(size_t)cur);
+        cur = 0;
+      }
+
+      size_t getBlockUsedBytes() const {
+        return min(size_t(cur),reserveEnd);
+      }
+
+      size_t getBlockFreeBytes() const {
+	return getBlockAllocatedBytes() - getBlockUsedBytes();
+      }
+
+      size_t getBlockAllocatedBytes() const {
+        return min(max(allocEnd,size_t(cur)),reserveEnd);
+      }
+
+      size_t getBlockWastedBytes() const {
+        const size_t sizeof_Header = offsetof(Block,data[0]);
+        return sizeof_Header + wasted;
+      }
+
+      size_t getBlockReservedBytes() const {
+        return reserveEnd;
+      }
+  
+      bool hasType(AllocationType atype_i, bool huge_pages_i) const
+      {
+        if      (atype_i == ANY_TYPE ) return true;
+        else if (atype   == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
+        else                           return atype_i == atype;
+      }
+
+      size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const {
+        size_t bytes = 0;
+        for (const Block* block = this; block; block = block->next) {
+          if (!block->hasType(atype,huge_pages)) continue;
+          bytes += block->getBlockUsedBytes();
+        }
+        return bytes;
+      }
+
+      size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const {
+        size_t bytes = 0;
+        for (const Block* block = this; block; block = block->next) {
+          if (!block->hasType(atype,huge_pages)) continue;
+          bytes += block->getBlockFreeBytes();
+        }
+        return bytes;
+      }
+
+      size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const {
+        size_t bytes = 0;
+        for (const Block* block = this; block; block = block->next) {
+          if (!block->hasType(atype,huge_pages)) continue;
+          bytes += block->getBlockWastedBytes();
+        }
+        return bytes;
+      }
+
+      size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const {
+        size_t bytes = 0;
+        for (const Block* block = this; block; block = block->next) {
+          if (!block->hasType(atype,huge_pages)) continue;
+          bytes += block->getBlockAllocatedBytes();
+        }
+        return bytes;
+      }
+
+      void print_list ()
+      {
+        for (const Block* block = this; block; block = block->next)
+          block->print_block();
+      }
+
+      void print_block() const
+      {
+        if (atype == ALIGNED_MALLOC) std::cout << "A";
+        else if (atype == EMBREE_OS_MALLOC) std::cout << "O";
+        else if (atype == SHARED) std::cout << "S";
+        if (huge_pages) std::cout << "H";
+        size_t bytesUsed = getBlockUsedBytes();
+        size_t bytesFree = getBlockFreeBytes();
+        size_t bytesWasted = getBlockWastedBytes();
+        std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] ";
+      }
+
+    public:
+      std::atomic<size_t> cur;        //!< current location of the allocator
+      std::atomic<size_t> allocEnd;   //!< end of the allocated memory region
+      std::atomic<size_t> reserveEnd; //!< end of the reserved memory region
+      Block* next;               //!< pointer to next block in list
+      size_t wasted;             //!< amount of memory wasted through block alignment
+      AllocationType atype;      //!< allocation mode of the block
+      bool huge_pages;           //!< whether the block uses huge pages
+      char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment
+      char data[1];              //!< here starts memory to use for allocations
+    };
+
+  private:
+    Device* device;
+    SpinLock mutex;
+    size_t slotMask;
+    std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
+    std::atomic<Block*> usedBlocks;
+    std::atomic<Block*> freeBlocks;
+
+    std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
+#else
+    SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
+#endif
+
+    bool use_single_mode;
+    size_t defaultBlockSize;
+    size_t estimatedSize;
+    size_t growSize;
+    size_t maxGrowSize;
+    std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove
+    std::atomic<size_t> bytesUsed;
+    std::atomic<size_t> bytesFree;
+    std::atomic<size_t> bytesWasted;
+    static __thread ThreadLocal2* thread_local_allocator2;
+    static SpinLock s_thread_local_allocators_lock;
+    static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::mutex thread_local_allocators_lock;
+#else
+    SpinLock thread_local_allocators_lock;
+#endif
+    std::vector<ThreadLocal2*> thread_local_allocators;
+    AllocationType atype;
+    mvector<PrimRef> primrefarray;     //!< primrefarray used to allocate nodes
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/buffer.h b/thirdparty/embree-aarch64/kernels/common/buffer.h
new file mode 100644
index 0000000000..02d319c59d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/buffer.h
@@ -0,0 +1,263 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "device.h"
+
+namespace embree
+{
+  /*! API data buffer object: either owns an allocation made by alloc() or wraps application memory (shared). */
+  class Buffer : public RefCount
+  {
+  public:
+    /*! creates an empty buffer: no device, no storage, zero bytes, not shared */
+    Buffer() 
+      : device(nullptr), ptr(nullptr), numBytes(0), shared(false) {}
+
+    /*! takes a device reference; wraps ptr_in as shared memory if non-null, otherwise allocates numBytes_in bytes */
+    Buffer(Device* device, size_t numBytes_in, void* ptr_in = nullptr)
+      : device(device), numBytes(numBytes_in)
+    {
+      device->refInc();
+      
+      if (ptr_in)
+      {
+        shared = true;
+        ptr = (char*)ptr_in;
+      }
+      else
+      {
+        shared = false;
+        alloc();
+      }
+    }
+    
+    /*! frees owned storage and drops the device reference, if a device is set */
+    ~Buffer() {
+      free();
+      if (device) device->refDec(); // device is nullptr for a default-constructed buffer
+    }
+    
+    /*! this class is not copyable */
+  private:
+    Buffer(const Buffer& other) DELETED; // do not implement
+    Buffer& operator =(const Buffer& other) DELETED; // do not implement
+    
+  public:
+    /*! re-initializes the buffer via init() and then allocates numBytes_in bytes of owned storage */
+    void create(Device* device_in, size_t numBytes_in)
+    {
+      init(device_in, numBytes_in);
+      alloc();
+    }
+    
+    /*! frees current storage and resets to an unallocated, non-shared buffer. NOTE(review): device_in is stored without refInc() although the destructor calls refDec() - confirm callers balance the count */
+    void init(Device* device_in, size_t numBytes_in)
+    {
+      free();
+      device = device_in;
+      ptr = nullptr;
+      numBytes = numBytes_in;
+      shared = false;
+    }
+
+    /*! frees current storage and wraps ptr_in as shared memory; numBytes is left unchanged when numBytes_in is (size_t)-1 */
+    void set(Device* device_in, void* ptr_in, size_t numBytes_in)
+    {
+      free();
+      device = device_in;
+      ptr = (char*)ptr_in;
+      if (numBytes_in != (size_t)-1)
+        numBytes = numBytes_in;
+      shared = true;
+    }
+    
+    /*! reports bytes() to the device memory monitor (if a device is set) and allocates the size rounded up to a multiple of 16 */
+    void alloc()
+    {
+      if (device)
+        device->memoryMonitor(this->bytes(), false);
+      size_t b = (this->bytes()+15) & ssize_t(-16);
+      ptr = (char*)alignedMalloc(b,16);
+    }
+    
+    /*! frees owned storage and reports -bytes() to the device memory monitor; does nothing for shared buffers */
+    void free()
+    {
+      if (shared) return;
+      alignedFree(ptr); 
+      if (device)
+        device->memoryMonitor(-ssize_t(this->bytes()), true);
+      ptr = nullptr;
+    }
+    
+    /*! gets buffer pointer; throws RTC_ERROR_INVALID_ARGUMENT if no device is set */
+    void* data()
+    {
+      /* report error if buffer is not existing */
+      if (!device)
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer specified");
+      
+      /* return buffer */
+      return ptr;
+    }
+
+    /*! returns pointer to first element (no device check, unlike data()) */
+    __forceinline char* getPtr() const {
+      return ptr;
+    }
+
+    /*! returns the number of bytes of the buffer */
+    __forceinline size_t bytes() const { 
+      return numBytes;
+    }
+    
+    /*! returns true if the buffer pointer is non-null */
+    __forceinline operator bool() const { 
+      return ptr; 
+    }
+
+  public:
+    Device* device;  //!< device to report memory usage to
+    char* ptr;       //!< pointer to buffer data
+    size_t numBytes; //!< number of bytes in the buffer
+    bool shared;     //!< set if memory is shared with application
+  };
+
+  /*! An untyped contiguous range of a buffer. This class does not own the buffer content but holds a Ref to the parent Buffer. */
+  class RawBufferView
+  {
+  public:
+    /*! creates an empty view: null pointer, zero stride and size, undefined format, flagged as modified */
+    RawBufferView()
+      : ptr_ofs(nullptr), stride(0), num(0), format(RTC_FORMAT_UNDEFINED), modCounter(1), modified(true), userData(0) {}
+
+  public:
+    /*! points the view at buffer_in starting offset_in bytes in; bumps modCounter and sets the modified flag; throws RTC_ERROR_INVALID_ARGUMENT if the range check fails */
+    void set(const Ref<Buffer>& buffer_in, size_t offset_in, size_t stride_in, size_t num_in, RTCFormat format_in)
+    {
+      if ((offset_in + stride_in * num_in) > (stride_in * buffer_in->numBytes)) // NOTE(review): the bound is numBytes scaled by the stride, not numBytes itself - confirm this is intended
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "buffer range out of bounds");
+
+      ptr_ofs = buffer_in->ptr + offset_in;
+      stride = stride_in;
+      num = num_in;
+      format = format_in;
+      modCounter++;
+      modified = true;
+      buffer = buffer_in;
+    }
+
+    /*! returns pointer to the first element */
+    __forceinline char* getPtr() const {
+      return ptr_ofs;
+    }
+
+    /*! returns pointer to the i'th element (i is only range checked by assert) */
+    __forceinline char* getPtr(size_t i) const
+    {
+      assert(i<num);
+      return ptr_ofs + i*stride;
+    }
+
+    /*! returns the number of elements of the buffer */
+    __forceinline size_t size() const { 
+      return num; 
+    }
+
+    /*! returns the number of bytes covered by the view (num*stride) */
+    __forceinline size_t bytes() const { 
+      return num*stride; 
+    }
+    
+    /*! returns the buffer stride narrowed to unsigned */
+    __forceinline unsigned getStride() const
+    {
+      assert(stride <= unsigned(inf));
+      return unsigned(stride);
+    }
+
+    /*! return the buffer format */
+    __forceinline RTCFormat getFormat() const {
+      return format;
+    }
+
+    /*! marks the view as modified: bumps modCounter and sets the local modified flag */
+    __forceinline void setModified() {
+      modCounter++;
+      modified = true;
+    }
+
+    /*! returns true if modCounter is greater than otherModCounter */
+    __forceinline bool isModified(unsigned int otherModCounter) const {
+      return modCounter > otherModCounter;
+    }
+
+     /*! returns the local modified flag */
+    __forceinline bool isLocalModified() const {
+      return modified;
+    }
+
+    /*! clear local modified flag */
+    __forceinline void clearLocalModified() {
+      modified = false;
+    }
+
+    /*! returns true if the view pointer is non-null */
+    __forceinline operator bool() const { 
+      return ptr_ofs; 
+    }
+
+    /*! checks padding to 16 bytes by reading the 4th int of the last element, fails hard */
+    __forceinline void checkPadding16() const
+    {
+      if (ptr_ofs && num)
+        volatile int MAYBE_UNUSED w = *((int*)getPtr(size()-1)+3); // FIXME: is failing hard avoidable?
+    }
+
+  public:
+    char* ptr_ofs;      //!< base pointer plus offset
+    size_t stride;      //!< stride of the buffer in bytes
+    size_t num;         //!< number of elements in the buffer
+    RTCFormat format;   //!< format of the buffer
+    unsigned int modCounter; //!< version ID of this buffer
+    bool modified;      //!< local modified data
+    int userData;       //!< special data
+    Ref<Buffer> buffer; //!< reference to the parent buffer
+  };
+
+  /*! Typed view onto a contiguous buffer range. The buffer content is not owned by this class. */
+  template<typename T>
+  class BufferView : public RawBufferView
+  {
+  public:
+    typedef T value_type;
+
+    /*! element access: treats the bytes at ptr_ofs + i*stride as a T (i is only range checked by assert) */
+    __forceinline       T& operator [](size_t i)       { assert(i<num); char* const elem = ptr_ofs + i*stride; return *(T*)elem; }
+    __forceinline const T& operator [](size_t i) const { assert(i<num); char* const elem = ptr_ofs + i*stride; return *(T*)elem; }
+  };
+
+  template<>
+  class BufferView<Vec3fa> : public RawBufferView
+  {
+  public:
+    typedef Vec3fa value_type;
+    /*! reads the i'th element through vfloat4::loadu and returns it by value */
+    __forceinline const Vec3fa operator [](size_t i) const
+    {
+      assert(i<num);
+      float* const elem = (float*)(ptr_ofs + i*stride);
+      return Vec3fa(vfloat4::loadu(elem));
+    }
+    /*! writes v to the i'th element through vfloat4::storeu */
+    __forceinline void store(size_t i, const Vec3fa& v)
+    {
+      assert(i<num);
+      float* const elem = (float*)(ptr_ofs + i*stride);
+      vfloat4::storeu(elem, (vfloat4)v);
+    }
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/builder.h b/thirdparty/embree-aarch64/kernels/common/builder.h
new file mode 100644
index 0000000000..d2a1cfe3ce
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/builder.h
@@ -0,0 +1,60 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "accel.h"
+
+namespace embree
+{
+#define MODE_HIGH_QUALITY (1<<8)
+
+  /*! virtual interface for all hierarchy builders; build() and clear() must be implemented by subclasses */
+  class Builder : public RefCount {
+  public:
+
+    static const size_t DEFAULT_SINGLE_THREAD_THRESHOLD = 1024;
+
+    /*! initiates the hierarchy builder */
+    virtual void build() = 0;
+
+    /*! notifies the builder about the deletion of some geometry; the default implementation does nothing */
+    virtual void deleteGeometry(size_t geomID) {};
+
+    /*! clears internal builder state */
+    virtual void clear() = 0;
+  };
+
+  /*! virtual interface for progress monitor class; implementations receive a size_t amount dn per call */
+  struct BuildProgressMonitor {
+    virtual void operator() (size_t dn) const = 0;
+  };
+
+  /*! adapts a callable to the BuildProgressMonitor interface; the closure is copied and stored by value */
+  template<typename Closure>
+    struct ProgressMonitorClosure : BuildProgressMonitor
+  {
+  public:
+    ProgressMonitorClosure (const Closure& closure) : closure(closure) {}
+    void operator() (size_t dn) const override { closure(dn); } // forwards dn to the stored closure
+  private:
+    const Closure closure;
+  };
+  template<typename Closure> __forceinline const ProgressMonitorClosure<Closure> BuildProgressMonitorFromClosure(const Closure& closure) {
+    return ProgressMonitorClosure<Closure>(closure); // helper so that the Closure type is deduced at the call site
+  }
+
+  struct LineSegments;
+  struct TriangleMesh;
+  struct QuadMesh;
+  struct UserGeometry;
+
+  class Scene;
+
+  typedef void (*createLineSegmentsAccelTy)(Scene* scene, LineSegments* mesh, AccelData*& accel, Builder*& builder);
+  typedef void (*createTriangleMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder);
+  typedef void (*createQuadMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder);
+  typedef void (*createUserGeometryAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder);
+
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/context.h b/thirdparty/embree-aarch64/kernels/common/context.h
new file mode 100644
index 0000000000..d0185a74f2
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/context.h
@@ -0,0 +1,131 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "rtcore.h"
+#include "point_query.h"
+
+namespace embree
+{
+  class Scene;
+
+  /*! Bundles the scene pointer with the user supplied RTCIntersectContext. */
+  struct IntersectContext
+  {
+  public:
+    /*! stores both pointers as given */
+    __forceinline IntersectContext(Scene* scene, RTCIntersectContext* user_context)
+      : scene(scene), user(user_context) {}
+
+    /*! true if the user context has a non-null filter */
+    __forceinline bool hasContextFilter() const { return nullptr != user->filter; }
+
+    /*! passes the user context flags to embree::isCoherent */
+    __forceinline bool isCoherent() const { return embree::isCoherent(user->flags); }
+
+    /*! passes the user context flags to embree::isIncoherent */
+    __forceinline bool isIncoherent() const { return embree::isIncoherent(user->flags); }
+
+  public:
+    /*! the two pointers handed to the constructor */
+    Scene* scene;
+    RTCIntersectContext* user;
+  };
+
+  template<int M, typename Geometry>
+      __forceinline Vec4vf<M> enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3vf<M>& ray_org, const Vec4vf<M>& v)
+    {
+#if RTC_MIN_WIDTH
+      const vfloat<M> dist = length(Vec3vf<M>(v) - ray_org); // distance of the point from the ray origin
+      const vfloat<M> radius = clamp(context->user->minWidthDistanceFactor*dist, v.w, geom->maxRadiusScale*v.w);
+      return Vec4vf<M>(v.x,v.y,v.z,radius);
+#else
+      return v; // RTC_MIN_WIDTH off: v is returned unchanged
+#endif
+    }
+
+    template<typename Geometry>
+    __forceinline Vec3ff enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3fa& ray_org, const Vec3ff& v)
+  {
+#if RTC_MIN_WIDTH
+    const float dist = length(Vec3fa(v) - ray_org); // distance of the point from the ray origin
+    const float radius = clamp(context->user->minWidthDistanceFactor*dist, v.w, geom->maxRadiusScale*v.w);
+    return Vec3ff(v.x,v.y,v.z,radius);
+#else
+    return v; // RTC_MIN_WIDTH off: v is returned unchanged
+#endif
+  }
+  
+  enum PointQueryType // stored as query_type in PointQueryContext
+  {
+    POINT_QUERY_TYPE_UNDEFINED = 0,
+    POINT_QUERY_TYPE_SPHERE = 1,
+    POINT_QUERY_TYPE_AABB = 2, // PointQueryContext calls updateAABB() on construction for this type
+  };
+
+  typedef bool (*PointQueryFunction)(struct RTCPointQueryFunctionArguments* args);
+  
+  struct PointQueryContext // per-query state: the scene, the world space query, the callback and the user context
+  {
+  public:
+    __forceinline PointQueryContext(Scene* scene, 
+                                    PointQuery* query_ws, 
+                                    PointQueryType query_type,
+                                    PointQueryFunction func, 
+                                    RTCPointQueryContext* userContext,
+                                    float similarityScale,
+                                    void* userPtr)
+      : scene(scene)
+      , query_ws(query_ws)
+      , query_type(query_type)
+      , func(func)
+      , userContext(userContext)
+      , similarityScale(similarityScale)
+      , userPtr(userPtr) 
+      , primID(RTC_INVALID_GEOMETRY_ID)
+      , geomID(RTC_INVALID_GEOMETRY_ID)
+      , query_radius(query_ws->radius) // starts as the world space radius; updateAABB() may replace it
+    { 
+      if (query_type == POINT_QUERY_TYPE_AABB) {
+        assert(similarityScale == 0.f); // AABB queries are expected to be created with a zero scale
+        updateAABB();
+      }
+      if (userContext->instStackSize == 0) {
+        assert(similarityScale == 1.f); // an empty instance stack is expected to come with a unit scale
+      }
+    }
+
+  public:
+    __forceinline void updateAABB() // recomputes query_radius from query_ws->radius and the innermost world2inst transform
+    {
+      if (likely(query_ws->radius == (float)inf || userContext->instStackSize == 0)) {
+        query_radius = Vec3fa(query_ws->radius); // infinite radius or empty instance stack: use the radius as is
+        return;
+      }
+
+      const AffineSpace3fa m = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[userContext->instStackSize-1]);
+      BBox3fa bbox(Vec3fa(-query_ws->radius), Vec3fa(query_ws->radius));
+      bbox = xfmBounds(m, bbox);
+      query_radius = 0.5f * (bbox.upper - bbox.lower); // half extents of the transformed box
+    }
+
+public:
+    Scene* scene;
+
+    PointQuery* query_ws; // the original world space point query 
+    PointQueryType query_type;
+    PointQueryFunction func;
+    RTCPointQueryContext* userContext;
+    const float similarityScale;
+
+    void* userPtr;
+
+    unsigned int primID; // initialized to RTC_INVALID_GEOMETRY_ID by the constructor
+    unsigned int geomID; // initialized to RTC_INVALID_GEOMETRY_ID by the constructor
+
+    Vec3fa query_radius;  // used if the query is converted to an AABB internally
+  };
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/common/default.h b/thirdparty/embree-aarch64/kernels/common/default.h
new file mode 100644
index 0000000000..709119163b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/default.h
@@ -0,0 +1,273 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../common/sys/platform.h"
+#include "../../common/sys/sysinfo.h"
+#include "../../common/sys/thread.h"
+#include "../../common/sys/alloc.h"
+#include "../../common/sys/ref.h"
+#include "../../common/sys/intrinsics.h"
+#include "../../common/sys/atomic.h"
+#include "../../common/sys/mutex.h"
+#include "../../common/sys/vector.h"
+#include "../../common/sys/array.h"
+#include "../../common/sys/string.h"
+#include "../../common/sys/regression.h"
+#include "../../common/sys/vector.h"
+
+#include "../../common/math/math.h"
+#include "../../common/math/transcendental.h"
+#include "../../common/simd/simd.h"
+#include "../../common/math/vec2.h"
+#include "../../common/math/vec3.h"
+#include "../../common/math/vec4.h"
+#include "../../common/math/vec2fa.h"
+#include "../../common/math/vec3fa.h"
+#include "../../common/math/interval.h"
+#include "../../common/math/bbox.h"
+#include "../../common/math/obbox.h"
+#include "../../common/math/lbbox.h"
+#include "../../common/math/linearspace2.h"
+#include "../../common/math/linearspace3.h"
+#include "../../common/math/affinespace.h"
+#include "../../common/math/range.h"
+#include "../../common/lexers/tokenstream.h"
+
+#include "../../common/tasking/taskscheduler.h"
+
+#define COMMA ,
+
+#include "../config.h"
+#include "isa.h"
+#include "stat.h"
+#include "profile.h"
+#include "rtcore.h"
+#include "vector.h"
+#include "state.h"
+#include "instance_stack.h"
+
+#include <vector>
+#include <map>
+#include <algorithm>
+#include <functional>
+#include <utility>
+#include <sstream>
+
+#if !defined(_DEBUG) && defined(BUILD_IOS)
+#undef assert
+#define assert(_EXPR)
+#endif
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Vec2 shortcuts
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int N> using Vec2vf  = Vec2<vfloat<N>>;
+  template<int N> using Vec2vd  = Vec2<vdouble<N>>;
+  template<int N> using Vec2vr  = Vec2<vreal<N>>;
+  template<int N> using Vec2vi  = Vec2<vint<N>>;
+  template<int N> using Vec2vl  = Vec2<vllong<N>>;
+  template<int N> using Vec2vb  = Vec2<vbool<N>>;
+  template<int N> using Vec2vbf = Vec2<vboolf<N>>;
+  template<int N> using Vec2vbd = Vec2<vboold<N>>;
+
+  typedef Vec2<vfloat4>  Vec2vf4;
+  typedef Vec2<vdouble4> Vec2vd4;
+  typedef Vec2<vreal4>   Vec2vr4;
+  typedef Vec2<vint4>    Vec2vi4;
+  typedef Vec2<vllong4>  Vec2vl4;
+  typedef Vec2<vbool4>   Vec2vb4;
+  typedef Vec2<vboolf4>  Vec2vbf4;
+  typedef Vec2<vboold4>  Vec2vbd4;
+
+  typedef Vec2<vfloat8>  Vec2vf8;
+  typedef Vec2<vdouble8> Vec2vd8;
+  typedef Vec2<vreal8>   Vec2vr8;
+  typedef Vec2<vint8>    Vec2vi8;
+  typedef Vec2<vllong8>  Vec2vl8;
+  typedef Vec2<vbool8>   Vec2vb8;
+  typedef Vec2<vboolf8>  Vec2vbf8;
+  typedef Vec2<vboold8>  Vec2vbd8;
+
+  typedef Vec2<vfloat16>  Vec2vf16;
+  typedef Vec2<vdouble16> Vec2vd16;
+  typedef Vec2<vreal16>   Vec2vr16;
+  typedef Vec2<vint16>    Vec2vi16;
+  typedef Vec2<vllong16>  Vec2vl16;
+  typedef Vec2<vbool16>   Vec2vb16;
+  typedef Vec2<vboolf16>  Vec2vbf16;
+  typedef Vec2<vboold16>  Vec2vbd16;
+
+  typedef Vec2<vfloatx>  Vec2vfx;
+  typedef Vec2<vdoublex> Vec2vdx;
+  typedef Vec2<vrealx>   Vec2vrx;
+  typedef Vec2<vintx>    Vec2vix;
+  typedef Vec2<vllongx>  Vec2vlx;
+  typedef Vec2<vboolx>   Vec2vbx;
+  typedef Vec2<vboolfx>  Vec2vbfx;
+  typedef Vec2<vbooldx>  Vec2vbdx;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Vec3 shortcuts
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int N> using Vec3vf  = Vec3<vfloat<N>>;
+  template<int N> using Vec3vd  = Vec3<vdouble<N>>;
+  template<int N> using Vec3vr  = Vec3<vreal<N>>;
+  template<int N> using Vec3vi  = Vec3<vint<N>>;
+  template<int N> using Vec3vl  = Vec3<vllong<N>>;
+  template<int N> using Vec3vb  = Vec3<vbool<N>>;
+  template<int N> using Vec3vbf = Vec3<vboolf<N>>;
+  template<int N> using Vec3vbd = Vec3<vboold<N>>;
+
+  typedef Vec3<vfloat4>  Vec3vf4;
+  typedef Vec3<vdouble4> Vec3vd4;
+  typedef Vec3<vreal4>   Vec3vr4;
+  typedef Vec3<vint4>    Vec3vi4;
+  typedef Vec3<vllong4>  Vec3vl4;
+  typedef Vec3<vbool4>   Vec3vb4;
+  typedef Vec3<vboolf4>  Vec3vbf4;
+  typedef Vec3<vboold4>  Vec3vbd4;
+
+  typedef Vec3<vfloat8>  Vec3vf8;
+  typedef Vec3<vdouble8> Vec3vd8;
+  typedef Vec3<vreal8>   Vec3vr8;
+  typedef Vec3<vint8>    Vec3vi8;
+  typedef Vec3<vllong8>  Vec3vl8;
+  typedef Vec3<vbool8>   Vec3vb8;
+  typedef Vec3<vboolf8>  Vec3vbf8;
+  typedef Vec3<vboold8>  Vec3vbd8;
+
+  typedef Vec3<vfloat16>  Vec3vf16;
+  typedef Vec3<vdouble16> Vec3vd16;
+  typedef Vec3<vreal16>   Vec3vr16;
+  typedef Vec3<vint16>    Vec3vi16;
+  typedef Vec3<vllong16>  Vec3vl16;
+  typedef Vec3<vbool16>   Vec3vb16;
+  typedef Vec3<vboolf16>  Vec3vbf16;
+  typedef Vec3<vboold16>  Vec3vbd16;
+
+  typedef Vec3<vfloatx>  Vec3vfx;
+  typedef Vec3<vdoublex> Vec3vdx;
+  typedef Vec3<vrealx>   Vec3vrx;
+  typedef Vec3<vintx>    Vec3vix;
+  typedef Vec3<vllongx>  Vec3vlx;
+  typedef Vec3<vboolx>   Vec3vbx;
+  typedef Vec3<vboolfx>  Vec3vbfx;
+  typedef Vec3<vbooldx>  Vec3vbdx;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Vec4 shortcuts
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int N> using Vec4vf  = Vec4<vfloat<N>>;
+  template<int N> using Vec4vd  = Vec4<vdouble<N>>;
+  template<int N> using Vec4vr  = Vec4<vreal<N>>;
+  template<int N> using Vec4vi  = Vec4<vint<N>>;
+  template<int N> using Vec4vl  = Vec4<vllong<N>>;
+  template<int N> using Vec4vb  = Vec4<vbool<N>>;
+  template<int N> using Vec4vbf = Vec4<vboolf<N>>;
+  template<int N> using Vec4vbd = Vec4<vboold<N>>;
+
+  typedef Vec4<vfloat4>  Vec4vf4;
+  typedef Vec4<vdouble4> Vec4vd4;
+  typedef Vec4<vreal4>   Vec4vr4;
+  typedef Vec4<vint4>    Vec4vi4;
+  typedef Vec4<vllong4>  Vec4vl4;
+  typedef Vec4<vbool4>   Vec4vb4;
+  typedef Vec4<vboolf4>  Vec4vbf4;
+  typedef Vec4<vboold4>  Vec4vbd4;
+
+  typedef Vec4<vfloat8>  Vec4vf8;
+  typedef Vec4<vdouble8> Vec4vd8;
+  typedef Vec4<vreal8>   Vec4vr8;
+  typedef Vec4<vint8>    Vec4vi8;
+  typedef Vec4<vllong8>  Vec4vl8;
+  typedef Vec4<vbool8>   Vec4vb8;
+  typedef Vec4<vboolf8>  Vec4vbf8;
+  typedef Vec4<vboold8>  Vec4vbd8;
+
+  typedef Vec4<vfloat16>  Vec4vf16;
+  typedef Vec4<vdouble16> Vec4vd16;
+  typedef Vec4<vreal16>   Vec4vr16;
+  typedef Vec4<vint16>    Vec4vi16;
+  typedef Vec4<vllong16>  Vec4vl16;
+  typedef Vec4<vbool16>   Vec4vb16;
+  typedef Vec4<vboolf16>  Vec4vbf16;
+  typedef Vec4<vboold16>  Vec4vbd16;
+
+  typedef Vec4<vfloatx>  Vec4vfx;
+  typedef Vec4<vdoublex> Vec4vdx;
+  typedef Vec4<vrealx>   Vec4vrx;
+  typedef Vec4<vintx>    Vec4vix;
+  typedef Vec4<vllongx>  Vec4vlx;
+  typedef Vec4<vboolx>   Vec4vbx;
+  typedef Vec4<vboolfx>  Vec4vbfx;
+  typedef Vec4<vbooldx>  Vec4vbdx;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Other shortcuts
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int N> using BBox3vf = BBox<Vec3vf<N>>;
+  typedef BBox<Vec3vf4>  BBox3vf4;
+  typedef BBox<Vec3vf8>  BBox3vf8;
+  typedef BBox<Vec3vf16> BBox3vf16;
+
+  /* scales time by numTimeSegments; returns the clamped segment index and writes the remainder to ftime */
+  __forceinline int getTimeSegment(float time, float numTimeSegments, float& ftime)
+  {
+    const float scaled = time * numTimeSegments;
+    const float segment = clamp(floorf(scaled), 0.0f, numTimeSegments-1.0f);
+    ftime = scaled - segment;
+    return int(segment);
+  }
+
+  __forceinline int getTimeSegment(float time, float start_time, float end_time, float numTimeSegments, float& ftime)
+  {
+    const float scaled = (time-start_time)/(end_time-start_time) * numTimeSegments; // time relative to [start_time,end_time]
+    const float segment = clamp(floorf(scaled), 0.0f, numTimeSegments-1.0f);
+    ftime = scaled - segment;
+    return int(segment);
+  }
+
+  template<int N>
+  __forceinline vint<N> getTimeSegment(const vfloat<N>& time, const vfloat<N>& numTimeSegments, vfloat<N>& ftime)
+  {
+    const vfloat<N> scaled = time * numTimeSegments; // N-wide variant of the scalar overload
+    const vfloat<N> segment = clamp(floor(scaled), vfloat<N>(zero), numTimeSegments-1.0f);
+    ftime = scaled - segment;
+    return vint<N>(segment);
+  }
+
+  template<int N>
+    __forceinline vint<N> getTimeSegment(const vfloat<N>& time, const vfloat<N>& start_time, const vfloat<N>& end_time, const vfloat<N>& numTimeSegments, vfloat<N>& ftime)
+  {
+    const vfloat<N> scaled = (time-start_time)/(end_time-start_time) * numTimeSegments; // time relative to [start_time,end_time]
+    const vfloat<N> segment = clamp(floor(scaled), vfloat<N>(zero), numTimeSegments-1.0f);
+    ftime = scaled - segment;
+    return vint<N>(segment);
+  }
+
+  /* returns the range of time segment indices overlapped by time_range, clamped to [0,numTimeSegments] */
+  __forceinline range<int> getTimeSegmentRange(const BBox1f& time_range, float numTimeSegments)
+  {
+    const float grow   = 1.0f+2.0f*float(ulp); // corrects inaccuracies to precisely match time step
+    const float shrink = 1.0f-2.0f*float(ulp);
+    const int first = (int)max(floor(grow  *time_range.lower*numTimeSegments), 0.0f);
+    const int last  = (int)min(ceil (shrink*time_range.upper*numTimeSegments), numTimeSegments);
+    return make_range(first, last);
+  }
+
+  /* expresses range relative to time_range, then defers to the overload above */
+  __forceinline range<int> getTimeSegmentRange(const BBox1f& range, BBox1f time_range, float numTimeSegments)
+  {
+    const float rel_lower = (range.lower-time_range.lower)/time_range.size();
+    const float rel_upper = (range.upper-time_range.lower)/time_range.size();
+    return getTimeSegmentRange(BBox1f(rel_lower,rel_upper),numTimeSegments);
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/device.cpp b/thirdparty/embree-aarch64/kernels/common/device.cpp
new file mode 100644
index 0000000000..16ec11b892
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/device.cpp
@@ -0,0 +1,567 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "device.h"
+#include "../hash.h"
+#include "scene_triangle_mesh.h"
+#include "scene_user_geometry.h"
+#include "scene_instance.h"
+#include "scene_curves.h"
+#include "scene_subdiv_mesh.h"
+
+#include "../subdiv/tessellation_cache.h"
+
+#include "acceln.h"
+#include "geometry.h"
+
+#include "../geometry/cylinder.h"
+
+#include "../bvh/bvh4_factory.h"
+#include "../bvh/bvh8_factory.h"
+
+#include "../../common/tasking/taskscheduler.h"
+#include "../../common/sys/alloc.h"
+
+namespace embree
+{
+  /*! some global variables that can be set via rtcSetParameter1i for debugging purposes */
+  ssize_t Device::debug_int0 = 0;
+  ssize_t Device::debug_int1 = 0;
+  ssize_t Device::debug_int2 = 0;
+  ssize_t Device::debug_int3 = 0;
+
+  DECLARE_SYMBOL2(RayStreamFilterFuncs,rayStreamFilterFuncs);
+
+  static MutexSys g_mutex;
+  static std::map<Device*,size_t> g_cache_size_map;
+  static std::map<Device*,size_t> g_num_threads_map;
+
+  Device::Device (const char* cfg) // cfg is handed to State::parseString() below
+  {
+    /* check that CPU supports lowest ISA; throws RTC_ERROR_UNSUPPORTED_CPU otherwise */
+    if (!hasISA(ISA)) {
+      throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support " ISA_STR);
+    }
+
+    /* set default frequency level for detected CPU */
+    switch (getCPUModel()) {
+    case CPU::UNKNOWN:         frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::XEON_ICE_LAKE:   frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_ICE_LAKE:   frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_TIGER_LAKE: frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE_COMET_LAKE: frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE_CANNON_LAKE:frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE_KABY_LAKE:  frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::XEON_SKY_LAKE:   frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE_SKY_LAKE:   frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::XEON_BROADWELL:  frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_BROADWELL:  frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::XEON_HASWELL:    frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_HASWELL:    frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::XEON_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::SANDY_BRIDGE:    frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::NEHALEM:         frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE2:           frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE1:           frequency_level = FREQUENCY_SIMD128; break;
+    }
+
+    /* initialize global state: compile-time EMBREE_CONFIG (if defined), then cfg, then the optional .embree<major> files */
+#if defined(EMBREE_CONFIG)
+    State::parseString(EMBREE_CONFIG);
+#endif
+    State::parseString(cfg);
+    if (!ignore_config_files && FileName::executableFolder() != FileName(""))
+      State::parseFile(FileName::executableFolder()+FileName(".embree" TOSTRING(RTC_VERSION_MAJOR)));
+    if (!ignore_config_files && FileName::homeFolder() != FileName(""))
+      State::parseFile(FileName::homeFolder()+FileName(".embree" TOSTRING(RTC_VERSION_MAJOR)));
+    State::verify();
+
+    /* check whether selected ISA is supported by the HW, as the user could have forced an unsupported ISA */    
+    if (!checkISASupport()) {
+      throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support selected ISA");
+    }    
+    
+    /*! do some internal tests (only evaluated when assert is active) */
+    assert(isa::Cylinder::verify());
+
+    /*! enable huge page support if desired */
+#if defined(__WIN32__)
+    if (State::enable_selockmemoryprivilege)
+      State::hugepages_success &= win_enable_selockmemoryprivilege(State::verbosity(3));
+#endif
+    State::hugepages_success &= os_init(State::hugepages,State::verbosity(3));
+    
+    /*! set tessellation cache size */
+    setCacheSize( State::tessellation_cache_size );
+
+    /*! enable some floating point exceptions to catch bugs */
+    if (State::float_exceptions)
+    {
+      int exceptions = _MM_MASK_MASK; // start from the full mask, then clear the bits of the exceptions to enable
+      //exceptions &= ~_MM_MASK_INVALID;
+      exceptions &= ~_MM_MASK_DENORM;
+      exceptions &= ~_MM_MASK_DIV_ZERO;
+      //exceptions &= ~_MM_MASK_OVERFLOW;
+      //exceptions &= ~_MM_MASK_UNDERFLOW;
+      //exceptions &= ~_MM_MASK_INEXACT;
+      _MM_SET_EXCEPTION_MASK(exceptions);
+    }
+    
+    /* print info header (device info at verbosity 1, state at verbosity 2) */
+    if (State::verbosity(1))
+      print();
+    if (State::verbosity(2)) 
+      State::print();
+
+    /* register all algorithms */
+    bvh4_factory = make_unique(new BVH4Factory(enabled_builder_cpu_features, enabled_cpu_features));
+
+#if defined(EMBREE_TARGET_SIMD8)
+    bvh8_factory = make_unique(new BVH8Factory(enabled_builder_cpu_features, enabled_cpu_features));
+#endif
+
+    /* setup tasking system */
+    initTaskingSystem(numThreads);
+
+    /* ray stream SOA to AOS conversion */
+#if defined(EMBREE_RAY_PACKETS)
+    RayStreamFilterFuncsType rayStreamFilterFuncs;
+    SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(enabled_cpu_features,rayStreamFilterFuncs);
+    rayStreamFilters = rayStreamFilterFuncs();
+#endif
+  }
+
+  Device::~Device ()
+  {
+    setCacheSize(0);     // the constructor called setCacheSize() with the configured size; here it is called with 0
+    exitTaskingSystem(); // counterpart of the initTaskingSystem() call in the constructor
+  }
+
+  std::string getEnabledTargets() // space separated names of the targets enabled at compile time
+  {
+    std::string targets;
+#if defined(EMBREE_TARGET_SSE2)
+    targets.append("SSE2 ");
+#endif
+#if defined(EMBREE_TARGET_SSE42)
+    targets.append("SSE4.2 ");
+#endif
+#if defined(EMBREE_TARGET_AVX)
+    targets.append("AVX ");
+#endif
+#if defined(EMBREE_TARGET_AVX2)
+    targets.append("AVX2 ");
+#endif
+#if defined(EMBREE_TARGET_AVX512KNL)
+    targets.append("AVX512KNL ");
+#endif
+#if defined(EMBREE_TARGET_AVX512SKX)
+    targets.append("AVX512SKX ");
+#endif
+    return targets;
+  }
+
+  std::string getEmbreeFeatures() // space separated names of the features enabled at compile time
+  {
+    std::string features;
+#if defined(EMBREE_RAY_MASK)
+    features.append("raymasks ");
+#endif
+#if defined (EMBREE_BACKFACE_CULLING)
+    features.append("backfaceculling ");
+#endif
+#if defined (EMBREE_BACKFACE_CULLING_CURVES)
+    features.append("backfacecullingcurves ");
+#endif
+#if defined(EMBREE_FILTER_FUNCTION)
+    features.append("intersection_filter ");
+#endif
+#if defined (EMBREE_COMPACT_POLYS)
+    features.append("compact_polys ");
+#endif
+    return features;
+  }
+
+  void Device::print() // writes a build, CPU and configuration summary to std::cout
+  {
+    const int cpu_features = getCPUFeatures();
+    std::cout << std::endl;
+    std::cout << "Embree Ray Tracing Kernels " << RTC_VERSION_STRING << " (" << RTC_HASH << ")" << std::endl;
+    std::cout << "  Compiler  : " << getCompilerName() << std::endl;
+    std::cout << "  Build     : ";
+#if defined(DEBUG)
+    std::cout << "Debug " << std::endl;
+#else
+    std::cout << "Release " << std::endl;
+#endif
+    std::cout << "  Platform  : " << getPlatformName() << std::endl;
+    std::cout << "  CPU       : " << stringOfCPUModel(getCPUModel()) << " (" << getCPUVendor() << ")" << std::endl;
+    std::cout << "   Threads  : " << getNumberOfLogicalThreads() << std::endl;
+    std::cout << "   ISA      : " << stringOfCPUFeatures(cpu_features) << std::endl;
+    std::cout << "   Targets  : " << supportedTargetList(cpu_features) << std::endl;
+    const bool hasFTZ = _mm_getcsr() & _MM_FLUSH_ZERO_ON; // flush-to-zero bit of the control/status register
+    const bool hasDAZ = _mm_getcsr() & _MM_DENORMALS_ZERO_ON; // denormals-are-zero bit of the control/status register
+    std::cout << "   MXCSR    : " << "FTZ=" << hasFTZ << ", DAZ=" << hasDAZ << std::endl;
+    std::cout << "  Config" << std::endl;
+    std::cout << "    Threads : " << (numThreads ? toString(numThreads) : std::string("default")) << std::endl;
+    std::cout << "    ISA     : " << stringOfCPUFeatures(enabled_cpu_features) << std::endl;
+    std::cout << "    Targets : " << supportedTargetList(enabled_cpu_features) << " (supported)" << std::endl;
+    std::cout << "              " << getEnabledTargets() << " (compile time enabled)" << std::endl;
+    std::cout << "    Features: " << getEmbreeFeatures() << std::endl;
+    std::cout << "    Tasking : ";
+#if defined(TASKING_TBB)
+    std::cout << "TBB" << TBB_VERSION_MAJOR << "." << TBB_VERSION_MINOR << " ";
+  #if TBB_INTERFACE_VERSION >= 12002
+    std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << TBB_runtime_interface_version() << " ";
+  #else
+    std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << tbb::TBB_runtime_interface_version() << " ";
+  #endif
+#endif
+#if defined(TASKING_INTERNAL)
+    std::cout << "internal_tasking_system ";
+#endif
+#if defined(TASKING_GCD) && defined(BUILD_IOS)
+    std::cout << "GCD tasking system ";
+#endif
+#if defined(TASKING_PPL)
+	std::cout << "PPL ";
+#endif
+    std::cout << std::endl;
+
+    /* check if the FTZ and DAZ flags are set in the CSR */
+    if (!hasFTZ || !hasDAZ) 
+    {
+#if !defined(_DEBUG)
+      if (State::verbosity(1)) // without _DEBUG the warning is only printed at verbosity 1; with _DEBUG it is always printed
+#endif
+      {
+        std::cout << std::endl;
+        std::cout << "================================================================================" << std::endl;
+        std::cout << "  WARNING: \"Flush to Zero\" or \"Denormals are Zero\" mode not enabled "         << std::endl 
+                  << "           in the MXCSR control and status register. This can have a severe "     << std::endl
+                  << "           performance impact. Please enable these modes for each application "   << std::endl
+                  << "           thread the following way:" << std::endl
+                  << std::endl 
+                  << "           #include \"xmmintrin.h\"" << std::endl 
+                  << "           #include \"pmmintrin.h\"" << std::endl 
+                  << std::endl 
+                  << "           _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);" << std::endl 
+                  << "           _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);" << std::endl;
+        std::cout << "================================================================================" << std::endl;
+        std::cout << std::endl;
+      }
+    }
+    std::cout << std::endl;
+  }
+
+  void Device::setDeviceErrorCode(RTCError error)
+  {
+    /* an error that is already stored on the device is never overwritten */
+    RTCError& slot = *errorHandler.error();
+    if (slot == RTC_ERROR_NONE) slot = error;
+  }
+
+  RTCError Device::getDeviceErrorCode()
+  {
+    RTCError& slot = *errorHandler.error(); // error slot of this device
+    const RTCError pending = slot;
+    slot = RTC_ERROR_NONE; // reading the code also clears it
+    return pending;
+  }
+
+  void Device::setThreadErrorCode(RTCError error)
+  {
+    /* same first-error-wins rule as for devices, but on the global g_errorHandler */
+    RTCError& slot = *g_errorHandler.error();
+    if (slot == RTC_ERROR_NONE) slot = error;
+  }
+
+  RTCError Device::getThreadErrorCode()
+  {
+    RTCError& slot = *g_errorHandler.error(); // slot provided by the global error handler
+    const RTCError pending = slot;
+    slot = RTC_ERROR_NONE; // reading the code also clears it
+    return pending;
+  }
+
+  void Device::process_error(Device* device, RTCError error, const char* str)
+  {
+    /* without a device (construction failed) only the thread error code can be stored */
+    if (!device) {
+      setThreadErrorCode(error);
+      return;
+    }
+    /* report on std::cerr when device->verbosity(1) holds */
+    if (device->verbosity(1))
+    {
+      const char* text = "Embree: Invalid error code";
+      switch (error) {
+      case RTC_ERROR_NONE             : text = "Embree: No error"; break;
+      case RTC_ERROR_UNKNOWN          : text = "Embree: Unknown error"; break;
+      case RTC_ERROR_INVALID_ARGUMENT : text = "Embree: Invalid argument"; break;
+      case RTC_ERROR_INVALID_OPERATION: text = "Embree: Invalid operation"; break;
+      case RTC_ERROR_OUT_OF_MEMORY    : text = "Embree: Out of memory"; break;
+      case RTC_ERROR_UNSUPPORTED_CPU  : text = "Embree: Unsupported CPU"; break;
+      default                         : break;
+      }
+      std::cerr << text;
+      if (str) std::cerr << ", (" << str << ")";
+      std::cerr << std::endl;
+    }
+    /* user specified error callback first, then record the code on the device */
+    if (device->error_function)
+      device->error_function(device->error_function_userptr,error,str);
+    device->setDeviceErrorCode(error);
+  }
+
+  void Device::memoryMonitor(ssize_t bytes, bool post)
+  {
+    /* nothing to report without a callback or for a zero-byte change */
+    if (!State::memory_monitor_function || bytes == 0) return;
+    const bool accepted = State::memory_monitor_function(State::memory_monitor_userptr,bytes,post);
+    /* a rejection only aborts allocations, so that we never throw inside a destructor */
+    if (!accepted && bytes > 0) {
+      throw_RTCError(RTC_ERROR_OUT_OF_MEMORY,"memory monitor forced termination");
+    }
+  }
+
+  size_t getMaxNumThreads()
+  {
+    /* largest thread count stored for any device in g_num_threads_map */
+    size_t largest = 0;
+    for (auto& entry : g_num_threads_map)
+      largest = max(largest, entry.second);
+    /* an empty map (or only zero entries) yields the maximum size_t value */
+    return (largest == 0) ? std::numeric_limits<size_t>::max() : largest;
+  }
+
+  size_t getMaxCacheSize()
+  {
+    size_t largest = 0; // stays 0 when g_cache_size_map is empty
+    for (auto& entry : g_cache_size_map)
+      largest = max(largest, entry.second);
+    return largest;
+  }
+ 
+  void Device::setCacheSize(size_t bytes) 
+  {
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+    Lock<MutexSys> guard(g_mutex);
+    /* zero bytes removes this device's entry, any other value (re)registers it */
+    if (bytes != 0) g_cache_size_map[this] = bytes;
+    else            g_cache_size_map.erase(this);
+    size_t largest = getMaxCacheSize(); // the tessellation cache follows the largest registered size
+    resizeTessellationCache(largest);
+#endif
+  }
+
+  void Device::initTaskingSystem(size_t numThreads) 
+  {
+    Lock<MutexSys> guard(g_mutex);
+
+    /* register this device's thread request; 0 is stored as the maximum size_t value */
+    const bool unlimited = (numThreads == 0);
+    g_num_threads_map[this] = unlimited ? std::numeric_limits<size_t>::max() : numThreads;
+
+    /* create task scheduler for the largest request of all registered devices */
+    size_t threadLimit = getMaxNumThreads();
+    TaskScheduler::create(threadLimit,State::set_affinity,State::start_threads);
+#if USE_TASK_ARENA
+    const size_t arenaThreads = min(threadLimit,TaskScheduler::threadCount());
+    const size_t reservedThreads = min(max(numUserThreads,(size_t)1),arenaThreads);
+    arena = make_unique(new tbb::task_arena((int)arenaThreads,(unsigned int)reservedThreads));
+#endif
+  }
+
+  void Device::exitTaskingSystem() 
+  {
+    Lock<MutexSys> guard(g_mutex);
+    g_num_threads_map.erase(this);
+
+    if (!g_num_threads_map.empty()) {
+      /* other devices remain: configure new number of threads */
+      size_t threadLimit = getMaxNumThreads();
+      TaskScheduler::create(threadLimit,State::set_affinity,State::start_threads);
+    }
+    else {
+      /* this was the last registered device: terminate tasking system */
+      TaskScheduler::destroy();
+    }
+#if USE_TASK_ARENA
+    arena.reset();
+#endif
+  }
+
+  void Device::setProperty(const RTCDeviceProperty prop, ssize_t val)
+  {
+    /* hidden internal properties: only the debug slots 1000000..1000003 are writable */
+    const size_t id = (size_t)prop;
+    if (id == 1000000) { debug_int0 = val; return; }
+    if (id == 1000001) { debug_int1 = val; return; }
+    if (id == 1000002) { debug_int2 = val; return; }
+    if (id == 1000003) { debug_int3 = val; return; }
+
+    /* every other property is rejected */
+
+    throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown writable property");
+  }
+
+  ssize_t Device::getProperty(const RTCDeviceProperty prop)
+  { /* Returns the value of a readable device property; unknown properties are reported through throw_RTCError(RTC_ERROR_INVALID_ARGUMENT). */
+    size_t iprop = (size_t)prop;
+
+    /* get name of internal regression test: the pointer to the name's C string is returned cast to ssize_t, 0 if there is no such test */
+    if (iprop >= 2000000 && iprop < 3000000)
+    {
+      RegressionTest* test = getRegressionTest(iprop-2000000);
+      if (test) return (ssize_t) test->name.c_str();
+      else      return 0;
+    }
+
+    /* run internal regression test and return the result of run(), 0 if there is no such test */
+    if (iprop >= 3000000 && iprop < 4000000)
+    {
+      RegressionTest* test = getRegressionTest(iprop-3000000);
+      if (test) return test->run();
+      else      return 0;
+    }
+
+    /* documented properties; most answers are selected at compile time by the build configuration macros */
+    switch (prop) 
+    {
+    case RTC_DEVICE_PROPERTY_VERSION_MAJOR: return RTC_VERSION_MAJOR;
+    case RTC_DEVICE_PROPERTY_VERSION_MINOR: return RTC_VERSION_MINOR;
+    case RTC_DEVICE_PROPERTY_VERSION_PATCH: return RTC_VERSION_PATCH;
+    case RTC_DEVICE_PROPERTY_VERSION      : return RTC_VERSION;
+
+#if defined(EMBREE_TARGET_SIMD4) && defined(EMBREE_RAY_PACKETS)
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED:  return hasISA(SSE2);
+#else
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED:  return 0;
+#endif
+
+#if defined(EMBREE_TARGET_SIMD8) && defined(EMBREE_RAY_PACKETS)
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED:  return hasISA(AVX);
+#else
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED:  return 0;
+#endif
+
+#if defined(EMBREE_TARGET_SIMD16) && defined(EMBREE_RAY_PACKETS)
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return hasISA(AVX512KNL) | hasISA(AVX512SKX);
+#else
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_RAY_PACKETS)
+    case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED:  return 1;
+#else
+    case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED:  return 0;
+#endif
+    
+#if defined(EMBREE_RAY_MASK)
+    case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_BACKFACE_CULLING)
+    case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 0;
+#endif
+
+#if defined(EMBREE_BACKFACE_CULLING_CURVES)
+    case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 0;
+#endif
+
+#if defined(EMBREE_COMPACT_POLYS)
+    case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 0;
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION)
+    case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+    case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 0;
+#endif
+
+#if defined(TASKING_INTERNAL)
+    case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 0; // tasking system codes: 0 internal, 1 TBB, 2 PPL, 3 GCD (BUILD_IOS only)
+#endif
+
+#if defined(TASKING_TBB)
+    case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 1;
+#endif
+
+#if defined(TASKING_PPL)
+    case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 2;
+#endif
+            
+#if defined(TASKING_GCD) && defined(BUILD_IOS)
+    case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 3;
+#endif
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 0;
+#endif
+        
+#if defined(EMBREE_GEOMETRY_QUAD)
+    case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_CURVE)
+    case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+    case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+    case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_POINT)
+    case RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(TASKING_PPL)
+    case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0;
+#elif defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8)
+    case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0;
+#else
+    case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 1;
+#endif
+
+#if defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION
+    case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 0;
+#endif
+
+    default: throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown readable property"); break;
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/device.h b/thirdparty/embree-aarch64/kernels/common/device.h
new file mode 100644
index 0000000000..e9a81bb109
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/device.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "state.h"
+#include "accel.h"
+
+namespace embree
+{
+  class BVH4Factory;
+  class BVH8Factory;
+
+  class Device : public State, public MemoryMonitorInterface
+  {
+    ALIGNED_CLASS_(16);
+
+  public:
+
+    /*! Device construction (cfg is presumably a configuration string; its format is not visible in this header) */
+    Device (const char* cfg);
+
+    /*! Device destruction */
+    virtual ~Device ();
+
+    /*! prints info about the build, the host CPU and the device configuration to std::cout */
+    void print();
+
+    /*! stores the error code on this device unless an error is already stored */
+    void setDeviceErrorCode(RTCError error);
+
+    /*! returns the device error code and resets it to RTC_ERROR_NONE */
+    RTCError getDeviceErrorCode();
+
+    /*! stores the error code in the slot that is not tied to a device, unless an error is already stored there */
+    static void setThreadErrorCode(RTCError error);
+
+    /*! returns the error code that is not tied to a device and resets it to RTC_ERROR_NONE */
+    static RTCError getThreadErrorCode();
+
+    /*! processes error codes (optional report on std::cerr, user callback, storing the code), do not call directly */
+    static void process_error(Device* device, RTCError error, const char* str);
+
+    /*! invokes the memory monitor callback; a rejected change with bytes > 0 raises RTC_ERROR_OUT_OF_MEMORY */
+    void memoryMonitor(ssize_t bytes, bool post);
+
+    /*! sets the size of the software cache; 0 removes the size registered for this device */
+    void setCacheSize(size_t bytes);
+
+    /*! sets a property; only the hidden debug properties 1000000..1000003 are writable */
+    void setProperty(const RTCDeviceProperty prop, ssize_t val);
+
+    /*! gets a property; unknown properties raise RTC_ERROR_INVALID_ARGUMENT */
+    ssize_t getProperty(const RTCDeviceProperty prop);
+
+  private:
+
+    /*! initializes the tasking system; numThreads == 0 requests no explicit thread limit */
+    void initTaskingSystem(size_t numThreads);
+
+    /*! shuts down the tasking system, or reconfigures it when other devices are still registered */
+    void exitTaskingSystem();
+
+    /*! debug variables; Device::setProperty writes them for the hidden properties 1000000..1000003 */
+  public:
+    static ssize_t debug_int0;
+    static ssize_t debug_int1;
+    static ssize_t debug_int2;
+    static ssize_t debug_int3;
+
+  public:
+    std::unique_ptr<BVH4Factory> bvh4_factory;
+#if defined(EMBREE_TARGET_SIMD8)
+    std::unique_ptr<BVH8Factory> bvh8_factory;
+#endif
+    
+#if USE_TASK_ARENA
+    std::unique_ptr<tbb::task_arena> arena;
+#endif
+    
+    /* ray streams filter */
+    RayStreamFilterFuncs rayStreamFilters;
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/geometry.cpp b/thirdparty/embree-aarch64/kernels/common/geometry.cpp
new file mode 100644
index 0000000000..b3aa8e3396
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/geometry.cpp
@@ -0,0 +1,259 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "geometry.h"
+#include "scene.h"
+
+namespace embree
+{
+  const char* Geometry::gtype_names[Geometry::GTY_END] = // printable names indexed by Geometry::GType; "" marks a slot without a name
+  {
+    "flat_linear_curve",
+    "round_linear_curve",
+    "oriented_linear_curve",
+    "", // index 3: GTY_CONE_LINEAR_CURVE is declared in geometry.h but has no name here
+    "flat_bezier_curve",
+    "round_bezier_curve",
+    "oriented_bezier_curve",
+    "", // index 7: no GType enumerator
+    "flat_bspline_curve",
+    "round_bspline_curve",
+    "oriented_bspline_curve",
+    "", // index 11: no GType enumerator
+    "flat_hermite_curve",
+    "round_hermite_curve",
+    "oriented_hermite_curve",
+    "", // index 15: no GType enumerator
+    "flat_catmull_rom_curve",
+    "round_catmull_rom_curve",
+    "oriented_catmull_rom_curve",
+    "", // index 19: no GType enumerator
+    "triangles",
+    "quads",
+    "grid",
+    "subdivs",
+    "", // index 24: no GType enumerator
+    "sphere",
+    "disc",
+    "oriented_disc",
+    "", // index 28: no GType enumerator
+    "usergeom",
+    "instance_cheap",
+    "instance_expensive",
+  };
+     
+  Geometry::Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps) 
+    : device(device), userPtr(nullptr),
+      numPrimitives(numPrimitives), numTimeSteps(unsigned(numTimeSteps)), fnumTimeSegments(float(numTimeSteps-1)), time_range(0.0f,1.0f), // NOTE(review): numTimeSteps-1 wraps around for numTimeSteps == 0 - confirm callers never pass 0
+      mask(-1),
+      gtype(gtype),
+      gsubtype(GTY_SUBTYPE_DEFAULT),
+      quality(RTC_BUILD_QUALITY_MEDIUM),
+      state((unsigned)State::MODIFIED), // a new geometry starts in the MODIFIED state
+      enabled(true),
+      intersectionFilterN(nullptr), occlusionFilterN(nullptr), pointQueryFunc(nullptr)
+  { /* Creates an enabled geometry of the given type with no user data and no callbacks set. */
+    device->refInc(); // paired with the refDec() in ~Geometry()
+  }
+
+  Geometry::~Geometry()
+  {
+    device->refDec(); // drops the device reference taken by refInc() in the constructor
+  }
+
+  void Geometry::setNumPrimitives(unsigned int numPrimitives_in)
+  {
+    /* an unchanged count leaves the geometry untouched (update() is not called) */
+    if (numPrimitives_in != numPrimitives) {
+      numPrimitives = numPrimitives_in;
+      Geometry::update();
+    }
+  }
+
+  void Geometry::setNumTimeSteps (unsigned int numTimeSteps_in)
+  {
+    /* an unchanged count leaves the geometry untouched (update() is not called) */
+    const bool changed = (numTimeSteps_in != numTimeSteps);
+    if (changed)
+    {
+      numTimeSteps = numTimeSteps_in;
+      fnumTimeSegments = float(numTimeSteps_in-1); // one segment less than there are time steps
+      Geometry::update();
+    }
+  }
+
+  void Geometry::setTimeRange (const BBox1f range)
+  {
+    time_range = range; // stored unconditionally, there is no "unchanged" shortcut here
+    Geometry::update(); // marks the geometry as modified
+  }
+  
+  void Geometry::update()
+  {
+    ++modCounter_; // FIXME: required?
+    state = (unsigned)State::MODIFIED; // preCommit() rejects geometries that are still in this state
+  }
+  
+  void Geometry::commit() 
+  {
+    ++modCounter_; // a commit also counts as a modification
+    state = (unsigned)State::COMMITTED; // leaves the MODIFIED state that preCommit() rejects
+  }
+
+  void Geometry::preCommit()
+  {
+    const bool uncommitted = ((State)state == State::MODIFIED); // modified since the last commit()
+    if (uncommitted) { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"geometry not committed"); }
+  }
+
+  void Geometry::postCommit()
+  { // no work is done in this implementation
+  }
+
+  void Geometry::enable () 
+  {
+    /* enabling an already enabled geometry does not bump the modification counter */
+    if (!isEnabled()) {
+      enabled = true;
+      ++modCounter_;
+    }
+  }
+
+  void Geometry::disable () 
+  {
+    /* disabling an already disabled geometry does not bump the modification counter */
+    if (!isDisabled()) {
+      enabled = false;
+      ++modCounter_;
+    }
+  }
+
+  void Geometry::setUserData (void* ptr)
+  {
+    userPtr = ptr; // stored as-is; neither update() nor the modification counter is touched
+  }
+  
+  void Geometry::setIntersectionFilterFunctionN (RTCFilterFunctionN filter) 
+  {
+    /* only these geometry kinds accept filter callbacks */
+    const auto filterable = MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH;
+    if ((getTypeMask() & filterable) == 0) { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry"); }
+    intersectionFilterN = filter;
+  }
+
+  void Geometry::setOcclusionFilterFunctionN (RTCFilterFunctionN filter) 
+  {
+    /* only these geometry kinds accept filter callbacks */
+    const auto filterable = MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH;
+    if ((getTypeMask() & filterable) == 0) { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry"); }
+    occlusionFilterN = filter;
+  }
+  
+  void Geometry::setPointQueryFunction (RTCPointQueryFunction func) 
+  {
+    pointQueryFunc = func; // invoked by Geometry::pointQuery() when non-null
+  }
+
+  void Geometry::interpolateN(const RTCInterpolateNArguments* const args)
+  {
+    /* unpack the batched request */
+    const int* const active = (const int*) args->valid;
+    const unsigned* const prims = args->primIDs;
+    const float* const us = args->u;
+    const float* const vs = args->v;
+    const unsigned int count = args->N;
+    const RTCBufferType srcType = args->bufferType;
+    const unsigned int srcSlot = args->bufferSlot;
+    float* const outP = args->P;
+    float* const outdPdu = args->dPdu;
+    float* const outdPdv = args->dPdv;
+    float* const outddPdudu = args->ddPdudu;
+    float* const outddPdvdv = args->ddPdvdv;
+    float* const outddPdudv = args->ddPdudv;
+    const unsigned int numValues = args->valueCount;
+
+    /* the scratch arrays below hold 256 floats each */
+    if (numValues > 256) { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximally 256 floating point values can be interpolated per vertex"); }
+    __aligned(64) float scratchP[256];
+    __aligned(64) float scratchdPdu[256];
+    __aligned(64) float scratchdPdv[256];
+    __aligned(64) float scratchddPdudu[256];
+    __aligned(64) float scratchddPdvdv[256];
+    __aligned(64) float scratchddPdudv[256];
+
+    /* P gates its own scratch array, dPdu gates both first derivatives, ddPdudu gates all second derivatives */
+    float* const tmpP = outP ? scratchP : nullptr;
+    float* const tmpdPdu = outdPdu ? scratchdPdu : nullptr;
+    float* const tmpdPdv = outdPdu ? scratchdPdv : nullptr;
+    float* const tmpddPdudu = outddPdudu ? scratchddPdudu : nullptr;
+    float* const tmpddPdvdv = outddPdudu ? scratchddPdvdv : nullptr;
+    float* const tmpddPdudv = outddPdudu ? scratchddPdudv : nullptr;
+
+    for (unsigned int lane=0; lane<count; lane++)
+    {
+      if (active && !active[lane]) continue; // lane is masked out by the valid array
+      /* evaluate this lane through the single-query interpolate() */
+      RTCInterpolateArguments single;
+      single.primID = prims[lane];
+      single.u = us[lane];
+      single.v = vs[lane];
+      single.bufferType = srcType;
+      single.bufferSlot = srcSlot;
+      single.P = tmpP;
+      single.dPdu = tmpdPdu;
+      single.dPdv = tmpdPdv;
+      single.ddPdudu = tmpddPdudu;
+      single.ddPdvdv = tmpddPdvdv;
+      single.ddPdudv = tmpddPdudv;
+      single.valueCount = numValues;
+      interpolate(&single);
+
+      /* scatter the results: value j of this lane is written to index j*N+lane */
+      if (likely(outP)) {
+        for (unsigned int j=0; j<numValues; j++) outP[j*count+lane] = tmpP[j];
+      }
+      if (likely(outdPdu)) {
+        for (unsigned int j=0; j<numValues; j++) {
+          outdPdu[j*count+lane] = tmpdPdu[j];
+          outdPdv[j*count+lane] = tmpdPdv[j];
+        }
+      }
+      if (likely(outddPdudu)) {
+        for (unsigned int j=0; j<numValues; j++) {
+          outddPdudu[j*count+lane] = tmpddPdudu[j];
+          outddPdvdv[j*count+lane] = tmpddPdvdv[j];
+          outddPdudv[j*count+lane] = tmpddPdudv[j];
+        }
+      }
+    }
+  }
+    
+  bool Geometry::pointQuery(PointQuery* query, PointQueryContext* context)
+  {
+    assert(context->primID < size());
+
+    /* argument block handed to the callbacks */
+    RTCPointQueryFunctionArguments cbArgs;
+    cbArgs.query           = (RTCPointQuery*)context->query_ws;
+    cbArgs.userPtr         = context->userPtr;
+    cbArgs.primID          = context->primID;
+    cbArgs.geomID          = context->geomID;
+    cbArgs.context         = context->userContext;
+    cbArgs.similarityScale = context->similarityScale;
+
+    /* both callbacks run when set: first the one from the context, then the one stored on the geometry */
+    bool changed = false;
+    if (context->func  && context->func(&cbArgs))  changed = true;
+    if (pointQueryFunc && pointQueryFunc(&cbArgs)) changed = true;
+
+    /* the query is only refreshed when a callback reported a change and the instance stack is non-empty */
+    if (!changed || !(context->userContext->instStackSize > 0)) return changed;
+    if (context->query_type == POINT_QUERY_TYPE_AABB) {
+      context->updateAABB();
+    } else {
+      assert(context->similarityScale > 0.f);
+      query->radius = context->query_ws->radius * context->similarityScale;
+    }
+    return changed;
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/geometry.h b/thirdparty/embree-aarch64/kernels/common/geometry.h
new file mode 100644
index 0000000000..953974bfd2
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/geometry.h
@@ -0,0 +1,582 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "device.h"
+#include "buffer.h"
+#include "../common/point_query.h"
+#include "../builders/priminfo.h"
+
+namespace embree
+{
+  class Scene;
+  class Geometry;
+
+  struct GeometryCounts // per-type counters of enabled primitives, static and motion blurred (MB)
+  {
+    __forceinline GeometryCounts()
+      : numFilterFunctions(0),
+        numTriangles(0), numMBTriangles(0), 
+        numQuads(0), numMBQuads(0), 
+        numBezierCurves(0), numMBBezierCurves(0), 
+        numLineSegments(0), numMBLineSegments(0), 
+        numSubdivPatches(0), numMBSubdivPatches(0), 
+        numUserGeometries(0), numMBUserGeometries(0), 
+        numInstancesCheap(0), numMBInstancesCheap(0), 
+        numInstancesExpensive(0), numMBInstancesExpensive(0), 
+        numGrids(0), numMBGrids(0), 
+        numPoints(0), numMBPoints(0) {}
+    /*! sum of all per-type counters; numFilterFunctions is not part of the sum */
+    __forceinline size_t size() const {
+      return    numTriangles + numQuads + numBezierCurves + numLineSegments + numSubdivPatches + numUserGeometries + numInstancesCheap + numInstancesExpensive + numGrids + numPoints
+              + numMBTriangles + numMBQuads + numMBBezierCurves + numMBLineSegments + numMBSubdivPatches + numMBUserGeometries + numMBInstancesCheap + numMBInstancesExpensive + numMBGrids + numMBPoints;
+    }
+    /*! bit mask of the types with a non-zero count: static types in bits 9..17, motion blurred types in bits 0..8 */
+    __forceinline unsigned int enabledGeometryTypesMask() const
+    {
+      unsigned int mask = 0;
+      if (numTriangles) mask |= 1 << 0;
+      if (numQuads) mask |= 1 << 1;
+      if (numBezierCurves+numLineSegments) mask |= 1 << 2;
+      if (numSubdivPatches) mask |= 1 << 3;
+      if (numUserGeometries) mask |= 1 << 4;
+      if (numInstancesCheap) mask |= 1 << 5;
+      if (numInstancesExpensive) mask |= 1 << 6;
+      if (numGrids) mask |= 1 << 7;
+      if (numPoints) mask |= 1 << 8;
+
+      unsigned int maskMB = 0;
+      if (numMBTriangles) maskMB |= 1 << 0;
+      if (numMBQuads) maskMB |= 1 << 1;
+      if (numMBBezierCurves+numMBLineSegments) maskMB |= 1 << 2;
+      if (numMBSubdivPatches) maskMB |= 1 << 3;
+      if (numMBUserGeometries) maskMB |= 1 << 4;
+      if (numMBInstancesCheap) maskMB |= 1 << 5;
+      if (numMBInstancesExpensive) maskMB |= 1 << 6;
+      if (numMBGrids) maskMB |= 1 << 7;
+      if (numMBPoints) maskMB |= 1 << 8;
+      /* each sub-mask uses 9 bits (0..8): shifting by only 8 made the MB points bit collide with the triangles bit */
+      return (mask<<9) | maskMB;
+    }
+    /*! component-wise sum of two counter sets */
+    __forceinline GeometryCounts operator+ (GeometryCounts const & rhs) const
+    {
+      GeometryCounts ret;
+      ret.numFilterFunctions = numFilterFunctions + rhs.numFilterFunctions;
+      ret.numTriangles = numTriangles + rhs.numTriangles;
+      ret.numMBTriangles = numMBTriangles + rhs.numMBTriangles;
+      ret.numQuads = numQuads + rhs.numQuads;
+      ret.numMBQuads = numMBQuads + rhs.numMBQuads;
+      ret.numBezierCurves = numBezierCurves + rhs.numBezierCurves;
+      ret.numMBBezierCurves = numMBBezierCurves + rhs.numMBBezierCurves;
+      ret.numLineSegments = numLineSegments + rhs.numLineSegments;
+      ret.numMBLineSegments = numMBLineSegments + rhs.numMBLineSegments;
+      ret.numSubdivPatches = numSubdivPatches + rhs.numSubdivPatches;
+      ret.numMBSubdivPatches = numMBSubdivPatches + rhs.numMBSubdivPatches;
+      ret.numUserGeometries = numUserGeometries + rhs.numUserGeometries;
+      ret.numMBUserGeometries = numMBUserGeometries + rhs.numMBUserGeometries;
+      ret.numInstancesCheap = numInstancesCheap + rhs.numInstancesCheap;
+      ret.numMBInstancesCheap = numMBInstancesCheap + rhs.numMBInstancesCheap;
+      ret.numInstancesExpensive = numInstancesExpensive + rhs.numInstancesExpensive;
+      ret.numMBInstancesExpensive = numMBInstancesExpensive + rhs.numMBInstancesExpensive;
+      ret.numGrids = numGrids + rhs.numGrids;
+      ret.numMBGrids = numMBGrids + rhs.numMBGrids;
+      ret.numPoints = numPoints + rhs.numPoints;
+      ret.numMBPoints = numMBPoints + rhs.numMBPoints;
+
+      return ret;
+    }
+
+    size_t numFilterFunctions;       //!< number of geometries with filter functions enabled
+    size_t numTriangles;             //!< number of enabled triangles
+    size_t numMBTriangles;           //!< number of enabled motion blurred triangles
+    size_t numQuads;                 //!< number of enabled quads
+    size_t numMBQuads;               //!< number of enabled motion blurred quads
+    size_t numBezierCurves;          //!< number of enabled curves
+    size_t numMBBezierCurves;        //!< number of enabled motion blurred curves
+    size_t numLineSegments;          //!< number of enabled line segments
+    size_t numMBLineSegments;        //!< number of enabled motion blurred line segments
+    size_t numSubdivPatches;         //!< number of enabled subdivision patches
+    size_t numMBSubdivPatches;       //!< number of enabled motion blurred subdivision patches
+    size_t numUserGeometries;        //!< number of enabled user geometries
+    size_t numMBUserGeometries;      //!< number of enabled motion blurred user geometries
+    size_t numInstancesCheap;        //!< number of enabled cheap instances
+    size_t numMBInstancesCheap;      //!< number of enabled motion blurred cheap instances
+    size_t numInstancesExpensive;    //!< number of enabled expensive instances
+    size_t numMBInstancesExpensive;  //!< number of enabled motion blurred expensive instances
+    size_t numGrids;                 //!< number of enabled grid geometries
+    size_t numMBGrids;               //!< number of enabled motion blurred grid geometries
+    size_t numPoints;                //!< number of enabled points
+    size_t numMBPoints;              //!< number of enabled motion blurred points
+  };
+
+  /*! Base class all geometries are derived from */
+  class Geometry : public RefCount
+  {
+    friend class Scene;
+  public:
+
+    /*! type of geometry */
+    enum GType
+    {
+      GTY_FLAT_LINEAR_CURVE = 0,
+      GTY_ROUND_LINEAR_CURVE = 1,
+      GTY_ORIENTED_LINEAR_CURVE = 2,
+      GTY_CONE_LINEAR_CURVE = 3,
+      
+      GTY_FLAT_BEZIER_CURVE = 4,
+      GTY_ROUND_BEZIER_CURVE = 5,
+      GTY_ORIENTED_BEZIER_CURVE = 6,
+      
+      GTY_FLAT_BSPLINE_CURVE = 8,
+      GTY_ROUND_BSPLINE_CURVE = 9,
+      GTY_ORIENTED_BSPLINE_CURVE = 10,
+
+      GTY_FLAT_HERMITE_CURVE = 12,
+      GTY_ROUND_HERMITE_CURVE = 13,
+      GTY_ORIENTED_HERMITE_CURVE = 14,
+      
+      GTY_FLAT_CATMULL_ROM_CURVE = 16,
+      GTY_ROUND_CATMULL_ROM_CURVE = 17,
+      GTY_ORIENTED_CATMULL_ROM_CURVE = 18,      
+
+      GTY_TRIANGLE_MESH = 20,
+      GTY_QUAD_MESH = 21,
+      GTY_GRID_MESH = 22,
+      GTY_SUBDIV_MESH = 23,
+
+      GTY_SPHERE_POINT = 25,
+      GTY_DISC_POINT = 26,
+      GTY_ORIENTED_DISC_POINT = 27,
+      
+      GTY_USER_GEOMETRY = 29,
+      GTY_INSTANCE_CHEAP = 30,
+      GTY_INSTANCE_EXPENSIVE = 31,
+      GTY_END = 32,
+
+      GTY_BASIS_LINEAR = 0,
+      GTY_BASIS_BEZIER = 4,
+      GTY_BASIS_BSPLINE = 8,
+      GTY_BASIS_HERMITE = 12,
+      GTY_BASIS_CATMULL_ROM = 16,
+      GTY_BASIS_MASK = 28,
+
+      GTY_SUBTYPE_FLAT_CURVE = 0,
+      GTY_SUBTYPE_ROUND_CURVE = 1,
+      GTY_SUBTYPE_ORIENTED_CURVE = 2,
+      GTY_SUBTYPE_MASK = 3,
+    };
+
+    enum GSubType
+    {
+      GTY_SUBTYPE_DEFAULT= 0,
+      GTY_SUBTYPE_INSTANCE_LINEAR = 0,
+      GTY_SUBTYPE_INSTANCE_QUATERNION = 1
+    };
+
+    enum GTypeMask
+    {
+      MTY_FLAT_LINEAR_CURVE = 1ul << GTY_FLAT_LINEAR_CURVE,
+      MTY_ROUND_LINEAR_CURVE = 1ul << GTY_ROUND_LINEAR_CURVE,
+      MTY_CONE_LINEAR_CURVE = 1ul << GTY_CONE_LINEAR_CURVE,
+      MTY_ORIENTED_LINEAR_CURVE = 1ul << GTY_ORIENTED_LINEAR_CURVE,
+      
+      MTY_FLAT_BEZIER_CURVE = 1ul << GTY_FLAT_BEZIER_CURVE,
+      MTY_ROUND_BEZIER_CURVE = 1ul << GTY_ROUND_BEZIER_CURVE,
+      MTY_ORIENTED_BEZIER_CURVE = 1ul << GTY_ORIENTED_BEZIER_CURVE,
+      
+      MTY_FLAT_BSPLINE_CURVE = 1ul << GTY_FLAT_BSPLINE_CURVE,
+      MTY_ROUND_BSPLINE_CURVE = 1ul << GTY_ROUND_BSPLINE_CURVE,
+      MTY_ORIENTED_BSPLINE_CURVE = 1ul << GTY_ORIENTED_BSPLINE_CURVE,
+
+      MTY_FLAT_HERMITE_CURVE = 1ul << GTY_FLAT_HERMITE_CURVE,
+      MTY_ROUND_HERMITE_CURVE = 1ul << GTY_ROUND_HERMITE_CURVE,
+      MTY_ORIENTED_HERMITE_CURVE = 1ul << GTY_ORIENTED_HERMITE_CURVE,
+
+      MTY_FLAT_CATMULL_ROM_CURVE = 1ul << GTY_FLAT_CATMULL_ROM_CURVE,
+      MTY_ROUND_CATMULL_ROM_CURVE = 1ul << GTY_ROUND_CATMULL_ROM_CURVE,
+      MTY_ORIENTED_CATMULL_ROM_CURVE = 1ul << GTY_ORIENTED_CATMULL_ROM_CURVE,
+
+      MTY_CURVE2 = MTY_FLAT_LINEAR_CURVE | MTY_ROUND_LINEAR_CURVE | MTY_CONE_LINEAR_CURVE | MTY_ORIENTED_LINEAR_CURVE,
+      
+      MTY_CURVE4 = MTY_FLAT_BEZIER_CURVE | MTY_ROUND_BEZIER_CURVE | MTY_ORIENTED_BEZIER_CURVE |
+                   MTY_FLAT_BSPLINE_CURVE | MTY_ROUND_BSPLINE_CURVE | MTY_ORIENTED_BSPLINE_CURVE |
+                   MTY_FLAT_HERMITE_CURVE | MTY_ROUND_HERMITE_CURVE | MTY_ORIENTED_HERMITE_CURVE |
+                   MTY_FLAT_CATMULL_ROM_CURVE | MTY_ROUND_CATMULL_ROM_CURVE | MTY_ORIENTED_CATMULL_ROM_CURVE,
+
+      MTY_SPHERE_POINT = 1ul << GTY_SPHERE_POINT,
+      MTY_DISC_POINT = 1ul << GTY_DISC_POINT,
+      MTY_ORIENTED_DISC_POINT = 1ul << GTY_ORIENTED_DISC_POINT,
+
+      MTY_POINTS = MTY_SPHERE_POINT | MTY_DISC_POINT | MTY_ORIENTED_DISC_POINT,
+
+      MTY_CURVES = MTY_CURVE2 | MTY_CURVE4 | MTY_POINTS,
+
+      MTY_TRIANGLE_MESH = 1ul << GTY_TRIANGLE_MESH,
+      MTY_QUAD_MESH = 1ul << GTY_QUAD_MESH,
+      MTY_GRID_MESH = 1ul << GTY_GRID_MESH,
+      MTY_SUBDIV_MESH = 1ul << GTY_SUBDIV_MESH,
+      MTY_USER_GEOMETRY = 1ul << GTY_USER_GEOMETRY,
+
+      MTY_INSTANCE_CHEAP = 1ul << GTY_INSTANCE_CHEAP,
+      MTY_INSTANCE_EXPENSIVE = 1ul << GTY_INSTANCE_EXPENSIVE,
+      MTY_INSTANCE = MTY_INSTANCE_CHEAP | MTY_INSTANCE_EXPENSIVE
+    };
+
+    static const char* gtype_names[GTY_END];
+
+    enum class State : unsigned {
+      MODIFIED = 0,
+      COMMITTED = 1,
+    };
+
+  public:
+    
+    /*! Geometry constructor */
+    Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps);
+
+    /*! Geometry destructor */
+    virtual ~Geometry();
+
+  public:
+
+    /*! tests if geometry is enabled */
+    __forceinline bool isEnabled() const { return enabled; }
+
+    /*! tests if geometry is disabled */
+    __forceinline bool isDisabled() const { return !isEnabled(); }
+
+    /*! tests whether an intersection or an occlusion filter function is set on this geometry */
+    __forceinline bool hasFilterFunctions () const {
+      return !((intersectionFilterN == nullptr) && (occlusionFilterN == nullptr));
+    }
+
+    /*! returns geometry type */
+    __forceinline GType getType() const { return gtype; }
+
+    /*! returns curve type */
+    __forceinline GType getCurveType() const { return (GType)(gtype & GTY_SUBTYPE_MASK); }
+
+    /*! returns curve basis */
+    __forceinline GType getCurveBasis() const { return (GType)(gtype & GTY_BASIS_MASK); }
+
+    /*! returns geometry type mask: the single bit 1ul << gtype, the same form the MTY_* enumerators use */
+    __forceinline GTypeMask getTypeMask() const { return (GTypeMask)(1ul << gtype); }
+
+    /*! returns number of primitives */
+    __forceinline size_t size() const { return numPrimitives; }
+
+    /*! sets the number of primitives */
+    virtual void setNumPrimitives(unsigned int numPrimitives_in);
+
+    /*! sets number of time steps */
+    virtual void setNumTimeSteps (unsigned int numTimeSteps_in);
+
+    /*! sets motion blur time range */
+    void setTimeRange (const BBox1f range);
+
+    /*! sets number of vertex attributes */
+    virtual void setVertexAttributeCount (unsigned int N) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! sets number of topologies */
+    virtual void setTopologyCount (unsigned int N) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! stores the requested build quality, then calls Geometry::update() */
+    void setBuildQuality(RTCBuildQuality quality_in)
+    {
+      this->quality = quality_in;
+      Geometry::update(); // explicitly qualified call to the Geometry version
+    }
+
+    /* calculate time segment itime and fractional time ftime */
+    __forceinline int timeSegment(float time, float& ftime) const {
+      return getTimeSegment(time,time_range.lower,time_range.upper,fnumTimeSegments,ftime);
+    }
+
+    template<int N>
+      __forceinline vint<N> timeSegment(const vfloat<N>& time, vfloat<N>& ftime) const {
+      return getTimeSegment(time,vfloat<N>(time_range.lower),vfloat<N>(time_range.upper),vfloat<N>(fnumTimeSegments),ftime);
+    }
+    
+    /* calculate overlapping time segment range */
+    __forceinline range<int> timeSegmentRange(const BBox1f& range) const {
+      return getTimeSegmentRange(range,time_range,fnumTimeSegments);
+    }
+
+    /* returns time that corresponds to time step */
+    __forceinline float timeStep(const int i) const {
+      assert(i>=0 && i<(int)numTimeSteps);
+      return time_range.lower + time_range.size()*float(i)/fnumTimeSegments;
+    }
+    
+    /*! for all geometries */
+  public:
+
+    /*! Enable geometry. */
+    virtual void enable();
+
+    /*! Update geometry. */
+    void update();
+    
+    /*! commit of geometry */
+    virtual void commit();
+
+    /*! Update geometry buffer. */
+    virtual void updateBuffer(RTCBufferType type, unsigned int slot) {
+      update(); // update everything for geometries not supporting this call
+    }
+    
+    /*! Disable geometry. */
+    virtual void disable();
+
+    /*! Verify the geometry */
+    virtual bool verify() { return true; }
+
+    /*! called before every build */
+    virtual void preCommit();
+  
+    /*! called after every build */
+    virtual void postCommit();
+
+    virtual void addElementsToCount (GeometryCounts & counts) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    };
+
+    /*! sets constant tessellation rate for the geometry */
+    virtual void setTessellationRate(float N) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Sets the maximal curve radius scale allowed by min-width feature. */
+    virtual void setMaxRadiusScale(float s) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Set user data pointer. */
+    virtual void setUserData(void* ptr);
+      
+    /*! Get user data pointer. */
+    __forceinline void* getUserData() const {
+      return userPtr;
+    }
+
+    /*! interpolates user data to the specified u/v location */
+    virtual void interpolate(const RTCInterpolateArguments* const args) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! interpolates user data to the specified u/v locations */
+    virtual void interpolateN(const RTCInterpolateNArguments* const args);
+
+    /* point query api */
+    bool pointQuery(PointQuery* query, PointQueryContext* context);
+
+    /*! for subdivision surfaces only */
+  public:
+    virtual void setSubdivisionMode (unsigned topologyID, RTCSubdivisionMode mode) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual void setVertexAttributeTopology(unsigned int vertexBufferSlot, unsigned int indexBufferSlot) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Set displacement function. */
+    virtual void setDisplacementFunction (RTCDisplacementFunctionN filter) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual unsigned int getFirstHalfEdge(unsigned int faceID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual unsigned int getFace(unsigned int edgeID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+    
+    virtual unsigned int getNextHalfEdge(unsigned int edgeID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual unsigned int getPreviousHalfEdge(unsigned int edgeID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! get fast access to first vertex buffer if applicable */
+    virtual float * getCompactVertexArray () const {
+      return nullptr;
+    }
+
+    /*! Returns the modified counter - how many times the geo has been modified */
+    __forceinline unsigned int getModCounter () const {
+      return modCounter_;
+    }
+
+    /*! for triangle meshes and bezier curves only */
+  public:
+
+
+    /*! Sets ray mask. */
+    virtual void setMask(unsigned mask) { 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+    
+    /*! Sets specified buffer. */
+    virtual void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Gets specified buffer. */
+    virtual void* getBuffer(RTCBufferType type, unsigned int slot) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+    }
+
+    /*! Set intersection filter function for ray packets of size N. */
+    virtual void setIntersectionFilterFunctionN (RTCFilterFunctionN filterN);
+
+    /*! Set occlusion filter function for ray packets of size N. */
+    virtual void setOcclusionFilterFunctionN (RTCFilterFunctionN filterN);
+
+    /*! for instances only */
+  public:
+
+    /*! Sets the instanced scene */
+    virtual void setInstancedScene(const Ref<Scene>& scene) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+    }
+
+    /*! Sets transformation of the instance */
+    virtual void setTransform(const AffineSpace3fa& transform, unsigned int timeStep) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Sets the quaternion decomposition of the instance for a time step; this default reports the operation as unsupported */
+    virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Returns the transformation of the instance */
+    virtual AffineSpace3fa getTransform(float time) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! for user geometries only */
+  public:
+
+    /*! Set bounds function. */
+    virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr) { 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Set intersect function for ray packets of size N. */
+    virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect) { 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+    
+    /*! Set occlusion function for ray packets of size N. */
+    virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded) { 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+    
+    /*! Set point query function. */
+    void setPointQueryFunction(RTCPointQueryFunction func);
+
+    /*! returns number of time segments */
+    __forceinline unsigned numTimeSegments () const {
+      return numTimeSteps-1;
+    }
+
+  public:
+
+    virtual PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefArray not implemented for this geometry"); 
+    }
+
+    virtual PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const { // base-class default: raises RTC_ERROR_INVALID_OPERATION
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefArrayMB not implemented for this geometry");
+    }
+
+    virtual PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry"); 
+    }
+
+    virtual LinearSpace3fa computeAlignedSpace(const size_t primID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpace not implemented for this geometry"); 
+    }
+
+    virtual LinearSpace3fa computeAlignedSpaceMB(const size_t primID, const BBox1f time_range) const { // base-class default: raises RTC_ERROR_INVALID_OPERATION
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpaceMB not implemented for this geometry");
+    }
+    
+    virtual Vec3fa computeDirection(unsigned int primID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry"); 
+    }
+
+    virtual Vec3fa computeDirection(unsigned int primID, size_t time) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry"); 
+    }
+
+    virtual BBox3fa vbounds(size_t primID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); 
+    }
+    
+    virtual BBox3fa vbounds(const LinearSpace3fa& space, size_t primID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); 
+    }
+
+    virtual BBox3fa vbounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t i, size_t itime = 0) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); 
+    }
+
+    virtual LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); 
+    }
+    
+    virtual LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); 
+    }
+
+    virtual LBBox3fa vlinearBounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); 
+    }
+    
+  public:
+    __forceinline bool hasIntersectionFilter() const { return intersectionFilterN != nullptr; }
+    __forceinline bool hasOcclusionFilter() const { return occlusionFilterN != nullptr; }
+
+  public:
+    Device* device;             //!< device this geometry belongs to
+
+    void* userPtr;              //!< user pointer
+    unsigned int numPrimitives; //!< number of primitives of this geometry
+    
+    unsigned int numTimeSteps;  //!< number of time steps
+    float fnumTimeSegments;     //!< number of time segments (precalculation)
+    BBox1f time_range;          //!< motion blur time range
+    
+    unsigned int mask;             //!< for masking out geometry
+    unsigned int modCounter_ = 1; //!< counter for every modification - used to rebuild scenes when geo is modified
+    
+    struct {
+      GType gtype : 8;                //!< geometry type
+      GSubType gsubtype : 8;          //!< geometry subtype
+      RTCBuildQuality quality : 3;    //!< build quality for geometry
+      unsigned state : 2;
+      bool enabled : 1;              //!< true if geometry is enabled
+    };
+       
+    RTCFilterFunctionN intersectionFilterN;
+    RTCFilterFunctionN occlusionFilterN;
+    RTCPointQueryFunction pointQueryFunc;
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/hit.h b/thirdparty/embree-aarch64/kernels/common/hit.h
new file mode 100644
index 0000000000..32a198cdfe
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/hit.h
@@ -0,0 +1,114 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "ray.h"
+#include "instance_stack.h"
+
+namespace embree
+{
+  /* Hit record for a packet of K hits; each field is a K-wide vector */
+  template<int K>
+    struct HitK
+  {
+    /* Default construction does nothing */
+    __forceinline HitK() {}
+
+    /* Constructs a hit: marks every instance level invalid, then copies the instance ID stack from the context */
+    __forceinline HitK(const RTCIntersectContext* context, const vuint<K>& geomID, const vuint<K>& primID, const vfloat<K>& u, const vfloat<K>& v, const Vec3vf<K>& Ng)
+      : Ng(Ng), u(u), v(v), primID(primID), geomID(geomID) 
+    {
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+        instID[l] = RTC_INVALID_GEOMETRY_ID; // start from an all-invalid stack
+      instance_id_stack::copy(context->instID, instID);
+    }
+
+    /* Returns the number of hits held by this structure (K) */
+    static __forceinline size_t size() { return K; }
+
+  public:
+    Vec3vf<K> Ng;  // geometry normal
+    vfloat<K> u;         // barycentric u coordinate of hit
+    vfloat<K> v;         // barycentric v coordinate of hit
+    vuint<K> primID;      // primitive ID
+    vuint<K> geomID;      // geometry ID
+    vuint<K> instID[RTC_MAX_INSTANCE_LEVEL_COUNT];      // instance ID, one entry per instance level
+  };
+
+  /* Specialization for a single hit; fields are plain scalars */
+  template<>
+    struct __aligned(16) HitK<1>
+  {
+     /* Default construction does nothing */
+    __forceinline HitK() {}
+
+    /* Constructs a hit; instID is filled only by copying the context's instance ID stack (no pre-fill with invalid IDs) */
+    __forceinline HitK(const RTCIntersectContext* context, unsigned int geomID, unsigned int primID, float u, float v, const Vec3fa& Ng)
+      : Ng(Ng.x,Ng.y,Ng.z), u(u), v(v), primID(primID), geomID(geomID)
+    {
+      instance_id_stack::copy(context->instID, instID);
+    }
+
+    /* Returns the number of hits held by this structure (always 1) */
+    static __forceinline size_t size() { return 1; }
+
+  public:
+    Vec3<float> Ng;  // geometry normal
+    float u;         // barycentric u coordinate of hit
+    float v;         // barycentric v coordinate of hit
+    unsigned int primID;      // primitive ID
+    unsigned int geomID;      // geometry ID
+    unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT];      // instance ID, one entry per instance level
+  };
+
+  /* Shortcuts */
+  typedef HitK<1>  Hit;
+  typedef HitK<4>  Hit4;
+  typedef HitK<8>  Hit8;
+  typedef HitK<16> Hit16;
+
+  /* Writes every field of a hit to the stream, one field per line, wrapped in braces */
+  template<int K>
+  __forceinline embree_ostream operator<<(embree_ostream cout, const HitK<K>& hit)
+  {
+    cout << "{ " << embree_endl;
+    cout << "  Ng = " << hit.Ng << embree_endl;
+    cout << "  u = " << hit.u << embree_endl;
+    cout << "  v = " << hit.v << embree_endl;
+    cout << "  primID = " << hit.primID << embree_endl;
+    cout << "  geomID = " << hit.geomID << embree_endl;
+    cout << "  instID =";
+    for (unsigned level = 0; level < RTC_MAX_INSTANCE_LEVEL_COUNT; ++level)
+    {
+      cout << " " << hit.instID[level];
+    }
+    cout << embree_endl;
+    return cout << "}";
+  }
+
+  template<typename Hit>
+    __forceinline void copyHitToRay(RayHit& ray, const Hit& hit)
+  {
+    ray.Ng   = hit.Ng;
+    ray.u    = hit.u;
+    ray.v    = hit.v;
+    ray.primID = hit.primID;
+    ray.geomID = hit.geomID;
+    instance_id_stack::copy(hit.instID, ray.instID);
+  }
+
+  template<int K>
+    __forceinline void copyHitToRay(const vbool<K> &mask, RayHitK<K> &ray, const HitK<K> &hit)
+  {
+    vfloat<K>::storeu(mask,&ray.Ng.x, hit.Ng.x);
+    vfloat<K>::storeu(mask,&ray.Ng.y, hit.Ng.y);
+    vfloat<K>::storeu(mask,&ray.Ng.z, hit.Ng.z);
+    vfloat<K>::storeu(mask,&ray.u, hit.u);
+    vfloat<K>::storeu(mask,&ray.v, hit.v);
+    vuint<K>::storeu(mask,&ray.primID, hit.primID);
+    vuint<K>::storeu(mask,&ray.geomID, hit.geomID);
+    instance_id_stack::copy(hit.instID, ray.instID, mask);
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/instance_stack.h b/thirdparty/embree-aarch64/kernels/common/instance_stack.h
new file mode 100644
index 0000000000..d7e3637f7b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/instance_stack.h
@@ -0,0 +1,199 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore.h"
+
+namespace embree {
+namespace instance_id_stack {
+
+static_assert(RTC_MAX_INSTANCE_LEVEL_COUNT > 0, 
+              "RTC_MAX_INSTANCE_LEVEL_COUNT must be greater than 0.");
+
+/*******************************************************************************
+ * Instance ID stack manipulation.
+ * This is used from the instance intersector.
+ ******************************************************************************/
+
+/*
+ * Push an instance ID onto the stack. Returns false (ID dropped) if no slot is free.
+ */
+RTC_FORCEINLINE bool push(RTCIntersectContext* context, unsigned instanceId)
+{
+  assert(context);
+#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
+  const bool spaceAvailable = context->instStackSize < RTC_MAX_INSTANCE_LEVEL_COUNT;
+  /* We assert here because instances are silently dropped when the stack is full. 
+     This might be quite hard to find in production. */
+  assert(spaceAvailable); 
+  if (likely(spaceAvailable))
+    context->instID[context->instStackSize++] = instanceId;
+  return spaceAvailable;
+#else
+  /* Single-level build: the only slot is free while it holds the invalid ID. */
+  const bool spaceAvailable = (context->instID[0] == RTC_INVALID_GEOMETRY_ID);
+  if (likely(spaceAvailable))
+    context->instID[0] = instanceId;
+  assert(spaceAvailable); 
+  return spaceAvailable;
+#endif
+}
+
+
+/*
+ * Pop the last pushed instance: its slot is reset to RTC_INVALID_GEOMETRY_ID. Do not call on an empty stack.
+ */
+RTC_FORCEINLINE void pop(RTCIntersectContext* context)
+{
+  assert(context);
+#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
+  assert(context->instStackSize > 0);
+  --context->instStackSize;
+  context->instID[context->instStackSize] = RTC_INVALID_GEOMETRY_ID;
+#else
+  assert(context->instID[0] != RTC_INVALID_GEOMETRY_ID);
+  context->instID[0] = RTC_INVALID_GEOMETRY_ID;
+#endif
+}
+
+/*******************************************************************************
+ * Optimized instance id stack copy.
+ * The copy() function at the bottom of this block will either copy full
+ * stacks or copy only until the last valid element has been copied, depending
+ * on RTC_MAX_INSTANCE_LEVEL_COUNT.
+ ******************************************************************************/
+
+/*
+ * Plain array assignment. This works for scalar->scalar,
+ * scalar->vector, and vector->vector.
+ */
+template <class Src, class Tgt>
+RTC_FORCEINLINE void level_copy(unsigned level, Src* src, Tgt* tgt)
+{
+  tgt[level] = src[level];
+}
+
+/*
+ * Masked SIMD vector->vector store.
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const vuint<K>* src, vuint<K>* tgt, const vbool<K>& mask)
+{
+  vuint<K>::storeu(mask, tgt + level, src[level]);
+}
+
+/*
+ * Masked scalar->SIMD vector store.
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const unsigned* src, vuint<K>* tgt, const vbool<K>& mask)
+{
+  vuint<K>::store(mask, tgt + level, src[level]);
+}
+
+/*
+ * Indexed assign from vector to scalar.
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const vuint<K>* src, unsigned* tgt, const size_t& idx)
+{
+  tgt[level] = src[level][idx];
+}
+
+/*
+ * Indexed assign from scalar to vector.
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const unsigned* src, vuint<K>* tgt, const size_t& idx)
+{
+  tgt[level][idx] = src[level];
+}
+
+/*
+ * Indexed assign from vector to vector.
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const vuint<K>* src, vuint<K>* tgt, const size_t& i, const size_t& j)
+{
+  tgt[level][j] = src[level][i];
+}
+
+/*
+ * Check if the given stack level is valid.
+ * These are only used for large max stack sizes.
+ */
+RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack)
+{
+  return stack[level] != RTC_INVALID_GEOMETRY_ID;
+}
+RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack, const size_t& /*i*/)
+{
+  return stack[level] != RTC_INVALID_GEOMETRY_ID;
+}
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack, const vbool<K>& /*mask*/)
+{
+  return stack[level] != RTC_INVALID_GEOMETRY_ID;
+}
+
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack)
+{
+  return any(stack[level] != RTC_INVALID_GEOMETRY_ID);
+}
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack, const vbool<K>& mask)
+{
+  return any(mask & (stack[level] != RTC_INVALID_GEOMETRY_ID));
+}
+
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack, const size_t& i)
+{
+  return stack[level][i] != RTC_INVALID_GEOMETRY_ID;
+}
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack, const size_t& i, const size_t& /*j*/)
+{
+  return stack[level][i] != RTC_INVALID_GEOMETRY_ID;
+}
+
+/*
+ * Copy an instance ID stack from src to tgt, one level at a time.
+ *
+ * The trailing args (none, a mask, or lane indices) are forwarded unchanged
+ * and select the matching level_copy / level_valid overload above.
+ */
+template <class Src, class Tgt, class... Args>
+RTC_FORCEINLINE void copy(Src src, Tgt tgt, Args&&... args)
+{
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1)
+  /* 
+   * Avoid all loops for only one level. 
+   */
+  level_copy(0, src, tgt, std::forward<Args>(args)...);
+
+#elif (RTC_MAX_INSTANCE_LEVEL_COUNT <= 4)
+  /* 
+   * It is faster to avoid the valid test for low level counts.
+   * Just copy the whole stack.
+   */
+  for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+    level_copy(l, src, tgt, std::forward<Args>(args)...);
+
+#else
+  /* 
+   * For general stack sizes, it pays off to test for validity.
+   */
+  bool valid = true;
+  for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT && valid; ++l)
+  {
+    level_copy(l, src, tgt, std::forward<Args>(args)...);
+    valid = level_valid(l, src, std::forward<Args>(args)...); // the first invalid level is copied too, then the loop ends
+  }
+#endif
+}
+
+} // namespace instance_id_stack
+} // namespace embree
+
diff --git a/thirdparty/embree-aarch64/kernels/common/isa.h b/thirdparty/embree-aarch64/kernels/common/isa.h
new file mode 100644
index 0000000000..63fb8d3351
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/isa.h
@@ -0,0 +1,271 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../common/sys/platform.h"
+#include "../../common/sys/sysinfo.h"
+
+namespace embree
+{
+#define DEFINE_SYMBOL2(type,name)               \
+  typedef type (*name##Func)();                 \
+  name##Func name;
+  
+#define DECLARE_SYMBOL2(type,name)                                       \
+  namespace sse2      { extern type name(); }                           \
+  namespace sse42     { extern type name(); }                           \
+  namespace avx       { extern type name(); }                           \
+  namespace avx2      { extern type name(); }                           \
+  namespace avx512knl { extern type name(); }                           \
+  namespace avx512skx { extern type name(); }                           \
+  void name##_error2() { throw_RTCError(RTC_ERROR_UNKNOWN,"internal error in ISA selection for " TOSTRING(name)); } \
+  type name##_error() { return type(name##_error2); }                   \
+  type name##_zero() { return type(nullptr); }
+
+#define DECLARE_ISA_FUNCTION(type,symbol,args)                            \
+  namespace sse2      { extern type symbol(args); }                       \
+  namespace sse42     { extern type symbol(args); }                       \
+  namespace avx       { extern type symbol(args); }                       \
+  namespace avx2      { extern type symbol(args); }                       \
+  namespace avx512knl { extern type symbol(args); }                       \
+  namespace avx512skx { extern type symbol(args); }                     \
+  inline type symbol##_error(args) { throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"function " TOSTRING(symbol) " not supported by your CPU"); } \
+  typedef type (*symbol##Ty)(args);                                       \
+  
+#define DEFINE_ISA_FUNCTION(type,symbol,args)   \
+  typedef type (*symbol##Func)(args);           \
+  symbol##Func symbol;
+  
+#define ZERO_SYMBOL(features,intersector)                      \
+  intersector = intersector##_zero;
+
+#define INIT_SYMBOL(features,intersector)                      \
+  intersector = decltype(intersector)(intersector##_error);
+
+#define SELECT_SYMBOL_DEFAULT(features,intersector) \
+  intersector = isa::intersector;
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+#if !defined(EMBREE_TARGET_SIMD4)
+#define EMBREE_TARGET_SIMD4
+#endif
+#endif
+
+#if defined(EMBREE_TARGET_SSE42)
+#define SELECT_SYMBOL_SSE42(features,intersector) \
+  if ((features & SSE42) == SSE42) intersector = sse42::intersector;
+#else
+#define SELECT_SYMBOL_SSE42(features,intersector)
+#endif
+
+#if defined(EMBREE_TARGET_AVX) || defined(__AVX__)
+#if !defined(EMBREE_TARGET_SIMD8)
+#define EMBREE_TARGET_SIMD8
+#endif
+#if defined(__AVX__) // if default ISA is >= AVX we treat AVX target as default target
+#define SELECT_SYMBOL_AVX(features,intersector)                 \
+  if ((features & ISA) == ISA) intersector = isa::intersector;
+#else
+#define SELECT_SYMBOL_AVX(features,intersector)                 \
+  if ((features & AVX) == AVX) intersector = avx::intersector;
+#endif
+#else
+#define SELECT_SYMBOL_AVX(features,intersector)
+#endif
+
+#if defined(EMBREE_TARGET_AVX2)
+#if !defined(EMBREE_TARGET_SIMD8)
+#define EMBREE_TARGET_SIMD8
+#endif
+#define SELECT_SYMBOL_AVX2(features,intersector) \
+  if ((features & AVX2) == AVX2) intersector = avx2::intersector;
+#else
+#define SELECT_SYMBOL_AVX2(features,intersector)
+#endif
+
+#if defined(EMBREE_TARGET_AVX512KNL)
+#if !defined(EMBREE_TARGET_SIMD16)
+#define EMBREE_TARGET_SIMD16
+#endif
+#define SELECT_SYMBOL_AVX512KNL(features,intersector) \
+  if ((features & AVX512KNL) == AVX512KNL) intersector = avx512knl::intersector;
+#else
+#define SELECT_SYMBOL_AVX512KNL(features,intersector)
+#endif
+
+#if defined(EMBREE_TARGET_AVX512SKX)
+#if !defined(EMBREE_TARGET_SIMD16)
+#define EMBREE_TARGET_SIMD16
+#endif
+#define SELECT_SYMBOL_AVX512SKX(features,intersector) \
+  if ((features & AVX512SKX) == AVX512SKX) intersector = avx512skx::intersector;
+#else
+#define SELECT_SYMBOL_AVX512SKX(features,intersector)
+#endif
+
+#define SELECT_SYMBOL_DEFAULT_SSE42(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);            \
+  SELECT_SYMBOL_SSE42(features,intersector);                                  
+  
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                \
+  SELECT_SYMBOL_SSE42(features,intersector);                  \
+  SELECT_SYMBOL_AVX(features,intersector);                        
+  
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                     \
+  SELECT_SYMBOL_SSE42(features,intersector);                       \
+  SELECT_SYMBOL_AVX(features,intersector);                         \
+  SELECT_SYMBOL_AVX2(features,intersector);                       
+
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                          \
+  SELECT_SYMBOL_SSE42(features,intersector);                            \
+  SELECT_SYMBOL_AVX(features,intersector);                              \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                                   \
+  SELECT_SYMBOL_AVX(features,intersector);                                       \
+  SELECT_SYMBOL_AVX2(features,intersector);                                      \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                                 \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                         \
+  SELECT_SYMBOL_AVX(features,intersector);                             \
+  SELECT_SYMBOL_AVX2(features,intersector);                            \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                                         \
+  SELECT_SYMBOL_SSE42(features,intersector);                                           \
+  SELECT_SYMBOL_AVX(features,intersector);                                             \
+  SELECT_SYMBOL_AVX2(features,intersector);                                            \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                                       \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                               \
+  SELECT_SYMBOL_SSE42(features,intersector);                                 \
+  SELECT_SYMBOL_AVX(features,intersector);                                   \
+  SELECT_SYMBOL_AVX2(features,intersector);                                  \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+  
+#define SELECT_SYMBOL_DEFAULT_AVX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);          \
+  SELECT_SYMBOL_AVX(features,intersector);                        
+  
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);               \
+  SELECT_SYMBOL_AVX(features,intersector);                   \
+  SELECT_SYMBOL_AVX2(features,intersector);                       
+// Multi-statement macro: SELECT_SYMBOL_DEFAULT, then SELECT_SYMBOL_AVX and _AVX512KNL, each on (features,intersector).
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                    \
+  SELECT_SYMBOL_AVX(features,intersector);                        \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);
+// Multi-statement macro: SELECT_SYMBOL_DEFAULT, then SELECT_SYMBOL_AVX, _AVX512KNL and _AVX512SKX, each on (features,intersector).
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                              \
+  SELECT_SYMBOL_AVX(features,intersector);                                  \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                            \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+// Multi-statement macro: SELECT_SYMBOL_DEFAULT, then SELECT_SYMBOL_AVX and _AVX512SKX, each on (features,intersector).
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                    \
+  SELECT_SYMBOL_AVX(features,intersector);                        \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+// Multi-statement macro: INIT_SYMBOL, then SELECT_SYMBOL_AVX, each on (features,intersector).
+#define SELECT_SYMBOL_INIT_AVX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                 \
+  SELECT_SYMBOL_AVX(features,intersector);                                
+// Multi-statement macro: INIT_SYMBOL, then SELECT_SYMBOL_AVX and _AVX2, each on (features,intersector).
+#define SELECT_SYMBOL_INIT_AVX_AVX2(features,intersector) \
+  INIT_SYMBOL(features,intersector);                      \
+  SELECT_SYMBOL_AVX(features,intersector);                \
+  SELECT_SYMBOL_AVX2(features,intersector);
+// Multi-statement macro: INIT_SYMBOL, then SELECT_SYMBOL_AVX, _AVX2 and _AVX512SKX, each on (features,intersector).
+#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                \
+  SELECT_SYMBOL_AVX(features,intersector);                          \
+  SELECT_SYMBOL_AVX2(features,intersector);                         \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+// Multi-statement macro: INIT_SYMBOL, then SELECT_SYMBOL_SSE42, _AVX and _AVX2, each on (features,intersector).
+#define SELECT_SYMBOL_INIT_SSE42_AVX_AVX2(features,intersector) \
+  INIT_SYMBOL(features,intersector);                            \
+  SELECT_SYMBOL_SSE42(features,intersector);                    \
+  SELECT_SYMBOL_AVX(features,intersector);                      \
+  SELECT_SYMBOL_AVX2(features,intersector);
+// Multi-statement macro: INIT_SYMBOL, then SELECT_SYMBOL_AVX and _AVX512KNL, each on (features,intersector).
+#define SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,intersector) \
+  INIT_SYMBOL(features,intersector);                           \
+  SELECT_SYMBOL_AVX(features,intersector);                     \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);
+// Multi-statement macro: INIT_SYMBOL, then SELECT_SYMBOL_AVX, _AVX512KNL and _AVX512SKX, each on (features,intersector).
+#define SELECT_SYMBOL_INIT_AVX_AVX512KNL_AVX512SKX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                     \
+  SELECT_SYMBOL_AVX(features,intersector);                               \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                         \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+// Multi-statement macro: INIT_SYMBOL, then SELECT_SYMBOL_AVX, _AVX2 and _AVX512KNL, each on (features,intersector).
+#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                \
+  SELECT_SYMBOL_AVX(features,intersector);                          \
+  SELECT_SYMBOL_AVX2(features,intersector);                         \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);
+// Multi-statement macro: INIT_SYMBOL, then SELECT_SYMBOL_AVX, _AVX2, _AVX512KNL and _AVX512SKX, each on (features,intersector).
+#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                          \
+  SELECT_SYMBOL_AVX(features,intersector);                                    \
+  SELECT_SYMBOL_AVX2(features,intersector);                                   \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                              \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+// Multi-statement macro: INIT_SYMBOL, then SELECT_SYMBOL_SSE42, _AVX, _AVX2, _AVX512KNL and _AVX512SKX, each on (features,intersector).
+#define SELECT_SYMBOL_INIT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                                \
+  SELECT_SYMBOL_SSE42(features,intersector);                                        \
+  SELECT_SYMBOL_AVX(features,intersector);                                          \
+  SELECT_SYMBOL_AVX2(features,intersector);                                         \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                                    \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+// Multi-statement macro: ZERO_SYMBOL, then SELECT_SYMBOL_SSE42, _AVX, _AVX2, _AVX512KNL and _AVX512SKX, each on (features,intersector).
+#define SELECT_SYMBOL_ZERO_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+  ZERO_SYMBOL(features,intersector);                                    \
+  SELECT_SYMBOL_SSE42(features,intersector);                            \
+  SELECT_SYMBOL_AVX(features,intersector);                              \
+  SELECT_SYMBOL_AVX2(features,intersector);                             \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                               \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+// Multi-statement macro: SELECT_SYMBOL_DEFAULT, then SELECT_SYMBOL_AVX, _AVX2, _AVX512KNL and _AVX512SKX. NOTE(review): possibly a repeat of an earlier definition in this header - confirm.
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                                   \
+  SELECT_SYMBOL_AVX(features,intersector);                                       \
+  SELECT_SYMBOL_AVX2(features,intersector);                                      \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                                 \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+// Multi-statement macro: INIT_SYMBOL, then SELECT_SYMBOL_AVX512KNL and _AVX512SKX, each on (features,intersector).
+#define SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                 \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                     \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+// Multi-statement macro: SELECT_SYMBOL_SSE42, _AVX and _AVX2 only (no DEFAULT/INIT/ZERO step first), each on (features,intersector).
+#define SELECT_SYMBOL_SSE42_AVX_AVX2(features,intersector) \
+  SELECT_SYMBOL_SSE42(features,intersector);               \
+  SELECT_SYMBOL_AVX(features,intersector);                 \
+  SELECT_SYMBOL_AVX2(features,intersector);
+  // getISA() yields the ISA macro of the including translation unit after `depth` recursive calls (presumably a per-ISA link check - confirm at call sites).
+  struct VerifyMultiTargetLinking {
+    static __noinline int getISA(int depth = 5) {
+      // bottom of the recursion returns ISA; every other level just forwards with depth-1
+      return (depth == 0) ? ISA : getISA(depth-1);
+    }
+  };
+  namespace sse2      { int getISA(); }  // getISA() is only declared here, once per ISA namespace
+  namespace sse42     { int getISA(); }
+  namespace avx       { int getISA(); }
+  namespace avx2      { int getISA(); }
+  namespace avx512knl { int getISA(); }
+  namespace avx512skx { int getISA(); }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/motion_derivative.h b/thirdparty/embree-aarch64/kernels/common/motion_derivative.h
new file mode 100644
index 0000000000..82953f0e89
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/motion_derivative.h
@@ -0,0 +1,325 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../common/math/affinespace.h"
+#include "../../common/math/interval.h"
+
+#include <functional>
+
+namespace embree {
+// MotionDerivative::findRoots drops a candidate root closer than this to an already stored root.
+#define MOTION_DERIVATIVE_ROOT_EPSILON 1e-4f
+// Defined further down in this header (generated code); reads the parameter array p and fills coeff.
+static void motion_derivative_coefficients(const float *p, float *coeff);
+
+struct MotionDerivativeCoefficients
+{
+  float theta;          // acos of the clamped dot product of the two input quaternions
+  float coeffs[3*8*7];  // output of motion_derivative_coefficients(); MotionDerivative indexes it as [dim][8][7]
+
+  MotionDerivativeCoefficients() {}  // leaves theta and coeffs unset
+
+  // xfm0 and xfm1 are interpreted as quaternion decompositions
+  MotionDerivativeCoefficients(AffineSpace3ff const& xfm0, AffineSpace3ff const& xfm1)
+  {
+    // dot product of the two quaternions, clamped to [-1,1] before taking acos
+    const float quatDot = xfm0.l.vx.w * xfm1.l.vx.w
+                        + xfm0.l.vy.w * xfm1.l.vy.w
+                        + xfm0.l.vz.w * xfm1.l.vz.w
+                        + xfm0.p.w * xfm1.p.w;
+    const float cosAngle = min(1.f, max(-1.f, quatDot));
+    theta = std::acos(cosAngle);
+
+    // starts as the quaternion of xfm1; below the 0.995 threshold it becomes its normalized part perpendicular to the quaternion of xfm0
+    Vec4f perp(xfm1.p.w, xfm1.l.vx.w, xfm1.l.vy.w, xfm1.l.vz.w);
+    if (cosAngle < 0.995f) {
+      perp.x -= cosAngle * xfm0.p.w;
+      perp.y -= cosAngle * xfm0.l.vx.w;
+      perp.z -= cosAngle * xfm0.l.vy.w;
+      perp.w -= cosAngle * xfm0.l.vz.w;
+      perp = normalize(perp);
+    }
+    const float params[33] = {  // entry order is fixed: the coefficient code reads these by index
+      theta,
+      xfm0.l.vx.y, xfm0.l.vx.z, xfm0.l.vy.z, // translation component of xfm0
+      xfm1.l.vx.y, xfm1.l.vx.z, xfm1.l.vy.z, // translation component of xfm1
+      xfm0.p.w, xfm0.l.vx.w, xfm0.l.vy.w, xfm0.l.vz.w, // quaternion of xfm0
+      perp.x, perp.y, perp.z, perp.w,
+      xfm0.l.vx.x, xfm0.l.vy.x, xfm0.l.vz.x, xfm0.p.x, // scale/skew component of xfm0
+                   xfm0.l.vy.y, xfm0.l.vz.y, xfm0.p.y,
+                                xfm0.l.vz.z, xfm0.p.z,
+      xfm1.l.vx.x, xfm1.l.vy.x, xfm1.l.vz.x, xfm1.p.x, // scale/skew component of xfm1
+                   xfm1.l.vy.y, xfm1.l.vz.y, xfm1.p.y,
+                                xfm1.l.vz.z, xfm1.p.z
+    };
+    motion_derivative_coefficients(params, coeffs);
+  }
+};
+
+struct MotionDerivative
+{
+  float twoTheta;  // 2 * mdc.theta; scales the argument of the cos/sin terms
+  float c[8];      // c[i] = sum over j of mdc.coeffs[8*7*dim + i*7 + j] * (1, p0.xyz, p1.xyz)[j]
+
+  MotionDerivative(MotionDerivativeCoefficients const& mdc,
+                    int dim, Vec3fa const& p0, Vec3fa const& p1)
+    : twoTheta(2.f*mdc.theta)
+  {
+    const float basis[7] = { 1, p0.x, p0.y, p0.z, p1.x, p1.y, p1.z };
+    for (int i = 0; i < 8; ++i) {
+      const float* row = mdc.coeffs + 8*7*dim + i*7;
+      float sum = 0;
+      for (int j = 0; j < 7; ++j)
+        sum += row[j] * basis[j];
+      c[i] = sum;
+    }
+  }
+
+  // Functor returning, for a time value t of type T,
+  //   c0 + c1*t + (c2 + c3*t + c4*t*t)*cos(twoTheta*t) + (c5 + c6*t + c7*t*t)*sin(twoTheta*t) + offset
+  template<typename T>
+  struct EvalMotionDerivative
+  {
+    MotionDerivative const& md;
+    float offset;
+
+    EvalMotionDerivative(MotionDerivative const& md, float offset) : md(md), offset(offset) {}
+
+    T operator()(T const& time) const {
+      const T phase = md.twoTheta * time;
+      const T cosAmp = md.c[2] + md.c[3] * time + md.c[4] * time * time;
+      const T sinAmp = md.c[5] + md.c[6] * time + md.c[7] * time * time;
+      return md.c[0] + md.c[1] * time + cosAmp * cos(phase) + sinAmp * sin(phase) + offset;
+    }
+  };
+
+  // Searches `interval` for zeros of the offset function, stores at most maxNumRoots of them in `roots`, returns the number stored.
+  unsigned int findRoots(Interval1f const& interval, float offset,
+                         float* roots, unsigned int maxNumRoots)
+  {
+    unsigned int found = 0;
+    const EvalMotionDerivative<Interval1f> f(*this, offset);
+    findRoots(f, interval, found, roots, maxNumRoots);
+    return found;
+  }
+
+  // Recursive bisection: gives up on `interval` when the interval value of `eval` over it
+  // excludes zero or is degenerate; otherwise halves it until it is narrower than 1e-7 and
+  // records the midpoint, unless a stored root lies within MOTION_DERIVATIVE_ROOT_EPSILON.
+  template<typename Eval>
+  static void findRoots(Eval const& eval, Interval1f const& interval,
+                        unsigned int& numRoots, float* roots, unsigned int maxNumRoots)
+  {
+    const Interval1f bounds = eval(interval);
+    if (bounds.lower > 0 || bounds.upper < 0 || bounds.lower >= bounds.upper) return;
+    const float mid = 0.5f * (interval.upper + interval.lower);
+    const bool tooNarrow = interval.upper-interval.lower < 1e-7f
+                        || abs(mid-interval.lower) < 1e-7f
+                        || abs(mid-interval.upper) < 1e-7f;
+    if (!tooNarrow) {
+      findRoots(eval, Interval1f(interval.lower, mid), numRoots, roots, maxNumRoots);
+      findRoots(eval, Interval1f(mid, interval.upper), numRoots, roots, maxNumRoots);
+      return;
+    }
+
+    // check if the root already exists
+    const unsigned int stored = numRoots < maxNumRoots ? numRoots : maxNumRoots;
+    for (unsigned int k = 0; k < stored; ++k)
+      if (abs(roots[k]-mid) < MOTION_DERIVATIVE_ROOT_EPSILON) return;
+    // the two checks below keep their original statement shape (compiler workaround, see FIXME)
+    if (numRoots < maxNumRoots) {
+      roots[numRoots++] = mid;
+    }
+    if (numRoots > maxNumRoots) {
+      printf("error: more roots than expected\n"); // FIXME: workaround for ICC2019.4 compiler bug under macOS
+      return;
+    }
+  }
+};
+
+/******************************************************************************
+ *                       Code generated with sympy 1.4                        *
+ *              See http://www.sympy.org/ for more information.               *
+ *                                                                            *
+ * see                                                                        *
+ *                                                                            *
+ *     scripts/generate_motion_derivative_coefficients.py                     *
+ *                                                                            *
+ * for how this code is generated                                             *
+ *                                                                            *
+ ******************************************************************************/
+static void motion_derivative_coefficients(const float *p, float *coeff)
+{
+   coeff[0] = -p[1] + p[4] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27] - p[18] + p[27];
+   coeff[1] = 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - p[14]*p[14]*p[24] - 2*p[15] + p[24];
+   coeff[2] = 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - p[14]*p[14]*p[25] - 2*p[16] + p[25];
+   coeff[3] = -2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - p[14]*p[14]*p[26] - 2*p[17] + p[26];
+   coeff[4] = (-p[9]*p[9] - p[10]*p[10] - p[13]*p[13] - p[14]*p[14] + 1)*p[15];
+   coeff[5] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - p[9]*p[9]*p[16] - p[10]*p[10]*p[16] - p[11]*p[14]*p[19] + p[12]*p[13]*p[19] - p[13]*p[13]*p[16] - p[14]*p[14]*p[16] + p[16];
+   coeff[6] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] + p[11]*p[13]*p[22] - p[11]*p[14]*p[20] + p[12]*p[13]*p[20] + p[12]*p[14]*p[22] - p[13]*p[13]*p[17] - p[14]*p[14]*p[17] + p[17];
+   coeff[7] = 0;
+   coeff[8] = -2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24] + 2*p[15] - 2*p[24];
+   coeff[9] = -2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25] + 2*p[16] - 2*p[25];
+   coeff[10] = 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26] + 2*p[17] - 2*p[26];
+   coeff[11] = 2*p[9]*p[9]*p[15] - 2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24] - 2*p[15] + 2*p[24];
+   coeff[12] = 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25] - 2*p[16] + 2*p[25];
+   coeff[13] = -2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26] - 2*p[17] + 2*p[26];
+   coeff[14] = 2*p[0]*p[7]*p[11]*p[18] + 2*p[0]*p[7]*p[13]*p[23] - 2*p[0]*p[7]*p[14]*p[21] + 2*p[0]*p[8]*p[12]*p[18] + 2*p[0]*p[8]*p[13]*p[21] + 2*p[0]*p[8]*p[14]*p[23] + 2*p[0]*p[9]*p[11]*p[23] + 2*p[0]*p[9]*p[12]*p[21] - 2*p[0]*p[9]*p[13]*p[18] - 2*p[0]*p[10]*p[11]*p[21] + 2*p[0]*p[10]*p[12]*p[23] - 2*p[0]*p[10]*p[14]*p[18] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] + p[11]*p[13]*p[23] - p[11]*p[13]*p[32] - p[11]*p[14]*p[21] + p[11]*p[14]*p[30] + p[12]*p[13]*p[21] - p[12]*p[13]*p[30] + p[12]*p[14]*p[23] - p[12]*p[14]*p[32] - p[13]*p[13]*p[18] + p[13]*p[13]*p[27] - p[14]*p[14]*p[18] + p[14]*p[14]*p[27];
+   coeff[15] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + p[14]*p[14]*p[24];
+   coeff[16] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + p[14]*p[14]*p[25];
+   coeff[17] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + p[14]*p[14]*p[26];
+   coeff[18] = (-p[9]*p[9] - p[10]*p[10] + p[13]*p[13] + p[14]*p[14])*p[15];
+   coeff[19] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - p[9]*p[9]*p[16] - p[10]*p[10]*p[16] + p[11]*p[14]*p[19] - p[12]*p[13]*p[19] + p[13]*p[13]*p[16] + p[14]*p[14]*p[16];
+   coeff[20] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] - p[11]*p[13]*p[22] + p[11]*p[14]*p[20] - p[12]*p[13]*p[20] - p[12]*p[14]*p[22] + p[13]*p[13]*p[17] + p[14]*p[14]*p[17];
+   coeff[21] = 2*(-p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27])*p[0];
+   coeff[22] = -4*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[7]*p[11]*p[24] - 4*p[0]*p[8]*p[12]*p[15] + 2*p[0]*p[8]*p[12]*p[24] + 4*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[9]*p[13]*p[24] + 4*p[0]*p[10]*p[14]*p[15] - 2*p[0]*p[10]*p[14]*p[24] - 2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24];
+   coeff[23] = -4*p[0]*p[7]*p[11]*p[16] + 2*p[0]*p[7]*p[11]*p[25] + 4*p[0]*p[7]*p[14]*p[19] - 2*p[0]*p[7]*p[14]*p[28] - 4*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[12]*p[25] - 4*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[8]*p[13]*p[28] - 4*p[0]*p[9]*p[12]*p[19] + 2*p[0]*p[9]*p[12]*p[28] + 4*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[9]*p[13]*p[25] + 4*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[11]*p[28] + 4*p[0]*p[10]*p[14]*p[16] - 2*p[0]*p[10]*p[14]*p[25] - 2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25];
+   coeff[24] = -4*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[11]*p[26] - 4*p[0]*p[7]*p[13]*p[22] + 2*p[0]*p[7]*p[13]*p[31] + 4*p[0]*p[7]*p[14]*p[20] - 2*p[0]*p[7]*p[14]*p[29] - 4*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[12]*p[26] - 4*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[13]*p[29] - 4*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[8]*p[14]*p[31] - 4*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[11]*p[31] - 4*p[0]*p[9]*p[12]*p[20] + 2*p[0]*p[9]*p[12]*p[29] + 4*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[9]*p[13]*p[26] + 4*p[0]*p[10]*p[11]*p[20] - 2*p[0]*p[10]*p[11]*p[29] - 4*p[0]*p[10]*p[12]*p[22] + 2*p[0]*p[10]*p[12]*p[31] + 4*p[0]*p[10]*p[14]*p[17] - 2*p[0]*p[10]*p[14]*p[26] + 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26];
+   coeff[25] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - 2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24];
+   coeff[26] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25];
+   coeff[27] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26];
+   coeff[28] = 0;
+   coeff[29] = 2*(p[7]*p[11]*p[15] - p[7]*p[11]*p[24] + p[8]*p[12]*p[15] - p[8]*p[12]*p[24] - p[9]*p[13]*p[15] + p[9]*p[13]*p[24] - p[10]*p[14]*p[15] + p[10]*p[14]*p[24])*p[0];
+   coeff[30] = 2*(p[7]*p[11]*p[16] - p[7]*p[11]*p[25] - p[7]*p[14]*p[19] + p[7]*p[14]*p[28] + p[8]*p[12]*p[16] - p[8]*p[12]*p[25] + p[8]*p[13]*p[19] - p[8]*p[13]*p[28] + p[9]*p[12]*p[19] - p[9]*p[12]*p[28] - p[9]*p[13]*p[16] + p[9]*p[13]*p[25] - p[10]*p[11]*p[19] + p[10]*p[11]*p[28] - p[10]*p[14]*p[16] + p[10]*p[14]*p[25])*p[0];
+   coeff[31] = 2*(p[7]*p[11]*p[17] - p[7]*p[11]*p[26] + p[7]*p[13]*p[22] - p[7]*p[13]*p[31] - p[7]*p[14]*p[20] + p[7]*p[14]*p[29] + p[8]*p[12]*p[17] - p[8]*p[12]*p[26] + p[8]*p[13]*p[20] - p[8]*p[13]*p[29] + p[8]*p[14]*p[22] - p[8]*p[14]*p[31] + p[9]*p[11]*p[22] - p[9]*p[11]*p[31] + p[9]*p[12]*p[20] - p[9]*p[12]*p[29] - p[9]*p[13]*p[17] + p[9]*p[13]*p[26] - p[10]*p[11]*p[20] + p[10]*p[11]*p[29] + p[10]*p[12]*p[22] - p[10]*p[12]*p[31] - p[10]*p[14]*p[17] + p[10]*p[14]*p[26])*p[0];
+   coeff[32] = 2*(-p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + p[10]*p[14]*p[15] - p[10]*p[14]*p[24])*p[0];
+   coeff[33] = 2*(-p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + p[10]*p[14]*p[16] - p[10]*p[14]*p[25])*p[0];
+   coeff[34] = 2*(-p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + p[9]*p[13]*p[17] - p[9]*p[13]*p[26] + p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + p[10]*p[14]*p[17] - p[10]*p[14]*p[26])*p[0];
+   coeff[35] = -2*p[0]*p[7]*p[9]*p[23] + 2*p[0]*p[7]*p[10]*p[21] - 2*p[0]*p[8]*p[9]*p[21] - 2*p[0]*p[8]*p[10]*p[23] + 2*p[0]*p[9]*p[9]*p[18] + 2*p[0]*p[10]*p[10]*p[18] + 2*p[0]*p[11]*p[13]*p[23] - 2*p[0]*p[11]*p[14]*p[21] + 2*p[0]*p[12]*p[13]*p[21] + 2*p[0]*p[12]*p[14]*p[23] - 2*p[0]*p[13]*p[13]*p[18] - 2*p[0]*p[14]*p[14]*p[18] - p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27];
+   coeff[36] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - p[10]*p[14]*p[24];
+   coeff[37] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - p[10]*p[14]*p[25];
+   coeff[38] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - p[10]*p[14]*p[26];
+   coeff[39] = (p[7]*p[11] + p[8]*p[12] - p[9]*p[13] - p[10]*p[14])*p[15];
+   coeff[40] = p[7]*p[11]*p[16] - p[7]*p[14]*p[19] + p[8]*p[12]*p[16] + p[8]*p[13]*p[19] + p[9]*p[12]*p[19] - p[9]*p[13]*p[16] - p[10]*p[11]*p[19] - p[10]*p[14]*p[16];
+   coeff[41] = p[7]*p[11]*p[17] + p[7]*p[13]*p[22] - p[7]*p[14]*p[20] + p[8]*p[12]*p[17] + p[8]*p[13]*p[20] + p[8]*p[14]*p[22] + p[9]*p[11]*p[22] + p[9]*p[12]*p[20] - p[9]*p[13]*p[17] - p[10]*p[11]*p[20] + p[10]*p[12]*p[22] - p[10]*p[14]*p[17];
+   coeff[42] = 2*(p[7]*p[9]*p[23] - p[7]*p[9]*p[32] - p[7]*p[10]*p[21] + p[7]*p[10]*p[30] + p[8]*p[9]*p[21] - p[8]*p[9]*p[30] + p[8]*p[10]*p[23] - p[8]*p[10]*p[32] - p[9]*p[9]*p[18] + p[9]*p[9]*p[27] - p[10]*p[10]*p[18] + p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27])*p[0];
+   coeff[43] = -4*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[9]*p[9]*p[24] - 4*p[0]*p[10]*p[10]*p[15] + 2*p[0]*p[10]*p[10]*p[24] + 4*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[13]*p[13]*p[24] + 4*p[0]*p[14]*p[14]*p[15] - 2*p[0]*p[14]*p[14]*p[24] + 2*p[7]*p[11]*p[15] - 2*p[7]*p[11]*p[24] + 2*p[8]*p[12]*p[15] - 2*p[8]*p[12]*p[24] - 2*p[9]*p[13]*p[15] + 2*p[9]*p[13]*p[24] - 2*p[10]*p[14]*p[15] + 2*p[10]*p[14]*p[24];
+   coeff[44] = -4*p[0]*p[7]*p[10]*p[19] + 2*p[0]*p[7]*p[10]*p[28] + 4*p[0]*p[8]*p[9]*p[19] - 2*p[0]*p[8]*p[9]*p[28] - 4*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[9]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[16] + 2*p[0]*p[10]*p[10]*p[25] + 4*p[0]*p[11]*p[14]*p[19] - 2*p[0]*p[11]*p[14]*p[28] - 4*p[0]*p[12]*p[13]*p[19] + 2*p[0]*p[12]*p[13]*p[28] + 4*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[13]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[16] - 2*p[0]*p[14]*p[14]*p[25] + 2*p[7]*p[11]*p[16] - 2*p[7]*p[11]*p[25] - 2*p[7]*p[14]*p[19] + 2*p[7]*p[14]*p[28] + 2*p[8]*p[12]*p[16] - 2*p[8]*p[12]*p[25] + 2*p[8]*p[13]*p[19] - 2*p[8]*p[13]*p[28] + 2*p[9]*p[12]*p[19] - 2*p[9]*p[12]*p[28] - 2*p[9]*p[13]*p[16] + 2*p[9]*p[13]*p[25] - 2*p[10]*p[11]*p[19] + 2*p[10]*p[11]*p[28] - 2*p[10]*p[14]*p[16] + 2*p[10]*p[14]*p[25];
+   coeff[45] = 4*p[0]*p[7]*p[9]*p[22] - 2*p[0]*p[7]*p[9]*p[31] - 4*p[0]*p[7]*p[10]*p[20] + 2*p[0]*p[7]*p[10]*p[29] + 4*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[9]*p[29] + 4*p[0]*p[8]*p[10]*p[22] - 2*p[0]*p[8]*p[10]*p[31] - 4*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[9]*p[9]*p[26] - 4*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[10]*p[10]*p[26] - 4*p[0]*p[11]*p[13]*p[22] + 2*p[0]*p[11]*p[13]*p[31] + 4*p[0]*p[11]*p[14]*p[20] - 2*p[0]*p[11]*p[14]*p[29] - 4*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[13]*p[29] - 4*p[0]*p[12]*p[14]*p[22] + 2*p[0]*p[12]*p[14]*p[31] + 4*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[13]*p[13]*p[26] + 4*p[0]*p[14]*p[14]*p[17] - 2*p[0]*p[14]*p[14]*p[26] + 2*p[7]*p[11]*p[17] - 2*p[7]*p[11]*p[26] + 2*p[7]*p[13]*p[22] - 2*p[7]*p[13]*p[31] - 2*p[7]*p[14]*p[20] + 2*p[7]*p[14]*p[29] + 2*p[8]*p[12]*p[17] - 2*p[8]*p[12]*p[26] + 2*p[8]*p[13]*p[20] - 2*p[8]*p[13]*p[29] + 2*p[8]*p[14]*p[22] - 2*p[8]*p[14]*p[31] + 2*p[9]*p[11]*p[22] - 2*p[9]*p[11]*p[31] + 2*p[9]*p[12]*p[20] - 2*p[9]*p[12]*p[29] - 2*p[9]*p[13]*p[17] + 2*p[9]*p[13]*p[26] - 2*p[10]*p[11]*p[20] + 2*p[10]*p[11]*p[29] + 2*p[10]*p[12]*p[22] - 2*p[10]*p[12]*p[31] - 2*p[10]*p[14]*p[17] + 2*p[10]*p[14]*p[26];
+   coeff[46] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + 2*p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + 2*p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - 2*p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - 2*p[10]*p[14]*p[24];
+   coeff[47] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + 2*p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - 2*p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + 2*p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + 2*p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + 2*p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - 2*p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - 2*p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - 2*p[10]*p[14]*p[25];
+   coeff[48] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + 2*p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + 2*p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - 2*p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + 2*p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + 2*p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + 2*p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] + 2*p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + 2*p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - 2*p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - 2*p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + 2*p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - 2*p[10]*p[14]*p[26];
+   coeff[49] = 0;
+   coeff[50] = 2*(p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - p[14]*p[14]*p[15] + p[14]*p[14]*p[24])*p[0];
+   coeff[51] = 2*(p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - p[11]*p[14]*p[19] + p[11]*p[14]*p[28] + p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - p[14]*p[14]*p[16] + p[14]*p[14]*p[25])*p[0];
+   coeff[52] = 2*(-p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - p[14]*p[14]*p[17] + p[14]*p[14]*p[26])*p[0];
+   coeff[53] = 2*(-p[9]*p[9]*p[15] + p[9]*p[9]*p[24] - p[10]*p[10]*p[15] + p[10]*p[10]*p[24] + p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + p[14]*p[14]*p[15] - p[14]*p[14]*p[24])*p[0];
+   coeff[54] = 2*(-p[7]*p[10]*p[19] + p[7]*p[10]*p[28] + p[8]*p[9]*p[19] - p[8]*p[9]*p[28] - p[9]*p[9]*p[16] + p[9]*p[9]*p[25] - p[10]*p[10]*p[16] + p[10]*p[10]*p[25] + p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + p[14]*p[14]*p[16] - p[14]*p[14]*p[25])*p[0];
+   coeff[55] = 2*(p[7]*p[9]*p[22] - p[7]*p[9]*p[31] - p[7]*p[10]*p[20] + p[7]*p[10]*p[29] + p[8]*p[9]*p[20] - p[8]*p[9]*p[29] + p[8]*p[10]*p[22] - p[8]*p[10]*p[31] - p[9]*p[9]*p[17] + p[9]*p[9]*p[26] - p[10]*p[10]*p[17] + p[10]*p[10]*p[26] - p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + p[14]*p[14]*p[17] - p[14]*p[14]*p[26])*p[0];
+   coeff[56] = -p[2] + p[5] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30] - p[21] + p[30];
+   coeff[57] = -2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + p[12]*p[13]*p[24];
+   coeff[58] = -2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - p[14]*p[14]*p[28] - 2*p[19] + p[28];
+   coeff[59] = 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - p[14]*p[14]*p[29] - 2*p[20] + p[29];
+   coeff[60] = (p[7]*p[10] + p[8]*p[9] + p[11]*p[14] + p[12]*p[13])*p[15];
+   coeff[61] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] + p[11]*p[14]*p[16] - p[12]*p[12]*p[19] + p[12]*p[13]*p[16] - p[14]*p[14]*p[19] + p[19];
+   coeff[62] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] - p[11]*p[12]*p[22] + p[11]*p[14]*p[17] - p[12]*p[12]*p[20] + p[12]*p[13]*p[17] + p[13]*p[14]*p[22] - p[14]*p[14]*p[20] + p[20];
+   coeff[63] = 0;
+   coeff[64] = 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24];
+   coeff[65] = 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28] + 2*p[19] - 2*p[28];
+   coeff[66] = -2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29] + 2*p[20] - 2*p[29];
+   coeff[67] = -2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24];
+   coeff[68] = -2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28] - 2*p[19] + 2*p[28];
+   coeff[69] = 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29] - 2*p[20] + 2*p[29];
+   coeff[70] = 2*p[0]*p[7]*p[11]*p[21] - 2*p[0]*p[7]*p[12]*p[23] + 2*p[0]*p[7]*p[14]*p[18] - 2*p[0]*p[8]*p[11]*p[23] - 2*p[0]*p[8]*p[12]*p[21] + 2*p[0]*p[8]*p[13]*p[18] + 2*p[0]*p[9]*p[12]*p[18] + 2*p[0]*p[9]*p[13]*p[21] + 2*p[0]*p[9]*p[14]*p[23] + 2*p[0]*p[10]*p[11]*p[18] + 2*p[0]*p[10]*p[13]*p[23] - 2*p[0]*p[10]*p[14]*p[21] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] - p[11]*p[12]*p[23] + p[11]*p[12]*p[32] + p[11]*p[14]*p[18] - p[11]*p[14]*p[27] - p[12]*p[12]*p[21] + p[12]*p[12]*p[30] + p[12]*p[13]*p[18] - p[12]*p[13]*p[27] + p[13]*p[14]*p[23] - p[13]*p[14]*p[32] - p[14]*p[14]*p[21] + p[14]*p[14]*p[30];
+   coeff[71] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - p[12]*p[13]*p[24];
+   coeff[72] = 2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + p[14]*p[14]*p[28];
+   coeff[73] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + p[14]*p[14]*p[29];
+   coeff[74] = (p[7]*p[10] + p[8]*p[9] - p[11]*p[14] - p[12]*p[13])*p[15];
+   coeff[75] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] - p[11]*p[14]*p[16] + p[12]*p[12]*p[19] - p[12]*p[13]*p[16] + p[14]*p[14]*p[19];
+   coeff[76] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] + p[11]*p[12]*p[22] - p[11]*p[14]*p[17] + p[12]*p[12]*p[20] - p[12]*p[13]*p[17] - p[13]*p[14]*p[22] + p[14]*p[14]*p[20];
+   coeff[77] = 2*(-p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30])*p[0];
+   coeff[78] = -4*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[7]*p[14]*p[24] - 4*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[8]*p[13]*p[24] - 4*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[9]*p[12]*p[24] - 4*p[0]*p[10]*p[11]*p[15] + 2*p[0]*p[10]*p[11]*p[24] + 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24];
+   coeff[79] = -4*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[11]*p[28] - 4*p[0]*p[7]*p[14]*p[16] + 2*p[0]*p[7]*p[14]*p[25] + 4*p[0]*p[8]*p[12]*p[19] - 2*p[0]*p[8]*p[12]*p[28] - 4*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[8]*p[13]*p[25] - 4*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[12]*p[25] - 4*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[9]*p[13]*p[28] - 4*p[0]*p[10]*p[11]*p[16] + 2*p[0]*p[10]*p[11]*p[25] + 4*p[0]*p[10]*p[14]*p[19] - 2*p[0]*p[10]*p[14]*p[28] + 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28];
+   coeff[80] = -4*p[0]*p[7]*p[11]*p[20] + 2*p[0]*p[7]*p[11]*p[29] + 4*p[0]*p[7]*p[12]*p[22] - 2*p[0]*p[7]*p[12]*p[31] - 4*p[0]*p[7]*p[14]*p[17] + 2*p[0]*p[7]*p[14]*p[26] + 4*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[11]*p[31] + 4*p[0]*p[8]*p[12]*p[20] - 2*p[0]*p[8]*p[12]*p[29] - 4*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[8]*p[13]*p[26] - 4*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[12]*p[26] - 4*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[13]*p[29] - 4*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[9]*p[14]*p[31] - 4*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[11]*p[26] - 4*p[0]*p[10]*p[13]*p[22] + 2*p[0]*p[10]*p[13]*p[31] + 4*p[0]*p[10]*p[14]*p[20] - 2*p[0]*p[10]*p[14]*p[29] - 2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29];
+   coeff[81] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24];
+   coeff[82] = 2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28];
+   coeff[83] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29];
+   coeff[84] = 0;
+   coeff[85] = 2*(p[7]*p[14]*p[15] - p[7]*p[14]*p[24] + p[8]*p[13]*p[15] - p[8]*p[13]*p[24] + p[9]*p[12]*p[15] - p[9]*p[12]*p[24] + p[10]*p[11]*p[15] - p[10]*p[11]*p[24])*p[0];
+   coeff[86] = 2*(p[7]*p[11]*p[19] - p[7]*p[11]*p[28] + p[7]*p[14]*p[16] - p[7]*p[14]*p[25] - p[8]*p[12]*p[19] + p[8]*p[12]*p[28] + p[8]*p[13]*p[16] - p[8]*p[13]*p[25] + p[9]*p[12]*p[16] - p[9]*p[12]*p[25] + p[9]*p[13]*p[19] - p[9]*p[13]*p[28] + p[10]*p[11]*p[16] - p[10]*p[11]*p[25] - p[10]*p[14]*p[19] + p[10]*p[14]*p[28])*p[0];
+   coeff[87] = 2*(p[7]*p[11]*p[20] - p[7]*p[11]*p[29] - p[7]*p[12]*p[22] + p[7]*p[12]*p[31] + p[7]*p[14]*p[17] - p[7]*p[14]*p[26] - p[8]*p[11]*p[22] + p[8]*p[11]*p[31] - p[8]*p[12]*p[20] + p[8]*p[12]*p[29] + p[8]*p[13]*p[17] - p[8]*p[13]*p[26] + p[9]*p[12]*p[17] - p[9]*p[12]*p[26] + p[9]*p[13]*p[20] - p[9]*p[13]*p[29] + p[9]*p[14]*p[22] - p[9]*p[14]*p[31] + p[10]*p[11]*p[17] - p[10]*p[11]*p[26] + p[10]*p[13]*p[22] - p[10]*p[13]*p[31] - p[10]*p[14]*p[20] + p[10]*p[14]*p[29])*p[0];
+   coeff[88] = 2*(-p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - p[10]*p[11]*p[15] + p[10]*p[11]*p[24])*p[0];
+   coeff[89] = 2*(-p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + p[10]*p[14]*p[19] - p[10]*p[14]*p[28])*p[0];
+   coeff[90] = 2*(-p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + p[10]*p[14]*p[20] - p[10]*p[14]*p[29])*p[0];
+   coeff[91] = 2*p[0]*p[7]*p[8]*p[23] - 2*p[0]*p[7]*p[10]*p[18] + 2*p[0]*p[8]*p[8]*p[21] - 2*p[0]*p[8]*p[9]*p[18] - 2*p[0]*p[9]*p[10]*p[23] + 2*p[0]*p[10]*p[10]*p[21] - 2*p[0]*p[11]*p[12]*p[23] + 2*p[0]*p[11]*p[14]*p[18] - 2*p[0]*p[12]*p[12]*p[21] + 2*p[0]*p[12]*p[13]*p[18] + 2*p[0]*p[13]*p[14]*p[23] - 2*p[0]*p[14]*p[14]*p[21] - p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30];
+   coeff[92] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + p[10]*p[11]*p[24];
+   coeff[93] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - p[10]*p[14]*p[28];
+   coeff[94] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - p[10]*p[14]*p[29];
+   coeff[95] = (p[7]*p[14] + p[8]*p[13] + p[9]*p[12] + p[10]*p[11])*p[15];
+   coeff[96] = p[7]*p[11]*p[19] + p[7]*p[14]*p[16] - p[8]*p[12]*p[19] + p[8]*p[13]*p[16] + p[9]*p[12]*p[16] + p[9]*p[13]*p[19] + p[10]*p[11]*p[16] - p[10]*p[14]*p[19];
+   coeff[97] = p[7]*p[11]*p[20] - p[7]*p[12]*p[22] + p[7]*p[14]*p[17] - p[8]*p[11]*p[22] - p[8]*p[12]*p[20] + p[8]*p[13]*p[17] + p[9]*p[12]*p[17] + p[9]*p[13]*p[20] + p[9]*p[14]*p[22] + p[10]*p[11]*p[17] + p[10]*p[13]*p[22] - p[10]*p[14]*p[20];
+   coeff[98] = 2*(-p[7]*p[8]*p[23] + p[7]*p[8]*p[32] + p[7]*p[10]*p[18] - p[7]*p[10]*p[27] - p[8]*p[8]*p[21] + p[8]*p[8]*p[30] + p[8]*p[9]*p[18] - p[8]*p[9]*p[27] + p[9]*p[10]*p[23] - p[9]*p[10]*p[32] - p[10]*p[10]*p[21] + p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30])*p[0];
+   coeff[99] = 4*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[7]*p[10]*p[24] + 4*p[0]*p[8]*p[9]*p[15] - 2*p[0]*p[8]*p[9]*p[24] - 4*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[11]*p[14]*p[24] - 4*p[0]*p[12]*p[13]*p[15] + 2*p[0]*p[12]*p[13]*p[24] + 2*p[7]*p[14]*p[15] - 2*p[7]*p[14]*p[24] + 2*p[8]*p[13]*p[15] - 2*p[8]*p[13]*p[24] + 2*p[9]*p[12]*p[15] - 2*p[9]*p[12]*p[24] + 2*p[10]*p[11]*p[15] - 2*p[10]*p[11]*p[24];
+   coeff[100] = 4*p[0]*p[7]*p[10]*p[16] - 2*p[0]*p[7]*p[10]*p[25] - 4*p[0]*p[8]*p[8]*p[19] + 2*p[0]*p[8]*p[8]*p[28] + 4*p[0]*p[8]*p[9]*p[16] - 2*p[0]*p[8]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[10]*p[10]*p[28] - 4*p[0]*p[11]*p[14]*p[16] + 2*p[0]*p[11]*p[14]*p[25] + 4*p[0]*p[12]*p[12]*p[19] - 2*p[0]*p[12]*p[12]*p[28] - 4*p[0]*p[12]*p[13]*p[16] + 2*p[0]*p[12]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[19] - 2*p[0]*p[14]*p[14]*p[28] + 2*p[7]*p[11]*p[19] - 2*p[7]*p[11]*p[28] + 2*p[7]*p[14]*p[16] - 2*p[7]*p[14]*p[25] - 2*p[8]*p[12]*p[19] + 2*p[8]*p[12]*p[28] + 2*p[8]*p[13]*p[16] - 2*p[8]*p[13]*p[25] + 2*p[9]*p[12]*p[16] - 2*p[9]*p[12]*p[25] + 2*p[9]*p[13]*p[19] - 2*p[9]*p[13]*p[28] + 2*p[10]*p[11]*p[16] - 2*p[10]*p[11]*p[25] - 2*p[10]*p[14]*p[19] + 2*p[10]*p[14]*p[28];
+   coeff[101] = -4*p[0]*p[7]*p[8]*p[22] + 2*p[0]*p[7]*p[8]*p[31] + 4*p[0]*p[7]*p[10]*p[17] - 2*p[0]*p[7]*p[10]*p[26] - 4*p[0]*p[8]*p[8]*p[20] + 2*p[0]*p[8]*p[8]*p[29] + 4*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[8]*p[9]*p[26] + 4*p[0]*p[9]*p[10]*p[22] - 2*p[0]*p[9]*p[10]*p[31] - 4*p[0]*p[10]*p[10]*p[20] + 2*p[0]*p[10]*p[10]*p[29] + 4*p[0]*p[11]*p[12]*p[22] - 2*p[0]*p[11]*p[12]*p[31] - 4*p[0]*p[11]*p[14]*p[17] + 2*p[0]*p[11]*p[14]*p[26] + 4*p[0]*p[12]*p[12]*p[20] - 2*p[0]*p[12]*p[12]*p[29] - 4*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[12]*p[13]*p[26] - 4*p[0]*p[13]*p[14]*p[22] + 2*p[0]*p[13]*p[14]*p[31] + 4*p[0]*p[14]*p[14]*p[20] - 2*p[0]*p[14]*p[14]*p[29] + 2*p[7]*p[11]*p[20] - 2*p[7]*p[11]*p[29] - 2*p[7]*p[12]*p[22] + 2*p[7]*p[12]*p[31] + 2*p[7]*p[14]*p[17] - 2*p[7]*p[14]*p[26] - 2*p[8]*p[11]*p[22] + 2*p[8]*p[11]*p[31] - 2*p[8]*p[12]*p[20] + 2*p[8]*p[12]*p[29] + 2*p[8]*p[13]*p[17] - 2*p[8]*p[13]*p[26] + 2*p[9]*p[12]*p[17] - 2*p[9]*p[12]*p[26] + 2*p[9]*p[13]*p[20] - 2*p[9]*p[13]*p[29] + 2*p[9]*p[14]*p[22] - 2*p[9]*p[14]*p[31] + 2*p[10]*p[11]*p[17] - 2*p[10]*p[11]*p[26] + 2*p[10]*p[13]*p[22] - 2*p[10]*p[13]*p[31] - 2*p[10]*p[14]*p[20] + 2*p[10]*p[14]*p[29];
+   coeff[102] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + 2*p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + 2*p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + 2*p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + 2*p[10]*p[11]*p[24];
+   coeff[103] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + 2*p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + 2*p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - 2*p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + 2*p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + 2*p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + 2*p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + 2*p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - 2*p[10]*p[14]*p[28];
+   coeff[104] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + 2*p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - 2*p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + 2*p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - 2*p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - 2*p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + 2*p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + 2*p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + 2*p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + 2*p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + 2*p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + 2*p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - 2*p[10]*p[14]*p[29];
+   coeff[105] = 0;
+   coeff[106] = 2*(-p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + p[12]*p[13]*p[15] - p[12]*p[13]*p[24])*p[0];
+   coeff[107] = 2*(-p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - p[14]*p[14]*p[19] + p[14]*p[14]*p[28])*p[0];
+   coeff[108] = 2*(p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - p[14]*p[14]*p[20] + p[14]*p[14]*p[29])*p[0];
+   coeff[109] = 2*(p[7]*p[10]*p[15] - p[7]*p[10]*p[24] + p[8]*p[9]*p[15] - p[8]*p[9]*p[24] - p[11]*p[14]*p[15] + p[11]*p[14]*p[24] - p[12]*p[13]*p[15] + p[12]*p[13]*p[24])*p[0];
+   coeff[110] = 2*(p[7]*p[10]*p[16] - p[7]*p[10]*p[25] - p[8]*p[8]*p[19] + p[8]*p[8]*p[28] + p[8]*p[9]*p[16] - p[8]*p[9]*p[25] - p[10]*p[10]*p[19] + p[10]*p[10]*p[28] - p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + p[12]*p[12]*p[19] - p[12]*p[12]*p[28] - p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + p[14]*p[14]*p[19] - p[14]*p[14]*p[28])*p[0];
+   coeff[111] = 2*(-p[7]*p[8]*p[22] + p[7]*p[8]*p[31] + p[7]*p[10]*p[17] - p[7]*p[10]*p[26] - p[8]*p[8]*p[20] + p[8]*p[8]*p[29] + p[8]*p[9]*p[17] - p[8]*p[9]*p[26] + p[9]*p[10]*p[22] - p[9]*p[10]*p[31] - p[10]*p[10]*p[20] + p[10]*p[10]*p[29] + p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + p[14]*p[14]*p[20] - p[14]*p[14]*p[29])*p[0];
+   coeff[112] = -p[3] + p[6] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30] - p[23] + p[32];
+   coeff[113] = 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + p[12]*p[14]*p[24];
+   coeff[114] = -2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + p[13]*p[14]*p[28];
+   coeff[115] = -2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + p[13]*p[14]*p[29] - 2*p[22] + p[31];
+   coeff[116] = (-p[7]*p[9] + p[8]*p[10] - p[11]*p[13] + p[12]*p[14])*p[15];
+   coeff[117] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] + p[11]*p[12]*p[19] - p[11]*p[13]*p[16] + p[12]*p[14]*p[16] + p[13]*p[14]*p[19];
+   coeff[118] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] + p[11]*p[12]*p[20] - p[11]*p[13]*p[17] - p[12]*p[12]*p[22] + p[12]*p[14]*p[17] - p[13]*p[13]*p[22] + p[13]*p[14]*p[20] + p[22];
+   coeff[119] = 0;
+   coeff[120] = -2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24];
+   coeff[121] = 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28];
+   coeff[122] = 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29] + 2*p[22] - 2*p[31];
+   coeff[123] = 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24];
+   coeff[124] = -2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28];
+   coeff[125] = -2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29] - 2*p[22] + 2*p[31];
+   coeff[126] = 2*p[0]*p[7]*p[11]*p[23] + 2*p[0]*p[7]*p[12]*p[21] - 2*p[0]*p[7]*p[13]*p[18] + 2*p[0]*p[8]*p[11]*p[21] - 2*p[0]*p[8]*p[12]*p[23] + 2*p[0]*p[8]*p[14]*p[18] - 2*p[0]*p[9]*p[11]*p[18] - 2*p[0]*p[9]*p[13]*p[23] + 2*p[0]*p[9]*p[14]*p[21] + 2*p[0]*p[10]*p[12]*p[18] + 2*p[0]*p[10]*p[13]*p[21] + 2*p[0]*p[10]*p[14]*p[23] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] + p[11]*p[12]*p[21] - p[11]*p[12]*p[30] - p[11]*p[13]*p[18] + p[11]*p[13]*p[27] - p[12]*p[12]*p[23] + p[12]*p[12]*p[32] + p[12]*p[14]*p[18] - p[12]*p[14]*p[27] - p[13]*p[13]*p[23] + p[13]*p[13]*p[32] + p[13]*p[14]*p[21] - p[13]*p[14]*p[30];
+   coeff[127] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - p[12]*p[14]*p[24];
+   coeff[128] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - p[13]*p[14]*p[28];
+   coeff[129] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - p[13]*p[14]*p[29];
+   coeff[130] = (-p[7]*p[9] + p[8]*p[10] + p[11]*p[13] - p[12]*p[14])*p[15];
+   coeff[131] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] - p[11]*p[12]*p[19] + p[11]*p[13]*p[16] - p[12]*p[14]*p[16] - p[13]*p[14]*p[19];
+   coeff[132] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] - p[11]*p[12]*p[20] + p[11]*p[13]*p[17] + p[12]*p[12]*p[22] - p[12]*p[14]*p[17] + p[13]*p[13]*p[22] - p[13]*p[14]*p[20];
+   coeff[133] = 2*(-p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32])*p[0];
+   coeff[134] = 4*p[0]*p[7]*p[13]*p[15] - 2*p[0]*p[7]*p[13]*p[24] - 4*p[0]*p[8]*p[14]*p[15] + 2*p[0]*p[8]*p[14]*p[24] + 4*p[0]*p[9]*p[11]*p[15] - 2*p[0]*p[9]*p[11]*p[24] - 4*p[0]*p[10]*p[12]*p[15] + 2*p[0]*p[10]*p[12]*p[24] - 2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24];
+   coeff[135] = -4*p[0]*p[7]*p[12]*p[19] + 2*p[0]*p[7]*p[12]*p[28] + 4*p[0]*p[7]*p[13]*p[16] - 2*p[0]*p[7]*p[13]*p[25] - 4*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[11]*p[28] - 4*p[0]*p[8]*p[14]*p[16] + 2*p[0]*p[8]*p[14]*p[25] + 4*p[0]*p[9]*p[11]*p[16] - 2*p[0]*p[9]*p[11]*p[25] - 4*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[9]*p[14]*p[28] - 4*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[12]*p[25] - 4*p[0]*p[10]*p[13]*p[19] + 2*p[0]*p[10]*p[13]*p[28] + 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28];
+   coeff[136] = -4*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[11]*p[31] - 4*p[0]*p[7]*p[12]*p[20] + 2*p[0]*p[7]*p[12]*p[29] + 4*p[0]*p[7]*p[13]*p[17] - 2*p[0]*p[7]*p[13]*p[26] - 4*p[0]*p[8]*p[11]*p[20] + 2*p[0]*p[8]*p[11]*p[29] + 4*p[0]*p[8]*p[12]*p[22] - 2*p[0]*p[8]*p[12]*p[31] - 4*p[0]*p[8]*p[14]*p[17] + 2*p[0]*p[8]*p[14]*p[26] + 4*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[11]*p[26] + 4*p[0]*p[9]*p[13]*p[22] - 2*p[0]*p[9]*p[13]*p[31] - 4*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[9]*p[14]*p[29] - 4*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[12]*p[26] - 4*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[13]*p[29] - 4*p[0]*p[10]*p[14]*p[22] + 2*p[0]*p[10]*p[14]*p[31] + 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29];
+   coeff[137] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24];
+   coeff[138] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28];
+   coeff[139] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29];
+   coeff[140] = 0;
+   coeff[141] = 2*(-p[7]*p[13]*p[15] + p[7]*p[13]*p[24] + p[8]*p[14]*p[15] - p[8]*p[14]*p[24] - p[9]*p[11]*p[15] + p[9]*p[11]*p[24] + p[10]*p[12]*p[15] - p[10]*p[12]*p[24])*p[0];
+   coeff[142] = 2*(p[7]*p[12]*p[19] - p[7]*p[12]*p[28] - p[7]*p[13]*p[16] + p[7]*p[13]*p[25] + p[8]*p[11]*p[19] - p[8]*p[11]*p[28] + p[8]*p[14]*p[16] - p[8]*p[14]*p[25] - p[9]*p[11]*p[16] + p[9]*p[11]*p[25] + p[9]*p[14]*p[19] - p[9]*p[14]*p[28] + p[10]*p[12]*p[16] - p[10]*p[12]*p[25] + p[10]*p[13]*p[19] - p[10]*p[13]*p[28])*p[0];
+   coeff[143] = 2*(p[7]*p[11]*p[22] - p[7]*p[11]*p[31] + p[7]*p[12]*p[20] - p[7]*p[12]*p[29] - p[7]*p[13]*p[17] + p[7]*p[13]*p[26] + p[8]*p[11]*p[20] - p[8]*p[11]*p[29] - p[8]*p[12]*p[22] + p[8]*p[12]*p[31] + p[8]*p[14]*p[17] - p[8]*p[14]*p[26] - p[9]*p[11]*p[17] + p[9]*p[11]*p[26] - p[9]*p[13]*p[22] + p[9]*p[13]*p[31] + p[9]*p[14]*p[20] - p[9]*p[14]*p[29] + p[10]*p[12]*p[17] - p[10]*p[12]*p[26] + p[10]*p[13]*p[20] - p[10]*p[13]*p[29] + p[10]*p[14]*p[22] - p[10]*p[14]*p[31])*p[0];
+   coeff[144] = 2*(p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - p[10]*p[12]*p[15] + p[10]*p[12]*p[24])*p[0];
+   coeff[145] = 2*(-p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - p[10]*p[13]*p[19] + p[10]*p[13]*p[28])*p[0];
+   coeff[146] = 2*(-p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - p[10]*p[13]*p[20] + p[10]*p[13]*p[29] - p[10]*p[14]*p[22] + p[10]*p[14]*p[31])*p[0];
+   coeff[147] = -2*p[0]*p[7]*p[8]*p[21] + 2*p[0]*p[7]*p[9]*p[18] + 2*p[0]*p[8]*p[8]*p[23] - 2*p[0]*p[8]*p[10]*p[18] + 2*p[0]*p[9]*p[9]*p[23] - 2*p[0]*p[9]*p[10]*p[21] + 2*p[0]*p[11]*p[12]*p[21] - 2*p[0]*p[11]*p[13]*p[18] - 2*p[0]*p[12]*p[12]*p[23] + 2*p[0]*p[12]*p[14]*p[18] - 2*p[0]*p[13]*p[13]*p[23] + 2*p[0]*p[13]*p[14]*p[21] - p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32];
+   coeff[148] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + p[10]*p[12]*p[24];
+   coeff[149] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - 2*p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + p[10]*p[13]*p[28];
+   coeff[150] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + p[10]*p[14]*p[31];
+   coeff[151] = (-p[7]*p[13] + p[8]*p[14] - p[9]*p[11] + p[10]*p[12])*p[15];
+   coeff[152] = p[7]*p[12]*p[19] - p[7]*p[13]*p[16] + p[8]*p[11]*p[19] + p[8]*p[14]*p[16] - p[9]*p[11]*p[16] + p[9]*p[14]*p[19] + p[10]*p[12]*p[16] + p[10]*p[13]*p[19];
+   coeff[153] = p[7]*p[11]*p[22] + p[7]*p[12]*p[20] - p[7]*p[13]*p[17] + p[8]*p[11]*p[20] - p[8]*p[12]*p[22] + p[8]*p[14]*p[17] - p[9]*p[11]*p[17] - p[9]*p[13]*p[22] + p[9]*p[14]*p[20] + p[10]*p[12]*p[17] + p[10]*p[13]*p[20] + p[10]*p[14]*p[22];
+   coeff[154] = 2*(p[7]*p[8]*p[21] - p[7]*p[8]*p[30] - p[7]*p[9]*p[18] + p[7]*p[9]*p[27] - p[8]*p[8]*p[23] + p[8]*p[8]*p[32] + p[8]*p[10]*p[18] - p[8]*p[10]*p[27] - p[9]*p[9]*p[23] + p[9]*p[9]*p[32] + p[9]*p[10]*p[21] - p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30])*p[0];
+   coeff[155] = -4*p[0]*p[7]*p[9]*p[15] + 2*p[0]*p[7]*p[9]*p[24] + 4*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[8]*p[10]*p[24] + 4*p[0]*p[11]*p[13]*p[15] - 2*p[0]*p[11]*p[13]*p[24] - 4*p[0]*p[12]*p[14]*p[15] + 2*p[0]*p[12]*p[14]*p[24] - 2*p[7]*p[13]*p[15] + 2*p[7]*p[13]*p[24] + 2*p[8]*p[14]*p[15] - 2*p[8]*p[14]*p[24] - 2*p[9]*p[11]*p[15] + 2*p[9]*p[11]*p[24] + 2*p[10]*p[12]*p[15] - 2*p[10]*p[12]*p[24];
+   coeff[156] = 4*p[0]*p[7]*p[8]*p[19] - 2*p[0]*p[7]*p[8]*p[28] - 4*p[0]*p[7]*p[9]*p[16] + 2*p[0]*p[7]*p[9]*p[25] + 4*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[8]*p[10]*p[25] + 4*p[0]*p[9]*p[10]*p[19] - 2*p[0]*p[9]*p[10]*p[28] - 4*p[0]*p[11]*p[12]*p[19] + 2*p[0]*p[11]*p[12]*p[28] + 4*p[0]*p[11]*p[13]*p[16] - 2*p[0]*p[11]*p[13]*p[25] - 4*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[12]*p[14]*p[25] - 4*p[0]*p[13]*p[14]*p[19] + 2*p[0]*p[13]*p[14]*p[28] + 2*p[7]*p[12]*p[19] - 2*p[7]*p[12]*p[28] - 2*p[7]*p[13]*p[16] + 2*p[7]*p[13]*p[25] + 2*p[8]*p[11]*p[19] - 2*p[8]*p[11]*p[28] + 2*p[8]*p[14]*p[16] - 2*p[8]*p[14]*p[25] - 2*p[9]*p[11]*p[16] + 2*p[9]*p[11]*p[25] + 2*p[9]*p[14]*p[19] - 2*p[9]*p[14]*p[28] + 2*p[10]*p[12]*p[16] - 2*p[10]*p[12]*p[25] + 2*p[10]*p[13]*p[19] - 2*p[10]*p[13]*p[28];
+   coeff[157] = 4*p[0]*p[7]*p[8]*p[20] - 2*p[0]*p[7]*p[8]*p[29] - 4*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[7]*p[9]*p[26] - 4*p[0]*p[8]*p[8]*p[22] + 2*p[0]*p[8]*p[8]*p[31] + 4*p[0]*p[8]*p[10]*p[17] - 2*p[0]*p[8]*p[10]*p[26] - 4*p[0]*p[9]*p[9]*p[22] + 2*p[0]*p[9]*p[9]*p[31] + 4*p[0]*p[9]*p[10]*p[20] - 2*p[0]*p[9]*p[10]*p[29] - 4*p[0]*p[11]*p[12]*p[20] + 2*p[0]*p[11]*p[12]*p[29] + 4*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[11]*p[13]*p[26] + 4*p[0]*p[12]*p[12]*p[22] - 2*p[0]*p[12]*p[12]*p[31] - 4*p[0]*p[12]*p[14]*p[17] + 2*p[0]*p[12]*p[14]*p[26] + 4*p[0]*p[13]*p[13]*p[22] - 2*p[0]*p[13]*p[13]*p[31] - 4*p[0]*p[13]*p[14]*p[20] + 2*p[0]*p[13]*p[14]*p[29] + 2*p[7]*p[11]*p[22] - 2*p[7]*p[11]*p[31] + 2*p[7]*p[12]*p[20] - 2*p[7]*p[12]*p[29] - 2*p[7]*p[13]*p[17] + 2*p[7]*p[13]*p[26] + 2*p[8]*p[11]*p[20] - 2*p[8]*p[11]*p[29] - 2*p[8]*p[12]*p[22] + 2*p[8]*p[12]*p[31] + 2*p[8]*p[14]*p[17] - 2*p[8]*p[14]*p[26] - 2*p[9]*p[11]*p[17] + 2*p[9]*p[11]*p[26] - 2*p[9]*p[13]*p[22] + 2*p[9]*p[13]*p[31] + 2*p[9]*p[14]*p[20] - 2*p[9]*p[14]*p[29] + 2*p[10]*p[12]*p[17] - 2*p[10]*p[12]*p[26] + 2*p[10]*p[13]*p[20] - 2*p[10]*p[13]*p[29] + 2*p[10]*p[14]*p[22] - 2*p[10]*p[14]*p[31];
+   coeff[158] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - 2*p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + 2*p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - 2*p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + 2*p[10]*p[12]*p[24];
+   coeff[159] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + 2*p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - 2*p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + 2*p[8]*p[11]*p[28] - 2*p[8]*p[14]*p[16] + 2*p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - 2*p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + 2*p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + 2*p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + 2*p[10]*p[13]*p[28];
+   coeff[160] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + 2*p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + 2*p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - 2*p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + 2*p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - 2*p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + 2*p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - 2*p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - 2*p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + 2*p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + 2*p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + 2*p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + 2*p[10]*p[14]*p[31];
+   coeff[161] = 0;
+   coeff[162] = 2*(p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + p[12]*p[14]*p[15] - p[12]*p[14]*p[24])*p[0];
+   coeff[163] = 2*(-p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + p[13]*p[14]*p[19] - p[13]*p[14]*p[28])*p[0];
+   coeff[164] = 2*(-p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - p[12]*p[12]*p[22] + p[12]*p[12]*p[31] + p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + p[13]*p[14]*p[20] - p[13]*p[14]*p[29])*p[0];
+   coeff[165] = 2*(-p[7]*p[9]*p[15] + p[7]*p[9]*p[24] + p[8]*p[10]*p[15] - p[8]*p[10]*p[24] + p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - p[12]*p[14]*p[15] + p[12]*p[14]*p[24])*p[0];
+   coeff[166] = 2*(p[7]*p[8]*p[19] - p[7]*p[8]*p[28] - p[7]*p[9]*p[16] + p[7]*p[9]*p[25] + p[8]*p[10]*p[16] - p[8]*p[10]*p[25] + p[9]*p[10]*p[19] - p[9]*p[10]*p[28] - p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - p[13]*p[14]*p[19] + p[13]*p[14]*p[28])*p[0];
+   coeff[167] = 2*(p[7]*p[8]*p[20] - p[7]*p[8]*p[29] - p[7]*p[9]*p[17] + p[7]*p[9]*p[26] - p[8]*p[8]*p[22] + p[8]*p[8]*p[31] + p[8]*p[10]*p[17] - p[8]*p[10]*p[26] - p[9]*p[9]*p[22] + p[9]*p[9]*p[31] + p[9]*p[10]*p[20] - p[9]*p[10]*p[29] - p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - p[13]*p[14]*p[20] + p[13]*p[14]*p[29])*p[0];
+}
+
+} // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/common/point_query.h b/thirdparty/embree-aarch64/kernels/common/point_query.h
new file mode 100644
index 0000000000..27d158ca3a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/point_query.h
@@ -0,0 +1,136 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /* Packet of K point queries (closest point query), one query per lane */
+  template<int K>
+  struct RTC_ALIGN(16) PointQueryK
+  {
+    /* Default constructor has an empty body */
+    __forceinline PointQueryK() {}
+
+    /* Builds a query packet from positions, search radii and times */
+    __forceinline PointQueryK(const Vec3vf<K>& p, const vfloat<K>& radius = inf, const vfloat<K>& time = zero)
+      : p(p), time(time), radius(radius) {}
+
+    /* Number of queries held by the packet */
+    static __forceinline size_t size() { return K; }
+
+    /* Per-lane validity mask: |p.x|,|p.y|,|p.z| <= FLT_LARGE, radius >= 0 and |time| < inf */
+    __forceinline vbool<K> valid() const
+    {
+      const vfloat<K> maxCoord = vfloat<K>(FLT_LARGE);
+      const vbool<K> xOk = (abs(p.x) <= maxCoord);
+      const vbool<K> yOk = (abs(p.y) <= maxCoord);
+      const vbool<K> zOk = (abs(p.z) <= maxCoord);
+      const vbool<K> radiusOk = radius >= vfloat<K>(0);
+      const vbool<K> timeOk = abs(time) < vfloat<K>(inf);
+      return xOk & yOk & zOk & radiusOk & timeOk;
+    }
+
+    __forceinline void get(PointQueryK<1>* query) const;
+    __forceinline void get(size_t i, PointQueryK<1>& query) const;
+    __forceinline void set(const PointQueryK<1>* query);
+    __forceinline void set(size_t i, const PointQueryK<1>& query);
+
+    Vec3vf<K> p;      // location of the query point
+    vfloat<K> time;   // time for motion blur
+    vfloat<K> radius; // radius for the point query
+  };
+  
+  /* Scalar specialization: holds one point query instead of a packet */
+  template<>
+  struct RTC_ALIGN(16) PointQueryK<1>
+  {
+    /* Default constructor has an empty body */
+    __forceinline PointQueryK() {}
+
+    /* Builds a query from a position, a search radius and a time */
+    __forceinline PointQueryK(const Vec3fa& p, float radius = inf, float time = zero)
+      : p(p), time(time), radius(radius) {}
+
+    /* Checks the position against FLT_LARGE, the radius against 0 and |time| against inf */
+    __forceinline bool valid() const {
+      const bool timeOk = abs(time) < float(inf);
+      return all(le_mask(abs(Vec3fa(p)), Vec3fa(FLT_LARGE)) & le_mask(Vec3fa(0.f), Vec3fa(radius))) && timeOk;
+    }
+
+    Vec3f p;      // location of the query point
+    float time;   // time for motion blur
+    float radius; // radius for the point query
+  };
+  
+  /* Scatters the packet into an array of K single point queries */
+  template<int K>
+  __forceinline void PointQueryK<K>::get(PointQueryK<1>* query) const
+  {
+    for (size_t lane = 0; lane < K; ++lane) { // FIXME: use SIMD transpose
+      PointQueryK<1>& dst = query[lane];
+      dst.p.x    = p.x[lane];
+      dst.p.y    = p.y[lane];
+      dst.p.z    = p.z[lane];
+      dst.time   = time[lane];
+      dst.radius = radius[lane];
+    }
+  }
+
+  /* Copies lane i of the packet into a single point query */
+  template<int K>
+  __forceinline void PointQueryK<K>::get(size_t i, PointQueryK<1>& query) const
+  {
+    query.time   = time[i];
+    query.radius = radius[i];
+    query.p.x = p.x[i];
+    query.p.y = p.y[i];
+    query.p.z = p.z[i];
+  }
+
+  /* Gathers an array of K single point queries into the packet */
+  template<int K>
+  __forceinline void PointQueryK<K>::set(const PointQueryK<1>* query)
+  {
+    for (size_t lane = 0; lane < K; ++lane) {
+      const PointQueryK<1>& src = query[lane];
+      p.x[lane]    = src.p.x;
+      p.y[lane]    = src.p.y;
+      p.z[lane]    = src.p.z;
+      radius[lane] = src.radius;
+      time[lane]   = src.time;
+    }
+  }
+
+  /* Writes a single point query into lane i of the packet */
+  template<int K>
+  __forceinline void PointQueryK<K>::set(size_t i, const PointQueryK<1>& query)
+  {
+    time[i]   = query.time;
+    radius[i] = query.radius;
+    p.x[i] = query.p.x;
+    p.y[i] = query.p.y;
+    p.z[i] = query.p.z;
+  }
+
+  /* Convenience aliases for the single query and the 4/8/16-wide packets */
+  using PointQuery   = PointQueryK<1>;
+  using PointQuery4  = PointQueryK<4>;
+  using PointQuery8  = PointQueryK<8>;
+  using PointQuery16 = PointQueryK<16>;
+  struct PointQueryN;
+
+  /* Prints the position, radius and time of a point query to a stream */
+  template<int K>
+  __forceinline embree_ostream operator <<(embree_ostream cout, const PointQueryK<K>& query)
+  {
+    cout << "{ " << embree_endl;
+    cout << "  p = "    << query.p      << embree_endl;
+    cout << "  r = "    << query.radius << embree_endl;
+    cout << "  time = " << query.time   << embree_endl;
+    cout << "}";
+    return cout;
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/primref.h b/thirdparty/embree-aarch64/kernels/common/primref.h
new file mode 100644
index 0000000000..ce75c982bb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/primref.h
@@ -0,0 +1,138 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /*! Reference to a single primitive: its bounding box plus an ID kept next to the bounds. */
+  struct __aligned(32) PrimRef
+  {
+    __forceinline PrimRef () {}
+
+#if defined(__AVX__)
+    /*! copy and assignment move the whole object through one vfloat8 load/store */
+    __forceinline PrimRef(const PrimRef& other) {
+      vfloat8::store((float*)this, vfloat8::load((float*)&other));
+    }
+    __forceinline PrimRef& operator=(const PrimRef& other) {
+      vfloat8::store((float*)this, vfloat8::load((float*)&other));
+      return *this;
+    }
+#endif
+
+    /*! keeps geomID together with the lower bound and primID together with the upper bound */
+    __forceinline PrimRef (const BBox3fa& bounds, unsigned int geomID, unsigned int primID)
+      : lower(bounds.lower, geomID)
+      , upper(bounds.upper, primID)
+    {}
+
+    /*! keeps the low 32 bits of id with the lower bound and, on 64-bit targets, the high 32 bits with the upper bound */
+    __forceinline PrimRef (const BBox3fa& bounds, size_t id)
+    {
+#if defined(__X86_64__) || defined(__aarch64__)
+      const unsigned idLo = (unsigned)(id & 0xFFFFFFFF);
+      const unsigned idHi = (unsigned)((id >> 32) & 0xFFFFFFFF);
+#else
+      const unsigned idLo = (unsigned)id;
+      const unsigned idHi = (unsigned)0;
+#endif
+      lower = Vec3fx(bounds.lower, idLo);
+      upper = Vec3fx(bounds.upper, idHi);
+    }
+
+    /*! lower+upper, i.e. twice the center of the primitive */
+    __forceinline const Vec3fa center2() const { return lower+upper; }
+
+    /*! bounding box spanned by lower and upper */
+    __forceinline const BBox3fa bounds() const { return BBox3fa(lower,upper); }
+
+    /*! every PrimRef counts as exactly one item for the bin heuristic */
+    __forceinline unsigned size() const { return 1; }
+
+    /*! writes the bounds and embree::center2 of those bounds, as used for binning */
+    __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const
+    {
+      bounds_o = bounds();
+      center_o = embree::center2(bounds_o);
+    }
+
+    /*! mutable access to the value stored with the lower bound */
+    __forceinline unsigned& geomIDref() { return lower.u; }  // FIXME: remove !!!!!!!
+
+    /*! mutable access to the value stored with the upper bound */
+    __forceinline unsigned& primIDref() { return upper.u; }  // FIXME: remove !!!!!!!
+
+    /*! returns the geometry ID */
+    __forceinline unsigned geomID() const { return lower.a; }
+
+    /*! returns the primitive ID */
+    __forceinline unsigned primID() const { return upper.a; }
+
+    /*! recombines the stored 32-bit halves into a size_t ID (low half only on 32-bit targets) */
+    __forceinline size_t ID() const {
+#if defined(__X86_64__) || defined(__aarch64__)
+      return (size_t(upper.u) << 32) + size_t(lower.u);
+#else
+      return size_t(lower.u);
+#endif
+    }
+
+    /*! 64-bit key for operator<: primID in the high 32 bits, geomID in the low 32 bits */
+    __forceinline uint64_t ID64() const {
+      const uint64_t hi = (uint64_t)primID();
+      const uint64_t lo = (uint64_t)geomID();
+      return (hi << 32) + lo;
+    }
+
+    /*! orders primrefs by ID64(), which allows sorting them by ID */
+    friend __forceinline bool operator<(const PrimRef& p0, const PrimRef& p1) {
+      const uint64_t key0 = p0.ID64();
+      const uint64_t key1 = p1.ID64();
+      return key0 < key1;
+    }
+
+    /*! Outputs primitive reference to a stream. */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRef& ref) {
+      cout << "{ lower = " << ref.lower << ", upper = " << ref.upper;
+      cout << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << " }";
+      return cout;
+    }
+
+  public:
+    Vec3fx lower;     //!< lower bounds and geomID
+    Vec3fx upper;     //!< upper bounds and primID
+  };
+
+  /*! swaps two PrimRefs; with AVX each one is copied through a vfloat8 temporary, otherwise std::swap is used */
+  __forceinline void xchg(PrimRef& a, PrimRef& b)
+  {
+#if !defined(__AVX__)
+    std::swap(a,b);
+#else
+    const vfloat8 oldB = vfloat8::load((float*)&b);
+    const vfloat8 oldA = vfloat8::load((float*)&a);
+    vfloat8::store((float*)&b,oldA);
+    vfloat8::store((float*)&a,oldB);
+#endif
+  }
+
+  /************************************************************************************/
+  /************************************************************************************/
+  /************************************************************************************/
+  /************************************************************************************/
+  
+  /* Sub-grid build record: unsigned short sx/sy plus a primitive ID; x()/y() return the low 15 bits of sx/sy */
+  struct SubGridBuildData {
+    unsigned short sx;
+    unsigned short sy;
+    unsigned int primID;
+    __forceinline SubGridBuildData() {}
+    __forceinline SubGridBuildData(const unsigned int sx, const unsigned int sy, const unsigned int primID)
+      : sx((unsigned short)sx), sy((unsigned short)sy), primID(primID) {}
+    __forceinline size_t x() const { return size_t(sx & 0x7fff); }
+    __forceinline size_t y() const { return size_t(sy & 0x7fff); }
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/primref_mb.h b/thirdparty/embree-aarch64/kernels/common/primref_mb.h
new file mode 100644
index 0000000000..b6c1ad5712
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/primref_mb.h
@@ -0,0 +1,262 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+#define MBLUR_BIN_LBBOX 1
+
+namespace embree
+{
+#if MBLUR_BIN_LBBOX
+
+  /*! Motion-blur primitive reference: linear bounds, geometry time range, ID and time segment counts. */
+  struct PrimRefMB
+  {
+    typedef LBBox3fa BBox;
+
+    __forceinline PrimRefMB () {}
+
+    /*! stores geomID/primID in the .a slots of bounds0 and the segment counts in the .a slots of bounds1 */
+    __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID)
+      : lbounds((LBBox3fx)lbounds_i), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+      lbounds.bounds0.lower.a = geomID;
+      lbounds.bounds0.upper.a = primID;
+      lbounds.bounds1.lower.a = activeTimeSegments;
+      lbounds.bounds1.upper.a = totalTimeSegments;
+    }
+
+    /*! EmptyTy-tagged variant of the size_t-ID constructor; the ID halves are written through .u, the slots ID() reads back */
+    __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id)
+      : lbounds((LBBox3fx)lbounds_i), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+#if defined(__X86_64__) || defined(__aarch64__)
+      lbounds.bounds0.lower.u = id & 0xFFFFFFFF;
+      lbounds.bounds0.upper.u = (id >> 32) & 0xFFFFFFFF;
+#else
+      lbounds.bounds0.lower.u = id;
+      lbounds.bounds0.upper.u = 0;
+#endif
+      lbounds.bounds1.lower.a = activeTimeSegments;
+      lbounds.bounds1.upper.a = totalTimeSegments;
+    }
+
+    /*! splits a size_t ID into 32-bit halves kept in bounds0 (only the low half on 32-bit targets) */
+    __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id)
+      : lbounds((LBBox3fx)lbounds_i), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+#if defined(__X86_64__) || defined(__aarch64__)
+      lbounds.bounds0.lower.u = id & 0xFFFFFFFF;
+      lbounds.bounds0.upper.u = (id >> 32) & 0xFFFFFFFF;
+#else
+      lbounds.bounds0.lower.u = id;
+      lbounds.bounds0.upper.u = 0;
+#endif
+      lbounds.bounds1.lower.a = activeTimeSegments;
+      lbounds.bounds1.upper.a = totalTimeSegments;
+    }
+
+    /*! returns bounds for binning */
+    __forceinline LBBox3fa bounds() const { return lbounds; }
+
+    /*! returns the number of active time segments of this primref */
+    __forceinline unsigned size() const { return lbounds.bounds1.lower.a; }
+
+    /*! returns the total number of time segments passed at construction */
+    __forceinline unsigned totalTimeSegments() const { return lbounds.bounds1.upper.a; }
+
+    /*! calculates the overlapping time segment range (delegates to getTimeSegmentRange) */
+    __forceinline range<int> timeSegmentRange(const BBox1f& range) const {
+      return getTimeSegmentRange(range,time_range,float(totalTimeSegments()));
+    }
+
+    /*! returns the time of time step i, with the steps spread uniformly over time_range;
+     *  i must lie in [0, totalTimeSegments()] */
+    __forceinline float timeStep(const int i) const {
+      assert(i>=0 && i<=(int)totalTimeSegments());
+      return time_range.lower + time_range.size()*float(i)/float(totalTimeSegments());
+    }
+
+    /*! checks if time range overlaps; the 0.9999f/1.0001f factors shrink this primref's time range slightly
+     *  (for positive times), so touching or barely overlapping ranges are reported as non-overlapping */
+    __forceinline bool time_range_overlap(const BBox1f& range) const
+    {
+      if (0.9999f*time_range.upper <= range.lower) return false;
+      if (1.0001f*time_range.lower >= range.upper) return false;
+      return true;
+    }
+
+    /*! returns center2 of the bounds interpolated at 0.5f, used as center for binning */
+    __forceinline Vec3fa binCenter() const {
+      return center2(lbounds.interpolate(0.5f));
+    }
+
+    /*! returns bounds and centroid used for binning */
+    __forceinline void binBoundsAndCenter(LBBox3fa& bounds_o, Vec3fa& center_o) const
+    {
+      bounds_o = bounds();
+      center_o = binCenter();
+    }
+
+    /*! returns the geometry ID */
+    __forceinline unsigned geomID() const {
+      return lbounds.bounds0.lower.a;
+    }
+
+    /*! returns the primitive ID */
+    __forceinline unsigned primID() const {
+      return lbounds.bounds0.upper.a;
+    }
+
+    /*! returns a size_t sized ID, recombined from the .u slots of bounds0 */
+    __forceinline size_t ID() const {
+#if defined(__X86_64__) || defined(__aarch64__)
+      return size_t(lbounds.bounds0.lower.u) + (size_t(lbounds.bounds0.upper.u) << 32);
+#else
+      return size_t(lbounds.bounds0.lower.u);
+#endif
+    }
+
+    /*! 64-bit key for operator<: primID in the high 32 bits, geomID in the low 32 bits */
+    __forceinline uint64_t ID64() const {
+      return (((uint64_t)primID()) << 32) + (uint64_t)geomID();
+    }
+
+    /*! allows sorting the primrefs by ID */
+    friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) {
+      return p0.ID64() < p1.ID64();
+    }
+
+    /*! Outputs primitive reference to a stream. */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) {
+      return cout << "{ time_range = " << ref.time_range << ", bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << ", active_segments = " << ref.size() << ",  total_segments = " << ref.totalTimeSegments() << " }";
+    }
+
+  public:
+    LBBox3fx lbounds;
+    BBox1f time_range; // entire geometry time range
+  };
+
+#else
+
+  /*! A primitive reference stores the bounds of the primitive and its ID. */
+  struct __aligned(16) PrimRefMB
+  { // this variant keeps a single box (the linear bounds evaluated at parameter 0.5) plus the time segment counts
+    typedef BBox3fa BBox;
+
+    __forceinline PrimRefMB () {} // performs no explicit initialization
+
+    __forceinline PrimRefMB (const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID)
+      : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+      bbox.lower.a = geomID; // the IDs are kept in the `a` members of the two box corners
+      bbox.upper.a = primID;
+    }
+    
+    __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id) // `empty` is not used in the body
+      : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+#if defined(__X86_64__) || defined(__aarch64__)
+      bbox.lower.u = id & 0xFFFFFFFF;         // low 32 bits of the ID
+      bbox.upper.u = (id >> 32) & 0xFFFFFFFF; // high 32 bits of the ID
+#else
+      bbox.lower.u = id; // on other targets only lower.u carries the ID
+      bbox.upper.u = 0;
+#endif
+    }
+    
+    /*! returns bounds for binning (the stored box, including the ID fields) */
+    __forceinline BBox3fa bounds() const {
+      return bbox;
+    }
+
+    /*! returns the number of active time segments of this primref */
+    __forceinline unsigned int size() const { 
+      return _activeTimeSegments;
+    }
+
+    __forceinline unsigned int totalTimeSegments() const { // total segment count passed to the constructor
+      return _totalTimeSegments;
+    }
+
+     /* calculate overlapping time segment range (forwards to getTimeSegmentRange) */
+    __forceinline range<int> timeSegmentRange(const BBox1f& range) const {
+      return getTimeSegmentRange(range,time_range,float(_totalTimeSegments));
+    }
+
+     /* returns the time of time step i, where steps 0.._totalTimeSegments are spaced evenly over time_range */
+    __forceinline float timeStep(const int i) const {
+      assert(i>=0 && i<=(int)_totalTimeSegments);
+      return time_range.lower + time_range.size()*float(i)/float(_totalTimeSegments);
+    }
+    
+    /*! checks if time range overlaps; the own bounds are scaled by 0.9999/1.0001 before the comparison */
+    __forceinline bool time_range_overlap(const BBox1f& range) const
+    {
+      if (0.9999f*time_range.upper <= range.lower) return false;
+      if (1.0001f*time_range.lower >= range.upper) return false;
+      return true;
+    }
+
+    /*! returns center for binning */
+    __forceinline Vec3fa binCenter() const {
+      return center2(bounds());
+    }
+
+    /*! returns bounds and centroid used for binning */
+    __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const
+    {
+      bounds_o = bounds();
+      center_o = center2(bounds()); // same expression as binCenter()
+    }
+
+    /*! returns the geometry ID */
+    __forceinline unsigned int geomID() const { 
+      return bbox.lower.a;
+    }
+
+    /*! returns the primitive ID */
+    __forceinline unsigned int primID() const { 
+      return bbox.upper.a;
+    }
+
+    /*! returns a size_t-sized ID */
+    __forceinline size_t ID() const { 
+#if defined(__X86_64__) || defined(__aarch64__)
+      return size_t(bbox.lower.u) + (size_t(bbox.upper.u) << 32); // lower.u -> bits 0..31, upper.u -> bits 32..63
+#else
+      return size_t(bbox.lower.u);
+#endif
+    }
+
+    /*! special function for operator<: primID in the high word, geomID in the low word */
+    __forceinline uint64_t ID64() const {
+      return (((uint64_t)primID()) << 32) + (uint64_t)geomID();
+    }
+    
+    /*! allows sorting the primrefs by ID */
+    friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) {
+      return p0.ID64() < p1.ID64();
+    }
+
+    /*! Outputs primitive reference to a stream. */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) {
+      return cout << "{ bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << ", active_segments = " << ref.size() << ",  total_segments = " << ref.totalTimeSegments() << " }";
+    }
+
+  public:
+    BBox3fa bbox; // bounds, geomID, primID
+    unsigned int _activeTimeSegments; // value returned by size()
+    unsigned int _totalTimeSegments;  // value returned by totalTimeSegments()
+    BBox1f time_range; // entire geometry time range
+  };
+
+#endif
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/profile.h b/thirdparty/embree-aarch64/kernels/common/profile.h
new file mode 100644
index 0000000000..a7de36414d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/profile.h
@@ -0,0 +1,159 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /*! helper structure for the implementation of the profile functions below */
+  struct ProfileTimer
+  {
+    static const size_t N = 20; // capacity of the per-section arrays
+    
+    ProfileTimer () : ProfileTimer(0) {} // default: same defined state as ProfileTimer(0)
+
+    ProfileTimer (const size_t numSkip) : i(0), j(0), maxJ(0), numSkip(numSkip), t0(0), tj(0)
+    {
+      for (size_t k=0; k<N; k++) names[k] = nullptr;
+      for (size_t k=0; k<N; k++) dt_fst[k] = 0.0;
+      for (size_t k=0; k<N; k++) dt_min[k] = pos_inf;
+      for (size_t k=0; k<N; k++) dt_avg[k] = 0.0;
+      for (size_t k=0; k<N; k++) dt_max[k] = neg_inf;
+    }
+    /*! starts one iteration: resets the section index and takes both reference time stamps */
+    __forceinline void begin() 
+    {
+      j=0;
+      t0 = tj = getSeconds();
+    }
+    /*! finishes one iteration: records the time since begin() as section "total" */
+    __forceinline void end() {
+      absolute("total");
+      i++;
+    }
+    /*! shorthand for relative(name) */
+    __forceinline void operator() (const char* name) {
+      relative(name);
+    }
+    /*! records the time elapsed since begin() as the next section */
+    __forceinline void absolute (const char* name) 
+    {
+      const double t1 = getSeconds();
+      const double dt = t1-t0;
+      assert(j < N && (names[j] == nullptr || names[j] == name)); // at most N sections; names are compared by pointer
+      names[j] = name;
+      if (i == 0) dt_fst[j] = dt;
+      if (i>=numSkip) { // the first numSkip iterations do not enter min/avg/max
+        dt_min[j] = min(dt_min[j],dt);
+        dt_avg[j] = dt_avg[j] + dt;
+        dt_max[j] = max(dt_max[j],dt);
+      }
+      j++;
+      maxJ = max(maxJ,j);
+    }
+    /*! records the time elapsed since the previous section (or begin()) as the next section */
+    __forceinline void relative (const char* name) 
+    {
+      const double t1 = getSeconds();
+      const double dt = t1-tj;
+      tj = t1;
+      assert(j < N && (names[j] == nullptr || names[j] == name)); // at most N sections; names are compared by pointer
+      names[j] = name;
+      if (i == 0) dt_fst[j] = dt;
+      if (i>=numSkip) { // the first numSkip iterations do not enter min/avg/max
+        dt_min[j] = min(dt_min[j],dt);
+        dt_avg[j] = dt_avg[j] + dt;
+        dt_max[j] = max(dt_max[j],dt);
+      }
+      j++;
+      maxJ = max(maxJ,j);
+    }
+    /*! prints throughput (numElements per time, scaled by 1E-6) and timings of all recorded sections */
+    void print(size_t numElements) 
+    {
+      const double n = numMeasured(); // dt_avg holds sums; divide on the fly so the timer state is not modified
+
+      printf("  profile [M/s]:\n");
+      for (size_t k=0; k<maxJ; k++) // the slowest run (dt_max) yields the minimal throughput and vice versa
+        printf("%20s:  fst = %7.2f M/s, min = %7.2f M/s, avg = %7.2f M/s, max = %7.2f M/s\n",
+               names[k],numElements/dt_fst[k]*1E-6,numElements/dt_max[k]*1E-6,numElements/(dt_avg[k]/n)*1E-6,numElements/dt_min[k]*1E-6);
+
+      printf("  profile [ms]:\n");
+      for (size_t k=0; k<maxJ; k++) 
+        printf("%20s:  fst = %7.2f ms, min = %7.2f ms, avg = %7.2f ms, max = %7.2fms\n",
+               names[k],1000.0*dt_fst[k],1000.0*dt_min[k],1000.0*(dt_avg[k]/n),1000.0*dt_max[k]);
+    }
+    /*! prints the timings of all recorded sections, scaled by 1000 */
+    void print() 
+    {
+      printf("  profile:\n");
+
+      const double n = numMeasured(); // dt_avg holds sums; divide on the fly so the timer state is not modified
+      for (size_t k=0; k<maxJ; k++) {
+        printf("%20s:  fst = %7.2f ms, min = %7.2f ms, avg = %7.2f ms, max = %7.2fms\n",
+               names[k],1000.0*dt_fst[k],1000.0*dt_min[k],1000.0*(dt_avg[k]/n),1000.0*dt_max[k]);
+      }
+    }
+    /*! returns the average of the last recorded section (end() records "total" last); 0 if nothing was recorded */
+    double avg() {
+      return (maxJ > 0) ? dt_avg[maxJ-1]/numMeasured() : 0.0;
+    }
+    
+  private:
+    double numMeasured() const { return (i > numSkip) ? double(i-numSkip) : 1.0; } // measured iterations, at least 1 to avoid dividing by zero
+
+    size_t i;             // number of finished iterations
+    size_t j;             // index of the next section within the current iteration
+    size_t maxJ;          // largest number of sections seen in one iteration
+    size_t numSkip;       // leading iterations excluded from min/avg/max
+    double t0;            // time stamp taken by begin()
+    double tj;            // time stamp of the previous relative() call (or begin())
+    const char* names[N]; // section names, stored as the pointers passed in
+    double dt_fst[N];     // duration measured in the first iteration
+    double dt_min[N];     // minimal duration over the measured iterations
+    double dt_avg[N];     // SUM of durations over the measured iterations (divided on output)
+    double dt_max[N];     // maximal duration over the measured iterations
+  };
+
+  /*! This function executes a code block multiple times and measures sections of it.
+      Use it in the following way:
+
+      profile(1,10,1000,[&](ProfileTimer& timer) {
+        // code
+        timer("A");
+        // code 
+        timer("B");
+      });
+  */
+  template<typename Closure>
+    void profile(const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure)
+    {
+      ProfileTimer timer(numSkip);
+      const size_t numRuns = numSkip+numIter; // skipped runs first, then the measured ones
+      for (size_t run=0; run<numRuns; ++run)
+      {
+        timer.begin();
+        closure(timer);
+        timer.end();
+      }
+      timer.print(numElements);
+    }
+
+  /*! similar to the function above, but the timer object is supplied by the caller */
+  template<typename Closure>
+    void profile(ProfileTimer& timer, const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure)
+    {
+      timer = ProfileTimer(numSkip); // any previous state of the caller's timer is discarded
+      const size_t numRuns = numSkip+numIter; // skipped runs first, then the measured ones
+      for (size_t run=0; run<numRuns; ++run)
+      {
+        timer.begin();
+        closure(timer);
+        timer.end();
+      }
+      timer.print(numElements);
+    }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/ray.h b/thirdparty/embree-aarch64/kernels/common/ray.h
new file mode 100644
index 0000000000..336d48942c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/ray.h
@@ -0,0 +1,1517 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "instance_stack.h"
+
+// FIXME: if ray gets separated into ray* and hit, uload4 needs to be adjusted
+
+namespace embree
+{
+  static const size_t MAX_INTERNAL_STREAM_SIZE = 32;
+
+  /* Ray structure for K rays */
+  template<int K>
+  struct RayK
+  {
+    /* Default construction does nothing */
+    __forceinline RayK() {}
+
+    /* Constructs a ray from origin, direction, and ray segment. Near
+     * has to be smaller than far (this is not checked here) */
+    __forceinline RayK(const Vec3vf<K>& org, const Vec3vf<K>& dir,
+                       const vfloat<K>& tnear = zero, const vfloat<K>& tfar = inf,
+                       const vfloat<K>& time = zero, const vint<K>& mask = -1, const vint<K>& id = 0, const vint<K>& flags = 0)
+      : org(org), dir(dir), _tnear(tnear), tfar(tfar), _time(time), mask(mask), id(id), flags(flags) {}
+
+    /* Returns the size of the ray, i.e. the template parameter K */
+    static __forceinline size_t size() { return K; }
+
+    /* Calculates if this is a valid ray that does not cause issues during traversal */
+    __forceinline vbool<K> valid() const
+    {
+      const vbool<K> vx = (abs(org.x) <= vfloat<K>(FLT_LARGE)) & (abs(dir.x) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vy = (abs(org.y) <= vfloat<K>(FLT_LARGE)) & (abs(dir.y) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vz = (abs(org.z) <= vfloat<K>(FLT_LARGE)) & (abs(dir.z) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vn = abs(tnear()) <= vfloat<K>(inf); // compared against inf, not FLT_LARGE — presumably rejects only NaN
+      const vbool<K> vf = abs(tfar) <= vfloat<K>(inf);    // same test for tfar
+      return vx & vy & vz & vn & vf;
+    }
+    /* Conversion between this packet and single rays; the definitions follow the RayK<1> specialization */
+    __forceinline void get(RayK<1>* ray) const;
+    __forceinline void get(size_t i, RayK<1>& ray) const;
+    __forceinline void set(const RayK<1>* ray);
+    __forceinline void set(size_t i, const RayK<1>& ray);
+
+    __forceinline void copy(size_t dest, size_t source);
+    /* Returns per lane a 3-bit code: bit value 1/2/4 is set if dir.x/dir.y/dir.z is negative */
+    __forceinline vint<K> octant() const
+    {
+      return select(dir.x < 0.0f, vint<K>(1), vint<K>(zero)) |
+             select(dir.y < 0.0f, vint<K>(2), vint<K>(zero)) |
+             select(dir.z < 0.0f, vint<K>(4), vint<K>(zero));
+    }
+
+    /* Ray data */
+    Vec3vf<K> org;    // ray origin
+    vfloat<K> _tnear; // start of ray segment
+    Vec3vf<K> dir;    // ray direction
+    vfloat<K> _time;  // time of this ray for motion blur
+    vfloat<K> tfar;   // end of ray segment
+    vint<K> mask;     // used to mask out objects during traversal
+    vint<K> id;       // ray ID
+    vint<K> flags;    // ray flags
+    /* Accessors for the underscore-prefixed members */
+    __forceinline vfloat<K>& tnear() { return _tnear; }
+    __forceinline vfloat<K>& time()  { return _time; }
+    __forceinline const vfloat<K>& tnear() const { return _tnear; }
+    __forceinline const vfloat<K>& time()  const { return _time; }
+  };
+
+  /* Ray+hit structure for K rays */
+  template<int K>
+  struct RayHitK : RayK<K>
+  {
+    using RayK<K>::org;
+    using RayK<K>::_tnear;
+    using RayK<K>::dir;
+    using RayK<K>::_time;
+    using RayK<K>::tfar;
+    using RayK<K>::mask;
+    using RayK<K>::id;
+    using RayK<K>::flags;
+
+    using RayK<K>::tnear;
+    using RayK<K>::time;
+
+    /* Default construction does nothing */
+    __forceinline RayHitK() {}
+
+    /* Constructs a ray from origin, direction, and ray segment. Near
+     * has to be smaller than far. The hit starts out empty: geomID and all instID levels are invalid */
+    __forceinline RayHitK(const Vec3vf<K>& org, const Vec3vf<K>& dir,
+                          const vfloat<K>& tnear = zero, const vfloat<K>& tfar = inf,
+                          const vfloat<K>& time = zero, const vint<K>& mask = -1, const vint<K>& id = 0, const vint<K>& flags = 0)
+      : RayK<K>(org, dir, tnear, tfar, time, mask, id, flags),
+        geomID(RTC_INVALID_GEOMETRY_ID) 
+    {
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+        instID[l] = RTC_INVALID_GEOMETRY_ID;
+    }
+    /* Extends a ray with an empty hit (geomID and all instID levels invalid) */
+    __forceinline RayHitK(const RayK<K>& ray)
+      : RayK<K>(ray),
+        geomID(RTC_INVALID_GEOMETRY_ID) 
+    {
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+        instID[l] = RTC_INVALID_GEOMETRY_ID;
+    }
+    /* Assigns the ray part and resets geomID/instID; Ng, u, v and primID are left untouched */
+    __forceinline RayHitK<K>& operator =(const RayK<K>& ray)
+    {
+      org    = ray.org;
+      _tnear = ray._tnear;
+      dir    = ray.dir;
+      _time  = ray._time;
+      tfar   = ray.tfar;
+      mask   = ray.mask;
+      id     = ray.id;
+      flags  = ray.flags;
+
+      geomID = RTC_INVALID_GEOMETRY_ID;
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+        instID[l] = RTC_INVALID_GEOMETRY_ID;
+
+      return *this;
+    }
+
+    /* Reports via throw_RTCError(RTC_ERROR_UNKNOWN) if a lane that is set in valid0 and has a valid geomID fails a range check */
+    __forceinline void verifyHit(const vbool<K>& valid0) const
+    {
+      vbool<K> valid = valid0 & geomID != vuint<K>(RTC_INVALID_GEOMETRY_ID); // `!=` binds tighter than `&`
+      const vbool<K> vt = (abs(tfar) <= vfloat<K>(FLT_LARGE)) | (tfar == vfloat<K>(neg_inf)); // -inf is accepted for tfar
+      const vbool<K> vu = (abs(u) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vv = (abs(v) <= vfloat<K>(FLT_LARGE)); // must test v; testing u here would let an invalid v pass
+      const vbool<K> vnx = abs(Ng.x) <= vfloat<K>(FLT_LARGE);
+      const vbool<K> vny = abs(Ng.y) <= vfloat<K>(FLT_LARGE);
+      const vbool<K> vnz = abs(Ng.z) <= vfloat<K>(FLT_LARGE);
+      if (any(valid & !vt)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid t");
+      if (any(valid & !vu)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid u");
+      if (any(valid & !vv)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid v");
+      if (any(valid & !vnx)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.x");
+      if (any(valid & !vny)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.y");
+      if (any(valid & !vnz)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.z");
+    }
+    /* Conversion between this packet and single ray/hit records; defined after RayHitK<1> */
+    __forceinline void get(RayHitK<1>* ray) const;
+    __forceinline void get(size_t i, RayHitK<1>& ray) const;
+    __forceinline void set(const RayHitK<1>* ray);
+    __forceinline void set(size_t i, const RayHitK<1>& ray);
+
+    __forceinline void copy(size_t dest, size_t source);
+
+    /* Hit data */
+    Vec3vf<K> Ng;   // geometry normal
+    vfloat<K> u;    // barycentric u coordinate of hit
+    vfloat<K> v;    // barycentric v coordinate of hit
+    vuint<K> primID; // primitive ID
+    vuint<K> geomID; // geometry ID
+    vuint<K> instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID
+  };
+
+  /* Specialization for a single ray */
+  template<>
+  struct RayK<1>
+  {
+    /* Default construction does nothing */
+    __forceinline RayK() {}
+
+    /* Constructs a ray from origin, direction, and ray segment. Near
+     *  has to be smaller than far. tnear and time are packed into the 4th component of org and dir */
+    __forceinline RayK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0)
+      : org(org,tnear), dir(dir,time), tfar(tfar), mask(mask), id(id), flags(flags) {}
+
+    /* Calculates if this is a valid ray that does not cause issues during traversal */
+    __forceinline bool valid() const {
+      return all(le_mask(abs(Vec3fa(org)), Vec3fa(FLT_LARGE)) & le_mask(abs(Vec3fa(dir)), Vec3fa(FLT_LARGE))) && abs(tnear()) <= float(inf) && abs(tfar) <= float(inf);
+    }
+
+    /* Ray data */
+    Vec3ff org;  // 3 floats for ray origin, 1 float for tnear
+    // tnear (start of ray segment) has no member of its own: it is org.w, see tnear()
+    Vec3ff dir;  // 3 floats for ray direction, 1 float for time
+    // time has no member of its own: it is dir.w, see time()
+    float tfar;  // end of ray segment
+    int mask;    // used to mask out objects during traversal
+    int id;      // ray ID
+    int flags;   // ray flags
+
+    __forceinline float& tnear() { return org.w; }; // start of ray segment
+    __forceinline const float& tnear() const { return org.w; };
+
+    __forceinline float& time() { return dir.w; }; // ray time
+    __forceinline const float& time() const { return dir.w; };
+
+  };
+
+  template<>
+  struct RayHitK<1> : RayK<1>
+  {
+    /* Default construction does nothing */
+    __forceinline RayHitK() {}
+
+    /* Constructs a ray from origin, direction, and ray segment. Near
+     *  has to be smaller than far. Only geomID is initialized in the hit part */
+    __forceinline RayHitK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0)
+      : RayK<1>(org, dir, tnear, tfar, time, mask, id, flags),
+        geomID(RTC_INVALID_GEOMETRY_ID) {}
+
+    __forceinline RayHitK(const RayK<1>& ray) // copies the ray part and marks the hit as invalid
+      : RayK<1>(ray),
+        geomID(RTC_INVALID_GEOMETRY_ID) {}
+
+    __forceinline RayHitK<1>& operator =(const RayK<1>& ray) // assigns the ray part and resets geomID
+    {
+      org    = ray.org; // also carries tnear (org.w)
+      dir    = ray.dir; // also carries time (dir.w)
+      tfar   = ray.tfar;
+      mask   = ray.mask;
+      id     = ray.id;
+      flags  = ray.flags;
+
+      geomID = RTC_INVALID_GEOMETRY_ID;
+
+      return *this;
+    }
+
+    /* Reports via throw_RTCError(RTC_ERROR_UNKNOWN) if a hit with valid geomID fails a range check */
+    __forceinline void verifyHit() const
+    {
+      if (geomID == RTC_INVALID_GEOMETRY_ID) return; // no hit, nothing to verify
+      const bool vt = (abs(tfar) <= FLT_LARGE) || (tfar == float(neg_inf)); // -inf is accepted for tfar
+      const bool vu = (abs(u) <= FLT_LARGE);
+      const bool vv = (abs(v) <= FLT_LARGE); // must test v; testing u here would let an invalid v pass
+      const bool vnx = abs(Ng.x) <= FLT_LARGE;
+      const bool vny = abs(Ng.y) <= FLT_LARGE;
+      const bool vnz = abs(Ng.z) <= FLT_LARGE;
+      if (!vt) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid t");
+      if (!vu) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid u");
+      if (!vv) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid v");
+      if (!vnx) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.x");
+      if (!vny) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.y");
+      if (!vnz) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.z");
+    }
+
+    /* Hit data */
+    Vec3f Ng;            // not normalized geometry normal
+    float u;             // barycentric u coordinate of hit
+    float v;             // barycentric v coordinate of hit
+    unsigned int primID; // primitive ID
+    unsigned int geomID; // geometry ID
+    unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID
+  };
+
+  /* Converts ray packet to single rays */
+  template<int K>
+  __forceinline void RayK<K>::get(RayK<1>* ray) const
+  {
+    // FIXME: use SIMD transpose
+    // Lane by lane: the single-lane overload writes org, tnear, dir, time, tfar, mask, id and flags.
+    for (size_t lane = 0; lane < K; ++lane)
+    {
+      get(lane, ray[lane]);
+    }
+  }
+
+  template<int K>
+  __forceinline void RayHitK<K>::get(RayHitK<1>* ray) const
+  {
+    // FIXME: use SIMD transpose
+    // one single-lane extraction (ray and hit members) per packet lane
+    for (size_t lane = 0; lane < K; ++lane) get(lane, ray[lane]);
+  }
+
+  /* Extracts a single ray out of a ray packet*/
+  template<int K>
+  __forceinline void RayK<K>::get(size_t i, RayK<1>& ray) const
+  {
+    ray.org.x = org.x[i]; ray.org.y = org.y[i]; ray.org.z = org.z[i]; ray.tnear() = tnear()[i]; // origin and tnear of lane i
+    ray.dir.x = dir.x[i]; ray.dir.y = dir.y[i]; ray.dir.z = dir.z[i]; ray.time()  = time()[i];  // direction and time of lane i
+    ray.tfar  = tfar[i]; ray.mask = mask[i];  ray.id = id[i]; ray.flags = flags[i]; // remaining ray members of lane i
+  }
+
+  template<int K>
+  __forceinline void RayHitK<K>::get(size_t i, RayHitK<1>& ray) const
+  {
+    ray.org.x = org.x[i]; ray.org.y = org.y[i]; ray.org.z = org.z[i]; ray.tnear() = tnear()[i]; // ray part of lane i ...
+    ray.dir.x = dir.x[i]; ray.dir.y = dir.y[i]; ray.dir.z = dir.z[i]; ray.tfar  = tfar[i]; ray.time()  = time()[i]; 
+    ray.mask = mask[i];  ray.id = id[i]; ray.flags = flags[i];
+    ray.Ng.x = Ng.x[i]; ray.Ng.y = Ng.y[i]; ray.Ng.z = Ng.z[i]; // ... followed by the hit part of lane i
+    ray.u = u[i]; ray.v = v[i];
+    ray.primID = primID[i]; ray.geomID = geomID[i]; 
+
+    instance_id_stack::copy(instID, ray.instID, i); // presumably copies lane i of every instID level into ray.instID — confirm in instance_stack.h
+  }
+
+  /* Converts single rays to ray packet */
+  template<int K>
+  __forceinline void RayK<K>::set(const RayK<1>* ray)
+  {
+    // FIXME: use SIMD transpose
+    // one single-lane insertion per packet lane
+    for (size_t lane = 0; lane < K; ++lane) set(lane, ray[lane]);
+  }
+
+  template<int K>
+  __forceinline void RayHitK<K>::set(const RayHitK<1>* ray)
+  {
+    // FIXME: use SIMD transpose
+    // one single-lane insertion (ray and hit members) per packet lane
+    for (size_t lane = 0; lane < K; ++lane) set(lane, ray[lane]);
+  }
+
+  /* inserts a single ray into a ray packet element */
+  template<int K>
+  __forceinline void RayK<K>::set(size_t i, const RayK<1>& ray)
+  {
+    org.x[i] = ray.org.x; org.y[i] = ray.org.y; org.z[i] = ray.org.z; tnear()[i] = ray.tnear(); // origin and tnear into lane i
+    dir.x[i] = ray.dir.x; dir.y[i] = ray.dir.y; dir.z[i] = ray.dir.z; time()[i] = ray.time();   // direction and time into lane i
+    tfar[i] = ray.tfar; mask[i] = ray.mask; id[i] = ray.id; flags[i] = ray.flags;               // remaining ray members into lane i
+  }
+
+  template<int K>
+  __forceinline void RayHitK<K>::set(size_t i, const RayHitK<1>& ray)
+  {
+    org.x[i] = ray.org.x; org.y[i] = ray.org.y; org.z[i] = ray.org.z; tnear()[i] = ray.tnear(); // ray part into lane i ...
+    dir.x[i] = ray.dir.x; dir.y[i] = ray.dir.y; dir.z[i] = ray.dir.z; time()[i] = ray.time();
+    tfar[i] = ray.tfar; mask[i] = ray.mask; id[i] = ray.id; flags[i] = ray.flags;
+    Ng.x[i] = ray.Ng.x; Ng.y[i] = ray.Ng.y; Ng.z[i] = ray.Ng.z; // ... followed by the hit part into lane i
+    u[i] = ray.u; v[i] = ray.v;
+    primID[i] = ray.primID; geomID[i] = ray.geomID;
+
+    instance_id_stack::copy(ray.instID, instID, i); // presumably writes ray.instID into lane i of every instID level — confirm in instance_stack.h
+  }
+
+  /* copies a ray packet element into another element*/
+  template<int K>
+  __forceinline void RayK<K>::copy(size_t dest, size_t source)
+  {
+    org.x[dest] = org.x[source]; org.y[dest] = org.y[source]; org.z[dest] = org.z[source]; tnear()[dest] = tnear()[source]; // lane `source` overwrites lane `dest`
+    dir.x[dest] = dir.x[source]; dir.y[dest] = dir.y[source]; dir.z[dest] = dir.z[source]; time()[dest] = time()[source]; 
+    tfar [dest] = tfar[source]; mask[dest] = mask[source]; id[dest] = id[source]; flags[dest] = flags[source]; 
+  }
+
+  template<int K>
+  __forceinline void RayHitK<K>::copy(size_t dest, size_t source)
+  {
+    org.x[dest] = org.x[source]; org.y[dest] = org.y[source]; org.z[dest] = org.z[source]; tnear()[dest] = tnear()[source]; // ray part: lane `source` overwrites lane `dest`
+    dir.x[dest] = dir.x[source]; dir.y[dest] = dir.y[source]; dir.z[dest] = dir.z[source]; time()[dest] = time()[source]; 
+    tfar [dest] = tfar[source]; mask[dest] = mask[source]; id[dest] = id[source]; flags[dest] = flags[source];
+    Ng.x[dest] = Ng.x[source]; Ng.y[dest] = Ng.y[source]; Ng.z[dest] = Ng.z[source]; // hit part
+    u[dest] = u[source]; v[dest] = v[source];
+    primID[dest] = primID[source]; geomID[dest] = geomID[source];  
+
+    instance_id_stack::copy(instID, instID, source, dest); // note the argument order (source, dest); presumably a lane-to-lane copy within instID — confirm in instance_stack.h
+  }
+
+  /* Shortcuts */
+  using Ray   = RayK<1>;
+  using Ray4  = RayK<4>;
+  using Ray8  = RayK<8>;
+  using Ray16 = RayK<16>;
+  struct RayN; // declared only
+
+  using RayHit   = RayHitK<1>;
+  using RayHit4  = RayHitK<4>;
+  using RayHit8  = RayHitK<8>;
+  using RayHit16 = RayHitK<16>;
+  struct RayHitN; // declared only
+
+  template<int K, bool intersect>
+  struct RayTypeHelper; // maps (packet width, intersect flag) to a ray type
+
+  template<int K>
+  struct RayTypeHelper<K, true>
+  {
+    using Ty = RayHitK<K>; // intersect == true: ray plus hit record
+  };
+
+  template<int K>
+  struct RayTypeHelper<K, false>
+  {
+    using Ty = RayK<K>; // intersect == false: plain ray
+  };
+
+  template<bool intersect>
+  using RayType = typename RayTypeHelper<1, intersect>::Ty; // single-ray shortcut
+
+  template<int K, bool intersect>
+  using RayTypeK = typename RayTypeHelper<K, intersect>::Ty; // K-wide packet
+
+  /* Outputs ray to stream */
+  template<int K>
+  __forceinline embree_ostream operator <<(embree_ostream cout, const RayK<K>& ray)
+  { // prints one "name = value" line per ray member and returns the stream for chaining
+    return cout << "{ " << embree_endl
+                << "  org = " << ray.org << embree_endl
+                << "  dir = " << ray.dir << embree_endl
+                << "  near = " << ray.tnear() << embree_endl
+                << "  far = " << ray.tfar << embree_endl
+                << "  time = " << ray.time() << embree_endl
+                << "  mask = " << ray.mask << embree_endl
+                << "  id = " << ray.id << embree_endl
+                << "  flags = " << ray.flags << embree_endl
+                << "}";
+  }
+
+  template<int K>
+  __forceinline embree_ostream operator <<(embree_ostream cout, const RayHitK<K>& ray)
+  { // prints one "name = value" line per ray and hit member; all instID levels share one line
+    cout << "{ " << embree_endl
+         << "  org = " << ray.org << embree_endl
+         << "  dir = " << ray.dir << embree_endl
+         << "  near = " << ray.tnear() << embree_endl
+         << "  far = " << ray.tfar << embree_endl
+         << "  time = " << ray.time() << embree_endl
+         << "  mask = " << ray.mask << embree_endl
+         << "  id = " << ray.id << embree_endl
+         << "  flags = " << ray.flags << embree_endl
+         << "  Ng = " << ray.Ng << embree_endl
+         << "  u = " << ray.u <<  embree_endl
+         << "  v = " << ray.v << embree_endl
+         << "  primID = " << ray.primID <<  embree_endl
+         << "  geomID = " << ray.geomID << embree_endl
+         << "  instID =";
+    for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+    {
+      cout << " " << ray.instID[l];
+    }
+    cout << embree_endl;
+    return cout << "}";
+  }
+
+  struct RayStreamSOA
+  {
+    __forceinline RayStreamSOA(void* rays, size_t N) // wraps the caller's buffer; nothing is copied
+      : ptr((char*)rays), N(N) {} // ptr is kept as a byte pointer so the accessors can index in bytes
+
+    /* ray data access functions: field k starts at byte k*4*N of the buffer; `offset` is added as a byte offset */
+    __forceinline float* org_x(size_t offset = 0) { return (float*)&ptr[0*4*N+offset]; }  // x coordinate of ray origin
+    __forceinline float* org_y(size_t offset = 0) { return (float*)&ptr[1*4*N+offset]; }  // y coordinate of ray origin
+    __forceinline float* org_z(size_t offset = 0) { return (float*)&ptr[2*4*N+offset]; }; // z coordinate of ray origin
+    __forceinline float* tnear(size_t offset = 0) { return (float*)&ptr[3*4*N+offset]; }; // start of ray segment
+
+    __forceinline float* dir_x(size_t offset = 0) { return (float*)&ptr[4*4*N+offset]; }; // x coordinate of ray direction
+    __forceinline float* dir_y(size_t offset = 0) { return (float*)&ptr[5*4*N+offset]; }; // y coordinate of ray direction
+    __forceinline float* dir_z(size_t offset = 0) { return (float*)&ptr[6*4*N+offset]; }; // z coordinate of ray direction
+    __forceinline float* time (size_t offset = 0) { return (float*)&ptr[7*4*N+offset]; }; // time of this ray for motion blur
+
+    __forceinline float* tfar (size_t offset = 0) { return (float*)&ptr[8*4*N+offset]; }; // end of ray segment (set to hit distance)
+    __forceinline int*   mask (size_t offset = 0) { return (int*)&ptr[9*4*N+offset];   }; // used to mask out objects during traversal (optional)
+    __forceinline int*   id   (size_t offset = 0) { return (int*)&ptr[10*4*N+offset];  }; // ray ID
+    __forceinline int*   flags(size_t offset = 0) { return (int*)&ptr[11*4*N+offset];  }; // ray flags
+
+    /* hit data access functions: same addressing scheme, fields 12 and up */
+    __forceinline float* Ng_x(size_t offset = 0) { return (float*)&ptr[12*4*N+offset]; }; // x coordinate of geometry normal
+    __forceinline float* Ng_y(size_t offset = 0) { return (float*)&ptr[13*4*N+offset]; }; // y coordinate of geometry normal
+    __forceinline float* Ng_z(size_t offset = 0) { return (float*)&ptr[14*4*N+offset]; }; // z coordinate of geometry normal
+
+    __forceinline float* u(size_t offset = 0) { return (float*)&ptr[15*4*N+offset]; };    // barycentric u coordinate of hit
+    __forceinline float* v(size_t offset = 0) { return (float*)&ptr[16*4*N+offset]; };    // barycentric v coordinate of hit
+
+    __forceinline unsigned int* primID(size_t offset = 0) { return (unsigned int*)&ptr[17*4*N+offset]; };   // primitive ID
+    __forceinline unsigned int* geomID(size_t offset = 0) { return (unsigned int*)&ptr[18*4*N+offset]; };   // geometry ID
+    __forceinline unsigned int* instID(size_t level, size_t offset = 0) { return (unsigned int*)&ptr[19*4*N+level*4*N+offset]; };   // instance ID of the given level (one field per level, starting at field 19)
+
+    __forceinline Ray getRayByOffset(size_t offset)
+    { // builds a single Ray from the element found at byte `offset` of every ray field (hit fields are not read)
+      Ray ray;
+      ray.org.x   = org_x(offset)[0];
+      ray.org.y   = org_y(offset)[0];
+      ray.org.z   = org_z(offset)[0];
+      ray.tnear() = tnear(offset)[0];
+      ray.dir.x   = dir_x(offset)[0];
+      ray.dir.y   = dir_y(offset)[0];
+      ray.dir.z   = dir_z(offset)[0];
+      ray.time()  = time(offset)[0];
+      ray.tfar    = tfar(offset)[0];
+      ray.mask    = mask(offset)[0];
+      ray.id      = id(offset)[0];
+      ray.flags   = flags(offset)[0];
+      return ray;
+    }
+
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(size_t offset)
+    { // builds a K-wide packet with unmasked loads starting at byte `offset` of every ray field
+      RayK<K> ray;
+      ray.org.x   = vfloat<K>::loadu(org_x(offset));
+      ray.org.y   = vfloat<K>::loadu(org_y(offset));
+      ray.org.z   = vfloat<K>::loadu(org_z(offset));
+      ray.tnear() = vfloat<K>::loadu(tnear(offset)); // tnear is an accessor of RayK<K>, so it must be called
+      ray.dir.x   = vfloat<K>::loadu(dir_x(offset));
+      ray.dir.y   = vfloat<K>::loadu(dir_y(offset));
+      ray.dir.z   = vfloat<K>::loadu(dir_z(offset));
+      ray.time()  = vfloat<K>::loadu(time(offset));  // time is an accessor of RayK<K>, so it must be called
+      ray.tfar    = vfloat<K>::loadu(tfar(offset));
+      ray.mask    = vint<K>::loadu(mask(offset));
+      ray.id      = vint<K>::loadu(id(offset));
+      ray.flags   = vint<K>::loadu(flags(offset));
+      return ray;
+    }
+
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, size_t offset)
+    { // masked variant: every load is given `valid`; see the SSE note below for mask/id/flags
+      RayK<K> ray;
+      ray.org.x   = vfloat<K>::loadu(valid, org_x(offset));
+      ray.org.y   = vfloat<K>::loadu(valid, org_y(offset));
+      ray.org.z   = vfloat<K>::loadu(valid, org_z(offset));
+      ray.tnear() = vfloat<K>::loadu(valid, tnear(offset));
+      ray.dir.x   = vfloat<K>::loadu(valid, dir_x(offset));
+      ray.dir.y   = vfloat<K>::loadu(valid, dir_y(offset));
+      ray.dir.z   = vfloat<K>::loadu(valid, dir_z(offset));
+      ray.time()  = vfloat<K>::loadu(valid, time(offset));
+      ray.tfar  = vfloat<K>::loadu(valid, tfar(offset));
+
+#if !defined(__AVX__)
+      /* SSE: some ray members must be loaded with scalar instructions to ensure that we don't cause memory faults,
+         because the SSE masked loads always access the entire vector */
+      if (unlikely(!all(valid)))
+      {
+        ray.mask  = zero; // lanes that are not valid keep these zeros
+        ray.id    = zero;
+        ray.flags = zero;
+
+        for (size_t k = 0; k < K; k++)
+        {
+          if (likely(valid[k]))
+          {
+            ray.mask[k]  = mask(offset)[k];
+            ray.id[k]    = id(offset)[k];
+            ray.flags[k] = flags(offset)[k];
+          }
+        }
+      }
+      else
+#endif
+      { // taken always when __AVX__ is defined, otherwise only if all lanes are valid
+        ray.mask  = vint<K>::loadu(valid, mask(offset));
+        ray.id    = vint<K>::loadu(valid, id(offset));
+        ray.flags = vint<K>::loadu(valid, flags(offset));
+      }
+
+      return ray;
+    }
+
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayHitK<K>& ray)
+    {
+      /* 
+       * valid_i: stores which of the input rays exist (do not access nonexistent rays!)
+       * valid:   stores which of the rays actually hit something (valid_i and geomID is valid).
+       */
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+      if (likely(any(valid)))
+      {
+        vfloat<K>::storeu(valid, tfar(offset), ray.tfar);
+        vfloat<K>::storeu(valid, Ng_x(offset), ray.Ng.x);
+        vfloat<K>::storeu(valid, Ng_y(offset), ray.Ng.y);
+        vfloat<K>::storeu(valid, Ng_z(offset), ray.Ng.z);
+        vfloat<K>::storeu(valid, u(offset), ray.u);
+        vfloat<K>::storeu(valid, v(offset), ray.v);
+
+#if !defined(__AVX__)
+        /* SSE: some ray members must be stored with scalar instructions to ensure that we don't cause memory faults,
+           because the SSE masked stores always access the entire vector */
+        if (unlikely(!all(valid_i))) // tests valid_i (existing rays), not valid (rays that hit)
+        {
+          for (size_t k = 0; k < K; k++)
+          {
+            if (likely(valid[k]))
+            {
+              primID(offset)[k] = ray.primID[k];
+              geomID(offset)[k] = ray.geomID[k];
+
+              instID(0, offset)[k] = ray.instID[0][k];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+              for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) // level l is written only while level l-1 holds a valid ID
+                instID(l, offset)[k] = ray.instID[l][k];
+#endif
+            }
+          }
+        }
+        else
+#endif
+        { // taken always when __AVX__ is defined, otherwise only if all input rays exist
+          vuint<K>::storeu(valid, primID(offset), ray.primID);
+          vuint<K>::storeu(valid, geomID(offset), ray.geomID);
+
+          vuint<K>::storeu(valid, instID(0, offset), ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) // continue while any hit lane has a valid ID at level l-1
+            vuint<K>::storeu(valid, instID(l, offset), ray.instID[l]);
+#endif
+        }
+      }
+    }
+
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayK<K>& ray)
+    {
+      /* write back tfar only for input lanes whose tfar is negative */
+      const vbool<K> writeMask = valid_i & (ray.tfar < 0.0f);
+
+      if (likely(any(writeMask)))
+        vfloat<K>::storeu(writeMask, tfar(offset), ray.tfar);
+    }
+
+    __forceinline size_t getOctantByOffset(size_t offset)
+    {
+      /* one bit per axis (x=1, y=2, z=4), set when that direction component is negative */
+      const bool negX = dir_x(offset)[0] < 0.0f;
+      const bool negY = dir_y(offset)[0] < 0.0f;
+      const bool negZ = dir_z(offset)[0] < 0.0f;
+      return size_t(negX ? 1 : 0) + size_t(negY ? 2 : 0) + size_t(negZ ? 4 : 0);
+    }
+
+    __forceinline bool isValidByOffset(size_t offset)
+    {
+      const float segBegin = tnear(offset)[0]; /* first element of the tnear data at offset */
+      const float segEnd   = tfar(offset)[0];  /* first element of the tfar data at offset */
+      return segBegin <= segEnd;
+    }
+
+    template<int K> // Loads a K-wide ray packet in which every lane is read from its own entry of `offset`.
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset)
+    {
+      RayK<K> ray;
+
+#if defined(__AVX2__)
+      ray.org.x   = vfloat<K>::template gather<1>(valid, org_x(), offset); // masked gather; NOTE(review): <1> is presumably the gather scale — confirm
+      ray.org.y   = vfloat<K>::template gather<1>(valid, org_y(), offset);
+      ray.org.z   = vfloat<K>::template gather<1>(valid, org_z(), offset);
+      ray.tnear() = vfloat<K>::template gather<1>(valid, tnear(), offset);
+      ray.dir.x   = vfloat<K>::template gather<1>(valid, dir_x(), offset);
+      ray.dir.y   = vfloat<K>::template gather<1>(valid, dir_y(), offset);
+      ray.dir.z   = vfloat<K>::template gather<1>(valid, dir_z(), offset);
+      ray.time()  = vfloat<K>::template gather<1>(valid, time(), offset);
+      ray.tfar    = vfloat<K>::template gather<1>(valid, tfar(), offset);
+      ray.mask    = vint<K>::template gather<1>(valid, mask(), offset);
+      ray.id      = vint<K>::template gather<1>(valid, id(), offset);
+      ray.flags   = vint<K>::template gather<1>(valid, flags(), offset);
+#else
+      ray.org     = zero; // path without AVX2: clear every member first so lanes outside `valid` read as zero
+      ray.tnear() = zero;
+      ray.dir     = zero;
+      ray.time()  = zero;
+      ray.tfar    = zero;
+      ray.mask    = zero;
+      ray.id      = zero;
+      ray.flags   = zero;
+
+      for (size_t k = 0; k < K; k++)
+      {
+        if (likely(valid[k])) // scalar loads, one active lane at a time
+        {
+          const size_t ofs = offset[k]; // this lane's own offset into the stream
+
+          ray.org.x[k]   = *org_x(ofs);
+          ray.org.y[k]   = *org_y(ofs);
+          ray.org.z[k]   = *org_z(ofs);
+          ray.tnear()[k] = *tnear(ofs);
+          ray.dir.x[k]   = *dir_x(ofs);
+          ray.dir.y[k]   = *dir_y(ofs);
+          ray.dir.z[k]   = *dir_z(ofs);
+          ray.time()[k]  = *time(ofs);
+          ray.tfar[k]    = *tfar(ofs);
+          ray.mask[k]    = *mask(ofs);
+          ray.id[k]      = *id(ofs);
+          ray.flags[k]   = *flags(ofs);
+        }
+      }
+#endif
+
+      return ray;
+    }
+
+    template<int K> // Writes hit results of a K-wide packet back to the stream, each lane at its own entry of `offset`.
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); // keep only input lanes whose geomID is not the invalid marker
+
+      if (likely(any(valid))) // nothing is written when no lane recorded a hit
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(valid, tfar(), offset, ray.tfar); // masked scatter; NOTE(review): <1> is presumably the scatter scale — confirm
+        vfloat<K>::template scatter<1>(valid, Ng_x(), offset, ray.Ng.x);
+        vfloat<K>::template scatter<1>(valid, Ng_y(), offset, ray.Ng.y);
+        vfloat<K>::template scatter<1>(valid, Ng_z(), offset, ray.Ng.z);
+        vfloat<K>::template scatter<1>(valid, u(), offset, ray.u);
+        vfloat<K>::template scatter<1>(valid, v(), offset, ray.v);
+        vuint<K>::template scatter<1>(valid, primID(), offset, ray.primID);
+        vuint<K>::template scatter<1>(valid, geomID(), offset, ray.geomID);
+
+        vuint<K>::template scatter<1>(valid, instID(0), offset, ray.instID[0]); // level 0 is always written
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+        for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) // continue while any hit lane has a valid ID one level up
+          vuint<K>::template scatter<1>(valid, instID(l), offset, ray.instID[l]);
+#endif
+#else
+        size_t valid_bits = movemask(valid); // one bit per hit lane
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits); // NOTE(review): bscf presumably returns a set bit's index and clears it in valid_bits; the loop relies on that to terminate
+          const size_t ofs = offset[k];
+
+          *tfar(ofs) = ray.tfar[k];
+
+          *Ng_x(ofs)   = ray.Ng.x[k];
+          *Ng_y(ofs)   = ray.Ng.y[k];
+          *Ng_z(ofs)   = ray.Ng.z[k];
+          *u(ofs)      = ray.u[k];
+          *v(ofs)      = ray.v[k];
+          *primID(ofs) = ray.primID[k];
+          *geomID(ofs) = ray.geomID[k];
+
+          *instID(0, ofs) = ray.instID[0][k];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) // stop after the first level holding the invalid ID
+            *instID(l, ofs) = ray.instID[l][k];
+#endif
+        }
+#endif
+      }
+    }
+
+    template<int K> // Writes tfar back for input lanes whose tfar is negative, each lane at its own entry of `offset`.
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f); // only lanes with a negative tfar are written back
+
+      if (likely(any(valid)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(valid, tfar(), offset, ray.tfar);
+#else
+        size_t valid_bits = movemask(valid); // one bit per lane to write
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits); // NOTE(review): bscf presumably also clears the returned bit in valid_bits — confirm
+          const size_t ofs = offset[k];
+
+          *tfar(ofs) = ray.tfar[k];
+        }
+#endif
+      }
+    }
+
+    char* __restrict__ ptr;
+    size_t N;
+  };
+
+  template<size_t MAX_K> // RayStreamSOA variant that carries its own storage in the embedded `data` array.
+  struct StackRayStreamSOA : public RayStreamSOA
+  {
+    __forceinline StackRayStreamSOA(size_t K)
+      : RayStreamSOA(data, K) { assert(K <= MAX_K); } // base receives the address of `data`; K above MAX_K is only caught by the assert
+
+    char data[MAX_K / 4 * sizeof(RayHit4)]; // NOTE(review): MAX_K / 4 truncates, so MAX_K is presumably expected to be a multiple of 4 — confirm
+  };
+
+
+  struct RayStreamSOP // Ray stream given as one base pointer per ray/hit component; elements are addressed by adding a byte offset to each pointer.
+  {
+    template<class T> // Points every component pointer at the matching member of `t`.
+    __forceinline void init(T& t)
+    {
+      org_x  = (float*)&t.org.x;
+      org_y  = (float*)&t.org.y;
+      org_z  = (float*)&t.org.z;
+      tnear  = (float*)&t.tnear;
+      dir_x  = (float*)&t.dir.x;
+      dir_y  = (float*)&t.dir.y;
+      dir_z  = (float*)&t.dir.z;
+      time   = (float*)&t.time;
+      tfar   = (float*)&t.tfar;
+      mask   = (unsigned int*)&t.mask;
+      id     = (unsigned int*)&t.id;
+      flags  = (unsigned int*)&t.flags;
+
+      Ng_x   = (float*)&t.Ng.x;
+      Ng_y   = (float*)&t.Ng.y;
+      Ng_z   = (float*)&t.Ng.z;
+      u      = (float*)&t.u;
+      v      = (float*)&t.v;
+      primID = (unsigned int*)&t.primID;
+      geomID = (unsigned int*)&t.geomID;
+
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) // one pointer per instancing level
+        instID[l] = (unsigned int*)&t.instID[l];
+    }
+
+    __forceinline Ray getRayByOffset(size_t offset) // Assembles one Ray from the values found `offset` bytes past each base pointer.
+    {
+      Ray ray;
+      ray.org.x   = *(float* __restrict__)((char*)org_x + offset);
+      ray.org.y   = *(float* __restrict__)((char*)org_y + offset);
+      ray.org.z   = *(float* __restrict__)((char*)org_z + offset);
+      ray.dir.x   = *(float* __restrict__)((char*)dir_x + offset);
+      ray.dir.y   = *(float* __restrict__)((char*)dir_y + offset);
+      ray.dir.z   = *(float* __restrict__)((char*)dir_z + offset);
+      ray.tfar  = *(float* __restrict__)((char*)tfar + offset);
+      ray.tnear() = tnear ? *(float* __restrict__)((char*)tnear + offset) : 0.0f; // null tnear pointer: default 0
+      ray.time()  = time ? *(float* __restrict__)((char*)time + offset) : 0.0f; // null time pointer: default 0
+      ray.mask    = mask ? *(unsigned int* __restrict__)((char*)mask + offset) : -1; // null mask/id/flags pointers: default -1
+      ray.id      = id ? *(unsigned int* __restrict__)((char*)id + offset) : -1;
+      ray.flags   = flags ? *(unsigned int* __restrict__)((char*)flags + offset) : -1;
+      return ray;
+    }
+
+    template<int K> // Masked loadu of a K-wide packet starting `offset` bytes past each base pointer.
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, size_t offset)
+    {
+      RayK<K> ray;
+      ray.org.x   = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_x + offset));
+      ray.org.y   = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_y + offset));
+      ray.org.z   = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_z + offset));
+      ray.dir.x   = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_x + offset));
+      ray.dir.y   = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_y + offset));
+      ray.dir.z   = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_z + offset));
+      ray.tfar    = vfloat<K>::loadu(valid, (float* __restrict__)((char*)tfar + offset));
+      ray.tnear() = tnear ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)tnear + offset)) : 0.0f; // same defaults for null pointers as the scalar overload
+      ray.time()  = time ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)time + offset)) : 0.0f;
+      ray.mask    = mask ? vint<K>::loadu(valid, (const void* __restrict__)((char*)mask + offset)) : -1;
+      ray.id      = id ? vint<K>::loadu(valid, (const void* __restrict__)((char*)id + offset)) : -1;
+      ray.flags   = flags ? vint<K>::loadu(valid, (const void* __restrict__)((char*)flags + offset)) : -1;
+      return ray;
+    }
+
+    template<int K> // Masked loadu of only the three direction components of a K-wide packet.
+    __forceinline Vec3vf<K> getDirByOffset(const vbool<K>& valid, size_t offset)
+    {
+      Vec3vf<K> dir;
+      dir.x = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_x + offset));
+      dir.y = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_y + offset));
+      dir.z = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_z + offset));
+      return dir;
+    }
+
+    __forceinline void setHitByOffset(size_t offset, const RayHit& ray) // Writes the hit data of one ray; does nothing when ray.geomID is the invalid marker.
+    {
+      if (ray.geomID != RTC_INVALID_GEOMETRY_ID)
+      {
+        *(float* __restrict__)((char*)tfar + offset) = ray.tfar;
+
+        if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + offset) = ray.Ng.x; // each normal component is skipped when its pointer is null
+        if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + offset) = ray.Ng.y;
+        if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + offset) = ray.Ng.z;
+        *(float* __restrict__)((char*)u + offset) = ray.u;
+        *(float* __restrict__)((char*)v + offset) = ray.v;
+        *(unsigned int* __restrict__)((char*)geomID + offset) = ray.geomID;
+        *(unsigned int* __restrict__)((char*)primID + offset) = ray.primID;
+
+        if (likely(instID[0])) { // a null instID[0] skips all instancing levels; deeper pointers are not null-checked
+          *(unsigned int* __restrict__)((char*)instID[0] + offset) = ray.instID[0];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID; ++l) // stop after the first level holding the invalid ID
+            *(unsigned int* __restrict__)((char*)instID[l] + offset) = ray.instID[l];
+#endif
+        }
+      }
+    }
+
+    __forceinline void setHitByOffset(size_t offset, const Ray& ray) // Writes ray.tfar back unconditionally (no tfar < 0 test, unlike the packet overloads).
+    {
+      *(float* __restrict__)((char*)tfar + offset) = ray.tfar;
+    }
+
+    template<int K> // Masked storeu of the hit data of a K-wide packet starting `offset` bytes past each base pointer.
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayHitK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); // keep only input lanes whose geomID is not the invalid marker
+
+      if (likely(any(valid)))
+      {
+        vfloat<K>::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar);
+
+        if (likely(Ng_x)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_x + offset), ray.Ng.x);
+        if (likely(Ng_y)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_y + offset), ray.Ng.y);
+        if (likely(Ng_z)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_z + offset), ray.Ng.z);
+        vfloat<K>::storeu(valid, (float* __restrict__)((char*)u + offset), ray.u);
+        vfloat<K>::storeu(valid, (float* __restrict__)((char*)v + offset), ray.v);
+        vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)primID + offset), ray.primID);
+        vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)geomID + offset), ray.geomID);
+
+        if (likely(instID[0])) { // a null instID[0] skips all instancing levels
+          vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instID[0] + offset), ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) // continue while any hit lane has a valid ID one level up
+            vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instID[l] + offset), ray.instID[l]);
+#endif
+        }
+      }
+    }
+
+    template<int K> // Masked storeu of tfar for input lanes whose tfar is negative.
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f);
+
+      if (likely(any(valid)))
+        vfloat<K>::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar);
+    }
+
+    __forceinline size_t getOctantByOffset(size_t offset) // Returns a 3-bit code: bit 0/1/2 is set when dir x/y/z at `offset` is negative.
+    {
+      const float dx = *(float* __restrict__)((char*)dir_x + offset);
+      const float dy = *(float* __restrict__)((char*)dir_y + offset);
+      const float dz = *(float* __restrict__)((char*)dir_z + offset);
+      const size_t octantID = (dx < 0.0f ? 1 : 0) + (dy < 0.0f ? 2 : 0) + (dz < 0.0f ? 4 : 0);
+      return octantID;
+    }
+
+    __forceinline bool isValidByOffset(size_t offset) // True when tnear <= tfar for the ray at `offset`; a null tnear pointer counts as 0.
+    {
+      const float nnear = tnear ? *(float* __restrict__)((char*)tnear + offset) : 0.0f;
+      const float ffar  = *(float* __restrict__)((char*)tfar + offset);
+      return nnear <= ffar;
+    }
+
+    template<int K> // Packet form of the tnear <= tfar test; the result is not combined with `valid` here.
+    __forceinline vbool<K> isValidByOffset(const vbool<K>& valid, size_t offset)
+    {
+      const vfloat<K> nnear = tnear ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)tnear + offset)) : 0.0f;
+      const vfloat<K> ffar  = vfloat<K>::loadu(valid, (float* __restrict__)((char*)tfar + offset));
+      return nnear <= ffar;
+    }
+
+    template<int K> // Loads a K-wide packet in which every lane uses its own byte offset from `offset`.
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset)
+    {
+      RayK<K> ray;
+
+#if defined(__AVX2__)
+      ray.org.x   = vfloat<K>::template gather<1>(valid, org_x, offset); // masked gather; NOTE(review): <1> is presumably the gather scale — confirm
+      ray.org.y   = vfloat<K>::template gather<1>(valid, org_y, offset);
+      ray.org.z   = vfloat<K>::template gather<1>(valid, org_z, offset);
+      ray.dir.x   = vfloat<K>::template gather<1>(valid, dir_x, offset);
+      ray.dir.y   = vfloat<K>::template gather<1>(valid, dir_y, offset);
+      ray.dir.z   = vfloat<K>::template gather<1>(valid, dir_z, offset);
+      ray.tfar    = vfloat<K>::template gather<1>(valid, tfar, offset);
+      ray.tnear() = tnear ? vfloat<K>::template gather<1>(valid, tnear, offset) : vfloat<K>(zero); // null pointers get the same defaults as the scalar overload
+      ray.time()  = time ? vfloat<K>::template gather<1>(valid, time, offset) : vfloat<K>(zero);
+      ray.mask    = mask ? vint<K>::template gather<1>(valid, (int*)mask, offset) : vint<K>(-1);
+      ray.id      = id ? vint<K>::template gather<1>(valid, (int*)id, offset) : vint<K>(-1);
+      ray.flags   = flags ? vint<K>::template gather<1>(valid, (int*)flags, offset) : vint<K>(-1);
+#else
+      ray.org     = zero; // path without AVX2: clear every member so lanes outside `valid` read as zero
+      ray.tnear() = zero;
+      ray.dir     = zero;
+      ray.tfar    = zero;
+      ray.time()  = zero;
+      ray.mask    = zero;
+      ray.id      = zero;
+      ray.flags   = zero;
+
+      for (size_t k = 0; k < K; k++)
+      {
+        if (likely(valid[k])) // scalar loads, one active lane at a time
+        {
+          const size_t ofs = offset[k]; // this lane's own byte offset
+
+          ray.org.x[k]   = *(float* __restrict__)((char*)org_x + ofs);
+          ray.org.y[k]   = *(float* __restrict__)((char*)org_y + ofs);
+          ray.org.z[k]   = *(float* __restrict__)((char*)org_z + ofs);
+          ray.dir.x[k]   = *(float* __restrict__)((char*)dir_x + ofs);
+          ray.dir.y[k]   = *(float* __restrict__)((char*)dir_y + ofs);
+          ray.dir.z[k]   = *(float* __restrict__)((char*)dir_z + ofs);
+          ray.tfar[k]  = *(float* __restrict__)((char*)tfar + ofs);
+          ray.tnear()[k] = tnear ? *(float* __restrict__)((char*)tnear + ofs) : 0.0f;
+          ray.time()[k]  = time ? *(float* __restrict__)((char*)time + ofs) : 0.0f;
+          ray.mask[k]    = mask ? *(int* __restrict__)((char*)mask + ofs) : -1;
+          ray.id[k]      = id ? *(int* __restrict__)((char*)id + ofs) : -1;
+          ray.flags[k]   = flags ? *(int* __restrict__)((char*)flags + ofs) : -1;
+        }
+      }
+#endif
+
+      return ray;
+    }
+
+    template<int K> // Writes the hit data of a K-wide packet, every lane at its own byte offset from `offset`.
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); // keep only input lanes whose geomID is not the invalid marker
+
+      if (likely(any(valid)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(valid, tfar, offset, ray.tfar);
+
+        if (likely(Ng_x)) vfloat<K>::template scatter<1>(valid, Ng_x, offset, ray.Ng.x);
+        if (likely(Ng_y)) vfloat<K>::template scatter<1>(valid, Ng_y, offset, ray.Ng.y);
+        if (likely(Ng_z)) vfloat<K>::template scatter<1>(valid, Ng_z, offset, ray.Ng.z);
+        vfloat<K>::template scatter<1>(valid, u, offset, ray.u);
+        vfloat<K>::template scatter<1>(valid, v, offset, ray.v);
+        vuint<K>::template scatter<1>(valid, (unsigned int*)geomID, offset, ray.geomID);
+        vuint<K>::template scatter<1>(valid, (unsigned int*)primID, offset, ray.primID);
+
+        if (likely(instID[0])) { // a null instID[0] skips all instancing levels
+          vuint<K>::template scatter<1>(valid, (unsigned int*)instID[0], offset, ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) // continue while any hit lane has a valid ID one level up
+            vuint<K>::template scatter<1>(valid, (unsigned int*)instID[l], offset, ray.instID[l]);
+#endif
+        }
+#else
+        size_t valid_bits = movemask(valid); // one bit per hit lane
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits); // NOTE(review): bscf presumably returns a set bit's index and clears it in valid_bits; the loop relies on that to terminate
+          const size_t ofs = offset[k];
+
+          *(float* __restrict__)((char*)tfar + ofs) = ray.tfar[k];
+
+          if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + ofs) = ray.Ng.x[k];
+          if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + ofs) = ray.Ng.y[k];
+          if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + ofs) = ray.Ng.z[k];
+          *(float* __restrict__)((char*)u + ofs) = ray.u[k];
+          *(float* __restrict__)((char*)v + ofs) = ray.v[k];
+          *(unsigned int* __restrict__)((char*)primID + ofs) = ray.primID[k];
+          *(unsigned int* __restrict__)((char*)geomID + ofs) = ray.geomID[k];
+
+          if (likely(instID[0])) {
+            *(unsigned int* __restrict__)((char*)instID[0] + ofs) = ray.instID[0][k];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+            for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) // stop after the first level holding the invalid ID
+              *(unsigned int* __restrict__)((char*)instID[l] + ofs) = ray.instID[l][k];
+#endif
+          }
+        }
+#endif
+      }
+    }
+
+    template<int K> // Writes tfar back for input lanes whose tfar is negative, every lane at its own byte offset.
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f);
+
+      if (likely(any(valid)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(valid, tfar, offset, ray.tfar);
+#else
+        size_t valid_bits = movemask(valid); // one bit per lane to write
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits); // NOTE(review): bscf presumably also clears the returned bit in valid_bits — confirm
+          const size_t ofs = offset[k];
+
+          *(float* __restrict__)((char*)tfar + ofs) = ray.tfar[k];
+        }
+#endif
+      }
+    }
+
+    /* ray data */
+    float* __restrict__ org_x; // x coordinate of ray origin
+    float* __restrict__ org_y; // y coordinate of ray origin
+    float* __restrict__ org_z; // z coordinate of ray origin
+    float* __restrict__ tnear; // start of ray segment (optional)
+
+    float* __restrict__ dir_x; // x coordinate of ray direction
+    float* __restrict__ dir_y; // y coordinate of ray direction
+    float* __restrict__ dir_z; // z coordinate of ray direction
+    float* __restrict__ time;         // time of this ray for motion blur (optional)
+
+    float* __restrict__ tfar;  // end of ray segment (set to hit distance)
+    unsigned int* __restrict__ mask;  // used to mask out objects during traversal (optional)
+    unsigned int* __restrict__ id;    // ray ID
+    unsigned int* __restrict__ flags; // ray flags
+
+    /* hit data */
+    float* __restrict__ Ng_x; // x coordinate of geometry normal (optional)
+    float* __restrict__ Ng_y; // y coordinate of geometry normal (optional)
+    float* __restrict__ Ng_z; // z coordinate of geometry normal (optional)
+
+    float* __restrict__ u;    // barycentric u coordinate of hit
+    float* __restrict__ v;    // barycentric v coordinate of hit
+
+    unsigned int* __restrict__ primID; // primitive ID
+    unsigned int* __restrict__ geomID; // geometry ID
+    unsigned int* __restrict__ instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID (optional)
+  };
+
+
+  struct RayStreamAOS // Ray stream addressed through a single Ray pointer; individual rays are found by adding a byte offset to `ptr`.
+  {
+    __forceinline RayStreamAOS(void* rays)
+      : ptr((Ray*)rays) {}
+
+    __forceinline Ray& getRayByOffset(size_t offset) // Returns a reference to the ray located `offset` bytes past `ptr`.
+    {
+      return *(Ray*)((char*)ptr + offset);
+    }
+
+    template<int K> // Packet load with per-lane byte offsets; only declared here, defined through explicit specialisations.
+    __forceinline RayK<K> getRayByOffset(const vint<K>& offset);
+
+    template<int K> // Masked packet load: lanes outside `valid` are redirected to offset 0 before the unmasked load runs.
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset)
+    {
+      const vint<K> valid_offset = select(valid, offset, vint<K>(zero)); // the zero vector must be K lanes wide to match `offset`
+      return getRayByOffset(valid_offset);
+    }
+
+    template<int K> // Writes the hit data of a K-wide packet into the RayHit records at the per-lane byte offsets.
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); // keep only input lanes whose geomID is not the invalid marker
+
+      if (likely(any(valid)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar); // base address is the member inside the first record; `offset` selects the record
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.x, offset, ray.Ng.x); // hit members are reached by viewing `ptr` as RayHit*
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.y, offset, ray.Ng.y);
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.z, offset, ray.Ng.z);
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->u, offset, ray.u);
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->v, offset, ray.v);
+        vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->primID, offset, ray.primID);
+        vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->geomID, offset, ray.geomID);
+
+        vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[0], offset, ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+        for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) // continue while any hit lane has a valid ID one level up
+          vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[l], offset, ray.instID[l]);
+#endif
+#else
+        size_t valid_bits = movemask(valid); // one bit per hit lane
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits); // NOTE(review): bscf presumably returns a set bit's index and clears it in valid_bits; the loop relies on that to terminate
+          RayHit* __restrict__ ray_k = (RayHit*)((char*)ptr + offset[k]);
+          ray_k->tfar   = ray.tfar[k];
+          ray_k->Ng.x   = ray.Ng.x[k];
+          ray_k->Ng.y   = ray.Ng.y[k];
+          ray_k->Ng.z   = ray.Ng.z[k];
+          ray_k->u      = ray.u[k];
+          ray_k->v      = ray.v[k];
+          ray_k->primID = ray.primID[k];
+          ray_k->geomID = ray.geomID[k];
+
+          instance_id_stack::copy(ray.instID, ray_k->instID, k); // instance IDs of lane k are copied by the shared helper
+        }
+#endif
+      }
+    }
+
+    template<int K> // Writes tfar back for input lanes whose tfar is negative.
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f);
+
+      if (likely(any(valid)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar);
+#else
+        size_t valid_bits = movemask(valid); // one bit per lane to write
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits); // NOTE(review): bscf presumably also clears the returned bit in valid_bits — confirm
+          Ray* __restrict__ ray_k = (Ray*)((char*)ptr + offset[k]);
+          ray_k->tfar = ray.tfar[k];
+        }
+#endif
+      }
+    }
+
+    Ray* __restrict__ ptr; // start of the ray data; all offsets are relative to this address
+  };
+
+  template<> // 4-wide specialisation: builds a Ray4 from the four Ray records at byte offsets offset[0..3] from `ptr`.
+  __forceinline Ray4 RayStreamAOS::getRayByOffset(const vint4& offset)
+  {
+    Ray4 ray;
+
+    /* load and transpose: org.x, org.y, org.z, tnear */
+    const vfloat4 a0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->org); // 4-float load starting at `org` of ray 0
+    const vfloat4 a1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->org);
+    const vfloat4 a2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->org);
+    const vfloat4 a3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->org);
+
+    transpose(a0,a1,a2,a3, ray.org.x, ray.org.y, ray.org.z, ray.tnear()); // one row per ray in, one vector per component out
+
+    /* load and transpose: dir.x, dir.y, dir.z, time */
+    const vfloat4 b0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->dir);
+    const vfloat4 b1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->dir);
+    const vfloat4 b2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->dir);
+    const vfloat4 b3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->dir);
+
+    transpose(b0,b1,b2,b3, ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+    /* load and transpose: tfar, mask, id, flags */
+    const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->tfar);
+    const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->tfar);
+    const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->tfar);
+    const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->tfar);
+
+    vfloat4 maskf, idf, flagsf; // integer members pass through the float transpose first
+    transpose(c0,c1,c2,c3, ray.tfar, maskf, idf, flagsf);
+    ray.mask  = asInt(maskf); // NOTE(review): asInt presumably reinterprets the bits rather than converting the value — confirm
+    ray.id    = asInt(idf);
+    ray.flags = asInt(flagsf);
+
+    return ray;
+  }
+
+#if defined(__AVX__)
+  template<> // 8-wide specialisation: builds a Ray8 from the eight Ray records at byte offsets offset[0..7] from `ptr`.
+  __forceinline Ray8 RayStreamAOS::getRayByOffset(const vint8& offset)
+  {
+    Ray8 ray;
+
+    /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+    const vfloat8 ab0 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[0]))->org); // 8-float load starting at `org`; assumes the eight members listed above are contiguous in Ray
+    const vfloat8 ab1 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[1]))->org);
+    const vfloat8 ab2 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[2]))->org);
+    const vfloat8 ab3 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[3]))->org);
+    const vfloat8 ab4 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[4]))->org);
+    const vfloat8 ab5 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[5]))->org);
+    const vfloat8 ab6 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[6]))->org);
+    const vfloat8 ab7 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[7]))->org);
+
+    transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7, ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+    /* load and transpose: tfar, mask, id, flags */
+    const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->tfar); // the remaining four members are loaded as 4-wide rows
+    const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->tfar);
+    const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->tfar);
+    const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->tfar);
+    const vfloat4 c4 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[4]))->tfar);
+    const vfloat4 c5 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[5]))->tfar);
+    const vfloat4 c6 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[6]))->tfar);
+    const vfloat4 c7 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[7]))->tfar);
+
+    vfloat8 maskf, idf, flagsf; // integer members pass through the float transpose first
+    transpose(c0,c1,c2,c3,c4,c5,c6,c7, ray.tfar, maskf, idf, flagsf);
+    ray.mask  = asInt(maskf); // NOTE(review): asInt presumably reinterprets the bits rather than converting the value — confirm
+    ray.id    = asInt(idf);
+    ray.flags = asInt(flagsf);
+
+    return ray;
+  }
+#endif
+
+#if defined(__AVX512F__)
+  template<> // 16-wide specialisation: builds a Ray16 from the sixteen Ray records at byte offsets offset[0..15] from `ptr`.
+  __forceinline Ray16 RayStreamAOS::getRayByOffset(const vint16& offset)
+  {
+    Ray16 ray;
+
+    /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+    const vfloat8 ab0  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 0]))->org); // 8-float load starting at `org`; assumes the eight members listed above are contiguous in Ray
+    const vfloat8 ab1  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 1]))->org);
+    const vfloat8 ab2  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 2]))->org);
+    const vfloat8 ab3  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 3]))->org);
+    const vfloat8 ab4  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 4]))->org);
+    const vfloat8 ab5  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 5]))->org);
+    const vfloat8 ab6  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 6]))->org);
+    const vfloat8 ab7  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 7]))->org);
+    const vfloat8 ab8  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 8]))->org);
+    const vfloat8 ab9  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 9]))->org);
+    const vfloat8 ab10 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[10]))->org);
+    const vfloat8 ab11 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[11]))->org);
+    const vfloat8 ab12 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[12]))->org);
+    const vfloat8 ab13 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[13]))->org);
+    const vfloat8 ab14 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[14]))->org);
+    const vfloat8 ab15 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[15]))->org);
+
+    transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7,ab8,ab9,ab10,ab11,ab12,ab13,ab14,ab15,
+              ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+    /* load and transpose: tfar, mask, id, flags */
+    const vfloat4 c0  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 0]))->tfar); // the remaining four members are loaded as 4-wide rows
+    const vfloat4 c1  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 1]))->tfar);
+    const vfloat4 c2  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 2]))->tfar);
+    const vfloat4 c3  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 3]))->tfar);
+    const vfloat4 c4  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 4]))->tfar);
+    const vfloat4 c5  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 5]))->tfar);
+    const vfloat4 c6  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 6]))->tfar);
+    const vfloat4 c7  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 7]))->tfar);
+    const vfloat4 c8  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 8]))->tfar);
+    const vfloat4 c9  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 9]))->tfar);
+    const vfloat4 c10 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[10]))->tfar);
+    const vfloat4 c11 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[11]))->tfar);
+    const vfloat4 c12 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[12]))->tfar);
+    const vfloat4 c13 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[13]))->tfar);
+    const vfloat4 c14 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[14]))->tfar);
+    const vfloat4 c15 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[15]))->tfar);
+
+    vfloat16 maskf, idf, flagsf; // integer members pass through the float transpose first
+    transpose(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,
+              ray.tfar, maskf, idf, flagsf);
+    ray.mask  = asInt(maskf); // NOTE(review): asInt presumably reinterprets the bits rather than converting the value — confirm
+    ray.id    = asInt(idf);
+    ray.flags = asInt(flagsf);
+
+    return ray;
+  }
+#endif
+
+
+  /* Accessor for a ray stream stored as an array of pointers: wraps a Ray**
+   * and reads/writes individual rays through the stored pointers. */
+  struct RayStreamAOP
+  {
+    /* rays is reinterpreted as an array of Ray pointers */
+    __forceinline RayStreamAOP(void* rays)
+      : ptr((Ray**)rays) {}
+
+    /* returns a reference to the ray that entry `index` of the array points to */
+    __forceinline Ray& getRayByIndex(size_t index)
+    {
+      return *ptr[index];
+    }
+
+    /* gathers K rays selected by `index` into one packet; only declared here,
+     * the per-width specializations are defined after the struct */
+    template<int K>
+    __forceinline RayK<K> getRayByIndex(const vint<K>& index);
+
+    /* masked gather: the index vector is passed through select() with a zero
+     * vector as the alternative before the unmasked gather is called.
+     * NOTE(review): the zero vector is built as vintx, not vint<K> - confirm
+     * this is intended when K differs from the native width. */
+    template<int K>
+    __forceinline RayK<K> getRayByIndex(const vbool<K>& valid, const vint<K>& index)
+    {
+      const vint<K> valid_index = select(valid, index, vintx(zero));
+      return getRayByIndex(valid_index);
+    }
+
+    /* writes hit data of packet `ray` back to the individual rays, but only for
+     * lanes that are set in valid_i and whose geomID is not
+     * RTC_INVALID_GEOMETRY_ID */
+    template<int K>
+    __forceinline void setHitByIndex(const vbool<K>& valid_i, const vint<K>& index, const RayHitK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+      if (likely(any(valid)))
+      {
+        size_t valid_bits = movemask(valid);
+        /* one iteration per remaining bit; bscf presumably returns the index
+         * of a set bit and clears it in valid_bits - confirm */
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          /* the stored Ray* is treated as a RayHit* so the hit fields can be written */
+          RayHit* __restrict__ ray_k = (RayHit*)ptr[index[k]];
+
+          ray_k->tfar = ray.tfar[k];
+          ray_k->Ng.x   = ray.Ng.x[k];
+          ray_k->Ng.y   = ray.Ng.y[k];
+          ray_k->Ng.z   = ray.Ng.z[k];
+          ray_k->u      = ray.u[k];
+          ray_k->v      = ray.v[k];
+          ray_k->primID = ray.primID[k];
+          ray_k->geomID = ray.geomID[k];
+          instance_id_stack::copy(ray.instID, ray_k->instID, k);
+        }
+      }
+    }
+
+    /* ray-only variant: writes back just tfar, and only for lanes that are set
+     * in valid_i and whose packet tfar is negative.
+     * NOTE(review): a negative tfar presumably marks an occluded ray - confirm */
+    template<int K>
+    __forceinline void setHitByIndex(const vbool<K>& valid_i, const vint<K>& index, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f);
+
+      if (likely(any(valid)))
+      {
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          Ray* __restrict__ ray_k = ptr[index[k]];
+
+          ray_k->tfar = ray.tfar[k];
+        }
+      }
+    }
+
+    /* array of pointers to the individual rays of the stream */
+    Ray** __restrict__ ptr;
+  };
+
+  /* 4-wide gather: loads three groups of four values from each of the four
+   * rays selected by `index` (starting at org, dir and tfar respectively) and
+   * transposes each group into the per-component vectors of a Ray4. */
+  template<>
+  __forceinline Ray4 RayStreamAOP::getRayByIndex(const vint4& index)
+  {
+    Ray4 ray;
+
+    /* load and transpose: org.x, org.y, org.z, tnear */
+    const vfloat4 a0 = vfloat4::loadu(&ptr[index[0]]->org);
+    const vfloat4 a1 = vfloat4::loadu(&ptr[index[1]]->org);
+    const vfloat4 a2 = vfloat4::loadu(&ptr[index[2]]->org);
+    const vfloat4 a3 = vfloat4::loadu(&ptr[index[3]]->org);
+
+    transpose(a0,a1,a2,a3, ray.org.x, ray.org.y, ray.org.z, ray.tnear());
+
+    /* load and transpose: dir.x, dir.y, dir.z, time */
+    const vfloat4 b0 = vfloat4::loadu(&ptr[index[0]]->dir);
+    const vfloat4 b1 = vfloat4::loadu(&ptr[index[1]]->dir);
+    const vfloat4 b2 = vfloat4::loadu(&ptr[index[2]]->dir);
+    const vfloat4 b3 = vfloat4::loadu(&ptr[index[3]]->dir);
+
+    transpose(b0,b1,b2,b3, ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+    /* load and transpose: tfar, mask, id, flags */
+    const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar);
+    const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar);
+    const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar);
+    const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar);
+
+    /* mask, id and flags travel through the float transpose and are converted
+     * back with asInt afterwards (presumably a bit reinterpretation - confirm) */
+    vfloat4 maskf, idf, flagsf;
+    transpose(c0,c1,c2,c3, ray.tfar, maskf, idf, flagsf);
+    ray.mask  = asInt(maskf);
+    ray.id    = asInt(idf);
+    ray.flags = asInt(flagsf);
+
+    return ray;
+  }
+
+#if defined(__AVX__)
+  /* 8-wide gather (compiled only when __AVX__ is defined): for each of the
+   * eight rays selected by `index`, one 8-float load starting at org covers
+   * org/tnear/dir/time, and one 4-float load starting at tfar covers
+   * tfar/mask/id/flags; both groups are transposed into a Ray8. */
+  template<>
+  __forceinline Ray8 RayStreamAOP::getRayByIndex(const vint8& index)
+  {
+    Ray8 ray;
+
+    /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+    const vfloat8 ab0 = vfloat8::loadu(&ptr[index[0]]->org);
+    const vfloat8 ab1 = vfloat8::loadu(&ptr[index[1]]->org);
+    const vfloat8 ab2 = vfloat8::loadu(&ptr[index[2]]->org);
+    const vfloat8 ab3 = vfloat8::loadu(&ptr[index[3]]->org);
+    const vfloat8 ab4 = vfloat8::loadu(&ptr[index[4]]->org);
+    const vfloat8 ab5 = vfloat8::loadu(&ptr[index[5]]->org);
+    const vfloat8 ab6 = vfloat8::loadu(&ptr[index[6]]->org);
+    const vfloat8 ab7 = vfloat8::loadu(&ptr[index[7]]->org);
+
+    transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7, ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+    /* load and transpose: tfar, mask, id, flags */
+    const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar);
+    const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar);
+    const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar);
+    const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar);
+    const vfloat4 c4 = vfloat4::loadu(&ptr[index[4]]->tfar);
+    const vfloat4 c5 = vfloat4::loadu(&ptr[index[5]]->tfar);
+    const vfloat4 c6 = vfloat4::loadu(&ptr[index[6]]->tfar);
+    const vfloat4 c7 = vfloat4::loadu(&ptr[index[7]]->tfar);
+
+    /* mask, id and flags travel through the float transpose and are converted
+     * back with asInt afterwards (presumably a bit reinterpretation - confirm) */
+    vfloat8 maskf, idf, flagsf;
+    transpose(c0,c1,c2,c3,c4,c5,c6,c7, ray.tfar, maskf, idf, flagsf);
+    ray.mask  = asInt(maskf);
+    ray.id    = asInt(idf);
+    ray.flags = asInt(flagsf);
+
+    return ray;
+  }
+#endif
+
+#if defined(__AVX512F__)
+  /* 16-wide gather (compiled only when __AVX512F__ is defined): for each of
+   * the sixteen rays selected by `index`, one 8-float load starting at org
+   * covers org/tnear/dir/time, and one 4-float load starting at tfar covers
+   * tfar/mask/id/flags; both groups are transposed into a Ray16. */
+  template<>
+  __forceinline Ray16 RayStreamAOP::getRayByIndex(const vint16& index)
+  {
+    Ray16 ray;
+
+    /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+    const vfloat8 ab0  = vfloat8::loadu(&ptr[index[0]]->org);
+    const vfloat8 ab1  = vfloat8::loadu(&ptr[index[1]]->org);
+    const vfloat8 ab2  = vfloat8::loadu(&ptr[index[2]]->org);
+    const vfloat8 ab3  = vfloat8::loadu(&ptr[index[3]]->org);
+    const vfloat8 ab4  = vfloat8::loadu(&ptr[index[4]]->org);
+    const vfloat8 ab5  = vfloat8::loadu(&ptr[index[5]]->org);
+    const vfloat8 ab6  = vfloat8::loadu(&ptr[index[6]]->org);
+    const vfloat8 ab7  = vfloat8::loadu(&ptr[index[7]]->org);
+    const vfloat8 ab8  = vfloat8::loadu(&ptr[index[8]]->org);
+    const vfloat8 ab9  = vfloat8::loadu(&ptr[index[9]]->org);
+    const vfloat8 ab10 = vfloat8::loadu(&ptr[index[10]]->org);
+    const vfloat8 ab11 = vfloat8::loadu(&ptr[index[11]]->org);
+    const vfloat8 ab12 = vfloat8::loadu(&ptr[index[12]]->org);
+    const vfloat8 ab13 = vfloat8::loadu(&ptr[index[13]]->org);
+    const vfloat8 ab14 = vfloat8::loadu(&ptr[index[14]]->org);
+    const vfloat8 ab15 = vfloat8::loadu(&ptr[index[15]]->org);
+
+    transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7,ab8,ab9,ab10,ab11,ab12,ab13,ab14,ab15,
+              ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+    /* load and transpose: tfar, mask, id, flags */
+    const vfloat4 c0  = vfloat4::loadu(&ptr[index[0]]->tfar);
+    const vfloat4 c1  = vfloat4::loadu(&ptr[index[1]]->tfar);
+    const vfloat4 c2  = vfloat4::loadu(&ptr[index[2]]->tfar);
+    const vfloat4 c3  = vfloat4::loadu(&ptr[index[3]]->tfar);
+    const vfloat4 c4  = vfloat4::loadu(&ptr[index[4]]->tfar);
+    const vfloat4 c5  = vfloat4::loadu(&ptr[index[5]]->tfar);
+    const vfloat4 c6  = vfloat4::loadu(&ptr[index[6]]->tfar);
+    const vfloat4 c7  = vfloat4::loadu(&ptr[index[7]]->tfar);
+    const vfloat4 c8  = vfloat4::loadu(&ptr[index[8]]->tfar);
+    const vfloat4 c9  = vfloat4::loadu(&ptr[index[9]]->tfar);
+    const vfloat4 c10 = vfloat4::loadu(&ptr[index[10]]->tfar);
+    const vfloat4 c11 = vfloat4::loadu(&ptr[index[11]]->tfar);
+    const vfloat4 c12 = vfloat4::loadu(&ptr[index[12]]->tfar);
+    const vfloat4 c13 = vfloat4::loadu(&ptr[index[13]]->tfar);
+    const vfloat4 c14 = vfloat4::loadu(&ptr[index[14]]->tfar);
+    const vfloat4 c15 = vfloat4::loadu(&ptr[index[15]]->tfar);
+
+    /* mask, id and flags travel through the float transpose and are converted
+     * back with asInt afterwards (presumably a bit reinterpretation - confirm) */
+    vfloat16 maskf, idf, flagsf;
+    transpose(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,
+              ray.tfar, maskf, idf, flagsf);
+
+    ray.mask  = asInt(maskf);
+    ray.id    = asInt(idf);
+    ray.flags = asInt(flagsf);
+
+    return ray;
+  }
+#endif
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.cpp b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp
new file mode 100644
index 0000000000..625fbf6d4f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp
@@ -0,0 +1,1799 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#define RTC_EXPORT_API
+
+#include "default.h"
+#include "device.h"
+#include "scene.h"
+#include "context.h"
+#include "../../include/embree3/rtcore_ray.h"
+
+#if defined(__aarch64__) && defined(BUILD_IOS)
+#include <mutex>
+#endif
+
+using namespace embree;
+
+RTC_NAMESPACE_BEGIN;
+
+  /* mutex to make API thread safe: one file-scope lock shared by the API entry
+   * points in this file that acquire it. Builds with both __aarch64__ and
+   * BUILD_IOS defined use std::mutex, all other builds use MutexSys. */
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    static std::mutex g_mutex;
+#else
+    static MutexSys g_mutex;
+#endif
+
+  /* Constructs a Device from the configuration string while holding the
+   * global API mutex and returns it with one reference taken for the caller.
+   * The statement after RTC_CATCH_END yields a null handle. */
+  RTC_API RTCDevice rtcNewDevice(const char* config)
+  {
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewDevice);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock guard(g_mutex);
+#else
+    Lock<MutexSys> guard(g_mutex);
+#endif
+    Device* const created = new Device(config);
+    /* the reference added here is the one the caller owns */
+    return (RTCDevice) created->refInc();
+    RTC_CATCH_END(nullptr);
+    return (RTCDevice) nullptr;
+  }
+
+  /* Adds one reference to the device behind hdevice. The handle is verified
+   * first; the increment runs under the global API mutex. */
+  RTC_API void rtcRetainDevice(RTCDevice hdevice)
+  {
+    Device* const target = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcRetainDevice);
+    RTC_VERIFY_HANDLE(hdevice);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock guard(g_mutex);
+#else
+    Lock<MutexSys> guard(g_mutex);
+#endif
+    target->refInc();
+    RTC_CATCH_END(nullptr);
+  }
+  
+  /* Drops one reference from the device behind hdevice. The handle is
+   * verified first; the decrement runs under the global API mutex. */
+  RTC_API void rtcReleaseDevice(RTCDevice hdevice)
+  {
+    Device* const target = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcReleaseDevice);
+    RTC_VERIFY_HANDLE(hdevice);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock guard(g_mutex);
+#else
+    Lock<MutexSys> guard(g_mutex);
+#endif
+    target->refDec();
+    RTC_CATCH_END(nullptr);
+  }
+  
+  /* Reads property `prop` from the device behind hdevice under the global
+   * API mutex. The statement after RTC_CATCH_END returns 0. */
+  RTC_API ssize_t rtcGetDeviceProperty(RTCDevice hdevice, RTCDeviceProperty prop)
+  {
+    Device* const target = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetDeviceProperty);
+    RTC_VERIFY_HANDLE(hdevice);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock guard(g_mutex);
+#else
+    Lock<MutexSys> guard(g_mutex);
+#endif
+    return target->getProperty(prop);
+    RTC_CATCH_END(target);
+    return 0;
+  }
+
+  /* Writes `val` to property `prop` of the device behind hdevice under the
+   * global API mutex. Property ids in [1000000, 1000004) are special internal
+   * settings for which a NULL device handle is accepted. */
+  RTC_API void rtcSetDeviceProperty(RTCDevice hdevice, const RTCDeviceProperty prop, ssize_t val)
+  {
+    Device* const target = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetDeviceProperty);
+    const size_t prop_id = (size_t)prop;
+    const bool is_internal = prop_id >= 1000000 && prop_id < 1000004;
+    if (!is_internal) {
+      RTC_VERIFY_HANDLE(hdevice); // allow NULL device for special internal settings
+    }
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock guard(g_mutex);
+#else
+    Lock<MutexSys> guard(g_mutex);
+#endif
+    target->setProperty(prop,val);
+    RTC_CATCH_END(target);
+  }
+
+  RTC_API RTCError rtcGetDeviceError(RTCDevice hdevice)
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetDeviceError);
+    if (device == nullptr) return Device::getThreadErrorCode();
+    else                   return device->getDeviceErrorCode();
+    RTC_CATCH_END(device);
+    return RTC_ERROR_UNKNOWN;
+  }
+
+  RTC_API void rtcSetDeviceErrorFunction(RTCDevice hdevice, RTCErrorFunction error, void* userPtr)
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetDeviceErrorFunction);
+    RTC_VERIFY_HANDLE(hdevice);
+    device->setErrorFunction(error, userPtr);
+    RTC_CATCH_END(device);
+  }
+
+  /* Installs the memory monitor callback `memoryMonitor` together with its
+   * user pointer on the device behind hdevice. The handle is verified first,
+   * like in the other device setters, so a NULL handle is reported through
+   * the regular error path instead of being dereferenced. */
+  RTC_API void rtcSetDeviceMemoryMonitorFunction(RTCDevice hdevice, RTCMemoryMonitorFunction memoryMonitor, void* userPtr)
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetDeviceMemoryMonitorFunction);
+    RTC_VERIFY_HANDLE(hdevice);
+    device->setMemoryMonitorFunction(memoryMonitor, userPtr);
+    RTC_CATCH_END(device);
+  }
+
+  /* Creates a Buffer of byteSize bytes on the device behind hdevice and
+   * returns it with one reference taken for the caller. The statement after
+   * RTC_CATCH_END returns a null handle. */
+  RTC_API RTCBuffer rtcNewBuffer(RTCDevice hdevice, size_t byteSize)
+  {
+    Device* const owner = (Device*)hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewBuffer);
+    RTC_VERIFY_HANDLE(hdevice);
+    Buffer* const created = new Buffer(owner, byteSize);
+    return (RTCBuffer)created->refInc();
+    RTC_CATCH_END(owner);
+    return nullptr;
+  }
+
+  /* Creates a Buffer on the device behind hdevice, passing the caller's
+   * pointer `ptr` and byteSize to the Buffer constructor, and returns it with
+   * one reference taken for the caller. The statement after RTC_CATCH_END
+   * returns a null handle. */
+  RTC_API RTCBuffer rtcNewSharedBuffer(RTCDevice hdevice, void* ptr, size_t byteSize)
+  {
+    Device* const owner = (Device*)hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewSharedBuffer);
+    RTC_VERIFY_HANDLE(hdevice);
+    Buffer* const created = new Buffer(owner, byteSize, ptr);
+    return (RTCBuffer)created->refInc();
+    RTC_CATCH_END(owner);
+    return nullptr;
+  }
+
+  /* Returns the data pointer of the buffer behind hbuffer after verifying
+   * the handle. The statement after RTC_CATCH_END2 returns nullptr. */
+  RTC_API void* rtcGetBufferData(RTCBuffer hbuffer)
+  {
+    Buffer* const target = (Buffer*)hbuffer;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetBufferData);
+    RTC_VERIFY_HANDLE(hbuffer);
+    return target->data();
+    RTC_CATCH_END2(target);
+    return nullptr;
+  }
+
+  /* Adds one reference to the buffer behind hbuffer after verifying the handle. */
+  RTC_API void rtcRetainBuffer(RTCBuffer hbuffer)
+  {
+    Buffer* const target = (Buffer*)hbuffer;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcRetainBuffer);
+    RTC_VERIFY_HANDLE(hbuffer);
+    target->refInc();
+    RTC_CATCH_END2(target);
+  }
+  
+  /* Drops one reference from the buffer behind hbuffer after verifying the handle. */
+  RTC_API void rtcReleaseBuffer(RTCBuffer hbuffer)
+  {
+    Buffer* const target = (Buffer*)hbuffer;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcReleaseBuffer);
+    RTC_VERIFY_HANDLE(hbuffer);
+    target->refDec();
+    RTC_CATCH_END2(target);
+  }
+
+  /* Creates a Scene on the device behind hdevice and returns it with one
+   * reference taken for the caller. The statement after RTC_CATCH_END returns
+   * a null handle. */
+  RTC_API RTCScene rtcNewScene (RTCDevice hdevice)
+  {
+    Device* const owner = (Device*)hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewScene);
+    RTC_VERIFY_HANDLE(hdevice);
+    Scene* const created = new Scene(owner);
+    return (RTCScene) created->refInc();
+    RTC_CATCH_END(owner);
+    return nullptr;
+  }
+
+  /* Returns the device the scene belongs to. The device's reference count
+   * is incremented, so the caller owns one additional device reference. The
+   * statement after RTC_CATCH_END2 returns a null handle. */
+  RTC_API RTCDevice rtcGetSceneDevice(RTCScene hscene)
+  {
+    Scene* const target = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetSceneDevice);
+    RTC_VERIFY_HANDLE(hscene);
+    return (RTCDevice)target->device->refInc(); // user will own one additional device reference
+    RTC_CATCH_END2(target);
+    return (RTCDevice)nullptr;
+  }
+
+  /* Installs the progress monitor callback `progress` with its user pointer
+   * on the scene behind hscene. The handle is verified first; the update runs
+   * under the global API mutex. */
+  RTC_API void rtcSetSceneProgressMonitorFunction(RTCScene hscene, RTCProgressMonitorFunction progress, void* ptr)
+  {
+    Scene* const target = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetSceneProgressMonitorFunction);
+    RTC_VERIFY_HANDLE(hscene);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock guard(g_mutex);
+#else
+    Lock<MutexSys> guard(g_mutex);
+#endif
+    target->setProgressMonitorFunction(progress,ptr);
+    RTC_CATCH_END2(target);
+  }
+
+  /* Sets the build quality of the scene behind hscene. Only the LOW, MEDIUM
+   * and HIGH quality levels are accepted; any other value aborts the process
+   * (the Godot patch replaced the original exception with abort()). */
+  RTC_API void rtcSetSceneBuildQuality (RTCScene hscene, RTCBuildQuality quality)
+  {
+    Scene* const target = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetSceneBuildQuality);
+    RTC_VERIFY_HANDLE(hscene);
+    const bool supported_quality =
+        quality == RTC_BUILD_QUALITY_LOW ||
+        quality == RTC_BUILD_QUALITY_MEDIUM ||
+        quality == RTC_BUILD_QUALITY_HIGH;
+    if (!supported_quality)
+    {
+      // -- GODOT start --
+      // throw std::runtime_error("invalid build quality");
+      abort();
+      // -- GODOT end --
+    }
+    target->setBuildQuality(quality);
+    RTC_CATCH_END2(target);
+  }
+
+  /* Stores `flags` as the scene flags of the scene behind hscene after
+   * verifying the handle. */
+  RTC_API void rtcSetSceneFlags (RTCScene hscene, RTCSceneFlags flags)
+  {
+    Scene* const target = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetSceneFlags);
+    RTC_VERIFY_HANDLE(hscene);
+    target->setSceneFlags(flags);
+    RTC_CATCH_END2(target);
+  }
+
+  /* Returns the scene flags of the scene behind hscene after verifying the
+   * handle. The statement after RTC_CATCH_END2 returns RTC_SCENE_FLAG_NONE. */
+  RTC_API RTCSceneFlags rtcGetSceneFlags(RTCScene hscene)
+  {
+    Scene* const target = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetSceneFlags);
+    RTC_VERIFY_HANDLE(hscene);
+    return target->getSceneFlags();
+    RTC_CATCH_END2(target);
+    return RTC_SCENE_FLAG_NONE;
+  }
+  
+  /* Commits the scene behind hscene by calling Scene::commit with its
+   * argument set to false (rtcJoinCommitScene passes true instead). */
+  RTC_API void rtcCommitScene (RTCScene hscene)
+  {
+    Scene* const target = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcCommitScene);
+    RTC_VERIFY_HANDLE(hscene);
+    target->commit(false);
+    RTC_CATCH_END2(target);
+  }
+
+  /* Commits the scene behind hscene by calling Scene::commit with its
+   * argument set to true (rtcCommitScene passes false instead). */
+  RTC_API void rtcJoinCommitScene (RTCScene hscene)
+  {
+    Scene* const target = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcJoinCommitScene);
+    RTC_VERIFY_HANDLE(hscene);
+    target->commit(true);
+    RTC_CATCH_END2(target);
+  }
+
+  /* Copies the bounding box of a committed scene into *bounds_o; the two
+   * alignment padding fields are zeroed. Raises RTC_ERROR_INVALID_OPERATION
+   * when bounds_o is NULL (same check as rtcGetSceneLinearBounds) or when the
+   * scene has modifications that were not committed yet. */
+  RTC_API void rtcGetSceneBounds(RTCScene hscene, RTCBounds* bounds_o)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetSceneBounds);
+    RTC_VERIFY_HANDLE(hscene);
+    /* reject a NULL destination before it gets written below */
+    if (bounds_o == nullptr)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid destination pointer");
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    BBox3fa bounds = scene->bounds.bounds();
+    bounds_o->lower_x = bounds.lower.x;
+    bounds_o->lower_y = bounds.lower.y;
+    bounds_o->lower_z = bounds.lower.z;
+    bounds_o->align0  = 0;
+    bounds_o->upper_x = bounds.upper.x;
+    bounds_o->upper_y = bounds.upper.y;
+    bounds_o->upper_z = bounds.upper.z;
+    bounds_o->align1  = 0;
+    RTC_CATCH_END2(scene);
+  }
+
+  /* Copies the two bounding boxes (bounds0 and bounds1) of a committed
+   * scene's linear bounds into *bounds_o; all alignment padding fields are
+   * zeroed. Raises RTC_ERROR_INVALID_OPERATION when bounds_o is NULL or when
+   * the scene has modifications that were not committed yet. */
+  RTC_API void rtcGetSceneLinearBounds(RTCScene hscene, RTCLinearBounds* bounds_o)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    /* trace under this function's own name (was copy-pasted as rtcGetSceneBounds) */
+    RTC_TRACE(rtcGetSceneLinearBounds);
+    RTC_VERIFY_HANDLE(hscene);
+    if (bounds_o == nullptr)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid destination pointer");
+    if (scene->isModified())
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    
+    bounds_o->bounds0.lower_x = scene->bounds.bounds0.lower.x;
+    bounds_o->bounds0.lower_y = scene->bounds.bounds0.lower.y;
+    bounds_o->bounds0.lower_z = scene->bounds.bounds0.lower.z;
+    bounds_o->bounds0.align0  = 0;
+    bounds_o->bounds0.upper_x = scene->bounds.bounds0.upper.x;
+    bounds_o->bounds0.upper_y = scene->bounds.bounds0.upper.y;
+    bounds_o->bounds0.upper_z = scene->bounds.bounds0.upper.z;
+    bounds_o->bounds0.align1  = 0;
+    bounds_o->bounds1.lower_x = scene->bounds.bounds1.lower.x;
+    bounds_o->bounds1.lower_y = scene->bounds.bounds1.lower.y;
+    bounds_o->bounds1.lower_z = scene->bounds.bounds1.lower.z;
+    bounds_o->bounds1.align0  = 0;
+    bounds_o->bounds1.upper_x = scene->bounds.bounds1.upper.x;
+    bounds_o->bounds1.upper_y = scene->bounds.bounds1.upper.y;
+    bounds_o->bounds1.upper_z = scene->bounds.bounds1.upper.z;
+    bounds_o->bounds1.align1  = 0;
+    RTC_CATCH_END2(scene);
+  }
+
+  /* Runs the collider of scene0 on the pair (scene0, scene1), handing
+   * `callback` and `userPtr` through to it. In DEBUG builds the arguments are
+   * validated first: both handles must be valid, both scenes committed, both
+   * created on the same device, and both must consist of user geometries only. */
+  RTC_API void rtcCollide (RTCScene hscene0, RTCScene hscene1, RTCCollideFunc callback, void* userPtr)
+  {
+    Scene* scene0 = (Scene*) hscene0;
+    Scene* scene1 = (Scene*) hscene1;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcCollide);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene0);
+    RTC_VERIFY_HANDLE(hscene1);
+    if (scene0->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (scene1->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (scene0->device != scene1->device) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes are from different devices");
+    auto nUserPrims0 = scene0->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false);
+    auto nUserPrims1 = scene1->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false);
+    /* a single offending scene is enough to reject the call, hence || (the
+     * former && only fired when both scenes held non-user geometry) */
+    if (scene0->numPrimitives() != nUserPrims0 || scene1->numPrimitives() != nUserPrims1) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes must only contain user geometries with a single timestep");
+#endif
+    scene0->intersectors.collide(scene0,scene1,callback,userPtr);
+    RTC_CATCH_END(scene0->device);
+  }
+  
+  /* Runs one point query against `scene` and returns what the scene's point
+   * query intersector returns. When the user context carries a non-empty
+   * instance stack, the query point is transformed with the topmost
+   * world2inst matrix and the radius is scaled by the similarity scale of
+   * that transform; the query is treated as a sphere if the transform is a
+   * similarity transform and as an AABB otherwise. */
+  inline bool pointQuery(Scene* scene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr)
+  {
+    if (userContext->instStackSize > 0)
+    {
+      /* topmost entry of the instance stack */
+      const AffineSpace3fa world2inst = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[userContext->instStackSize-1]);
+
+      float scale = 0.f;
+      const bool isSimilarity = similarityTransform(world2inst, &scale);
+      assert((isSimilarity && scale > 0) || (!isSimilarity && scale == 0.f));
+
+      /* query expressed in instance space */
+      PointQuery instQuery;
+      instQuery.p = xfmPoint(world2inst, Vec3fa(query->x, query->y, query->z));
+      instQuery.radius = query->radius * scale;
+      instQuery.time = query->time;
+
+      const auto queryType = isSimilarity ? POINT_QUERY_TYPE_SPHERE : POINT_QUERY_TYPE_AABB;
+      PointQueryContext instContext(scene, (PointQuery*)query, queryType,
+        queryFunc, userContext, scale, userPtr);
+      return scene->intersectors.pointQuery((PointQuery*)&instQuery, &instContext);
+    }
+
+    /* empty instance stack: query the scene directly with unit scale */
+    PointQueryContext worldContext(scene, (PointQuery*)query,
+      POINT_QUERY_TYPE_SPHERE, queryFunc, userContext, 1.f, userPtr);
+    return scene->intersectors.pointQuery((PointQuery*)query, &worldContext);
+  }
+
+  /* Single point query entry point: forwards to pointQuery(). DEBUG builds
+   * additionally verify the scene and context handles, that the scene is
+   * committed, and that query and context are 16-byte aligned. */
+  RTC_API bool rtcPointQuery(RTCScene hscene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr)
+  {
+    Scene* const target = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcPointQuery);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_HANDLE(userContext);
+    if (target->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");
+    if (((size_t)userContext) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "context not aligned to 16 bytes");
+#endif
+
+    return pointQuery(target, query, userContext, queryFunc, userPtr);
+    RTC_CATCH_END2_FALSE(target);
+  }
+  
+  /* 4-wide point query: every lane with a non-zero valid[i] is extracted
+   * from the packet, run through pointQuery() as a single query and written
+   * back. Returns true if any lane's query returned true. userPtrN may be
+   * NULL, in which case each lane gets a NULL user pointer. */
+  RTC_API bool rtcPointQuery4 (const int* valid, RTCScene hscene, RTCPointQuery4* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN)
+  {
+    Scene* const target = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcPointQuery4);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (target->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");
+    if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");
+#endif
+    STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(point_query.travs,cnt,cnt,cnt);
+
+    PointQuery4* const packet = (PointQuery4*)query;
+    PointQuery lane;
+    bool anyChanged = false;
+    for (size_t i=0; i<4; i++)
+    {
+      if (!valid[i]) continue;
+      void* const laneUserPtr = userPtrN?userPtrN[i]:NULL;
+      packet->get(i,lane);
+      anyChanged |= pointQuery(target, (RTCPointQuery*)&lane, userContext, queryFunc, laneUserPtr);
+      packet->set(i,lane);
+    }
+    return anyChanged;
+    RTC_CATCH_END2_FALSE(target);
+  }
+  
+  /* 8-wide point query: every lane with a non-zero valid[i] is extracted
+   * from the packet, run through pointQuery() as a single query and written
+   * back. Returns true if any lane's query returned true. userPtrN may be
+   * NULL, in which case each lane gets a NULL user pointer. */
+  RTC_API bool rtcPointQuery8 (const int* valid, RTCScene hscene, RTCPointQuery8* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcPointQuery8);
+    
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");   
+    if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");   
+#endif
+    /* count all 8 lanes for the statistics (was limited to the first 4) */
+    STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(point_query.travs,cnt,cnt,cnt);
+
+    bool changed = false;
+    PointQuery8* query8 = (PointQuery8*)query;
+    PointQuery query1; 
+    for (size_t i=0; i<8; i++) {
+      if (!valid[i]) continue;
+      query8->get(i,query1);
+      changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL);
+      query8->set(i,query1);
+    }
+    return changed;
+    RTC_CATCH_END2_FALSE(scene);
+  }
+
+  /* 16-wide point query: every lane with a non-zero valid[i] is extracted
+   * from the packet, run through pointQuery() as a single query and written
+   * back. Returns true if any lane's query returned true. userPtrN may be
+   * NULL, in which case each lane gets a NULL user pointer. */
+  RTC_API bool rtcPointQuery16 (const int* valid, RTCScene hscene, RTCPointQuery16* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcPointQuery16);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");   
+    if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");   
+#endif
+    /* count all 16 lanes for the statistics (was limited to the first 4) */
+    STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(point_query.travs,cnt,cnt,cnt);
+
+    bool changed = false;
+    PointQuery16* query16 = (PointQuery16*)query;
+    /* the per-lane query lives inside the loop; the unused outer copy that it
+     * shadowed was removed */
+    for (size_t i=0; i<16; i++) {
+      if (!valid[i]) continue;
+      PointQuery query1; query16->get(i,query1);
+      changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL);
+      query16->set(i,query1);
+    }
+    return changed;
+    RTC_CATCH_END2_FALSE(scene);
+  }
+
+  /* Intersects a single ray/hit structure with the scene behind hscene.
+   * DEBUG builds verify the handle, that the scene is committed and that
+   * rayhit is 16-byte aligned, and call verifyHit() on the result. */
+  RTC_API void rtcIntersect1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit)
+  {
+    Scene* const target = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect1);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (target->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)rayhit) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");
+#endif
+    STAT3(normal.travs,1,1,1);
+    IntersectContext context(target,user_context);
+    target->intersectors.intersect(*rayhit,&context);
+#if defined(DEBUG)
+    ((RayHit*)rayhit)->verifyHit();
+#endif
+    RTC_CATCH_END2(target);
+  }
+
+  /* Intersects a packet of 4 rays with the scene behind hscene. Without
+   * EMBREE_RAY_PACKETS every lane with a non-zero valid[i] is extracted,
+   * intersected as a single ray and written back; otherwise the scene's
+   * 4-wide intersector is called. */
+  RTC_API void rtcIntersect4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit4* rayhit)
+  {
+    Scene* const target = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect4);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (target->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");
+    if (((size_t)rayhit)   & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 16 bytes");
+#endif
+    STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(normal.travs,cnt,cnt,cnt);
+
+    IntersectContext context(target,user_context);
+#if defined(EMBREE_RAY_PACKETS)
+    target->intersectors.intersect4(valid,*rayhit,&context);
+#else
+    /* scalar fallback: handle the active lanes one by one */
+    RayHit4* const packet = (RayHit4*)rayhit;
+    for (size_t i=0; i<4; i++)
+    {
+      if (!valid[i]) continue;
+      RayHit lane;
+      packet->get(i,lane);
+      target->intersectors.intersect((RTCRayHit&)lane,&context);
+      packet->set(i,lane);
+    }
+#endif
+
+    RTC_CATCH_END2(target);
+  }
+  
+  /* Intersects a packet of 8 rays with the scene behind hscene. Without
+   * EMBREE_RAY_PACKETS every lane with a non-zero valid[i] is extracted,
+   * intersected as a single ray and written back. With packets enabled the
+   * scene's 8-wide intersector is used when present, otherwise the packet is
+   * passed to the SOA stream filter. */
+  RTC_API void rtcIntersect8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit8* rayhit)
+  {
+    Scene* const target = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect8);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (target->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes");
+    if (((size_t)rayhit)   & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 32 bytes");
+#endif
+    STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(normal.travs,cnt,cnt,cnt);
+
+    IntersectContext context(target,user_context);
+#if defined(EMBREE_RAY_PACKETS)
+    if (likely(target->intersectors.intersector8))
+    {
+      target->intersectors.intersect8(valid,*rayhit,&context);
+    }
+    else
+    {
+      /* no 8-wide intersector: treat the packet as an SOA stream of size 1 */
+      target->device->rayStreamFilters.intersectSOA(target,(char*)rayhit,8,1,sizeof(RTCRayHit8),&context);
+    }
+#else
+    /* scalar fallback: handle the active lanes one by one */
+    RayHit8* const packet = (RayHit8*) rayhit;
+    for (size_t i=0; i<8; i++)
+    {
+      if (!valid[i]) continue;
+      RayHit lane;
+      packet->get(i,lane);
+      target->intersectors.intersect((RTCRayHit&)lane,&context);
+      packet->set(i,lane);
+    }
+#endif
+    RTC_CATCH_END2(target);
+  }
+  
+  /* Intersects a packet of 16 rays with the scene behind hscene. Without
+   * EMBREE_RAY_PACKETS every lane with a non-zero valid[i] is extracted,
+   * intersected as a single ray and written back. With packets enabled the
+   * scene's 16-wide intersector is used when present, otherwise the packet is
+   * passed to the SOA stream filter. */
+  RTC_API void rtcIntersect16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit16* rayhit)
+  {
+    Scene* const target = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect16);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (target->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes");
+    if (((size_t)rayhit)   & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 64 bytes");
+#endif
+    STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(normal.travs,cnt,cnt,cnt);
+
+    IntersectContext context(target,user_context);
+#if defined(EMBREE_RAY_PACKETS)
+    if (likely(target->intersectors.intersector16))
+    {
+      target->intersectors.intersect16(valid,*rayhit,&context);
+    }
+    else
+    {
+      /* no 16-wide intersector: treat the packet as an SOA stream of size 1 */
+      target->device->rayStreamFilters.intersectSOA(target,(char*)rayhit,16,1,sizeof(RTCRayHit16),&context);
+    }
+#else
+    /* scalar fallback: handle the active lanes one by one */
+    RayHit16* const packet = (RayHit16*) rayhit;
+    for (size_t i=0; i<16; i++)
+    {
+      if (!valid[i]) continue;
+      RayHit lane;
+      packet->get(i,lane);
+      target->intersectors.intersect((RTCRayHit&)lane,&context);
+      packet->set(i,lane);
+    }
+#endif
+    RTC_CATCH_END2(target);
+  }
+
+  /* Intersects a stream of M single rays, laid out byteStride bytes apart
+   * starting at rayhit, with the scene behind hscene. A stream of exactly one
+   * ray goes straight to the single-ray intersector (and is skipped unless
+   * tnear <= tfar); longer streams go through the AOS stream filter. Raises
+   * RTC_ERROR_INVALID_OPERATION when built without EMBREE_RAY_PACKETS. */
+  RTC_API void rtcIntersect1M (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit, unsigned int M, size_t byteStride)
+  {
+    Scene* const target = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect1M);
+
+#if !defined (EMBREE_RAY_PACKETS)
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1M not supported");
+#else
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (target->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)rayhit ) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");
+#endif
+    STAT3(normal.travs,M,M,M);
+    IntersectContext context(target,user_context);
+
+    if (likely(M == 1))
+    {
+      /* fast codepath for single rays */
+      const bool nonEmptyInterval = rayhit->ray.tnear <= rayhit->ray.tfar;
+      if (likely(nonEmptyInterval))
+        target->intersectors.intersect(*rayhit,&context);
+    }
+    else
+    {
+      /* codepath for streams */
+      target->device->rayStreamFilters.intersectAOS(target,rayhit,M,byteStride,&context);
+    }
+#endif
+    RTC_CATCH_END2(target);
+  }
+
+  /* Intersects a stream of M single rays, given as an array of pointers rn,
+   * with the scene behind hscene. A stream of exactly one ray goes straight
+   * to the single-ray intersector (and is skipped unless tnear <= tfar);
+   * longer streams go through the AOP stream filter. Raises
+   * RTC_ERROR_INVALID_OPERATION when built without EMBREE_RAY_PACKETS. */
+  RTC_API void rtcIntersect1Mp (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit** rn, unsigned int M)
+  {
+    Scene* const target = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect1Mp);
+
+#if !defined (EMBREE_RAY_PACKETS)
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1Mp not supported");
+#else
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (target->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)rn) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");
+#endif
+    STAT3(normal.travs,M,M,M);
+    IntersectContext context(target,user_context);
+
+    if (likely(M == 1))
+    {
+      /* fast codepath for single rays */
+      const bool nonEmptyInterval = rn[0]->ray.tnear <= rn[0]->ray.tfar;
+      if (likely(nonEmptyInterval))
+        target->intersectors.intersect(*rn[0],&context);
+    }
+    else
+    {
+      /* codepath for streams */
+      target->device->rayStreamFilters.intersectAOP(target,rn,M,&context);
+    }
+#endif
+    RTC_CATCH_END2(target);
+  }
+
+  /* Intersects a stream of M ray packets of width N, laid out byteStride
+   * bytes apart starting at rayhit, with the scene behind hscene. For N == 1
+   * the data is treated as single rays: one ray goes straight to the
+   * single-ray intersector (and is skipped unless tnear <= tfar), several go
+   * through the AOS stream filter. Any other N goes through the SOA stream
+   * filter. Raises RTC_ERROR_INVALID_OPERATION when built without
+   * EMBREE_RAY_PACKETS. */
+  RTC_API void rtcIntersectNM (RTCScene hscene, RTCIntersectContext* user_context, struct RTCRayHitN* rayhit, unsigned int N, unsigned int M, size_t byteStride)
+  {
+    Scene* const target = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersectNM);
+
+#if !defined (EMBREE_RAY_PACKETS)
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNM not supported");
+#else
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (target->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)rayhit) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");
+#endif
+    STAT3(normal.travs,N*M,N*M,N*M);
+    IntersectContext context(target,user_context);
+
+    if (likely(N == 1))
+    {
+      /* code path for single ray streams */
+      RTCRayHit* const single = (RTCRayHit*)rayhit;
+      if (likely(M == 1))
+      {
+        /* fast code path for streams of size 1 */
+        if (likely(single->ray.tnear <= single->ray.tfar))
+          target->intersectors.intersect(*single,&context);
+      }
+      else
+      {
+        /* normal codepath for single ray streams */
+        target->device->rayStreamFilters.intersectAOS(target,single,M,byteStride,&context);
+      }
+    }
+    else
+    {
+      /* code path for ray packet streams */
+      target->device->rayStreamFilters.intersectSOA(target,(char*)rayhit,N,M,byteStride,&context);
+    }
+#endif
+    RTC_CATCH_END2(target);
+  }
+
+  RTC_API void rtcIntersectNp (RTCScene hscene, RTCIntersectContext* user_context, const RTCRayHitNp* rayhit, unsigned int N) 
+  { /* Intersects N rays given as a structure of per-component pointers (`rayhit`) with the scene `hscene`. */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersectNp);
+    /* Only implemented when EMBREE_RAY_PACKETS is defined; DEBUG builds additionally require every component pointer to be 4-byte aligned. */
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)rayhit->ray.org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_x not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_y not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_z not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_x not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_y not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_z not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.tnear not aligned to 4 bytes");   // message now names the field actually tested
+    if (((size_t)rayhit->ray.tfar  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.tfar not aligned to 4 bytes");   // message now names the field actually tested
+    if (((size_t)rayhit->ray.time  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.time not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.mask  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.mask not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.Ng_x  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_x not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.Ng_y  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_y not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.Ng_z  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_z not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.u     ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.u not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.v     ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.v not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.geomID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.geomID not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.primID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.primID not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.instID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.instID not aligned to 4 bytes");   
+#endif
+    STAT3(normal.travs,N,N,N);
+    IntersectContext context(scene,user_context);
+    scene->device->rayStreamFilters.intersectSOP(scene,rayhit,N,&context); // all N rays go through the structure-of-pointers stream filter
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNp not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+  
+  RTC_API void rtcOccluded1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray) 
+  { /* Runs a single-ray occlusion query for `ray` against the scene `hscene`. */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded1);
+    STAT3(shadow.travs,1,1,1);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene); // argument validation is compiled in for DEBUG builds only
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)ray) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");   
+#endif
+    IntersectContext context(scene,user_context); // bundles the scene with the caller-supplied context for the query
+    scene->intersectors.occluded(*ray,&context);
+    RTC_CATCH_END2(scene);
+  }
+  
+  RTC_API void rtcOccluded4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay4* ray) 
+  { /* Runs an occlusion query for a packet of 4 rays; `valid` holds one activity flag per lane. */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded4);
+    /* DEBUG builds require a committed scene and 16-byte aligned `valid` and `ray` pointers. */
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");   
+    if (((size_t)ray)   & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");   
+#endif
+    STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); // statistics count lanes whose flag equals -1
+    STAT3(shadow.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    Ray4* ray4 = (Ray4*) ray; // without packet support each active lane is traced as a single ray
+    for (size_t i=0; i<4; i++) {
+      if (!valid[i]) continue; // any non-zero flag counts as active here
+      Ray ray1; ray4->get(i,ray1);
+      scene->intersectors.occluded((RTCRay&)ray1,&context);
+      ray4->set(i,ray1); // write the single-ray result back into lane i
+    }
+#else
+    scene->intersectors.occluded4(valid,*ray,&context);
+#endif
+    
+    RTC_CATCH_END2(scene);
+  }
+ 
+  RTC_API void rtcOccluded8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay8* ray) 
+  { /* Runs an occlusion query for a packet of 8 rays; `valid` holds one activity flag per lane. */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded8);
+    /* DEBUG builds require a committed scene and 32-byte aligned `valid` and `ray` pointers. */
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes");   
+    if (((size_t)ray)   & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 32 bytes");   
+#endif
+    STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;); // statistics count lanes whose flag equals -1
+    STAT3(shadow.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    Ray8* ray8 = (Ray8*) ray; // without packet support each active lane is traced as a single ray
+    for (size_t i=0; i<8; i++) {
+      if (!valid[i]) continue; // any non-zero flag counts as active here
+      Ray ray1; ray8->get(i,ray1);
+      scene->intersectors.occluded((RTCRay&)ray1,&context);
+      ray8->set(i,ray1); // write the single-ray result back into lane i
+    }
+#else
+    if (likely(scene->intersectors.intersector8)) // use the 8-wide intersector when the scene has one
+      scene->intersectors.occluded8(valid,*ray,&context);
+    else // NOTE(review): the stream fallback is not given `valid` — confirm how inactive lanes are handled
+      scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,8,1,sizeof(RTCRay8),&context);
+#endif
+
+    RTC_CATCH_END2(scene);
+  }
+  
+  RTC_API void rtcOccluded16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay16* ray) 
+  { /* Runs an occlusion query for a packet of 16 rays; `valid` holds one activity flag per lane. */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded16);
+    /* DEBUG builds require a committed scene and 64-byte aligned `valid` and `ray` pointers. */
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes");   
+    if (((size_t)ray)   & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 64 bytes");   
+#endif
+    STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;); // statistics count lanes whose flag equals -1
+    STAT3(shadow.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    Ray16* ray16 = (Ray16*) ray; // without packet support each active lane is traced as a single ray
+    for (size_t i=0; i<16; i++) {
+      if (!valid[i]) continue; // any non-zero flag counts as active here
+      Ray ray1; ray16->get(i,ray1);
+      scene->intersectors.occluded((RTCRay&)ray1,&context);
+      ray16->set(i,ray1); // write the single-ray result back into lane i
+    }
+#else
+    if (likely(scene->intersectors.intersector16)) // use the 16-wide intersector when the scene has one
+      scene->intersectors.occluded16(valid,*ray,&context);
+    else // NOTE(review): the stream fallback is not given `valid` — confirm how inactive lanes are handled
+      scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,16,1,sizeof(RTCRay16),&context);
+#endif
+
+    RTC_CATCH_END2(scene);
+  }
+  
+  RTC_API void rtcOccluded1M(RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray, unsigned int M, size_t byteStride) 
+  { /* Runs occlusion queries for a stream of M single rays starting at `ray`, spaced byteStride bytes apart. */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded1M);
+    /* Only implemented when EMBREE_RAY_PACKETS is defined; otherwise the #else branch reports RTC_ERROR_INVALID_OPERATION. */
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
+#endif
+    STAT3(shadow.travs,M,M,M);
+    IntersectContext context(scene,user_context);
+    /* fast codepath for streams of size 1: the ray is skipped entirely when tnear > tfar */
+    if (likely(M == 1)) {
+      if (likely(ray->tnear <= ray->tfar)) 
+        scene->intersectors.occluded (*ray,&context);
+    } 
+    /* codepath for normal streams: every other M (including 0) goes to the array-of-structures stream filter */
+    else {
+      scene->device->rayStreamFilters.occludedAOS(scene,ray,M,byteStride,&context);
+    }
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1M not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcOccluded1Mp(RTCScene hscene, RTCIntersectContext* user_context, RTCRay** ray, unsigned int M) 
+  { /* Runs occlusion queries for M rays passed as an array of pointers `ray`. */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded1Mp);
+    /* Only implemented when EMBREE_RAY_PACKETS is defined; otherwise the #else branch reports RTC_ERROR_INVALID_OPERATION. */
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   // tests the pointer array `ray` itself, not the rays it points to
+#endif
+    STAT3(shadow.travs,M,M,M);
+    IntersectContext context(scene,user_context);
+
+    /* fast codepath for streams of size 1: ray[0] is skipped entirely when tnear > tfar */
+    if (likely(M == 1)) {
+      if (likely(ray[0]->tnear <= ray[0]->tfar)) 
+        scene->intersectors.occluded (*ray[0],&context);
+    } 
+    /* codepath for normal streams: every other M (including 0) goes to the array-of-pointers stream filter */
+    else {
+      scene->device->rayStreamFilters.occludedAOP(scene,ray,M,&context);
+    }
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1Mp not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcOccludedNM(RTCScene hscene, RTCIntersectContext* user_context, RTCRayN* ray, unsigned int N, unsigned int M, size_t byteStride)
+  { /* Runs occlusion queries for a stream of M ray packets of width N, spaced byteStride bytes apart. */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccludedNM);
+    /* Only implemented when EMBREE_RAY_PACKETS is defined; otherwise the #else branch reports RTC_ERROR_INVALID_OPERATION. */
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (byteStride < sizeof(RTCRayHit)) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"byteStride too small"); // NOTE(review): bound uses RTCRayHit although this entry point takes rays only; rtcIntersectNM has no such check — confirm
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
+#endif
+    STAT3(shadow.travs,N*M,N*M,N*M); // N*M rays in total, matching the counters used by rtcIntersectNM (was N*N for the last two)
+    IntersectContext context(scene,user_context);
+
+    /* codepath for single rays: with N == 1 the packet pointer is reinterpreted as RTCRay* */
+    if (likely(N == 1))
+    {
+      /* fast path for streams of size 1 */
+      if (likely(M == 1)) {
+        if (likely(((RTCRay*)ray)->tnear <= ((RTCRay*)ray)->tfar))
+          scene->intersectors.occluded (*(RTCRay*)ray,&context);
+      } 
+      /* codepath for normal ray streams */
+      else {
+        scene->device->rayStreamFilters.occludedAOS(scene,(RTCRay*)ray,M,byteStride,&context);
+      }
+    }
+    /* code path for ray packet streams: the data is handed over as raw bytes together with N, M and the stride */
+    else {
+      scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,N,M,byteStride,&context);
+    }
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNM not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcOccludedNp(RTCScene hscene, RTCIntersectContext* user_context, const RTCRayNp* ray, unsigned int N)
+  { /* Runs occlusion queries for N rays given as a structure of per-component pointers (`ray`). */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccludedNp);
+    /* Only implemented when EMBREE_RAY_PACKETS is defined; DEBUG builds additionally require every component pointer to be 4-byte aligned. */
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)ray->org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_x not aligned to 4 bytes");   
+    if (((size_t)ray->org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_y not aligned to 4 bytes");   
+    if (((size_t)ray->org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_z not aligned to 4 bytes");   
+    if (((size_t)ray->dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_x not aligned to 4 bytes");   
+    if (((size_t)ray->dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_y not aligned to 4 bytes");   
+    if (((size_t)ray->dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_z not aligned to 4 bytes");   
+    if (((size_t)ray->tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "tnear not aligned to 4 bytes");   // message now names the field actually tested
+    if (((size_t)ray->tfar  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "tfar not aligned to 4 bytes");   // message now names the field actually tested
+    if (((size_t)ray->time  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "time not aligned to 4 bytes");   
+    if (((size_t)ray->mask  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 4 bytes");   
+#endif
+    STAT3(shadow.travs,N,N,N);
+    IntersectContext context(scene,user_context);
+    scene->device->rayStreamFilters.occludedSOP(scene,ray,N,&context); // all N rays go through the structure-of-pointers stream filter
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNp not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcRetainScene (RTCScene hscene) 
+  { /* Adds one reference to the scene behind `hscene`. */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcRetainScene);
+    RTC_VERIFY_HANDLE(hscene); // checked before the handle is dereferenced below
+    scene->refInc();
+    RTC_CATCH_END2(scene);
+  }
+  
+  RTC_API void rtcReleaseScene (RTCScene hscene) 
+  { /* Drops one reference from the scene behind `hscene`. */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcReleaseScene);
+    RTC_VERIFY_HANDLE(hscene); // checked before the handle is dereferenced below
+    scene->refDec(); // NOTE(review): presumably destroys the scene when the count reaches zero — refDec is defined elsewhere
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcSetGeometryInstancedScene(RTCGeometry hgeometry, RTCScene hscene)
+  { /* Tells the geometry `hgeometry` which scene (`hscene`) it instances. */
+    Geometry* geometry = (Geometry*) hgeometry;
+    Ref<Scene> scene = (Scene*) hscene; // held through a Ref<> for the duration of the call, unlike the raw geometry pointer
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryInstancedScene);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_HANDLE(hscene);
+    geometry->setInstancedScene(scene);
+    RTC_CATCH_END2(geometry); // errors are reported via the geometry, not the instanced scene
+  }
+
+  AffineSpace3fa loadTransform(RTCFormat format, const float* xfm)
+  {
+    /* Decodes the three basis columns and the translation from a flat float array laid out as `format` says. */
+    switch (format)
+    {
+    case RTC_FORMAT_FLOAT3X4_ROW_MAJOR:
+      return AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 4], xfm[ 8]),
+                            Vec3fa(xfm[ 1], xfm[ 5], xfm[ 9]),
+                            Vec3fa(xfm[ 2], xfm[ 6], xfm[10]),
+                            Vec3fa(xfm[ 3], xfm[ 7], xfm[11]));
+      /* 12 floats read with a stride of 4: each vector gathers one entry from each of the three rows */
+
+    case RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR:
+      return AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 1], xfm[ 2]),
+                            Vec3fa(xfm[ 3], xfm[ 4], xfm[ 5]),
+                            Vec3fa(xfm[ 6], xfm[ 7], xfm[ 8]),
+                            Vec3fa(xfm[ 9], xfm[10], xfm[11]));
+      /* 12 floats read as four consecutive triples */
+
+    case RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR:
+      return AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 1], xfm[ 2]),
+                            Vec3fa(xfm[ 4], xfm[ 5], xfm[ 6]),
+                            Vec3fa(xfm[ 8], xfm[ 9], xfm[10]),
+                            Vec3fa(xfm[12], xfm[13], xfm[14]));
+      /* 16 floats read as four groups of four; the last entry of each group (3, 7, 11, 15) is not read */
+
+    default: 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format");
+      break;
+    }
+    return AffineSpace3fa(one); /* only reached after the error above: identity, as before */
+  }
+
+  void storeTransform(const AffineSpace3fa& space, RTCFormat format, float* xfm)
+  {
+    /* Writes `space` into the flat float array `xfm` using the layout named by `format`. */
+    const auto& vx = space.l.vx;  const auto& vy = space.l.vy;
+    const auto& vz = space.l.vz;  const auto& p  = space.p;
+    switch (format)
+    {
+    case RTC_FORMAT_FLOAT3X4_ROW_MAJOR: /* 12 floats, one row of four per line below */
+      xfm[ 0] = vx.x;  xfm[ 1] = vy.x;  xfm[ 2] = vz.x;  xfm[ 3] = p.x;
+      xfm[ 4] = vx.y;  xfm[ 5] = vy.y;  xfm[ 6] = vz.y;  xfm[ 7] = p.y;
+      xfm[ 8] = vx.z;  xfm[ 9] = vy.z;  xfm[10] = vz.z;  xfm[11] = p.z;
+      break;
+    case RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR: /* 12 floats, one vector of three per line below */
+      xfm[ 0] = vx.x;  xfm[ 1] = vx.y;  xfm[ 2] = vx.z;
+      xfm[ 3] = vy.x;  xfm[ 4] = vy.y;  xfm[ 5] = vy.z;
+      xfm[ 6] = vz.x;  xfm[ 7] = vz.y;  xfm[ 8] = vz.z;
+      xfm[ 9] = p.x;   xfm[10] = p.y;   xfm[11] = p.z;
+      break;
+    case RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR: /* 16 floats, each vector padded with 0 except the translation, padded with 1 */
+      xfm[ 0] = vx.x;  xfm[ 1] = vx.y;  xfm[ 2] = vx.z;  xfm[ 3] = 0.f;
+      xfm[ 4] = vy.x;  xfm[ 5] = vy.y;  xfm[ 6] = vy.z;  xfm[ 7] = 0.f;
+      xfm[ 8] = vz.x;  xfm[ 9] = vz.y;  xfm[10] = vz.z;  xfm[11] = 0.f;
+      xfm[12] = p.x;   xfm[13] = p.y;   xfm[14] = p.z;   xfm[15] = 1.f;
+      break;
+    default: /* any other format is rejected and xfm is left untouched */
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format");
+      break;
+    }
+  }
+
+  RTC_API void rtcSetGeometryTransform(RTCGeometry hgeometry, unsigned int timeStep, RTCFormat format, const void* xfm)
+  { /* Sets the transform of `hgeometry` at `timeStep` from the float matrix `xfm`, laid out as `format` describes. */
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTransform);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_HANDLE(xfm); // the matrix pointer is null-checked with the same macro as the handle
+    const AffineSpace3fa transform = loadTransform(format, (const float*)xfm); // loadTransform rejects formats it does not know
+    geometry->setTransform(transform, timeStep);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryTransformQuaternion(RTCGeometry hgeometry, unsigned int timeStep, const RTCQuaternionDecomposition* qd)
+  { /* Sets the transform of `hgeometry` at `timeStep` from the quaternion decomposition `qd`. */
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTransformQuaternion);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_HANDLE(qd);
+    /* Pack the decomposition into one AffineSpace3fx; all 16 components (x,y,z,w of vx, vy, vz and p) are assigned below. */
+    AffineSpace3fx transform;
+    transform.l.vx.x = qd->scale_x; // scale occupies vx.x / vy.y / vz.z
+    transform.l.vy.y = qd->scale_y;
+    transform.l.vz.z = qd->scale_z;
+    transform.l.vy.x = qd->skew_xy; // skew occupies vy.x / vz.x / vz.y
+    transform.l.vz.x = qd->skew_xz;
+    transform.l.vz.y = qd->skew_yz;
+    transform.l.vx.y = qd->translation_x; // translation occupies vx.y / vx.z / vy.z
+    transform.l.vx.z = qd->translation_y;
+    transform.l.vy.z = qd->translation_z;
+    transform.p.x    = qd->shift_x; // shift occupies p.x / p.y / p.z
+    transform.p.y    = qd->shift_y;
+    transform.p.z    = qd->shift_z;
+
+    // normalize quaternion
+    Quaternion3f q(qd->quaternion_r, qd->quaternion_i, qd->quaternion_j, qd->quaternion_k);
+    q = normalize(q);
+    transform.l.vx.w = q.i; // the normalized quaternion is stored in the w lanes: i, j, k in vx/vy/vz and r in p
+    transform.l.vy.w = q.j;
+    transform.l.vz.w = q.k;
+    transform.p.w    = q.r;
+
+    geometry->setQuaternionDecomposition(transform, timeStep);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcGetGeometryTransform(RTCGeometry hgeometry, float time, RTCFormat format, void* xfm)
+  { /* Writes the transform of `hgeometry` at `time` into `xfm` using the layout named by `format`. */
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryTransform); // NOTE(review): unlike rtcSetGeometryTransform, neither hgeometry nor xfm is passed to RTC_VERIFY_HANDLE here — confirm intended
+    const AffineSpace3fa transform = geometry->getTransform(time);
+    storeTransform(transform, format, (float*)xfm); // storeTransform rejects formats it does not know
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
+  {
+    auto* const internal = (IntersectFunctionNArguments*) args_i; /* view the public arguments as the internal type that carries `report` */
+    internal->report(internal,filter_args); /* forward the filter arguments, passing the argument block itself first */
+  }
+
+  RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
+  {
+    auto* const internal = (OccludedFunctionNArguments*) args_i; /* view the public arguments as the internal type that carries `report` */
+    internal->report(internal,filter_args); /* forward the filter arguments, passing the argument block itself first */
+  }
+  
+  RTC_API RTCGeometry rtcNewGeometry (RTCDevice hdevice, RTCGeometryType type)
+  { /* Creates a geometry object of the requested `type` on the device `hdevice` and returns it as an opaque handle. */
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewGeometry);
+    RTC_VERIFY_HANDLE(hdevice);
+    /* Each case looks up a factory function through a SELECT_SYMBOL_* macro, calls it, and returns the object after refInc(). */
+    switch (type)
+    {
+    case RTC_GEOMETRY_TYPE_TRIANGLE:
+    {
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+      createTriangleMeshTy createTriangleMesh = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createTriangleMesh);
+      Geometry* geom = createTriangleMesh(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_TRIANGLE is not supported");
+#endif
+    }
+    /* NOTE(review): the #else branches have no break/return after throw_RTCError, so they rely on it not returning — confirm for this build. */
+    case RTC_GEOMETRY_TYPE_QUAD:
+    {
+#if defined(EMBREE_GEOMETRY_QUAD)
+      createQuadMeshTy createQuadMesh = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createQuadMesh);
+      Geometry* geom = createQuadMesh(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_QUAD is not supported");
+#endif
+    }
+    
+    case RTC_GEOMETRY_TYPE_SPHERE_POINT:
+    case RTC_GEOMETRY_TYPE_DISC_POINT:
+    case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT:
+    {
+#if defined(EMBREE_GEOMETRY_POINT)
+      createPointsTy createPoints = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_builder_cpu_features, createPoints); // NOTE(review): the only case using enabled_builder_cpu_features; all others use enabled_cpu_features — confirm intended
+
+      Geometry *geom;
+      switch(type) { // all three point types share one factory and differ only in the Geometry::GTY_* tag
+        case RTC_GEOMETRY_TYPE_SPHERE_POINT:
+          geom = createPoints(device, Geometry::GTY_SPHERE_POINT);
+          break;
+        case RTC_GEOMETRY_TYPE_DISC_POINT:
+          geom = createPoints(device, Geometry::GTY_DISC_POINT);
+          break;
+        case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT:
+          geom = createPoints(device, Geometry::GTY_ORIENTED_DISC_POINT);
+          break;
+        default:
+          geom = nullptr; // not reachable from the three enclosing case labels; would be dereferenced by refInc() below
+          break;
+      }
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_POINT is not supported");
+#endif
+    }
+
+    case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE:
+    case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE:
+      
+    case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE:
+    case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE:
+      
+    case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE:
+    case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE:
+
+    case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE:
+    case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE:
+
+    case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE:
+    case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE:
+    {
+#if defined(EMBREE_GEOMETRY_CURVE)
+      createLineSegmentsTy createLineSegments = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createLineSegments);
+      createCurvesTy createCurves = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createCurves);
+      /* linear curve types are built by createLineSegments, every other curve type by createCurves */
+      Geometry* geom;
+      switch (type) {
+      case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE             : geom = createLineSegments (device,Geometry::GTY_CONE_LINEAR_CURVE); break;
+      case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE            : geom = createLineSegments (device,Geometry::GTY_ROUND_LINEAR_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE             : geom = createLineSegments (device,Geometry::GTY_FLAT_LINEAR_CURVE); break;
+      //case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_LINEAR_CURVE  : geom = createLineSegments (device,Geometry::GTY_ORIENTED_LINEAR_CURVE); break;
+        
+      case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE            : geom = createCurves(device,Geometry::GTY_ROUND_BEZIER_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE             : geom = createCurves(device,Geometry::GTY_FLAT_BEZIER_CURVE); break;
+      case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE  : geom = createCurves(device,Geometry::GTY_ORIENTED_BEZIER_CURVE); break;
+        
+      case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE           : geom = createCurves(device,Geometry::GTY_ROUND_BSPLINE_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE            : geom = createCurves(device,Geometry::GTY_FLAT_BSPLINE_CURVE); break;
+      case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_BSPLINE_CURVE); break;
+        
+      case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE           : geom = createCurves(device,Geometry::GTY_ROUND_HERMITE_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE            : geom = createCurves(device,Geometry::GTY_FLAT_HERMITE_CURVE); break;
+      case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_HERMITE_CURVE); break;
+
+      case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE           : geom = createCurves(device,Geometry::GTY_ROUND_CATMULL_ROM_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE            : geom = createCurves(device,Geometry::GTY_FLAT_CATMULL_ROM_CURVE); break;
+      case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_CATMULL_ROM_CURVE); break;
+      default:                                    geom = nullptr; break; // not reachable: the inner labels mirror the enclosing ones; would be dereferenced by refInc() below
+      }
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_CURVE is not supported");
+#endif
+    }
+    
+    case RTC_GEOMETRY_TYPE_SUBDIVISION:
+    {
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+      createSubdivMeshTy createSubdivMesh = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX(device->enabled_cpu_features,createSubdivMesh);
+      //SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createSubdivMesh); // FIXME: this does not work for some reason?
+      Geometry* geom = createSubdivMesh(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_SUBDIVISION is not supported");
+#endif
+    }
+    
+    case RTC_GEOMETRY_TYPE_USER:
+    {
+#if defined(EMBREE_GEOMETRY_USER)
+      createUserGeometryTy createUserGeometry = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createUserGeometry);
+      Geometry* geom = createUserGeometry(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_USER is not supported");
+#endif
+    }
+
+    case RTC_GEOMETRY_TYPE_INSTANCE:
+    {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+      createInstanceTy createInstance = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createInstance);
+      Geometry* geom = createInstance(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_INSTANCE is not supported");
+#endif
+    }
+
+    case RTC_GEOMETRY_TYPE_GRID:
+    {
+#if defined(EMBREE_GEOMETRY_GRID)
+      createGridMeshTy createGridMesh = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createGridMesh);
+      Geometry* geom = createGridMesh(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_GRID is not supported");
+#endif
+    }
+    
+    default:
+      throw_RTCError(RTC_ERROR_UNKNOWN,"invalid geometry type");
+    }
+    
+    RTC_CATCH_END(device); // errors are reported against the device, since no geometry exists yet
+    return nullptr; // no successful case falls through to here; each returns its handle directly
+  }
+
+  RTC_API void rtcSetGeometryUserPrimitiveCount(RTCGeometry hgeometry, unsigned int userPrimitiveCount)
+  { /* Sets the number of primitives of a user geometry. */
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryUserPrimitiveCount);
+    RTC_VERIFY_HANDLE(hgeometry);
+    /* any geometry whose type is not GTY_USER_GEOMETRY is rejected with RTC_ERROR_INVALID_OPERATION */
+    if (unlikely(geometry->getType() != Geometry::GTY_USER_GEOMETRY))
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation only allowed for user geometries"); 
+
+    geometry->setNumPrimitives(userPrimitiveCount);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryTimeStepCount(RTCGeometry hgeometry, unsigned int timeStepCount)
+  { /* Sets the number of time steps of the geometry `hgeometry`. */
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTimeStepCount);
+    RTC_VERIFY_HANDLE(hgeometry);
+    /* only the upper bound is checked here; a count of 0 is passed through to setNumTimeSteps */
+    if (timeStepCount > RTC_MAX_TIME_STEP_COUNT)
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"number of time steps is out of range");
+    
+    geometry->setNumTimeSteps(timeStepCount);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryTimeRange(RTCGeometry hgeometry, float startTime, float endTime)
+  { /* Sets the time range [startTime, endTime] of the geometry `hgeometry`. */
+    Ref<Geometry> geometry = (Geometry*) hgeometry; // held through a Ref<> here, whereas the neighbouring setters use a raw pointer
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTimeRange);
+    RTC_VERIFY_HANDLE(hgeometry);
+    /* equal start and end times are accepted; only startTime > endTime is rejected */
+    if (startTime > endTime)
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"startTime has to be smaller or equal to the endTime");
+        
+    geometry->setTimeRange(BBox1f(startTime,endTime));
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryVertexAttributeCount(RTCGeometry hgeometry, unsigned int N)
+  { /* Sets the number of vertex attributes of the geometry `hgeometry` to N. */
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryVertexAttributeCount);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setVertexAttributeCount(N); // N is forwarded unchecked; any validation happens in the geometry
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryTopologyCount(RTCGeometry hgeometry, unsigned int N)
+  { /* Sets the number of topologies of the geometry `hgeometry` to N. */
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTopologyCount);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setTopologyCount(N); // N is forwarded unchecked; any validation happens in the geometry
+    RTC_CATCH_END2(geometry);
+  }
+ 
+  RTC_API void rtcSetGeometryBuildQuality (RTCGeometry hgeometry, RTCBuildQuality quality) 
+  { /* Sets the build quality of the geometry `hgeometry`; only LOW, MEDIUM, HIGH and REFIT are accepted. */
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryBuildQuality);
+    RTC_VERIFY_HANDLE(hgeometry);
+    if (quality != RTC_BUILD_QUALITY_LOW &&
+        quality != RTC_BUILD_QUALITY_MEDIUM &&
+        quality != RTC_BUILD_QUALITY_HIGH &&
+        quality != RTC_BUILD_QUALITY_REFIT)
+      // -- GODOT start --
+      // throw std::runtime_error("invalid build quality");
+      abort(); // sole statement guarded by the if above: an invalid quality terminates the process instead of throwing
+      // -- GODOT end --
+    geometry->setBuildQuality(quality);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryMaxRadiusScale(RTCGeometry hgeometry, float maxRadiusScale)
+  { /* Sets the maximal radius scale of the geometry `hgeometry`; only functional when RTC_MIN_WIDTH is non-zero. */
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryMaxRadiusScale);
+    RTC_VERIFY_HANDLE(hgeometry);
+#if RTC_MIN_WIDTH
+    if (maxRadiusScale < 1.0f) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximal radius scale has to be larger or equal to 1"); // NOTE(review): a NaN argument passes this comparison — confirm acceptable
+    geometry->setMaxRadiusScale(maxRadiusScale);
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"min-width feature is not enabled"); // without the feature every call is an error
+#endif
+    RTC_CATCH_END2(geometry);
+  }
+  
+  RTC_API void rtcSetGeometryMask (RTCGeometry hgeometry, unsigned int mask) 
+  { /* Sets the mask of the geometry `hgeometry`. */
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryMask);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setMask(mask); // all 32 bits are forwarded as given
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometrySubdivisionMode (RTCGeometry hgeometry, unsigned topologyID, RTCSubdivisionMode mode) 
+  { /* Sets the subdivision mode `mode` for topology `topologyID` of the geometry `hgeometry`. */
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometrySubdivisionMode);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setSubdivisionMode(topologyID,mode); // both arguments are forwarded unchecked; any validation happens in the geometry
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryVertexAttributeTopology(RTCGeometry hgeometry, unsigned int vertexAttributeID, unsigned int topologyID)
+  { /* API entry: forwards the (vertex attribute, topology) pairing to the geometry */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryVertexAttributeTopology);
+    RTC_VERIFY_HANDLE(hgeometry); // a null handle is reported through throw_RTCError
+    geom->setVertexAttributeTopology(vertexAttributeID, topologyID);
+    RTC_CATCH_END2(geom);
+  }
+
+  RTC_API void rtcSetGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, RTCBuffer hbuffer, size_t byteOffset, size_t byteStride, size_t itemCount)
+  { /* API entry: binds an existing buffer object to the (type, slot) pair of a geometry */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    Ref<Buffer> boundBuffer = reinterpret_cast<Buffer*>(hbuffer);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryBuffer);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_HANDLE(hbuffer);
+
+    if (geom->device != boundBuffer->device) // both objects must refer to the same device
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices");
+
+    if (itemCount > 0xFFFFFFFFu) // the count is narrowed to unsigned int in the call below
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large");
+
+    geom->setBuffer(type, slot, format, boundBuffer, byteOffset, byteStride, static_cast<unsigned int>(itemCount));
+    RTC_CATCH_END2(geom);
+  }
+
+  RTC_API void rtcSetSharedGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, const void* ptr, size_t byteOffset, size_t byteStride, size_t itemCount)
+  { /* API entry: wraps the memory at ptr + byteOffset in a Buffer object and binds it to (type, slot) */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetSharedGeometryBuffer);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    if (itemCount > 0xFFFFFFFFu) // the count is narrowed to unsigned int in the call below
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large");
+    char* const firstByte = (char*)ptr + byteOffset; // offset is applied here, hence the 0 offset passed to setBuffer
+    Ref<Buffer> wrapped = new Buffer(geom->device, itemCount*byteStride, firstByte);
+    geom->setBuffer(type, slot, format, wrapped, 0, byteStride, static_cast<unsigned int>(itemCount));
+    RTC_CATCH_END2(geom);
+  }
+
+  RTC_API void* rtcSetNewGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, size_t byteStride, size_t itemCount)
+  { /* API entry: creates a new Buffer, binds it to (type, slot) and returns the buffer's data pointer */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetNewGeometryBuffer);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    if (itemCount > 0xFFFFFFFFu) // the count is narrowed to unsigned int in the call below
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large");
+
+    /* vertex data is read with SSE loads: over-allocate by the amount that rounds byteStride up to a multiple of 16 */
+    const bool isVertexData = (type == RTC_BUFFER_TYPE_VERTEX) || (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE);
+    const size_t padding = isVertexData ? (16 - (byteStride%16))%16 : 0;
+    const size_t totalBytes = itemCount*byteStride + padding;
+
+    Ref<Buffer> created = new Buffer(geom->device, totalBytes);
+    geom->setBuffer(type, slot, format, created, 0, byteStride, static_cast<unsigned int>(itemCount));
+    return created->data();
+    RTC_CATCH_END2(geom);
+    return nullptr; // only reachable if RTC_CATCH_END2 expands to catch handlers
+  }
+
+  RTC_API void* rtcGetGeometryBufferData(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot)
+  { /* API entry: returns what the geometry's getBuffer reports for (type, slot) */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryBufferData);
+    RTC_VERIFY_HANDLE(hgeometry); // a null handle is reported through throw_RTCError
+    return geom->getBuffer(type, slot);
+    RTC_CATCH_END2(geom);
+    return nullptr; // only reachable if RTC_CATCH_END2 expands to catch handlers
+  }
+  
+  RTC_API void rtcEnableGeometry (RTCGeometry hgeometry)
+  { /* API entry: calls enable() on the geometry behind the handle */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcEnableGeometry);
+    RTC_VERIFY_HANDLE(hgeometry); // a null handle is reported through throw_RTCError
+    geom->enable();
+    RTC_CATCH_END2(geom);
+  }
+
+  RTC_API void rtcUpdateGeometryBuffer (RTCGeometry hgeometry, RTCBufferType type, unsigned int slot)
+  { /* API entry: forwards an update notification for the buffer at (type, slot) to the geometry */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcUpdateGeometryBuffer);
+    RTC_VERIFY_HANDLE(hgeometry); // a null handle is reported through throw_RTCError
+    geom->updateBuffer(type, slot);
+    RTC_CATCH_END2(geom);
+  }
+
+  RTC_API void rtcDisableGeometry (RTCGeometry hgeometry)
+  { /* API entry: calls disable() on the geometry behind the handle */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcDisableGeometry);
+    RTC_VERIFY_HANDLE(hgeometry); // a null handle is reported through throw_RTCError
+    geom->disable();
+    RTC_CATCH_END2(geom);
+  }
+
+  RTC_API void rtcSetGeometryTessellationRate (RTCGeometry hgeometry, float tessellationRate)
+  { /* API entry: forwards the tessellation rate to the geometry behind the handle */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTessellationRate);
+    RTC_VERIFY_HANDLE(hgeometry); // a null handle is reported through throw_RTCError
+    geom->setTessellationRate(tessellationRate);
+    RTC_CATCH_END2(geom);
+  }
+
+  RTC_API void rtcSetGeometryUserData (RTCGeometry hgeometry, void* ptr)
+  { /* API entry: forwards the user data pointer to the geometry behind the handle */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryUserData);
+    RTC_VERIFY_HANDLE(hgeometry); // a null handle is reported through throw_RTCError
+    geom->setUserData(ptr);
+    RTC_CATCH_END2(geom);
+  }
+
+  RTC_API void* rtcGetGeometryUserData (RTCGeometry hgeometry)
+  { /* API entry: returns the result of getUserData() on the geometry behind the handle */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry); // no ref counting here!
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryUserData);
+    RTC_VERIFY_HANDLE(hgeometry);
+    return geom->getUserData();
+    RTC_CATCH_END2(geom);
+    return nullptr; // only reachable if RTC_CATCH_END2 expands to catch handlers
+  }
+
+  RTC_API void rtcSetGeometryBoundsFunction (RTCGeometry hgeometry, RTCBoundsFunction bounds, void* userPtr)
+  { /* API entry: forwards the bounds callback and its user pointer to the geometry */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryBoundsFunction);
+    RTC_VERIFY_HANDLE(hgeometry); // a null handle is reported through throw_RTCError
+    geom->setBoundsFunction(bounds,userPtr);
+    RTC_CATCH_END2(geom);
+  }
+
+  RTC_API void rtcSetGeometryDisplacementFunction (RTCGeometry hgeometry, RTCDisplacementFunctionN displacement)
+  { /* API entry: forwards the displacement callback to the geometry behind the handle */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryDisplacementFunction);
+    RTC_VERIFY_HANDLE(hgeometry); // a null handle is reported through throw_RTCError
+    geom->setDisplacementFunction(displacement);
+    RTC_CATCH_END2(geom);
+  }
+
+  RTC_API void rtcSetGeometryIntersectFunction (RTCGeometry hgeometry, RTCIntersectFunctionN intersect)
+  { /* API entry: forwards the intersect callback to the geometry's setIntersectFunctionN */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryIntersectFunction);
+    RTC_VERIFY_HANDLE(hgeometry); // a null handle is reported through throw_RTCError
+    geom->setIntersectFunctionN(intersect);
+    RTC_CATCH_END2(geom);
+  }
+
+  RTC_API void rtcSetGeometryPointQueryFunction(RTCGeometry hgeometry, RTCPointQueryFunction pointQuery)
+  { /* API entry: forwards the point query callback to the geometry behind the handle */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryPointQueryFunction);
+    RTC_VERIFY_HANDLE(hgeometry); // a null handle is reported through throw_RTCError
+    geom->setPointQueryFunction(pointQuery);
+    RTC_CATCH_END2(geom);
+  }
+
+  RTC_API unsigned int rtcGetGeometryFirstHalfEdge(RTCGeometry hgeometry, unsigned int faceID)
+  { /* API entry: returns getFirstHalfEdge(faceID) of the geometry; the handle is not null-checked here */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryFirstHalfEdge);
+    return geom->getFirstHalfEdge(faceID);
+    RTC_CATCH_END2(geom);
+    return -1; // converts to the largest unsigned int; only reachable if RTC_CATCH_END2 expands to catch handlers
+  }
+
+  RTC_API unsigned int rtcGetGeometryFace(RTCGeometry hgeometry, unsigned int edgeID)
+  { /* API entry: returns getFace(edgeID) of the geometry; the handle is not null-checked here */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryFace);
+    return geom->getFace(edgeID);
+    RTC_CATCH_END2(geom);
+    return -1; // converts to the largest unsigned int; only reachable if RTC_CATCH_END2 expands to catch handlers
+  }
+
+  RTC_API unsigned int rtcGetGeometryNextHalfEdge(RTCGeometry hgeometry, unsigned int edgeID)
+  { /* API entry: returns getNextHalfEdge(edgeID) of the geometry; the handle is not null-checked here */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryNextHalfEdge);
+    return geom->getNextHalfEdge(edgeID);
+    RTC_CATCH_END2(geom);
+    return -1; // converts to the largest unsigned int; only reachable if RTC_CATCH_END2 expands to catch handlers
+  }
+
+  RTC_API unsigned int rtcGetGeometryPreviousHalfEdge(RTCGeometry hgeometry, unsigned int edgeID)
+  { /* API entry: returns getPreviousHalfEdge(edgeID) of the geometry; the handle is not null-checked here */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryPreviousHalfEdge);
+    return geom->getPreviousHalfEdge(edgeID);
+    RTC_CATCH_END2(geom);
+    return -1; // converts to the largest unsigned int; only reachable if RTC_CATCH_END2 expands to catch handlers
+  }
+
+  RTC_API unsigned int rtcGetGeometryOppositeHalfEdge(RTCGeometry hgeometry, unsigned int topologyID, unsigned int edgeID)
+  { /* API entry: returns getOppositeHalfEdge(topologyID, edgeID) of the geometry; the handle is not null-checked here */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryOppositeHalfEdge);
+    return geom->getOppositeHalfEdge(topologyID,edgeID);
+    RTC_CATCH_END2(geom);
+    return -1; // converts to the largest unsigned int; only reachable if RTC_CATCH_END2 expands to catch handlers
+  }
+
+  RTC_API void rtcSetGeometryOccludedFunction (RTCGeometry hgeometry, RTCOccludedFunctionN occluded)
+  { /* API entry: forwards the occluded callback to the geometry's setOccludedFunctionN */
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryOccludedFunction); // trace label now matches this API function's name
+    RTC_VERIFY_HANDLE(hgeometry); // a null handle is reported through throw_RTCError
+    geometry->setOccludedFunctionN(occluded);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryIntersectFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter)
+  { /* API entry: forwards the filter callback to the geometry's setIntersectionFilterFunctionN */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryIntersectFilterFunction);
+    RTC_VERIFY_HANDLE(hgeometry); // a null handle is reported through throw_RTCError
+    geom->setIntersectionFilterFunctionN(filter);
+    RTC_CATCH_END2(geom);
+  }
+
+  RTC_API void rtcSetGeometryOccludedFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter)
+  { /* API entry: forwards the filter callback to the geometry's setOcclusionFilterFunctionN */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryOccludedFilterFunction);
+    RTC_VERIFY_HANDLE(hgeometry); // a null handle is reported through throw_RTCError
+    geom->setOcclusionFilterFunctionN(filter);
+    RTC_CATCH_END2(geom);
+  }
+
+  RTC_API void rtcInterpolate(const RTCInterpolateArguments* const args)
+  { /* API entry: hands the argument block to interpolate() of the geometry named in args->geometry */
+    Geometry* geom = reinterpret_cast<Geometry*>(args->geometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcInterpolate);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(args->geometry); // the handle check is compiled in for DEBUG builds only
+#endif
+    geom->interpolate(args);
+    RTC_CATCH_END2(geom);
+  }
+
+  RTC_API void rtcInterpolateN(const RTCInterpolateNArguments* const args)
+  { /* API entry: hands the argument block to interpolateN() of the geometry named in args->geometry */
+    Geometry* geom = reinterpret_cast<Geometry*>(args->geometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcInterpolateN);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(args->geometry); // the handle check is compiled in for DEBUG builds only
+#endif
+    geom->interpolateN(args);
+    RTC_CATCH_END2(geom);
+  }
+
+  RTC_API void rtcCommitGeometry (RTCGeometry hgeometry)
+  { /* API entry: calls commit() on the geometry behind the handle */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcCommitGeometry);
+    RTC_VERIFY_HANDLE(hgeometry); // a null handle is reported through throw_RTCError
+    geom->commit(); // this API function is void, so there is no value to pass on
+    RTC_CATCH_END2(geom);
+  }
+
+  RTC_API unsigned int rtcAttachGeometry (RTCScene hscene, RTCGeometry hgeometry)
+  { /* API entry: binds the geometry to the scene with RTC_INVALID_GEOMETRY_ID as requested ID and returns bind()'s result */
+    Scene* targetScene = reinterpret_cast<Scene*>(hscene);
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcAttachGeometry);
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_HANDLE(hgeometry);
+    if (targetScene->device != geom->device) // scene and geometry must refer to the same device
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices");
+    return targetScene->bind(RTC_INVALID_GEOMETRY_ID,geom);
+    RTC_CATCH_END2(targetScene);
+    return -1; // converts to the largest unsigned int; only reachable if RTC_CATCH_END2 expands to catch handlers
+  }
+
+  RTC_API void rtcAttachGeometryByID (RTCScene hscene, RTCGeometry hgeometry, unsigned int geomID)
+  { /* API entry: binds the geometry to the scene under the caller-supplied geomID */
+    Scene* targetScene = reinterpret_cast<Scene*>(hscene);
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcAttachGeometryByID);
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_GEOMID(geomID); // RTC_INVALID_GEOMETRY_ID is not accepted as an explicit ID
+    if (targetScene->device != geom->device) // scene and geometry must refer to the same device
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices");
+    targetScene->bind(geomID,geom);
+    RTC_CATCH_END2(targetScene);
+  }
+  
+  RTC_API void rtcDetachGeometry (RTCScene hscene, unsigned int geomID)
+  { /* API entry: asks the scene to detach the geometry registered under geomID */
+    Scene* targetScene = reinterpret_cast<Scene*>(hscene);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcDetachGeometry);
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_GEOMID(geomID); // RTC_INVALID_GEOMETRY_ID is rejected
+    targetScene->detachGeometry(geomID);
+    RTC_CATCH_END2(targetScene);
+  }
+
+  RTC_API void rtcRetainGeometry (RTCGeometry hgeometry)
+  { /* API entry: increments the reference count of the geometry via refInc() */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcRetainGeometry);
+    RTC_VERIFY_HANDLE(hgeometry); // a null handle is reported through throw_RTCError
+    geom->refInc();
+    RTC_CATCH_END2(geom);
+  }
+  
+  RTC_API void rtcReleaseGeometry (RTCGeometry hgeometry)
+  { /* API entry: decrements the reference count of the geometry via refDec() */
+    Geometry* geom = reinterpret_cast<Geometry*>(hgeometry);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcReleaseGeometry);
+    RTC_VERIFY_HANDLE(hgeometry); // a null handle is reported through throw_RTCError
+    geom->refDec();
+    RTC_CATCH_END2(geom);
+  }
+
+  RTC_API RTCGeometry rtcGetGeometry (RTCScene hscene, unsigned int geomID)
+  { /* API entry: returns the scene's entry for geomID as an API handle */
+    Scene* sourceScene = reinterpret_cast<Scene*>(hscene);
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometry);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene); // argument checks are compiled in for DEBUG builds only
+    RTC_VERIFY_GEOMID(geomID);
+#endif
+    return (RTCGeometry) sourceScene->get(geomID);
+    RTC_CATCH_END2(sourceScene);
+    return nullptr; // only reachable if RTC_CATCH_END2 expands to catch handlers
+  }
+
+RTC_NAMESPACE_END
diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.h b/thirdparty/embree-aarch64/kernels/common/rtcore.h
new file mode 100644
index 0000000000..4b070e122b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/rtcore.h
@@ -0,0 +1,142 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../include/embree3/rtcore.h"
+RTC_NAMESPACE_USE
+
+namespace embree
+{  
+  /*! decoding of intersection flags */
+  __forceinline bool isCoherent  (RTCIntersectContextFlags flags) { const auto coherence = flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT; return coherence == RTC_INTERSECT_CONTEXT_FLAG_COHERENT; } // masked value equals the COHERENT flag
+  __forceinline bool isIncoherent(RTCIntersectContextFlags flags) { const auto coherence = flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT; return coherence == RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT; } // masked value equals the INCOHERENT constant
+
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR >= 8) // USE_TASK_ARENA is 1 only for TBB builds with interface major version >= 8
+#  define USE_TASK_ARENA 1
+#else // not a TBB build, or the TBB interface is older than major version 8
+#  define USE_TASK_ARENA 0
+#endif // USE_TASK_ARENA
+
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION >= 11009) // TBB 2019 Update 9
+#  define TASKING_TBB_USE_TASK_ISOLATION 1
+#else // not a TBB build, or TBB_INTERFACE_VERSION is below 11009
+#  define TASKING_TBB_USE_TASK_ISOLATION 0
+#endif // TASKING_TBB_USE_TASK_ISOLATION
+
+/*! Macros used in the rtcore API implementation. In this port the try/catch wrappers are defined empty; the upstream definitions are kept below as comments. */
+// -- GODOT start --
+// #define RTC_CATCH_BEGIN try {
+#define RTC_CATCH_BEGIN
+// Upstream RTC_CATCH_END (commented out): passed each caught exception to Device::process_error for the given device.
+// #define RTC_CATCH_END(device)                                                \
+//   } catch (std::bad_alloc&) {                                                   \
+//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+//   } catch (rtcore_error& e) {                                                   \
+//     Device::process_error(device,e.error,e.what());                             \
+//   } catch (std::exception& e) {                                                 \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+//   } catch (...) {                                                               \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+//   }
+#define RTC_CATCH_END(device)
+// Upstream RTC_CATCH_END2 (commented out): same handlers, with the device read from the argument's `device` member when the argument is non-null.
+// #define RTC_CATCH_END2(scene)                                                \
+//   } catch (std::bad_alloc&) {                                                   \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+//   } catch (rtcore_error& e) {                                                   \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,e.error,e.what());                             \
+//   } catch (std::exception& e) {                                                 \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+//   } catch (...) {                                                               \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+//   }
+#define RTC_CATCH_END2(scene)
+// Upstream RTC_CATCH_END2_FALSE (commented out): as RTC_CATCH_END2, but every handler also returned false; the replacement keeps only an unconditional `return false;`.
+// #define RTC_CATCH_END2_FALSE(scene)                                             \
+//   } catch (std::bad_alloc&) {                                                   \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+//     return false;                                                               \
+//   } catch (rtcore_error& e) {                                                   \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,e.error,e.what());                             \
+//     return false;                                                               \
+//   } catch (std::exception& e) {                                                 \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+//     return false;                                                               \
+//   } catch (...) {                                                               \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+//     return false;                                                               \
+//   }
+#define RTC_CATCH_END2_FALSE(scene) return false;
+// -- GODOT end --
+/*! Argument checks for API entry points; failures go through throw_RTCError. Arguments are parenthesised and every body is braced. */
+#define RTC_VERIFY_HANDLE(handle)                                  \
+  if ((handle) == nullptr) {                                       \
+    throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \
+  }
+/*! rejects the reserved value RTC_INVALID_GEOMETRY_ID */
+#define RTC_VERIFY_GEOMID(id)                                      \
+  if ((id) == RTC_INVALID_GEOMETRY_ID) {                           \
+    throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \
+  }
+/*! rejects id values greater than upper (upper itself is accepted) */
+#define RTC_VERIFY_UPPER(id,upper)                                 \
+  if ((id) > (upper)) {                                            \
+    throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \
+  }
+/*! rejects id values outside the closed interval [lower, upper]; braced like its siblings so it cannot capture a following else */
+#define RTC_VERIFY_RANGE(id,lower,upper)                                  \
+  if ((id) < (lower) || (id) > (upper)) {                                 \
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"argument out of bounds"); }
+  
+#if 0 // change to 1 to print the stringified RTC_TRACE argument of every API call to std::cout
+#define RTC_TRACE(x) std::cout << #x << std::endl;
+#else // tracing disabled: RTC_TRACE expands to nothing
+#define RTC_TRACE(x) 
+#endif // RTC_TRACE
+
+// -- GODOT begin --
+//   /*! used to throw embree API errors */
+//   struct rtcore_error : public std::exception
+//   {
+//     __forceinline rtcore_error(RTCError error, const std::string& str)
+//       : error(error), str(str) {}
+//     
+//     ~rtcore_error() throw() {}
+//     
+//     const char* what () const throw () {
+//       return str.c_str();
+//     }
+//     
+//     RTCError error;
+//     std::string str;
+//   };
+// -- GODOT end --
+
+#if defined(DEBUG) // only report file and line in debug mode
+  // -- GODOT begin --
+  // #define throw_RTCError(error,str) \
+  //   throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+  #define throw_RTCError(error,str) \
+    printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort();
+  // -- GODOT end --
+#else
+  // -- GODOT begin --
+  // #define throw_RTCError(error,str) \
+  //   throw rtcore_error(error,str);
+  #define throw_RTCError(error,str) \
+    abort();
+  // -- GODOT end --
+#endif
+/*! True when settings.byteSize reaches past the end of `member` in RTCBuildArguments. NOTE(review): the comparison is strict, so a byteSize ending exactly at the member counts as "not present" — confirm this is intended. */
+#define RTC_BUILD_ARGUMENTS_HAS(settings,member) \
+  (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member))) 
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp b/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp
new file mode 100644
index 0000000000..6bb96bba07
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp
@@ -0,0 +1,442 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#define RTC_EXPORT_API
+
+#include "default.h"
+#include "device.h"
+#include "scene.h"
+#include "context.h"
+#include "alloc.h"
+
+#include "../builders/bvh_builder_sah.h"
+#include "../builders/bvh_builder_morton.h"
+
+namespace embree
+{ 
+  namespace isa // FIXME: support more ISAs for builders
+  {
+    struct BVH : public RefCount
+    { /* State the build functions obtain by casting arguments->bvh: a device reference, an allocator and Morton scratch arrays. */
+      BVH (Device* owner)
+        : device(owner), allocator(owner,true), morton_src(owner,0), morton_tmp(owner,0)
+      {
+        this->device->refInc(); // reference is held for the lifetime of this object
+      }
+
+      ~BVH() {
+        this->device->refDec(); // matches the refInc() in the constructor
+      }
+
+    public:
+      Device* device;                                  //!< device passed to the constructor
+      FastAllocator allocator;                         //!< hands out the cached allocators used by the build functions
+      mvector<BVHBuilderMorton::BuildPrim> morton_src; //!< resized to the primitive count by rtcBuildBVHMorton
+      mvector<BVHBuilderMorton::BuildPrim> morton_tmp; //!< second array of the same size passed to the Morton builder
+    };
+
+    void* rtcBuildBVHMorton(const RTCBuildArguments* arguments)
+    { /* Runs BVHBuilderMorton::build over arguments->primitives using the user callbacks in `arguments`; returns the root node pointer. */
+      BVH* bvh = (BVH*) arguments->bvh;
+      RTCBuildPrimitive* prims_i =  arguments->primitives;
+      size_t primitiveCount = arguments->primitiveCount;
+      RTCCreateNodeFunction createNode = arguments->createNode;
+      RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren;
+      RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds;
+      RTCCreateLeafFunction createLeaf = arguments->createLeaf;
+      RTCProgressMonitorFunction buildProgress = arguments->buildProgress;
+      void* userPtr = arguments->userPtr;
+        
+      std::atomic<size_t> progress(0); // running total of the counts passed to the progress lambda below
+      
+      /* initialize temporary arrays for morton builder */
+      PrimRef* prims = (PrimRef*) prims_i; // same memory viewed as PrimRef; presumably the two types are layout-compatible — TODO confirm
+      mvector<BVHBuilderMorton::BuildPrim>& morton_src = bvh->morton_src;
+      mvector<BVHBuilderMorton::BuildPrim>& morton_tmp = bvh->morton_tmp;
+      morton_src.resize(primitiveCount);
+      morton_tmp.resize(primitiveCount);
+
+      /* compute centroid bounds: extend a box by center2() of every primitive's bounds, merging per-range results */
+      const BBox3fa centBounds = parallel_reduce ( size_t(0), primitiveCount, BBox3fa(empty), [&](const range<size_t>& r) -> BBox3fa {
+
+          BBox3fa bounds(empty);
+          for (size_t i=r.begin(); i<r.end(); i++) 
+            bounds.extend(prims[i].bounds().center2());
+          return bounds;
+        }, BBox3fa::merge);
+      
+      /* compute morton codes: one generator per range writes into morton_src starting at r.begin() */
+      BVHBuilderMorton::MortonCodeMapping mapping(centBounds);
+      parallel_for ( size_t(0), primitiveCount, [&](const range<size_t>& r) {
+          BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton_src[r.begin()]);
+          for (size_t i=r.begin(); i<r.end(); i++) {
+            generator(prims[i].bounds(),(unsigned) i); // the primitive index is passed as unsigned
+          }
+        });
+
+      /* start morton build */
+      std::pair<void*,BBox3fa> root = BVHBuilderMorton::build<std::pair<void*,BBox3fa>>(
+        
+        /* thread local allocator for fast allocations */
+        [&] () -> FastAllocator::CachedAllocator { 
+          return bvh->allocator.getCachedAllocator();
+        },
+        
+        /* lambda function that allocates BVH nodes through the user's createNode callback */
+        [&] ( const FastAllocator::CachedAllocator& alloc, size_t N ) -> void* {
+          return createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr);
+        },
+        
+        /* lambda function that sets bounds and children of a node and returns the node with the union of its child bounds */
+        [&] (void* node, const std::pair<void*,BBox3fa>* children, size_t N) -> std::pair<void*,BBox3fa>
+        {
+          BBox3fa bounds = empty;
+          void* childptrs[BVHBuilderMorton::MAX_BRANCHING_FACTOR];
+          const RTCBounds* cbounds[BVHBuilderMorton::MAX_BRANCHING_FACTOR];
+          for (size_t i=0; i<N; i++) {
+            bounds.extend(children[i].second);
+            childptrs[i] = children[i].first;
+            cbounds[i] = (const RTCBounds*)&children[i].second; // BBox3fa viewed as RTCBounds — presumably layout-compatible; TODO confirm
+          }
+          setNodeBounds(node,cbounds,(unsigned int)N,userPtr);
+          setNodeChildren(node,childptrs, (unsigned int)N,userPtr);
+          return std::make_pair(node,bounds);
+        },
+        
+        /* lambda function that creates BVH leaves from a copy of the primitives in the given range of morton_src */
+        [&]( const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) -> std::pair<void*,BBox3fa>
+        {
+	  RTCBuildPrimitive localBuildPrims[RTC_BUILD_MAX_PRIMITIVES_PER_LEAF]; // NOTE(review): assumes current.size() never exceeds this capacity — confirm the builder settings enforce it
+	  BBox3fa bounds = empty;
+	  for (size_t i=0;i<current.size();i++)
+	    {
+	      const size_t id = morton_src[current.begin()+i].index; // index of the original primitive
+	      bounds.extend(prims[id].bounds());
+	      localBuildPrims[i] = prims_i[id];
+	    }
+          void* node = createLeaf((RTCThreadLocalAllocator)&alloc,localBuildPrims,current.size(),userPtr);
+          return std::make_pair(node,bounds);
+        },
+        
+        /* lambda that calculates the bounds for some primitive */
+        [&] (const BVHBuilderMorton::BuildPrim& morton) -> BBox3fa {
+          return prims[morton.index].bounds();
+        },
+        
+        /* progress monitor function: returns true when no callback is set, otherwise the callback's result */
+        [&] (size_t dn) {
+          if (!buildProgress) return true;
+          const size_t n = progress.fetch_add(dn)+dn;
+          const double f = std::min(1.0,double(n)/double(primitiveCount)); // reported fraction is capped at 1.0
+          return buildProgress(userPtr,f);
+        },
+        
+        morton_src.data(),morton_tmp.data(),primitiveCount,
+        *arguments);
+
+      bvh->allocator.cleanup();
+      return root.first; // only the node pointer of the (node, bounds) pair is returned
+    }
+
+    void* rtcBuildBVHBinnedSAH(const RTCBuildArguments* arguments)
+    { /* Runs BVHBuilderBinnedSAH::build over arguments->primitives using the user callbacks in `arguments`; returns the root node pointer. */
+      BVH* bvh = (BVH*) arguments->bvh;
+      RTCBuildPrimitive* prims =  arguments->primitives;
+      size_t primitiveCount = arguments->primitiveCount;
+      RTCCreateNodeFunction createNode = arguments->createNode;
+      RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren;
+      RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds;
+      RTCCreateLeafFunction createLeaf = arguments->createLeaf;
+      RTCProgressMonitorFunction buildProgress = arguments->buildProgress;
+      void* userPtr = arguments->userPtr;
+      
+      std::atomic<size_t> progress(0); // running total of the counts passed to the progress lambda below
+  
+      /* calculate priminfo: accumulate the bounds of all primitives, merging per-range results */
+      auto computeBounds = [&](const range<size_t>& r) -> CentGeomBBox3fa
+        {
+          CentGeomBBox3fa bounds(empty);
+          for (size_t j=r.begin(); j<r.end(); j++)
+            bounds.extend((BBox3fa&)prims[j]); // RTCBuildPrimitive viewed as BBox3fa — presumably layout-compatible; TODO confirm
+          return bounds;
+        };
+      const CentGeomBBox3fa bounds = 
+        parallel_reduce(size_t(0),primitiveCount,size_t(1024),size_t(1024),CentGeomBBox3fa(empty), computeBounds, CentGeomBBox3fa::merge2);
+
+      const PrimInfo pinfo(0,primitiveCount,bounds);
+      
+      /* build BVH */
+      void* root = BVHBuilderBinnedSAH::build<void*>(
+        
+        /* thread local allocator for fast allocations */
+        [&] () -> FastAllocator::CachedAllocator { 
+          return bvh->allocator.getCachedAllocator();
+        },
+
+        /* lambda function that creates BVH nodes and reports the child bounds through setNodeBounds */
+        [&](BVHBuilderBinnedSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void*
+        {
+          void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr);
+          const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR];
+          for (size_t i=0; i<N; i++) cbounds[i] = (const RTCBounds*) &children[i].prims.geomBounds;
+          setNodeBounds(node,cbounds, (unsigned int)N,userPtr);
+          return node;
+        },
+
+        /* lambda function that updates BVH nodes: passes the child pointers to setNodeChildren; precord and crecords are unused */
+        [&](const BVHBuilderBinnedSAH::BuildRecord& precord, const BVHBuilderBinnedSAH::BuildRecord* crecords, void* node, void** children, const size_t N) -> void* {
+          setNodeChildren(node,children, (unsigned int)N,userPtr);
+          return node;
+        },
+        
+        /* lambda function that creates BVH leaves directly from the primitives in the given range (no copy) */
+        [&](const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> void* {
+          return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr);
+        },
+        
+        /* progress monitor function: returns true when no callback is set, otherwise the callback's result */
+        [&] (size_t dn) {
+          if (!buildProgress) return true;
+          const size_t n = progress.fetch_add(dn)+dn;
+          const double f = std::min(1.0,double(n)/double(primitiveCount)); // reported fraction is capped at 1.0
+          return buildProgress(userPtr,f);
+        },
+        
+        (PrimRef*)prims,pinfo,*arguments);
+        
+      bvh->allocator.cleanup();
+      return root;
+    }
+
+    static __forceinline const std::pair<CentGeomBBox3fa,unsigned int> mergePair(const std::pair<CentGeomBBox3fa,unsigned int>& a, const std::pair<CentGeomBBox3fa,unsigned int>& b) {
+      /* combines two partial results: bounds through CentGeomBBox3fa::merge2, geometry IDs by taking the larger one */
+      const unsigned int largestGeomID = max(a.second,b.second);
+      return std::pair<CentGeomBBox3fa,unsigned int>(CentGeomBBox3fa::merge2(a.first,b.first),largestGeomID);
+    }
+
+    void* rtcBuildBVHSpatialSAH(const RTCBuildArguments* arguments)
+    {
+      BVH* bvh = (BVH*) arguments->bvh;
+      RTCBuildPrimitive* prims =  arguments->primitives;
+      size_t primitiveCount = arguments->primitiveCount;
+      RTCCreateNodeFunction createNode = arguments->createNode;
+      RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren;
+      RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds;
+      RTCCreateLeafFunction createLeaf = arguments->createLeaf;
+      RTCSplitPrimitiveFunction splitPrimitive = arguments->splitPrimitive;
+      RTCProgressMonitorFunction buildProgress = arguments->buildProgress;
+      void* userPtr = arguments->userPtr;
+      
+      std::atomic<size_t> progress(0);
+  
+      /* calculate priminfo */
+
+      auto computeBounds = [&](const range<size_t>& r) -> std::pair<CentGeomBBox3fa,unsigned int>
+        {
+          CentGeomBBox3fa bounds(empty);
+          unsigned maxGeomID = 0;
+          for (size_t j=r.begin(); j<r.end(); j++)
+          {
+            bounds.extend((BBox3fa&)prims[j]);
+            maxGeomID = max(maxGeomID,prims[j].geomID);
+          }
+          return std::pair<CentGeomBBox3fa,unsigned int>(bounds,maxGeomID);
+        };
+
+
+      const std::pair<CentGeomBBox3fa,unsigned int> pair = 
+        parallel_reduce(size_t(0),primitiveCount,size_t(1024),size_t(1024),std::pair<CentGeomBBox3fa,unsigned int>(CentGeomBBox3fa(empty),0), computeBounds, mergePair);
+
+      CentGeomBBox3fa bounds = pair.first;
+      const unsigned int maxGeomID = pair.second;
+      
+      if (unlikely(maxGeomID >= ((unsigned int)1 << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS))))
+        {
+          /* fallback code for max geomID larger than threshold */
+          return rtcBuildBVHBinnedSAH(arguments);
+        }
+
+      const PrimInfo pinfo(0,primitiveCount,bounds);
+
+      /* function that splits a build primitive */
+      struct Splitter
+      {
+        Splitter (RTCSplitPrimitiveFunction splitPrimitive, unsigned geomID, unsigned primID, void* userPtr)
+          : splitPrimitive(splitPrimitive), geomID(geomID), primID(primID), userPtr(userPtr) {}
+        
+        __forceinline void operator() (PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const 
+        {
+          prim.geomIDref() &= BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK;
+          splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr);
+          left_o.geomIDref()  = geomID; left_o.primIDref()  = primID;
+          right_o.geomIDref() = geomID; right_o.primIDref() = primID;
+        }
+
+        __forceinline void operator() (const BBox3fa& box, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const 
+        {
+          PrimRef prim(box,geomID & BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK,primID);
+          splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr);
+        }
+   
+        RTCSplitPrimitiveFunction splitPrimitive;
+        unsigned geomID;
+        unsigned primID;
+        void* userPtr;
+      };
+
+      /* build BVH */
+      void* root = BVHBuilderBinnedFastSpatialSAH::build<void*>(
+        
+        /* thread local allocator for fast allocations */
+        [&] () -> FastAllocator::CachedAllocator { 
+          return bvh->allocator.getCachedAllocator();
+        },
+
+        /* lambda function that creates BVH nodes */
+        [&] (BVHBuilderBinnedFastSpatialSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void*
+        {
+          void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr);
+          const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR];
+          for (size_t i=0; i<N; i++) cbounds[i] = (const RTCBounds*) &children[i].prims.geomBounds;
+          setNodeBounds(node,cbounds, (unsigned int)N,userPtr);
+          return node;
+        },
+
+        /* lambda function that updates BVH nodes */
+        [&] (const BVHBuilderBinnedFastSpatialSAH::BuildRecord& precord, const BVHBuilderBinnedFastSpatialSAH::BuildRecord* crecords, void* node, void** children, const size_t N) -> void* {
+          setNodeChildren(node,children, (unsigned int)N,userPtr);
+          return node;
+        },
+        
+        /* lambda function that creates BVH leaves */
+        [&] (const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> void* {
+          return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr);
+        },
+        
+        /* returns the splitter */
+        [&] ( const PrimRef& prim ) -> Splitter {
+          return Splitter(splitPrimitive,prim.geomID(),prim.primID(),userPtr);
+        },
+
+        /* progress monitor function */
+        [&] (size_t dn) {
+          if (!buildProgress) return true;
+          const size_t n = progress.fetch_add(dn)+dn;
+          const double f = std::min(1.0,double(n)/double(primitiveCount));
+          return buildProgress(userPtr,f);
+        },
+        
+        (PrimRef*)prims,
+        arguments->primitiveArrayCapacity,
+        pinfo,*arguments);
+        
+      bvh->allocator.cleanup();
+      return root;
+    }
+  }
+}
+
+using namespace embree;
+using namespace embree::isa;
+
+RTC_NAMESPACE_BEGIN
+
+    RTC_API RTCBVH rtcNewBVH(RTCDevice device) /* creates a BVH object for the device; returns its handle, or nullptr if the guarded section failed */
+    {
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcNewBVH); /* trace under this function's own name */
+      RTC_VERIFY_HANDLE(device);
+      BVH* bvh = new BVH((Device*)device);
+      return (RTCBVH) bvh->refInc(); /* the handle is returned with a reference taken */
+      RTC_CATCH_END((Device*)device);
+      return nullptr; /* reached only when the guarded section above did not return */
+    }
+
+    RTC_API void* rtcBuildBVH(const RTCBuildArguments* arguments)
+    {
+      /* arguments is only verified below, so resolve the BVH and its device without dereferencing a null pointer */
+      BVH* bvh = arguments ? (BVH*) arguments->bvh : nullptr;
+      Device* device = bvh ? bvh->device : nullptr;
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcBuildBVH);
+      RTC_VERIFY_HANDLE(arguments);
+      RTC_VERIFY_HANDLE(bvh);
+      RTC_VERIFY_HANDLE(arguments->createNode);
+      RTC_VERIFY_HANDLE(arguments->setNodeChildren);
+      RTC_VERIFY_HANDLE(arguments->setNodeBounds);
+      RTC_VERIFY_HANDLE(arguments->createLeaf);
+
+      if (arguments->primitiveArrayCapacity < arguments->primitiveCount)
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"primitiveArrayCapacity must be greater or equal to primitiveCount");
+
+      /* initialize the allocator */
+      bvh->allocator.init_estimate(arguments->primitiveCount*sizeof(BBox3fa));
+      bvh->allocator.reset();
+
+      /* switch between different builders based on quality level; spatial splits need a split callback and spare array capacity */
+      void* root = nullptr;
+      if (arguments->buildQuality == RTC_BUILD_QUALITY_LOW)
+        root = rtcBuildBVHMorton(arguments);
+      else if (arguments->buildQuality == RTC_BUILD_QUALITY_MEDIUM)
+        root = rtcBuildBVHBinnedSAH(arguments);
+      else if (arguments->buildQuality == RTC_BUILD_QUALITY_HIGH) {
+        if (arguments->splitPrimitive == nullptr || arguments->primitiveArrayCapacity <= arguments->primitiveCount)
+          root = rtcBuildBVHBinnedSAH(arguments);
+        else
+          root = rtcBuildBVHSpatialSAH(arguments);
+      }
+      else
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid build quality");
+
+      /* if we are in dynamic mode, then do not clear temporary data; this has to run after the build, so the root is collected above */
+      if (!(arguments->buildFlags & RTC_BUILD_FLAG_DYNAMIC)) { bvh->morton_src.clear(); bvh->morton_tmp.clear(); }
+      return root;
+
+      RTC_CATCH_END(device);
+      return nullptr; /* reached only when the guarded section above did not return */
+    }
+
+    RTC_API void* rtcThreadLocalAlloc(RTCThreadLocalAllocator localAllocator, size_t bytes, size_t align) /* allocates bytes with the given alignment through the passed thread local allocator */
+    {
+      FastAllocator::CachedAllocator* alloc = (FastAllocator::CachedAllocator*) localAllocator; /* the opaque handle is a CachedAllocator pointer */
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcThreadLocalAlloc);
+      return alloc->malloc0(bytes,align);
+      RTC_CATCH_END(alloc->alloc->getDevice()); /* NOTE(review): localAllocator is never null-checked in this function */
+      return nullptr; /* reached only when the guarded section above did not return */
+    }
+
+    RTC_API void rtcMakeStaticBVH(RTCBVH hbvh) /* clears the morton_src and morton_tmp buffers kept by the BVH */
+    {
+      BVH* bvh = (BVH*) hbvh;
+      Device* device = bvh ? bvh->device : nullptr; /* resolved up front: error handling must not dereference a null bvh */
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcMakeStaticBVH); /* trace under this function's own name */
+      RTC_VERIFY_HANDLE(hbvh);
+      bvh->morton_src.clear(); bvh->morton_tmp.clear();
+      RTC_CATCH_END(device);
+    }
+
+    RTC_API void rtcRetainBVH(RTCBVH hbvh) /* increments the reference count of the BVH */
+    {
+      BVH* bvh = (BVH*) hbvh;
+      Device* device = bvh ? bvh->device : nullptr; /* resolved before verification so a null handle can still be reported */
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcRetainBVH);
+      RTC_VERIFY_HANDLE(hbvh);
+      bvh->refInc();
+      RTC_CATCH_END(device);
+    }
+    
+    RTC_API void rtcReleaseBVH(RTCBVH hbvh) /* decrements the reference count of the BVH */
+    {
+      BVH* bvh = (BVH*) hbvh;
+      Device* device = bvh ? bvh->device : nullptr; /* resolved before verification so a null handle can still be reported */
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcReleaseBVH);
+      RTC_VERIFY_HANDLE(hbvh);
+      bvh->refDec();
+      RTC_CATCH_END(device);
+    }
+
+RTC_NAMESPACE_END
diff --git a/thirdparty/embree-aarch64/kernels/common/scene.cpp b/thirdparty/embree-aarch64/kernels/common/scene.cpp
new file mode 100644
index 0000000000..1e23aeb415
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene.cpp
@@ -0,0 +1,976 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "scene.h"
+
+#include "../bvh/bvh4_factory.h"
+#include "../bvh/bvh8_factory.h"
+#include "../../common/algorithms/parallel_reduce.h"
+
+namespace embree
+{
+  /* stubs that only raise RTC_ERROR_INVALID_OPERATION; the Scene constructor installs missing_rtcCommit as the initial intersectors */
+  void missing_rtcCommit()      { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); }
+  void invalid_rtcIntersect1()  { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect and rtcOccluded not enabled"); }
+  void invalid_rtcIntersect4()  { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect4 and rtcOccluded4 not enabled"); }
+  void invalid_rtcIntersect8()  { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect8 and rtcOccluded8 not enabled"); }
+  void invalid_rtcIntersect16() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect16 and rtcOccluded16 not enabled"); }
+  void invalid_rtcIntersectN()  { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectN and rtcOccludedN not enabled"); }
+
+  Scene::Scene (Device* device) /* creates an empty scene with default flags and takes a reference on the device */
+    : device(device),
+      flags_modified(true), enabled_geometry_types(0),
+      scene_flags(RTC_SCENE_FLAG_NONE),
+      quality_flags(RTC_BUILD_QUALITY_MEDIUM),
+      is_build(false), modified(true),
+      progressInterface(this), progress_monitor_function(nullptr), progress_monitor_ptr(nullptr), progress_monitor_counter(0)
+  {
+    device->refInc(); /* matched by the refDec in the destructor */
+
+    intersectors = Accel::Intersectors(missing_rtcCommit); /* missing_rtcCommit raises "scene not committed" */
+
+    /* one can overwrite flags through device for debugging; a device value of -1 leaves the default untouched */
+    if (device->quality_flags != -1)
+      quality_flags = (RTCBuildQuality) device->quality_flags;
+    if (device->scene_flags != -1)
+      scene_flags = (RTCSceneFlags) device->scene_flags;
+  }
+
+  Scene::~Scene() noexcept /* drops the device reference taken by the constructor */
+  {
+    device->refDec();
+  }
+
+  void Scene::printStatistics()
+  {
+    /* find the largest time step count of any geometry in the scene */
+    unsigned max_time_steps = 0;
+    for (size_t geomID=0; geomID<size(); geomID++)
+      if (get(geomID))
+        max_time_steps = max(max_time_steps,get(geomID)->numTimeSteps);
+
+    /* one counter row per geometry type, one column per time segment count */
+    std::vector<size_t> statistics[Geometry::GTY_END];
+    for (auto& row : statistics)
+      row.resize(max_time_steps);
+
+    /* accumulate geometry sizes per (type, time segment count) pair */
+    for (size_t geomID=0; geomID<size(); geomID++)
+    {
+      if (!get(geomID)) continue;
+      const int ty = get(geomID)->getType();
+      assert(ty<Geometry::GTY_END);
+      const int timesegments = get(geomID)->numTimeSegments();
+      assert((unsigned int)timesegments < max_time_steps);
+      statistics[ty][timesegments] += get(geomID)->size();
+    }
+
+    /* table header with one column per time segment count, followed by a separator line */
+    std::cout << std::setw(23) << "segments" << ": ";
+    for (size_t col=0; col<max_time_steps; col++)
+      std::cout << std::setw(10) << col;
+    std::cout << std::endl;
+
+    std::cout << "-------------------------";
+    for (size_t col=0; col<max_time_steps; col++)
+      std::cout << "----------";
+    std::cout << std::endl;
+
+    /* one row per geometry type that has a non-empty name */
+    for (size_t type=0; type<Geometry::GTY_END; type++)
+    {
+      if (std::string(Geometry::gtype_names[type]).empty()) continue;
+      std::cout << std::setw(23) << Geometry::gtype_names[type] << ": ";
+      for (size_t col=0; col<max_time_steps; col++)
+        std::cout << std::setw(10) << statistics[type][col];
+      std::cout << std::endl;
+    }
+  }
+
+  void Scene::createTriangleAccel() /* adds the triangle accel selected by device->tri_accel; no-op without EMBREE_GEOMETRY_TRIANGLE */
+  {
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    if (device->tri_accel == "default")
+    {
+      if (quality_flags != RTC_BUILD_QUALITY_LOW)
+      {
+        int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); /* bit 1: compact accel, bit 0: robust accel */
+        switch (mode) {
+        case /*0b00*/ 0:
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+	  {
+            if (quality_flags == RTC_BUILD_QUALITY_HIGH)
+              accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+            else
+              accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+          }
+          else
+#endif
+          {
+            if (quality_flags == RTC_BUILD_QUALITY_HIGH)
+              accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+            else
+              accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+          }
+          break;
+
+        case /*0b01*/ 1:
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+            accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+          else
+#endif
+            accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+
+          break;
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+      else /* dynamic: RTC_BUILD_QUALITY_LOW uses the DYNAMIC build variant */
+      {
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+	  {
+            int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); /* bit 1: compact accel, bit 0: robust accel */
+            switch (mode) {
+            case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST  )); break;
+            case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST  )); break;
+            case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            }
+          }
+          else
+#endif
+          {
+            int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); /* bit 1: compact accel, bit 0: robust accel */
+            switch (mode) {
+            case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST  )); break;
+            case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST  )); break;
+            case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            }
+          }
+      }
+    }
+    else if (device->tri_accel == "bvh4.triangle4")       accels_add(device->bvh4_factory->BVH4Triangle4 (this));
+    else if (device->tri_accel == "bvh4.triangle4v")      accels_add(device->bvh4_factory->BVH4Triangle4v(this));
+    else if (device->tri_accel == "bvh4.triangle4i")      accels_add(device->bvh4_factory->BVH4Triangle4i(this));
+    else if (device->tri_accel == "qbvh4.triangle4i")     accels_add(device->bvh4_factory->BVH4QuantizedTriangle4i(this));
+
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->tri_accel == "bvh8.triangle4")       accels_add(device->bvh8_factory->BVH8Triangle4 (this));
+    else if (device->tri_accel == "bvh8.triangle4v")      accels_add(device->bvh8_factory->BVH8Triangle4v(this));
+    else if (device->tri_accel == "bvh8.triangle4i")      accels_add(device->bvh8_factory->BVH8Triangle4i(this));
+    else if (device->tri_accel == "qbvh8.triangle4i")     accels_add(device->bvh8_factory->BVH8QuantizedTriangle4i(this));
+    else if (device->tri_accel == "qbvh8.triangle4")      accels_add(device->bvh8_factory->BVH8QuantizedTriangle4(this));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown triangle acceleration structure "+device->tri_accel);
+#endif
+  }
+
+  void Scene::createTriangleMBAccel() /* adds the motion blur triangle accel selected by device->tri_accel_mb; no-op without EMBREE_GEOMETRY_TRIANGLE */
+  {
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    if (device->tri_accel_mb == "default")
+    {
+      int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); /* bit 1: compact accel, bit 0: robust accel */
+
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX2()) // BVH8 reduces performance on AVX only-machines
+      {
+        switch (mode) {
+        case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+      else
+#endif
+      {
+        switch (mode) { /* without AVX2 every mode uses BVH4Triangle4iMB; only the intersect variant differs */
+        case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+    }
+    else if (device->tri_accel_mb == "bvh4.triangle4imb") accels_add(device->bvh4_factory->BVH4Triangle4iMB(this));
+    else if (device->tri_accel_mb == "bvh4.triangle4vmb") accels_add(device->bvh4_factory->BVH4Triangle4vMB(this));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->tri_accel_mb == "bvh8.triangle4imb") accels_add(device->bvh8_factory->BVH8Triangle4iMB(this));
+    else if (device->tri_accel_mb == "bvh8.triangle4vmb") accels_add(device->bvh8_factory->BVH8Triangle4vMB(this));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur triangle acceleration structure "+device->tri_accel_mb);
+#endif
+  }
+
+  void Scene::createQuadAccel() /* adds the quad accel selected by device->quad_accel; no-op without EMBREE_GEOMETRY_QUAD */
+  {
+#if defined(EMBREE_GEOMETRY_QUAD)
+    if (device->quad_accel == "default")
+    {
+      if (quality_flags != RTC_BUILD_QUALITY_LOW)
+      {
+        /* static: any build quality other than RTC_BUILD_QUALITY_LOW */
+        int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); /* bit 1: compact accel, bit 0: robust accel */
+        switch (mode) {
+        case /*0b00*/ 0:
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+          {
+            if (quality_flags == RTC_BUILD_QUALITY_HIGH)
+              accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+            else
+              accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+          }
+          else
+#endif
+          {
+            if (quality_flags == RTC_BUILD_QUALITY_HIGH)
+              accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+            else
+              accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+          }
+          break;
+
+        case /*0b01*/ 1:
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+            accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+          else
+#endif
+            accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+          break;
+
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+      else /* dynamic: RTC_BUILD_QUALITY_LOW uses the DYNAMIC build variant */
+      {
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+	  {
+            int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); /* bit 1: compact accel, bit 0: robust accel */
+            switch (mode) {
+            case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+            case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+            case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            }
+          }
+          else
+#endif
+          {
+            int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); /* bit 1: compact accel, bit 0: robust accel */
+            switch (mode) {
+            case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+            case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+            case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            }
+          }
+      }
+    }
+    else if (device->quad_accel == "bvh4.quad4v")       accels_add(device->bvh4_factory->BVH4Quad4v(this));
+    else if (device->quad_accel == "bvh4.quad4i")       accels_add(device->bvh4_factory->BVH4Quad4i(this));
+    else if (device->quad_accel == "qbvh4.quad4i")      accels_add(device->bvh4_factory->BVH4QuantizedQuad4i(this));
+
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->quad_accel == "bvh8.quad4v")       accels_add(device->bvh8_factory->BVH8Quad4v(this));
+    else if (device->quad_accel == "bvh8.quad4i")       accels_add(device->bvh8_factory->BVH8Quad4i(this));
+    else if (device->quad_accel == "qbvh8.quad4i")      accels_add(device->bvh8_factory->BVH8QuantizedQuad4i(this));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad acceleration structure "+device->quad_accel);
+#endif
+  }
+
+  void Scene::createQuadMBAccel() /* adds the motion blur quad accel selected by device->quad_accel_mb; no-op without EMBREE_GEOMETRY_QUAD */
+  {
+#if defined(EMBREE_GEOMETRY_QUAD)
+    if (device->quad_accel_mb == "default")
+    {
+      int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); /* bit 1: compact accel, bit 0: robust accel */
+      switch (mode) {
+      case /*0b00*/ 0:
+#if defined (EMBREE_TARGET_SIMD8)
+        if (device->canUseAVX())
+          accels_add(device->bvh8_factory->BVH8Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+        else
+#endif
+          accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+        break;
+
+      case /*0b01*/ 1:
+#if defined (EMBREE_TARGET_SIMD8)
+        if (device->canUseAVX())
+          accels_add(device->bvh8_factory->BVH8Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+        else
+#endif
+          accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+        break;
+
+      case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+      case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+      }
+    }
+    else if (device->quad_accel_mb == "bvh4.quad4imb") accels_add(device->bvh4_factory->BVH4Quad4iMB(this));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->quad_accel_mb == "bvh8.quad4imb") accels_add(device->bvh8_factory->BVH8Quad4iMB(this));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad motion blur acceleration structure "+device->quad_accel_mb);
+#endif
+  }
+
+  void Scene::createHairAccel() /* adds the accel selected by device->hair_accel; compiled in when curve or point geometry is enabled */
+  {
+#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT)
+    if (device->hair_accel == "default")
+    {
+      int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); /* bit 1: compact accel, bit 0: robust accel */
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX2()) // only enable on HSW machines, for SNB this codepath is slower
+      {
+        switch (mode) {
+        case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::FAST)); break;
+        case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::ROBUST)); break;
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::FAST)); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+      else
+#endif
+      {
+        switch (mode) {
+        case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::FAST)); break;
+        case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::ROBUST)); break;
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::FAST)); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+    }
+    else if (device->hair_accel == "bvh4obb.virtualcurve4v" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::FAST));
+    else if (device->hair_accel == "bvh4obb.virtualcurve4i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::FAST));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->hair_accel == "bvh8obb.virtualcurve8v" ) accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::FAST));
+    else if (device->hair_accel == "bvh4obb.virtualcurve8i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::FAST));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown hair acceleration structure "+device->hair_accel);
+#endif
+  }
+
+  void Scene::createHairMBAccel() /* adds the motion blur accel selected by device->hair_accel_mb; compiled in when curve or point geometry is enabled */
+  {
+#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT)
+    if (device->hair_accel_mb == "default")
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX2()) // only enable on HSW machines, on SNB this codepath is slower
+      {
+        if (isRobustAccel()) accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::ROBUST));
+        else                 accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST));
+      }
+      else
+#endif
+      {
+        if (isRobustAccel()) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::ROBUST));
+        else                 accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::FAST));
+      }
+    }
+    else if (device->hair_accel_mb == "bvh4.virtualcurve4imb") accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::FAST));
+
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->hair_accel_mb == "bvh4.virtualcurve8imb") accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST));
+    else if (device->hair_accel_mb == "bvh8.virtualcurve8imb") accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur hair acceleration structure "+device->hair_accel_mb);
+#endif
+  }
+
+  void Scene::createSubdivAccel()
+  {
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+    /* every accepted name maps to the same BVH4 subdivision patch accel */
+    const bool known = device->subdiv_accel == "default"
+                    || device->subdiv_accel == "bvh4.grid.eager"
+                    || device->subdiv_accel == "bvh4.subdivpatch1eager";
+    if (!known) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv accel "+device->subdiv_accel);
+    accels_add(device->bvh4_factory->BVH4SubdivPatch1(this));
+#endif
+  }
+
+  void Scene::createSubdivMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+    /* "default" is the only accepted motion blur subdivision accel name */
+    if (device->subdiv_accel_mb != "default")
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv mblur accel "+device->subdiv_accel_mb);
+    accels_add(device->bvh4_factory->BVH4SubdivPatch1MB(this));
+#endif
+  }
+
+  void Scene::createUserGeometryAccel()
+  {
+#if defined(EMBREE_GEOMETRY_USER)
+    if (device->object_accel == "default")
+    {
+      /* low build quality selects the dynamic builder, anything else the static one */
+      const BVHFactory::BuildVariant variant =
+        (quality_flags == RTC_BUILD_QUALITY_LOW) ? BVHFactory::BuildVariant::DYNAMIC
+                                                 : BVHFactory::BuildVariant::STATIC;
+
+#if defined (EMBREE_TARGET_SIMD8)
+      /* 8-wide BVH only with AVX and when no compact accel is requested */
+      if (device->canUseAVX() && !isCompactAccel())
+      {
+        accels_add(device->bvh8_factory->BVH8UserGeometry(this,variant));
+      }
+      else
+#endif
+      {
+        accels_add(device->bvh4_factory->BVH4UserGeometry(this,variant));
+      }
+    }
+    /* explicitly named accels are created without passing a build variant */
+    else if (device->object_accel == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometry(this));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->object_accel == "bvh8.object") accels_add(device->bvh8_factory->BVH8UserGeometry(this));
+#endif
+    /* any other name is rejected */
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry accel "+device->object_accel);
+#endif
+  }
+
+  void Scene::createUserGeometryMBAccel() /* adds the motion blur user geometry accel selected by device->object_accel_mb; no-op without EMBREE_GEOMETRY_USER */
+  {
+#if defined(EMBREE_GEOMETRY_USER)
+    if (device->object_accel_mb == "default"    ) {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel()) /* 8-wide BVH only with AVX and when no compact accel is requested */
+        accels_add(device->bvh8_factory->BVH8UserGeometryMB(this));
+      else
+#endif
+        accels_add(device->bvh4_factory->BVH4UserGeometryMB(this));
+    }
+    else if (device->object_accel_mb == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometryMB(this));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->object_accel_mb == "bvh8.object") accels_add(device->bvh8_factory->BVH8UserGeometryMB(this));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry mblur accel "+device->object_accel_mb);
+#endif
+  }
+
+  void Scene::createInstanceAccel()
+  {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    // if (device->object_accel == "default")
+    {
+      /* low build quality selects the dynamic builder, anything else the static one */
+      const BVHFactory::BuildVariant variant =
+        (quality_flags == RTC_BUILD_QUALITY_LOW) ? BVHFactory::BuildVariant::DYNAMIC
+                                                 : BVHFactory::BuildVariant::STATIC;
+
+      /* the second factory argument is false here; createInstanceExpensiveAccel passes true */
+#if defined (EMBREE_TARGET_SIMD8)
+      /* 8-wide BVH only with AVX and when no compact accel is requested */
+      if (device->canUseAVX() && !isCompactAccel())
+      {
+        accels_add(device->bvh8_factory->BVH8Instance(this, false, variant));
+      }
+      else
+#endif
+      {
+        accels_add(device->bvh4_factory->BVH4Instance(this, false, variant));
+      }
+    }
+    // else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance accel "+device->instance_accel);
+#endif
+  }
+
+  void Scene::createInstanceMBAccel() /* adds the motion blur instance accel; no-op without EMBREE_GEOMETRY_INSTANCE */
+  {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    //if (device->instance_accel_mb == "default")
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel()) /* 8-wide BVH only with AVX and when no compact accel is requested */
+        accels_add(device->bvh8_factory->BVH8InstanceMB(this, false));
+      else
+#endif
+        accels_add(device->bvh4_factory->BVH4InstanceMB(this, false));
+    }
+    //else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance mblur accel "+device->instance_accel_mb);
+#endif
+  }
+
+  void Scene::createInstanceExpensiveAccel()
+  {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    // if (device->object_accel == "default")
+    {
+      /* low build quality selects the dynamic builder, anything else the static one */
+      const BVHFactory::BuildVariant variant =
+        (quality_flags == RTC_BUILD_QUALITY_LOW) ? BVHFactory::BuildVariant::DYNAMIC
+                                                 : BVHFactory::BuildVariant::STATIC;
+
+      /* the second factory argument is true here; createInstanceAccel passes false */
+#if defined (EMBREE_TARGET_SIMD8)
+      /* 8-wide BVH only with AVX and when no compact accel is requested */
+      if (device->canUseAVX() && !isCompactAccel())
+      {
+        accels_add(device->bvh8_factory->BVH8Instance(this, true, variant));
+      }
+      else
+#endif
+      {
+        accels_add(device->bvh4_factory->BVH4Instance(this, true, variant));
+      }
+    }
+    // else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance accel "+device->instance_accel);
+#endif
+  }
+
+  void Scene::createInstanceExpensiveMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    //if (device->instance_accel_mb == "default")
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel())
+        accels_add(device->bvh8_factory->BVH8InstanceMB(this, true));
+      else
+#endif
+        accels_add(device->bvh4_factory->BVH4InstanceMB(this, true));
+    }
+    //else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance mblur accel "+device->instance_accel_mb);
+#endif
+  }
+
+  /*! Registers the non motion-blur grid accel selected by device->grid_accel. Everything, including ivariant, sits inside the EMBREE_GEOMETRY_GRID guard so grid-less builds get an empty body and no unused local. */
+  void Scene::createGridAccel()
+  {
+#if defined(EMBREE_GEOMETRY_GRID)
+    const BVHFactory::IntersectVariant ivariant = isRobustAccel() ? BVHFactory::IntersectVariant::ROBUST : BVHFactory::IntersectVariant::FAST;
+    if (device->grid_accel == "default")
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel())
+      {
+        accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant));
+      }
+      else
+#endif
+      {
+        accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant));
+      }
+    }
+    else if (device->grid_accel == "bvh4.grid") accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->grid_accel == "bvh8.grid") accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid accel "+device->grid_accel);
+#endif
+  }
+
+  /*! Registers the motion-blur grid accel selected by device->grid_accel_mb (empty body without EMBREE_GEOMETRY_GRID). */
+  void Scene::createGridMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_GRID)
+    if (device->grid_accel_mb == "default")
+      accels_add(device->bvh4_factory->BVH4GridMB(this,BVHFactory::BuildVariant::STATIC));
+    else if (device->grid_accel_mb == "bvh4mb.grid")
+      accels_add(device->bvh4_factory->BVH4GridMB(this));
+    else // report the option that was actually tested (grid_accel_mb, not grid_accel)
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid mb accel "+device->grid_accel_mb);
+#endif
+  }
+
+  void Scene::clear() {
+  }
+
+  unsigned Scene::bind(unsigned geomID, Ref<Geometry> geometry)
+  {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(geometriesMutex);
+#else
+    Lock<SpinLock> lock(geometriesMutex);
+#endif
+    if (geomID == RTC_INVALID_GEOMETRY_ID) {
+      geomID = id_pool.allocate();
+      if (geomID == RTC_INVALID_GEOMETRY_ID)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"too many geometries inside scene");
+    }
+    else
+    {
+      if (!id_pool.add(geomID))
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID provided");
+    }
+    if (geomID >= geometries.size()) {
+      geometries.resize(geomID+1);
+      vertices.resize(geomID+1);
+      geometryModCounters_.resize(geomID+1);
+    }
+    geometries[geomID] = geometry;
+    geometryModCounters_[geomID] = 0;
+    if (geometry->isEnabled()) {
+      setModified ();
+    }
+    return geomID;
+  }
+
+  void Scene::detachGeometry(size_t geomID)
+  {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(geometriesMutex);
+#else
+    Lock<SpinLock> lock(geometriesMutex);
+#endif
+
+    if (geomID >= geometries.size())
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID");
+
+    Ref<Geometry>& geometry = geometries[geomID];
+    if (geometry == null)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry");
+
+    if (geometry->isEnabled()) {
+      setModified ();
+    }
+    accels_deleteGeometry(unsigned(geomID));
+    id_pool.deallocate((unsigned)geomID);
+    geometries[geomID] = null;
+    vertices[geomID] = nullptr;
+    geometryModCounters_[geomID] = 0;
+  }
+
+  void Scene::updateInterface()
+  {
+    is_build = true;
+  }
+
+  void Scene::commit_task ()
+  {
+    checkIfModifiedAndSet ();
+    if (!isModified()) {
+      return;
+    }
+
+    /* print scene statistics */
+    if (device->verbosity(2))
+      printStatistics();
+
+    progress_monitor_counter = 0;
+
+    /* gather scene stats and call preCommit function of each geometry */
+    this->world = parallel_reduce (size_t(0), geometries.size(), GeometryCounts (),
+      [this](const range<size_t>& r)->GeometryCounts
+      {
+        GeometryCounts c;
+        for (auto i=r.begin(); i<r.end(); ++i)
+        {
+          if (geometries[i] && geometries[i]->isEnabled())
+          {
+            geometries[i]->preCommit();
+            geometries[i]->addElementsToCount (c);
+            c.numFilterFunctions += (int) geometries[i]->hasFilterFunctions();
+          }
+        }
+        return c;
+      },
+      std::plus<GeometryCounts>()
+    );
+
+    /* select acceleration structures to build */
+    unsigned int new_enabled_geometry_types = world.enabledGeometryTypesMask();
+    if (flags_modified || new_enabled_geometry_types != enabled_geometry_types)
+    {
+      accels_init();
+
+      /* mark every geometry as modified by resetting its counter; otherwise the two-level
+        builder will not rebuild geometries that are currently unmodified */
+      parallel_for(geometryModCounters_.size(), [&] ( const size_t i ) {
+          geometryModCounters_[i] = 0;
+        });
+
+      if (getNumPrimitives(TriangleMesh::geom_type,false)) createTriangleAccel();
+      if (getNumPrimitives(TriangleMesh::geom_type,true)) createTriangleMBAccel();
+      if (getNumPrimitives(QuadMesh::geom_type,false)) createQuadAccel();
+      if (getNumPrimitives(QuadMesh::geom_type,true)) createQuadMBAccel();
+      if (getNumPrimitives(GridMesh::geom_type,false)) createGridAccel();
+      if (getNumPrimitives(GridMesh::geom_type,true)) createGridMBAccel();
+      if (getNumPrimitives(SubdivMesh::geom_type,false)) createSubdivAccel();
+      if (getNumPrimitives(SubdivMesh::geom_type,true)) createSubdivMBAccel();
+      if (getNumPrimitives(Geometry::MTY_CURVES,false)) createHairAccel();
+      if (getNumPrimitives(Geometry::MTY_CURVES,true)) createHairMBAccel();
+      if (getNumPrimitives(UserGeometry::geom_type,false)) createUserGeometryAccel();
+      if (getNumPrimitives(UserGeometry::geom_type,true)) createUserGeometryMBAccel();
+      if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,false)) createInstanceAccel();
+      if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,true)) createInstanceMBAccel();
+      if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,false)) createInstanceExpensiveAccel();
+      if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,true)) createInstanceExpensiveMBAccel();
+
+      flags_modified = false;
+      enabled_geometry_types = new_enabled_geometry_types;
+    }
+
+    /* select fast code path if no filter function is present */
+    accels_select(hasFilterFunction());
+
+    /* build all hierarchies of this scene */
+    accels_build();
+
+    /* make static geometry immutable */
+    if (!isDynamicAccel()) {
+      accels_immutable();
+      flags_modified = true; // in non-dynamic mode we have to re-create accels
+    }
+
+    /* call postCommit function of each geometry */
+    parallel_for(geometries.size(), [&] ( const size_t i ) {
+        if (geometries[i] && geometries[i]->isEnabled()) {
+          geometries[i]->postCommit();
+          vertices[i] = geometries[i]->getCompactVertexArray();
+          geometryModCounters_[i] = geometries[i]->getModCounter();
+        }
+      });
+
+    updateInterface();
+
+    if (device->verbosity(2)) {
+      std::cout << "created scene intersector" << std::endl;
+      accels_print(2);
+      std::cout << "selected scene intersector" << std::endl;
+      intersectors.print(2);
+    }
+
+    setModified(false);
+  }
+
+  void Scene::setBuildQuality(RTCBuildQuality quality_flags_i)
+  {
+    if (quality_flags == quality_flags_i) return;
+    quality_flags = quality_flags_i;
+    flags_modified = true;
+  }
+
+  RTCBuildQuality Scene::getBuildQuality() const {
+    return quality_flags;
+  }
+
+  void Scene::setSceneFlags(RTCSceneFlags scene_flags_i)
+  {
+    if (scene_flags == scene_flags_i) return;
+    scene_flags = scene_flags_i;
+    flags_modified = true;
+  }
+
+  RTCSceneFlags Scene::getSceneFlags() const {
+    return scene_flags;
+  }
+
+#if defined(TASKING_INTERNAL)
+
+  void Scene::commit (bool join)
+  {
+    Lock<MutexSys> buildLock(buildMutex,false);
+
+    /* allocates own taskscheduler for each build */
+    Ref<TaskScheduler> scheduler = nullptr;
+    {
+      Lock<MutexSys> lock(schedulerMutex);
+      scheduler = this->scheduler;
+      if (scheduler == null) {
+        buildLock.lock();
+        this->scheduler = scheduler = new TaskScheduler;
+      }
+    }
+
+    /* worker threads join build */
+    if (!buildLock.isLocked())
+    {
+      if (!join)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"use rtcJoinCommitScene to join a build operation");
+
+      scheduler->join();
+      return;
+    }
+
+    /* initiate build */
+    // -- GODOT start --
+    // try {
+      scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join);
+    // }
+    // catch (...) {
+    //   accels_clear();
+    //   updateInterface();
+    //   Lock<MutexSys> lock(schedulerMutex);
+    //   this->scheduler = nullptr;
+    //   throw;
+    // }
+    // -- GODOT end --
+  }
+
+#endif
+
+#if defined(TASKING_TBB) || defined(TASKING_GCD)
+
+  /*! Commit for the TBB/GCD tasking paths: the thread that wins buildMutex runs commit_task(); any other caller waits for that build to finish (or gets an error where joining is unsupported). */
+  void Scene::commit (bool join)
+  {
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8)
+    if (join)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with this TBB version");
+#endif
+
+    /* try to obtain build lock */
+    Lock<MutexSys> lock(buildMutex,buildMutex.try_lock());
+
+    /* join hierarchy build */
+    if (!lock.isLocked())
+    {
+#if !TASKING_TBB_USE_TASK_ISOLATION
+      if (!join)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invoking rtcCommitScene from multiple threads is not supported with this TBB version");
+#endif
+
+      do {
+#if defined(TASKING_GCD)
+      // Do Nothing
+#else
+#if USE_TASK_ARENA
+        if (join) {
+          device->arena->execute([&]{ group.wait(); });
+        }
+        else
+#endif
+        {
+          group.wait();
+        }
+#endif
+
+        pause_cpu();
+        yield();
+
+      } while (!buildMutex.try_lock());
+
+      buildMutex.unlock();
+      return;
+    }
+
+    /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */
+    const unsigned int mxcsr = _mm_getcsr();
+    _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6));
+
+    try {
+#if defined(TASKING_TBB)
+#if TBB_INTERFACE_VERSION_MAJOR < 8
+      tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits);
+#else
+      tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits | tbb::task_group_context::fp_settings );
+#endif
+      //ctx.set_priority(tbb::priority_high);
+
+#if USE_TASK_ARENA
+      if (join)
+      {
+        device->arena->execute([&]{
+            group.run([&]{
+                tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx);
+              });
+            group.wait();
+          });
+      }
+      else
+#endif
+      {
+        group.run([&]{
+            tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx);
+          });
+        group.wait();
+      }
+
+#elif defined(TASKING_GCD)
+
+      commit_task();
+
+#endif  // #if defined(TASKING_TBB)
+
+      /* reset MXCSR register again; done after the #endif so the GCD path restores it as well */
+      _mm_setcsr(mxcsr);
+
+    }
+    catch (...)
+    {
+      /* reset MXCSR register again */
+      _mm_setcsr(mxcsr);
+
+      accels_clear();
+      updateInterface();
+      throw;
+    }
+  }
+#endif
+
+#if defined(TASKING_PPL)
+
+  void Scene::commit (bool join)
+  {
+#if defined(TASKING_PPL)
+    if (join)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with PPL");
+#endif
+
+    /* acquire the build lock (plain Lock construction; no try_lock on this path) */
+    Lock<MutexSys> lock(buildMutex);
+
+    checkIfModifiedAndSet ();
+    if (!isModified()) {
+      return;
+    }
+
+    /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */
+    const unsigned int mxcsr = _mm_getcsr();
+    _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6));
+
+    try {
+
+      group.run([&]{
+          concurrency::parallel_for(size_t(0), size_t(1), size_t(1), [&](size_t) { commit_task(); });
+        });
+      group.wait();
+
+       /* reset MXCSR register again */
+      _mm_setcsr(mxcsr);
+    }
+    catch (...)
+    {
+      /* reset MXCSR register again */
+      _mm_setcsr(mxcsr);
+
+      accels_clear();
+      updateInterface();
+      throw;
+    }
+  }
+#endif
+
+  void Scene::setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr)
+  {
+    progress_monitor_function = func;
+    progress_monitor_ptr      = ptr;
+  }
+
+  void Scene::progressMonitor(double dn)
+  {
+    if (progress_monitor_function) {
+      size_t n = size_t(dn) + progress_monitor_counter.fetch_add(size_t(dn));
+      if (!progress_monitor_function(progress_monitor_ptr, n / (double(numPrimitives())))) {
+        throw_RTCError(RTC_ERROR_CANCELLED,"progress monitor forced termination");
+      }
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene.h b/thirdparty/embree-aarch64/kernels/common/scene.h
new file mode 100644
index 0000000000..b41c6cde91
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene.h
@@ -0,0 +1,390 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+ 
+#pragma once
+
+#include "default.h"
+#include "device.h"
+#include "builder.h"
+#include "../../common/algorithms/parallel_any_of.h"
+#include "scene_triangle_mesh.h"
+#include "scene_quad_mesh.h"
+#include "scene_user_geometry.h"
+#include "scene_instance.h"
+#include "scene_curves.h"
+#include "scene_line_segments.h"
+#include "scene_subdiv_mesh.h"
+#include "scene_grid_mesh.h"
+#include "scene_points.h"
+#include "../subdiv/tessellation_cache.h"
+
+#include "acceln.h"
+#include "geometry.h"
+
+namespace embree
+{
+  /*! Base class all scenes are derived from */
+  class Scene : public AccelN
+  {
+    ALIGNED_CLASS_(std::alignment_of<Scene>::value);
+
+  public:
+    template<typename Ty, bool mblur = false>
+      class Iterator
+      {
+      public:
+      Iterator ()  {}
+      
+      Iterator (Scene* scene, bool all = false) 
+      : scene(scene), all(all) {}
+      
+      __forceinline Ty* at(const size_t i)
+      {
+        Geometry* geom = scene->geometries[i].ptr;
+        if (geom == nullptr) return nullptr;
+        if (!all && !geom->isEnabled()) return nullptr;
+        const size_t mask = geom->getTypeMask() & Ty::geom_type; 
+        if (!(mask)) return nullptr;
+        if ((geom->numTimeSteps != 1) != mblur) return nullptr;
+        return (Ty*) geom;
+      }
+
+      __forceinline Ty* operator[] (const size_t i) {
+        return at(i);
+      }
+
+      __forceinline size_t size() const {
+        return scene->size();
+      }
+      
+      __forceinline size_t numPrimitives() const {
+        return scene->getNumPrimitives(Ty::geom_type,mblur);
+      }
+
+      __forceinline size_t maxPrimitivesPerGeometry() 
+      {
+        size_t ret = 0;
+        for (size_t i=0; i<scene->size(); i++) {
+          Ty* mesh = at(i);
+          if (mesh == nullptr) continue;
+          ret = max(ret,mesh->size());
+        }
+        return ret;
+      }
+
+      __forceinline unsigned int maxGeomID() 
+      {
+        unsigned int ret = 0;
+        for (size_t i=0; i<scene->size(); i++) {
+          Ty* mesh = at(i);
+          if (mesh == nullptr) continue;
+          ret = max(ret,(unsigned int)i);
+        }
+        return ret;
+      }
+
+      __forceinline unsigned maxTimeStepsPerGeometry()
+      {
+        unsigned ret = 0;
+        for (size_t i=0; i<scene->size(); i++) {
+          Ty* mesh = at(i);
+          if (mesh == nullptr) continue;
+          ret = max(ret,mesh->numTimeSteps);
+        }
+        return ret;
+      }
+      
+    private:
+      Scene* scene;
+      bool all;
+      };
+
+      class Iterator2
+      {
+      public:
+      Iterator2 () {}
+      
+      Iterator2 (Scene* scene, Geometry::GTypeMask typemask, bool mblur) 
+      : scene(scene), typemask(typemask), mblur(mblur) {}
+      
+      __forceinline Geometry* at(const size_t i)
+      {
+        Geometry* geom = scene->geometries[i].ptr;
+        if (geom == nullptr) return nullptr;
+        if (!geom->isEnabled()) return nullptr;
+        if (!(geom->getTypeMask() & typemask)) return nullptr;
+        if ((geom->numTimeSteps != 1) != mblur) return nullptr;
+        return geom;
+      }
+
+      __forceinline Geometry* operator[] (const size_t i) {
+        return at(i);
+      }
+
+      __forceinline size_t size() const {
+        return scene->size();
+      }
+      
+    private:
+      Scene* scene;
+      Geometry::GTypeMask typemask;
+      bool mblur;
+    };
+
+  public:
+    
+    /*! Scene construction */
+    Scene (Device* device);
+
+    /*! Scene destruction */
+    ~Scene () noexcept;
+
+  private:
+    /*! class is non-copyable */
+    Scene (const Scene& other) DELETED; // do not implement
+    Scene& operator= (const Scene& other) DELETED; // do not implement
+
+  public:
+    void createTriangleAccel();
+    void createTriangleMBAccel();
+    void createQuadAccel();
+    void createQuadMBAccel();
+    void createHairAccel();
+    void createHairMBAccel();
+    void createSubdivAccel();
+    void createSubdivMBAccel();
+    void createUserGeometryAccel();
+    void createUserGeometryMBAccel();
+    void createInstanceAccel();
+    void createInstanceMBAccel();
+    void createInstanceExpensiveAccel();
+    void createInstanceExpensiveMBAccel();
+    void createGridAccel();
+    void createGridMBAccel();
+
+    /*! prints statistics about the scene */
+    void printStatistics();
+
+    /*! clears the scene */
+    void clear();
+
+    /*! detaches some geometry */
+    void detachGeometry(size_t geomID);
+
+    void setBuildQuality(RTCBuildQuality quality_flags);
+    RTCBuildQuality getBuildQuality() const;
+    
+    void setSceneFlags(RTCSceneFlags scene_flags);
+    RTCSceneFlags getSceneFlags() const;
+    
+    void commit (bool join);
+    void commit_task ();
+    void build () {}
+
+    void updateInterface();
+
+    /* return number of geometries */
+    __forceinline size_t size() const { return geometries.size(); }
+    
+    /* bind geometry to the scene */
+    unsigned int bind (unsigned geomID, Ref<Geometry> geometry);
+    
+    /* determines if scene is modified */
+    __forceinline bool isModified() const { return modified; }
+
+    /* sets modified flag */
+    __forceinline void setModified(bool f = true) { 
+      modified = f; 
+    }
+
+    __forceinline bool isGeometryModified(size_t geomID)
+    {
+      Ref<Geometry>& g = geometries[geomID];
+      if (!g) return false;
+      return g->getModCounter() > geometryModCounters_[geomID];
+    }
+
+  protected:
+    
+    __forceinline void checkIfModifiedAndSet () 
+    {
+      if (isModified ()) return;
+      
+      auto geometryIsModified = [this](size_t geomID)->bool {
+        return isGeometryModified(geomID);
+      };
+
+      if (parallel_any_of (size_t(0), geometries.size (), geometryIsModified)) {
+        setModified ();
+      }
+    }
+    
+  public:
+
+    /* get mesh by ID */
+    __forceinline       Geometry* get(size_t i)       { assert(i < geometries.size()); return geometries[i].ptr; }
+    __forceinline const Geometry* get(size_t i) const { assert(i < geometries.size()); return geometries[i].ptr; }
+
+    template<typename Mesh>
+      __forceinline       Mesh* get(size_t i)       { 
+      assert(i < geometries.size()); 
+      assert(geometries[i]->getTypeMask() & Mesh::geom_type);
+      return (Mesh*)geometries[i].ptr; 
+    }
+    template<typename Mesh>
+      __forceinline const Mesh* get(size_t i) const { 
+      assert(i < geometries.size()); 
+      assert(geometries[i]->getTypeMask() & Mesh::geom_type);
+      return (Mesh*)geometries[i].ptr; 
+    }
+
+    template<typename Mesh>
+    __forceinline Mesh* getSafe(size_t i) {
+      assert(i < geometries.size());
+      if (geometries[i] == null) return nullptr;
+      if (!(geometries[i]->getTypeMask() & Mesh::geom_type)) return nullptr;
+      else return (Mesh*) geometries[i].ptr;
+    }
+
+    __forceinline Ref<Geometry> get_locked(size_t i)  {
+      Lock<SpinLock> lock(geometriesMutex);
+      assert(i < geometries.size()); 
+      return geometries[i]; 
+    }
+
+    /* flag decoding */
+    __forceinline bool isFastAccel() const { return !isCompactAccel() && !isRobustAccel(); }
+    __forceinline bool isCompactAccel() const { return scene_flags & RTC_SCENE_FLAG_COMPACT; }
+    __forceinline bool isRobustAccel()  const { return scene_flags & RTC_SCENE_FLAG_ROBUST; }
+    __forceinline bool isStaticAccel()  const { return !(scene_flags & RTC_SCENE_FLAG_DYNAMIC); }
+    __forceinline bool isDynamicAccel() const { return scene_flags & RTC_SCENE_FLAG_DYNAMIC; }
+    
+    __forceinline bool hasContextFilterFunction() const {
+      return scene_flags & RTC_SCENE_FLAG_CONTEXT_FILTER_FUNCTION;
+    }
+    
+    __forceinline bool hasGeometryFilterFunction() {
+      return world.numFilterFunctions != 0;
+    }
+      
+    __forceinline bool hasFilterFunction() {
+      return hasContextFilterFunction() || hasGeometryFilterFunction();
+    }
+    
+    /* test if the scene has already been built */
+    __forceinline bool isBuild() const { return is_build; }
+
+  public:
+    IDPool<unsigned,0xFFFFFFFE> id_pool;
+    vector<Ref<Geometry>> geometries; //!< list of all user geometries
+    vector<unsigned int> geometryModCounters_;
+    vector<float*> vertices;
+    
+  public:
+    Device* device;
+
+    /* these are to detect if we need to recreate the acceleration structures */
+    bool flags_modified;
+    unsigned int enabled_geometry_types;
+    
+    RTCSceneFlags scene_flags;
+    RTCBuildQuality quality_flags;
+    MutexSys buildMutex;
+    SpinLock geometriesMutex;
+    bool is_build;
+  private:
+    bool modified;                   //!< true if scene got modified
+
+  public:
+    
+    /*! global lock step task scheduler */
+#if defined(TASKING_INTERNAL) 
+    MutexSys schedulerMutex;
+    Ref<TaskScheduler> scheduler;
+#elif defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION
+    tbb::isolated_task_group group;
+#elif defined(TASKING_TBB)
+    tbb::task_group group;
+#elif defined(TASKING_PPL)
+    concurrency::task_group group;
+#endif
+    
+  public:
+    struct BuildProgressMonitorInterface : public BuildProgressMonitor {
+      BuildProgressMonitorInterface(Scene* scene) 
+      : scene(scene) {}
+      void operator() (size_t dn) const { scene->progressMonitor(double(dn)); }
+    private:
+      Scene* scene;
+    };
+    BuildProgressMonitorInterface progressInterface;
+    RTCProgressMonitorFunction progress_monitor_function;
+    void* progress_monitor_ptr;
+    std::atomic<size_t> progress_monitor_counter;
+    void progressMonitor(double nprims);
+    void setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr);
+
+  private:
+    GeometryCounts world;               //!< counts for geometry
+
+  public:
+
+    __forceinline size_t numPrimitives() const {
+      return world.size();
+    }
+
+    __forceinline size_t getNumPrimitives(Geometry::GTypeMask mask, bool mblur) const
+    {
+      size_t count = 0;
+      
+      if (mask & Geometry::MTY_TRIANGLE_MESH)
+        count += mblur ? world.numMBTriangles : world.numTriangles;
+      
+      if (mask & Geometry::MTY_QUAD_MESH)
+        count += mblur ? world.numMBQuads : world.numQuads;
+      
+      if (mask & Geometry::MTY_CURVE2)
+        count += mblur ? world.numMBLineSegments : world.numLineSegments;
+      
+      if (mask & Geometry::MTY_CURVE4)
+        count += mblur ? world.numMBBezierCurves : world.numBezierCurves;
+      
+      if (mask & Geometry::MTY_POINTS)
+        count += mblur ? world.numMBPoints : world.numPoints;
+      
+      if (mask & Geometry::MTY_SUBDIV_MESH)
+        count += mblur ? world.numMBSubdivPatches : world.numSubdivPatches;
+      
+      if (mask & Geometry::MTY_USER_GEOMETRY)
+        count += mblur ? world.numMBUserGeometries : world.numUserGeometries;
+      
+      if (mask & Geometry::MTY_INSTANCE_CHEAP)
+        count += mblur ? world.numMBInstancesCheap : world.numInstancesCheap;
+      
+      if (mask & Geometry::MTY_INSTANCE_EXPENSIVE)
+        count += mblur  ? world.numMBInstancesExpensive : world.numInstancesExpensive;
+      
+      if (mask & Geometry::MTY_GRID_MESH)
+        count += mblur  ? world.numMBGrids : world.numGrids;
+      
+      return count;
+    }
+    
+    template<typename Mesh, bool mblur>
+    __forceinline unsigned getNumTimeSteps()
+    {
+      if (!mblur)
+        return 1;
+
+      Scene::Iterator<Mesh,mblur> iter(this);
+      return iter.maxTimeStepsPerGeometry();
+    }
+
+    template<typename Mesh, bool mblur>
+    __forceinline unsigned int getMaxGeomID()
+    {
+      Scene::Iterator<Mesh,mblur> iter(this);
+      return iter.maxGeomID();
+    }
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_curves.h b/thirdparty/embree-aarch64/kernels/common/scene_curves.h
new file mode 100644
index 0000000000..2649ab0e3e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_curves.h
@@ -0,0 +1,341 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+  /*! represents an array of bicubic bezier curves */
+  struct CurveGeometry : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE4;
+
+  public:
+    
+    /*! bezier curve construction */
+    CurveGeometry (Device* device, Geometry::GType gtype);
+    
+  public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void setTessellationRate(float N);
+    void setMaxRadiusScale(float s);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+  public:
+    
+    /*! returns the number of vertices */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+
+    /*! returns the i'th curve */
+    __forceinline const unsigned int& curve(size_t i) const {
+      return curves[i];
+    }
+
+    /*! returns i'th vertex of the first time step */
+    __forceinline Vec3ff vertex(size_t i) const {
+      return vertices0[i];
+    }
+
+    /*! returns i'th normal of the first time step */
+    __forceinline Vec3fa normal(size_t i) const {
+      return normals0[i];
+    }
+
+    /*! returns i'th tangent of the first time step */
+    __forceinline Vec3ff tangent(size_t i) const {
+      return tangents0[i];
+    }
+
+    /*! returns i'th normal derivative of the first time step */
+    __forceinline Vec3fa dnormal(size_t i) const {
+      return dnormals0[i];
+    }
+
+    /*! returns i'th radius of the first time step */
+    __forceinline float radius(size_t i) const {
+      return vertices0[i].w;
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline Vec3ff vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns i'th normal of itime'th timestep */
+    __forceinline Vec3fa normal(size_t i, size_t itime) const {
+      return normals[itime][i];
+    }
+
+    /*! returns i'th tangent of itime'th timestep */
+    __forceinline Vec3ff tangent(size_t i, size_t itime) const {
+      return tangents[itime][i];
+    }
+
+    /*! returns i'th normal derivative of itime'th timestep */
+    __forceinline Vec3fa dnormal(size_t i, size_t itime) const {
+      return dnormals[itime][i];
+    }
+
+    /*! returns i'th radius of itime'th timestep */
+    __forceinline float radius(size_t i, size_t itime) const {
+      return vertices[itime][i].w;
+    }
+
+    /*! gathers the curve starting with i'th vertex */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i) const
+    {
+      p0 = vertex(i+0);
+      p1 = vertex(i+1);
+      p2 = vertex(i+2);
+      p3 = vertex(i+3);
+    }
+
+    /*! gathers the curve starting with i'th vertex of itime'th timestep */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, size_t itime) const
+    {
+      p0 = vertex(i+0,itime);
+      p1 = vertex(i+1,itime);
+      p2 = vertex(i+2,itime);
+      p3 = vertex(i+3,itime);
+    }
+
+    /*! gathers the curve starting with i'th vertex */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i) const
+    {
+      p0 = vertex(i+0);
+      p1 = vertex(i+1);
+      p2 = vertex(i+2);
+      p3 = vertex(i+3);
+      n0 = normal(i+0);
+      n1 = normal(i+1);
+      n2 = normal(i+2);
+      n3 = normal(i+3);
+    }
+
+    /*! gathers the curve starting with i'th vertex of itime'th timestep */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, size_t itime) const
+    {
+      p0 = vertex(i+0,itime);
+      p1 = vertex(i+1,itime);
+      p2 = vertex(i+2,itime);
+      p3 = vertex(i+3,itime);
+      n0 = normal(i+0,itime);
+      n1 = normal(i+1,itime);
+      n2 = normal(i+2,itime);
+      n3 = normal(i+3,itime);
+    }
+
+    /*! prefetches (prefetchL1) the curve starting with i'th vertex of the first time step */
+    __forceinline void prefetchL1_vertices(size_t i) const
+    {
+      prefetchL1(vertices0.getPtr(i)+0);
+      prefetchL1(vertices0.getPtr(i)+64);
+    }
+
+    /*! prefetches (prefetchL2) the curve starting with i'th vertex of the first time step */
+    __forceinline void prefetchL2_vertices(size_t i) const
+    {
+      prefetchL2(vertices0.getPtr(i)+0);
+      prefetchL2(vertices0.getPtr(i)+64);
+    }  
+
+    /*! loads curve vertices for specified time */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, float time) const
+    {
+      float ftime;
+      const size_t itime = timeSegment(time, ftime);
+
+      const float t0 = 1.0f - ftime;
+      const float t1 = ftime;
+      Vec3ff a0,a1,a2,a3;
+      gather(a0,a1,a2,a3,i,itime);
+      Vec3ff b0,b1,b2,b3;
+      gather(b0,b1,b2,b3,i,itime+1);
+      p0 = madd(Vec3ff(t0),a0,t1*b0);
+      p1 = madd(Vec3ff(t0),a1,t1*b1);
+      p2 = madd(Vec3ff(t0),a2,t1*b2);
+      p3 = madd(Vec3ff(t0),a3,t1*b3);
+    }
+
+    /*! loads four control points and their normals starting at vertex i, blended between time steps itime and itime+1 selected by timeSegment(time) */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, float time) const
+    {
+      float frac;
+      const size_t seg = timeSegment(time, frac);
+
+      const float wLo = 1.0f - frac; // weight of time step seg
+      const float wHi = frac;        // weight of time step seg+1
+      Vec3ff lo0,lo1,lo2,lo3; Vec3fa loN0,loN1,loN2,loN3;
+      gather(lo0,lo1,lo2,lo3,loN0,loN1,loN2,loN3,i,seg);
+      Vec3ff hi0,hi1,hi2,hi3; Vec3fa hiN0,hiN1,hiN2,hiN3;
+      gather(hi0,hi1,hi2,hi3,hiN0,hiN1,hiN2,hiN3,i,seg+1);
+      p0 = madd(Vec3ff(wLo),lo0,wHi*hi0);
+      p1 = madd(Vec3ff(wLo),lo1,wHi*hi1);
+      p2 = madd(Vec3ff(wLo),lo2,wHi*hi2);
+      p3 = madd(Vec3ff(wLo),lo3,wHi*hi3);
+      n0 = madd(Vec3ff(wLo),loN0,wHi*hiN0);
+      n1 = madd(Vec3ff(wLo),loN1,wHi*hiN1);
+      n2 = madd(Vec3ff(wLo),loN2,wHi*hiN2);
+      n3 = madd(Vec3ff(wLo),loN3,wHi*hiN3);
+    }
+
+    template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const
+    { // builds the surface for primitive primID at time step itime from its center curve and normal curve
+      unsigned int firstVertex = curve(primID);
+      Vec3ff c0,c1,c2,c3; Vec3fa m0,m1,m2,m3;
+      gather(c0,c1,c2,c3,m0,m1,m2,m3,firstVertex,itime);
+      SourceCurve3ff centerCurve(c0,c1,c2,c3);
+      SourceCurve3fa normalCurve(m0,m1,m2,m3);
+      centerCurve = enlargeRadiusToMinWidth(context,this,ray_org,centerCurve); // only the center curve is adjusted
+      return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(centerCurve,normalCurve);
+    }
+
+    template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const
+    { // evaluates the size_t (time step) overload at seg and seg+1, then blends both surfaces with clerp
+      float frac;
+      const size_t seg = timeSegment(time, frac);
+      const TensorLinearCubicBezierSurface3fa surfLo = getNormalOrientedCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context,ray_org,primID,seg+0);
+      const TensorLinearCubicBezierSurface3fa surfHi = getNormalOrientedCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context,ray_org,primID,seg+1);
+      return clerp(surfLo,surfHi,frac);
+    }
+
+    /*! gathers the hermite segment starting with the i'th vertex: end points p0/p1 via vertex(), tangents t0/t1 via tangent() */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i) const
+    {
+      p0 = vertex (i+0);
+      t0 = tangent(i+0);
+      p1 = vertex (i+1);
+      t1 = tangent(i+1);
+    }
+
+    /*! gathers the hermite segment starting with the i'th vertex of the itime'th timestep: end points p0/p1 and tangents t0/t1 */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, size_t itime) const
+    {
+      p0 = vertex (i+0,itime);
+      t0 = tangent(i+0,itime);
+      p1 = vertex (i+1,itime);
+      t1 = tangent(i+1,itime);
+    }
+
+    /*! loads the hermite segment starting at vertex i, blended between time steps itime and itime+1 selected by timeSegment(time) */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, float time) const
+    {
+      float frac;
+      const size_t seg = timeSegment(time, frac);
+      const float wLo = 1.0f - frac, wHi = frac; // weights of time step seg and seg+1
+      Vec3ff loP0,loT0,loP1,loT1;
+      gather_hermite(loP0,loT0,loP1,loT1,i,seg);
+      Vec3ff hiP0,hiT0,hiP1,hiT1;
+      gather_hermite(hiP0,hiT0,hiP1,hiT1,i,seg+1);
+      p0 = madd(Vec3ff(wLo),loP0,wHi*hiP0);
+      t0 = madd(Vec3ff(wLo),loT0,wHi*hiT0);
+      p1 = madd(Vec3ff(wLo),loP1,wHi*hiP1);
+      t1 = madd(Vec3ff(wLo),loT1,wHi*hiT1);
+    }
+
+    /*! gathers the hermite segment starting with the i'th vertex: per end point its vertex, tangent, normal and normal derivative */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i) const
+    {
+      p0 = vertex (i+0);
+      t0 = tangent(i+0);
+      n0 = normal(i+0);
+      dn0 = dnormal(i+0);
+      p1 = vertex (i+1);
+      t1 = tangent(i+1);
+      n1 = normal(i+1);
+      dn1 = dnormal(i+1);
+    }
+
+    /*! gathers the hermite segment starting with the i'th vertex of the itime'th timestep: per end point its vertex, tangent, normal and normal derivative */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, size_t itime) const
+    {
+      p0 = vertex (i+0,itime);
+      t0 = tangent(i+0,itime);
+      n0 = normal(i+0,itime);
+      dn0 = dnormal(i+0,itime);
+      p1 = vertex (i+1,itime);
+      t1 = tangent(i+1,itime);
+      n1 = normal(i+1,itime);
+      dn1 = dnormal(i+1,itime);
+    }
+
+    /*! loads the hermite segment (with normals) at vertex i blended between time steps itime and itime+1; NOTE(review): t0/t1 are Vec3fa& here but Vec3ff& in the sibling overloads - confirm intended */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3fa& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3fa& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, float time) const
+    {
+      float frac;
+      const size_t seg = timeSegment(time, frac);
+      const float wLo = 1.0f - frac, wHi = frac; // weights of time step seg and seg+1
+      Vec3ff loP0,loT0,loP1,loT1; Vec3fa loN0,loDn0,loN1,loDn1;
+      gather_hermite(loP0,loT0,loN0,loDn0,loP1,loT1,loN1,loDn1,i,seg);
+      Vec3ff hiP0,hiT0,hiP1,hiT1; Vec3fa hiN0,hiDn0,hiN1,hiDn1;
+      gather_hermite(hiP0,hiT0,hiN0,hiDn0,hiP1,hiT1,hiN1,hiDn1,i,seg+1);
+      p0 = madd(Vec3ff(wLo),loP0,wHi*hiP0);
+      t0 = madd(Vec3ff(wLo),loT0,wHi*hiT0);
+      n0 = madd(Vec3ff(wLo),loN0,wHi*hiN0);
+      dn0= madd(Vec3ff(wLo),loDn0,wHi*hiDn0);
+      p1 = madd(Vec3ff(wLo),loP1,wHi*hiP1);
+      t1 = madd(Vec3ff(wLo),loT1,wHi*hiT1);
+      n1 = madd(Vec3ff(wLo),loN1,wHi*hiN1);
+      dn1= madd(Vec3ff(wLo),loDn1,wHi*hiDn1);
+    }
+
+    template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const
+    {
+      unsigned int firstVertex = curve(primID);
+      Vec3ff c0,ct0,c1,ct1; Vec3fa m0,dm0,m1,dm1;
+      gather_hermite(c0,ct0,m0,dm0,c1,ct1,m1,dm1,firstVertex,itime);
+      // center curve from points/tangents, normal curve from normals/normal derivatives
+      SourceCurve3ff centerCurve(c0,ct0,c1,ct1);
+      SourceCurve3fa normalCurve(m0,dm0,m1,dm1);
+      centerCurve = enlargeRadiusToMinWidth(context,this,ray_org,centerCurve); // only the center curve is adjusted
+      return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(centerCurve,normalCurve);
+    }
+
+    template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const
+    { // evaluates the size_t (time step) overload at seg and seg+1, then blends both surfaces with clerp
+      float frac;
+      const size_t seg = timeSegment(time, frac);
+      const TensorLinearCubicBezierSurface3fa surfLo = getNormalOrientedHermiteCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,seg+0);
+      const TensorLinearCubicBezierSurface3fa surfHi = getNormalOrientedHermiteCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,seg+1);
+      return clerp(surfLo,surfHi,frac);
+    }
+
+  private:
+    void resizeBuffers(unsigned int numSteps);
+
+  public:
+    BufferView<unsigned int> curves;        //!< array of curve indices
+    BufferView<Vec3ff> vertices0;           //!< fast access to first vertex buffer
+    BufferView<Vec3fa> normals0;            //!< fast access to first normal buffer
+    BufferView<Vec3ff> tangents0;           //!< fast access to first tangent buffer
+    BufferView<Vec3fa> dnormals0;           //!< fast access to first normal derivative buffer
+    vector<BufferView<Vec3ff>> vertices;    //!< vertex array for each timestep
+    vector<BufferView<Vec3fa>> normals;     //!< normal array for each timestep
+    vector<BufferView<Vec3ff>> tangents;    //!< tangent array for each timestep
+    vector<BufferView<Vec3fa>> dnormals;    //!< normal derivative array for each timestep
+    BufferView<char> flags;                 //!< per-segment start/end flags
+    vector<BufferView<char>> vertexAttribs; //!< user-provided vertex attribute buffers
+    int tessellationRate;                   //!< tessellation rate for flat curve (no in-class initializer)
+    float maxRadiusScale = 1.0;             //!< maximal min-width scaling of curve radii (defaults to 1.0)
+  };
+  
+  DECLARE_ISA_FUNCTION(CurveGeometry*, createCurves, Device* COMMA Geometry::GType);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h
new file mode 100644
index 0000000000..c08658466a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h
@@ -0,0 +1,215 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+  /*! Grid mesh geometry: a list of rectangular vertex grids (Grid) that index into shared per-timestep vertex buffers */
+  struct GridMesh : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_GRID_MESH;
+
+    /*! one grid of resX x resY vertices; vertex (x,y) lives at index startVtxID + x + y*lineVtxOffset */
+    struct Grid
+    {
+      unsigned int startVtxID;
+      unsigned int lineVtxOffset;
+      unsigned short resX,resY;
+
+      /* border flag (bit 15) set when a 3x3 vertex pattern starting at column x does not fit into the grid (x+2 >= resX) */
+      __forceinline unsigned int get3x3FlagsX(const unsigned int x) const
+      {
+        return (x + 2 >= (unsigned int)resX) ? (1<<15) : 0;
+      }
+
+      /* border flag (bit 15) set when a 3x3 vertex pattern starting at row y does not fit into the grid (y+2 >= resY) */
+      __forceinline unsigned int get3x3FlagsY(const unsigned int y) const
+      {
+        return (y + 2 >= (unsigned int)resY) ? (1<<15) : 0;
+      }
+
+      /*! outputs grid structure */
+      __forceinline friend embree_ostream operator<<(embree_ostream cout, const Grid& t) {
+        return cout << "Grid { startVtxID " << t.startVtxID << ", lineVtxOffset " << t.lineVtxOffset << ", resX " << t.resX << ", resY " << t.resY << " }";
+      }
+    };
+
+  public:
+
+    /*! grid mesh construction */
+    GridMesh (Device* device);
+
+    /* geometry interface */
+  public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void interpolate(const RTCInterpolateArguments* const args);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+    __forceinline unsigned int getNumSubGrids(const size_t gridID) const
+    { // (resX>>1)*(resY>>1) for grid gridID, clamped to at least 1; const because it only reads the grid
+      const Grid &g = grid(gridID);
+      return max((unsigned int)1,((unsigned int)g.resX >> 1) * ((unsigned int)g.resY >> 1));
+    }
+
+    /*! get fast access to first vertex buffer */
+    __forceinline float * getCompactVertexArray () const {
+      return (float*) vertices0.getPtr();
+    }
+
+  public:
+
+    /*! returns number of vertices (size of the vertex buffer of the first time step) */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+
+    /*! returns i'th grid*/
+    __forceinline const Grid& grid(size_t i) const {
+      return grids[i];
+    }
+
+    /*! returns i'th vertex of the first time step  */
+    __forceinline const Vec3fa vertex(size_t i) const { // FIXME: check if this does a unaligned load
+      return vertices0[i];
+    }
+
+    /*! returns a pointer to the i'th vertex of the first time step */
+    __forceinline const char* vertexPtr(size_t i) const {
+      return vertices0.getPtr(i);
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline const Vec3fa vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns a pointer to the i'th vertex of itime'th timestep */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+      return vertices[itime].getPtr(i);
+    }
+
+    /*! returns the vertex buffer index of grid point (x,y) of grid g (computed in size_t) */
+    __forceinline size_t grid_vertex_index(const Grid& g, size_t x, size_t y) const {
+      assert(x < (size_t)g.resX);
+      assert(y < (size_t)g.resY);
+      return g.startVtxID + x + y * g.lineVtxOffset;
+    }
+
+    /*! returns the vertex at grid point (x,y) of grid g for the first timestep */
+    __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y) const {
+      const size_t index = grid_vertex_index(g,x,y);
+      return vertex(index);
+    }
+
+    /*! returns the vertex at grid point (x,y) of grid g for the itime'th timestep */
+    __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y, size_t itime) const {
+      const size_t index = grid_vertex_index(g,x,y);
+      return vertex(index,itime);
+    }
+
+    /*! calculates the bounds over all time steps of the (up to) 3x3 vertex block starting at (sx,sy); returns false and leaves bbox untouched if a vertex is invalid */
+    __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, BBox3fa& bbox) const
+    {
+      BBox3fa b(empty);
+      for (size_t t=0; t<numTimeSteps; t++)
+      {
+        for (size_t y=sy;y<min(sy+3,(size_t)g.resY);y++)
+          for (size_t x=sx;x<min(sx+3,(size_t)g.resX);x++)
+          {
+            const Vec3fa v = grid_vertex(g,x,y,t);
+            if (unlikely(!isvalid(v))) return false;
+            b.extend(v);
+          }
+      }
+
+      bbox = b;
+      return true;
+    }
+
+    /*! calculates the bounds at the itime'th time step of the (up to) 3x3 vertex block starting at (sx,sy); returns false and leaves bbox untouched if a vertex is invalid */
+    __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, size_t itime, BBox3fa& bbox) const
+    {
+      assert(itime < numTimeSteps);
+      BBox3fa b0(empty);
+      for (size_t y=sy;y<min(sy+3,(size_t)g.resY);y++)
+        for (size_t x=sx;x<min(sx+3,(size_t)g.resX);x++)
+        {
+          const Vec3fa v = grid_vertex(g,x,y,itime);
+          if (unlikely(!isvalid(v))) return false;
+          b0.extend(v);
+        }
+
+      /* only the bounds of time step itime are reported */
+      bbox = b0;
+      return true;
+    }
+
+    __forceinline bool valid(size_t gridID, size_t itime=0) const { // single-timestep form of the range overload
+      return valid(gridID, make_range(itime, itime));
+    }
+
+    /*! check if the i'th primitive is valid between the specified time range (itime_range.end() is inclusive) */
+    __forceinline bool valid(size_t gridID, const range<size_t>& itime_range) const
+    {
+      if (unlikely(gridID >= grids.size())) return false;
+      const Grid &g = grid(gridID);
+      if (unlikely(g.startVtxID + 0 >= vertices0.size())) return false;
+      if (unlikely(size_t(g.startVtxID) + size_t(g.resY-1)*g.lineVtxOffset + g.resX-1 >= vertices0.size())) return false; // size_t math, matching grid_vertex_index, so a large index cannot wrap in 32 bits and slip through
+
+      for (size_t y=0;y<g.resY;y++)
+        for (size_t x=0;x<g.resX;x++)
+          for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+            if (!isvalid(grid_vertex(g,x,y,itime))) return false;
+      return true;
+    }
+
+    /*! bounds of the (up to) 3x3 vertex block at (sx,sy) for time step itime; the box stays empty if buildBounds fails */
+
+    __forceinline BBox3fa bounds(const Grid& g, size_t sx, size_t sy, size_t itime) const
+    {
+      BBox3fa box(empty);
+      buildBounds(g,sx,sy,itime,box);
+      return box;
+    }
+
+    __forceinline LBBox3fa linearBounds(const Grid& g, size_t sx, size_t sy, size_t itime) const {
+      BBox3fa bounds0(empty), bounds1(empty); // start empty: buildBounds leaves its output untouched on failure
+      buildBounds(g,sx,sy,itime+0,bounds0);
+      buildBounds(g,sx,sy,itime+1,bounds1);
+      return LBBox3fa(bounds0,bounds1);
+    }
+
+    /*! calculates the linear bounds of the (up to) 3x3 vertex block at (sx,sy) for the specified time range */
+    __forceinline LBBox3fa linearBounds(const Grid& g, size_t sx, size_t sy, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(g,sx,sy,itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+  public:
+    BufferView<Grid> grids;      //!< array of grids
+    BufferView<Vec3fa> vertices0;        //!< fast access to first vertex buffer
+    vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep
+    vector<RawBufferView> vertexAttribs; //!< vertex attributes
+  };
+
+  namespace isa
+  {
+    /*! GridMesh subclass in the isa namespace; adds no members and only forwards the device to the base constructor */
+    struct GridMeshISA : public GridMesh
+    {
+      GridMeshISA (Device* device) : GridMesh(device) {}
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(GridMesh*, createGridMesh, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_instance.h b/thirdparty/embree-aarch64/kernels/common/scene_instance.h
new file mode 100644
index 0000000000..7ff82a4fb8
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_instance.h
@@ -0,0 +1,272 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "accel.h"
+
+namespace embree
+{
+  struct MotionDerivativeCoefficients;
+
+  /*! Instance geometry: references an acceleration structure (object) and holds one local-to-world transform per time step */
+  struct Instance : public Geometry
+  {
+    ALIGNED_STRUCT_(16);
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_INSTANCE;
+
+  public:
+    Instance (Device* device, Accel* object = nullptr, unsigned int numTimeSteps = 1);
+    ~Instance();
+
+  private:
+    Instance (const Instance& other) DELETED; // non-copyable: do not implement
+    Instance& operator= (const Instance& other) DELETED; // non-copyable: do not implement
+
+  private:
+    LBBox3fa nonlinearBounds(const BBox1f& time_range_in,
+                             const BBox1f& geom_time_range,
+                             float geom_time_segments) const;
+
+    BBox3fa boundSegment(size_t itime,
+      BBox3fa const& obbox0, BBox3fa const& obbox1,
+      BBox3fa const& bbox0, BBox3fa const& bbox1,
+      float t_min, float t_max) const;
+
+    /* calculates the (correct) interpolated bounds between time steps itime0 and itime1 with factor f: transform is slerp'ed for quaternion instances, lerp'ed otherwise; object bounds are lerp'ed */
+    __forceinline BBox3fa bounds(size_t itime0, size_t itime1, float f) const
+    {
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+        return xfmBounds(slerp(local2world[itime0], local2world[itime1], f),
+                         lerp(getObjectBounds(itime0), getObjectBounds(itime1), f));
+      return xfmBounds(lerp(local2world[itime0], local2world[itime1], f),
+                        lerp(getObjectBounds(itime0), getObjectBounds(itime1), f));
+    }
+
+  public:
+    virtual void setNumTimeSteps (unsigned int numTimeSteps) override;
+    virtual void setInstancedScene(const Ref<Scene>& scene) override;
+    virtual void setTransform(const AffineSpace3fa& local2world, unsigned int timeStep) override;
+    virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) override;
+    virtual AffineSpace3fa getTransform(float time) override;
+    virtual void setMask (unsigned mask) override;
+    virtual void build() {} // no-op
+    virtual void addElementsToCount (GeometryCounts & counts) const override;
+    virtual void commit() override;
+
+  public:
+
+     /*! calculates the bounds of the instance from local2world[0] and the object's bounds (i must be 0) */
+    __forceinline BBox3fa bounds(size_t i) const {
+      assert(i == 0);
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+        return xfmBounds(quaternionDecompositionToAffineSpace(local2world[0]),object->bounds.bounds());
+      return xfmBounds(local2world[0],object->bounds.bounds());
+    }
+
+    /*! gets the bounds of the instanced scene, queried from the object at timeStep(itime) */
+    __forceinline BBox3fa getObjectBounds(size_t itime) const {
+      return object->getBounds(timeStep(itime));
+    }
+
+     /*! calculates the bounds of the instance at the itime'th time step (i must be 0) */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const {
+      assert(i == 0);
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+        return xfmBounds(quaternionDecompositionToAffineSpace(local2world[itime]),getObjectBounds(itime));
+      return xfmBounds(local2world[itime],getObjectBounds(itime));
+    }
+
+    /*! calculates the linear bounds of the instance for the specified time range via nonlinearBounds (i must be 0) */
+    __forceinline LBBox3fa linearBounds(size_t i, const BBox1f& dt) const {
+      assert(i == 0);
+      LBBox3fa lbbox = nonlinearBounds(dt, time_range, fnumTimeSegments);
+      return lbbox;
+    }
+
+    /*! calculates the build bounds of the i'th item (i must be 0); writes them to *bbox when bbox is non-null and returns whether they are valid */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+    {
+      assert(i==0);
+      const BBox3fa b = bounds(i);
+      if (bbox) *bbox = b;
+      return isvalid(b);
+    }
+
+     /*! calculates the build bounds of the i'th item at the itime'th time segment, if it's valid; NOTE(review): itime (size_t) is passed to linearBounds, whose only overload visible in this struct takes a BBox1f - confirm the intended overload/conversion */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      assert(i==0);
+      const LBBox3fa bounds = linearBounds(i,itime);
+      bbox = bounds.bounds ();
+      return isvalid(bounds);
+    }
+
+    /* gets version info of topology (numPrimitives is used as the version) */
+    unsigned int getTopologyVersion() const {
+      return numPrimitives;
+    }
+  
+    /* returns true if topology changed, i.e. numPrimitives differs from otherVersion */
+    bool topologyChanged(unsigned int otherVersion) const {
+      return numPrimitives != otherVersion;
+    }
+
+    /*! check if the i'th primitive is valid between the specified time range (itime_range.end() is inclusive; i must be 0) */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      assert(i == 0);
+      for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+        if (!isvalid(bounds(i,itime))) return false;
+
+      return true;
+    }
+
+    __forceinline AffineSpace3fa getLocal2World() const // local-to-world transform of time step 0
+    {
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+        return quaternionDecompositionToAffineSpace(local2world[0]);
+      return local2world[0];
+    }
+
+    __forceinline AffineSpace3fa getLocal2World(float t) const // transform at time t: slerp (quaternion instances) or lerp between steps itime and itime+1
+    {
+      float ftime; const unsigned int itime = timeSegment(t, ftime);
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+        return slerp(local2world[itime+0],local2world[itime+1],ftime);
+      return lerp(local2world[itime+0],local2world[itime+1],ftime);
+    }
+
+    __forceinline AffineSpace3fa getWorld2Local() const { // returns the stored world2local0 (time step 0)
+      return world2local0;
+    }
+
+    __forceinline AffineSpace3fa getWorld2Local(float t) const { // rcp() of the interpolated local-to-world transform
+      return rcp(getLocal2World(t));
+    }
+
+    template<int K>
+    __forceinline AffineSpace3vf<K> getWorld2Local(const vbool<K>& valid, const vfloat<K>& t) const // K-wide variant: dispatches on the instance subtype
+    {
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+        return getWorld2LocalSlerp(valid, t);
+      return getWorld2LocalLerp(valid, t);
+    }
+
+    private:
+
+    template<int K>
+    __forceinline AffineSpace3vf<K> getWorld2LocalSlerp(const vbool<K>& valid, const vfloat<K>& t) const
+    {
+      vfloat<K> ftime;
+      const vint<K> itime_k = timeSegment(t, ftime);
+      assert(any(valid));
+      const size_t index = bsf(movemask(valid)); // first lane set in valid
+      const int itime = itime_k[index];
+      if (likely(all(valid, itime_k == vint<K>(itime)))) { // fast path: all valid lanes share one time segment
+        return rcp(slerp(AffineSpace3vff<K>(local2world[itime+0]),
+                         AffineSpace3vff<K>(local2world[itime+1]),
+                         ftime));
+      }
+      else {
+        AffineSpace3vff<K> space0,space1; // only lanes selected below are written
+        vbool<K> valid1 = valid;
+        while (any(valid1)) { // presumably next_unique removes the handled lanes from valid1 and reports them in valid2 - confirm
+          vbool<K> valid2;
+          const int itime = next_unique(valid1, itime_k, valid2);
+          space0 = select(valid2, AffineSpace3vff<K>(local2world[itime+0]), space0);
+          space1 = select(valid2, AffineSpace3vff<K>(local2world[itime+1]), space1);
+        }
+        return rcp(slerp(space0, space1, ftime));
+      }
+    }
+
+    template<int K>
+    __forceinline AffineSpace3vf<K> getWorld2LocalLerp(const vbool<K>& valid, const vfloat<K>& t) const
+    {
+      vfloat<K> ftime;
+      const vint<K> itime_k = timeSegment(t, ftime);
+      assert(any(valid));
+      const size_t index = bsf(movemask(valid)); // first lane set in valid
+      const int itime = itime_k[index];
+      if (likely(all(valid, itime_k == vint<K>(itime)))) { // fast path: all valid lanes share one time segment
+        return rcp(lerp(AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+0]),
+                        AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+1]),
+                        ftime));
+      } else {
+        AffineSpace3vf<K> space0,space1; // only lanes selected below are written
+        vbool<K> valid1 = valid;
+        while (any(valid1)) { // presumably next_unique removes the handled lanes from valid1 and reports them in valid2 - confirm
+          vbool<K> valid2;
+          const int itime = next_unique(valid1, itime_k, valid2);
+          space0 = select(valid2, AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+0]), space0);
+          space1 = select(valid2, AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+1]), space1);
+        }
+        return rcp(lerp(space0, space1, ftime));
+      }
+    }
+
+  public:
+    Accel* object;                 //!< pointer to instanced acceleration structure
+    AffineSpace3ff* local2world;   //!< transformation from local space to world space for each timestep (either normal matrix or quaternion decomposition)
+    AffineSpace3fa world2local0;   //!< transformation from world space to local space for timestep 0
+  };
+
+  namespace isa
+  {
+    struct InstanceISA : public Instance
+    {
+      InstanceISA (Device* device)
+        : Instance(device) {}
+
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        assert(r.begin() == 0);
+        assert(r.end()   == 1);
+
+        PrimInfo info(empty);
+        BBox3fa box(empty);
+        const bool hasBounds = buildBounds(0,&box);
+        // an instance contributes a single primitive (index 0); when its
+        // bounds are not valid nothing is written and the empty info is returned
+        if (!hasBounds) return info;
+        const PrimRef ref(box,geomID,unsigned(0));
+        info.add_center2(ref);
+        prims[k] = ref; // k is a by-value parameter, so no increment is needed
+        return info;
+      }
+
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        assert(r.begin() == 0);
+        assert(r.end()   == 1);
+
+        PrimInfo info(empty);
+        BBox3fa box(empty);
+        const bool hasBounds = buildBounds(0,&box);
+        // itime is not consulted here: the same bounds as in createPrimRefArray are used
+        if (!hasBounds) return info;
+        const PrimRef ref(box,geomID,unsigned(0));
+        info.add_center2(ref);
+        prims[k] = ref; // k is a by-value parameter, so no increment is needed
+        return info;
+      }
+
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        assert(r.begin() == 0);
+        assert(r.end()   == 1);
+
+        PrimInfoMB info(empty);
+        if (!valid(0, timeSegmentRange(t0t1))) return info; // skip if invalid at any time step of the segment range
+        const PrimRefMB ref(linearBounds(0,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(0));
+        info.add_primref(ref);
+        prims[k] = ref; // k is a by-value parameter, so no increment is needed
+        return info;
+      }
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(Instance*, createInstance, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h b/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h
new file mode 100644
index 0000000000..c0f9ee8f77
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h
@@ -0,0 +1,307 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+  /*! represents an array of line segments */
+  struct LineSegments : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE2;
+
+  public:
+
+    /*! line segments construction */
+    LineSegments (Device* device, Geometry::GType gtype);
+
+  public:
+    void setMask (unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify ();
+    void interpolate(const RTCInterpolateArguments* const args);
+    void setTessellationRate(float N);
+    void setMaxRadiusScale(float s);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+  public:
+
+    /*! returns the number of vertices, taken from the vertex buffer of the first time step (vertices[0]) */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+
+    /*! returns the i'th entry of the segment buffer; bounds() and valid() use it as the index of the segment's first vertex */
+    __forceinline const unsigned int& segment(size_t i) const {
+      return segments[i];
+    }
+
+    /*! returns true if the RTC_CURVE_FLAG_NEIGHBOR_LEFT bit is set in the flags of the i'th segment (asserts that flags is set) */
+    __forceinline bool segmentLeftExists(size_t i) const {
+      assert (flags);
+      return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_LEFT) != 0;
+    }
+
+    /*! returns true if the RTC_CURVE_FLAG_NEIGHBOR_RIGHT bit is set in the flags of the i'th segment (asserts that flags is set) */
+    __forceinline bool segmentRightExists(size_t i) const {
+      assert (flags);
+      return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_RIGHT) != 0;
+    }
+
+    /*! returns i'th vertex of the first time step (read from vertices0; its w component is what radius() returns) */
+    __forceinline Vec3ff vertex(size_t i) const {
+      return vertices0[i];
+    }
+
+    /*! returns a pointer to the i'th vertex of the first time step */
+    __forceinline const char* vertexPtr(size_t i) const {
+      return vertices0.getPtr(i);
+    }
+
+    /*! returns i'th normal of the first time step (read from normals0) */
+    __forceinline Vec3fa normal(size_t i) const {
+      return normals0[i];
+    }
+
+    /*! returns i'th radius of the first time step (the w component of the i'th vertex) */
+    __forceinline float radius(size_t i) const {
+      return vertices0[i].w;
+    }
+
+    /*! returns i'th vertex of itime'th timestep (read from vertices[itime]) */
+    __forceinline Vec3ff vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns a pointer to the i'th vertex of itime'th timestep */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+      return vertices[itime].getPtr(i);
+    }
+
+    /*! returns i'th normal of itime'th timestep (read from normals[itime]) */
+    __forceinline Vec3fa normal(size_t i, size_t itime) const {
+      return normals[itime][i];
+    }
+
+    /*! returns i'th radius of itime'th timestep (the w component of the i'th vertex of that timestep) */
+    __forceinline float radius(size_t i, size_t itime) const {
+      return vertices[itime][i].w;
+    }
+
+    /*! bounding box of the segment v0-v1: box around both end points, enlarged by maxRadiusScale times the larger of the two radii (w) */
+    __forceinline BBox3fa bounds(const Vec3ff& v0, const Vec3ff& v1) const
+    {
+      const BBox3ff hull = merge(BBox3ff(v0),BBox3ff(v1));
+      return enlarge((BBox3fa)hull,maxRadiusScale*Vec3fa(max(v0.w,v1.w)));
+    }
+
+    /*! bounding box of the i'th line segment at the first time step; its end points are vertices segment(i) and segment(i)+1 */
+    __forceinline BBox3fa bounds(size_t i) const
+    {
+      const unsigned int head = segment(i);
+      const Vec3ff endA = vertex(head+0);
+      const Vec3ff endB = vertex(head+1);
+      return bounds(endA,endB);
+    }
+
+    /*! bounding box of the i'th line segment for the itime'th time step; its end points are vertices segment(i) and segment(i)+1 */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const
+    {
+      const unsigned int head = segment(i);
+      const Vec3ff endA = vertex(head+0,itime);
+      const Vec3ff endB = vertex(head+1,itime);
+      return bounds(endA,endB);
+    }
+
+    /*! bounding box of the i'th line segment (first time step) after its end points are transformed by space; radii (w) are carried over unchanged */
+    __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const
+    {
+      const unsigned int head = segment(i);
+      const Vec3ff endA = vertex(head+0);
+      const Vec3ff endB = vertex(head+1);
+      const Vec3ff xfmA(xfmVector(space,(Vec3fa)endA),endA.w);
+      const Vec3ff xfmB(xfmVector(space,(Vec3fa)endB),endB.w);
+      return bounds(xfmA,xfmB);
+    }
+
+    /*! bounding box of the i'th line segment (itime'th time step) after its end points are transformed by space; radii (w) are carried over unchanged */
+    __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const
+    {
+      const unsigned int head = segment(i);
+      const Vec3ff endA = vertex(head+0,itime);
+      const Vec3ff endB = vertex(head+1,itime);
+      const Vec3ff xfmA(xfmVector(space,(Vec3fa)endA),endA.w);
+      const Vec3ff xfmB(xfmVector(space,(Vec3fa)endB),endB.w);
+      return bounds(xfmA,xfmB);
+    }
+
+    /*! check if the i'th primitive is valid at the itime'th timestep (forwards the one-step range make_range(itime, itime) to the range overload) */
+    __forceinline bool valid(size_t i, size_t itime) const {
+      return valid(i, make_range(itime, itime));
+    }
+
+    /*! check if the i'th primitive is valid between the specified time range (itime_range.end() is inclusive): both vertex indices in range, both vertices pass isvalid4, no negative radius */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      const size_t index = segment(i); // widened from unsigned int so index+1 below is not computed in 32 bits
+      const size_t numVerts = numVertices();
+      if (numVerts < 2 || index > numVerts-2) return false; // same as index+1 >= numVerts, but cannot wrap around for index == 0xFFFFFFFF
+      for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+      {
+        const Vec3ff v0 = vertex(index+0,itime); if (unlikely(!isvalid4(v0))) return false;
+        const Vec3ff v1 = vertex(index+1,itime); if (unlikely(!isvalid4(v1))) return false;
+        if (min(v0.w,v1.w) < 0.0f) return false; // w holds the radius
+      }
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th primitive from its bounds at time steps itime and itime+1 */
+    __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const {
+      return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1));
+    }
+
+    /*! stores the bounds of the i'th primitive in *bbox and returns true; returns false (leaving *bbox untouched) if it is invalid at time step 0 */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const
+    {
+      const bool usable = valid(i,0);
+      if (usable) *bbox = bounds(i);
+      return usable;
+    }
+
+    /*! stores the build bounds of the i'th primitive for time segment itime in bbox; returns false (bbox untouched) unless both itime and itime+1 are valid */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      const bool usable = valid(i,itime+0) && valid(i,itime+1);  // both ends of the time segment are checked
+      if (usable) bbox = bounds(i,itime);  // use bounds of first time step in builder
+      return usable;
+    }
+
+    /*! calculates the linear bounds of primitive primID for the time range dt; LBBox3fa receives a per-time-step bounds callback plus time_range and fnumTimeSegments */
+    __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! same as linearBounds(primID,dt), but the per-time-step bounds are computed with the vertices transformed by space */
+    __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! stores the linear bounds of the i'th primitive for time_range in bbox; returns false (bbox untouched) if the primitive is invalid in the matching time segment range */
+    __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const
+    {
+      const bool usable = valid(i, timeSegmentRange(time_range));
+      if (usable) bbox = linearBounds(i, time_range);
+      return usable;
+    }
+
+    /*! get fast access to first vertex buffer: the pointer returned by vertices0.getPtr() cast to float* */
+    __forceinline float * getCompactVertexArray () const {
+      return (float*) vertices0.getPtr();
+    }
+
+  public:
+    BufferView<unsigned int> segments;      //!< array of line segment indices
+    BufferView<Vec3ff> vertices0;           //!< fast access to first vertex buffer
+    BufferView<Vec3fa> normals0;            //!< fast access to first normal buffer
+    BufferView<char> flags;                 //!< start, end flag per segment
+    vector<BufferView<Vec3ff>> vertices;    //!< vertex array for each timestep
+    vector<BufferView<Vec3fa>> normals;     //!< normal array for each timestep
+    vector<BufferView<char>> vertexAttribs; //!< user buffers
+    int tessellationRate;                   //!< tessellation rate for bezier curve
+    float maxRadiusScale = 1.0;             //!< maximal min-width scaling of curve radii
+  };
+
+  namespace isa
+  {
+    struct LineSegmentsISA : public LineSegments  // LineSegments plus direction queries, PrimRef array creation and bounds wrappers
+    {
+      LineSegmentsISA (Device* device, Geometry::GType gtype)
+        : LineSegments(device,gtype) {}
+
+      Vec3fa computeDirection(unsigned int primID) const  // second vertex minus first vertex of the segment
+      {
+        const unsigned first = segment(primID);
+        const Vec3fa from = vertex(first);
+        const Vec3fa to = vertex(first+1);
+        return to-from;
+      }
+
+      Vec3fa computeDirection(unsigned int primID, size_t time) const  // same, using the vertices of time step `time`
+      {
+        const unsigned first = segment(primID);
+        const Vec3fa from = vertex(first,time);
+        const Vec3fa to = vertex(first+1,time);
+        return to-from;
+      }
+
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);  // accumulates every PrimRef written below
+        for (size_t primID=r.begin(); primID<r.end(); ++primID)
+        {
+          BBox3fa box = empty;
+          if (!buildBounds(primID,&box)) continue;  // primitives without build bounds are skipped
+          const PrimRef ref(box,geomID,unsigned(primID));
+          info.add_center2(ref);
+          prims[k++] = ref;  // output is packed densely starting at slot k
+        }
+        return info;
+      }
+
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);  // accumulates every PrimRef written below
+        for (size_t primID=r.begin(); primID<r.end(); ++primID)
+        {
+          BBox3fa box = empty;
+          if (!buildBounds(primID,itime,box)) continue;  // primitives without build bounds at itime are skipped
+          const PrimRef ref(box,geomID,unsigned(primID));
+          info.add_center2(ref);
+          prims[k++] = ref;
+        }
+        return info;
+      }
+      
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfoMB info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); ++primID)
+        {
+          if (!valid(primID, timeSegmentRange(t0t1))) continue;  // must be valid over the whole time segment range of t0t1
+          const PrimRefMB ref(linearBounds(primID,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(primID));
+          info.add_primref(ref);
+          prims[k++] = ref;
+        }
+        return info;
+      }
+
+      BBox3fa vbounds(size_t i) const {  // forwards to bounds(i)
+        return this->bounds(i);
+      }
+      
+      BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const {  // forwards to bounds(space,i)
+        return this->bounds(space,i);
+      }
+
+      LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const {  // forwards to linearBounds(primID,time_range)
+        return this->linearBounds(primID,time_range);
+      }
+      
+      LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const {  // forwards to linearBounds(space,primID,time_range)
+        return this->linearBounds(space,primID,time_range);
+      }
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(LineSegments*, createLineSegments, Device* COMMA Geometry::GType);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_points.h b/thirdparty/embree-aarch64/kernels/common/scene_points.h
new file mode 100644
index 0000000000..1d39ed07ba
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_points.h
@@ -0,0 +1,282 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "buffer.h"
+#include "default.h"
+#include "geometry.h"
+
+namespace embree
+{
+  /*! represents an array of points */
+  struct Points : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_POINTS;
+
+   public:
+    /*! points construction */
+    Points(Device* device, Geometry::GType gtype);
+
+   public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps(unsigned int numTimeSteps);
+    void setVertexAttributeCount(unsigned int N);
+    void setBuffer(RTCBufferType type,
+                   unsigned int slot,
+                   RTCFormat format,
+                   const Ref<Buffer>& buffer,
+                   size_t offset,
+                   size_t stride,
+                   unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void setMaxRadiusScale(float s);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+   public:
+    /*! returns the number of vertices (size of the first time step's vertex buffer) */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+
+    /*! returns i'th vertex of the first time step */
+    __forceinline Vec3ff vertex(size_t i) const {
+      return vertices0[i];
+    }
+
+    /*! returns pointer to the i'th vertex of the first time step */
+    __forceinline const char* vertexPtr(size_t i) const {
+      return vertices0.getPtr(i);
+    }
+
+    /*! returns i'th normal of the first time step */
+    __forceinline Vec3fa normal(size_t i) const {
+      return normals0[i];
+    }
+
+    /*! returns i'th radius of the first time step (stored in the vertex's w component) */
+    __forceinline float radius(size_t i) const {
+      return vertices0[i].w;
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline Vec3ff vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns pointer to the i'th vertex of itime'th timestep */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+      return vertices[itime].getPtr(i);
+    }
+
+    /*! returns i'th normal of itime'th timestep */
+    __forceinline Vec3fa normal(size_t i, size_t itime) const {
+      return normals[itime][i];
+    }
+
+    /*! returns i'th radius of itime'th timestep (stored in the vertex's w component) */
+    __forceinline float radius(size_t i, size_t itime) const {
+      return vertices[itime][i].w;
+    }
+
+    /*! calculates bounding box of a point: the box around v0 enlarged by maxRadiusScale times its radius v0.w */
+    __forceinline BBox3fa bounds(const Vec3ff& v0) const {
+      return enlarge(BBox3fa(v0), maxRadiusScale*Vec3fa(v0.w));
+    }
+
+    /*! calculates bounding box of i'th point (first time step) */
+    __forceinline BBox3fa bounds(size_t i) const
+    {
+      const Vec3ff v0 = vertex(i);
+      return bounds(v0);
+    }
+
+    /*! calculates bounding box of i'th point for the itime'th time step */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const
+    {
+      const Vec3ff v0 = vertex(i, itime);
+      return bounds(v0);
+    }
+
+    /*! calculates bounding box of i'th point with its position transformed by space (radius unchanged) */
+    __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const
+    {
+      const Vec3ff v0 = vertex(i);
+      const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w);
+      return bounds(w0);
+    }
+
+    /*! calculates bounding box of i'th point for the itime'th time step with its position transformed by space */
+    __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const
+    {
+      const Vec3ff v0 = vertex(i, itime);
+      const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w);
+      return bounds(w0);
+    }
+
+    /*! check if the i'th primitive is valid at the itime'th timestep */
+    __forceinline bool valid(size_t i, size_t itime) const {
+      return valid(i, make_range(itime, itime));
+    }
+
+    /*! check if the i'th primitive is valid at every time step of itime_range (begin() and end() both inclusive) */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      /* compare the untruncated index: casting to unsigned int first could map an out-of-range i onto a valid vertex */
+      if (i >= numVertices())
+        return false;
+
+      for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) {
+        const Vec3ff v0 = vertex(i, itime);
+        if (unlikely(!isvalid4(v0)))
+          return false;
+        if (v0.w < 0.0f)
+          return false;
+      }
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th primitive from its bounds at time steps itime and itime+1 */
+    __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const {
+      return LBBox3fa(bounds(i, itime + 0), bounds(i, itime + 1));
+    }
+
+    /*! calculates the build bounds of the i'th primitive, if it's valid at time step 0 */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const
+    {
+      if (!valid(i, 0))
+        return false;
+      *bbox = bounds(i);
+      return true;
+    }
+
+    /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      if (!valid(i, itime + 0) || !valid(i, itime + 1))
+        return false;
+      bbox = bounds(i, itime);  // use bounds of first time step in builder
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range */
+    __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&](size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range, in the given space */
+    __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&](size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range, if it's valid there */
+    __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const
+    {
+      if (!valid(i, timeSegmentRange(time_range))) return false;
+      bbox = linearBounds(i, time_range);
+      return true;
+    }
+
+    /*! get fast access to first vertex buffer */
+    __forceinline float * getCompactVertexArray () const {
+      return (float*) vertices0.getPtr();
+    }
+
+   public:
+    BufferView<Vec3ff> vertices0;            //!< fast access to first vertex buffer
+    BufferView<Vec3fa> normals0;             //!< fast access to first normal buffer
+    vector<BufferView<Vec3ff>> vertices;     //!< vertex array for each timestep
+    vector<BufferView<Vec3fa>> normals;      //!< normal array for each timestep
+    vector<BufferView<char>> vertexAttribs;  //!< user buffers
+    float maxRadiusScale = 1.0;              //!< factor applied to the point radius when enlarging bounds
+  };
+
+  namespace isa
+  {
+    struct PointsISA : public Points  // Points plus direction queries, PrimRef array creation and bounds wrappers
+    {
+      PointsISA(Device* device, Geometry::GType gtype) : Points(device, gtype) {}
+
+      Vec3fa computeDirection(unsigned int primID) const  // always the constant (1,0,0); primID is not used
+      {
+        return Vec3fa(1, 0, 0);
+      }
+
+      Vec3fa computeDirection(unsigned int primID, size_t time) const  // always the constant (1,0,0); arguments are not used
+      {
+        return Vec3fa(1, 0, 0);
+      }
+
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);  // accumulates every PrimRef written below
+        for (size_t primID = r.begin(); primID < r.end(); ++primID) {
+          BBox3fa box = empty;
+          if (!buildBounds(primID, &box))
+            continue;  // primitives without build bounds are skipped
+          const PrimRef ref(box, geomID, unsigned(primID));
+          info.add_center2(ref);
+          prims[k++] = ref;  // output is packed densely starting at slot k
+        }
+        return info;
+      }
+
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);  // accumulates every PrimRef written below
+        for (size_t primID = r.begin(); primID < r.end(); ++primID) {
+          BBox3fa box = empty;
+          if (!buildBounds(primID, itime, box))
+            continue;  // primitives without build bounds at itime are skipped
+          const PrimRef ref(box, geomID, unsigned(primID));
+          info.add_center2(ref);
+          prims[k++] = ref;
+        }
+        return info;
+      }
+
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims,
+                                      const BBox1f& t0t1,
+                                      const range<size_t>& r,
+                                      size_t k,
+                                      unsigned int geomID) const
+      {
+        PrimInfoMB info(empty);
+        for (size_t primID = r.begin(); primID < r.end(); ++primID) {
+          if (!valid(primID, timeSegmentRange(t0t1)))
+            continue;  // must be valid over the whole time segment range of t0t1
+          const PrimRefMB ref(linearBounds(primID, t0t1), this->numTimeSegments(), this->time_range, this->numTimeSegments(), geomID, unsigned(primID));
+          info.add_primref(ref);
+          prims[k++] = ref;
+        }
+        return info;
+      }
+
+      BBox3fa vbounds(size_t i) const  // forwards to bounds(i)
+      {
+        return this->bounds(i);
+      }
+
+      BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const  // forwards to bounds(space, i)
+      {
+        return this->bounds(space, i);
+      }
+
+      LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const  // forwards to linearBounds(primID, time_range)
+      {
+        return this->linearBounds(primID, time_range);
+      }
+
+      LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const  // forwards to linearBounds(space, primID, time_range)
+      {
+        return this->linearBounds(space, primID, time_range);
+      }
+    };
+  }  // namespace isa
+
+  DECLARE_ISA_FUNCTION(Points*, createPoints, Device* COMMA Geometry::GType);
+}  // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h
new file mode 100644
index 0000000000..d5bb054b14
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h
@@ -0,0 +1,277 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+  /*! Quad Mesh */
+  struct QuadMesh : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_QUAD_MESH;
+    
+    /*! quad indices: the four vertex indices of one quad */
+    struct Quad
+    {
+      uint32_t v[4];
+
+      /*! outputs quad indices */
+      __forceinline friend embree_ostream operator<<(embree_ostream cout, const Quad& q) {
+        return cout << "Quad {" << q.v[0] << ", " << q.v[1] << ", " << q.v[2] << ", " << q.v[3] << " }";
+      }
+    };
+
+  public:
+
+    /*! quad mesh construction */
+    QuadMesh (Device* device); 
+  
+    /* geometry interface */
+  public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void interpolate(const RTCInterpolateArguments* const args);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+  public:
+
+    /*! returns number of vertices (size of the first time step's vertex buffer) */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+    
+    /*! returns i'th quad */
+    __forceinline const Quad& quad(size_t i) const {
+      return quads[i];
+    }
+
+    /*! returns i'th vertex of the first time step */
+    __forceinline const Vec3fa vertex(size_t i) const {
+      return vertices0[i];
+    }
+
+    /*! returns pointer to the i'th vertex of the first time step */
+    __forceinline const char* vertexPtr(size_t i) const {
+      return vertices0.getPtr(i);
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline const Vec3fa vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns pointer to the i'th vertex of itime'th timestep */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+      return vertices[itime].getPtr(i);
+    }
+
+    /*! calculates the bounds of the i'th quad (first time step); vertex indices are not range-checked here */
+    __forceinline BBox3fa bounds(size_t i) const 
+    {
+      const Quad& q = quad(i);
+      const Vec3fa v0 = vertex(q.v[0]);
+      const Vec3fa v1 = vertex(q.v[1]);
+      const Vec3fa v2 = vertex(q.v[2]);
+      const Vec3fa v3 = vertex(q.v[3]);
+      return BBox3fa(min(v0,v1,v2,v3),max(v0,v1,v2,v3));
+    }
+
+    /*! calculates the bounds of the i'th quad at the itime'th timestep; vertex indices are not range-checked here */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const
+    {
+      const Quad& q = quad(i);
+      const Vec3fa v0 = vertex(q.v[0],itime);
+      const Vec3fa v1 = vertex(q.v[1],itime);
+      const Vec3fa v2 = vertex(q.v[2],itime);
+      const Vec3fa v3 = vertex(q.v[3],itime);
+      return BBox3fa(min(v0,v1,v2,v3),max(v0,v1,v2,v3));
+    }
+
+    /*! check if the i'th primitive is valid at the itime'th timestep */
+    __forceinline bool valid(size_t i, size_t itime) const {
+      return valid(i, make_range(itime, itime));
+    }
+
+    /*! check if the i'th primitive is valid at every time step of itime_range (begin() and end() both inclusive) */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      const Quad& q = quad(i);
+      if (unlikely(q.v[0] >= numVertices())) return false;
+      if (unlikely(q.v[1] >= numVertices())) return false;
+      if (unlikely(q.v[2] >= numVertices())) return false;
+      if (unlikely(q.v[3] >= numVertices())) return false;
+
+      for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+      {
+        if (!isvalid(vertex(q.v[0],itime))) return false;
+        if (!isvalid(vertex(q.v[1],itime))) return false;
+        if (!isvalid(vertex(q.v[2],itime))) return false;
+        if (!isvalid(vertex(q.v[3],itime))) return false;
+      }
+
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th quad from its bounds at time steps itime and itime+1 */
+    __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const {
+      return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1));
+    }
+
+    /*! checks the i'th primitive at all numTimeSteps time steps; if valid, writes its first-time-step bounds to *bbox (bbox may be nullptr) */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+    {
+      const Quad& q = quad(i);
+      if (q.v[0] >= numVertices()) return false;
+      if (q.v[1] >= numVertices()) return false;
+      if (q.v[2] >= numVertices()) return false;
+      if (q.v[3] >= numVertices()) return false;
+
+      for (unsigned int t=0; t<numTimeSteps; t++)
+      {
+        const Vec3fa v0 = vertex(q.v[0],t);
+        const Vec3fa v1 = vertex(q.v[1],t);
+        const Vec3fa v2 = vertex(q.v[2],t);
+        const Vec3fa v3 = vertex(q.v[3],t);
+
+        if (unlikely(!isvalid(v0) || !isvalid(v1) || !isvalid(v2) || !isvalid(v3)))
+          return false;
+      }
+
+      if (bbox) 
+        *bbox = bounds(i);
+
+      return true;
+    }
+
+    /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid at both itime and itime+1 */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      const Quad& q = quad(i);
+      if (unlikely(q.v[0] >= numVertices())) return false;
+      if (unlikely(q.v[1] >= numVertices())) return false;
+      if (unlikely(q.v[2] >= numVertices())) return false;
+      if (unlikely(q.v[3] >= numVertices())) return false;
+
+      assert(itime+1 < numTimeSteps);
+      const Vec3fa a0 = vertex(q.v[0],itime+0); if (unlikely(!isvalid(a0))) return false;
+      const Vec3fa a1 = vertex(q.v[1],itime+0); if (unlikely(!isvalid(a1))) return false;
+      const Vec3fa a2 = vertex(q.v[2],itime+0); if (unlikely(!isvalid(a2))) return false;
+      const Vec3fa a3 = vertex(q.v[3],itime+0); if (unlikely(!isvalid(a3))) return false;
+      const Vec3fa b0 = vertex(q.v[0],itime+1); if (unlikely(!isvalid(b0))) return false;
+      const Vec3fa b1 = vertex(q.v[1],itime+1); if (unlikely(!isvalid(b1))) return false;
+      const Vec3fa b2 = vertex(q.v[2],itime+1); if (unlikely(!isvalid(b2))) return false;
+      const Vec3fa b3 = vertex(q.v[3],itime+1); if (unlikely(!isvalid(b3))) return false;
+      
+      /* use bounds of first time step in builder */
+      bbox = BBox3fa(min(a0,a1,a2,a3),max(a0,a1,a2,a3));
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range */
+    __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range, if it's valid there */
+    __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const
+    {
+      if (!valid(i, timeSegmentRange(dt))) return false;
+      bbox = linearBounds(i, dt);
+      return true;
+    }
+
+    /*! get fast access to first vertex buffer */
+    __forceinline float * getCompactVertexArray () const {
+      return (float*) vertices0.getPtr();
+    }
+
+    /* gets version info of topology (the modCounter of the quads buffer) */
+    unsigned int getTopologyVersion() const {
+      return quads.modCounter;
+    }
+    
+    /* returns true if topology changed, i.e. quads.isModified(otherVersion) */
+    bool topologyChanged(unsigned int otherVersion) const {
+      return quads.isModified(otherVersion); // || numPrimitivesChanged;
+    }
+
+    /* returns the projected area of the i'th quad (first time step) as the sum over the triangles (v0,v1,v3) and (v1,v2,v3) */
+    __forceinline float projectedPrimitiveArea(const size_t i) const {
+      const Quad& q = quad(i);
+      const Vec3fa v0 = vertex(q.v[0]);
+      const Vec3fa v1 = vertex(q.v[1]);
+      const Vec3fa v2 = vertex(q.v[2]);
+      const Vec3fa v3 = vertex(q.v[3]);
+      return areaProjectedTriangle(v0,v1,v3) +
+	areaProjectedTriangle(v1,v2,v3);
+    }
+
+  public:
+    BufferView<Quad> quads;                 //!< array of quads
+    BufferView<Vec3fa> vertices0;           //!< fast access to first vertex buffer
+    vector<BufferView<Vec3fa>> vertices;    //!< vertex array for each timestep
+    vector<BufferView<char>> vertexAttribs; //!< vertex attribute buffers
+  };
+
+  namespace isa
+  {
+    struct QuadMeshISA : public QuadMesh  // QuadMesh plus PrimRef array creation
+    {
+      QuadMeshISA (Device* device)
+        : QuadMesh(device) {}
+
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);  // accumulates every PrimRef written below
+        for (size_t primID=r.begin(); primID<r.end(); ++primID)
+        {
+          BBox3fa box = empty;
+          if (!buildBounds(primID,&box)) continue;  // primitives without build bounds are skipped
+          const PrimRef ref(box,geomID,unsigned(primID));
+          info.add_center2(ref);
+          prims[k++] = ref;  // output is packed densely starting at slot k
+        }
+        return info;
+      }
+
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);  // accumulates every PrimRef written below
+        for (size_t primID=r.begin(); primID<r.end(); ++primID)
+        {
+          BBox3fa box = empty;
+          if (!buildBounds(primID,itime,box)) continue;  // primitives without build bounds at itime are skipped
+          const PrimRef ref(box,geomID,unsigned(primID));
+          info.add_center2(ref);
+          prims[k++] = ref;
+        }
+        return info;
+      }
+      
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfoMB info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); ++primID)
+        {
+          if (!valid(primID, timeSegmentRange(t0t1))) continue;  // must be valid over the whole time segment range of t0t1
+          const PrimRefMB ref(linearBounds(primID,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(primID));
+          info.add_primref(ref);
+          prims[k++] = ref;
+        }
+        return info;
+      }
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(QuadMesh*, createQuadMesh, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h
new file mode 100644
index 0000000000..d0246009db
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h
@@ -0,0 +1,326 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+#include "../subdiv/half_edge.h"
+#include "../subdiv/tessellation_cache.h"
+#include "../subdiv/catmullclark_coefficients.h"
+#include "../subdiv/patch.h"
+#include "../../common/algorithms/parallel_map.h"
+#include "../../common/algorithms/parallel_set.h"
+
+namespace embree
+{
+  class SubdivMesh : public Geometry
+  {
+    ALIGNED_CLASS_(16);
+  public:
+
+    typedef HalfEdge::Edge Edge;
+    
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_SUBDIV_MESH;
+
+    /*! (key, half edge) pair; ordered and convertible by key so half edges can be radix sorted */
+    struct KeyHalfEdge 
+    {
+      KeyHalfEdge() {}  // leaves key and edge unset
+      
+      KeyHalfEdge (uint64_t sortKey, HalfEdge* halfEdge) 
+      : key(sortKey), edge(halfEdge) {}
+      
+      __forceinline operator uint64_t() const {  // conversion yields the sort key
+        return key;
+      }
+
+      friend __forceinline bool operator<(const KeyHalfEdge& lhs, const KeyHalfEdge& rhs) {
+        return rhs.key > lhs.key;  // ordering looks at the key only
+      }
+      
+    public:
+      uint64_t key;
+      HalfEdge* edge;
+    };
+
+  public:
+
+    /*! subdiv mesh construction */
+    SubdivMesh(Device* device);
+
+  public:
+    void setMask (unsigned mask);
+    void setSubdivisionMode (unsigned int topologyID, RTCSubdivisionMode mode);
+    void setVertexAttributeTopology(unsigned int vertexAttribID, unsigned int topologyID);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setTopologyCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void setTessellationRate(float N);
+    bool verify();
+    void commit();
+    void addElementsToCount (GeometryCounts & counts) const;
+    void setDisplacementFunction (RTCDisplacementFunctionN func);
+    unsigned int getFirstHalfEdge(unsigned int faceID);
+    unsigned int getFace(unsigned int edgeID);
+    unsigned int getNextHalfEdge(unsigned int edgeID);
+    unsigned int getPreviousHalfEdge(unsigned int edgeID);
+    unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID);
+
+  public:
+
+    /*! return the number of faces (number of entries in faceVertices) */
+    size_t numFaces() const { 
+      return faceVertices.size(); 
+    }
+
+    /*! return the number of edges (number of entries in the vertex index buffer of topology 0) */
+    size_t numEdges() const { 
+      return topology[0].vertexIndices.size(); 
+    }
+
+    /*! return the number of vertices (size of the vertex buffer at index 0) */
+    size_t numVertices() const { 
+      return vertices[0].size(); 
+    }
+
+    /*! calculates the bounds of the i'th subdivision patch at the j'th timestep, starting from the face's first half edge in topology 0 */
+    __forceinline BBox3fa bounds(size_t i, size_t j = 0) const {
+      return topology[0].getHalfEdge(i)->bounds(vertices[j]);
+    }
+
+    /*! check if the i'th primitive is valid: topology 0 must accept it and invalidFace(i) must be false */
+    __forceinline bool valid(size_t i) const {
+      return topology[0].valid(i) && !invalidFace(i);
+    }
+
+    /*! check if the i'th primitive is valid for the j'th time range: topology 0 must accept it and invalidFace(i,j) must be false */
+    __forceinline bool valid(size_t i, size_t j) const {
+      return topology[0].valid(i) && !invalidFace(i,j);
+    }
+
+    /*! prints some statistics */
+    void printStatistics();
+
+    /*! initializes the half edge data structure */
+    void initializeHalfEdgeStructures ();
+ 
+  public:
+
+    /*! returns the vertex buffer for time step t (defaults to 0); t is not range-checked */
+    __forceinline const BufferView<Vec3fa>& getVertexBuffer( const size_t t = 0 ) const {
+      return vertices[t];
+    }
+
+    /* returns tessellation level of edge i clamped to [1,4096]: levels[i] when levels tests true, tessellationRate otherwise */
+    __forceinline float getEdgeLevel(const size_t i) const
+    {
+      const float level = levels ? levels[i] : tessellationRate;
+      return clamp(level,1.0f,4096.0f); // FIXME: do we want to limit edge level?
+    }
+
+  public:
+    RTCDisplacementFunctionN displFunc;    //!< displacement function
+
+    /*! all buffers in this section are provided by the application */
+  public:
+    
+    /*! the topology contains all data that may differ when
+     *  interpolating different user data buffers */
+    struct Topology
+    {
+    public:
+
+      /*! Default topology construction: only halfEdges is initialized explicitly; mesh and subdiv_mode get no initializer */
+      Topology () : halfEdges(nullptr,0) {}
+
+      /*! Topology initialization */
+      Topology (SubdivMesh* mesh);
+
+      /*! make the class movable (every member is moved one by one) */
+    public: 
+      Topology (Topology&& other) // FIXME: this is only required to workaround compilation issues under Windows
+        : mesh(std::move(other.mesh)), 
+          vertexIndices(std::move(other.vertexIndices)),
+          subdiv_mode(std::move(other.subdiv_mode)),
+          halfEdges(std::move(other.halfEdges)),
+          halfEdges0(std::move(other.halfEdges0)),
+          halfEdges1(std::move(other.halfEdges1)) {}
+      
+      Topology& operator= (Topology&& other) // FIXME: this is only required to workaround compilation issues under Windows
+      {
+        mesh = std::move(other.mesh); 
+        vertexIndices = std::move(other.vertexIndices);
+        subdiv_mode = std::move(other.subdiv_mode);
+        halfEdges = std::move(other.halfEdges);
+        halfEdges0 = std::move(other.halfEdges0);
+        halfEdges1 = std::move(other.halfEdges1);
+        return *this;
+      }
+
+    public:
+      /*! check if the i'th primitive is valid in this topology: only faces with a border are rejected, and only in RTC_SUBDIVISION_MODE_NO_BOUNDARY */
+      __forceinline bool valid(size_t i) const 
+      {
+        if (unlikely(subdiv_mode == RTC_SUBDIVISION_MODE_NO_BOUNDARY)) {
+          if (getHalfEdge(i)->faceHasBorder()) return false;
+        }
+        return true;
+      }
+      
+      /*! updates the interpolation mode for the topology */
+      void setSubdivisionMode (RTCSubdivisionMode mode);
+
+      /*! marks all buffers as modified */
+      void update ();
+
+      /*! verifies index array */
+      bool verify (size_t numVertices);
+
+      /*! initializes the half edge data structure */
+      void initializeHalfEdgeStructures ();
+
+    private:
+      
+      /*! recalculates the half edges */
+      void calculateHalfEdges();
+      
+      /*! updates half edges when recalculation is not necessary */
+      void updateHalfEdges();
+      
+      /*! user input data */
+    public:
+
+      /*! mesh this topology refers to; getHalfEdge() reads mesh->faceStartEdge through it */
+      SubdivMesh* mesh;
+
+      /*! indices of the vertices composing each face */
+      BufferView<unsigned int> vertexIndices;
+      
+      /*! subdiv interpolation mode */
+      RTCSubdivisionMode subdiv_mode;
+
+      /*! generated data */
+    public:
+
+      /*! returns the start half edge for face f (looked up through mesh->faceStartEdge[f]) */
+      __forceinline const HalfEdge* getHalfEdge ( const size_t f ) const { 
+        return &halfEdges[mesh->faceStartEdge[f]]; 
+      }
+
+      /*! Half edge structure, generated by initHalfEdgeStructures */
+      mvector<HalfEdge> halfEdges;
+
+      /*! the following data is only required during construction of the
+       *  half edge structure and can be cleared for static scenes */
+    private:
+      
+      /*! two arrays used to sort the half edges */
+      std::vector<KeyHalfEdge> halfEdges0;
+      std::vector<KeyHalfEdge> halfEdges1;
+    };
+
+    /*! returns the start half edge of face f in topology t (forwards to Topology::getHalfEdge) */
+    __forceinline const HalfEdge* getHalfEdge ( const size_t t , const size_t f ) const { 
+      return topology[t].getHalfEdge(f);
+    }
+
+    /*! buffer containing the number of vertices for each face */
+    BufferView<unsigned int> faceVertices;
+
+    /*! array of topologies */
+    vector<Topology> topology;
+
+    /*! vertex buffer (one buffer for each time step) */
+    vector<BufferView<Vec3fa>> vertices;
+
+    /*! user data buffers */
+    vector<RawBufferView> vertexAttribs;
+
+    /*! edge crease buffer containing edges (pairs of vertices) that carry edge crease weights */
+    BufferView<Edge> edge_creases;
+    
+    /*! edge crease weights for each edge of the edge_creases buffer */
+    BufferView<float> edge_crease_weights;
+    
+    /*! vertex crease buffer containing all vertices that carry vertex crease weights */
+    BufferView<unsigned int> vertex_creases;
+    
+    /*! vertex crease weights for each vertex of the vertex_creases buffer */
+    BufferView<float> vertex_crease_weights;
+
+    /*! subdivision level for each half edge of the vertexIndices buffer */
+    BufferView<float> levels;
+    float tessellationRate;  // constant rate that is used when levels is not set
+
+    /*! buffer that marks specific faces as holes */
+    BufferView<unsigned> holes;
+
+    /*! all data in this section is generated by initializeHalfEdgeStructures function */
+  private:
+
+    /*! number of half edges used by faces */
+    size_t numHalfEdges; 
+
+    /*! fast lookup table to find the first half edge for some face */
+    mvector<uint32_t> faceStartEdge;
+
+    /*! fast lookup table to find the face for some half edge */
+    mvector<uint32_t> halfEdgeFace;
+
+    /*! set with all holes */
+    parallel_set<uint32_t> holeSet;
+
+    /*! fast lookup table to detect invalid faces */
+    mvector<int8_t> invalid_face;
+
+    /*! returns the invalid flag of face i at timestep j (stored at index i*numTimeSteps+j of invalid_face) */
+    __forceinline       int8_t& invalidFace(size_t i, size_t j = 0)       { return invalid_face[i*numTimeSteps+j]; }
+    __forceinline const int8_t& invalidFace(size_t i, size_t j = 0) const { return invalid_face[i*numTimeSteps+j]; }
+
+    /*! interpolation cache */
+  public:
+    static __forceinline size_t numInterpolationSlots4(size_t stride) { return (stride+15)/16; } // stride divided by 16, rounded up
+    static __forceinline size_t numInterpolationSlots8(size_t stride) { return (stride+31)/32; } // stride divided by 32, rounded up
+    static __forceinline size_t interpolationSlot(size_t prim, size_t slot, size_t stride) {
+      const size_t slotsPerPrim = numInterpolationSlots4(stride); // every primitive owns this many consecutive slots
+      assert(slot < slotsPerPrim); // slot has to address one of the slots of this primitive
+      return prim*slotsPerPrim + slot;
+    }
+    std::vector<std::vector<SharedLazyTessellationCache::CacheEntry>> vertex_buffer_tags;
+    std::vector<std::vector<SharedLazyTessellationCache::CacheEntry>> vertex_attrib_buffer_tags;
+    std::vector<Patch3fa::Ref> patch_eval_trees;
+    
+    /*! the following data is only required during construction of the
+     *  half edge structure and can be cleared for static scenes */
+  private:
+
+    /*! map with all vertex creases */
+    parallel_map<uint32_t,float> vertexCreaseMap;
+    
+    /*! map with all edge creases */
+    parallel_map<uint64_t,float> edgeCreaseMap;
+
+  protected:
+    
+    /*! counts number of geometry commits */
+    size_t commitCounter;
+  };
+
+  namespace isa
+  {
+    struct SubdivMeshISA : public SubdivMesh // subclass in the isa namespace that declares the interpolation entry points
+    {
+      SubdivMeshISA (Device* device)
+        : SubdivMesh(device) {}
+
+      void interpolate(const RTCInterpolateArguments* const args);   // defined outside this header
+      void interpolateN(const RTCInterpolateNArguments* const args); // defined outside this header
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(SubdivMesh*, createSubdivMesh, Device*);
+};
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp
new file mode 100644
index 0000000000..d1c2750f14
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp
@@ -0,0 +1,243 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "scene_triangle_mesh.h"
+#include "scene.h"
+
+namespace embree
+{
+#if defined(EMBREE_LOWEST_ISA)
+
+  TriangleMesh::TriangleMesh (Device* device)
+    : Geometry(device,GTY_TRIANGLE_MESH,0,1) // NOTE(review): the trailing 0,1 are presumably primitive count and time step count -- confirm against Geometry
+  {
+    vertices.resize(numTimeSteps); // one vertex buffer view per time step
+  }
+
+  void TriangleMesh::setMask (unsigned mask) 
+  {
+    this->mask = mask; // store the new mask on this geometry
+    Geometry::update(); // NOTE(review): presumably flags the geometry as changed -- confirm in Geometry
+  }
+
+  void TriangleMesh::setNumTimeSteps (unsigned int numTimeSteps)
+  {
+    vertices.resize(numTimeSteps); // keep one vertex buffer view per time step
+    Geometry::setNumTimeSteps(numTimeSteps); // then forward the new count to the base class
+  }
+
+  void TriangleMesh::setVertexAttributeCount (unsigned int N)
+  {
+    vertexAttribs.resize(N); // provide N vertex attribute buffer slots
+    Geometry::update(); // NOTE(review): presumably flags the geometry as changed -- confirm in Geometry
+  }
+  
+  void TriangleMesh::setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num)
+  { // binds a vertex, vertex attribute or index buffer to this mesh after validating format, slot and layout
+    /* verify that all accesses are 4 bytes aligned (both the start address and the stride) */
+    if (((size_t(buffer->getPtr()) + offset) & 0x3) || (stride & 0x3)) 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION, "data must be 4 bytes aligned");
+
+    if (type == RTC_BUFFER_TYPE_VERTEX)
+    {
+      if (format != RTC_FORMAT_FLOAT3)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex buffer format");
+
+      /* if buffer is larger than 16GB the premultiplied index optimization does not work */
+      if (stride*num > 16ll*1024ll*1024ll*1024ll)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "vertex buffer can be at most 16GB large");
+
+      if (slot >= vertices.size()) // one slot per time step
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid vertex buffer slot");
+
+      vertices[slot].set(buffer, offset, stride, num, format);
+      vertices[slot].checkPadding16();
+      vertices0 = vertices[0]; // refresh the fast-access copy of the first time step's view
+    }
+    else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE)
+    {
+      if (format < RTC_FORMAT_FLOAT || format > RTC_FORMAT_FLOAT16)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer format");
+
+      if (slot >= vertexAttribs.size()) // NOTE(review): the other slot checks report RTC_ERROR_INVALID_ARGUMENT
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer slot");
+      
+      vertexAttribs[slot].set(buffer, offset, stride, num, format);
+      vertexAttribs[slot].checkPadding16();
+    }
+    else if (type == RTC_BUFFER_TYPE_INDEX)
+    {
+      if (slot != 0) // only index buffer slot 0 is accepted
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      if (format != RTC_FORMAT_UINT3)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid index buffer format");
+
+      triangles.set(buffer, offset, stride, num, format);
+      setNumPrimitives(num); // the index buffer element count becomes the primitive count
+    }
+    else 
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type");
+  }
+
+  void* TriangleMesh::getBuffer(RTCBufferType type, unsigned int slot)
+  {
+    /* returns the base pointer of the buffer bound to (type, slot) */
+    switch (type)
+    {
+    case RTC_BUFFER_TYPE_INDEX:
+      /* only index buffer slot 0 exists */
+      if (slot != 0)
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      return triangles.getPtr();
+
+    case RTC_BUFFER_TYPE_VERTEX:
+      if (slot >= vertices.size())
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      return vertices[slot].getPtr();
+
+    case RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE:
+      if (slot >= vertexAttribs.size())
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      return vertexAttribs[slot].getPtr();
+
+    default:
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type");
+      return nullptr;
+    }
+  }
+
+  void TriangleMesh::updateBuffer(RTCBufferType type, unsigned int slot)
+  {
+    /* flags the buffer bound to (type, slot) as modified */
+    switch (type)
+    {
+    case RTC_BUFFER_TYPE_INDEX:
+      if (slot != 0)
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      triangles.setModified();
+      break;
+    case RTC_BUFFER_TYPE_VERTEX:
+      if (slot >= vertices.size())
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      vertices[slot].setModified();
+      break;
+    case RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE:
+      if (slot >= vertexAttribs.size())
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      vertexAttribs[slot].setModified();
+      break;
+    default:
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type");
+    }
+
+    /* reached after every successful buffer update */
+    Geometry::update();
+  }
+
+  void TriangleMesh::commit() 
+  {
+    /* every later time step has to use the vertex stride of the first time step */
+    for (unsigned int itime=1; itime<numTimeSteps; itime++) {
+      if (vertices[itime].getStride() != vertices[0].getStride())
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"stride of vertex buffers have to be identical for each time step");
+    }
+    Geometry::commit();
+  }
+
+  void TriangleMesh::addElementsToCount (GeometryCounts & counts) const 
+  {
+    const bool singleTimeStep = (numTimeSteps == 1); // exactly one time step counts as plain triangles, anything else as MB triangles
+    (singleTimeStep ? counts.numTriangles : counts.numMBTriangles) += numPrimitives;
+  }
+
+  bool TriangleMesh::verify() 
+  {
+    /*! at least one vertex buffer is required and all of them must have the same size */
+    if (vertices.size() == 0) return false;
+    const size_t vertexCount = numVertices();
+    for (const auto& vbuf : vertices)
+      if (vbuf.size() != vertexCount)
+        return false;
+
+    /*! user vertex attribute buffers must match the vertex count as well */
+    for (const auto& abuf : vertexAttribs)
+      if (abuf.size() != vertexCount)
+        return false;
+
+    /*! every triangle index must reference an existing vertex */
+    for (size_t i=0; i<size(); i++) {
+      const Triangle& tri = triangles[i];
+      for (size_t k=0; k<3; k++)
+        if (tri.v[k] >= vertexCount) return false;
+    }
+
+    /*! every vertex of every time step must pass isvalid */
+    for (const auto& vbuf : vertices)
+      for (size_t i=0; i<vbuf.size(); i++)
+        if (!isvalid(vbuf[i])) return false;
+
+    return true;
+  }
+  
+  void TriangleMesh::interpolate(const RTCInterpolateArguments* const args) // barycentric interpolation of one vertex or vertex attribute buffer
+  {
+    unsigned int primID = args->primID;
+    float u = args->u;
+    float v = args->v;
+    RTCBufferType bufferType = args->bufferType;
+    unsigned int bufferSlot = args->bufferSlot;
+    float* P = args->P;
+    float* dPdu = args->dPdu;
+    float* dPdv = args->dPdv;
+    float* ddPdudu = args->ddPdudu;
+    float* ddPdvdv = args->ddPdvdv;
+    float* ddPdudv = args->ddPdudv;
+    unsigned int valueCount = args->valueCount;
+
+    /* calculate base pointer and stride; the slot is used as an index, so it must be strictly below the buffer count */
+    assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) ||
+           (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot < vertexAttribs.size()));
+    const char* src = nullptr; 
+    size_t stride = 0;
+    if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) {
+      src    = vertexAttribs[bufferSlot].getPtr();
+      stride = vertexAttribs[bufferSlot].getStride();
+    } else {
+      src    = vertices[bufferSlot].getPtr();
+      stride = vertices[bufferSlot].getStride();
+    }
+    
+    for (unsigned int i=0; i<valueCount; i+=4) // process the values in groups of four floats
+    {
+      size_t ofs = i*sizeof(float);
+      const float w = 1.0f-u-v; // weight of vertex 0; u and v weight vertices 1 and 2
+      const Triangle& tri = triangle(primID);
+      const vbool4 valid = vint4((int)i)+vint4(step) < vint4(int(valueCount)); // masks off lanes at or beyond valueCount in the last group
+      const vfloat4 p0 = vfloat4::loadu(valid,(float*)&src[tri.v[0]*stride+ofs]);
+      const vfloat4 p1 = vfloat4::loadu(valid,(float*)&src[tri.v[1]*stride+ofs]);
+      const vfloat4 p2 = vfloat4::loadu(valid,(float*)&src[tri.v[2]*stride+ofs]);
+      
+      if (P) {
+        vfloat4::storeu(valid,P+i,madd(w,p0,madd(u,p1,v*p2)));
+      }
+      if (dPdu) { // first derivatives are the edge vectors; dPdv is required whenever dPdu is given
+        assert(dPdu); vfloat4::storeu(valid,dPdu+i,p1-p0);
+        assert(dPdv); vfloat4::storeu(valid,dPdv+i,p2-p0);
+      }
+      if (ddPdudu) { // second derivatives of the linear interpolation are zero; all three outputs are required together
+        assert(ddPdudu); vfloat4::storeu(valid,ddPdudu+i,vfloat4(zero));
+        assert(ddPdvdv); vfloat4::storeu(valid,ddPdvdv+i,vfloat4(zero));
+        assert(ddPdudv); vfloat4::storeu(valid,ddPdudv+i,vfloat4(zero));
+      }
+    }
+  }
+  
+#endif
+  
+  namespace isa
+  {
+    TriangleMesh* createTriangleMesh(Device* device) { // factory returning a TriangleMeshISA through the TriangleMesh base pointer
+      return new TriangleMeshISA(device); // NOTE(review): raw new; the owner of the returned object is not visible here -- confirm
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.h
new file mode 100644
index 0000000000..eaf2e1799a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.h
@@ -0,0 +1,264 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+  /*! Triangle Mesh: index buffer plus one vertex buffer per time step and optional vertex attribute buffers */
+  struct TriangleMesh : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_TRIANGLE_MESH;
+
+    /*! triangle indices */
+    struct Triangle 
+    {
+      uint32_t v[3];
+
+      /*! outputs triangle indices */
+      __forceinline friend embree_ostream operator<<(embree_ostream cout, const Triangle& t) {
+        return cout << "Triangle { " << t.v[0] << ", " << t.v[1] << ", " << t.v[2] << " }";
+      }
+    };
+
+  public:
+
+    /*! triangle mesh construction */
+    TriangleMesh (Device* device); 
+
+    /* geometry interface */
+  public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void interpolate(const RTCInterpolateArguments* const args);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+  public:
+
+    /*! returns number of vertices (size of the first time step's vertex buffer) */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+    
+    /*! returns i'th triangle*/
+    __forceinline const Triangle& triangle(size_t i) const {
+      return triangles[i];
+    }
+
+    /*! returns i'th vertex of the first time step */
+    __forceinline const Vec3fa vertex(size_t i) const {
+      return vertices0[i];
+    }
+
+    /*! returns pointer to the i'th vertex of the first time step */
+    __forceinline const char* vertexPtr(size_t i) const {
+      return vertices0.getPtr(i);
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline const Vec3fa vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns pointer to the i'th vertex of itime'th timestep */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+      return vertices[itime].getPtr(i);
+    }
+
+    /*! calculates the bounds of the i'th triangle (first time step) */
+    __forceinline BBox3fa bounds(size_t i) const 
+    {
+      const Triangle& tri = triangle(i);
+      const Vec3fa v0 = vertex(tri.v[0]);
+      const Vec3fa v1 = vertex(tri.v[1]);
+      const Vec3fa v2 = vertex(tri.v[2]);
+      return BBox3fa(min(v0,v1,v2),max(v0,v1,v2));
+    }
+
+    /*! calculates the bounds of the i'th triangle at the itime'th timestep */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const
+    {
+      const Triangle& tri = triangle(i);
+      const Vec3fa v0 = vertex(tri.v[0],itime);
+      const Vec3fa v1 = vertex(tri.v[1],itime);
+      const Vec3fa v2 = vertex(tri.v[2],itime);
+      return BBox3fa(min(v0,v1,v2),max(v0,v1,v2));
+    }
+
+    /*! check if the i'th primitive is valid at the itime'th timestep */
+    __forceinline bool valid(size_t i, size_t itime) const {
+      return valid(i, make_range(itime, itime));
+    }
+
+    /*! check if the i'th primitive is valid for all time steps of the specified range (both ends inclusive) */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      const Triangle& tri = triangle(i);
+      if (unlikely(tri.v[0] >= numVertices())) return false;
+      if (unlikely(tri.v[1] >= numVertices())) return false;
+      if (unlikely(tri.v[2] >= numVertices())) return false;
+
+      for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+      {
+        if (!isvalid(vertex(tri.v[0],itime))) return false;
+        if (!isvalid(vertex(tri.v[1],itime))) return false;
+        if (!isvalid(vertex(tri.v[2],itime))) return false;
+      }
+
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th primitive at the itime'th time segment (time steps itime and itime+1) */
+    __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const {
+      return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1));
+    }
+
+    /*! calculates the build bounds of the i'th primitive, if it's valid in all time steps; the bounds are those of the first time step */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+    {
+      const Triangle& tri = triangle(i);
+      if (unlikely(tri.v[0] >= numVertices())) return false;
+      if (unlikely(tri.v[1] >= numVertices())) return false;
+      if (unlikely(tri.v[2] >= numVertices())) return false;
+
+      for (size_t t=0; t<numTimeSteps; t++)
+      {
+        const Vec3fa v0 = vertex(tri.v[0],t);
+        const Vec3fa v1 = vertex(tri.v[1],t);
+        const Vec3fa v2 = vertex(tri.v[2],t);
+        if (unlikely(!isvalid(v0) || !isvalid(v1) || !isvalid(v2)))
+          return false;
+      }
+
+      if (likely(bbox)) 
+        *bbox = bounds(i);
+
+      return true;
+    }
+
+    /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid at time steps itime and itime+1 */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      const Triangle& tri = triangle(i);
+      if (unlikely(tri.v[0] >= numVertices())) return false;
+      if (unlikely(tri.v[1] >= numVertices())) return false;
+      if (unlikely(tri.v[2] >= numVertices())) return false;
+
+      assert(itime+1 < numTimeSteps);
+      const Vec3fa a0 = vertex(tri.v[0],itime+0); if (unlikely(!isvalid(a0))) return false;
+      const Vec3fa a1 = vertex(tri.v[1],itime+0); if (unlikely(!isvalid(a1))) return false;
+      const Vec3fa a2 = vertex(tri.v[2],itime+0); if (unlikely(!isvalid(a2))) return false;
+      const Vec3fa b0 = vertex(tri.v[0],itime+1); if (unlikely(!isvalid(b0))) return false;
+      const Vec3fa b1 = vertex(tri.v[1],itime+1); if (unlikely(!isvalid(b1))) return false;
+      const Vec3fa b2 = vertex(tri.v[2],itime+1); if (unlikely(!isvalid(b2))) return false;
+      
+      /* use bounds of first time step in builder */
+      bbox = BBox3fa(min(a0,a1,a2),max(a0,a1,a2));
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range */
+    __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! writes the linear bounds of the i'th primitive for the specified time range to bbox; returns false (bbox untouched) if the primitive is invalid in that range */
+    __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const  {
+      if (!valid(i, timeSegmentRange(dt))) return false;
+      bbox = linearBounds(i, dt);
+      return true;
+    }
+
+    /*! get fast access to first vertex buffer */
+    __forceinline float * getCompactVertexArray () const {
+      return (float*) vertices0.getPtr();
+    }
+
+    /* gets version info of topology (modification counter of the index buffer) */
+    unsigned int getTopologyVersion() const {
+      return triangles.modCounter;
+    }
+    
+    /* returns true if topology changed */
+    bool topologyChanged(unsigned int otherVersion) const {
+      return triangles.isModified(otherVersion); // || numPrimitivesChanged;
+    }
+
+    /* returns the projected area of the i'th triangle (first time step) */
+    __forceinline float projectedPrimitiveArea(const size_t i) const {
+      const Triangle& tri = triangle(i);
+      const Vec3fa v0 = vertex(tri.v[0]);
+      const Vec3fa v1 = vertex(tri.v[1]);
+      const Vec3fa v2 = vertex(tri.v[2]);      
+      return areaProjectedTriangle(v0,v1,v2);
+    }
+
+  public:
+    BufferView<Triangle> triangles;      //!< array of triangles
+    BufferView<Vec3fa> vertices0;        //!< fast access to first vertex buffer
+    vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep
+    vector<RawBufferView> vertexAttribs; //!< vertex attributes
+  };
+
+  namespace isa
+  {
+    struct TriangleMeshISA : public TriangleMesh
+    {
+      TriangleMeshISA (Device* device)
+        : TriangleMesh(device) {}
+
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const // writes a PrimRef for each valid primitive of r to prims, starting at index k
+      {
+        PrimInfo info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); primID++)
+        {
+          BBox3fa bbox = empty;
+          if (!buildBounds(primID,&bbox)) continue; // primitives without build bounds are left out
+          const PrimRef ref(bbox,geomID,unsigned(primID));
+          info.add_center2(ref);
+          prims[k++] = ref;
+        }
+        return info;
+      }
+
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const // same as above, with build bounds taken at time segment itime
+      {
+        PrimInfo info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); primID++)
+        {
+          BBox3fa bbox = empty;
+          if (!buildBounds(primID,itime,bbox)) continue; // primitives without build bounds are left out
+          const PrimRef ref(bbox,geomID,unsigned(primID));
+          info.add_center2(ref);
+          prims[k++] = ref;
+        }
+        return info;
+      }
+      
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const // writes a PrimRefMB with linear bounds over t0t1 for each valid primitive of r
+      {
+        PrimInfoMB info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); primID++)
+        {
+          if (!valid(primID, timeSegmentRange(t0t1))) continue; // must be valid over the whole time segment range
+          const PrimRefMB ref(linearBounds(primID,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(primID));
+          info.add_primref(ref);
+          prims[k++] = ref;
+        }
+        return info;
+      }
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(TriangleMesh*, createTriangleMesh, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h b/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h
new file mode 100644
index 0000000000..8d11ed6986
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h
@@ -0,0 +1,77 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "accelset.h"
+
+namespace embree
+{
+  /*! User geometry with user defined bounds, intersection and occlusion functions */
+  struct UserGeometry : public AccelSet
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_USER_GEOMETRY;
+
+  public:
+    UserGeometry (Device* device, unsigned int items = 0, unsigned int numTimeSteps = 1);
+    virtual void setMask (unsigned mask);
+    virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr);
+    virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect);
+    virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded);
+    virtual void build() {} // intentionally empty in this class
+    virtual void addElementsToCount (GeometryCounts & counts) const;
+  };
+
+  namespace isa
+  {
+    struct UserGeometryISA : public UserGeometry
+    {
+      UserGeometryISA (Device* device)
+        : UserGeometry(device) {}
+
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const // writes a PrimRef for each valid item of r to prims, starting at index k
+      {
+        PrimInfo info(empty);
+        for (size_t itemID=r.begin(); itemID<r.end(); itemID++)
+        {
+          BBox3fa bbox = empty;
+          if (!buildBounds(itemID,&bbox)) continue; // items without build bounds are left out
+          const PrimRef ref(bbox,geomID,unsigned(itemID));
+          info.add_center2(ref);
+          prims[k++] = ref;
+        }
+        return info;
+      }
+
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const // same as above, with build bounds taken at time segment itime
+      {
+        PrimInfo info(empty);
+        for (size_t itemID=r.begin(); itemID<r.end(); itemID++)
+        {
+          BBox3fa bbox = empty;
+          if (!buildBounds(itemID,itime,bbox)) continue; // items without build bounds are left out
+          const PrimRef ref(bbox,geomID,unsigned(itemID));
+          info.add_center2(ref);
+          prims[k++] = ref;
+        }
+        return info;
+      }
+      
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const // writes a PrimRefMB with linear bounds over t0t1 for each valid item of r
+      {
+        PrimInfoMB info(empty);
+        for (size_t itemID=r.begin(); itemID<r.end(); itemID++)
+        {
+          if (!valid(itemID, timeSegmentRange(t0t1))) continue; // must be valid over the whole time segment range
+          const PrimRefMB ref(linearBounds(itemID,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(itemID));
+          info.add_primref(ref);
+          prims[k++] = ref;
+        }
+        return info;
+      }
+    };
+  }
+  
+  DECLARE_ISA_FUNCTION(UserGeometry*, createUserGeometry, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/stack_item.h b/thirdparty/embree-aarch64/kernels/common/stack_item.h
new file mode 100644
index 0000000000..533c385365
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/stack_item.h
@@ -0,0 +1,125 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /*! An item on the stack holds the node ID (ptr) and distance (dist) of that node. */
+  template<typename T>
+  struct __aligned(16) StackItemT
+  {
+    /*! xchg and sort move a whole item with one 4-float vector access, so ptr has to leave room for dist */
+    static_assert(sizeof(T) <= 12, "sizeof(T) <= 12 failed");
+
+    __forceinline StackItemT() {}
+
+    __forceinline StackItemT(T &ptr, unsigned &dist) : ptr(ptr), dist(dist) {}
+
+    /*! swaps two stack items using one vector load and one vector store per item */
+    __forceinline static void xchg(StackItemT& a, StackItemT& b) 
+    { 
+      const vfloat4 sse_a = vfloat4::load((float*)&a); 
+      const vfloat4 sse_b = vfloat4::load((float*)&b);
+      vfloat4::store(&a,sse_b);
+      vfloat4::store(&b,sse_a);
+    }
+
+    /*! Sort 2 stack items by ascending dist. */
+    __forceinline friend void sort(StackItemT& s1, StackItemT& s2) {
+      if (s2.dist < s1.dist) xchg(s2,s1);
+    }
+    
+    /*! Sort 3 stack items by ascending dist. */
+    __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3)
+    {
+      if (s2.dist < s1.dist) xchg(s2,s1);
+      if (s3.dist < s2.dist) xchg(s3,s2);
+      if (s2.dist < s1.dist) xchg(s2,s1);
+    }
+    
+    /*! Sort 4 stack items by ascending dist (five compare-exchange steps). */
+    __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3, StackItemT& s4)
+    {
+      if (s2.dist < s1.dist) xchg(s2,s1);
+      if (s4.dist < s3.dist) xchg(s4,s3);
+      if (s3.dist < s1.dist) xchg(s3,s1);
+      if (s4.dist < s2.dist) xchg(s4,s2);
+      if (s3.dist < s2.dist) xchg(s3,s2);
+    }
+
+    /*! compare-exchange of two items held in vint4 registers; the decision is the b < a comparison of lane 2, broadcast to all lanes */
+    __forceinline static void cmp_xchg(vint4& a, vint4& b) 
+    { 
+#if defined(__AVX512VL__)
+      const vboolf4 mask(shuffle<2,2,2,2>(b) < shuffle<2,2,2,2>(a));
+#else
+      const vboolf4 mask0(b < a);
+      const vboolf4 mask(shuffle<2,2,2,2>(mask0));
+#endif
+      const vint4 c = select(mask,b,a);
+      const vint4 d = select(mask,a,b);
+      a = c;
+      b = d;
+    }
+    
+    /*! Sort 3 stack items held in vint4 registers using cmp_xchg. */
+    __forceinline static void sort3(vint4& s1, vint4& s2, vint4& s3)
+    {
+      cmp_xchg(s2,s1);
+      cmp_xchg(s3,s2);
+      cmp_xchg(s2,s1);
+    }
+    
+    /*! Sort 4 stack items held in vint4 registers using cmp_xchg. */
+    __forceinline static void sort4(vint4& s1, vint4& s2, vint4& s3, vint4& s4)
+    {
+      cmp_xchg(s2,s1);
+      cmp_xchg(s4,s3);
+      cmp_xchg(s3,s1);
+      cmp_xchg(s4,s2);
+      cmp_xchg(s3,s2);
+    }
+
+
+    /*! Sort N stack items in [begin,end) by descending dist (insertion sort); an empty range is a no-op. */
+    __forceinline friend void sort(StackItemT* begin, StackItemT* end)
+    {
+      for (StackItemT* i = begin+1; i < end; ++i) // '<' instead of '!=' so that begin == end terminates immediately
+      {
+        const vfloat4 item = vfloat4::load((float*)i);
+        const unsigned dist = i->dist;
+        StackItemT* j = i;
+
+        while ((j != begin) && ((j-1)->dist < dist)) // shift items with smaller dist one position towards the end
+        {
+          vfloat4::store(j, vfloat4::load((float*)(j-1)));
+          --j;
+        }
+
+        vfloat4::store(j, item);
+      }
+    }
+    
+  public:
+    T ptr;
+    unsigned dist;
+  };
+
+  /*! An item on the stack holds the node ID (ptr) and the active ray mask (mask). */
+  template<typename T>
+  struct __aligned(8) StackItemMaskT
+  {
+    T ptr;
+    size_t mask;
+  };
+
+  struct __aligned(8) StackItemMaskCoherent // NOTE(review): field meaning is set by the code using this struct, which is not visible here
+  {
+    size_t mask;
+    size_t parent;
+    size_t child;
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/stat.cpp b/thirdparty/embree-aarch64/kernels/common/stat.cpp
new file mode 100644
index 0000000000..b73c3a8c76
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/stat.cpp
@@ -0,0 +1,128 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "stat.h"
+
+namespace embree
+{
+  Stat Stat::instance; // definition of the single static instance returned by Stat::get()
+  
+  Stat::Stat () { // nothing to do here; Counters clears itself in its own constructor
+  }
+
+  Stat::~Stat () // dumps the counters to std::cout on destruction, but only in EMBREE_STAT_COUNTERS builds
+  {
+#ifdef EMBREE_STAT_COUNTERS
+    Stat::print(std::cout);
+#endif
+  }
+
+  void Stat::print(std::ostream& cout) // writes absolute, per-traversal and user counters of the static instance to the given stream
+  {
+    Counters& cntrs = instance.cntrs;
+    Counters::Data& data = instance.cntrs.code; // the absolute section below reports the "code" counters
+    //Counters::Data& data = instance.cntrs.active;
+
+    /* print absolute numbers */
+    cout << "--------- ABSOLUTE ---------" << std::endl;
+    cout << "  #normal_travs   = " << float(data.normal.travs            )*1E-6 << "M" << std::endl;
+    cout << "    #nodes        = " << float(data.normal.trav_nodes       )*1E-6 << "M" << std::endl;
+    cout << "    #nodes_xfm    = " << float(data.normal.trav_xfm_nodes   )*1E-6 << "M" << std::endl;
+    cout << "    #leaves       = " << float(data.normal.trav_leaves      )*1E-6 << "M" << std::endl;
+    cout << "    #prims        = " << float(data.normal.trav_prims       )*1E-6 << "M" << std::endl;
+    cout << "    #prim_hits    = " << float(data.normal.trav_prim_hits   )*1E-6 << "M" << std::endl;
+
+    cout << "    #stack nodes  = " << float(data.normal.trav_stack_nodes )*1E-6 << "M" << std::endl;
+    cout << "    #stack pop    = " << float(data.normal.trav_stack_pop )*1E-6 << "M" << std::endl;
+
+    size_t normal_box_hits = 0;
+    size_t weighted_box_hits = 0;
+    for (size_t i=0;i<SIZE_HISTOGRAM;i++) { // plain and index-weighted sums over the histogram
+      normal_box_hits += data.normal.trav_hit_boxes[i];
+      weighted_box_hits += data.normal.trav_hit_boxes[i]*i;
+    }
+    cout << "    #hit_boxes    = " << normal_box_hits << " (total) distribution: ";
+    float average = 0.0f;
+    for (size_t i=0;i<SIZE_HISTOGRAM;i++) 
+    {
+      float value = 100.0f * data.normal.trav_hit_boxes[i] / normal_box_hits; // NOTE(review): divides by zero when the histogram is empty
+      cout << "[" << i << "] " << value << " ";
+      average += (float)i*data.normal.trav_hit_boxes[i] / normal_box_hits;
+    }
+    cout << "    average = " << average << std::endl;
+    for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.normal.trav_hit_boxes[i]*i / weighted_box_hits << " ";
+    cout << std::endl;
+
+    if (data.shadow.travs) { // shadow statistics are printed only when shadow traversals were counted
+      cout << "  #shadow_travs = " << float(data.shadow.travs         )*1E-6 << "M" << std::endl;
+      cout << "    #nodes      = " << float(data.shadow.trav_nodes    )*1E-6 << "M" << std::endl;
+      cout << "    #nodes_xfm  = " << float(data.shadow.trav_xfm_nodes)*1E-6 << "M" << std::endl;
+      cout << "    #leaves     = " << float(data.shadow.trav_leaves   )*1E-6 << "M" << std::endl;
+      cout << "    #prims      = " << float(data.shadow.trav_prims    )*1E-6 << "M" << std::endl;
+      cout << "    #prim_hits  = " << float(data.shadow.trav_prim_hits)*1E-6 << "M" << std::endl;
+
+      cout << "    #stack nodes = " << float(data.shadow.trav_stack_nodes )*1E-6 << "M" << std::endl;
+      cout << "    #stack pop   = " << float(data.shadow.trav_stack_pop )*1E-6 << "M" << std::endl;
+
+      size_t shadow_box_hits = 0;
+      size_t weighted_shadow_box_hits = 0;
+
+      for (size_t i=0;i<SIZE_HISTOGRAM;i++) {        
+        shadow_box_hits += data.shadow.trav_hit_boxes[i];
+        weighted_shadow_box_hits += data.shadow.trav_hit_boxes[i]*i;
+      }
+      cout << "    #hit_boxes    = ";
+      for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.shadow.trav_hit_boxes[i] / shadow_box_hits << " ";
+      cout << std::endl;
+      for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.shadow.trav_hit_boxes[i]*i / weighted_shadow_box_hits << " ";
+      cout << std::endl;
+    }
+    cout << std::endl;
+
+    /* print per traversal numbers */
+    cout << "--------- PER TRAVERSAL ---------" << std::endl;
+    float active_normal_travs       = float(cntrs.active.normal.travs      )/float(cntrs.all.normal.travs      ); // ratio of the "active" counter to the "all" counter
+    float active_normal_trav_nodes  = float(cntrs.active.normal.trav_nodes )/float(cntrs.all.normal.trav_nodes );
+    float active_normal_trav_xfm_nodes  = float(cntrs.active.normal.trav_xfm_nodes )/float(cntrs.all.normal.trav_xfm_nodes );
+    float active_normal_trav_leaves = float(cntrs.active.normal.trav_leaves)/float(cntrs.all.normal.trav_leaves);
+    float active_normal_trav_prims   = float(cntrs.active.normal.trav_prims  )/float(cntrs.all.normal.trav_prims  );
+    float active_normal_trav_prim_hits = float(cntrs.active.normal.trav_prim_hits  )/float(cntrs.all.normal.trav_prim_hits  );
+    float active_normal_trav_stack_pop = float(cntrs.active.normal.trav_stack_pop  )/float(cntrs.all.normal.trav_stack_pop  );
+
+    cout << "  #normal_travs   = " << float(cntrs.code.normal.travs      )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_travs       << "% active" << std::endl; // first value is a counter divided by itself
+    cout << "    #nodes        = " << float(cntrs.code.normal.trav_nodes )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_nodes  << "% active" << std::endl;
+    cout << "    #node_xfm     = " << float(cntrs.code.normal.trav_xfm_nodes )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_xfm_nodes  << "% active" << std::endl;
+    cout << "    #leaves       = " << float(cntrs.code.normal.trav_leaves)/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_leaves << "% active" << std::endl;
+    cout << "    #prims        = " << float(cntrs.code.normal.trav_prims  )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_prims   << "% active" << std::endl;
+    cout << "    #prim_hits    = " << float(cntrs.code.normal.trav_prim_hits  )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_prim_hits   << "% active" << std::endl;
+    cout << "    #stack_pop    = " << float(cntrs.code.normal.trav_stack_pop  )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_stack_pop   << "% active" << std::endl;
+
+    if (cntrs.all.shadow.travs) {
+      float active_shadow_travs       = float(cntrs.active.shadow.travs      )/float(cntrs.all.shadow.travs      );
+      float active_shadow_trav_nodes  = float(cntrs.active.shadow.trav_nodes )/float(cntrs.all.shadow.trav_nodes );
+      float active_shadow_trav_xfm_nodes  = float(cntrs.active.shadow.trav_xfm_nodes )/float(cntrs.all.shadow.trav_xfm_nodes );
+      float active_shadow_trav_leaves = float(cntrs.active.shadow.trav_leaves)/float(cntrs.all.shadow.trav_leaves);
+      float active_shadow_trav_prims   = float(cntrs.active.shadow.trav_prims  )/float(cntrs.all.shadow.trav_prims  );
+      float active_shadow_trav_prim_hits = float(cntrs.active.shadow.trav_prim_hits  )/float(cntrs.all.shadow.trav_prim_hits  );
+
+      cout << "  #shadow_travs = " << float(cntrs.code.shadow.travs      )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_travs       << "% active" << std::endl;
+      cout << "    #nodes      = " << float(cntrs.code.shadow.trav_nodes )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_nodes  << "% active" << std::endl;
+      cout << "    #nodes_xfm  = " << float(cntrs.code.shadow.trav_xfm_nodes )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_xfm_nodes  << "% active" << std::endl;
+      cout << "    #leaves     = " << float(cntrs.code.shadow.trav_leaves)/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_leaves << "% active" << std::endl;
+      cout << "    #prims      = " << float(cntrs.code.shadow.trav_prims  )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_prims   << "% active" << std::endl;
+      cout << "    #prim_hits  = " << float(cntrs.code.shadow.trav_prim_hits  )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_prim_hits   << "% active" << std::endl;
+
+    }
+    cout << std::endl;
+
+     /* print user counters for performance tuning */
+    cout << "--------- USER ---------" << std::endl;
+    for (size_t i=0; i<10; i++) // each user counter divided by the sum of all normal and shadow traversals
+      cout << "#user" << i << " = " << float(cntrs.user[i])/float(cntrs.all.normal.travs+cntrs.all.shadow.travs) << " per traversal" << std::endl;
+
+    cout << "#user5/user3 " << 100.0f*float(cntrs.user[5])/float(cntrs.user[3]) << "%" << std::endl; // fixed ratios of user counters 5..7 to user counter 3
+    cout << "#user6/user3 " << 100.0f*float(cntrs.user[6])/float(cntrs.user[3]) << "%" << std::endl;
+    cout << "#user7/user3 " << 100.0f*float(cntrs.user[7])/float(cntrs.user[3]) << "%" << std::endl;
+    cout << std::endl;
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/stat.h b/thirdparty/embree-aarch64/kernels/common/stat.h
new file mode 100644
index 0000000000..3cda2bd014
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/stat.h
@@ -0,0 +1,116 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+/* Macros to gather statistics; all of them expand to nothing unless EMBREE_STAT_COUNTERS is defined */
+#ifdef EMBREE_STAT_COUNTERS
+#  define STAT(x) x
+#  define STAT3(s,x,y,z) \
+  STAT(Stat::get().code  .s+=x);               \
+  STAT(Stat::get().active.s+=y);               \
+  STAT(Stat::get().all   .s+=z);
+#  define STAT_USER(i,x) Stat::get().user[i]+=x;
+#else
+#  define STAT(x)
+#  define STAT3(s,x,y,z)
+#  define STAT_USER(i,x) 
+#endif
+
+namespace embree
+{
+  /*! Gathers ray tracing statistics. We count 1) how often a code
+   *  location is reached, 2) how many SIMD lanes are active, 3) how
+   *  many SIMD lanes reach the code location */
+  class Stat
+  { 
+  public:
+
+    static const size_t SIZE_HISTOGRAM = 64+1; // number of trav_hit_boxes bins that Stat::print iterates over
+
+    /*! constructs stat counter class */
+    Stat ();
+
+    /*! destructs stat counter class */
+    ~Stat ();
+
+    class Counters 
+    {
+    public:
+      Counters () { 
+        clear(); 
+      }
+      
+      void clear() // resets all three counter sets and the user counters to zero
+      { 
+        all.clear();
+        active.clear();
+        code.clear();
+        for (auto& u : user) u.store(0);
+      }
+
+    public:
+
+	/* per packet and per ray statistics */
+	struct Data
+        {
+          void clear () {
+            normal.clear();
+            shadow.clear();
+            point_query.clear();
+          }
+
+	  /* normal and shadow ray statistics */
+	  struct 
+          {
+            void clear() 
+            {
+              travs.store(0);
+              trav_nodes.store(0);
+              trav_leaves.store(0);
+              trav_prims.store(0);
+              trav_prim_hits.store(0);
+              for (auto& v : trav_hit_boxes) v.store(0);
+              trav_stack_pop.store(0);
+              trav_stack_nodes.store(0); 
+              trav_xfm_nodes.store(0); 
+            }
+
+          public:
+	    std::atomic<size_t> travs;
+	    std::atomic<size_t> trav_nodes;
+	    std::atomic<size_t> trav_leaves;
+	    std::atomic<size_t> trav_prims;
+	    std::atomic<size_t> trav_prim_hits;
+	    std::atomic<size_t> trav_hit_boxes[SIZE_HISTOGRAM+1]; // histogram; holds one more slot than SIZE_HISTOGRAM
+	    std::atomic<size_t> trav_stack_pop;
+	    std::atomic<size_t> trav_stack_nodes; 
+            std::atomic<size_t> trav_xfm_nodes; 
+            
+	  } normal, shadow, point_query;
+	} all, active, code; // the three counter sets updated by STAT3 (z, y and x arguments respectively)
+
+        std::atomic<size_t> user[10]; // counters incremented through STAT_USER(i,x)
+    };
+
+  public:
+
+    static __forceinline Counters& get() { // counters of the single static instance
+      return instance.cntrs;
+    }
+    
+    static void clear() {
+      instance.cntrs.clear();
+    }
+    
+    static void print(embree_ostream cout);
+
+  private: 
+    Counters cntrs;
+
+  private:
+    static Stat instance;
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/state.cpp b/thirdparty/embree-aarch64/kernels/common/state.cpp
new file mode 100644
index 0000000000..51fc9b7826
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/state.cpp
@@ -0,0 +1,543 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "state.h"
+#include "../../common/lexers/streamfilters.h"
+
+namespace embree
+{
+  MutexSys g_printMutex; // definition of the mutex declared extern in state.h
+
+  State::ErrorHandler State::g_errorHandler; // definition of the static State::g_errorHandler member
+
+  State::ErrorHandler::ErrorHandler()
+    : thread_error(createTls()) {} // obtains the TLS handle that error() later queries with getTls/setTls
+
+  State::ErrorHandler::~ErrorHandler()
+  {
+    /* under the mutex: free every error slot handed out by error(), then drop the TLS handle */
+    Lock<MutexSys> guard(errors_mutex);
+    for (RTCError* slot : thread_errors) delete slot;
+    destroyTls(thread_error);
+    thread_errors.clear();
+  }
+
+  RTCError* State::ErrorHandler::error() 
+  {
+    if (RTCError* cached = (RTCError*) getTls(thread_error)) return cached;
+
+    /* first call on this thread: allocate a slot, record it for the destructor and bind it to the TLS handle */
+    Lock<MutexSys> guard(errors_mutex);
+    RTCError* fresh = new RTCError(RTC_ERROR_NONE);
+    thread_errors.push_back(fresh);
+    setTls(thread_error,fresh);
+    return fresh;
+  }
+
+  State::State () // sets the configuration members below to their built-in defaults
+    : enabled_cpu_features(getCPUFeatures()),
+      enabled_builder_cpu_features(enabled_cpu_features),
+      frequency_level(FREQUENCY_SIMD256)
+  {
+    tri_accel = "default";
+    tri_builder = "default";
+    tri_traverser = "default";
+    
+    tri_accel_mb = "default";
+    tri_builder_mb = "default";
+    tri_traverser_mb = "default";
+
+    quad_accel = "default";
+    quad_builder = "default";
+    quad_traverser = "default";
+
+    quad_accel_mb = "default";
+    quad_builder_mb = "default";
+    quad_traverser_mb = "default";
+
+    line_accel = "default";
+    line_builder = "default";
+    line_traverser = "default";
+
+    line_accel_mb = "default";
+    line_builder_mb = "default";
+    line_traverser_mb = "default";
+    
+    hair_accel = "default";
+    hair_builder = "default";
+    hair_traverser = "default";
+
+    hair_accel_mb = "default";
+    hair_builder_mb = "default";
+    hair_traverser_mb = "default";
+
+    object_accel = "default";
+    object_builder = "default";
+    object_accel_min_leaf_size = 1;
+    object_accel_max_leaf_size = 1;
+
+    object_accel_mb = "default";
+    object_builder_mb = "default";
+    object_accel_mb_min_leaf_size = 1;
+    object_accel_mb_max_leaf_size = 1;
+
+    max_spatial_split_replications = 1.2f;
+    useSpatialPreSplits = false;
+
+    tessellation_cache_size = 128*1024*1024; // the value parse() stores for the config entry cache_size=128
+
+    subdiv_accel = "default";
+    subdiv_accel_mb = "default";
+
+    grid_accel = "default";
+    grid_builder = "default";
+    grid_accel_mb = "default";
+    grid_builder_mb = "default";
+
+    instancing_open_min = 0;
+    instancing_block_size = 0;
+    instancing_open_factor = 8.0f; 
+    instancing_open_max_depth = 32;
+    instancing_open_max = 50000000;
+
+    ignore_config_files = false;
+    float_exceptions = false;
+    quality_flags = -1; // stays -1 unless parse() sees a recognized "quality" value
+    scene_flags = -1; // stays -1 unless parse() sees a "scene_flags" entry
+    verbose = 0;
+    benchmark = 0;
+
+    numThreads = 0;
+    numUserThreads = 0;
+
+#if TASKING_INTERNAL
+    set_affinity = true;
+#else
+    set_affinity = false;
+#endif
+    /* per default enable affinity on KNL */
+    if (hasISA(AVX512KNL)) set_affinity = true;
+
+    start_threads = false;
+    enable_selockmemoryprivilege = false;
+#if defined(__LINUX__)
+    hugepages = true;
+#else
+    hugepages = false;
+#endif
+    hugepages_success = true;
+
+    alloc_main_block_size = 0;
+    alloc_num_main_slots = 0;
+    alloc_thread_block_size = 0;
+    alloc_single_thread_alloc = -1;
+
+    error_function = nullptr;
+    error_function_userptr = nullptr;
+
+    memory_monitor_function = nullptr;
+    memory_monitor_userptr = nullptr;
+  }
+
+  State::~State() { // empty body: no explicit cleanup is performed here
+  }
+
+  bool State::hasISA(const int isa) {
+    return (isa & ~enabled_cpu_features) == 0; // true when no requested feature bit is missing from the enabled set
+  }
+
+  bool State::checkISASupport() { // compares the enabled feature set against what getCPUFeatures() reports
+#if defined(__ARM_NEON)
+    /*
+     * NEON CPU type is a mixture of NEON and SSE2
+     */
+
+    bool hasSSE2 = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_SSE2; // SSE2 bit both reported and enabled
+
+    /* this will be true when explicitly initialize Device with `isa=neon` config */
+    bool hasNEON = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_NEON; // NEON bit both reported and enabled
+
+    return hasSSE2 || hasNEON; // on NEON builds either of the two bits is sufficient
+#else
+    return (getCPUFeatures() & enabled_cpu_features) == enabled_cpu_features; // every enabled bit must also be reported
+#endif
+  }
+  
+  void State::verify() // consists solely of assert() checks
+  {
+    /* verify that calculations stay in range */
+    assert(rcp(min_rcp_input)*FLT_LARGE+FLT_LARGE < 0.01f*FLT_MAX);
+
+    /* here we verify that CPP files compiled for a specific ISA only
+     * call that same or lower ISA version of non-inlined class member
+     * functions */
+#if defined(DEBUG)
+#if defined(EMBREE_TARGET_SSE2)
+#if !defined(__ARM_NEON)
+    assert(sse2::getISA() <= SSE2); // this one check is compiled out on NEON builds
+#endif
+#endif
+#if defined(EMBREE_TARGET_SSE42)
+    assert(sse42::getISA() <= SSE42);
+#endif
+#if defined(EMBREE_TARGET_AVX)
+    assert(avx::getISA() <= AVX);
+#endif
+#if defined(EMBREE_TARGET_AVX2)
+    assert(avx2::getISA() <= AVX2);
+#endif
+#if defined (EMBREE_TARGET_AVX512KNL)
+    assert(avx512knl::getISA() <= AVX512KNL);
+#endif
+#if defined (EMBREE_TARGET_AVX512SKX)
+    assert(avx512skx::getISA() <= AVX512SKX);
+#endif
+#endif
+  }
+
+  const char* symbols[3] = { "=", ",", "|" }; // symbol strings handed to the TokenStream in parseFile/parseString
+
+  bool State::parseFile(const FileName& fileName)
+  {
+    FILE* handle = fopen(fileName.c_str(),"r");
+    if (handle == nullptr) return false;
+    Ref<Stream<int> > chars = new FileStream(handle,fileName);
+
+    /* copy the global symbol table into the form TokenStream expects */
+    std::vector<std::string> syms;
+    for (const char* sym : symbols) syms.push_back(sym);
+
+    Ref<TokenStream> tokens = new TokenStream(new LineCommentFilter(chars,"#"),
+                                              TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.",
+                                              TokenStream::separators,syms);
+    parse(tokens);
+    return true;
+  }
+
+  void State::parseString(const char* cfg)
+  {
+    if (!cfg) return;
+
+    /* copy the global symbol table into the form TokenStream expects */
+    std::vector<std::string> syms;
+    for (const char* sym : symbols) syms.push_back(sym);
+
+    Ref<TokenStream> tokens = new TokenStream(new StrStream(cfg),
+                                              TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.",
+                                              TokenStream::separators,syms);
+    parse(tokens);
+  }
+  
+  int string_to_cpufeatures(const std::string& isa)
+  {
+    /* accepted spellings and the feature set each one selects */
+    const struct { const char* name; int features; } known[] = {
+      { "sse",       SSE       }, { "sse2",      SSE2      },
+      { "sse3",      SSE3      }, { "ssse3",     SSSE3     },
+      { "sse41",     SSE41     }, { "sse4.1",    SSE41     },
+      { "sse42",     SSE42     }, { "sse4.2",    SSE42     },
+      { "avx",       AVX       }, { "avxi",      AVXI      },
+      { "avx2",      AVX2      }, { "avx512knl", AVX512KNL },
+      { "avx512skx", AVX512SKX },
+    };
+
+    for (const auto& entry : known)
+      if (isa == entry.name) return entry.features;
+    return SSE2; // any other spelling selects SSE2
+  }
+
+  void State::parse(Ref<TokenStream> cin) // applies "key = value" entries (optionally separated by ",") from the token stream to this state
+  {
+    /* parse until end of stream */
+    while (cin->peek() != Token::Eof())
+    {
+      const Token tok = cin->get(); // a token that matches no branch below is simply dropped
+
+      if (tok == Token::Id("threads") && cin->trySymbol("=")) 
+        numThreads = cin->get().Int();
+
+      else if (tok == Token::Id("user_threads")&& cin->trySymbol("=")) 
+        numUserThreads = cin->get().Int();
+
+      else if (tok == Token::Id("set_affinity")&& cin->trySymbol("=")) 
+        set_affinity = cin->get().Int();
+
+      else if (tok == Token::Id("affinity")&& cin->trySymbol("=")) // second spelling that sets the same member as "set_affinity"
+        set_affinity = cin->get().Int();
+      
+      else if (tok == Token::Id("start_threads")&& cin->trySymbol("=")) 
+        start_threads = cin->get().Int();
+      
+      else if (tok == Token::Id("isa") && cin->trySymbol("=")) { // replaces both feature sets outright
+        std::string isa = toLowerCase(cin->get().Identifier());
+        enabled_cpu_features = string_to_cpufeatures(isa);
+        enabled_builder_cpu_features = enabled_cpu_features;
+      }
+
+      else if (tok == Token::Id("max_isa") && cin->trySymbol("=")) { // bitwise AND: can only narrow the current feature sets
+        std::string isa = toLowerCase(cin->get().Identifier());
+        enabled_cpu_features &= string_to_cpufeatures(isa);
+        enabled_builder_cpu_features &= enabled_cpu_features;
+      }
+
+      else if (tok == Token::Id("max_builder_isa") && cin->trySymbol("=")) { // narrows only the builder feature set
+        std::string isa = toLowerCase(cin->get().Identifier());
+        enabled_builder_cpu_features &= string_to_cpufeatures(isa);
+      }
+
+      else if (tok == Token::Id("frequency_level") && cin->trySymbol("=")) {
+        std::string freq = cin->get().Identifier(); // compared as-is; any other spelling leaves frequency_level unchanged
+        if      (freq == "simd128") frequency_level = FREQUENCY_SIMD128;
+        else if (freq == "simd256") frequency_level = FREQUENCY_SIMD256;
+        else if (freq == "simd512") frequency_level = FREQUENCY_SIMD512;
+      }
+
+      else if (tok == Token::Id("enable_selockmemoryprivilege") && cin->trySymbol("=")) {
+        enable_selockmemoryprivilege = cin->get().Int();
+      }
+      else if (tok == Token::Id("hugepages") && cin->trySymbol("=")) {
+        hugepages = cin->get().Int();
+      }
+
+      else if (tok == Token::Id("ignore_config_files") && cin->trySymbol("="))
+        ignore_config_files = cin->get().Int();
+      else if (tok == Token::Id("float_exceptions") && cin->trySymbol("=")) 
+        float_exceptions = cin->get().Int();
+
+      else if ((tok == Token::Id("tri_accel") || tok == Token::Id("accel")) && cin->trySymbol("=")) // the unprefixed keys are aliases for the triangle settings
+        tri_accel = cin->get().Identifier();
+      else if ((tok == Token::Id("tri_builder") || tok == Token::Id("builder")) && cin->trySymbol("="))
+        tri_builder = cin->get().Identifier();
+      else if ((tok == Token::Id("tri_traverser") || tok == Token::Id("traverser")) && cin->trySymbol("="))
+        tri_traverser = cin->get().Identifier();
+     
+      else if ((tok == Token::Id("tri_accel_mb") || tok == Token::Id("accel_mb")) && cin->trySymbol("="))
+        tri_accel_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("tri_builder_mb") || tok == Token::Id("builder_mb")) && cin->trySymbol("="))
+        tri_builder_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("tri_traverser_mb") || tok == Token::Id("traverser_mb")) && cin->trySymbol("="))
+        tri_traverser_mb = cin->get().Identifier();
+
+      else if ((tok == Token::Id("quad_accel")) && cin->trySymbol("="))
+        quad_accel = cin->get().Identifier();
+      else if ((tok == Token::Id("quad_builder")) && cin->trySymbol("="))
+        quad_builder = cin->get().Identifier();
+      else if ((tok == Token::Id("quad_traverser")) && cin->trySymbol("="))
+        quad_traverser = cin->get().Identifier();
+
+      else if ((tok == Token::Id("quad_accel_mb")) && cin->trySymbol("="))
+        quad_accel_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("quad_builder_mb")) && cin->trySymbol("="))
+        quad_builder_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("quad_traverser_mb")) && cin->trySymbol("="))
+        quad_traverser_mb = cin->get().Identifier();
+
+      else if ((tok == Token::Id("line_accel")) && cin->trySymbol("="))
+        line_accel = cin->get().Identifier();
+      else if ((tok == Token::Id("line_builder")) && cin->trySymbol("="))
+        line_builder = cin->get().Identifier();
+      else if ((tok == Token::Id("line_traverser")) && cin->trySymbol("="))
+        line_traverser = cin->get().Identifier();
+
+      else if ((tok == Token::Id("line_accel_mb")) && cin->trySymbol("="))
+        line_accel_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("line_builder_mb")) && cin->trySymbol("="))
+        line_builder_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("line_traverser_mb")) && cin->trySymbol("="))
+        line_traverser_mb = cin->get().Identifier();
+      
+      else if (tok == Token::Id("hair_accel") && cin->trySymbol("="))
+        hair_accel = cin->get().Identifier();
+      else if (tok == Token::Id("hair_builder") && cin->trySymbol("="))
+        hair_builder = cin->get().Identifier();
+      else if (tok == Token::Id("hair_traverser") && cin->trySymbol("="))
+        hair_traverser = cin->get().Identifier();
+
+      else if (tok == Token::Id("hair_accel_mb") && cin->trySymbol("="))
+        hair_accel_mb = cin->get().Identifier();
+      else if (tok == Token::Id("hair_builder_mb") && cin->trySymbol("="))
+        hair_builder_mb = cin->get().Identifier();
+      else if (tok == Token::Id("hair_traverser_mb") && cin->trySymbol("="))
+        hair_traverser_mb = cin->get().Identifier();
+
+      else if (tok == Token::Id("object_accel") && cin->trySymbol("="))
+        object_accel = cin->get().Identifier();
+      else if (tok == Token::Id("object_builder") && cin->trySymbol("="))
+        object_builder = cin->get().Identifier();
+      else if (tok == Token::Id("object_accel_min_leaf_size") && cin->trySymbol("="))
+        object_accel_min_leaf_size = cin->get().Int();
+      else if (tok == Token::Id("object_accel_max_leaf_size") && cin->trySymbol("="))
+        object_accel_max_leaf_size = cin->get().Int();
+
+      else if (tok == Token::Id("object_accel_mb") && cin->trySymbol("="))
+        object_accel_mb = cin->get().Identifier();
+      else if (tok == Token::Id("object_builder_mb") && cin->trySymbol("="))
+        object_builder_mb = cin->get().Identifier();
+      else if (tok == Token::Id("object_accel_mb_min_leaf_size") && cin->trySymbol("="))
+        object_accel_mb_min_leaf_size = cin->get().Int();
+      else if (tok == Token::Id("object_accel_mb_max_leaf_size") && cin->trySymbol("="))
+        object_accel_mb_max_leaf_size = cin->get().Int();
+
+      else if (tok == Token::Id("instancing_open_min") && cin->trySymbol("="))
+        instancing_open_min = cin->get().Int();
+      else if (tok == Token::Id("instancing_block_size") && cin->trySymbol("=")) { // setting a block size also zeroes the open factor
+        instancing_block_size = cin->get().Int();
+        instancing_open_factor = 0.0f;
+      }
+      else if (tok == Token::Id("instancing_open_max_depth") && cin->trySymbol("="))
+        instancing_open_max_depth = cin->get().Int();
+      else if (tok == Token::Id("instancing_open_factor") && cin->trySymbol("=")) { // setting an open factor also zeroes the block size
+        instancing_block_size = 0;
+        instancing_open_factor = cin->get().Float();
+      }
+      else if (tok == Token::Id("instancing_open_max") && cin->trySymbol("="))
+        instancing_open_max = cin->get().Int();
+
+      else if (tok == Token::Id("subdiv_accel") && cin->trySymbol("="))
+        subdiv_accel = cin->get().Identifier();
+      else if (tok == Token::Id("subdiv_accel_mb") && cin->trySymbol("="))
+        subdiv_accel_mb = cin->get().Identifier();
+
+      else if (tok == Token::Id("grid_accel") && cin->trySymbol("="))
+        grid_accel = cin->get().Identifier();
+      else if (tok == Token::Id("grid_accel_mb") && cin->trySymbol("="))
+        grid_accel_mb = cin->get().Identifier();
+      
+      else if (tok == Token::Id("verbose") && cin->trySymbol("="))
+        verbose = cin->get().Int();
+      else if (tok == Token::Id("benchmark") && cin->trySymbol("="))
+        benchmark = cin->get().Int();
+      
+      else if (tok == Token::Id("quality")) {
+        if (cin->trySymbol("=")) {
+          Token flag = cin->get(); // an unrecognized value leaves quality_flags unchanged
+          if      (flag == Token::Id("low"))    quality_flags = RTC_BUILD_QUALITY_LOW;
+          else if (flag == Token::Id("medium")) quality_flags = RTC_BUILD_QUALITY_MEDIUM;
+          else if (flag == Token::Id("high"))   quality_flags = RTC_BUILD_QUALITY_HIGH;
+        }
+      }
+
+      else if (tok == Token::Id("scene_flags")) {
+        scene_flags = 0; // cleared even when no "=" follows
+        if (cin->trySymbol("=")) {
+          do { // flags are combined from a "|"-separated list
+            Token flag = cin->get();
+            if (flag == Token::Id("dynamic") ) scene_flags |= RTC_SCENE_FLAG_DYNAMIC;
+            else if (flag == Token::Id("compact")) scene_flags |= RTC_SCENE_FLAG_COMPACT;
+            else if (flag == Token::Id("robust")) scene_flags |= RTC_SCENE_FLAG_ROBUST;
+          } while (cin->trySymbol("|"));
+        }
+      }
+      
+      else if (tok == Token::Id("max_spatial_split_replications") && cin->trySymbol("="))
+        max_spatial_split_replications = cin->get().Float();
+
+      else if (tok == Token::Id("presplits") && cin->trySymbol("="))
+        useSpatialPreSplits = cin->get().Int() != 0 ? true : false;
+
+      else if (tok == Token::Id("tessellation_cache_size") && cin->trySymbol("=")) // the given value is scaled by 1024*1024
+        tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f);
+      else if (tok == Token::Id("cache_size") && cin->trySymbol("=")) // second spelling that sets the same member
+        tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f);
+
+      else if (tok == Token::Id("alloc_main_block_size") && cin->trySymbol("="))
+        alloc_main_block_size = cin->get().Int();
+       else if (tok == Token::Id("alloc_num_main_slots") && cin->trySymbol("="))
+        alloc_num_main_slots = cin->get().Int();
+       else if (tok == Token::Id("alloc_thread_block_size") && cin->trySymbol("="))
+         alloc_thread_block_size = cin->get().Int();
+       else if (tok == Token::Id("alloc_single_thread_alloc") && cin->trySymbol("="))
+         alloc_single_thread_alloc = cin->get().Int();
+
+      cin->trySymbol(","); // optional , separator
+    }
+  }
+
+  bool State::verbosity(size_t N) {
+    return verbose >= N; // true when the configured verbosity reaches level N
+  }
+
+  void State::print() // dumps the current configuration to std::cout
+  {
+    std::cout << "general:" << std::endl;
+    std::cout << "  build threads      = " << numThreads   << std::endl;
+    std::cout << "  build user threads = " << numUserThreads   << std::endl;
+    std::cout << "  start_threads      = " << start_threads << std::endl;
+    std::cout << "  affinity           = " << set_affinity << std::endl;
+    std::cout << "  frequency_level    = ";
+    switch (frequency_level) {
+    case FREQUENCY_SIMD128: std::cout << "simd128" << std::endl; break;
+    case FREQUENCY_SIMD256: std::cout << "simd256" << std::endl; break;
+    case FREQUENCY_SIMD512: std::cout << "simd512" << std::endl; break;
+    default: std::cout << "error" << std::endl; break;
+    }
+    
+    std::cout << "  hugepages          = ";
+    if (!hugepages) std::cout << "disabled" << std::endl;
+    else if (hugepages_success) std::cout << "enabled" << std::endl;
+    else std::cout << "failed" << std::endl;
+
+    std::cout << "  verbosity          = " << verbose << std::endl;
+    std::cout << "  cache_size         = " << float(tessellation_cache_size)/(1024.0f*1024.0f) << " MB" << std::endl; // same 1024*1024 unit that parse() applies, so cache_size=128 prints as 128
+    std::cout << "  max_spatial_split_replications = " << max_spatial_split_replications << std::endl;
+    
+    std::cout << "triangles:" << std::endl;
+    std::cout << "  accel              = " << tri_accel << std::endl;
+    std::cout << "  builder            = " << tri_builder << std::endl;
+    std::cout << "  traverser          = " << tri_traverser << std::endl;
+        
+    std::cout << "motion blur triangles:" << std::endl;
+    std::cout << "  accel              = " << tri_accel_mb << std::endl;
+    std::cout << "  builder            = " << tri_builder_mb << std::endl;
+    std::cout << "  traverser          = " << tri_traverser_mb << std::endl;
+
+    std::cout << "quads:" << std::endl;
+    std::cout << "  accel              = " << quad_accel << std::endl;
+    std::cout << "  builder            = " << quad_builder << std::endl;
+    std::cout << "  traverser          = " << quad_traverser << std::endl;
+
+    std::cout << "motion blur quads:" << std::endl;
+    std::cout << "  accel              = " << quad_accel_mb << std::endl;
+    std::cout << "  builder            = " << quad_builder_mb << std::endl;
+    std::cout << "  traverser          = " << quad_traverser_mb << std::endl;
+
+    std::cout << "line segments:" << std::endl;
+    std::cout << "  accel              = " << line_accel << std::endl;
+    std::cout << "  builder            = " << line_builder << std::endl;
+    std::cout << "  traverser          = " << line_traverser << std::endl;
+
+    std::cout << "motion blur line segments:" << std::endl;
+    std::cout << "  accel              = " << line_accel_mb << std::endl;
+    std::cout << "  builder            = " << line_builder_mb << std::endl;
+    std::cout << "  traverser          = " << line_traverser_mb << std::endl;
+    
+    std::cout << "hair:" << std::endl;
+    std::cout << "  accel              = " << hair_accel << std::endl;
+    std::cout << "  builder            = " << hair_builder << std::endl;
+    std::cout << "  traverser          = " << hair_traverser << std::endl;
+
+    std::cout << "motion blur hair:" << std::endl;
+    std::cout << "  accel              = " << hair_accel_mb << std::endl;
+    std::cout << "  builder            = " << hair_builder_mb << std::endl;
+    std::cout << "  traverser          = " << hair_traverser_mb << std::endl;
+    
+    std::cout << "subdivision surfaces:" << std::endl;
+    std::cout << "  accel              = " << subdiv_accel << std::endl;
+
+    std::cout << "grids:" << std::endl;
+    std::cout << "  accel              = " << grid_accel << std::endl;
+    std::cout << "  builder            = " << grid_builder << std::endl;
+
+    std::cout << "motion blur grids:" << std::endl;
+    std::cout << "  accel              = " << grid_accel_mb << std::endl;
+    std::cout << "  builder            = " << grid_builder_mb << std::endl;
+
+    std::cout << "object_accel:" << std::endl;
+    std::cout << "  min_leaf_size      = " << object_accel_min_leaf_size << std::endl;
+    std::cout << "  max_leaf_size      = " << object_accel_max_leaf_size << std::endl;
+
+    std::cout << "object_accel_mb:" << std::endl;
+    std::cout << "  min_leaf_size      = " << object_accel_mb_min_leaf_size << std::endl;
+    std::cout << "  max_leaf_size      = " << object_accel_mb_max_leaf_size << std::endl;
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/state.h b/thirdparty/embree-aarch64/kernels/common/state.h
new file mode 100644
index 0000000000..d0fccc023f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/state.h
@@ -0,0 +1,197 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /* mutex to make printing to cout thread safe */
+  extern MutexSys g_printMutex;
+
+  struct State : public RefCount
+  {
+  public:
+    /*! state construction */
+    State ();
+
+    /*! state destruction */
+    ~State();
+
+    /*! verifies that state is correct */
+    void verify();
+
+    /*! parses state from a configuration file */
+    bool parseFile(const FileName& fileName);
+
+    /*! parses the state from a string */
+    void parseString(const char* cfg);
+
+    /*! parses the state from a stream */
+    void parse(Ref<TokenStream> cin);
+
+    /*! prints the state */
+    void print();
+
+    /*! checks if verbosity level is at least N */
+    bool verbosity(size_t N);
+
+    /*! checks if some particular ISA is enabled */
+    bool hasISA(const int isa);
+
+    /*! check whether selected ISA is supported by the HW */    
+    bool checkISASupport();
+    
+  public:
+    std::string tri_accel;                 //!< acceleration structure to use for triangles
+    std::string tri_builder;               //!< builder to use for triangles
+    std::string tri_traverser;             //!< traverser to use for triangles
+    
+  public:
+    std::string tri_accel_mb;              //!< acceleration structure to use for motion blur triangles
+    std::string tri_builder_mb;            //!< builder to use for motion blur triangles
+    std::string tri_traverser_mb;          //!< traverser to use for motion blur triangles
+
+  public:
+    std::string quad_accel;                 //!< acceleration structure to use for quads
+    std::string quad_builder;               //!< builder to use for quads
+    std::string quad_traverser;             //!< traverser to use for quads
+
+  public:
+    std::string quad_accel_mb;             //!< acceleration structure to use for motion blur quads
+    std::string quad_builder_mb;           //!< builder to use for motion blur quads
+    std::string quad_traverser_mb;         //!< traverser to use for motion blur quads
+
+  public:
+    std::string line_accel;                 //!< acceleration structure to use for line segments
+    std::string line_builder;               //!< builder to use for line segments
+    std::string line_traverser;             //!< traverser to use for line segments
+
+  public:
+    std::string line_accel_mb;             //!< acceleration structure to use for motion blur line segments
+    std::string line_builder_mb;           //!< builder to use for motion blur line segments
+    std::string line_traverser_mb;         //!< traverser to use for motion blur line segments
+
+  public:
+    std::string hair_accel;                //!< hair acceleration structure to use
+    std::string hair_builder;              //!< builder to use for hair
+    std::string hair_traverser;            //!< traverser to use for hair
+
+  public:
+    std::string hair_accel_mb;             //!< acceleration structure to use for motion blur hair
+    std::string hair_builder_mb;           //!< builder to use for motion blur hair
+    std::string hair_traverser_mb;         //!< traverser to use for motion blur hair
+
+  public:
+    std::string object_accel;               //!< acceleration structure for user geometries
+    std::string object_builder;             //!< builder for user geometries
+    int object_accel_min_leaf_size;         //!< minimum leaf size for object acceleration structure
+    int object_accel_max_leaf_size;         //!< maximum leaf size for object acceleration structure
+
+  public:
+    std::string object_accel_mb;            //!< acceleration structure for motion blur user geometries
+    std::string object_builder_mb;          //!< builder for motion blur user geometries
+    int object_accel_mb_min_leaf_size;      //!< minimum leaf size for mblur object acceleration structure
+    int object_accel_mb_max_leaf_size;      //!< maximum leaf size for mblur object acceleration structure
+
+  public:
+    std::string subdiv_accel;              //!< acceleration structure to use for subdivision surfaces
+    std::string subdiv_accel_mb;           //!< acceleration structure to use for motion blur subdivision surfaces
+
+  public:
+    std::string grid_accel;              //!< acceleration structure to use for grids
+    std::string grid_builder;            //!< builder for grids
+    std::string grid_accel_mb;           //!< acceleration structure to use for motion blur grids
+    std::string grid_builder_mb;         //!< builder for motion blur grids
+
+  public:
+    float max_spatial_split_replications;  //!< maximally replications*N many primitives in accel for spatial splits
+    bool useSpatialPreSplits;              //!< use spatial pre-splits instead of the full spatial split builder
+    size_t tessellation_cache_size;        //!< size of the shared tessellation cache 
+
+  public:
+    size_t instancing_open_min;            //!< instancing opens tree to minimally that number of subtrees
+    size_t instancing_block_size;          //!< instancing opens tree up to average block size of primitives
+    float  instancing_open_factor;         //!< instancing opens tree up to x times the number of instances
+    size_t instancing_open_max_depth;      //!< maximum open depth for geometries
+    size_t instancing_open_max;            //!< instancing opens tree to maximally that number of subtrees
+
+  public:
+    bool ignore_config_files;              //!< if true no more config files get parsed
+    bool float_exceptions;                 //!< enable floating point exceptions
+    int quality_flags;
+    int scene_flags;
+    size_t verbose;                        //!< verbosity of output
+    size_t benchmark;                      //!< benchmark setting; NOTE(review): presumably nonzero enables benchmarking - confirm against the parser
+    
+  public:
+    size_t numThreads;                     //!< number of threads to use in builders
+    size_t numUserThreads;                 //!< number of user provided threads to use in builders
+    bool set_affinity;                     //!< sets affinity for worker threads
+    bool start_threads;                    //!< true when threads should be started at device creation time
+    int enabled_cpu_features;              //!< CPU ISA features to use
+    int enabled_builder_cpu_features;      //!< CPU ISA features to use for builders only
+    enum FREQUENCY_LEVEL {
+      FREQUENCY_SIMD128,
+      FREQUENCY_SIMD256,
+      FREQUENCY_SIMD512
+    } frequency_level;                     //!< frequency level the app wants to run on (default is SIMD256)
+    bool enable_selockmemoryprivilege;     //!< configures the SeLockMemoryPrivilege under Windows to enable huge pages
+    bool hugepages;                        //!< true if huge pages should get used
+    bool hugepages_success;                //!< status for enabling huge pages
+
+  public:
+    size_t alloc_main_block_size;          //!< main allocation block size (shared between threads)
+    int alloc_num_main_slots;              //!< number of such shared blocks to be used to allocate
+    size_t alloc_thread_block_size;        //!< size of thread local allocator block size
+    int alloc_single_thread_alloc;         //!< in single mode nodes and leaves use same thread local allocator
+
+  public:
+
+    /*! AVX is usable when the ISA is enabled and the frequency level is not restricted to SIMD128 */
+    bool canUseAVX() {
+      return hasISA(AVX) && frequency_level != FREQUENCY_SIMD128;
+    }
+
+    /*! AVX2 is usable when the ISA is enabled and the frequency level is not restricted to SIMD128 */
+    bool canUseAVX2() {
+      return hasISA(AVX2) && frequency_level != FREQUENCY_SIMD128;
+    }
+    
+    struct ErrorHandler
+    {
+    public:
+      ErrorHandler();
+      ~ErrorHandler();
+      RTCError* error();
+
+    public:
+      tls_t thread_error;
+      std::vector<RTCError*> thread_errors;
+      MutexSys errors_mutex;
+    };
+    ErrorHandler errorHandler;
+    static ErrorHandler g_errorHandler;
+
+  public:
+    void setErrorFunction(RTCErrorFunction fptr, void* uptr) 
+    {
+      error_function = fptr;
+      error_function_userptr = uptr;
+    }
+
+    RTCErrorFunction error_function;       //!< error callback stored by setErrorFunction
+    void* error_function_userptr;          //!< user pointer stored together with the error callback
+
+  public:
+    void setMemoryMonitorFunction(RTCMemoryMonitorFunction fptr, void* uptr) 
+    {
+      memory_monitor_function = fptr;
+      memory_monitor_userptr = uptr;
+    }
+      
+    RTCMemoryMonitorFunction memory_monitor_function; //!< memory monitor callback stored by setMemoryMonitorFunction
+    void* memory_monitor_userptr;                     //!< user pointer stored together with the memory monitor callback
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/vector.h b/thirdparty/embree-aarch64/kernels/common/vector.h
new file mode 100644
index 0000000000..b478762240
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/vector.h
@@ -0,0 +1,76 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+#include "default.h"
+
+namespace embree
+{
+  /*! callback interface for reporting memory usage changes: the allocator in this file passes a positive byte count (post=false) before allocating and a negative one (post=true) after freeing */
+  struct MemoryMonitorInterface {
+    virtual void memoryMonitor(ssize_t bytes, bool post) = 0;
+  };
+
+  /*! allocator that performs aligned monitored allocations; every request is reported to the MemoryMonitorInterface and large requests are served by os_malloc */
+  template<typename T, size_t alignment = 64>
+    struct aligned_monitored_allocator
+    {
+      typedef T value_type;
+      typedef T* pointer;
+      typedef const T* const_pointer;
+      typedef T& reference;
+      typedef const T& const_reference;
+      typedef std::size_t size_type;
+      typedef std::ptrdiff_t difference_type;
+      
+      __forceinline aligned_monitored_allocator(MemoryMonitorInterface* device) 
+        : device(device), hugepages(false) {} // hugepages is fixed to false; nothing in this struct changes it
+
+      __forceinline pointer allocate( size_type n ) 
+      {
+        if (n) {
+          assert(device);
+          device->memoryMonitor(n*sizeof(T),false); // report the request before allocating
+        }
+        if (n*sizeof(value_type) >= 14 * PAGE_SIZE_2M) // large blocks bypass alignedMalloc and come from os_malloc
+        {
+          pointer p =  (pointer) os_malloc(n*sizeof(value_type),hugepages);
+          assert(p);
+          return p;
+        }
+        return (pointer) alignedMalloc(n*sizeof(value_type),alignment);
+      }
+
+      __forceinline void deallocate( pointer p, size_type n ) 
+      {
+        if (p)
+        {
+          if (n*sizeof(value_type) >= 14 * PAGE_SIZE_2M) // same size test as allocate() so the matching free routine is used
+            os_free(p,n*sizeof(value_type),hugepages); 
+          else
+            alignedFree(p);
+        }
+        else assert(n == 0);
+
+        if (n) {
+          assert(device);
+          device->memoryMonitor(-ssize_t(n*sizeof(T)),true); // negate after the size_t product: no signed/unsigned mixing, value stays a true negative ssize_t
+        }
+      }
+
+      __forceinline void construct( pointer p, const_reference val ) {
+        new (p) T(val);
+      }
+
+      __forceinline void destroy( pointer p ) {
+        p->~T();
+      }
+
+    private:
+      MemoryMonitorInterface* device; //!< receiver of the memoryMonitor notifications
+      bool hugepages;                 //!< forwarded to os_malloc/os_free
+    };
+
+  /*! vector_t whose storage comes from aligned_monitored_allocator, aligned to the natural alignment of T */
+  template<typename T>
+    using mvector = vector_t<T,aligned_monitored_allocator<T,alignof(T)> >;
+}
diff --git a/thirdparty/embree-aarch64/kernels/config.h b/thirdparty/embree-aarch64/kernels/config.h
new file mode 100644
index 0000000000..80a8ab2a56
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/config.h
@@ -0,0 +1,76 @@
+
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+/* #undef EMBREE_RAY_MASK */
+/* #undef EMBREE_STAT_COUNTERS */
+/* #undef EMBREE_BACKFACE_CULLING */
+/* #undef EMBREE_BACKFACE_CULLING_CURVES */
+#define EMBREE_FILTER_FUNCTION
+/* #undef EMBREE_IGNORE_INVALID_RAYS */
+#define EMBREE_GEOMETRY_TRIANGLE
+/* #undef EMBREE_GEOMETRY_QUAD */
+/* #undef EMBREE_GEOMETRY_CURVE */
+/* #undef EMBREE_GEOMETRY_SUBDIVISION */
+/* #undef EMBREE_GEOMETRY_USER */
+/* #undef EMBREE_GEOMETRY_INSTANCE */
+/* #undef EMBREE_GEOMETRY_GRID */
+/* #undef EMBREE_GEOMETRY_POINT */
+/* #undef EMBREE_RAY_PACKETS */
+/* #undef EMBREE_COMPACT_POLYS */
+
+#define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+  #define IF_ENABLED_TRIS(x) x // expands to its argument when the geometry type is enabled
+#else // !EMBREE_GEOMETRY_TRIANGLE
+  #define IF_ENABLED_TRIS(x) // expands to nothing when the geometry type is disabled
+#endif // EMBREE_GEOMETRY_TRIANGLE
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+  #define IF_ENABLED_QUADS(x) x
+#else // !EMBREE_GEOMETRY_QUAD
+  #define IF_ENABLED_QUADS(x)
+#endif // EMBREE_GEOMETRY_QUAD
+
+#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT)
+  #define IF_ENABLED_CURVES_OR_POINTS(x) x
+#else // neither EMBREE_GEOMETRY_CURVE nor EMBREE_GEOMETRY_POINT
+  #define IF_ENABLED_CURVES_OR_POINTS(x)
+#endif // EMBREE_GEOMETRY_CURVE || EMBREE_GEOMETRY_POINT
+
+#if defined(EMBREE_GEOMETRY_CURVE)
+  #define IF_ENABLED_CURVES(x) x
+#else // !EMBREE_GEOMETRY_CURVE
+  #define IF_ENABLED_CURVES(x)
+#endif // EMBREE_GEOMETRY_CURVE
+
+#if defined(EMBREE_GEOMETRY_POINT)
+  #define IF_ENABLED_POINTS(x) x
+#else // !EMBREE_GEOMETRY_POINT
+  #define IF_ENABLED_POINTS(x)
+#endif // EMBREE_GEOMETRY_POINT
+
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+  #define IF_ENABLED_SUBDIV(x) x
+#else // !EMBREE_GEOMETRY_SUBDIVISION
+  #define IF_ENABLED_SUBDIV(x)
+#endif // EMBREE_GEOMETRY_SUBDIVISION
+
+#if defined(EMBREE_GEOMETRY_USER)
+  #define IF_ENABLED_USER(x) x
+#else // !EMBREE_GEOMETRY_USER
+  #define IF_ENABLED_USER(x)
+#endif // EMBREE_GEOMETRY_USER
+
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+  #define IF_ENABLED_INSTANCE(x) x
+#else // !EMBREE_GEOMETRY_INSTANCE
+  #define IF_ENABLED_INSTANCE(x)
+#endif // EMBREE_GEOMETRY_INSTANCE
+
+#if defined(EMBREE_GEOMETRY_GRID)
+  #define IF_ENABLED_GRIDS(x) x
+#else // !EMBREE_GEOMETRY_GRID
+  #define IF_ENABLED_GRIDS(x)
+#endif // EMBREE_GEOMETRY_GRID
diff --git a/thirdparty/embree-aarch64/kernels/geometry/cone.h b/thirdparty/embree-aarch64/kernels/geometry/cone.h
new file mode 100644
index 0000000000..961ef86160
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/cone.h
@@ -0,0 +1,321 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct Cone //!< cone defined by two axis points (p0,p1) and the radius at each of them (r0,r1)
+    {
+      const Vec3fa p0; //!< start position of cone
+      const Vec3fa p1; //!< end position of cone
+      const float r0;  //!< start radius of cone
+      const float r1;  //!< end radius of cone
+
+      __forceinline Cone(const Vec3fa& p0, const float r0, const Vec3fa& p1, const float r1) 
+        : p0(p0), p1(p1), r0(r0), r1(r1) {}
+
+      __forceinline bool intersect(const Vec3fa& org, const Vec3fa& dir, 
+                                   BBox1f& t_o, 
+                                   float& u0_o, Vec3fa& Ng0_o, 
+                                   float& u1_o, Vec3fa& Ng1_o) const 
+      {
+        /* set up the quadratic A*t^2 + B*t + C = 0 in the ray parameter t, with the cone expressed relative to the ray origin */
+        const Vec3fa v0 = p0-org;
+        const Vec3fa v1 = p1-org;
+        
+        const float rl = rcp_length(v1-v0);
+        const Vec3fa P0 = v0, dP = (v1-v0)*rl;
+        const float dr = (r1-r0)*rl;
+        const Vec3fa O = -P0, dO = dir;
+        
+        const float dOdO = dot(dO,dO);
+        const float OdO = dot(dO,O);
+        const float OO = dot(O,O);
+        const float dOz = dot(dP,dO);
+        const float Oz = dot(dP,O);
+
+        const float R = r0 + Oz*dr;          
+        const float A = dOdO - sqr(dOz) * (1.0f+sqr(dr));
+        const float B = 2.0f * (OdO - dOz*(Oz + R*dr));
+        const float C = OO - (sqr(Oz) + sqr(R));
+
+        /* we miss the cone if the discriminant is smaller than zero */
+        const float D = B*B - 4.0f*A*C;
+        if (D < 0.0f) return false;
+
+        /* special case for rays that are "parallel" to the cone (quadratic coefficient A is numerically zero) */
+        const float eps = float(1<<8)*float(ulp)*max(abs(dOdO),abs(sqr(dOz)));
+        if (unlikely(abs(A) < eps))
+        {
+          /* cylinder case */
+          if (abs(dr) < 16.0f*float(ulp)) {
+            if (C <= 0.0f) { t_o = BBox1f(neg_inf,pos_inf); return true; } 
+            else           { t_o = BBox1f(pos_inf,neg_inf); return false; }
+          }
+
+          /* cone case: the equation degenerates to the linear one B*t + C = 0 */
+          else 
+          {
+            /* if we hit the negative cone there cannot be a hit */
+            const float t = -C/B;
+            const float z0 = Oz+t*dOz;
+            const float z0r = r0+z0*dr;
+            if (z0r < 0.0f) return false;
+
+            /* test if we start inside or outside the cone */
+            if (dOz*dr > 0.0f) t_o = BBox1f(t,pos_inf);
+            else               t_o = BBox1f(neg_inf,t);
+          }
+        }
+
+        /* standard case for "non-parallel" rays */
+        else
+        {
+          const float Q = sqrt(D);
+          const float rcp_2A = rcp(2.0f*A);
+          t_o.lower = (-B-Q)*rcp_2A;
+          t_o.upper = (-B+Q)*rcp_2A;
+          
+          /* standard case where both hits are on same cone */
+          if (likely(A > 0.0f)) {
+            const float z0 = Oz+t_o.lower*dOz;
+            const float z0r = r0+z0*dr;
+            if (z0r < 0.0f) return false;
+          } 
+
+          /* special case where the hits are on the positive and negative cone */
+          else 
+          {
+            /* depending on the ray direction and the open direction
+             * of the cone we have a hit from inside or outside the
+             * cone */
+            if (dOz*dr > 0) t_o.upper = pos_inf;
+            else            t_o.lower = neg_inf;
+          }
+        }
+
+        /* calculates u and Ng for near hit (at t_o.lower) */
+        {
+          u0_o = (Oz+t_o.lower*dOz)*rl;
+          const Vec3fa Pr = t_o.lower*dir;
+          const Vec3fa Pl = v0 + u0_o*(v1-v0);
+          const Vec3fa R = normalize(Pr-Pl);
+          const Vec3fa U = (p1-p0)+(r1-r0)*R;
+          const Vec3fa V = cross(p1-p0,R);
+          Ng0_o = cross(V,U);
+        }
+
+        /* calculates u and Ng for far hit (at t_o.upper) */
+        {
+          u1_o = (Oz+t_o.upper*dOz)*rl;
+          const Vec3fa Pr = t_o.upper*dir;
+          const Vec3fa Pl = v0 + u1_o*(v1-v0);
+          const Vec3fa R = normalize(Pr-Pl);
+          const Vec3fa U = (p1-p0)+(r1-r0)*R;
+          const Vec3fa V = cross(p1-p0,R);
+          Ng1_o = cross(V,U);
+        }
+        return true;
+      }
+
+      __forceinline bool intersect(const Vec3fa& org, const Vec3fa& dir, BBox1f& t_o) const 
+      {
+        float u0_o; Vec3fa Ng0_o; float u1_o; Vec3fa Ng1_o; // u and Ng outputs are computed but discarded
+        return intersect(org,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o);
+      }
+
+      static bool verify(const size_t id, const Cone& cone, const Ray& ray, bool shouldhit, const float t0, const float t1)
+      {
+        float eps = 0.001f; // absolute tolerance when comparing expected and computed hit distances
+        BBox1f t; bool hit;
+        hit = cone.intersect(ray.org,ray.dir,t);
+
+        bool failed = hit != shouldhit;
+        if (shouldhit) failed |= std::isinf(t0) ? t0 != t.lower : (t0 == -1E6) ? t.lower > -1E6f : abs(t0-t.lower) > eps;
+        if (shouldhit) failed |= std::isinf(t1) ? t1 != t.upper : (t1 == +1E6) ? t.upper < +1E6f : abs(t1-t.upper) > eps;
+        if (!failed) return true;
+        embree_cout << "Cone test " << id << " failed: cone = " << cone << ", ray = " << ray << ", hit = " << hit << ", t = " << t << embree_endl; 
+        return false;
+      }
+
+      /* self-test: runs intersect() on fixed rays against two cones and a cylinder and returns true only if all cases pass */
+      static bool verify()
+      {
+        bool passed = true;
+        const Cone cone0(Vec3fa(0.0f,0.0f,0.0f),0.0f,Vec3fa(1.0f,0.0f,0.0f),1.0f);
+        passed &= verify(0,cone0,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,3.0f,pos_inf);
+        passed &= verify(1,cone0,Ray(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa(-1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,1.0f);
+        passed &= verify(2,cone0,Ray(Vec3fa(-1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),false,0.0f,0.0f);
+        passed &= verify(3,cone0,Ray(Vec3fa(+1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,1.0f,3.0f);
+        passed &= verify(4,cone0,Ray(Vec3fa(-1.0f,0.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,1.0f,pos_inf);
+        passed &= verify(5,cone0,Ray(Vec3fa(+1.0f,0.0f,0.0f),Vec3fa(-1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,1.0f);
+        passed &= verify(6,cone0,Ray(Vec3fa(+0.0f,0.0f,1.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,1.0f,1.0f);
+        passed &= verify(7,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(-1.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f);
+        passed &= verify(8,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(+1.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.5f,+1E6);
+        passed &= verify(9,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(-1.0f,+1.0f,+0.0f),0.0f,float(inf)),true,-1E6,-0.5f);
+        const Cone cone1(Vec3fa(0.0f,0.0f,0.0f),1.0f,Vec3fa(1.0f,0.0f,0.0f),0.0f);
+        passed &= verify(10,cone1,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,2.0f);
+        passed &= verify(11,cone1,Ray(Vec3fa(-1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,0.0f,4.0f);
+        const Cone cylinder(Vec3fa(0.0f,0.0f,0.0f),1.0f,Vec3fa(1.0f,0.0f,0.0f),1.0f);
+        passed &= verify(12,cylinder,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f);
+        passed &= verify(13,cylinder,Ray(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f);
+        passed &= verify(14,cylinder,Ray(Vec3fa(+2.0f,1.0f,2.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f);
+        passed &= verify(15,cylinder,Ray(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf);
+        passed &= verify(16,cylinder,Ray(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf);
+        passed &= verify(17,cylinder,Ray(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf);
+        passed &= verify(18,cylinder,Ray(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf);
+        return passed;
+      }
+
+      /*! output operator */
+      friend __forceinline embree_ostream operator<<(embree_ostream cout, const Cone& c) {
+        return cout << "Cone { p0 = " << c.p0 << ", r0 = " << c.r0 << ", p1 = " << c.p1 << ", r1 = " << c.r1 << "}";
+      }
+    };
+
+    template<int N>
+      struct ConeN //!< bundle of N cones, one per vector lane; operator[] extracts a single scalar Cone
+    {
+      typedef Vec3<vfloat<N>> Vec3vfN;
+      
+      const Vec3vfN p0;     //!< start position of cone
+      const Vec3vfN p1;     //!< end position of cone
+      const vfloat<N> r0;   //!< start radius of cone
+      const vfloat<N> r1;   //!< end radius of cone
+
+      __forceinline ConeN(const Vec3vfN& p0, const vfloat<N>& r0, const Vec3vfN& p1, const vfloat<N>& r1) 
+        : p0(p0), p1(p1), r0(r0), r1(r1) {}
+
+      __forceinline Cone operator[] (const size_t i) const
+      {
+        assert(i<N);
+        return Cone(Vec3fa(p0.x[i],p0.y[i],p0.z[i]),r0[i],Vec3fa(p1.x[i],p1.y[i],p1.z[i]),r1[i]);
+      }
+
+      __forceinline vbool<N> intersect(const Vec3fa& org, const Vec3fa& dir, 
+                                       BBox<vfloat<N>>& t_o, 
+                                       vfloat<N>& u0_o, Vec3vfN& Ng0_o, 
+                                       vfloat<N>& u1_o, Vec3vfN& Ng1_o) const
+      {
+        /* set up the quadratic A*t^2 + B*t + C = 0 in the ray parameter t, with the cones expressed relative to the ray origin */
+        const Vec3vfN v0 = p0-Vec3vfN(org);
+        const Vec3vfN v1 = p1-Vec3vfN(org);
+
+        const vfloat<N> rl = rcp_length(v1-v0);
+        const Vec3vfN P0 = v0, dP = (v1-v0)*rl;
+        const vfloat<N> dr = (r1-r0)*rl;
+        const Vec3vfN O = -P0, dO = dir;
+       
+        const vfloat<N> dOdO = dot(dO,dO);
+        const vfloat<N> OdO = dot(dO,O);
+        const vfloat<N> OO = dot(O,O);
+        const vfloat<N> dOz = dot(dP,dO);
+        const vfloat<N> Oz = dot(dP,O);
+        
+        const vfloat<N> R = r0 + Oz*dr;          
+        const vfloat<N> A = dOdO - sqr(dOz) * (vfloat<N>(1.0f)+sqr(dr));
+        const vfloat<N> B = 2.0f * (OdO - dOz*(Oz + R*dr));
+        const vfloat<N> C = OO - (sqr(Oz) + sqr(R));
+
+        /* we miss the cone if the discriminant is smaller than zero */
+        const vfloat<N> D = B*B - 4.0f*A*C;
+        vbool<N> valid = D >= 0.0f;
+        if (none(valid)) return valid;
+
+        /* special case for rays that are "parallel" to the cone (quadratic coefficient A is numerically zero) */
+        const vfloat<N> eps = float(1<<8)*float(ulp)*max(abs(dOdO),abs(sqr(dOz)));
+        const vbool<N> validt = valid &  (abs(A) < eps);
+        const vbool<N> validf = valid & !(abs(A) < eps);
+        if (unlikely(any(validt)))
+        {
+          const vbool<N> validtt = validt & (abs(dr) <  16.0f*float(ulp)); // masks must be N wide like every other mask in this template
+          const vbool<N> validtf = validt & (abs(dr) >= 16.0f*float(ulp));
+          
+          /* cylinder case */
+          if (unlikely(any(validtt))) 
+          {
+            t_o.lower = select(validtt, select(C <= 0.0f, vfloat<N>(neg_inf), vfloat<N>(pos_inf)), t_o.lower);
+            t_o.upper = select(validtt, select(C <= 0.0f, vfloat<N>(pos_inf), vfloat<N>(neg_inf)), t_o.upper);
+            valid &= !validtt | C <= 0.0f;
+          }
+
+          /* cone case: the equation degenerates to the linear one B*t + C = 0 */
+          if (any(validtf)) 
+          {
+            /* if we hit the negative cone there cannot be a hit */
+            const vfloat<N> t = -C/B;
+            const vfloat<N> z0 = Oz+t*dOz;
+            const vfloat<N> z0r = r0+z0*dr;
+            valid &= !validtf | z0r >= 0.0f;
+
+            /* test if we start inside or outside the cone */
+            t_o.lower = select(validtf, select(dOz*dr > 0.0f, t, vfloat<N>(neg_inf)), t_o.lower);
+            t_o.upper = select(validtf, select(dOz*dr > 0.0f, vfloat<N>(pos_inf), t), t_o.upper);
+          }
+        }
+
+        /* standard case for "non-parallel" rays */
+        if (likely(any(validf)))
+        {
+          const vfloat<N> Q = sqrt(D);
+          const vfloat<N> rcp_2A = 0.5f*rcp(A);
+          t_o.lower = select(validf, (-B-Q)*rcp_2A, t_o.lower);
+          t_o.upper = select(validf, (-B+Q)*rcp_2A, t_o.upper);
+          
+          /* standard case where both hits are on same cone */
+          const vbool<N> validft = validf &   A>0.0f;
+          const vbool<N> validff = validf & !(A>0.0f);
+          if (any(validft)) {
+            const vfloat<N> z0 = Oz+t_o.lower*dOz;
+            const vfloat<N> z0r = r0+z0*dr;
+            valid &= !validft | z0r >= 0.0f;
+          } 
+
+          /* special case where the hits are on the positive and negative cone */
+          if (any(validff)) {
+            /* depending on the ray direction and the open direction
+             * of the cone we have a hit from inside or outside the
+             * cone */
+            t_o.lower = select(validff, select(dOz*dr > 0.0f, t_o.lower, float(neg_inf)), t_o.lower);
+            t_o.upper = select(validff, select(dOz*dr > 0.0f, float(pos_inf), t_o.upper), t_o.upper);
+          }
+        }
+
+        /* calculates u and Ng for near hit (at t_o.lower) */
+        {
+          u0_o = (Oz+t_o.lower*dOz)*rl;
+          const Vec3vfN Pr = t_o.lower*Vec3vfN(dir);
+          const Vec3vfN Pl = v0 + u0_o*(v1-v0);
+          const Vec3vfN R = normalize(Pr-Pl);
+          const Vec3vfN U = (p1-p0)+(r1-r0)*R;
+          const Vec3vfN V = cross(p1-p0,R);
+          Ng0_o = cross(V,U);
+        }
+
+        /* calculates u and Ng for far hit (at t_o.upper, as in the scalar Cone::intersect) */
+        {
+          u1_o = (Oz+t_o.upper*dOz)*rl;
+          const Vec3vfN Pr = t_o.upper*Vec3vfN(dir); // hit point of the far intersection; t_o.lower here would reuse the near hit
+          const Vec3vfN Pl = v0 + u1_o*(v1-v0);
+          const Vec3vfN R = normalize(Pr-Pl);
+          const Vec3vfN U = (p1-p0)+(r1-r0)*R;
+          const Vec3vfN V = cross(p1-p0,R);
+          Ng1_o = cross(V,U);
+        }
+        return valid;
+      }
+ 
+      __forceinline vbool<N> intersect(const Vec3fa& org, const Vec3fa& dir, BBox<vfloat<N>>& t_o) const
+      {
+        vfloat<N> u0_o; Vec3vfN Ng0_o; vfloat<N> u1_o; Vec3vfN Ng1_o; // u and Ng outputs are computed but discarded
+        return intersect(org,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o);
+      }
+    };
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/geometry/coneline_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/coneline_intersector.h
new file mode 100644
index 0000000000..0902baff7d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/coneline_intersector.h
@@ -0,0 +1,209 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    namespace __coneline_internal 
+    {
+      /* Lane-wise intersection of M rays with M cone segments. v0/v1 carry a segment end point in xyz and the radius at
+       * that end in w; cL/cR mask the lanes whose end-cap disk at v0 resp. v1 is tested. Up to two hits per lane are handed
+       * to epilog (nearest valid one first); ray_tfar is a functor evaluated anew at each use. Returns the OR of the epilog results. */
+      template<int M, typename Epilog, typename ray_tfar_func>
+        static __forceinline bool intersectCone(const vbool<M>& valid_i,
+                                                const Vec3vf<M>& ray_org_in, const Vec3vf<M>& ray_dir, 
+                                                const vfloat<M>& ray_tnear, const ray_tfar_func& ray_tfar,
+                                                const Vec4vf<M>& v0, const Vec4vf<M>& v1, const vbool<M>& cL, const vbool<M>& cR,
+                                                const Epilog& epilog)
+      {   
+        vbool<M> valid = valid_i;
+        /* move ray origin closer to make calculations numerically stable; all t values below are relative to the shifted origin and dt is added back for range tests and reporting */
+        const vfloat<M> dOdO = sqr(ray_dir);
+        const vfloat<M> rcp_dOdO = rcp(dOdO);
+        const Vec3vf<M> center = vfloat<M>(0.5f)*(v0.xyz()+v1.xyz());
+        const vfloat<M> dt = dot(center-ray_org_in,ray_dir)*rcp_dOdO;
+        const Vec3vf<M> ray_org = ray_org_in + dt*ray_dir;
+
+        const Vec3vf<M> dP = v1.xyz() - v0.xyz();
+        const Vec3vf<M> p0 = ray_org - v0.xyz();
+        const Vec3vf<M> p1 = ray_org - v1.xyz();
+        
+        const vfloat<M> dPdP  = sqr(dP);
+        const vfloat<M> dP0   = dot(p0,dP);
+        const vfloat<M> dP1   = dot(p1,dP); 
+        const vfloat<M> dOdP  = dot(ray_dir,dP);
+
+        // intersect cone body: solve A*t^2 + 2*B*t + C = 0, i.e. t = (-B -/+ sqrt(B*B - A*C))/A
+        const vfloat<M> dr  = v0.w - v1.w;
+        const vfloat<M> hy  = dPdP + sqr(dr);
+        const vfloat<M> dO0 = dot(ray_dir,p0);
+        const vfloat<M> OO  = sqr(p0);
+        const vfloat<M> dPdP2 = sqr(dPdP);
+        const vfloat<M> dPdPr0 = dPdP*v0.w;
+        
+        const vfloat<M> A = dPdP2     - sqr(dOdP)*hy;
+        const vfloat<M> B = dPdP2*dO0 - dP0*dOdP*hy   + dPdPr0*(dr*dOdP);
+        const vfloat<M> C = dPdP2*OO  - sqr(dP0)*hy   + dPdPr0*(2.0f*dr*dP0 - dPdPr0);
+        
+        const vfloat<M> D = B*B - A*C;
+        valid &= D >= 0.0f;
+        if (unlikely(none(valid)))
+          return false;
+
+        /* standard case for "non-parallel" rays */
+        const vfloat<M> Q = sqrt(D);
+        const vfloat<M> rcp_A = rcp(A);
+        /* special case for rays that are "parallel" to the cone - assume miss */
+        const vbool<M> isParallel = abs(A) <= min_rcp_input;
+        /* y = dot(hit - v0, dP): a body hit counts only strictly between the end planes (0 < y < dPdP); rejected hits become pos_inf / neg_inf */
+        vfloat<M> t_cone_lower = select (isParallel, neg_inf, (-B-Q)*rcp_A);
+        vfloat<M> t_cone_upper = select (isParallel, pos_inf, (-B+Q)*rcp_A);
+        const vfloat<M> y_lower = dP0 + t_cone_lower*dOdP;
+        const vfloat<M> y_upper = dP0 + t_cone_upper*dOdP;
+        t_cone_lower = select(valid & y_lower > 0.0f & y_lower < dPdP, t_cone_lower, pos_inf);
+        t_cone_upper = select(valid & y_upper > 0.0f & y_upper < dPdP, t_cone_upper, neg_inf);
+        /* end caps: hit of the plane through v0 (v1) perpendicular to dP, accepted when closer to the axis than v0.w (v1.w); pos_inf marks "no cap hit" */
+        const vbool<M> hitDisk0 = valid & cL;
+        const vbool<M> hitDisk1 = valid & cR;
+        const vfloat<M> rcp_dOdP = rcp(dOdP);
+        const vfloat<M> t_disk0 = select (hitDisk0, select (sqr(p0*dOdP-ray_dir*dP0)<(sqr(v0.w)*sqr(dOdP)), -dP0*rcp_dOdP, pos_inf), pos_inf);
+        const vfloat<M> t_disk1 = select (hitDisk1, select (sqr(p1*dOdP-ray_dir*dP1)<(sqr(v1.w)*sqr(dOdP)), -dP1*rcp_dOdP, pos_inf), pos_inf);
+        const vfloat<M> t_disk_lower = min(t_disk0, t_disk1);
+        const vfloat<M> t_disk_upper = max(t_disk0, t_disk1);
+        /* t_lower: nearest of body and cap hits; t_upper: farthest hit, where the cap candidate is the one that did not supply t_lower (neg_inf if there is none) */
+        const vfloat<M> t_lower = min(t_cone_lower, t_disk_lower);
+        const vfloat<M> t_upper = max(t_cone_upper, select(t_lower==t_disk_lower, 
+                                                      select(t_disk_upper==vfloat<M>(pos_inf),neg_inf,t_disk_upper), 
+                                                      select(t_disk_lower==vfloat<M>(pos_inf),neg_inf,t_disk_lower)));
+
+        const vbool<M> valid_lower = valid & ray_tnear <= dt+t_lower & dt+t_lower <= ray_tfar() & t_lower != vfloat<M>(pos_inf);
+        const vbool<M> valid_upper = valid & ray_tnear <= dt+t_upper & dt+t_upper <= ray_tfar() & t_upper != vfloat<M>(neg_inf);
+
+        const vbool<M> valid_first = valid_lower | valid_upper;
+        if (unlikely(none(valid_first)))
+          return false;
+
+        const vfloat<M> t_first = select(valid_lower, t_lower, t_upper);
+        const vfloat<M> y_first = select(valid_lower, y_lower, y_upper);
+
+        const vfloat<M> rcp_dPdP = rcp(dPdP);
+        const Vec3vf<M> dP2drr0dP = dPdP*dr*v0.w*dP;
+        const Vec3vf<M> dPhy = dP*hy;
+        const vbool<M> cone_hit_first = valid & (t_first == t_cone_lower | t_first == t_cone_upper);
+        const vbool<M> disk0_hit_first = valid & (t_first == t_disk0);
+        const Vec3vf<M> Ng_first = select(cone_hit_first, dPdP2*(p0+t_first*ray_dir)+dP2drr0dP-dPhy*y_first, select(disk0_hit_first, -dP, dP));
+        const vfloat<M> u_first = select(cone_hit_first, y_first*rcp_dPdP, select(disk0_hit_first, vfloat<M>(zero), vfloat<M>(one)));
+
+        /* invoke intersection filter for first hit */
+        RoundLineIntersectorHitM<M> hit(u_first,zero,dt+t_first,Ng_first);
+        const bool is_hit_first = epilog(valid_first, hit);
+
+        /* check for possible second hits before potentially accepted hit */
+        const vfloat<M> t_second = t_upper;
+        const vfloat<M> y_second = y_upper;
+        const vbool<M> valid_second = valid_lower & valid_upper & (dt+t_upper <= ray_tfar());
+        if (unlikely(none(valid_second)))
+          return is_hit_first;
+        
+        /* invoke intersection filter for second hit; a cap hit gets u=0 on the v0 disk and u=1 on the v1 disk, chosen by the second hit's own disk mask (as for Ng_second) */
+        const vbool<M> cone_hit_second = t_second == t_cone_lower | t_second == t_cone_upper;
+        const vbool<M> disk0_hit_second = t_second == t_disk0;
+        const Vec3vf<M> Ng_second = select(cone_hit_second, dPdP2*(p0+t_second*ray_dir)+dP2drr0dP-dPhy*y_second, select(disk0_hit_second, -dP, dP));
+        const vfloat<M> u_second = select(cone_hit_second, y_second*rcp_dPdP, select(disk0_hit_second, vfloat<M>(zero), vfloat<M>(one)));
+
+        hit = RoundLineIntersectorHitM<M>(u_second,zero,dt+t_second,Ng_second);
+        const bool is_hit_second = epilog(valid_second, hit);
+        
+        return is_hit_first | is_hit_second;
+      }
+    }
+
+    /* Hit record for M lanes: u/v coordinates, ray parameter t and Ng, readable per lane. */
+    template<int M>
+      struct ConeLineIntersectorHitM {
+        __forceinline ConeLineIntersectorHitM() {}
+        __forceinline ConeLineIntersectorHitM(const vfloat<M>& u_in, const vfloat<M>& v_in, const vfloat<M>& t_in, const Vec3vf<M>& Ng_in)
+          : vu(u_in), vv(v_in), vt(t_in), vNg(Ng_in) {}
+
+        /* empty: nothing is post-processed */
+        __forceinline void finalize() {}
+
+        /* values of lane i */
+        __forceinline Vec2f  uv(const size_t i) const { return Vec2f(vu[i], vv[i]); }
+        __forceinline float  t (const size_t i) const { return vt[i]; }
+        __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]); }
+
+        vfloat<M> vu;
+        vfloat<M> vv;
+        vfloat<M> vt;
+        Vec3vf<M> vNg;
+      };
+    
+    /* Front end for a single Ray: builds M-wide origin/direction/tnear values from it and forwards to __coneline_internal::intersectCone. */
+    template<int M>
+      struct ConeCurveIntersector1 {
+        typedef CurvePrecalculations1 Precalculations;
+        /* functor returning the tfar value the referenced ray holds at the time of the call */
+        struct ray_tfar {
+          Ray& ray;
+          __forceinline ray_tfar(Ray& r) : ray(r) {}
+          __forceinline vfloat<M> operator() () const { return ray.tfar; }
+        };
+
+        /* v0i/v1i are passed through enlargeRadiusToMinWidth before the test; pre is not used here */
+        template<typename Epilog>
+        static __forceinline bool intersect(const vbool<M>& valid_i, Ray& ray,
+                                            IntersectContext* context, const LineSegments* geom,
+                                            const Precalculations& pre,
+                                            const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+                                            const vbool<M>& cL, const vbool<M>& cR,
+                                            const Epilog& epilog)
+        {
+          const Vec3vf<M> org(ray.org.x, ray.org.y, ray.org.z);
+          const Vec3vf<M> dir(ray.dir.x, ray.dir.y, ray.dir.z);
+          const vfloat<M> tnear(ray.tnear());
+          const Vec4vf<M> end0 = enlargeRadiusToMinWidth(context,geom,org,v0i);
+          const Vec4vf<M> end1 = enlargeRadiusToMinWidth(context,geom,org,v1i);
+          const ray_tfar tfar(ray);
+          return __coneline_internal::intersectCone(valid_i,org,dir,tnear,tfar,end0,end1,cL,cR,epilog);
+        }
+      };
+    
+    /* Front end for ray k of a RayK<K> packet: builds M-wide origin/direction/tnear values from that ray and forwards to __coneline_internal::intersectCone. */
+    template<int M, int K>
+      struct ConeCurveIntersectorK {
+        typedef CurvePrecalculationsK<K> Precalculations;
+        /* functor returning the tfar value that ray k of the referenced packet holds at the time of the call */
+        struct ray_tfar {
+          RayK<K>& ray;
+          size_t k;
+          __forceinline ray_tfar(RayK<K>& packet, size_t index) : ray(packet), k(index) {}
+          __forceinline vfloat<M> operator() () const { return ray.tfar[k]; }
+        };
+
+        /* v0i/v1i are passed through enlargeRadiusToMinWidth before the test; pre is not used here */
+        template<typename Epilog>
+        static __forceinline bool intersect(const vbool<M>& valid_i, RayK<K>& ray, size_t k,
+                                            IntersectContext* context, const LineSegments* geom,
+                                            const Precalculations& pre,
+                                            const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+                                            const vbool<M>& cL, const vbool<M>& cR,
+                                            const Epilog& epilog)
+        {
+          const Vec3vf<M> org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+          const Vec3vf<M> dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+          const vfloat<M> tnear = ray.tnear()[k];
+          const Vec4vf<M> end0 = enlargeRadiusToMinWidth(context,geom,org,v0i);
+          const Vec4vf<M> end1 = enlargeRadiusToMinWidth(context,geom,org,v1i);
+          const ray_tfar tfar(ray,k);
+          return __coneline_internal::intersectCone(valid_i,org,dir,tnear,tfar,end0,end1,cL,cR,epilog);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/conelinei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/conelinei_intersector.h
new file mode 100644
index 0000000000..d47218eb8b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/conelinei_intersector.h
@@ -0,0 +1,141 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "coneline_intersector.h"
+#include "intersector_epilog.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /* Single-ray intersector over LineMi<M> blocks of cone segments; end points are gathered without a time argument. */
+    template<int M, int Mx, bool filter>
+    struct ConeCurveMiIntersector1
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+      /* gathers end points v0/v1 and cap masks cL/cR of the block, then runs the cone intersector with an Intersect1EpilogM epilog */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; 
+        vbool<M> cL,cR;
+        line.gather(v0,v1,cL,cR,geom);
+        const vbool<Mx> valid = line.template valid<Mx>();
+        ConeCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Intersect1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+      /* same gather as intersect, but with an Occluded1EpilogM epilog; returns the intersector's result */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; 
+        vbool<M> cL,cR;
+        line.gather(v0,v1,cL,cR,geom);
+        const vbool<Mx> valid = line.template valid<Mx>();
+        return ConeCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Occluded1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+      /* forwards to PrimitivePointQuery1 for this primitive type */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+      }
+    };
+
+    /* Single-ray intersector over LineMi<M> blocks of cone segments; end points are gathered with ray.time() as an extra argument. */
+    template<int M, int Mx, bool filter>
+    struct ConeCurveMiMBIntersector1
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+      /* gathers end points v0/v1 and cap masks cL/cR of the block at ray.time(), then runs the cone intersector with an Intersect1EpilogM epilog */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; 
+        vbool<M> cL,cR;
+        line.gather(v0,v1,cL,cR,geom,ray.time());
+        const vbool<Mx> valid = line.template valid<Mx>();
+        ConeCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Intersect1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+      /* same gather as intersect, but with an Occluded1EpilogM epilog; returns the intersector's result */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; 
+        vbool<M> cL,cR;
+        line.gather(v0,v1,cL,cR,geom,ray.time());
+        const vbool<Mx> valid = line.template valid<Mx>();
+        return ConeCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Occluded1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+      /* forwards to PrimitivePointQuery1 for this primitive type */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+      }
+    };
+
+    /* Packet variant: tests ray k of a K-wide packet against a LineMi<M> block of cone segments; no time argument is passed to gather. */
+    template<int M, int Mx, int K, bool filter>
+    struct ConeCurveMiIntersectorK {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* segments = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> p0, p1; vbool<M> capL, capR;
+        line.gather(p0,p1,capL,capR,segments);
+        const vbool<Mx> active = line.template valid<Mx>();
+        const Intersect1KEpilogM<M,Mx,K,filter> epilog(ray,k,context,line.geomID(),line.primID());
+        ConeCurveIntersectorK<Mx,K>::intersect(active,ray,k,context,segments,pre,p0,p1,capL,capR,epilog);
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* segments = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> p0, p1; vbool<M> capL, capR;
+        line.gather(p0,p1,capL,capR,segments);
+        const vbool<Mx> active = line.template valid<Mx>();
+        const Occluded1KEpilogM<M,Mx,K,filter> epilog(ray,k,context,line.geomID(),line.primID());
+        return ConeCurveIntersectorK<Mx,K>::intersect(active,ray,k,context,segments,pre,p0,p1,capL,capR,epilog);
+      }
+    };
+
+    /* Packet variant: tests ray k of a K-wide packet against a LineMi<M> block of cone segments; gather receives ray.time()[k]. */
+    template<int M, int Mx, int K, bool filter>
+    struct ConeCurveMiMBIntersectorK {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context,  const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* segments = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> p0, p1; vbool<M> capL, capR;
+        line.gather(p0,p1,capL,capR,segments,ray.time()[k]);
+        const vbool<Mx> active = line.template valid<Mx>();
+        const Intersect1KEpilogM<M,Mx,K,filter> epilog(ray,k,context,line.geomID(),line.primID());
+        ConeCurveIntersectorK<Mx,K>::intersect(active,ray,k,context,segments,pre,p0,p1,capL,capR,epilog);
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* segments = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> p0, p1; vbool<M> capL, capR;
+        line.gather(p0,p1,capL,capR,segments,ray.time()[k]);
+        const vbool<Mx> active = line.template valid<Mx>();
+        const Occluded1KEpilogM<M,Mx,K,filter> epilog(ray,k,context,line.geomID(),line.primID());
+        return ConeCurveIntersectorK<Mx,K>::intersect(active,ray,k,context,segments,pre,p0,p1,capL,capR,epilog);
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNi.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi.h
new file mode 100644
index 0000000000..51384f1959
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curveNi.h
@@ -0,0 +1,222 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  /* Leaf block for up to M curve primitives: a type byte, a count byte and a packed byte array holding one geomID, the primIDs, per-primitive int8 space vectors with short bounds, and a shared offset/scale. */
+  template<int M>
+    struct CurveNi
+  {
+    struct Type : public PrimitiveType {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+    /* Returns maximum number of stored primitives */
+    static __forceinline size_t max_size() { return M; }
+
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; }
+    /* Returns the bytes needed for N primitives: N/M full blocks plus, when N%M != 0, one partial block of 22+25*(N%M) bytes */
+    static __forceinline size_t bytes(size_t N)
+    {
+      const size_t f = N/M, r = N%M;
+      static_assert(sizeof(CurveNi) == 22+25*M, "internal data layout issue");
+      return f*sizeof(CurveNi) + (r!=0)*(22 + 25*r);
+    }
+
+  public:
+
+    /*! Default constructor. */
+    __forceinline CurveNi () {}
+
+    /*! fill curve from curve list: encodes the primitives prims[begin..min(begin+M,_end)) into this block and advances begin past them */
+    __forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene)
+    {  
+      size_t end = min(begin+M,_end);
+      N = (uint8_t)(end-begin);
+      const unsigned int geomID0 = prims[begin].geomID();
+      this->geomID(N) = geomID0;
+      ty = (uint8_t) scene->get(geomID0)->getType();
+
+      /* merge the bounds of all primitives of this block (all are asserted to share geomID0) */
+      BBox3fa bounds = empty;
+      for (size_t i=0; i<N; i++)
+      {
+        const PrimRef& prim = prims[begin+i];
+        const unsigned int geomID = prim.geomID(); assert(geomID == geomID0);
+        const unsigned int primID = prim.primID();
+        bounds.extend(scene->get(geomID)->vbounds(primID));
+      }
+
+      /* calculate offset and scale: offset is the lower corner of the merged bounds, scale is forced to 0 when the bounds have zero size */
+      Vec3fa loffset = bounds.lower;
+      float lscale = reduce_min(256.0f/(bounds.size()*sqrt(3.0f)));
+      if (bounds.size() == Vec3fa(zero)) lscale = 0.0f;
+      *this->offset(N) = loffset;
+      *this->scale(N) = lscale;
+      
+      /* encode all primitives: space vectors are scaled by 126 and truncated before the int8 store, bounds are rounded outwards and clamped to [-32767,32767] before the short store */
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+        const PrimRef& prim = prims[begin];
+        const unsigned int geomID = prim.geomID();
+        const unsigned int primID = prim.primID();
+        const LinearSpace3fa space2 = scene->get(geomID)->computeAlignedSpace(primID);
+        
+        const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz));
+        const BBox3fa bounds = scene->get(geomID)->vbounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID);
+        
+        bounds_vx_x(N)[i] = (int8_t) space3.vx.x;
+        bounds_vx_y(N)[i] = (int8_t) space3.vx.y;
+        bounds_vx_z(N)[i] = (int8_t) space3.vx.z;
+        bounds_vx_lower(N)[i] = (short) clamp(floor(bounds.lower.x),-32767.0f,32767.0f);
+        bounds_vx_upper(N)[i] = (short) clamp(ceil (bounds.upper.x),-32767.0f,32767.0f);
+        assert(-32767.0f <= floor(bounds.lower.x) && floor(bounds.lower.x) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.upper.x) && ceil (bounds.upper.x) <= 32767.0f);
+
+        bounds_vy_x(N)[i] = (int8_t) space3.vy.x;
+        bounds_vy_y(N)[i] = (int8_t) space3.vy.y;
+        bounds_vy_z(N)[i] = (int8_t) space3.vy.z;
+        bounds_vy_lower(N)[i] = (short) clamp(floor(bounds.lower.y),-32767.0f,32767.0f);
+        bounds_vy_upper(N)[i] = (short) clamp(ceil (bounds.upper.y),-32767.0f,32767.0f);
+        assert(-32767.0f <= floor(bounds.lower.y) && floor(bounds.lower.y) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.upper.y) && ceil (bounds.upper.y) <= 32767.0f);
+
+        bounds_vz_x(N)[i] = (int8_t) space3.vz.x;
+        bounds_vz_y(N)[i] = (int8_t) space3.vz.y;
+        bounds_vz_z(N)[i] = (int8_t) space3.vz.z;
+        bounds_vz_lower(N)[i] = (short) clamp(floor(bounds.lower.z),-32767.0f,32767.0f);
+        bounds_vz_upper(N)[i] = (short) clamp(ceil (bounds.upper.z),-32767.0f,32767.0f);
+        assert(-32767.0f <= floor(bounds.lower.z) && floor(bounds.lower.z) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.upper.z) && ceil (bounds.upper.z) <= 32767.0f);
+               
+        this->primID(N)[i] = primID;
+      }
+    }
+    /* allocates bytes(set.size()) from alloc, fills blocks(set.size()) blocks from the primitives of set and returns the leaf encoded by bvh */
+    template<typename BVH, typename Allocator>
+      __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range<size_t>& set, const Allocator& alloc)
+    {
+      size_t start = set.begin();
+      size_t items = CurveNi::blocks(set.size());
+      size_t numbytes = CurveNi::bytes(set.size());
+      CurveNi* accel = (CurveNi*) alloc.malloc1(numbytes,BVH::byteAlignment);
+      for (size_t i=0; i<items; i++) {
+        accel[i].fill(prims,start,set.end(),bvh->scene);
+      }
+      return bvh->encodeLeaf((int8_t*)accel,items);
+    };
+    
+  public:
+    
+    // 27.6 - 46 bytes per primitive
+    uint8_t ty;
+    uint8_t N;
+    uint8_t data[4+25*M+16];
+
+    /* layout of data[] for a block storing N primitives; the accessors below compute the matching byte offsets from this
+    struct Layout
+    {
+      unsigned int geomID;
+      unsigned int primID[N];
+      
+      int8_t bounds_vx_x[N];
+      int8_t bounds_vx_y[N];
+      int8_t bounds_vx_z[N];
+      short bounds_vx_lower[N];
+      short bounds_vx_upper[N];
+      
+      int8_t bounds_vy_x[N];
+      int8_t bounds_vy_y[N];
+      int8_t bounds_vy_z[N];
+      short bounds_vy_lower[N];
+      short bounds_vy_upper[N];
+      
+      int8_t bounds_vz_x[N];
+      int8_t bounds_vz_y[N];
+      int8_t bounds_vz_z[N];
+      short bounds_vz_lower[N];
+      short bounds_vz_upper[N];
+      
+      Vec3f offset;
+      float scale;
+    };
+    */
+    /* accessors into the packed layout: the parameter N (it shadows the member N) is the primitive count that fixes the array offsets; geomID and primID ignore it */
+    __forceinline       unsigned int& geomID(size_t N)       { return *(unsigned int*)((int8_t*)this+2); }
+    __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((int8_t*)this+2); }
+    
+    __forceinline       unsigned int* primID(size_t N)       { return (unsigned int*)((int8_t*)this+6); }
+    __forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((int8_t*)this+6); }
+    
+    __forceinline       int8_t* bounds_vx_x(size_t N)       { return (int8_t*)((int8_t*)this+6+4*N); }
+    __forceinline const int8_t* bounds_vx_x(size_t N) const { return (int8_t*)((int8_t*)this+6+4*N); }
+    
+    __forceinline       int8_t* bounds_vx_y(size_t N)       { return (int8_t*)((int8_t*)this+6+5*N); }
+    __forceinline const int8_t* bounds_vx_y(size_t N) const { return (int8_t*)((int8_t*)this+6+5*N); }
+    
+    __forceinline       int8_t* bounds_vx_z(size_t N)       { return (int8_t*)((int8_t*)this+6+6*N); }
+    __forceinline const int8_t* bounds_vx_z(size_t N) const { return (int8_t*)((int8_t*)this+6+6*N); }
+    
+    __forceinline       short* bounds_vx_lower(size_t N)       { return (short*)((int8_t*)this+6+7*N); }
+    __forceinline const short* bounds_vx_lower(size_t N) const { return (short*)((int8_t*)this+6+7*N); }
+    
+    __forceinline       short* bounds_vx_upper(size_t N)       { return (short*)((int8_t*)this+6+9*N); }
+    __forceinline const short* bounds_vx_upper(size_t N) const { return (short*)((int8_t*)this+6+9*N); }
+    
+    __forceinline       int8_t* bounds_vy_x(size_t N)       { return (int8_t*)((int8_t*)this+6+11*N); }
+    __forceinline const int8_t* bounds_vy_x(size_t N) const { return (int8_t*)((int8_t*)this+6+11*N); }
+    
+    __forceinline       int8_t* bounds_vy_y(size_t N)       { return (int8_t*)((int8_t*)this+6+12*N); }
+    __forceinline const int8_t* bounds_vy_y(size_t N) const { return (int8_t*)((int8_t*)this+6+12*N); }
+    
+    __forceinline       int8_t* bounds_vy_z(size_t N)       { return (int8_t*)((int8_t*)this+6+13*N); }
+    __forceinline const int8_t* bounds_vy_z(size_t N) const { return (int8_t*)((int8_t*)this+6+13*N); }
+    
+    __forceinline       short* bounds_vy_lower(size_t N)       { return (short*)((int8_t*)this+6+14*N); }
+    __forceinline const short* bounds_vy_lower(size_t N) const { return (short*)((int8_t*)this+6+14*N); }
+    
+    __forceinline       short* bounds_vy_upper(size_t N)       { return (short*)((int8_t*)this+6+16*N); }
+    __forceinline const short* bounds_vy_upper(size_t N) const { return (short*)((int8_t*)this+6+16*N); }
+    
+    __forceinline       int8_t* bounds_vz_x(size_t N)       { return (int8_t*)((int8_t*)this+6+18*N); }
+    __forceinline const int8_t* bounds_vz_x(size_t N) const { return (int8_t*)((int8_t*)this+6+18*N); }
+    
+    __forceinline       int8_t* bounds_vz_y(size_t N)       { return (int8_t*)((int8_t*)this+6+19*N); }
+    __forceinline const int8_t* bounds_vz_y(size_t N) const { return (int8_t*)((int8_t*)this+6+19*N); }
+    
+    __forceinline       int8_t* bounds_vz_z(size_t N)       { return (int8_t*)((int8_t*)this+6+20*N); }
+    __forceinline const int8_t* bounds_vz_z(size_t N) const { return (int8_t*)((int8_t*)this+6+20*N); }
+    
+    __forceinline       short* bounds_vz_lower(size_t N)       { return (short*)((int8_t*)this+6+21*N); }
+    __forceinline const short* bounds_vz_lower(size_t N) const { return (short*)((int8_t*)this+6+21*N); }
+    
+    __forceinline       short* bounds_vz_upper(size_t N)       { return (short*)((int8_t*)this+6+23*N); }
+    __forceinline const short* bounds_vz_upper(size_t N) const { return (short*)((int8_t*)this+6+23*N); }
+    
+    __forceinline       Vec3f* offset(size_t N)       { return (Vec3f*)((int8_t*)this+6+25*N); }
+    __forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((int8_t*)this+6+25*N); }
+    
+    __forceinline       float* scale(size_t N)       { return (float*)((int8_t*)this+6+25*N+12); }
+    __forceinline const float* scale(size_t N) const { return (float*)((int8_t*)this+6+25*N+12); }
+
+    __forceinline       int8_t* end(size_t N)       { return (int8_t*)this+6+25*N+16; }
+    __forceinline const int8_t* end(size_t N) const { return (int8_t*)this+6+25*N+16; }
+  };
+
+  /* out-of-class definition of the static member CurveNi<M>::type, followed by the 4- and 8-wide block aliases */
+  template<int M> typename CurveNi<M>::Type CurveNi<M>::type;
+
+  using Curve4i = CurveNi<4>;
+  using Curve8i = CurveNi<8>;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNi_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi_intersector.h
new file mode 100644
index 0000000000..0f9038c9fc
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curveNi_intersector.h
@@ -0,0 +1,569 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curveNi.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+    struct CurveNiIntersector1
+    {
+      typedef CurveNi<M> Primitive;
+      typedef Vec3vf<M> Vec3vfM;
+      typedef LinearSpace3<Vec3vfM>LinearSpace3vfM;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline vbool<M> intersect(Ray& ray, const Primitive& prim, vfloat<M>& tNear_o)
+      { // Tests one ray against the stored bounds of all curves in prim; returns the lanes to visit and writes their entry distances to tNear_o.
+        const size_t N = prim.N;
+        const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); // a single unaligned load covers offset (lanes 0-2) and scale (lane 3)
+        const Vec3fa offset = Vec3fa(offset_scale);
+        const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale));
+        const Vec3fa org1 = (ray.org-offset)*scale;
+        const Vec3fa dir1 = ray.dir*scale;
+        /* per-curve 3x3 space assembled from the nine bounds_v{x,y,z}_{x,y,z} arrays, one curve per lane */
+        const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)),
+                                    vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)),
+                                    vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N)));
+        /* move the rescaled ray into each curve's space; rcp_safe presumably guards the reciprocal against zero components */
+        const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1));
+        const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1));
+        const Vec3vfM rcp_dir2 = rcp_safe(dir2);
+        /* parametric distances to the stored lower/upper bound on each axis */
+        const vfloat<M> t_lower_x = (vfloat<M>::load(prim.bounds_vx_lower(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_upper_x = (vfloat<M>::load(prim.bounds_vx_upper(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_lower_y = (vfloat<M>::load(prim.bounds_vy_lower(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_upper_y = (vfloat<M>::load(prim.bounds_vy_upper(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_lower_z = (vfloat<M>::load(prim.bounds_vz_lower(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+        const vfloat<M> t_upper_z = (vfloat<M>::load(prim.bounds_vz_upper(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+        /* interval overlap across the three axes and the ray's [tnear,tfar]; the ends are scaled by (1 -/+ 3 ulp), presumably to absorb rounding error */
+        const vfloat<M> round_up  (1.0f+3.0f*float(ulp));
+        const vfloat<M> round_down(1.0f-3.0f*float(ulp));
+        const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear()));
+        const vfloat<M> tFar  = round_up  *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar));
+        tNear_o = tNear;
+        return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar); // lanes at index >= prim.N are masked out (assuming step enumerates lane indices)
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      { // Runs Intersector on every curve of prim that passes the bounds test, passing four gathered Vec3ff values per curve.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        /* each set bit of mask is a lane that passed the bounds test; bscf presumably returns the lowest set bit and clears it */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N); // not indexed by lane: one geomID for the whole block
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID));
+          /* look ahead: prefetch vertices of the next candidate (L1) and of the one after it (L2) */
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1); // i1 is only read when mask != 0
+          if (mask) {
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+          /* hand the gathered curve data to the Intersector; a fresh Epilog is constructed per candidate */
+          Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar)); // drop candidates whose entry distance exceeds ray.tfar, re-read after the call above
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      { // Returns true as soon as Intersector reports a hit on any curve of prim that passes the bounds test, false otherwise.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        /* each set bit of mask is a lane that passed the bounds test; bscf presumably returns the lowest set bit and clears it */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N); // not indexed by lane: one geomID for the whole block
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID));
+          /* look ahead: prefetch vertices of the next candidate (L1) and of the one after it (L2) */
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1); // i1 is only read when mask != 0
+          if (mask) {
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+
+          if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)))
+            return true;
+          /* no hit reported: drop candidates whose entry distance exceeds ray.tfar, re-read after the call above */
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      { // Like intersect_t, but gathers four additional Vec3fa values n0..n3 per curve (presumably normals) and forwards them too.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        /* each set bit of mask is a lane that passed the bounds test; bscf presumably returns the lowest set bit and clears it */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N); // not indexed by lane: one geomID for the whole block
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          /* gather the a* and n* values for this curve in one call */
+          unsigned int vertexID = geom->curve(primID);
+          Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID);
+          /* look ahead: prefetch vertices of the next candidate (L1) and of the one after it (L2) */
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1); // i1 is only read when mask != 0
+          if (mask) {
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+          /* hand the gathered curve data to the Intersector; a fresh Epilog is constructed per candidate */
+          Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar)); // drop candidates whose entry distance exceeds ray.tfar, re-read after the call above
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      { // Like occluded_t, but gathers four additional Vec3fa values n0..n3 per curve (presumably normals); true on the first reported hit.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        /* each set bit of mask is a lane that passed the bounds test; bscf presumably returns the lowest set bit and clears it */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N); // not indexed by lane: one geomID for the whole block
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          /* gather the a* and n* values for this curve in one call */
+          unsigned int vertexID = geom->curve(primID);
+          Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID);
+          /* look ahead: prefetch vertices of the next candidate (L1) and of the one after it (L2) */
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1); // i1 is only read when mask != 0
+          if (mask) {
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+
+          if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,context,geomID,primID)))
+            return true;
+          /* no hit reported: drop candidates whose entry distance exceeds ray.tfar, re-read after the call above */
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      { // Runs Intersector on every curve of prim that passes the bounds test, using gather_hermite (p0,t0,p1,t1) and no prefetching.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        /* each set bit of mask is a lane that passed the bounds test; bscf presumably returns the lowest set bit and clears it */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N); // not indexed by lane: one geomID for the whole block
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID));
+          Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar)); // drop candidates whose entry distance exceeds ray.tfar, re-read after the call above
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      { // Returns true on the first hit Intersector reports for a curve of prim that passes the bounds test; uses gather_hermite (p0,t0,p1,t1).
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        /* each set bit of mask is a lane that passed the bounds test; bscf presumably returns the lowest set bit and clears it */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N); // not indexed by lane: one geomID for the whole block
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID));
+          if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID)))
+            return true;
+          /* no hit reported: drop candidates whose entry distance exceeds ray.tfar, re-read after the call above */
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      { // Like intersect_h, but gather_hermite also fills n0,dn0,n1,dn1 (Vec3fa), which are forwarded to Intersector as well.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        /* each set bit of mask is a lane that passed the bounds test; bscf presumably returns the lowest set bit and clears it */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N); // not indexed by lane: one geomID for the whole block
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID));
+          Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar)); // drop candidates whose entry distance exceeds ray.tfar, re-read after the call above
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      { // Like occluded_h, but gather_hermite also fills n0,dn0,n1,dn1 (Vec3fa); returns true on the first reported hit.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        /* each set bit of mask is a lane that passed the bounds test; bscf presumably returns the lowest set bit and clears it */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N); // not indexed by lane: one geomID for the whole block
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID));
+          if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,context,geomID,primID)))
+            return true;
+          /* no hit reported: drop candidates whose entry distance exceeds ray.tfar, re-read after the call above */
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+    };
+
+    template<int M, int K>
+      struct CurveNiIntersectorK
+    {
+      typedef CurveNi<M> Primitive;
+      typedef Vec3vf<M> Vec3vfM;
+      typedef LinearSpace3<Vec3vfM>LinearSpace3vfM;
+      typedef CurvePrecalculationsK<K> Precalculations;
+      
+      static __forceinline vbool<M> intersect(RayK<K>& ray, const size_t k, const Primitive& prim, vfloat<M>& tNear_o)
+      { // Tests ray k of the packet against the stored bounds of all curves in prim; returns the lanes to visit and writes their entry distances to tNear_o.
+        const size_t N = prim.N;
+        const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); // a single unaligned load covers offset (lanes 0-2) and scale (lane 3)
+        const Vec3fa offset = Vec3fa(offset_scale);
+        const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale));
+        /* pull origin and direction of ray k out of the packet's per-component arrays */
+        const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]);
+        const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]);
+        const Vec3fa org1 = (ray_org-offset)*scale;
+        const Vec3fa dir1 = ray_dir*scale;
+        /* per-curve 3x3 space assembled from the nine bounds_v{x,y,z}_{x,y,z} arrays, one curve per lane */
+        const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)),
+                                    vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)),
+                                    vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N)));
+        /* move the rescaled ray into each curve's space; rcp_safe presumably guards the reciprocal against zero components */
+        const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1));
+        const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1));
+        const Vec3vfM rcp_dir2 = rcp_safe(dir2);
+        /* parametric distances to the stored lower/upper bound on each axis */
+        const vfloat<M> t_lower_x = (vfloat<M>::load(prim.bounds_vx_lower(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_upper_x = (vfloat<M>::load(prim.bounds_vx_upper(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_lower_y = (vfloat<M>::load(prim.bounds_vy_lower(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_upper_y = (vfloat<M>::load(prim.bounds_vy_upper(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_lower_z = (vfloat<M>::load(prim.bounds_vz_lower(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+        const vfloat<M> t_upper_z = (vfloat<M>::load(prim.bounds_vz_upper(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+        /* interval overlap across the three axes and ray k's [tnear,tfar]; the ends are scaled by (1 -/+ 3 ulp), presumably to absorb rounding error */
+        const vfloat<M> round_up  (1.0f+3.0f*float(ulp));
+        const vfloat<M> round_down(1.0f-3.0f*float(ulp));
+        const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear()[k]));
+        const vfloat<M> tFar  = round_up  *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar[k]));
+        tNear_o = tNear;
+        return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar); // lanes at index >= prim.N are masked out (assuming step enumerates lane indices)
+      }
+      
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      { // For ray k of the packet: runs Intersector on every curve of prim that passes the bounds test, passing four gathered Vec3ff values per curve.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        /* each set bit of mask is a lane that passed the bounds test; bscf presumably returns the lowest set bit and clears it */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N); // not indexed by lane: one geomID for the whole block
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID));
+          /* look ahead: prefetch vertices of the next candidate (L1) and of the one after it (L2) */
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1); // i1 is only read when mask != 0
+          if (mask) {
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+          /* hand the gathered curve data to the Intersector; a fresh Epilog is constructed per candidate */
+          Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); // drop candidates whose entry distance exceeds ray.tfar[k], re-read after the call above
+        }
+      }
+      
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      { // For ray k of the packet: returns true as soon as Intersector reports a hit on a curve of prim that passes the bounds test, false otherwise.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        /* each set bit of mask is a lane that passed the bounds test; bscf presumably returns the lowest set bit and clears it */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N); // not indexed by lane: one geomID for the whole block
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID));
+          /* look ahead: prefetch vertices of the next candidate (L1) and of the one after it (L2) */
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1); // i1 is only read when mask != 0
+          if (mask) {
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+          /* the first curve for which Intersector reports a hit ends the query */
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)))
+            return true;
+          /* no hit reported: drop candidates whose entry distance exceeds ray.tfar[k], re-read after the call above */
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_n(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      { // Like intersect_t for ray k, but gathers four additional Vec3fa values n0..n3 per curve (presumably normals) and forwards them too.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        /* each set bit of mask is a lane that passed the bounds test; bscf presumably returns the lowest set bit and clears it */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N); // not indexed by lane: one geomID for the whole block
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          /* gather the a* and n* values for this curve in one call */
+          unsigned int vertexID = geom->curve(primID);
+          Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID);
+          /* look ahead: prefetch vertices of the next candidate (L1) and of the one after it (L2) */
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1); // i1 is only read when mask != 0
+          if (mask) {
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+          /* hand the gathered curve data to the Intersector; a fresh Epilog is constructed per candidate */
+          Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); // drop candidates whose entry distance exceeds ray.tfar[k], re-read after the call above
+        }
+      }
+      
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_n(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      { // Like occluded_t for ray k, but gathers four additional Vec3fa values n0..n3 per curve (presumably normals); true on the first reported hit.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        /* each set bit of mask is a lane that passed the bounds test; bscf presumably returns the lowest set bit and clears it */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N); // not indexed by lane: one geomID for the whole block
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          /* gather the a* and n* values for this curve in one call */
+          unsigned int vertexID = geom->curve(primID);
+          Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID);
+          /* look ahead: prefetch vertices of the next candidate (L1) and of the one after it (L2) */
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1); // i1 is only read when mask != 0
+          if (mask) {
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,k,context,geomID,primID)))
+            return true;
+          /* no hit reported: drop candidates whose entry distance exceeds ray.tfar[k], re-read after the call above */
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_h(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      { // For ray k: runs Intersector on every curve of prim that passes the bounds test, using gather_hermite (p0,t0,p1,t1) and no prefetching.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        /* each set bit of mask is a lane that passed the bounds test; bscf presumably returns the lowest set bit and clears it */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N); // not indexed by lane: one geomID for the whole block
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID));
+          Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); // drop candidates whose entry distance exceeds ray.tfar[k], re-read after the call above
+        }
+      }
+      
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_h(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      { // For ray k: returns true on the first hit Intersector reports for a curve of prim that passes the bounds test; uses gather_hermite (p0,t0,p1,t1).
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        /* each set bit of mask is a lane that passed the bounds test; bscf presumably returns the lowest set bit and clears it */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N); // not indexed by lane: one geomID for the whole block
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID));
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID)))
+            return true;
+          /* no hit reported: drop candidates whose entry distance exceeds ray.tfar[k], re-read after the call above */
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_hn(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      { // Like intersect_h for ray k, but gather_hermite also fills n0,dn0,n1,dn1 (Vec3fa), which are forwarded to Intersector as well.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        /* each set bit of mask is a lane that passed the bounds test; bscf presumably returns the lowest set bit and clears it */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N); // not indexed by lane: one geomID for the whole block
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID));
+          Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); // drop candidates whose entry distance exceeds ray.tfar[k], re-read after the call above
+        }
+      }
+      
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_hn(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      { // Like occluded_h for ray k, but gather_hermite also fills n0,dn0,n1,dn1 (Vec3fa); returns true on the first reported hit.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        /* each set bit of mask is a lane that passed the bounds test; bscf presumably returns the lowest set bit and clears it */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N); // not indexed by lane: one geomID for the whole block
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID));
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,k,context,geomID,primID)))
+            return true;
+          /* no hit reported: drop candidates whose entry distance exceeds ray.tfar[k], re-read after the call above */
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb.h
new file mode 100644
index 0000000000..0cd8f833fd
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb.h
@@ -0,0 +1,278 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  template<int M>
+    struct CurveNiMB
+  {
+    struct Type : public PrimitiveType { // type descriptor of this primitive; the member functions are only declared here
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type; // shared descriptor instance; its definition follows the struct
+
+  public:
+
+    /* Returns the maximum number of primitives a single block can store, which is the template parameter M */
+    static __forceinline size_t max_size() { return M; }
+
+    /* Returns the number of primitive blocks required for N primitives, i.e. N/M rounded up */
+    static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; }
+
+    static __forceinline size_t bytes(size_t N) // number of bytes needed to store N primitives
+    {
+      const size_t f = N/M, r = N%M; // f completely filled blocks, r primitives left over for a last, partial block
+      static_assert(sizeof(CurveNiMB) == 6+37*M+24, "internal data layout issue");
+      return f*sizeof(CurveNiMB) + (r!=0)*(6+37*r+24); // a partial block is charged only for the r primitives it holds
+    }
+
+  public:
+
+    /*! Default constructor; the empty body leaves ty, N and data uninitialized. */
+    __forceinline CurveNiMB () {}
+
+    /*! Fills this block with up to M curves from prims[begin.._end) and advances begin past the consumed entries. Returns the linear bounds merged from vlinearBounds() of the stored curves over time_range. */
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t _end, Scene* scene, const BBox1f time_range)
+    {
+      size_t end = min(begin+M,_end);
+      N = (uint8_t)(end-begin);
+      const unsigned int geomID0 = prims[begin].geomID();
+      this->geomID(N) = geomID0; // one geomID per block: every curve in it is expected to share geomID0 (asserted below)
+      ty = (uint8_t) scene->get(geomID0)->getType();
+
+      /* accumulate the linear bounds of all primitives of this block */
+      LBBox3fa lbounds = empty;
+      for (size_t i=0; i<N; i++)
+      {
+        const PrimRefMB& prim = prims[begin+i];
+        const unsigned int geomID = prim.geomID(); assert(geomID == geomID0);
+        const unsigned int primID = prim.primID();
+        lbounds.extend(scene->get(geomID)->vlinearBounds(primID,time_range));
+      }
+      BBox3fa bounds = lbounds.bounds();
+
+      /* calculate offset and scale passed to vlinearBounds below: offset is the lower corner, scale is forced to 0 for zero-sized bounds */
+      Vec3fa loffset = bounds.lower;
+      float lscale = reduce_min(256.0f/(bounds.size()*sqrt(3.0f)));
+      if (bounds.size() == Vec3fa(zero)) lscale = 0.0f;
+      *this->offset(N) = loffset;
+      *this->scale(N) = lscale;
+      this->time_offset(N) = time_range.lower;
+      this->time_scale(N) = 1.0f/time_range.size(); // NOTE(review): not finite when time_range.size() is 0 - confirm callers never pass an empty range
+      
+      /* quantize and store the local space and the bounds0/bounds1 boxes of every primitive */
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+        const PrimRefMB& prim = prims[begin];
+        const unsigned int geomID = prim.geomID();
+        const unsigned int primID = prim.primID();
+        const LinearSpace3fa space2 = scene->get(geomID)->computeAlignedSpaceMB(primID,time_range);
+        // scale the axes by 126 and truncate them; the components are stored below as int8_t
+        const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz));
+        const LBBox3fa bounds = scene->get(geomID)->vlinearBounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID,time_range);
+        
+        // NOTE: this weird (int8_t) (short) cast works around VS2015 Win32 compiler bug
+        bounds_vx_x(N)[i] = (int8_t) (short) space3.vx.x;
+        bounds_vx_y(N)[i] = (int8_t) (short) space3.vx.y;
+        bounds_vx_z(N)[i] = (int8_t) (short) space3.vx.z;
+        bounds_vx_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.x),-32767.0f,32767.0f); // rounded outwards (floor/ceil) and clamped to +-32767 before the cast to short
+        bounds_vx_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.x),-32767.0f,32767.0f);
+        bounds_vx_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.x),-32767.0f,32767.0f);
+        bounds_vx_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.x),-32767.0f,32767.0f);
+        assert(-32767.0f <= floor(bounds.bounds0.lower.x) && floor(bounds.bounds0.lower.x) <= 32767.0f); // debug check: the clamp above must not have changed the value
+        assert(-32767.0f <= ceil (bounds.bounds0.upper.x) && ceil (bounds.bounds0.upper.x) <= 32767.0f);
+        assert(-32767.0f <= floor(bounds.bounds1.lower.x) && floor(bounds.bounds1.lower.x) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.bounds1.upper.x) && ceil (bounds.bounds1.upper.x) <= 32767.0f);
+        
+        bounds_vy_x(N)[i] = (int8_t) (short) space3.vy.x;
+        bounds_vy_y(N)[i] = (int8_t) (short) space3.vy.y;
+        bounds_vy_z(N)[i] = (int8_t) (short) space3.vy.z;
+        bounds_vy_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.y),-32767.0f,32767.0f);
+        bounds_vy_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.y),-32767.0f,32767.0f);
+        bounds_vy_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.y),-32767.0f,32767.0f);
+        bounds_vy_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.y),-32767.0f,32767.0f);
+        assert(-32767.0f <= floor(bounds.bounds0.lower.y) && floor(bounds.bounds0.lower.y) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.bounds0.upper.y) && ceil (bounds.bounds0.upper.y) <= 32767.0f);
+        assert(-32767.0f <= floor(bounds.bounds1.lower.y) && floor(bounds.bounds1.lower.y) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.bounds1.upper.y) && ceil (bounds.bounds1.upper.y) <= 32767.0f);
+
+        bounds_vz_x(N)[i] = (int8_t) (short) space3.vz.x;
+        bounds_vz_y(N)[i] = (int8_t) (short) space3.vz.y;
+        bounds_vz_z(N)[i] = (int8_t) (short) space3.vz.z;
+        bounds_vz_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.z),-32767.0f,32767.0f);
+        bounds_vz_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.z),-32767.0f,32767.0f);
+        bounds_vz_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.z),-32767.0f,32767.0f);
+        bounds_vz_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.z),-32767.0f,32767.0f);
+        assert(-32767.0f <= floor(bounds.bounds0.lower.z) && floor(bounds.bounds0.lower.z) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.bounds0.upper.z) && ceil (bounds.bounds0.upper.z) <= 32767.0f);
+        assert(-32767.0f <= floor(bounds.bounds1.lower.z) && floor(bounds.bounds1.lower.z) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.bounds1.upper.z) && ceil (bounds.bounds1.upper.z) <= 32767.0f);
+               
+        this->primID(N)[i] = primID;
+      }
+      
+      return lbounds;
+    }
+
+    template<typename BVH, typename SetMB, typename Allocator>
+    __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc)
+    { // Builds a leaf for the primitive set prims: allocates the blocks, fills them and returns the leaf's node record.
+      size_t start = prims.begin();
+      size_t end   = prims.end();
+      size_t items = CurveNiMB::blocks(prims.size());
+      size_t numbytes = CurveNiMB::bytes(prims.size()); // the last block may be partial, so this can be less than items*sizeof(CurveNiMB)
+      CurveNiMB* accel = (CurveNiMB*) alloc.malloc1(numbytes,BVH::byteAlignment);
+      const typename BVH::NodeRef node = bvh->encodeLeaf((int8_t*)accel,items);
+      // fillMB advances start by the number of primitives it consumed, so consecutive calls walk through [start,end)
+      LBBox3fa bounds = empty;
+      for (size_t i=0; i<items; i++)
+        bounds.extend(accel[i].fillMB(prims.prims->data(),start,end,bvh->scene,prims.time_range));
+      
+      return typename BVH::NodeRecordMB4D(node,bounds,prims.time_range);
+    };
+
+    
+  public:
+    
+    // 40.75 (full block with M=8) to 67 (block holding a single primitive) bytes per primitive; a block of N primitives uses 6+37*N+24 bytes
+    uint8_t ty; // set in fillMB from getType() of the block's geometry
+    uint8_t N;  // number of primitives stored in this block
+    uint8_t data[4+37*M+24]; // 4 bytes geomID + 37 bytes per primitive + 24 bytes trailer (offset, scale, time_offset, time_scale)
+
+    /* Conceptual layout of the bytes that follow ty and N (not a real struct, as N is only known at run time):
+    struct Layout
+    {
+      unsigned int geomID;
+      unsigned int primID[N];
+      
+      int8_t bounds_vx_x[N];
+      int8_t bounds_vx_y[N];
+      int8_t bounds_vx_z[N];
+      short bounds_vx_lower0[N];
+      short bounds_vx_upper0[N];
+      short bounds_vx_lower1[N];
+      short bounds_vx_upper1[N];
+      
+      int8_t bounds_vy_x[N];
+      int8_t bounds_vy_y[N];
+      int8_t bounds_vy_z[N];
+      short bounds_vy_lower0[N];
+      short bounds_vy_upper0[N];
+      short bounds_vy_lower1[N];
+      short bounds_vy_upper1[N];
+      
+      int8_t bounds_vz_x[N];
+      int8_t bounds_vz_y[N];
+      int8_t bounds_vz_z[N];
+      short bounds_vz_lower0[N];
+      short bounds_vz_upper0[N];
+      short bounds_vz_lower1[N];
+      short bounds_vz_upper1[N];
+      
+      Vec3f offset;
+      float scale;
+
+      float time_offset;
+      float time_scale;
+    };
+    */
+    
+    __forceinline       unsigned int& geomID(size_t N)       { return *(unsigned int*)((int8_t*)this+2); } // geomID sits at byte 2, right behind ty and N, so the N argument is unused here
+    __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((int8_t*)this+2); }
+    // primID[N] starts at byte 6, directly after the 4-byte geomID
+    __forceinline       unsigned int* primID(size_t N)       { return (unsigned int*)((int8_t*)this+6); }
+    __forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((int8_t*)this+6); }
+    // vx group: three int8_t[N] arrays followed by four short[N] arrays (lower0, upper0, lower1, upper1); all offsets scale with the runtime count N
+    __forceinline       int8_t* bounds_vx_x(size_t N)       { return (int8_t*)((int8_t*)this+6+4*N); }
+    __forceinline const int8_t* bounds_vx_x(size_t N) const { return (int8_t*)((int8_t*)this+6+4*N); }
+    
+    __forceinline       int8_t* bounds_vx_y(size_t N)       { return (int8_t*)((int8_t*)this+6+5*N); }
+    __forceinline const int8_t* bounds_vx_y(size_t N) const { return (int8_t*)((int8_t*)this+6+5*N); }
+    
+    __forceinline       int8_t* bounds_vx_z(size_t N)       { return (int8_t*)((int8_t*)this+6+6*N); }
+    __forceinline const int8_t* bounds_vx_z(size_t N) const { return (int8_t*)((int8_t*)this+6+6*N); }
+    
+    __forceinline       short* bounds_vx_lower0(size_t N)       { return (short*)((int8_t*)this+6+7*N); }
+    __forceinline const short* bounds_vx_lower0(size_t N) const { return (short*)((int8_t*)this+6+7*N); }
+    
+    __forceinline       short* bounds_vx_upper0(size_t N)       { return (short*)((int8_t*)this+6+9*N); }
+    __forceinline const short* bounds_vx_upper0(size_t N) const { return (short*)((int8_t*)this+6+9*N); }
+
+    __forceinline       short* bounds_vx_lower1(size_t N)       { return (short*)((int8_t*)this+6+11*N); }
+    __forceinline const short* bounds_vx_lower1(size_t N) const { return (short*)((int8_t*)this+6+11*N); }
+    
+    __forceinline       short* bounds_vx_upper1(size_t N)       { return (short*)((int8_t*)this+6+13*N); }
+    __forceinline const short* bounds_vx_upper1(size_t N) const { return (short*)((int8_t*)this+6+13*N); }
+    // vy group: same arrangement as the vx group, starting at byte 6+15*N
+    __forceinline       int8_t* bounds_vy_x(size_t N)       { return (int8_t*)((int8_t*)this+6+15*N); }
+    __forceinline const int8_t* bounds_vy_x(size_t N) const { return (int8_t*)((int8_t*)this+6+15*N); }
+    
+    __forceinline       int8_t* bounds_vy_y(size_t N)       { return (int8_t*)((int8_t*)this+6+16*N); }
+    __forceinline const int8_t* bounds_vy_y(size_t N) const { return (int8_t*)((int8_t*)this+6+16*N); }
+    
+    __forceinline       int8_t* bounds_vy_z(size_t N)       { return (int8_t*)((int8_t*)this+6+17*N); }
+    __forceinline const int8_t* bounds_vy_z(size_t N) const { return (int8_t*)((int8_t*)this+6+17*N); }
+    
+    __forceinline       short* bounds_vy_lower0(size_t N)       { return (short*)((int8_t*)this+6+18*N); }
+    __forceinline const short* bounds_vy_lower0(size_t N) const { return (short*)((int8_t*)this+6+18*N); }
+    
+    __forceinline       short* bounds_vy_upper0(size_t N)       { return (short*)((int8_t*)this+6+20*N); }
+    __forceinline const short* bounds_vy_upper0(size_t N) const { return (short*)((int8_t*)this+6+20*N); }
+
+    __forceinline       short* bounds_vy_lower1(size_t N)       { return (short*)((int8_t*)this+6+22*N); }
+    __forceinline const short* bounds_vy_lower1(size_t N) const { return (short*)((int8_t*)this+6+22*N); }
+    
+    __forceinline       short* bounds_vy_upper1(size_t N)       { return (short*)((int8_t*)this+6+24*N); }
+    __forceinline const short* bounds_vy_upper1(size_t N) const { return (short*)((int8_t*)this+6+24*N); }
+    // vz group: same arrangement again, starting at byte 6+26*N
+    __forceinline       int8_t* bounds_vz_x(size_t N)       { return (int8_t*)((int8_t*)this+6+26*N); }
+    __forceinline const int8_t* bounds_vz_x(size_t N) const { return (int8_t*)((int8_t*)this+6+26*N); }
+    
+    __forceinline       int8_t* bounds_vz_y(size_t N)       { return (int8_t*)((int8_t*)this+6+27*N); }
+    __forceinline const int8_t* bounds_vz_y(size_t N) const { return (int8_t*)((int8_t*)this+6+27*N); }
+    
+    __forceinline       int8_t* bounds_vz_z(size_t N)       { return (int8_t*)((int8_t*)this+6+28*N); }
+    __forceinline const int8_t* bounds_vz_z(size_t N) const { return (int8_t*)((int8_t*)this+6+28*N); }
+    
+    __forceinline       short* bounds_vz_lower0(size_t N)       { return (short*)((int8_t*)this+6+29*N); }
+    __forceinline const short* bounds_vz_lower0(size_t N) const { return (short*)((int8_t*)this+6+29*N); }
+    
+    __forceinline       short* bounds_vz_upper0(size_t N)       { return (short*)((int8_t*)this+6+31*N); }
+    __forceinline const short* bounds_vz_upper0(size_t N) const { return (short*)((int8_t*)this+6+31*N); }
+
+    __forceinline       short* bounds_vz_lower1(size_t N)       { return (short*)((int8_t*)this+6+33*N); }
+    __forceinline const short* bounds_vz_lower1(size_t N) const { return (short*)((int8_t*)this+6+33*N); }
+    
+    __forceinline       short* bounds_vz_upper1(size_t N)       { return (short*)((int8_t*)this+6+35*N); }
+    __forceinline const short* bounds_vz_upper1(size_t N) const { return (short*)((int8_t*)this+6+35*N); }
+    // 24-byte trailer behind the 37*N bytes of per-primitive data: Vec3f offset, float scale, float time_offset, float time_scale
+    __forceinline       Vec3f* offset(size_t N)       { return (Vec3f*)((int8_t*)this+6+37*N); }
+    __forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((int8_t*)this+6+37*N); }
+    
+    __forceinline       float* scale(size_t N)       { return (float*)((int8_t*)this+6+37*N+12); }
+    __forceinline const float* scale(size_t N) const { return (float*)((int8_t*)this+6+37*N+12); }
+
+    __forceinline       float& time_offset(size_t N)       { return *(float*)((int8_t*)this+6+37*N+16); }
+    __forceinline const float& time_offset(size_t N) const { return *(float*)((int8_t*)this+6+37*N+16); }
+    
+    __forceinline       float& time_scale(size_t N)       { return *(float*)((int8_t*)this+6+37*N+20); }
+    __forceinline const float& time_scale(size_t N) const { return *(float*)((int8_t*)this+6+37*N+20); }
+    // address one past the last byte used by a block that holds N primitives
+    __forceinline       int8_t* end(size_t N)       { return (int8_t*)this+6+37*N+24; }
+    __forceinline const int8_t* end(size_t N) const { return (int8_t*)this+6+37*N+24; }
+  };
+
+  template<int M>
+    typename CurveNiMB<M>::Type CurveNiMB<M>::type; // out-of-class definition of the static type descriptor
+  // aliases for blocks holding up to 4 and up to 8 curves
+  typedef CurveNiMB<4> Curve4iMB;
+  typedef CurveNiMB<8> Curve8iMB;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb_intersector.h
new file mode 100644
index 0000000000..0cbc764668
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb_intersector.h
@@ -0,0 +1,516 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curveNi_mb.h"
+#include "../subdiv/linear_bezier_patch.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+      struct CurveNiMBIntersector1
+    {
+      typedef CurveNiMB<M> Primitive;
+      typedef Vec3vf<M> Vec3vfM;
+      typedef LinearSpace3<Vec3vfM>LinearSpace3vfM;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline vbool<M> intersect(Ray& ray, const Primitive& prim, vfloat<M>& tNear_o)
+      { // Tests ray against the time-interpolated, quantized bounds of the primitives in prim; returns the candidate lanes and writes their entry distances to tNear_o.
+        const size_t N = prim.N;
+        const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); // one 4-float load covers the Vec3f offset and the float scale stored right behind it
+        const Vec3fa offset = Vec3fa(offset_scale);
+        const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale));
+        const Vec3fa org1 = (ray.org-offset)*scale;
+        const Vec3fa dir1 = ray.dir*scale;
+        // per-primitive space, read from the int8_t arrays of the block
+        const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)),
+                                    vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)),
+                                    vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N)));
+        // move the scaled ray into the space of every primitive
+        const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1));
+        const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1));
+        const Vec3vfM rcp_dir2 = rcp_safe(dir2);
+        // blend the two stored bound sets at the block-relative ray time: b0 + ltime*(b1-b0), assuming madd(a,b,c) is a*b+c
+        const vfloat<M> ltime = (ray.time()-prim.time_offset(N))*prim.time_scale(N);
+        const vfloat<M> vx_lower0 = vfloat<M>::load(prim.bounds_vx_lower0(N));
+        const vfloat<M> vx_lower1 = vfloat<M>::load(prim.bounds_vx_lower1(N));
+        const vfloat<M> vx_lower = madd(ltime,vx_lower1-vx_lower0,vx_lower0);
+        const vfloat<M> vx_upper0 = vfloat<M>::load(prim.bounds_vx_upper0(N));
+        const vfloat<M> vx_upper1 = vfloat<M>::load(prim.bounds_vx_upper1(N));
+        const vfloat<M> vx_upper = madd(ltime,vx_upper1-vx_upper0,vx_upper0);
+
+        const vfloat<M> vy_lower0 = vfloat<M>::load(prim.bounds_vy_lower0(N));
+        const vfloat<M> vy_lower1 = vfloat<M>::load(prim.bounds_vy_lower1(N));
+        const vfloat<M> vy_lower = madd(ltime,vy_lower1-vy_lower0,vy_lower0);
+        const vfloat<M> vy_upper0 = vfloat<M>::load(prim.bounds_vy_upper0(N));
+        const vfloat<M> vy_upper1 = vfloat<M>::load(prim.bounds_vy_upper1(N));
+        const vfloat<M> vy_upper = madd(ltime,vy_upper1-vy_upper0,vy_upper0);
+        
+        const vfloat<M> vz_lower0 = vfloat<M>::load(prim.bounds_vz_lower0(N));
+        const vfloat<M> vz_lower1 = vfloat<M>::load(prim.bounds_vz_lower1(N));
+        const vfloat<M> vz_lower = madd(ltime,vz_lower1-vz_lower0,vz_lower0);
+        const vfloat<M> vz_upper0 = vfloat<M>::load(prim.bounds_vz_upper0(N));
+        const vfloat<M> vz_upper1 = vfloat<M>::load(prim.bounds_vz_upper1(N));
+        const vfloat<M> vz_upper = madd(ltime,vz_upper1-vz_upper0,vz_upper0);
+        // ray parameters at which the lower and upper bounding planes of each axis are reached
+        const vfloat<M> t_lower_x = (vx_lower-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_upper_x = (vx_upper-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_lower_y = (vy_lower-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_upper_y = (vy_upper-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_lower_z = (vz_lower-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+        const vfloat<M> t_upper_z = (vz_upper-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+        // the interval ends are scaled by (1 -/+ 3 ulp), presumably to keep the test conservative under rounding
+        const vfloat<M> round_up  (1.0f+3.0f*float(ulp));
+        const vfloat<M> round_down(1.0f-3.0f*float(ulp));
+        const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear()));
+        const vfloat<M> tFar  = round_up  *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar));
+        tNear_o = tNear;
+        return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar); // lanes at index prim.N and above hold no primitive (assuming step expands to 0,1,2,...)
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      { // Runs Intersector on every curve of prim whose bounds pass intersect(), using values gathered at the ray's time.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        // visit the candidate lanes one by one; i is the lane index obtained from mask via bscf()
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time());
+
+          Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar)); // drop candidates whose tNear lies beyond the current ray.tfar
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      { // Occlusion variant of the gather()-based test: returns true on the first candidate curve for which Intersector reports a hit.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        // visit the candidate lanes one by one; i is the lane index obtained from mask via bscf()
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time());
+
+          if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)))
+              return true;
+          // no hit: keep only the candidates whose tNear does not exceed ray.tfar
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      { // Runs Intersector on every candidate curve of prim, passing the surface returned by getNormalOrientedCurve() for the ray's origin and time.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        // visit the candidate lanes one by one; i is the lane index obtained from mask via bscf()
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time());
+          Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar)); // drop candidates whose tNear lies beyond the current ray.tfar
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      { // Occlusion variant of the getNormalOrientedCurve()-based test: returns true on the first candidate for which Intersector reports a hit.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        // visit the candidate lanes one by one; i is the lane index obtained from mask via bscf()
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time());
+
+          if (Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID)))
+              return true;
+          // no hit: keep only the candidates whose tNear does not exceed ray.tfar
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      { // Runs Intersector on every candidate curve of prim, using the p0,t0,p1,t1 values from gather_hermite() at the ray's time.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        // visit the candidate lanes one by one; i is the lane index obtained from mask via bscf()
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time());
+          Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar)); // drop candidates whose tNear lies beyond the current ray.tfar
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      { // Occlusion variant of the gather_hermite()-based test: returns true on the first candidate for which Intersector reports a hit.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        // visit the candidate lanes one by one; i is the lane index obtained from mask via bscf()
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time());
+          if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID)))
+              return true;
+          // no hit: keep only the candidates whose tNear does not exceed ray.tfar
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      { // Runs Intersector on every candidate curve of prim, passing the surface returned by getNormalOrientedHermiteCurve() for the ray's origin and time.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        // visit the candidate lanes one by one; i is the lane index obtained from mask via bscf()
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time());
+          Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar)); // drop candidates whose tNear lies beyond the current ray.tfar
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      { // Occlusion variant of the getNormalOrientedHermiteCurve()-based test: returns true on the first candidate for which Intersector reports a hit.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        // visit the candidate lanes one by one; i is the lane index obtained from mask via bscf()
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time());
+          if (Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID)))
+              return true;
+          // no hit: keep only the candidates whose tNear does not exceed ray.tfar
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+    };
+
+    template<int M, int K>
+      struct CurveNiMBIntersectorK
+    {
+      typedef CurveNiMB<M> Primitive;
+      typedef Vec3vf<M> Vec3vfM;
+      typedef LinearSpace3<Vec3vfM>LinearSpace3vfM;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline vbool<M> intersect(RayK<K>& ray, const size_t k, const Primitive& prim, vfloat<M>& tNear_o)
+      { // Tests ray k of the packet against the time-interpolated, quantized bounds of the primitives in prim; returns the candidate lanes and writes their entry distances to tNear_o.
+        const size_t N = prim.N;
+        const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); // one 4-float load covers the Vec3f offset and the float scale stored right behind it
+        const Vec3fa offset = Vec3fa(offset_scale);
+        const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale));
+        // extract origin and direction of ray k from the packet
+        const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]);
+        const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]);
+        const Vec3fa org1 = (ray_org-offset)*scale;
+        const Vec3fa dir1 = ray_dir*scale;
+        // per-primitive space, read from the int8_t arrays of the block
+        const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)),
+                                    vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)),
+                                    vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N)));
+        // move the scaled ray into the space of every primitive
+        const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1));
+        const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1));
+        const Vec3vfM rcp_dir2 = rcp_safe(dir2);
+        // blend the two stored bound sets at the block-relative ray time: b0 + ltime*(b1-b0), assuming madd(a,b,c) is a*b+c
+        const vfloat<M> ltime = (ray.time()[k]-prim.time_offset(N))*prim.time_scale(N);
+        const vfloat<M> vx_lower0 = vfloat<M>::load(prim.bounds_vx_lower0(N));
+        const vfloat<M> vx_lower1 = vfloat<M>::load(prim.bounds_vx_lower1(N));
+        const vfloat<M> vx_lower = madd(ltime,vx_lower1-vx_lower0,vx_lower0);
+        const vfloat<M> vx_upper0 = vfloat<M>::load(prim.bounds_vx_upper0(N));
+        const vfloat<M> vx_upper1 = vfloat<M>::load(prim.bounds_vx_upper1(N));
+        const vfloat<M> vx_upper = madd(ltime,vx_upper1-vx_upper0,vx_upper0);
+
+        const vfloat<M> vy_lower0 = vfloat<M>::load(prim.bounds_vy_lower0(N));
+        const vfloat<M> vy_lower1 = vfloat<M>::load(prim.bounds_vy_lower1(N));
+        const vfloat<M> vy_lower = madd(ltime,vy_lower1-vy_lower0,vy_lower0);
+        const vfloat<M> vy_upper0 = vfloat<M>::load(prim.bounds_vy_upper0(N));
+        const vfloat<M> vy_upper1 = vfloat<M>::load(prim.bounds_vy_upper1(N));
+        const vfloat<M> vy_upper = madd(ltime,vy_upper1-vy_upper0,vy_upper0);
+        
+        const vfloat<M> vz_lower0 = vfloat<M>::load(prim.bounds_vz_lower0(N));
+        const vfloat<M> vz_lower1 = vfloat<M>::load(prim.bounds_vz_lower1(N));
+        const vfloat<M> vz_lower = madd(ltime,vz_lower1-vz_lower0,vz_lower0);
+        const vfloat<M> vz_upper0 = vfloat<M>::load(prim.bounds_vz_upper0(N));
+        const vfloat<M> vz_upper1 = vfloat<M>::load(prim.bounds_vz_upper1(N));
+        const vfloat<M> vz_upper = madd(ltime,vz_upper1-vz_upper0,vz_upper0);
+        // ray parameters at which the lower and upper bounding planes of each axis are reached
+        const vfloat<M> t_lower_x = (vx_lower-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_upper_x = (vx_upper-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_lower_y = (vy_lower-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_upper_y = (vy_upper-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_lower_z = (vz_lower-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+        const vfloat<M> t_upper_z = (vz_upper-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+        // the interval ends are scaled by (1 -/+ 3 ulp), presumably to keep the test conservative under rounding
+        const vfloat<M> round_up  (1.0f+3.0f*float(ulp));
+        const vfloat<M> round_down(1.0f-3.0f*float(ulp));
+        const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear()[k]));
+        const vfloat<M> tFar  = round_up  *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar[k]));
+        tNear_o = tNear;
+        return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar); // lanes at index prim.N and above hold no primitive (assuming step expands to 0,1,2,...)
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        // Runs Intersector for ray k of the packet on every curve of prim whose bounds pass intersect(), using values gathered at that ray's time.
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        // visit the candidate lanes one by one; i is the lane index obtained from mask via bscf()
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()[k]);
+
+          Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); // drop candidates whose tNear lies beyond the current ray.tfar[k]
+        }
+      }
+
+      template<typename Intersector, typename Epilog> // occlusion query for lane k; control points are gathered from the geometry at the ray's time
+        static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear; // per-curve entry distance, written by intersect()
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        /* test the candidate curves one by one until the first reported hit */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask); // slot of the next candidate (bscf presumably returns and clears the lowest set bit -- TODO confirm)
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()[k]); // four control points at time ray.time()[k]
+          /* any hit reported by the Intersector ends the query immediately */
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)))
+            return true;
+          /* re-cull the remaining candidates against the current ray.tfar[k] */
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false; // no candidate curve reported a hit
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_n(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        /* Intersects lane k with the candidate curves of 'prim', each represented as a TensorLinearCubicBezierSurface3fa obtained from getNormalOrientedCurve(). */
+        vfloat<M> tNear; // per-curve entry distance, written by intersect()
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        /* walk over the candidate curves, one set bit of the mask at a time */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask); // slot of the next candidate (bscf presumably returns and clears the lowest set bit -- TODO confirm)
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); // origin of lane k extracted from the packet
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]);
+          Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); // drop remaining candidates whose entry distance lies beyond the current ray.tfar[k]
+        }
+      }
+
+      template<typename Intersector, typename Epilog> // occlusion query for lane k using the surface returned by getNormalOrientedCurve()
+        static __forceinline bool occluded_n(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear; // per-curve entry distance, written by intersect()
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        /* test the candidate curves one by one until the first reported hit */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask); // slot of the next candidate (bscf presumably returns and clears the lowest set bit -- TODO confirm)
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); // origin of lane k extracted from the packet
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]);
+          /* any hit reported by the Intersector ends the query immediately */
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID)))
+            return true;
+          /* re-cull the remaining candidates against the current ray.tfar[k] */
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false; // no candidate curve reported a hit
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_h(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        /* Intersects lane k with the candidate curves of 'prim'; curve data is fetched with gather_hermite() at the ray's time. */
+        vfloat<M> tNear; // per-curve entry distance, written by intersect()
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        /* walk over the candidate curves, one set bit of the mask at a time */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask); // slot of the next candidate (bscf presumably returns and clears the lowest set bit -- TODO confirm)
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()[k]); // presumably end points p0/p1 with tangents t0/t1 -- TODO confirm
+          Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); // drop remaining candidates whose entry distance lies beyond the current ray.tfar[k]
+        }
+      }
+
+      template<typename Intersector, typename Epilog> // occlusion query for lane k; curve data is fetched with gather_hermite() at the ray's time
+        static __forceinline bool occluded_h(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear; // per-curve entry distance, written by intersect()
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        /* test the candidate curves one by one until the first reported hit */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask); // slot of the next candidate (bscf presumably returns and clears the lowest set bit -- TODO confirm)
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()[k]); // presumably end points p0/p1 with tangents t0/t1 -- TODO confirm
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID)))
+            return true;
+          /* re-cull the remaining candidates against the current ray.tfar[k] */
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false; // no candidate curve reported a hit
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_hn(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        /* Intersects lane k with the candidate curves of 'prim', each represented as a TensorLinearCubicBezierSurface3fa obtained from getNormalOrientedHermiteCurve(). */
+        vfloat<M> tNear; // per-curve entry distance, written by intersect()
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        /* walk over the candidate curves, one set bit of the mask at a time */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask); // slot of the next candidate (bscf presumably returns and clears the lowest set bit -- TODO confirm)
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); // origin of lane k extracted from the packet
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]);
+          Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); // drop remaining candidates whose entry distance lies beyond the current ray.tfar[k]
+        }
+      }
+
+      template<typename Intersector, typename Epilog> // occlusion query for lane k using the surface returned by getNormalOrientedHermiteCurve()
+        static __forceinline bool occluded_hn(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear; // per-curve entry distance, written by intersect()
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        /* test the candidate curves one by one until the first reported hit */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask); // slot of the next candidate (bscf presumably returns and clears the lowest set bit -- TODO confirm)
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); // origin of lane k extracted from the packet
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]);
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID)))
+            return true;
+          /* re-cull the remaining candidates against the current ray.tfar[k] */
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false; // no candidate curve reported a hit
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNv.h b/thirdparty/embree-aarch64/kernels/geometry/curveNv.h
new file mode 100644
index 0000000000..6eb5e30b39
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curveNv.h
@@ -0,0 +1,101 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curveNi.h"
+
+namespace embree
+{
+  template<int M> // M: maximum number of curves per block (see max_size())
+    struct CurveNv : public CurveNi<M>
+  {
+    using CurveNi<M>::N;
+    /* CurveNi<M> block extended by storage ('data') that fill() populates with four vertices per curve. */
+    struct Type : public PrimitiveType {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* Returns maximum number of stored primitives */
+    static __forceinline size_t max_size() { return M; }
+
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; }
+    /* Returns the byte size needed for N primitives: all full blocks plus one tightly sized block for the remainder */
+    static __forceinline size_t bytes(size_t N)
+    {
+      const size_t f = N/M, r = N%M; // f = number of full blocks, r = curves left for a partial block
+      static_assert(sizeof(CurveNv) == 22+25*M+4*16*M, "internal data layout issue"); // 4*16*M matches the 'data' array; 22+25*M is presumably the CurveNi<M> part -- TODO confirm
+      return f*sizeof(CurveNv) + (r!=0)*(22 + 25*r + 4*16*r); // the partial block uses the same formula with r in place of M
+    }
+
+  public:
+
+    /*! Default constructor. */
+    __forceinline CurveNv () {}
+
+    /*! fill curve from curve list */
+    __forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene) // note: 'begin' is only read in this function, never advanced
+    {
+      size_t end = min(begin+M,_end); // at most M primitives go into one block
+      size_t N = end-begin;
+
+      /* encode all primitives */
+      for (size_t i=0; i<N; i++)
+      {
+        const PrimRef& prim = prims[begin+i];
+        const unsigned int geomID = prim.geomID();
+        const unsigned int primID = prim.primID();
+        CurveGeometry* mesh = (CurveGeometry*) scene->get(geomID);
+        const unsigned vtxID = mesh->curve(primID); // index of the curve's first vertex
+        Vec3fa::storeu(&this->vertices(i,N)[0],mesh->vertex(vtxID+0)); // copy the four consecutive vertices into slot i (unaligned stores)
+        Vec3fa::storeu(&this->vertices(i,N)[1],mesh->vertex(vtxID+1));
+        Vec3fa::storeu(&this->vertices(i,N)[2],mesh->vertex(vtxID+2));
+        Vec3fa::storeu(&this->vertices(i,N)[3],mesh->vertex(vtxID+3));
+      }
+    }
+    /*! Builds the leaf for the primitive range 'set'; returns BVH::emptyNode for an empty range, otherwise the encoded leaf reference. */
+    template<typename BVH, typename Allocator>
+      __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range<size_t>& set, const Allocator& alloc)
+    {
+      if (set.size() == 0)
+        return BVH::emptyNode;
+      /* the geometry of the first primitive decides which leaf layout is built */
+      /* fall back to CurveNi for oriented curves */
+      unsigned int geomID = prims[set.begin()].geomID();
+      if (bvh->scene->get(geomID)->getCurveType() == Geometry::GTY_SUBTYPE_ORIENTED_CURVE) {
+        return CurveNi<M>::createLeaf(bvh,prims,set,alloc);
+      }
+      if (bvh->scene->get(geomID)->getCurveBasis() == Geometry::GTY_BASIS_HERMITE) { // Hermite basis curves also use the CurveNi leaf
+        return CurveNi<M>::createLeaf(bvh,prims,set,alloc);
+      }
+      /* allocate all blocks in one piece and fill them block by block */
+      size_t start = set.begin();
+      size_t items = CurveNv::blocks(set.size());
+      size_t numbytes = CurveNv::bytes(set.size());
+      CurveNv* accel = (CurveNv*) alloc.malloc1(numbytes,BVH::byteAlignment);
+      for (size_t i=0; i<items; i++) {
+        accel[i].CurveNv<M>::fill(prims,start,set.end(),bvh->scene); // stores the vertices; leaves 'start' unchanged
+        accel[i].CurveNi<M>::fill(prims,start,set.end(),bvh->scene); // presumably advances 'start' to the next block -- TODO confirm in curveNi.h
+      }
+      return bvh->encodeLeaf((char*)accel,items);
+    };
+    /* vertex storage and accessors */
+  public:
+    unsigned char data[4*16*M]; // 4*16 bytes per curve (four Vec3fa, presumably 16 bytes each -- TODO confirm)
+    __forceinline       Vec3fa* vertices(size_t i, size_t N)       { return (Vec3fa*)CurveNi<M>::end(N)+4*i; } // first of the four vertices of curve i; base address is CurveNi<M>::end(N)
+    __forceinline const Vec3fa* vertices(size_t i, size_t N) const { return (Vec3fa*)CurveNi<M>::end(N)+4*i; } // const overload of the accessor above
+  };
+
+  template<int M>
+    typename CurveNv<M>::Type CurveNv<M>::type; // out-of-class definition of the static member 'type'
+
+  typedef CurveNv<4> Curve4v; // CurveNv with M = 4
+  typedef CurveNv<8> Curve8v; // CurveNv with M = 8
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNv_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curveNv_intersector.h
new file mode 100644
index 0000000000..e20da2882e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curveNv_intersector.h
@@ -0,0 +1,181 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curveNv.h"
+#include "curveNi_intersector.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M> // single-ray intersector for CurveNv<M> leaves; reads the control points stored in the leaf itself
+      struct CurveNvIntersector1 : public CurveNiIntersector1<M>
+    {
+      typedef CurveNv<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+      /* Intersects 'ray' with every curve of 'prim' that passes the bounds test of CurveNiIntersector1<M>::intersect. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear; // per-curve entry distance, written by intersect()
+        vbool<M> valid = CurveNiIntersector1<M>::intersect(ray,prim,tNear);
+        /* walk over the candidate curves, one set bit of the mask at a time */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask); // slot of the next candidate (bscf presumably returns and clears the lowest set bit -- TODO confirm)
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID);
+          const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); // control points come from the leaf, not from the geometry buffers
+          const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]);
+          const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]);
+          const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]);
+          /* prefetch the vertices of the next candidate (L1) and of the one after it (L2) */
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1); // evaluated even when mask is empty; the result is only used inside 'if (mask)'
+          if (mask) {
+            prefetchL1(&prim.vertices(i1,N)[0]);
+            prefetchL1(&prim.vertices(i1,N)[4]); // address just past the four vertices; presumably touches the following cache line -- TODO confirm
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              prefetchL2(&prim.vertices(i2,N)[0]);
+              prefetchL2(&prim.vertices(i2,N)[4]);
+            }
+          }
+
+          Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar)); // drop remaining candidates whose entry distance lies beyond the current ray.tfar
+        }
+      }
+      /* Occlusion query: returns true as soon as the Intersector reports a hit on a candidate curve, false otherwise. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear; // per-curve entry distance, written by intersect()
+        vbool<M> valid = CurveNiIntersector1<M>::intersect(ray,prim,tNear);
+        /* test the candidate curves one by one until the first reported hit */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask); // slot of the next candidate (bscf presumably returns and clears the lowest set bit -- TODO confirm)
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID);
+          const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); // control points come from the leaf, not from the geometry buffers
+          const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]);
+          const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]);
+          const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]);
+          /* prefetch the vertices of the next candidate (L1) and of the one after it (L2) */
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1); // evaluated even when mask is empty; the result is only used inside 'if (mask)'
+          if (mask) {
+            prefetchL1(&prim.vertices(i1,N)[0]);
+            prefetchL1(&prim.vertices(i1,N)[4]); // address just past the four vertices; presumably touches the following cache line -- TODO confirm
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              prefetchL2(&prim.vertices(i2,N)[0]);
+              prefetchL2(&prim.vertices(i2,N)[4]);
+            }
+          }
+          /* any hit reported by the Intersector ends the query immediately */
+          if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)))
+            return true;
+          /* re-cull the remaining candidates against the current ray.tfar */
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false; // no candidate curve reported a hit
+      }
+    };
+
+    template<int M, int K> // packet variant (K-wide rays, one lane k handled per call) for CurveNv<M> leaves
+      struct CurveNvIntersectorK : public CurveNiIntersectorK<M,K>
+    {
+      typedef CurveNv<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+      /* Intersects lane k of 'ray' with every curve of 'prim' that passes the bounds test of CurveNiIntersectorK<M,K>::intersect. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear; // per-curve entry distance, written by intersect()
+        vbool<M> valid = CurveNiIntersectorK<M,K>::intersect(ray,k,prim,tNear);
+        /* walk over the candidate curves, one set bit of the mask at a time */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask); // slot of the next candidate (bscf presumably returns and clears the lowest set bit -- TODO confirm)
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID);
+          const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); // control points come from the leaf, not from the geometry buffers
+          const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]);
+          const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]);
+          const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]);
+          /* prefetch the vertices of the next candidate (L1) and of the one after it (L2) */
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1); // evaluated even when mask is empty; the result is only used inside 'if (mask)'
+          if (mask) {
+            prefetchL1(&prim.vertices(i1,N)[0]);
+            prefetchL1(&prim.vertices(i1,N)[4]); // address just past the four vertices; presumably touches the following cache line -- TODO confirm
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              prefetchL2(&prim.vertices(i2,N)[0]);
+              prefetchL2(&prim.vertices(i2,N)[4]);
+            }
+          }
+
+          Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); // drop remaining candidates whose entry distance lies beyond the current ray.tfar[k]
+        }
+      }
+      /* Occlusion query for lane k: returns true as soon as the Intersector reports a hit on a candidate curve, false otherwise. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear; // per-curve entry distance, written by intersect()
+        vbool<M> valid = CurveNiIntersectorK<M,K>::intersect(ray,k,prim,tNear);
+        /* test the candidate curves one by one until the first reported hit */
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask); // slot of the next candidate (bscf presumably returns and clears the lowest set bit -- TODO confirm)
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID);
+          const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); // control points come from the leaf, not from the geometry buffers
+          const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]);
+          const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]);
+          const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]);
+          /* prefetch the vertices of the next candidate (L1) and of the one after it (L2) */
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1); // evaluated even when mask is empty; the result is only used inside 'if (mask)'
+          if (mask) {
+            prefetchL1(&prim.vertices(i1,N)[0]);
+            prefetchL1(&prim.vertices(i1,N)[4]); // address just past the four vertices; presumably touches the following cache line -- TODO confirm
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              prefetchL2(&prim.vertices(i2,N)[0]);
+              prefetchL2(&prim.vertices(i2,N)[4]);
+            }
+          }
+          /* any hit reported by the Intersector ends the query immediately */
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)))
+            return true;
+          /* re-cull the remaining candidates against the current ray.tfar[k] */
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false; // no candidate curve reported a hit
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector.h
new file mode 100644
index 0000000000..204958f7cc
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector.h
@@ -0,0 +1,98 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../subdiv/bezier_curve.h"
+#include "../common/primref.h"
+#include "bezier_hair_intersector.h"
+#include "bezier_ribbon_intersector.h"
+#include "bezier_curve_intersector.h"
+#include "oriented_curve_intersector.h"
+#include "../bvh/node_intersector1.h"
+
+// FIXME: this file seems to be a duplicate of curve_intersector_virtual.h
+
+namespace embree
+{
+  namespace isa
+  {
+    struct VirtualCurveIntersector1 // single-ray leaf intersector that dispatches through a per-geometry-type function table
+    {
+      typedef unsigned char Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+      /* Intersects 'ray' with one leaf; 'tray' and 'lazy_node' are not used by this function. */
+      template<int N, int Nx, bool robust>
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        assert(num == 1);
+        RTCGeometryType ty = (RTCGeometryType)(*prim); // the first byte of the leaf is read as the geometry type
+        assert(This->leafIntersector);
+        VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; // table entry selected by that type
+        leafIntersector.intersect<1>(&pre,&ray,context,prim);
+      }
+      /* Occlusion test of 'ray' against one leaf; returns the result of the type-specific occluded function. 'tray' and 'lazy_node' are not used. */
+      template<int N, int Nx, bool robust>        
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        assert(num == 1);
+        RTCGeometryType ty = (RTCGeometryType)(*prim); // the first byte of the leaf is read as the geometry type
+        assert(This->leafIntersector);
+        VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; // table entry selected by that type
+        return leafIntersector.occluded<1>(&pre,&ray,context,prim);
+      }
+    };
+
+    template<int K> // K: width of the ray packet
+      struct VirtualCurveIntersectorK 
+      {
+        typedef unsigned char Primitive;
+        typedef CurvePrecalculationsK<K> Precalculations;
+        /* Packet intersect: calls the type-specific intersect function once per lane that is set in valid_i. 'lazy_node' is not used. */
+        static __forceinline void intersect(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+        {
+          assert(num == 1);
+          RTCGeometryType ty = (RTCGeometryType)(*prim); // the first byte of the leaf is read as the geometry type
+          assert(This->leafIntersector);
+          VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty];
+          size_t mask = movemask(valid_i);
+          while (mask) leafIntersector.intersect<K>(&pre,&ray,bscf(mask),context,prim); // bscf presumably returns and clears the lowest set bit, so each active lane is visited once -- TODO confirm
+        }
+        /* Packet occlusion: returns a mask with the lanes of valid_i for which the type-specific occluded function returned true. 'lazy_node' is not used. */
+        static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+        {
+          assert(num == 1);
+          RTCGeometryType ty = (RTCGeometryType)(*prim); // the first byte of the leaf is read as the geometry type
+          assert(This->leafIntersector);
+          VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty];
+          vbool<K> valid_o = false; // result mask, starts with no lane occluded
+          size_t mask = movemask(valid_i);
+          while (mask) {
+            size_t k = bscf(mask);
+            if (leafIntersector.occluded<K>(&pre,&ray,k,context,prim))
+              set(valid_o, k); // mark lane k as occluded
+          }
+          return valid_o;
+        }
+        /* Intersects the single lane k of the packet with one leaf. 'lazy_node' is not used. */
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+        {
+          assert(num == 1);
+          RTCGeometryType ty = (RTCGeometryType)(*prim); // the first byte of the leaf is read as the geometry type
+          assert(This->leafIntersector);
+          VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty];
+          leafIntersector.intersect<K>(&pre,&ray,k,context,prim);
+        }
+        /* Occlusion test for the single lane k of the packet; returns the result of the type-specific occluded function. 'lazy_node' is not used. */
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+        {
+          assert(num == 1);
+          RTCGeometryType ty = (RTCGeometryType)(*prim); // the first byte of the leaf is read as the geometry type
+          assert(This->leafIntersector);
+          VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty];
+          return leafIntersector.occluded<K>(&pre,&ray,k,context,prim);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_distance.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_distance.h
new file mode 100644
index 0000000000..343cc8ff28
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_distance.h
@@ -0,0 +1,129 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename NativeCurve3fa, int M> // M: SIMD width of the hit record
+    struct DistanceCurveHit
+    {
+      __forceinline DistanceCurveHit() {} // leaves all members uninitialized
+      /* Stores the raw per-lane results (U,V,T), the index i of the first segment of this batch, the segment count N and the 3D curve. */
+      __forceinline DistanceCurveHit(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& T, const int i, const int N,
+                                     const NativeCurve3fa& curve3D)
+        : U(U), V(V), T(T), i(i), N(N), curve3D(curve3D), valid(valid) {}
+      /* Derives the final vu/vv/vt values from the raw inputs. */
+      __forceinline void finalize() 
+      {
+        vu = (vfloat<M>(step)+U+vfloat<M>(float(i)))*(1.0f/float(N)); // (lane offset + segment-local U + first segment index) / N; 'step' presumably yields 0,1,2,... per lane -- TODO confirm
+        vv = V;
+        vt = T;
+      }
+      /* per-lane accessors; the parameter 'i' is a lane index and shadows the member 'i' */
+      __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+      __forceinline float t  (const size_t i) const { return vt[i]; }
+      __forceinline Vec3fa Ng(const size_t i) const { 
+        return curve3D.eval_du(vu[i]); // result of eval_du at the lane's u (presumably the curve derivative -- TODO confirm)
+      }
+      
+    public:
+      vfloat<M> U; // inputs as passed to the constructor
+      vfloat<M> V;
+      vfloat<M> T;
+      int i, N;
+      NativeCurve3fa curve3D;
+      
+    public:
+      vbool<M> valid;
+      vfloat<M> vu; // vu, vv, vt are written by finalize()
+      vfloat<M> vv;
+      vfloat<M> vt;
+    };
+    
+    template<typename NativeCurve3fa>
+    struct DistanceCurve1Intersector1
+    {
+      template<typename Epilog> // Epilog: callable invoked as epilog(valid,hit); its results are OR-ed into the return value
+      __forceinline bool intersect(const CurvePrecalculations1& pre,Ray& ray,
+                                   IntersectContext* context,
+                                   const CurveGeometry* geom, const unsigned int primID,
+                                   const Vec3fa& v0, const Vec3fa& v1, const Vec3fa& v2, const Vec3fa& v3,
+                                   const Epilog& epilog)
+      {
+        const int N = geom->tessellationRate; // number of segments the curve is split into
+        /* The curve is tested as N segments, VSIZEX at a time, by a distance check in ray space; returns true if any epilog call returned true. */
+        /* transform control points into ray space */
+        const NativeCurve3fa curve3Di(v0,v1,v2,v3);
+        const NativeCurve3fa curve3D = enlargeRadiusToMinWidth(context,geom,ray.org,curve3Di);
+        const NativeCurve3fa curve2D = curve3D.xfm_pr(pre.ray_space,ray.org);
+      
+        /* evaluate the bezier curve */
+        vboolx valid = vfloatx(step) < vfloatx(float(N)); // enable only lanes that map to one of the N segments ('step' presumably yields 0,1,2,... -- TODO confirm)
+        const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(0,N); // presumably the segment start points -- TODO confirm
+        const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(0,N); // presumably the segment end points -- TODO confirm
+
+        /* approximate intersection with cone */
+        const Vec4vfx v = p1-p0;
+        const Vec4vfx w = -p0;
+        const vfloatx d0 = madd(w.x,v.x,w.y*v.y); // dot(w,v) using x and y only
+        const vfloatx d1 = madd(v.x,v.x,v.y*v.y); // dot(v,v) using x and y only
+        const vfloatx u = clamp(d0*rcp(d1),vfloatx(zero),vfloatx(one)); // parameter of the segment point closest to the origin in x/y, clamped to [0,1]
+        const Vec4vfx p = madd(u,v,p0); // that closest point (all four components interpolated)
+        const vfloatx t = p.z*pre.depth_scale; // hit distance derived from the z component
+        const vfloatx d2 = madd(p.x,p.x,p.y*p.y); // squared x/y distance of the point from the origin
+        const vfloatx r = p.w; // w component is used as the radius
+        const vfloatx r2 = r*r;
+        valid &= (d2 <= r2) & (vfloatx(ray.tnear()) <= t) & (t <= vfloatx(ray.tfar)); // within the radius and inside [tnear,tfar]
+        if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) 
+          valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*pre.depth_scale; // ignore self intersections
+
+        /* update hit information */
+        bool ishit = false;
+        if (unlikely(any(valid))) {
+          DistanceCurveHit<NativeCurve3fa,VSIZEX> hit(valid,u,0.0f,t,0,N,curve3D);
+          ishit = ishit | epilog(valid,hit);
+        }
+        /* remaining segments, needed only when N exceeds the SIMD width */
+        if (unlikely(VSIZEX < N)) 
+        {
+          /* process SIMD-size many segments per iteration */
+          for (int i=VSIZEX; i<N; i+=VSIZEX)
+          {
+            /* evaluate the bezier curve */
+            vboolx valid = vintx(i)+vintx(step) < vintx(N); // enable only lanes whose segment index is below N
+            const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(i,N);
+            const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(i,N);
+            
+            /* approximate intersection with cone */
+            const Vec4vfx v = p1-p0;
+            const Vec4vfx w = -p0;
+            const vfloatx d0 = madd(w.x,v.x,w.y*v.y); // dot(w,v) using x and y only
+            const vfloatx d1 = madd(v.x,v.x,v.y*v.y); // dot(v,v) using x and y only
+            const vfloatx u = clamp(d0*rcp(d1),vfloatx(zero),vfloatx(one)); // parameter of the closest segment point, clamped to [0,1]
+            const Vec4vfx p = madd(u,v,p0);
+            const vfloatx t = p.z*pre.depth_scale;
+            const vfloatx d2 = madd(p.x,p.x,p.y*p.y); 
+            const vfloatx r = p.w;
+            const vfloatx r2 = r*r;
+            valid &= (d2 <= r2) & (vfloatx(ray.tnear()) <= t) & (t <= vfloatx(ray.tfar)); // within the radius and inside [tnear,tfar]
+            if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
+              valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*pre.depth_scale; // ignore self intersections
+
+             /* update hit information */
+            if (unlikely(any(valid))) {
+              DistanceCurveHit<NativeCurve3fa,VSIZEX> hit(valid,u,0.0f,t,i,N,curve3D); // 'i' records the first segment index of this batch
+              ishit = ishit | epilog(valid,hit);
+            }
+          }
+        }
+        return ishit;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_oriented.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_oriented.h
new file mode 100644
index 0000000000..47531027fc
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_oriented.h
@@ -0,0 +1,417 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+#include "curve_intersector_sweep.h"
+#include "../subdiv/linear_bezier_patch.h"
+
+#define DBG(x)
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename Ray, typename Epilog>
+      struct TensorLinearCubicBezierSurfaceIntersector
+      {
+        const LinearSpace3fa& ray_space;
+        Ray& ray;
+        TensorLinearCubicBezierSurface3fa curve3d;
+        TensorLinearCubicBezierSurface2fa curve2d;
+        float eps;
+        const Epilog& epilog;
+        bool isHit;
+
+        __forceinline TensorLinearCubicBezierSurfaceIntersector (const LinearSpace3fa& ray_space, Ray& ray, const TensorLinearCubicBezierSurface3fa& curve3d, const Epilog& epilog)
+          : ray_space(ray_space), ray(ray), curve3d(curve3d), epilog(epilog), isHit(false)
+        { // ray_space, ray and epilog are held by reference (caller keeps them alive); curve3d is copied
+          const TensorLinearCubicBezierSurface3fa curve3dray = curve3d.xfm(ray_space,ray.org); // surface transformed with ray_space relative to ray.org
+          curve2d = TensorLinearCubicBezierSurface2fa(CubicBezierCurve2fa(curve3dray.L),CubicBezierCurve2fa(curve3dray.R)); // 2D surface built from the L and R curves of the transformed surface
+          const BBox2fa b2 = curve2d.bounds();
+          eps = 8.0f*float(ulp)*reduce_max(max(abs(b2.lower),abs(b2.upper))); // tolerance: 8*ulp scaled by the largest absolute coordinate of the 2D bounds
+        }
+        
+        __forceinline Interval1f solve_linear(const float u0, const float u1, const float& p0, const float& p1)
+        { // zero of the linear function with value p0 at parameter u0 and p1 at u1, returned as an interval
+          if (p1 == p0) { // constant function, the division below would be by zero
+            if (p0 == 0.0f) return Interval1f(u0,u1); // identically zero: the whole range [u0,u1] is returned
+            else return Interval1f(empty); // constant and non-zero: no root
+          }
+          const float t = -p0/(p1-p0); // solves p0 + t*(p1-p0) = 0; t is not clamped to [0,1]
+          const float tt = lerp(u0,u1,t); // express t in the [u0,u1] parametrization (may lie outside of it)
+          return Interval1f(tt);
+        }
+
+        __forceinline void solve_linear(const float u0, const float u1, const Interval1f& p0, const Interval1f& p1, Interval1f& u)
+        { // extends u with candidate root locations of the band between interval p0 (at u0) and interval p1 (at u1)
+          if (sign(p0.lower) != sign(p0.upper)) u.extend(u0); // bounds of p0 differ in sign: u0 itself is a candidate
+          if (sign(p0.lower) != sign(p1.lower)) u.extend(solve_linear(u0,u1,p0.lower,p1.lower)); // lower edge differs in sign between its ends
+          if (sign(p0.upper) != sign(p1.upper)) u.extend(solve_linear(u0,u1,p0.upper,p1.upper)); // upper edge differs in sign between its ends
+          if (sign(p1.lower) != sign(p1.upper)) u.extend(u1); // bounds of p1 differ in sign: u1 itself is a candidate
+        }
+
+        __forceinline Interval1f bezier_clipping(const CubicBezierCurve<Interval1f>& curve)
+        { // control values v0..v3 are placed at parameters 0, 1/3, 2/3 and 1; all six pairs feed solve_linear
+          Interval1f u = empty; // accumulated parameter range of candidate roots
+          solve_linear(0.0f/3.0f,1.0f/3.0f,curve.v0,curve.v1,u);
+          solve_linear(0.0f/3.0f,2.0f/3.0f,curve.v0,curve.v2,u);
+          solve_linear(0.0f/3.0f,3.0f/3.0f,curve.v0,curve.v3,u);
+          solve_linear(1.0f/3.0f,2.0f/3.0f,curve.v1,curve.v2,u);
+          solve_linear(1.0f/3.0f,3.0f/3.0f,curve.v1,curve.v3,u);
+          solve_linear(2.0f/3.0f,3.0f/3.0f,curve.v2,curve.v3,u);
+          return intersect(u,Interval1f(0.0f,1.0f)); // restrict the result to the domain [0,1]
+        }
+        
+        __forceinline Interval1f bezier_clipping(const LinearBezierCurve<Interval1f>& curve)
+        { // linear variant: a single solve_linear over the two control values placed at parameters 0 and 1
+          Interval1f v = empty; // accumulated parameter range of candidate roots
+          solve_linear(0.0f,1.0f,curve.v0,curve.v1,v);
+          return intersect(v,Interval1f(0.0f,1.0f)); // restrict the result to the domain [0,1]
+        }
+
+        __forceinline void solve_bezier_clipping(BBox1f cu, BBox1f cv, const TensorLinearCubicBezierSurface2fa& curve2)
+        { // searches curve2 for the 2D origin by repeated clipping; cu/cv track the u/v ranges curve2 covers; calls itself below
+          BBox2fa bounds = curve2.bounds();
+          if (bounds.upper.x < 0.0f) return; // these four tests: the bounding box does not contain the origin
+          if (bounds.upper.y < 0.0f) return;
+          if (bounds.lower.x > 0.0f) return;
+          if (bounds.lower.y > 0.0f) return;
+          
+          if (max(cu.size(),cv.size()) < 1E-4f) // parameter box is small enough: use its center as the solution
+          {
+            const float u = cu.center();
+            const float v = cv.center();
+            TensorLinearCubicBezierSurface1f curve_z = curve3d.xfm(ray_space.row2(),ray.org); // 1D surface along ray_space.row2() relative to ray.org
+            const float t = curve_z.eval(u,v);
+            if (ray.tnear() <= t && t <= ray.tfar) { // report only hits inside the ray interval
+              const Vec3fa Ng = cross(curve3d.eval_du(u,v),curve3d.eval_dv(u,v)); // normal from the two partial derivatives of the 3D surface
+              BezierCurveHit hit(t,u,v,Ng);
+              isHit |= epilog(hit);
+            }
+            return;
+          }
+          
+          const Vec2fa dv = curve2.axis_v();
+          const TensorLinearCubicBezierSurface1f curve1v = curve2.xfm(dv);
+          LinearBezierCurve<Interval1f> curve0v = curve1v.reduce_u();
+          if (!curve0v.hasRoot()) return;
+          
+          const Interval1f v = bezier_clipping(curve0v);
+          if (isEmpty(v)) return;
+          TensorLinearCubicBezierSurface2fa curve2a = curve2.clip_v(v);
+          cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper)); // map the local v interval back into the tracked range
+
+          const Vec2fa du = curve2.axis_u(); // NOTE(review): axis comes from curve2, not the v-clipped curve2a - confirm this is intended
+          const TensorLinearCubicBezierSurface1f curve1u = curve2a.xfm(du);
+          CubicBezierCurve<Interval1f> curve0u = curve1u.reduce_v();         
+          int roots = curve0u.maxRoots();
+          if (roots == 0) return;
+          
+          if (roots == 1) // at most one root: clip in u and continue on the clipped surface
+          {
+            const Interval1f u = bezier_clipping(curve0u);
+            if (isEmpty(u)) return;
+            TensorLinearCubicBezierSurface2fa curve2b = curve2a.clip_u(u);
+            cu = BBox1f(lerp(cu.lower,cu.upper,u.lower),lerp(cu.lower,cu.upper,u.upper)); // map the local u interval back into the tracked range
+            solve_bezier_clipping(cu,cv,curve2b);
+            return;
+          }
+
+          TensorLinearCubicBezierSurface2fa curve2l, curve2r; // more than one possible root: split in u and search both parts
+          curve2a.split_u(curve2l,curve2r);
+          solve_bezier_clipping(BBox1f(cu.lower,cu.center()),cv,curve2l); // presumably split_u splits at the midpoint, matching cu.center() - confirm
+          solve_bezier_clipping(BBox1f(cu.center(),cu.upper),cv,curve2r);
+        }
+        
+        __forceinline bool solve_bezier_clipping()
+        { // entry point: runs the bezier clipping search on curve2d over the full [0,1]x[0,1] parameter range
+          solve_bezier_clipping(BBox1f(0.0f,1.0f),BBox1f(0.0f,1.0f),curve2d);
+          return isHit; // true once any epilog call made during the search has returned true
+        }
+
+        __forceinline void solve_newton_raphson(BBox1f cu, BBox1f cv)
+        { // starts the Newton-Raphson loop at the center of the (cu,cv) parameter box
+          Vec2fa uv(cu.center(),cv.center());
+          const Vec2fa dfdu = curve2d.eval_du(uv.x,uv.y);
+          const Vec2fa dfdv = curve2d.eval_dv(uv.x,uv.y);
+          const LinearSpace2fa rcp_J = rcp(LinearSpace2fa(dfdu,dfdv)); // Jacobian inverse, evaluated once at the start point
+          solve_newton_raphson_loop(cu,cv,uv,dfdu,dfdv,rcp_J);
+        }
+
+        __forceinline void solve_newton_raphson_loop(BBox1f cu, BBox1f cv, const Vec2fa& uv_in, const Vec2fa& dfdu, const Vec2fa& dfdv, const LinearSpace2fa& rcp_J)
+        { // iterates uv -= rcp_J*f(uv) on curve2d starting at uv_in; cu, cv, dfdu and dfdv are not used in this body
+          Vec2fa uv = uv_in;
+          
+          for (size_t i=0; i<200; i++) // hard iteration cap; leaving the loop without convergence reports nothing
+          {
+            const Vec2fa f = curve2d.eval(uv.x,uv.y);
+            const Vec2fa duv = rcp_J*f; // rcp_J stays fixed for the whole loop, it is never re-evaluated here
+            uv -= duv;
+
+            if (max(abs(f.x),abs(f.y)) < eps) // f was evaluated before the step above; uv used below already includes that step
+            {
+              const float u = uv.x;
+              const float v = uv.y;
+              if (!(u >= 0.0f && u <= 1.0f)) return; // rejects NaNs
+              if (!(v >= 0.0f && v <= 1.0f)) return; // rejects NaNs
+              const TensorLinearCubicBezierSurface1f curve_z = curve3d.xfm(ray_space.row2(),ray.org); // 1D surface along ray_space.row2() relative to ray.org
+              const float t = curve_z.eval(u,v);
+              if (!(ray.tnear() <= t && t <= ray.tfar)) return; // rejects NaNs
+              const Vec3fa Ng = cross(curve3d.eval_du(u,v),curve3d.eval_dv(u,v)); // normal from the two partial derivatives of the 3D surface
+              BezierCurveHit hit(t,u,v,Ng);
+              isHit |= epilog(hit);
+              return; // at most one hit is reported per call
+            }
+          }       
+        }
+
+        __forceinline bool clip_v(BBox1f& cu, BBox1f& cv)
+        { // narrows cv in place by bezier clipping in v; returns false when no root is possible (cu is not modified here)
+          const Vec2fa dv = curve2d.eval_dv(cu.lower,cv.lower); // v-derivative at the lower corner of the parameter box
+          const TensorLinearCubicBezierSurface1f curve1v = curve2d.xfm(dv).clip(cu,cv);
+          LinearBezierCurve<Interval1f> curve0v = curve1v.reduce_u();
+          if (!curve0v.hasRoot()) return false;
+          Interval1f v = bezier_clipping(curve0v);
+          if (isEmpty(v)) return false;
+          v = intersect(v + Interval1f(-0.1f,+0.1f),Interval1f(0.0f,1.0f)); // widen by 0.1 on both sides, then keep within [0,1]
+          cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper)); // map the local v interval back into cv
+          return true;
+        }
+
+        __forceinline bool solve_krawczyk(bool very_small, BBox1f& cu, BBox1f& cv)
+        { // returns true when (cu,cv) is finished (no solution, or Newton-Raphson was run); false asks the caller to subdivide
+          /* perform bezier clipping in v-direction to get tight v-bounds */
+          TensorLinearCubicBezierSurface2fa curve2 = curve2d.clip(cu,cv);
+          const Vec2fa dv = curve2.axis_v();
+          const TensorLinearCubicBezierSurface1f curve1v = curve2.xfm(dv);
+          LinearBezierCurve<Interval1f> curve0v = curve1v.reduce_u();
+          if (unlikely(!curve0v.hasRoot())) return true; // no root possible: nothing left to do for this box
+          Interval1f v = bezier_clipping(curve0v);
+          if (unlikely(isEmpty(v))) return true; // empty clip interval: nothing left to do for this box
+          v = intersect(v + Interval1f(-0.1f,+0.1f),Interval1f(0.0f,1.0f)); // widen by 0.1 on both sides, then keep within [0,1]
+          curve2 = curve2.clip_v(v);
+          cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper)); // map the local v interval back into cv (written back to the caller)
+
+          /* perform one newton raphson iteration */
+          Vec2fa c(cu.center(),cv.center());
+          Vec2fa f,dfdu,dfdv; curve2d.eval(c.x,c.y,f,dfdu,dfdv); // evaluated on the unclipped curve2d at the box center
+          const LinearSpace2fa rcp_J = rcp(LinearSpace2fa(dfdu,dfdv));
+          const Vec2fa c1 = c - rcp_J*f;
+          
+          /* calculate bounds of derivatives */
+          const BBox2fa bounds_du = (1.0f/cu.size())*curve2.derivative_u().bounds(); // presumably rescales derivatives of the clipped surface to the cu parametrization - confirm
+          const BBox2fa bounds_dv = (1.0f/cv.size())*curve2.derivative_v().bounds(); // same for the v direction
+
+          /* calculate krawczyk test */
+          LinearSpace2<Vec2<Interval1f>> I(Interval1f(1.0f), Interval1f(0.0f),
+                                           Interval1f(0.0f), Interval1f(1.0f));
+
+          LinearSpace2<Vec2<Interval1f>> G(Interval1f(bounds_du.lower.x,bounds_du.upper.x), Interval1f(bounds_dv.lower.x,bounds_dv.upper.x),
+                                           Interval1f(bounds_du.lower.y,bounds_du.upper.y), Interval1f(bounds_dv.lower.y,bounds_dv.upper.y));
+
+          const LinearSpace2<Vec2f> rcp_J2(rcp_J);
+          const LinearSpace2<Vec2<Interval1f>> rcp_Ji(rcp_J2);
+          
+          const Vec2<Interval1f> x(cu,cv);
+          const Vec2<Interval1f> K = Vec2<Interval1f>(Vec2f(c1)) + (I - rcp_Ji*G)*(x-Vec2<Interval1f>(Vec2f(c))); // K = c1 + (I - rcp_J*G)*(x - c), evaluated with intervals
+
+          /* test if there is no solution */
+          const Vec2<Interval1f> KK = intersect(K,x);
+          if (unlikely(isEmpty(KK.x) || isEmpty(KK.y))) return true;
+
+          /* exit if convergence cannot get proven, but terminate if we are very small */
+          if (unlikely(!subset(K,x) && !very_small)) return false; // the only path that returns false
+
+          /* solve using newton raphson iteration if convergence is guaranteed */
+          solve_newton_raphson_loop(cu,cv,c1,dfdu,dfdv,rcp_J); // continues from c1, the point after the single step above
+          return true;
+        }
+
+        __forceinline void solve_newton_raphson_no_recursion(BBox1f cu, BBox1f cv)
+        { // single attempt without subdivision: tighten cv by clipping, then run Newton-Raphson from the box center
+           if (!clip_v(cu,cv)) return; // clipping found no possible root
+           return solve_newton_raphson(cu,cv);
+        }
+        
+        __forceinline void solve_newton_raphson_recursion(BBox1f cu, BBox1f cv)
+        { // despite the name this is a loop: u sub-ranges still to be examined are kept on a small explicit stack
+          unsigned int sptr = 0;
+          const unsigned int stack_size = 4;
+          unsigned int mask_stack[stack_size]; // per entry: bit mask of the u-segments that still have to be processed
+          BBox1f cu_stack[stack_size];
+          BBox1f cv_stack[stack_size];
+          goto entry; // jump into the loop body so the initial (cu,cv) range is split before anything is popped
+          
+          /* terminate if stack is empty */
+          while (sptr)
+          {
+            /* pop from stack */
+            {
+              sptr--;
+              size_t mask = mask_stack[sptr];
+              cu = cu_stack[sptr];
+              cv = cv_stack[sptr];
+              const size_t i = bscf(mask); // presumably returns the lowest set bit index and clears it in mask - confirm against bscf
+              mask_stack[sptr] = mask;
+              if (mask) sptr++; // there are still items on the stack
+              
+              /* process the next segment: narrow cu to the i-th of the VSIZEX-1 equal u-segments */
+              const float u0 = float(i+0)*(1.0f/(VSIZEX-1));
+              const float u1 = float(i+1)*(1.0f/(VSIZEX-1));
+              const BBox1f cui(lerp(cu.lower,cu.upper,u0),lerp(cu.lower,cu.upper,u1));
+              cu = cui;
+            }
+
+#if 0
+            solve_newton_raphson_no_recursion(cu,cv);
+            continue;
+            
+#else
+            /* we assume convergence for small u ranges and verify using krawczyk */
+            if (cu.size() < 1.0f/6.0f) {
+              const bool very_small = cu.size() < 0.001f || sptr >= stack_size; // a full stack also counts: solve_krawczyk never returns false when very_small is set
+              if (solve_krawczyk(very_small,cu,cv)) {
+                continue; // this segment is finished, no further subdivision
+              }
+            }
+#endif
+
+          entry:
+          
+            /* split the curve into VSIZEX-1 segments in u-direction */
+            vboolx valid = true;
+            TensorLinearCubicBezierSurface<Vec2vfx> subcurves = curve2d.clip_v(cv).vsplit_u(valid,cu);
+            
+            /* slabs test in u-direction */
+            Vec2vfx ndv = cross(subcurves.axis_v());
+            BBox<vfloatx> boundsv = subcurves.vxfm(ndv).bounds();
+            valid &= boundsv.lower <= eps; // keep segments whose bounds, widened by eps, contain zero
+            valid &= boundsv.upper >= -eps;
+            if (none(valid)) continue;
+
+            /* slabs test in v-direction */
+            Vec2vfx ndu = cross(subcurves.axis_u());
+            BBox<vfloatx> boundsu = subcurves.vxfm(ndu).bounds();
+            valid &= boundsu.lower <= eps; // keep segments whose bounds, widened by eps, contain zero
+            valid &= boundsu.upper >= -eps;
+            if (none(valid)) continue;
+
+            /* push valid segments to stack */
+            assert(sptr < stack_size);
+            mask_stack [sptr] = movemask(valid);
+            cu_stack   [sptr] = cu; // the unsplit range is stored; the pop step narrows it to one segment
+            cv_stack   [sptr] = cv;
+            sptr++;
+          }
+        }
+        
+        __forceinline bool solve_newton_raphson_main()
+        { // entry point: runs the subdividing Newton-Raphson search over the full [0,1]x[0,1] parameter range
+          BBox1f vu(0.0f,1.0f);
+          BBox1f vv(0.0f,1.0f);
+          solve_newton_raphson_recursion(vu,vv);
+          return isHit; // true once any epilog call made during the search has returned true
+        }
+      };
+
+
+    template<template<typename Ty> class SourceCurve>
+      struct OrientedCurve1Intersector1
+    { // single-ray front end: builds a tensor surface and runs the Newton-Raphson based surface intersector on it
+      //template<typename Ty> using Curve = SourceCurve<Ty>;
+      typedef SourceCurve<Vec3ff> SourceCurve3ff; // curve type used for the v0i..v3i control points
+      typedef SourceCurve<Vec3fa> SourceCurve3fa; // curve type used for the n0i..n3i control points
+      
+      __forceinline OrientedCurve1Intersector1() {}
+      
+      __forceinline OrientedCurve1Intersector1(const Ray& ray, const void* ptr) {} // both arguments are ignored
+      
+      template<typename Epilog>
+      __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
+                                IntersectContext* context,
+                                const CurveGeometry* geom, const unsigned int primID, 
+                                const Vec3ff& v0i, const Vec3ff& v1i, const Vec3ff& v2i, const Vec3ff& v3i,
+                                const Vec3fa& n0i, const Vec3fa& n1i, const Vec3fa& n2i, const Vec3fa& n3i,
+                                const Epilog& epilog) const
+      { // overload taking control points; primID is not used in this body
+        STAT3(normal.trav_prims,1,1,1);
+
+        SourceCurve3ff ccurve(v0i,v1i,v2i,v3i);
+        SourceCurve3fa ncurve(n0i,n1i,n2i,n3i);
+        ccurve = enlargeRadiusToMinWidth(context,geom,ray.org,ccurve);
+        TensorLinearCubicBezierSurface3fa curve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve);
+        //return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_bezier_clipping();
+        return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_newton_raphson_main();
+      }
+
+      template<typename Epilog>
+      __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
+                                IntersectContext* context,
+                                const CurveGeometry* geom, const unsigned int primID,
+                                const TensorLinearCubicBezierSurface3fa& curve, const Epilog& epilog) const
+      { // overload taking a ready-made surface; context, geom and primID are not used in this body
+        STAT3(normal.trav_prims,1,1,1);
+        //return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_bezier_clipping();
+        return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_newton_raphson_main();
+      }
+    };
+
+    template<template<typename Ty> class SourceCurve, int K>
+      struct OrientedCurve1IntersectorK
+    { // ray-packet front end: extracts ray k from a RayK<K> and runs the surface intersector on that single ray
+      //template<typename Ty> using Curve = SourceCurve<Ty>;
+      typedef SourceCurve<Vec3ff> SourceCurve3ff; // curve type used for the v0i..v3i control points
+      typedef SourceCurve<Vec3fa> SourceCurve3fa; // curve type used for the n0i..n3i control points
+      
+      struct Ray1
+      { // single-ray view of lane k of a packet, offering the members the surface intersector reads
+        __forceinline Ray1(RayK<K>& ray, size_t k)
+          : org(ray.org.x[k],ray.org.y[k],ray.org.z[k]), dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]), _tnear(ray.tnear()[k]), tfar(ray.tfar[k]) {}
+
+        Vec3fa org; // copy of the lane's origin
+        Vec3fa dir; // copy of the lane's direction
+        float _tnear; // copy of the lane's tnear
+        float& tfar; // reference bound to ray.tfar[k] of the packet, not a copy
+
+        __forceinline float& tnear() { return _tnear; }
+        //__forceinline float& tfar()  { return _tfar; }
+        __forceinline const float& tnear() const { return _tnear; }
+        //__forceinline const float& tfar()  const { return _tfar; }
+      };
+
+      template<typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k,
+                                   IntersectContext* context,
+                                   const CurveGeometry* geom, const unsigned int primID,
+                                   const Vec3ff& v0i, const Vec3ff& v1i, const Vec3ff& v2i, const Vec3ff& v3i,
+                                   const Vec3fa& n0i, const Vec3fa& n1i, const Vec3fa& n2i, const Vec3fa& n3i,
+                                   const Epilog& epilog)
+      { // overload taking control points; primID is not used in this body
+        STAT3(normal.trav_prims,1,1,1);
+        Ray1 ray(vray,k);
+        SourceCurve3ff ccurve(v0i,v1i,v2i,v3i);
+        SourceCurve3fa ncurve(n0i,n1i,n2i,n3i);
+        ccurve = enlargeRadiusToMinWidth(context,geom,ray.org,ccurve);
+        TensorLinearCubicBezierSurface3fa curve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve);
+        //return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_bezier_clipping();
+        return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_newton_raphson_main();
+      }
+
+      template<typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k,
+                                   IntersectContext* context,
+                                   const CurveGeometry* geom, const unsigned int primID,
+                                   const TensorLinearCubicBezierSurface3fa& curve,
+                                   const Epilog& epilog)
+      { // overload taking a ready-made surface; context, geom and primID are not used in this body
+        STAT3(normal.trav_prims,1,1,1);
+        Ray1 ray(vray,k);
+        //return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_bezier_clipping();
+        return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_newton_raphson_main();
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_precalculations.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_precalculations.h
new file mode 100644
index 0000000000..6e9fc91925
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_precalculations.h
@@ -0,0 +1,49 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/geometry.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct CurvePrecalculations1
+    { // per-ray values consumed by the curve intersectors: a depth scale and a ray-aligned space
+      float depth_scale;
+      LinearSpace3fa ray_space;
+           
+      __forceinline CurvePrecalculations1() {} // assigns nothing; depth_scale is left unset
+
+      __forceinline CurvePrecalculations1(const Ray& ray, const void* ptr) // ptr is not used
+      {
+        depth_scale = rsqrt(dot(ray.dir,ray.dir)); // rsqrt of the squared direction length (presumably 1/|dir| - confirm rsqrt)
+        LinearSpace3fa space = frame(depth_scale*ray.dir); // frame built from the direction scaled by depth_scale
+        space.vz *= depth_scale; // vz carries one more factor of depth_scale
+        ray_space = space.transposed();
+      }
+    };
+    
+    template<int K>
+      struct CurvePrecalculationsK
+    { // packet version of CurvePrecalculations1: one depth scale and one ray space per lane
+      vfloat<K> depth_scale;
+      LinearSpace3fa ray_space[K];
+
+      __forceinline CurvePrecalculationsK(const vbool<K>& valid, const RayK<K>& ray)
+      {
+        size_t mask = movemask(valid);
+        depth_scale = rsqrt(dot(ray.dir,ray.dir)); // computed for all lanes at once, independent of valid
+        while (mask) { // ray_space[k] is only assigned for lanes present in mask
+          size_t k = bscf(mask); // presumably returns the lowest set bit index and clears it in mask - confirm against bscf
+          Vec3fa ray_dir_k = Vec3fa(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]);
+          LinearSpace3fa ray_space_k = frame(depth_scale[k]*ray_dir_k); // same construction as in CurvePrecalculations1
+          ray_space_k.vz *= depth_scale[k];
+          ray_space[k] = ray_space_k.transposed();
+        }
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_ribbon.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_ribbon.h
new file mode 100644
index 0000000000..a99cf99d56
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_ribbon.h
@@ -0,0 +1,214 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "quad_intersector.h"
+#include "curve_intersector_precalculations.h"
+
+#define Bezier1Intersector1 RibbonCurve1Intersector1
+#define Bezier1IntersectorK RibbonCurve1IntersectorK
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename NativeCurve3ff, int M>
+    struct RibbonHit
+    { // hit record for up to M ribbon segments, handed to the epilog by intersect_ribbon
+      __forceinline RibbonHit() {}
+
+      __forceinline RibbonHit(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& T, const int i, const int N,
+                              const NativeCurve3ff& curve3D)
+        : U(U), V(V), T(T), i(i), N(N), curve3D(curve3D), valid(valid) {} // vu, vv and vt are not set here, see finalize()
+      
+      __forceinline void finalize() 
+      {
+        vu = (vfloat<M>(step)+U+vfloat<M>(float(i)))*(1.0f/float(N)); // (step + U + i)/N; step is presumably the per-lane index 0,1,2,... - confirm
+        vv = V;
+        vt = T;
+      }
+      
+      __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } // in these accessors the parameter i is a lane index and hides the member i
+      __forceinline float t  (const size_t i) const { return vt[i]; }
+      __forceinline Vec3fa Ng(const size_t i) const { 
+        return curve3D.eval_du(vu[i]); // the value returned as Ng is the curve's u-derivative at vu[i]
+      }
+      
+    public:
+      vfloat<M> U; // inputs as passed to the constructor
+      vfloat<M> V;
+      vfloat<M> T;
+      int i, N; // i: first segment index of this batch, N: segment count used to scale vu
+      NativeCurve3ff curve3D;
+      
+    public:
+      vbool<M> valid;
+      vfloat<M> vu; // outputs written by finalize()
+      vfloat<M> vv;
+      vfloat<M> vt;
+    };
+
+    /* squared distance of point p0 to line p1->p2, returned undivided as the pair (numerator, denominator) */
+    __forceinline std::pair<vfloatx,vfloatx> sqr_point_line_distance(const Vec2vfx& p0, const Vec2vfx& p1, const Vec2vfx& p2)
+    {
+      const vfloatx num = det(p2-p1,p1-p0);
+      const vfloatx den2 = dot(p2-p1,p2-p1); // squared length of the line direction p2-p1
+      return std::make_pair(num*num,den2); // first/second is the squared distance; no division is performed here
+    }
+    
+    /* performs culling against a cylinder: per lane true where p0 is within distance r of the line p1->p2 */
+    __forceinline vboolx cylinder_culling_test(const Vec2vfx& p0, const Vec2vfx& p1, const Vec2vfx& p2, const vfloatx& r)
+    {
+      const std::pair<vfloatx,vfloatx> d = sqr_point_line_distance(p0,p1,p2);
+      return d.first <= r*r*d.second; // compares num^2 <= r^2*den2, i.e. the squared distance test without dividing
+    }
+
+    template<typename NativeCurve3ff, typename Epilog>
+    __forceinline bool intersect_ribbon(const Vec3fa& ray_org, const Vec3fa& ray_dir, const float ray_tnear, const float& ray_tfar,
+                                        const LinearSpace3fa& ray_space, const float& depth_scale,
+                                        const NativeCurve3ff& curve3D, const int N,
+                                        const Epilog& epilog)
+    { // intersects the curve, cut into N segments, as quads in ray space; ray_dir is not used in this body
+      /* transform control points into ray space */
+      const NativeCurve3ff curve2D = curve3D.xfm_pr(ray_space,ray_org);
+      float eps = 4.0f*float(ulp)*reduce_max(max(abs(curve2D.v0),abs(curve2D.v1),abs(curve2D.v2),abs(curve2D.v3))); // tolerance: 4*ulp scaled by the largest absolute control point component
+      
+      /* evaluate the bezier curve */
+      bool ishit = false;
+      vboolx valid = vfloatx(step) < vfloatx(float(N)); // first batch starts at segment 0; lanes at or beyond N are disabled
+      const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(0,N);
+      const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(0,N);
+      valid &= cylinder_culling_test(zero,Vec2vfx(p0.x,p0.y),Vec2vfx(p1.x,p1.y),max(p0.w,p1.w)); // origin against segment p0->p1 with the larger w as radius
+      
+      if (any(valid)) 
+      {
+        Vec3vfx dp0dt = curve2D.template derivative0<VSIZEX>(0,N);
+        Vec3vfx dp1dt = curve2D.template derivative1<VSIZEX>(0,N);
+        dp0dt = select(reduce_max(abs(dp0dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp0dt); // derivative below eps: use the chord p1-p0 instead
+        dp1dt = select(reduce_max(abs(dp1dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp1dt);
+        const Vec3vfx n0(dp0dt.y,-dp0dt.x,0.0f); // (y,-x,0): perpendicular to the derivative in the xy plane
+        const Vec3vfx n1(dp1dt.y,-dp1dt.x,0.0f);
+        const Vec3vfx nn0 = normalize(n0);
+        const Vec3vfx nn1 = normalize(n1);
+        const Vec3vfx lp0 = madd(p0.w,nn0,Vec3vfx(p0)); // quad corners: segment end points offset by w along nn (lp) and against it (up)
+        const Vec3vfx lp1 = madd(p1.w,nn1,Vec3vfx(p1));
+        const Vec3vfx up0 = nmadd(p0.w,nn0,Vec3vfx(p0));
+        const Vec3vfx up1 = nmadd(p1.w,nn1,Vec3vfx(p1));
+        
+        vfloatx vu,vv,vt;
+        vboolx valid0 = intersect_quad_backface_culling(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt); // in ray space the ray is passed as origin zero, direction (0,0,1)
+
+        if (any(valid0))
+        {
+          /* ignore self intersections */
+          if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) {
+            vfloatx r = lerp(p0.w, p1.w, vu);
+            valid0 &= vt > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale;
+          }
+          
+          if (any(valid0))
+          {
+            vv = madd(2.0f,vv,vfloatx(-1.0f)); // presumably 2*vv-1, assuming madd(a,b,c) is a*b+c - confirm
+            RibbonHit<NativeCurve3ff,VSIZEX> bhit(valid0,vu,vv,vt,0,N,curve3D);
+            ishit |= epilog(bhit.valid,bhit);
+          }
+        }
+      }
+      
+      if (unlikely(VSIZEX < N)) 
+      {
+        /* process SIMD-size many segments per iteration */
+        for (int i=VSIZEX; i<N; i+=VSIZEX) // same steps as the first batch above, starting at segment i
+        {
+          /* evaluate the bezier curve */
+          vboolx valid = vintx(i)+vintx(step) < vintx(N);
+          const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(i,N);
+          const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(i,N);
+          valid &= cylinder_culling_test(zero,Vec2vfx(p0.x,p0.y),Vec2vfx(p1.x,p1.y),max(p0.w,p1.w));
+          if (none(valid)) continue;
+          
+          Vec3vfx dp0dt = curve2D.template derivative0<VSIZEX>(i,N);
+          Vec3vfx dp1dt = curve2D.template derivative1<VSIZEX>(i,N);
+          dp0dt = select(reduce_max(abs(dp0dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp0dt); // derivative below eps: use the chord p1-p0 instead
+          dp1dt = select(reduce_max(abs(dp1dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp1dt);
+          const Vec3vfx n0(dp0dt.y,-dp0dt.x,0.0f);
+          const Vec3vfx n1(dp1dt.y,-dp1dt.x,0.0f);
+          const Vec3vfx nn0 = normalize(n0);
+          const Vec3vfx nn1 = normalize(n1);
+          const Vec3vfx lp0 = madd(p0.w,nn0,Vec3vfx(p0));
+          const Vec3vfx lp1 = madd(p1.w,nn1,Vec3vfx(p1));
+          const Vec3vfx up0 = nmadd(p0.w,nn0,Vec3vfx(p0));
+          const Vec3vfx up1 = nmadd(p1.w,nn1,Vec3vfx(p1));
+          
+          vfloatx vu,vv,vt;
+          vboolx valid0 = intersect_quad_backface_culling(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt);
+
+          if (any(valid0))
+          {
+            /* ignore self intersections */
+            if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) {
+              vfloatx r = lerp(p0.w, p1.w, vu);
+              valid0 &= vt > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale;
+            }
+            
+            if (any(valid0))
+            {
+              vv = madd(2.0f,vv,vfloatx(-1.0f));
+              RibbonHit<NativeCurve3ff,VSIZEX> bhit(valid0,vu,vv,vt,i,N,curve3D);
+              ishit |= epilog(bhit.valid,bhit);
+            }
+          }
+        }
+      }
+      return ishit; // true if any epilog call returned true
+    }
+        
+    template<template<typename Ty> class NativeCurve>
+    struct RibbonCurve1Intersector1
+    { // single-ray front end for intersect_ribbon
+      typedef NativeCurve<Vec3ff> NativeCurve3ff;
+      
+      template<typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
+                                   IntersectContext* context,
+                                   const CurveGeometry* geom, const unsigned int primID,
+                                   const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3,
+                                   const Epilog& epilog)
+      { // primID is not used in this body
+        const int N = geom->tessellationRate; // number of segments the curve is cut into
+        NativeCurve3ff curve(v0,v1,v2,v3);
+        curve = enlargeRadiusToMinWidth(context,geom,ray.org,curve);
+        return intersect_ribbon<NativeCurve3ff>(ray.org,ray.dir,ray.tnear(),ray.tfar,
+                                                pre.ray_space,pre.depth_scale,
+                                                curve,N,
+                                                epilog);
+      }
+    };
+    
+    template<template<typename Ty> class NativeCurve, int K>
+    struct RibbonCurve1IntersectorK
+    { // ray-packet front end for intersect_ribbon: processes lane k of the packet
+      typedef NativeCurve<Vec3ff> NativeCurve3ff;
+      
+      template<typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& ray, size_t k,
+                                   IntersectContext* context,
+                                   const CurveGeometry* geom, const unsigned int primID,
+                                   const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3,
+                                   const Epilog& epilog)
+      { // primID is not used in this body
+        const int N = geom->tessellationRate; // number of segments the curve is cut into
+        const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); // origin of lane k
+        const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); // direction of lane k
+        NativeCurve3ff curve(v0,v1,v2,v3);
+        curve = enlargeRadiusToMinWidth(context,geom,ray_org,curve);
+        return intersect_ribbon<NativeCurve3ff>(ray_org,ray_dir,ray.tnear()[k],ray.tfar[k],
+                                                pre.ray_space[k],pre.depth_scale[k],
+                                                curve,N,
+                                                epilog);
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_sweep.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_sweep.h
new file mode 100644
index 0000000000..883cedc3d2
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_sweep.h
@@ -0,0 +1,362 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "cylinder.h"
+#include "plane.h"
+#include "line_intersector.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    static const size_t numJacobianIterations = 5; // iteration cap of the loop in intersect_bezier_iterative_jacobian
+#if defined(__AVX__)
+    static const size_t numBezierSubdivisions = 2; // used as maxDepth and to size the stack in intersect_bezier_recursive_jacobian (which is 8-wide under __AVX__)
+#else
+    static const size_t numBezierSubdivisions = 3; // same role; intersect_bezier_recursive_jacobian is 4-wide without __AVX__
+#endif
+
+    struct BezierCurveHit // hit record that the functions in this file construct and hand to their epilog callable
+    {
+      __forceinline BezierCurveHit() {} // has no member initializers
+      /* Hit without an explicit v: v is set to 0.0f. */
+      __forceinline BezierCurveHit(const float t, const float u, const Vec3fa& Ng)
+        : t(t), u(u), v(0.0f), Ng(Ng) {}
+      /* Hit with all four members supplied by the caller. */
+      __forceinline BezierCurveHit(const float t, const float u, const float v, const Vec3fa& Ng)
+        : t(t), u(u), v(v), Ng(Ng) {}
+      /* No-op for this hit type; NOTE(review): presumably present so epilogs can call finalize() uniformly -- confirm. */
+      __forceinline void finalize() {}
+      
+    public:
+      float t; // ray parameter of the hit; the producers in this file pass their local t plus dt
+      float u; // curve parameter of the hit
+      float v; // second parameter; 0.0f when the three-argument constructor is used
+      Vec3fa Ng; // normal vector, stored exactly as passed in (no normalization here)
+    };
+    
+    template<typename NativeCurve3ff, typename Ray, typename Epilog>
+    __forceinline bool intersect_bezier_iterative_debug(const Ray& ray, const float dt, const NativeCurve3ff& curve, size_t i,
+                                                        const vfloatx& u, const BBox<vfloatx>& tp, const BBox<vfloatx>& h0, const BBox<vfloatx>& h1, 
+                                                        const Vec3vfx& Ng, const Vec4vfx& dP0du, const Vec4vfx& dP3du,
+                                                        const Epilog& epilog)
+    { // Debug variant: reports the start of lane i's interval tp as the hit, with no refinement. Only referenced from commented-out calls in intersect_bezier_recursive_jacobian; the curve argument is not used.
+      if (tp.lower[i]+dt > ray.tfar) return false; // interval starts beyond the current ray.tfar
+      Vec3fa Ng_o = Vec3fa(Ng.x[i],Ng.y[i],Ng.z[i]); // default: lane i of the supplied normal
+      if (h0.lower[i] == tp.lower[i]) Ng_o = -Vec3fa(dP0du.x[i],dP0du.y[i],dP0du.z[i]); // interval starts exactly where h0 starts: use -dP0du instead
+      if (h1.lower[i] == tp.lower[i]) Ng_o = +Vec3fa(dP3du.x[i],dP3du.y[i],dP3du.z[i]); // interval starts exactly where h1 starts: use +dP3du (takes precedence if both match)
+      BezierCurveHit hit(tp.lower[i]+dt,u[i],Ng_o); // dt is added to the interval start to form the reported t
+      return epilog(hit);
+    }
+
+    template<typename NativeCurve3ff, typename Ray, typename Epilog> 
+     __forceinline bool intersect_bezier_iterative_jacobian(const Ray& ray, const float dt, const NativeCurve3ff& curve, float u, float t, const Epilog& epilog)
+    { // Refines the start guess (u,t) with at most numJacobianIterations steps of (u,t) -= rcp(J)*(f,g); once both residuals are within their error estimates it reports the hit through epilog and returns its result, otherwise returns false.
+      const Vec3fa org = zero; // the ray origin is the zero vector here; the callers in this file pass a curve that was translated accordingly, and dt is added back to t below
+      const Vec3fa dir = ray.dir;
+      const float length_ray_dir = length(dir);
+
+      /* error of curve evaluations is proportional to largest coordinate */
+      const BBox3ff box = curve.bounds();
+      const float P_err = 16.0f*float(ulp)*reduce_max(max(abs(box.lower),abs(box.upper)));
+     
+      for (size_t i=0; i<numJacobianIterations; i++) 
+      {
+        const Vec3fa Q = madd(Vec3fa(t),dir,org); // ray point for the current t
+        //const Vec3fa dQdu = zero;
+        const Vec3fa dQdt = dir;
+        const float Q_err = 16.0f*float(ulp)*length_ray_dir*t; // works as org=zero here
+           
+        Vec3ff P,dPdu,ddPdu; curve.eval(u,P,dPdu,ddPdu); // curve outputs at the current u; going by the names: point, first and second derivative
+        //const Vec3fa dPdt = zero;
+
+        const Vec3fa R = Q-P; // offset from the curve sample to the ray point
+        const float len_R = length(R); //reduce_max(abs(R));
+        const float R_err = max(Q_err,P_err);
+        const Vec3fa dRdu = /*dQdu*/-dPdu;
+        const Vec3fa dRdt = dQdt;//-dPdt;
+
+        const Vec3fa T = normalize(dPdu);
+        const Vec3fa dTdu = dnormalize(dPdu,ddPdu);
+        //const Vec3fa dTdt = zero;
+        const float cos_err = P_err/length(dPdu);
+
+        /* Error estimate for dot(R,T):
+
+           dot(R,T) = cos(R,T) |R| |T|
+                    = (cos(R,T) +- cos_error) * (|R| +- |R|_err) * (|T| +- |T|_err)
+                    = cos(R,T)*|R|*|T| 
+                      +- cos(R,T)*(|R|*|T|_err + |T|*|R|_err)
+                      +- cos_error*(|R| + |T|)
+                      +- lower order terms
+           with cos(R,T) being in [0,1] and |T| = 1 we get:
+             dot(R,T)_err = |R|*|T|_err + |R|_err = cos_error*(|R|+1)
+        */
+              
+        const float f = dot(R,T); // first residual: the part of R along T
+        const float f_err = len_R*P_err + R_err + cos_err*(1.0f+len_R);
+        const float dfdu = dot(dRdu,T) + dot(R,dTdu);
+        const float dfdt = dot(dRdt,T);// + dot(R,dTdt);
+
+        const float K = dot(R,R)-sqr(f); // |R|^2 minus f^2
+        const float dKdu = /*2.0f*/(dot(R,dRdu)-f*dfdu);
+        const float dKdt = /*2.0f*/(dot(R,dRdt)-f*dfdt);
+        const float rsqrt_K = rsqrt(K);
+
+        const float g = sqrt(K)-P.w; // second residual: sqrt(K) minus the w component of P (NOTE(review): w presumably holds the curve radius -- confirm)
+        const float g_err = R_err + f_err + 16.0f*float(ulp)*box.upper.w;
+        const float dgdu = /*0.5f*/dKdu*rsqrt_K-dPdu.w; // the commented-out 0.5f here cancels the commented-out 2.0f in dKdu/dKdt
+        const float dgdt = /*0.5f*/dKdt*rsqrt_K;//-dPdt.w;
+
+        const LinearSpace2f J = LinearSpace2f(dfdu,dfdt,dgdu,dgdt); // the four partial derivatives of (f,g) with respect to (u,t)
+        const Vec2f dut = rcp(J)*Vec2f(f,g);
+        const Vec2f ut = Vec2f(u,t) - dut;
+        u = ut.x; t = ut.y; // the step is applied before the tolerance test below
+
+        if (abs(f) < f_err && abs(g) < g_err) // f and g here are the residuals computed before the step
+        {
+          t+=dt; // add dt back so t can be compared with the ray's own tnear/tfar
+          if (!(ray.tnear() <= t && t <= ray.tfar)) return false; // rejects NaNs
+          if (!(u >= 0.0f && u <= 1.0f)) return false; // rejects NaNs
+          const Vec3fa R = normalize(Q-P); // shadows the outer R; Q and P still hold the pre-step values
+          const Vec3fa U = madd(Vec3fa(dPdu.w),R,dPdu);
+          const Vec3fa V = cross(dPdu,R);
+          BezierCurveHit hit(t,u,cross(V,U)); // cross(V,U) is stored as the hit's Ng
+          return epilog(hit);
+        }
+      }
+      return false; // no iteration passed the tolerance test
+    }
+
+    template<typename NativeCurve3ff, typename Ray, typename Epilog>
+    bool intersect_bezier_recursive_jacobian(const Ray& ray, const float dt, const NativeCurve3ff& curve,
+                                             float u0, float u1, unsigned int depth, const Epilog& epilog)
+    { // Searches the parameter range [u0,u1] of curve for ray hits: the range is split into vfloatx::size-1 segments, each is tested against an outer/inner cylinder and two cap half-planes, segments at terminal depth go to intersect_bezier_iterative_jacobian and the rest are revisited through an explicit stack. Returns true if any epilog call returned true.
+#if defined(__AVX__)
+      typedef vbool8 vboolx; // maximally 8-wide to work around KNL issues
+      typedef vint8 vintx; 
+      typedef vfloat8 vfloatx;
+#else
+      typedef vbool4 vboolx;
+      typedef vint4 vintx; 
+      typedef vfloat4 vfloatx;
+#endif
+      typedef Vec3<vfloatx> Vec3vfx;
+      typedef Vec4<vfloatx> Vec4vfx;
+    
+      unsigned int maxDepth = numBezierSubdivisions;
+      bool found = false; // accumulates the results of all refinement calls
+      const Vec3fa org = zero; // the ray origin is the zero vector here; dt is added wherever a t is compared with ray.tfar
+      const Vec3fa dir = ray.dir;
+
+      unsigned int sptr = 0; // number of entries currently on the stack
+      const unsigned int stack_size = numBezierSubdivisions+1; // +1 because of unstable workaround below
+      struct StackEntry {
+        vboolx valid; // lanes (sub-segments) that still have to be visited
+        vfloatx tlower; // per-lane interval start, used for culling and ordering when popping
+        float u0; // parameter range that was subdivided to produce these lanes
+        float u1;
+        unsigned int depth; // depth to use when one of these lanes is processed
+      };
+      StackEntry stack[stack_size];
+      goto entry; // the first pass skips the pop step and processes the caller-supplied u0, u1 and depth directly
+
+       /* terminate if stack is empty */
+      while (sptr)
+      {
+        /* pop from stack */
+        {
+          sptr--;
+          vboolx valid = stack[sptr].valid;
+          const vfloatx tlower = stack[sptr].tlower;
+          valid &= tlower+dt <= ray.tfar; // drop lanes that start beyond the current ray.tfar
+          if (none(valid)) continue; // entry is exhausted; back to the while (sptr) test
+          u0 = stack[sptr].u0;
+          u1 = stack[sptr].u1;
+          depth = stack[sptr].depth;
+          const size_t i = select_min(valid,tlower); clear(valid,i); // pick a lane (by the helper's name: the valid one with the smallest tlower) and mark it visited
+          stack[sptr].valid = valid;
+          if (any(valid)) sptr++; // there are still items on the stack
+
+          /* process next segment */
+          const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(vfloatx::size-1)));
+          u0 = vu0[i+0]; // narrow [u0,u1] to sub-segment i
+          u1 = vu0[i+1];
+        }
+      entry:
+
+        /* subdivide curve */
+        const float dscale = (u1-u0)*(1.0f/(3.0f*(vfloatx::size-1)));
+        const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(vfloatx::size-1))); // sample parameters spaced by 1/(size-1) across [u0,u1]
+        Vec4vfx P0, dP0du; curve.veval(vu0,P0,dP0du); dP0du = dP0du * Vec4vfx(dscale);
+        const Vec4vfx P3 = shift_right_1(P0);
+        const Vec4vfx dP3du = shift_right_1(dP0du); 
+        const Vec4vfx P1 = P0 + dP0du; // P0..P3 are used below as the four control points of each sub-segment
+        const Vec4vfx P2 = P3 - dP3du;
+        
+        /* calculate bounding cylinders */
+        const vfloatx rr1 = sqr_point_to_line_distance(Vec3vfx(dP0du),Vec3vfx(P3-P0));
+        const vfloatx rr2 = sqr_point_to_line_distance(Vec3vfx(dP3du),Vec3vfx(P3-P0));
+        const vfloatx maxr12 = sqrt(max(rr1,rr2));
+        const vfloatx one_plus_ulp  = 1.0f+2.0f*float(ulp);
+        const vfloatx one_minus_ulp = 1.0f-2.0f*float(ulp);
+        vfloatx r_outer = max(P0.w,P1.w,P2.w,P3.w)+maxr12;
+        vfloatx r_inner = min(P0.w,P1.w,P2.w,P3.w)-maxr12;
+        r_outer = one_plus_ulp*r_outer; // outer radius is scaled up slightly
+        r_inner = max(0.0f,one_minus_ulp*r_inner); // inner radius is scaled down slightly and clamped at zero
+        const CylinderN<vfloatx::size> cylinder_outer(Vec3vfx(P0),Vec3vfx(P3),r_outer);
+        const CylinderN<vfloatx::size> cylinder_inner(Vec3vfx(P0),Vec3vfx(P3),r_inner);
+        vboolx valid = true; clear(valid,vfloatx::size-1); // the last lane is excluded from the start (NOTE(review): presumably because it has no successor sample for P3 -- confirm)
+        
+        /* intersect with outer cylinder */
+        BBox<vfloatx> tc_outer; vfloatx u_outer0; Vec3vfx Ng_outer0; vfloatx u_outer1; Vec3vfx Ng_outer1;
+        valid &= cylinder_outer.intersect(org,dir,tc_outer,u_outer0,Ng_outer0,u_outer1,Ng_outer1);
+        if (none(valid)) continue; // back to the while (sptr) test
+        
+        /* intersect with cap-planes */
+        BBox<vfloatx> tp(ray.tnear()-dt,ray.tfar-dt); // the ray's own t range with dt subtracted
+        tp = embree::intersect(tp,tc_outer);
+        BBox<vfloatx> h0 = HalfPlaneN<vfloatx::size>(Vec3vfx(P0),+Vec3vfx(dP0du)).intersect(org,dir);
+        tp = embree::intersect(tp,h0);
+        BBox<vfloatx> h1 = HalfPlaneN<vfloatx::size>(Vec3vfx(P3),-Vec3vfx(dP3du)).intersect(org,dir);
+        tp = embree::intersect(tp,h1);
+        valid &= tp.lower <= tp.upper; // keep lanes whose interval is non-empty
+        if (none(valid)) continue;
+        
+        /* clamp and correct u parameter */
+        u_outer0 = clamp(u_outer0,vfloatx(0.0f),vfloatx(1.0f));
+        u_outer1 = clamp(u_outer1,vfloatx(0.0f),vfloatx(1.0f));
+        u_outer0 = lerp(u0,u1,(vfloatx(step)+u_outer0)*(1.0f/float(vfloatx::size))); // NOTE(review): scales by 1/size although the samples above are spaced by 1/(size-1); the value only seeds intersect_bezier_iterative_jacobian -- confirm this is intended
+        u_outer1 = lerp(u0,u1,(vfloatx(step)+u_outer1)*(1.0f/float(vfloatx::size)));
+        
+        /* intersect with inner cylinder */
+        BBox<vfloatx> tc_inner;
+        vfloatx u_inner0 = zero; Vec3vfx Ng_inner0 = zero; vfloatx u_inner1 = zero; Vec3vfx Ng_inner1 = zero;
+        const vboolx valid_inner = cylinder_inner.intersect(org,dir,tc_inner,u_inner0,Ng_inner0,u_inner1,Ng_inner1);
+        
+        /* at the unstable area we subdivide deeper */
+        const vboolx unstable0 = (!valid_inner) | (abs(dot(Vec3vfx(Vec3fa(ray.dir)),Ng_inner0)) < 0.3f);
+        const vboolx unstable1 = (!valid_inner) | (abs(dot(Vec3vfx(Vec3fa(ray.dir)),Ng_inner1)) < 0.3f);
+      
+        /* subtract the inner interval from the current hit interval */
+        BBox<vfloatx> tp0, tp1;
+        subtract(tp,tc_inner,tp0,tp1);
+        vboolx valid0 = valid & (tp0.lower <= tp0.upper);
+        vboolx valid1 = valid & (tp1.lower <= tp1.upper);
+        if (none(valid0 | valid1)) continue;
+        
+        /* iterate over all first hits front to back */
+        const vintx termDepth0 = select(unstable0,vintx(maxDepth+1),vintx(maxDepth)); // unstable lanes get one extra subdivision level
+        vboolx recursion_valid0 = valid0 & (depth < termDepth0); // lanes that are subdivided further
+        valid0 &= depth >= termDepth0; // lanes that are refined right now
+        
+        while (any(valid0))
+        {
+          const size_t i = select_min(valid0,tp0.lower); clear(valid0,i);
+          found = found | intersect_bezier_iterative_jacobian(ray,dt,curve,u_outer0[i],tp0.lower[i],epilog); // start guess: u_outer0[i] and the start of tp0
+          //found = found | intersect_bezier_iterative_debug   (ray,dt,curve,i,u_outer0,tp0,h0,h1,Ng_outer0,dP0du,dP3du,epilog);
+          valid0 &= tp0.lower+dt <= ray.tfar; // ray.tfar is re-read after every epilog call (NOTE(review): presumably because the epilog can shorten it -- confirm)
+        }
+        valid1 &= tp1.lower+dt <= ray.tfar;
+        
+        /* iterate over all second hits front to back */
+        const vintx termDepth1 = select(unstable1,vintx(maxDepth+1),vintx(maxDepth));
+        vboolx recursion_valid1 = valid1 & (depth < termDepth1);
+        valid1 &= depth >= termDepth1;
+        while (any(valid1))
+        {
+          const size_t i = select_min(valid1,tp1.lower); clear(valid1,i);
+          found = found | intersect_bezier_iterative_jacobian(ray,dt,curve,u_outer1[i],tp1.upper[i],epilog); // start guess uses the end of tp1 (tp1.upper), unlike the first-hit loop which uses tp0.lower
+          //found = found | intersect_bezier_iterative_debug   (ray,dt,curve,i,u_outer1,tp1,h0,h1,Ng_outer1,dP0du,dP3du,epilog);
+          valid1 &= tp1.lower+dt <= ray.tfar;
+        }
+
+        /* push valid segments to stack */
+        recursion_valid0 &= tp0.lower+dt <= ray.tfar;
+        recursion_valid1 &= tp1.lower+dt <= ray.tfar;
+        const vboolx recursion_valid = recursion_valid0 | recursion_valid1;
+        if (any(recursion_valid))
+        {
+          assert(sptr < stack_size);
+          stack[sptr].valid = recursion_valid;
+          stack[sptr].tlower = select(recursion_valid0,tp0.lower,tp1.lower); // start of tp0 where the first interval recurses, otherwise start of tp1
+          stack[sptr].u0 = u0;
+          stack[sptr].u1 = u1;
+          stack[sptr].depth = depth+1; // lanes of this entry are processed one level deeper
+          sptr++;
+        }
+      }
+      return found;
+    }
+
+    template<template<typename Ty> class NativeCurve>
+    struct SweepCurve1Intersector1 // single-ray front end for intersect_bezier_recursive_jacobian
+    {
+      typedef NativeCurve<Vec3ff> NativeCurve3ff;
+      /* Builds the curve from v0..v3, translates it by a point on the ray and returns the result of the recursive intersector; pre and primID are not read in this body. */
+      template<typename Epilog>
+      __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
+                                IntersectContext* context,
+                                const CurveGeometry* geom, const unsigned int primID,
+                                const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3,
+                                const Epilog& epilog)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+
+        /* move ray closer to make intersection stable */
+        NativeCurve3ff curve0(v0,v1,v2,v3);
+        curve0 = enlargeRadiusToMinWidth(context,geom,ray.org,curve0);
+        const float dt = dot(curve0.center()-ray.org,ray.dir)*rcp(dot(ray.dir,ray.dir)); // with rcp as the reciprocal: ray parameter of the projection of curve0.center() onto the ray
+        const Vec3ff ref(madd(Vec3fa(dt),ray.dir,ray.org),0.0f); // shift point built from dt, ray.dir and ray.org; its fourth component is 0.0f
+        const NativeCurve3ff curve1 = curve0-ref; // curve expressed relative to ref
+        return intersect_bezier_recursive_jacobian(ray,dt,curve1,0.0f,1.0f,1,epilog); // full parameter range [0,1], starting depth 1
+      }
+    };
+
+    template<template<typename Ty> class NativeCurve, int K>
+    struct SweepCurve1IntersectorK // front end for intersect_bezier_recursive_jacobian operating on one lane of a K-wide ray packet
+    {
+      typedef NativeCurve<Vec3ff> NativeCurve3ff;
+      /* Single-ray view of lane k of a packet, offering the org/dir/tnear()/tfar members the sweep functions in this file access. */
+      struct Ray1
+      {
+        __forceinline Ray1(RayK<K>& ray, size_t k)
+          : org(ray.org.x[k],ray.org.y[k],ray.org.z[k]), dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]), _tnear(ray.tnear()[k]), tfar(ray.tfar[k]) {}
+
+        Vec3fa org; // copy of lane k of the packet origin
+        Vec3fa dir; // copy of lane k of the packet direction
+        float _tnear; // copy of lane k of tnear; changes made here do not reach the packet
+        float& tfar; // reference bound to ray.tfar[k], so reads and writes go to the packet itself
+
+        __forceinline float& tnear() { return _tnear; }
+        //__forceinline float& tfar()  { return _tfar; }
+        __forceinline const float& tnear() const { return _tnear; }
+        //__forceinline const float& tfar()  const { return _tfar; }
+        
+      };
+      /* Same steps as SweepCurve1Intersector1::intersect, applied to lane k of vray; pre and primID are not read in this body. */
+      template<typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k,
+                                   IntersectContext* context,
+                                   const CurveGeometry* geom, const unsigned int primID,
+                                   const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3,
+                                   const Epilog& epilog)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Ray1 ray(vray,k); // single-ray view of lane k
+
+        /* move ray closer to make intersection stable */
+        NativeCurve3ff curve0(v0,v1,v2,v3);
+        curve0 = enlargeRadiusToMinWidth(context,geom,ray.org,curve0);
+        const float dt = dot(curve0.center()-ray.org,ray.dir)*rcp(dot(ray.dir,ray.dir)); // with rcp as the reciprocal: ray parameter of the projection of curve0.center() onto the ray
+        const Vec3ff ref(madd(Vec3fa(dt),ray.dir,ray.org),0.0f); // shift point built from dt, ray.dir and ray.org; its fourth component is 0.0f
+        const NativeCurve3ff curve1 = curve0-ref; // curve expressed relative to ref
+        return intersect_bezier_recursive_jacobian(ray,dt,curve1,0.0f,1.0f,1,epilog); // full parameter range [0,1], starting depth 1
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual.h
new file mode 100644
index 0000000000..e1f4238130
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual.h
@@ -0,0 +1,671 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../subdiv/bezier_curve.h"
+#include "../common/primref.h"
+#include "curve_intersector_precalculations.h"
+#include "../bvh/node_intersector1.h"
+#include "../bvh/node_intersector_packet.h"
+
+#include "intersector_epilog.h"
+
+#include "../subdiv/bezier_curve.h"
+#include "../subdiv/bspline_curve.h"
+#include "../subdiv/hermite_curve.h"
+#include "../subdiv/catmullrom_curve.h"
+
+#include "spherei_intersector.h"
+#include "disci_intersector.h"
+
+#include "linei_intersector.h"
+#include "roundlinei_intersector.h"
+#include "conelinei_intersector.h"
+
+#include "curveNi_intersector.h"
+#include "curveNv_intersector.h"
+#include "curveNi_mb_intersector.h"
+
+#include "curve_intersector_distance.h"
+#include "curve_intersector_ribbon.h"
+#include "curve_intersector_oriented.h"
+#include "curve_intersector_sweep.h"
+
+namespace embree
+{
+  struct VirtualCurveIntersector // table of type-erased intersect/occluded entry points, one row per geometry type
+  {
+    typedef void (*Intersect1Ty)(void* pre, void* ray, IntersectContext* context, const void* primitive);
+    typedef bool (*Occluded1Ty )(void* pre, void* ray, IntersectContext* context, const void* primitive);
+    /* The packet signatures below additionally take an index k; the 4-, 8- and 16-wide typedefs differ only in their names. */
+    typedef void (*Intersect4Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+    typedef bool (*Occluded4Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+    
+    typedef void (*Intersect8Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+    typedef bool (*Occluded8Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+    
+    typedef void (*Intersect16Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+    typedef bool (*Occluded16Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+
+  public:
+    struct Intersectors // one table row: the entry points for a single geometry type
+    {
+      Intersectors() {} // WARNING: Do not zero initialize this, as we otherwise get problems with thread unsafe local static variable initialization (e.g. on VS2013) in curve_intersector_virtual.cpp.
+      /* intersect<K>/occluded<K> are only declared here; explicit specializations for individual K follow after the enclosing struct. */
+      template<int K> void intersect(void* pre, void* ray, IntersectContext* context, const void* primitive);
+      template<int K> bool occluded (void* pre, void* ray, IntersectContext* context, const void* primitive);
+
+      template<int K> void intersect(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+      template<int K> bool occluded (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+
+    public:
+      Intersect1Ty intersect1; // none of these pointers is initialized by the constructor above
+      Occluded1Ty  occluded1;
+      Intersect4Ty intersect4;
+      Occluded4Ty  occluded4;
+      Intersect8Ty intersect8;
+      Occluded8Ty  occluded8;
+      Intersect16Ty intersect16;
+      Occluded16Ty  occluded16;
+    };
+    
+    Intersectors vtbl[Geometry::GTY_END]; // the leaf intersectors in this file index it with the type value read from the primitive (vtbl[ty])
+  };
+
+  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<1> (void* pre, void* ray, IntersectContext* context, const void* primitive) { assert(intersect1); intersect1(pre,ray,context,primitive); } // K=1 entry points; every specialization in this group asserts that its pointer is set and then forwards all arguments unchanged
+  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<1>  (void* pre, void* ray, IntersectContext* context, const void* primitive) { assert(occluded1); return occluded1(pre,ray,context,primitive); }
+      /* K=4 entry points: same pattern, with the extra index k forwarded as well. */
+  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<4>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect4); intersect4(pre,ray,k,context,primitive); }
+  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<4> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded4); return occluded4(pre,ray,k,context,primitive); }
+      /* The K=8 and K=16 specializations below exist only when __AVX__ / __AVX512F__ is defined. */
+#if defined(__AVX__)
+  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<8>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect8); intersect8(pre,ray,k,context,primitive); }
+  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<8> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded8); return occluded8(pre,ray,k,context,primitive); }
+#endif
+  
+#if defined(__AVX512F__)
+  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<16>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect16); intersect16(pre,ray,k,context,primitive); }
+  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<16> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded16); return occluded16(pre,ray,k,context,primitive); }
+#endif
+  
+  namespace isa
+  {
+    struct VirtualCurveIntersector1 // single-ray leaf intersector that dispatches through the VirtualCurveIntersector table
+    {
+      typedef unsigned char Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+      /* Looks up the table row for the primitive's type and calls its intersect<1>; tray and lazy_node are not used in this body. */
+      template<int N, int Nx, bool robust>
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        assert(num == 1); // exactly one primitive is expected per call
+        RTCGeometryType ty = (RTCGeometryType)(*prim); // the type value is read from the first byte of the primitive
+        assert(This->leafIntersector);
+        VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; // table row for this geometry type
+        leafIntersector.intersect<1>(&pre,&ray,context,prim);
+      }
+
+      template<int N, int Nx, bool robust>      
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        assert(num == 1);
+        RTCGeometryType ty = (RTCGeometryType)(*prim); // same lookup as in intersect above
+        assert(This->leafIntersector);
+        VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty];
+        return leafIntersector.occluded<1>(&pre,&ray,context,prim); // returns the row's occluded<1> result unchanged
+      }
+    };
+
+    template<int K>
+      struct VirtualCurveIntersectorK // K-wide packet leaf intersector that dispatches through the VirtualCurveIntersector table
+      {
+        typedef unsigned char Primitive;
+        typedef CurvePrecalculationsK<K> Precalculations;
+        /* Packet intersect: calls the table row's intersect<K> once for each index taken from the bit mask of valid_i. tray and lazy_node are unused in all four functions of this struct. */
+        template<bool robust>        
+        static __forceinline void intersect(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          assert(num == 1); // exactly one primitive is expected per call
+          RTCGeometryType ty = (RTCGeometryType)(*prim); // the type value is read from the first byte of the primitive
+          assert(This->leafIntersector);
+          VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty];
+          size_t mask = movemask(valid_i);
+          while (mask) leafIntersector.intersect<K>(&pre,&ray,bscf(mask),context,prim); // bscf(mask) is the only thing that can change mask here, so the loop relies on it removing the index it returns
+        }
+        /* Packet occluded: starts from an all-false mask and marks index k whenever the row's occluded<K> call for k returned true. */
+        template<bool robust>        
+        static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          assert(num == 1);
+          RTCGeometryType ty = (RTCGeometryType)(*prim);
+          assert(This->leafIntersector);
+          VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty];
+          vbool<K> valid_o = false;
+          size_t mask = movemask(valid_i);
+          while (mask) {
+            size_t k = bscf(mask); // next index from the mask
+            if (leafIntersector.occluded<K>(&pre,&ray,k,context,prim))
+              set(valid_o, k);
+          }
+          return valid_o;
+        }
+        /* Variant for a single index k of the packet, called with a single-ray TravRay: one intersect<K> call. */
+        template<int N, int Nx, bool robust>              
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+        {
+          assert(num == 1);
+          RTCGeometryType ty = (RTCGeometryType)(*prim);
+          assert(This->leafIntersector);
+          VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty];
+          leafIntersector.intersect<K>(&pre,&ray,k,context,prim);
+        }
+        /* Variant for a single index k of the packet: returns the row's occluded<K> result unchanged. */
+        template<int N, int Nx, bool robust>      
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+        {
+          assert(num == 1);
+          RTCGeometryType ty = (RTCGeometryType)(*prim);
+          assert(This->leafIntersector);
+          VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty];
+          return leafIntersector.occluded<K>(&pre,&ray,k,context,prim);
+        }
+      };
+
+    template<int N>
+    static VirtualCurveIntersector::Intersectors LinearRoundConeNiIntersectors() // returns a table row whose pointers target the RoundLinearCurveMiIntersector1/K templates
+    {
+      VirtualCurveIntersector::Intersectors intersectors; // the constructor initializes nothing; only the assignments below set members
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &RoundLinearCurveMiIntersector1<N,N,true>::intersect; // each address is cast to the void*-based signature of the table
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &RoundLinearCurveMiIntersector1<N,N,true>::occluded;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &RoundLinearCurveMiIntersectorK<N,N,4,true>::intersect;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &RoundLinearCurveMiIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&RoundLinearCurveMiIntersectorK<N,N,8,true>::intersect;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &RoundLinearCurveMiIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&RoundLinearCurveMiIntersectorK<N,N,16,true>::intersect;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &RoundLinearCurveMiIntersectorK<N,N,16,true>::occluded;
+#endif
+      return intersectors; // the 8-wide / 16-wide members stay unassigned when __AVX__ / __AVX512F__ is not defined
+    }
+
+    template<int N>
+    static VirtualCurveIntersector::Intersectors LinearConeNiIntersectors() // returns a table row whose pointers target the ConeCurveMiIntersector1/K templates
+    {
+      VirtualCurveIntersector::Intersectors intersectors; // the constructor initializes nothing; only the assignments below set members
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &ConeCurveMiIntersector1<N,N,true>::intersect; // each address is cast to the void*-based signature of the table
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &ConeCurveMiIntersector1<N,N,true>::occluded;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &ConeCurveMiIntersectorK<N,N,4,true>::intersect;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &ConeCurveMiIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&ConeCurveMiIntersectorK<N,N,8,true>::intersect;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &ConeCurveMiIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&ConeCurveMiIntersectorK<N,N,16,true>::intersect;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &ConeCurveMiIntersectorK<N,N,16,true>::occluded;
+#endif
+      return intersectors; // the 8-wide / 16-wide members stay unassigned when __AVX__ / __AVX512F__ is not defined
+    }
+
+    template<int N>
+    static VirtualCurveIntersector::Intersectors LinearRoundConeNiMBIntersectors() // returns a table row whose pointers target the RoundLinearCurveMiMBIntersector1/K templates
+    {
+      VirtualCurveIntersector::Intersectors intersectors; // the constructor initializes nothing; only the assignments below set members
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &RoundLinearCurveMiMBIntersector1<N,N,true>::intersect; // each address is cast to the void*-based signature of the table
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &RoundLinearCurveMiMBIntersector1<N,N,true>::occluded;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &RoundLinearCurveMiMBIntersectorK<N,N,4,true>::intersect;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &RoundLinearCurveMiMBIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&RoundLinearCurveMiMBIntersectorK<N,N,8,true>::intersect;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &RoundLinearCurveMiMBIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&RoundLinearCurveMiMBIntersectorK<N,N,16,true>::intersect;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &RoundLinearCurveMiMBIntersectorK<N,N,16,true>::occluded;
+#endif
+      return intersectors; // the 8-wide / 16-wide members stay unassigned when __AVX__ / __AVX512F__ is not defined
+    }
+
+    template<int N>
+    static VirtualCurveIntersector::Intersectors LinearConeNiMBIntersectors() // returns a table row whose pointers target the ConeCurveMiMBIntersector1/K templates
+    {
+      VirtualCurveIntersector::Intersectors intersectors; // the constructor initializes nothing; only the assignments below set members
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &ConeCurveMiMBIntersector1<N,N,true>::intersect; // each address is cast to the void*-based signature of the table
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &ConeCurveMiMBIntersector1<N,N,true>::occluded;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &ConeCurveMiMBIntersectorK<N,N,4,true>::intersect;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &ConeCurveMiMBIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&ConeCurveMiMBIntersectorK<N,N,8,true>::intersect;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &ConeCurveMiMBIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&ConeCurveMiMBIntersectorK<N,N,16,true>::intersect;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &ConeCurveMiMBIntersectorK<N,N,16,true>::occluded;
+#endif
+      return intersectors; // the 8-wide / 16-wide members stay unassigned when __AVX__ / __AVX512F__ is not defined
+    }
+
+
+    template<int N>
+      static VirtualCurveIntersector::Intersectors LinearRibbonNiIntersectors() // returns a table row whose pointers target the FlatLinearCurveMiIntersector1/K templates
+    {
+      VirtualCurveIntersector::Intersectors intersectors; // the constructor initializes nothing; only the assignments below set members
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &FlatLinearCurveMiIntersector1<N,N,true>::intersect; // each address is cast to the void*-based signature of the table
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &FlatLinearCurveMiIntersector1<N,N,true>::occluded;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &FlatLinearCurveMiIntersectorK<N,N,4,true>::intersect;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &FlatLinearCurveMiIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&FlatLinearCurveMiIntersectorK<N,N,8,true>::intersect;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &FlatLinearCurveMiIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&FlatLinearCurveMiIntersectorK<N,N,16,true>::intersect;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &FlatLinearCurveMiIntersectorK<N,N,16,true>::occluded;
+#endif
+      return intersectors; // the 8-wide / 16-wide members stay unassigned when __AVX__ / __AVX512F__ is not defined
+    }
+    
+    template<int N>
+      static VirtualCurveIntersector::Intersectors LinearRibbonNiMBIntersectors() // returns a table row whose pointers target the FlatLinearCurveMiMBIntersector1/K templates
+    {
+      VirtualCurveIntersector::Intersectors intersectors; // the constructor initializes nothing; only the assignments below set members
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &FlatLinearCurveMiMBIntersector1<N,N,true>::intersect; // each address is cast to the void*-based signature of the table
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &FlatLinearCurveMiMBIntersector1<N,N,true>::occluded;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &FlatLinearCurveMiMBIntersectorK<N,N,4,true>::intersect;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &FlatLinearCurveMiMBIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&FlatLinearCurveMiMBIntersectorK<N,N,8,true>::intersect;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &FlatLinearCurveMiMBIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&FlatLinearCurveMiMBIntersectorK<N,N,16,true>::intersect;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &FlatLinearCurveMiMBIntersectorK<N,N,16,true>::occluded;
+#endif
+      return intersectors; // the 8-wide / 16-wide members stay unassigned when __AVX__ / __AVX512F__ is not defined
+    }
+    
+    // Returns a function-pointer table filled from the SphereMi intersector
+    // instantiations (<N,N,...,true>). The 1- and 4-suffixed slots are always
+    // assigned; the 8- and 16-suffixed slots are assigned only when __AVX__ /
+    // __AVX512F__ are defined. Each address is cast to its slot type.
+    template<int N>
+      static VirtualCurveIntersector::Intersectors SphereNiIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef SphereMiIntersector1<N,N,true>   Single;
+      typedef SphereMiIntersectorK<N,N,4,true> Wide4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Single::intersect;
+      table.occluded1  = (VCI::Occluded1Ty)  &Single::occluded;
+      table.intersect4 = (VCI::Intersect4Ty) &Wide4::intersect;
+      table.occluded4  = (VCI::Occluded4Ty)  &Wide4::occluded;
+#if defined(__AVX__)
+      typedef SphereMiIntersectorK<N,N,8,true> Wide8;
+      table.intersect8 = (VCI::Intersect8Ty) &Wide8::intersect;
+      table.occluded8  = (VCI::Occluded8Ty)  &Wide8::occluded;
+#endif
+#if defined(__AVX512F__)
+      typedef SphereMiIntersectorK<N,N,16,true> Wide16;
+      table.intersect16 = (VCI::Intersect16Ty) &Wide16::intersect;
+      table.occluded16  = (VCI::Occluded16Ty)  &Wide16::occluded;
+#endif
+      return table;
+    }
+    
+    // Returns a function-pointer table filled from the SphereMiMB intersector
+    // instantiations (<N,N,...,true>). The 1- and 4-suffixed slots are always
+    // assigned; the 8- and 16-suffixed slots are assigned only when __AVX__ /
+    // __AVX512F__ are defined. Each address is cast to its slot type.
+    template<int N>
+      static VirtualCurveIntersector::Intersectors SphereNiMBIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef SphereMiMBIntersector1<N,N,true>   Single;
+      typedef SphereMiMBIntersectorK<N,N,4,true> Wide4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Single::intersect;
+      table.occluded1  = (VCI::Occluded1Ty)  &Single::occluded;
+      table.intersect4 = (VCI::Intersect4Ty) &Wide4::intersect;
+      table.occluded4  = (VCI::Occluded4Ty)  &Wide4::occluded;
+#if defined(__AVX__)
+      typedef SphereMiMBIntersectorK<N,N,8,true> Wide8;
+      table.intersect8 = (VCI::Intersect8Ty) &Wide8::intersect;
+      table.occluded8  = (VCI::Occluded8Ty)  &Wide8::occluded;
+#endif
+#if defined(__AVX512F__)
+      typedef SphereMiMBIntersectorK<N,N,16,true> Wide16;
+      table.intersect16 = (VCI::Intersect16Ty) &Wide16::intersect;
+      table.occluded16  = (VCI::Occluded16Ty)  &Wide16::occluded;
+#endif
+      return table;
+    }
+    
+    // Returns a function-pointer table filled from the DiscMi intersector
+    // instantiations (<N,N,...,true>). The 1- and 4-suffixed slots are always
+    // assigned; the 8- and 16-suffixed slots are assigned only when __AVX__ /
+    // __AVX512F__ are defined. Each address is cast to its slot type.
+    template<int N>
+      static VirtualCurveIntersector::Intersectors DiscNiIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef DiscMiIntersector1<N,N,true>   Single;
+      typedef DiscMiIntersectorK<N,N,4,true> Wide4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Single::intersect;
+      table.occluded1  = (VCI::Occluded1Ty)  &Single::occluded;
+      table.intersect4 = (VCI::Intersect4Ty) &Wide4::intersect;
+      table.occluded4  = (VCI::Occluded4Ty)  &Wide4::occluded;
+#if defined(__AVX__)
+      typedef DiscMiIntersectorK<N,N,8,true> Wide8;
+      table.intersect8 = (VCI::Intersect8Ty) &Wide8::intersect;
+      table.occluded8  = (VCI::Occluded8Ty)  &Wide8::occluded;
+#endif
+#if defined(__AVX512F__)
+      typedef DiscMiIntersectorK<N,N,16,true> Wide16;
+      table.intersect16 = (VCI::Intersect16Ty) &Wide16::intersect;
+      table.occluded16  = (VCI::Occluded16Ty)  &Wide16::occluded;
+#endif
+      return table;
+    }
+    
+    // Returns a function-pointer table filled from the DiscMiMB intersector
+    // instantiations (<N,N,...,true>). The 1- and 4-suffixed slots are always
+    // assigned; the 8- and 16-suffixed slots are assigned only when __AVX__ /
+    // __AVX512F__ are defined. Each address is cast to its slot type.
+    template<int N>
+      static VirtualCurveIntersector::Intersectors DiscNiMBIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef DiscMiMBIntersector1<N,N,true>   Single;
+      typedef DiscMiMBIntersectorK<N,N,4,true> Wide4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Single::intersect;
+      table.occluded1  = (VCI::Occluded1Ty)  &Single::occluded;
+      table.intersect4 = (VCI::Intersect4Ty) &Wide4::intersect;
+      table.occluded4  = (VCI::Occluded4Ty)  &Wide4::occluded;
+#if defined(__AVX__)
+      typedef DiscMiMBIntersectorK<N,N,8,true> Wide8;
+      table.intersect8 = (VCI::Intersect8Ty) &Wide8::intersect;
+      table.occluded8  = (VCI::Occluded8Ty)  &Wide8::occluded;
+#endif
+#if defined(__AVX512F__)
+      typedef DiscMiMBIntersectorK<N,N,16,true> Wide16;
+      table.intersect16 = (VCI::Intersect16Ty) &Wide16::intersect;
+      table.occluded16  = (VCI::Occluded16Ty)  &Wide16::occluded;
+#endif
+      return table;
+    }
+    
+    // Returns a function-pointer table filled from the OrientedDiscMi
+    // intersector instantiations (<N,N,...,true>). The 1- and 4-suffixed slots
+    // are always assigned; the 8- and 16-suffixed slots are assigned only when
+    // __AVX__ / __AVX512F__ are defined. Each address is cast to its slot type.
+    template<int N>
+      static VirtualCurveIntersector::Intersectors OrientedDiscNiIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef OrientedDiscMiIntersector1<N,N,true>   Single;
+      typedef OrientedDiscMiIntersectorK<N,N,4,true> Wide4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Single::intersect;
+      table.occluded1  = (VCI::Occluded1Ty)  &Single::occluded;
+      table.intersect4 = (VCI::Intersect4Ty) &Wide4::intersect;
+      table.occluded4  = (VCI::Occluded4Ty)  &Wide4::occluded;
+#if defined(__AVX__)
+      typedef OrientedDiscMiIntersectorK<N,N,8,true> Wide8;
+      table.intersect8 = (VCI::Intersect8Ty) &Wide8::intersect;
+      table.occluded8  = (VCI::Occluded8Ty)  &Wide8::occluded;
+#endif
+#if defined(__AVX512F__)
+      typedef OrientedDiscMiIntersectorK<N,N,16,true> Wide16;
+      table.intersect16 = (VCI::Intersect16Ty) &Wide16::intersect;
+      table.occluded16  = (VCI::Occluded16Ty)  &Wide16::occluded;
+#endif
+      return table;
+    }
+    
+    // Returns a function-pointer table filled from the OrientedDiscMiMB
+    // intersector instantiations (<N,N,...,true>). The 1- and 4-suffixed slots
+    // are always assigned; the 8- and 16-suffixed slots are assigned only when
+    // __AVX__ / __AVX512F__ are defined. Each address is cast to its slot type.
+    template<int N>
+      static VirtualCurveIntersector::Intersectors OrientedDiscNiMBIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef OrientedDiscMiMBIntersector1<N,N,true>   Single;
+      typedef OrientedDiscMiMBIntersectorK<N,N,4,true> Wide4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Single::intersect;
+      table.occluded1  = (VCI::Occluded1Ty)  &Single::occluded;
+      table.intersect4 = (VCI::Intersect4Ty) &Wide4::intersect;
+      table.occluded4  = (VCI::Occluded4Ty)  &Wide4::occluded;
+#if defined(__AVX__)
+      typedef OrientedDiscMiMBIntersectorK<N,N,8,true> Wide8;
+      table.intersect8 = (VCI::Intersect8Ty) &Wide8::intersect;
+      table.occluded8  = (VCI::Occluded8Ty)  &Wide8::occluded;
+#endif
+#if defined(__AVX512F__)
+      typedef OrientedDiscMiMBIntersectorK<N,N,16,true> Wide16;
+      table.intersect16 = (VCI::Intersect16Ty) &Wide16::intersect;
+      table.occluded16  = (VCI::Occluded16Ty)  &Wide16::occluded;
+#endif
+      return table;
+    }
+    
+    // Returns a function-pointer table whose slots address the intersect_t /
+    // occluded_t member templates of CurveNiIntersector1<N> / CurveNiIntersectorK<N,K>,
+    // instantiated with RibbonCurve1Intersector* <Curve> and the *EpilogMU<VSIZEX,...>
+    // epilogs. OuterK names the class owning the member template; InnerK names
+    // the type passed as its first template argument. The 8- and 16-suffixed
+    // slots are assigned only when __AVX__ / __AVX512F__ are defined.
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors RibbonNiIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef CurveNiIntersector1<N>            Outer1;
+      typedef CurveNiIntersectorK<N,4>          Outer4;
+      typedef RibbonCurve1Intersector1<Curve>   Inner1;
+      typedef RibbonCurve1IntersectorK<Curve,4> Inner4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Outer1::template intersect_t<Inner1, Intersect1EpilogMU<VSIZEX,true> >;
+      table.occluded1  = (VCI::Occluded1Ty)  &Outer1::template occluded_t <Inner1, Occluded1EpilogMU<VSIZEX,true> >;
+      table.intersect4 = (VCI::Intersect4Ty) &Outer4::template intersect_t<Inner4, Intersect1KEpilogMU<VSIZEX,4,true> >;
+      table.occluded4  = (VCI::Occluded4Ty)  &Outer4::template occluded_t <Inner4, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+      typedef CurveNiIntersectorK<N,8>          Outer8;
+      typedef RibbonCurve1IntersectorK<Curve,8> Inner8;
+      table.intersect8 = (VCI::Intersect8Ty) &Outer8::template intersect_t<Inner8, Intersect1KEpilogMU<VSIZEX,8,true> >;
+      table.occluded8  = (VCI::Occluded8Ty)  &Outer8::template occluded_t <Inner8, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+      typedef CurveNiIntersectorK<N,16>          Outer16;
+      typedef RibbonCurve1IntersectorK<Curve,16> Inner16;
+      table.intersect16 = (VCI::Intersect16Ty) &Outer16::template intersect_t<Inner16, Intersect1KEpilogMU<VSIZEX,16,true> >;
+      table.occluded16  = (VCI::Occluded16Ty)  &Outer16::template occluded_t <Inner16, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+      return table;
+    }
+    
+    // Returns a function-pointer table whose slots address the intersect_t /
+    // occluded_t member templates of CurveNvIntersector1<N> / CurveNvIntersectorK<N,K>,
+    // instantiated with RibbonCurve1Intersector* <Curve> and the *EpilogMU<VSIZEX,...>
+    // epilogs. OuterK names the class owning the member template; InnerK names
+    // the type passed as its first template argument. The 8- and 16-suffixed
+    // slots are assigned only when __AVX__ / __AVX512F__ are defined.
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors RibbonNvIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef CurveNvIntersector1<N>            Outer1;
+      typedef CurveNvIntersectorK<N,4>          Outer4;
+      typedef RibbonCurve1Intersector1<Curve>   Inner1;
+      typedef RibbonCurve1IntersectorK<Curve,4> Inner4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Outer1::template intersect_t<Inner1, Intersect1EpilogMU<VSIZEX,true> >;
+      table.occluded1  = (VCI::Occluded1Ty)  &Outer1::template occluded_t <Inner1, Occluded1EpilogMU<VSIZEX,true> >;
+      table.intersect4 = (VCI::Intersect4Ty) &Outer4::template intersect_t<Inner4, Intersect1KEpilogMU<VSIZEX,4,true> >;
+      table.occluded4  = (VCI::Occluded4Ty)  &Outer4::template occluded_t <Inner4, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+      typedef CurveNvIntersectorK<N,8>          Outer8;
+      typedef RibbonCurve1IntersectorK<Curve,8> Inner8;
+      table.intersect8 = (VCI::Intersect8Ty) &Outer8::template intersect_t<Inner8, Intersect1KEpilogMU<VSIZEX,8,true> >;
+      table.occluded8  = (VCI::Occluded8Ty)  &Outer8::template occluded_t <Inner8, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+      typedef CurveNvIntersectorK<N,16>          Outer16;
+      typedef RibbonCurve1IntersectorK<Curve,16> Inner16;
+      table.intersect16 = (VCI::Intersect16Ty) &Outer16::template intersect_t<Inner16, Intersect1KEpilogMU<VSIZEX,16,true> >;
+      table.occluded16  = (VCI::Occluded16Ty)  &Outer16::template occluded_t <Inner16, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+      return table;
+    }
+    
+    // Returns a function-pointer table whose slots address the intersect_t /
+    // occluded_t member templates of CurveNiMBIntersector1<N> / CurveNiMBIntersectorK<N,K>,
+    // instantiated with RibbonCurve1Intersector* <Curve> and the *EpilogMU<VSIZEX,...>
+    // epilogs. OuterK names the class owning the member template; InnerK names
+    // the type passed as its first template argument. The 8- and 16-suffixed
+    // slots are assigned only when __AVX__ / __AVX512F__ are defined.
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors RibbonNiMBIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef CurveNiMBIntersector1<N>          Outer1;
+      typedef CurveNiMBIntersectorK<N,4>        Outer4;
+      typedef RibbonCurve1Intersector1<Curve>   Inner1;
+      typedef RibbonCurve1IntersectorK<Curve,4> Inner4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Outer1::template intersect_t<Inner1, Intersect1EpilogMU<VSIZEX,true> >;
+      table.occluded1  = (VCI::Occluded1Ty)  &Outer1::template occluded_t <Inner1, Occluded1EpilogMU<VSIZEX,true> >;
+      table.intersect4 = (VCI::Intersect4Ty) &Outer4::template intersect_t<Inner4, Intersect1KEpilogMU<VSIZEX,4,true> >;
+      table.occluded4  = (VCI::Occluded4Ty)  &Outer4::template occluded_t <Inner4, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+      typedef CurveNiMBIntersectorK<N,8>        Outer8;
+      typedef RibbonCurve1IntersectorK<Curve,8> Inner8;
+      table.intersect8 = (VCI::Intersect8Ty) &Outer8::template intersect_t<Inner8, Intersect1KEpilogMU<VSIZEX,8,true> >;
+      table.occluded8  = (VCI::Occluded8Ty)  &Outer8::template occluded_t <Inner8, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+      typedef CurveNiMBIntersectorK<N,16>        Outer16;
+      typedef RibbonCurve1IntersectorK<Curve,16> Inner16;
+      table.intersect16 = (VCI::Intersect16Ty) &Outer16::template intersect_t<Inner16, Intersect1KEpilogMU<VSIZEX,16,true> >;
+      table.occluded16  = (VCI::Occluded16Ty)  &Outer16::template occluded_t <Inner16, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+      return table;
+    }
+    
+    // Returns a function-pointer table whose slots address the intersect_t /
+    // occluded_t member templates of CurveNiIntersector1<N> / CurveNiIntersectorK<N,K>,
+    // instantiated with SweepCurve1Intersector* <Curve> and the *Epilog1<...,true>
+    // epilogs. OuterK names the class owning the member template; InnerK names
+    // the type passed as its first template argument. The 8- and 16-suffixed
+    // slots are assigned only when __AVX__ / __AVX512F__ are defined.
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors CurveNiIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef CurveNiIntersector1<N>           Outer1;
+      typedef CurveNiIntersectorK<N,4>         Outer4;
+      typedef SweepCurve1Intersector1<Curve>   Inner1;
+      typedef SweepCurve1IntersectorK<Curve,4> Inner4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Outer1::template intersect_t<Inner1, Intersect1Epilog1<true> >;
+      table.occluded1  = (VCI::Occluded1Ty)  &Outer1::template occluded_t <Inner1, Occluded1Epilog1<true> >;
+      table.intersect4 = (VCI::Intersect4Ty) &Outer4::template intersect_t<Inner4, Intersect1KEpilog1<4,true> >;
+      table.occluded4  = (VCI::Occluded4Ty)  &Outer4::template occluded_t <Inner4, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      typedef CurveNiIntersectorK<N,8>         Outer8;
+      typedef SweepCurve1IntersectorK<Curve,8> Inner8;
+      table.intersect8 = (VCI::Intersect8Ty) &Outer8::template intersect_t<Inner8, Intersect1KEpilog1<8,true> >;
+      table.occluded8  = (VCI::Occluded8Ty)  &Outer8::template occluded_t <Inner8, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      typedef CurveNiIntersectorK<N,16>         Outer16;
+      typedef SweepCurve1IntersectorK<Curve,16> Inner16;
+      table.intersect16 = (VCI::Intersect16Ty) &Outer16::template intersect_t<Inner16, Intersect1KEpilog1<16,true> >;
+      table.occluded16  = (VCI::Occluded16Ty)  &Outer16::template occluded_t <Inner16, Occluded1KEpilog1<16,true> >;
+#endif
+      return table;
+    }
+    
+    // Returns a function-pointer table whose slots address the intersect_t /
+    // occluded_t member templates of CurveNvIntersector1<N> / CurveNvIntersectorK<N,K>,
+    // instantiated with SweepCurve1Intersector* <Curve> and the *Epilog1<...,true>
+    // epilogs. OuterK names the class owning the member template; InnerK names
+    // the type passed as its first template argument. The 8- and 16-suffixed
+    // slots are assigned only when __AVX__ / __AVX512F__ are defined.
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors CurveNvIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef CurveNvIntersector1<N>           Outer1;
+      typedef CurveNvIntersectorK<N,4>         Outer4;
+      typedef SweepCurve1Intersector1<Curve>   Inner1;
+      typedef SweepCurve1IntersectorK<Curve,4> Inner4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Outer1::template intersect_t<Inner1, Intersect1Epilog1<true> >;
+      table.occluded1  = (VCI::Occluded1Ty)  &Outer1::template occluded_t <Inner1, Occluded1Epilog1<true> >;
+      table.intersect4 = (VCI::Intersect4Ty) &Outer4::template intersect_t<Inner4, Intersect1KEpilog1<4,true> >;
+      table.occluded4  = (VCI::Occluded4Ty)  &Outer4::template occluded_t <Inner4, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      typedef CurveNvIntersectorK<N,8>         Outer8;
+      typedef SweepCurve1IntersectorK<Curve,8> Inner8;
+      table.intersect8 = (VCI::Intersect8Ty) &Outer8::template intersect_t<Inner8, Intersect1KEpilog1<8,true> >;
+      table.occluded8  = (VCI::Occluded8Ty)  &Outer8::template occluded_t <Inner8, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      typedef CurveNvIntersectorK<N,16>         Outer16;
+      typedef SweepCurve1IntersectorK<Curve,16> Inner16;
+      table.intersect16 = (VCI::Intersect16Ty) &Outer16::template intersect_t<Inner16, Intersect1KEpilog1<16,true> >;
+      table.occluded16  = (VCI::Occluded16Ty)  &Outer16::template occluded_t <Inner16, Occluded1KEpilog1<16,true> >;
+#endif
+      return table;
+    }
+    
+    // Returns a function-pointer table whose slots address the intersect_t /
+    // occluded_t member templates of CurveNiMBIntersector1<N> / CurveNiMBIntersectorK<N,K>,
+    // instantiated with SweepCurve1Intersector* <Curve> and the *Epilog1<...,true>
+    // epilogs. OuterK names the class owning the member template; InnerK names
+    // the type passed as its first template argument. The 8- and 16-suffixed
+    // slots are assigned only when __AVX__ / __AVX512F__ are defined.
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors CurveNiMBIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef CurveNiMBIntersector1<N>         Outer1;
+      typedef CurveNiMBIntersectorK<N,4>       Outer4;
+      typedef SweepCurve1Intersector1<Curve>   Inner1;
+      typedef SweepCurve1IntersectorK<Curve,4> Inner4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Outer1::template intersect_t<Inner1, Intersect1Epilog1<true> >;
+      table.occluded1  = (VCI::Occluded1Ty)  &Outer1::template occluded_t <Inner1, Occluded1Epilog1<true> >;
+      table.intersect4 = (VCI::Intersect4Ty) &Outer4::template intersect_t<Inner4, Intersect1KEpilog1<4,true> >;
+      table.occluded4  = (VCI::Occluded4Ty)  &Outer4::template occluded_t <Inner4, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      typedef CurveNiMBIntersectorK<N,8>       Outer8;
+      typedef SweepCurve1IntersectorK<Curve,8> Inner8;
+      table.intersect8 = (VCI::Intersect8Ty) &Outer8::template intersect_t<Inner8, Intersect1KEpilog1<8,true> >;
+      table.occluded8  = (VCI::Occluded8Ty)  &Outer8::template occluded_t <Inner8, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      typedef CurveNiMBIntersectorK<N,16>       Outer16;
+      typedef SweepCurve1IntersectorK<Curve,16> Inner16;
+      table.intersect16 = (VCI::Intersect16Ty) &Outer16::template intersect_t<Inner16, Intersect1KEpilog1<16,true> >;
+      table.occluded16  = (VCI::Occluded16Ty)  &Outer16::template occluded_t <Inner16, Occluded1KEpilog1<16,true> >;
+#endif
+      return table;
+    }
+    
+    // Returns a function-pointer table whose slots address the intersect_n /
+    // occluded_n member templates of CurveNiIntersector1<N> / CurveNiIntersectorK<N,K>,
+    // instantiated with OrientedCurve1Intersector* <Curve> and the *Epilog1<...,true>
+    // epilogs. OuterK names the class owning the member template; InnerK names
+    // the type passed as its first template argument. The 8- and 16-suffixed
+    // slots are assigned only when __AVX__ / __AVX512F__ are defined.
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors OrientedCurveNiIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef CurveNiIntersector1<N>              Outer1;
+      typedef CurveNiIntersectorK<N,4>            Outer4;
+      typedef OrientedCurve1Intersector1<Curve>   Inner1;
+      typedef OrientedCurve1IntersectorK<Curve,4> Inner4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Outer1::template intersect_n<Inner1, Intersect1Epilog1<true> >;
+      table.occluded1  = (VCI::Occluded1Ty)  &Outer1::template occluded_n <Inner1, Occluded1Epilog1<true> >;
+      table.intersect4 = (VCI::Intersect4Ty) &Outer4::template intersect_n<Inner4, Intersect1KEpilog1<4,true> >;
+      table.occluded4  = (VCI::Occluded4Ty)  &Outer4::template occluded_n <Inner4, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      typedef CurveNiIntersectorK<N,8>            Outer8;
+      typedef OrientedCurve1IntersectorK<Curve,8> Inner8;
+      table.intersect8 = (VCI::Intersect8Ty) &Outer8::template intersect_n<Inner8, Intersect1KEpilog1<8,true> >;
+      table.occluded8  = (VCI::Occluded8Ty)  &Outer8::template occluded_n <Inner8, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      typedef CurveNiIntersectorK<N,16>            Outer16;
+      typedef OrientedCurve1IntersectorK<Curve,16> Inner16;
+      table.intersect16 = (VCI::Intersect16Ty) &Outer16::template intersect_n<Inner16, Intersect1KEpilog1<16,true> >;
+      table.occluded16  = (VCI::Occluded16Ty)  &Outer16::template occluded_n <Inner16, Occluded1KEpilog1<16,true> >;
+#endif
+      return table;
+    }
+    
+    // Returns a function-pointer table whose slots address the intersect_n /
+    // occluded_n member templates of CurveNiMBIntersector1<N> / CurveNiMBIntersectorK<N,K>,
+    // instantiated with OrientedCurve1Intersector* <Curve> and the *Epilog1<...,true>
+    // epilogs. OuterK names the class owning the member template; InnerK names
+    // the type passed as its first template argument. The 8- and 16-suffixed
+    // slots are assigned only when __AVX__ / __AVX512F__ are defined.
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors OrientedCurveNiMBIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef CurveNiMBIntersector1<N>            Outer1;
+      typedef CurveNiMBIntersectorK<N,4>          Outer4;
+      typedef OrientedCurve1Intersector1<Curve>   Inner1;
+      typedef OrientedCurve1IntersectorK<Curve,4> Inner4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Outer1::template intersect_n<Inner1, Intersect1Epilog1<true> >;
+      table.occluded1  = (VCI::Occluded1Ty)  &Outer1::template occluded_n <Inner1, Occluded1Epilog1<true> >;
+      table.intersect4 = (VCI::Intersect4Ty) &Outer4::template intersect_n<Inner4, Intersect1KEpilog1<4,true> >;
+      table.occluded4  = (VCI::Occluded4Ty)  &Outer4::template occluded_n <Inner4, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      typedef CurveNiMBIntersectorK<N,8>          Outer8;
+      typedef OrientedCurve1IntersectorK<Curve,8> Inner8;
+      table.intersect8 = (VCI::Intersect8Ty) &Outer8::template intersect_n<Inner8, Intersect1KEpilog1<8,true> >;
+      table.occluded8  = (VCI::Occluded8Ty)  &Outer8::template occluded_n <Inner8, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      typedef CurveNiMBIntersectorK<N,16>          Outer16;
+      typedef OrientedCurve1IntersectorK<Curve,16> Inner16;
+      table.intersect16 = (VCI::Intersect16Ty) &Outer16::template intersect_n<Inner16, Intersect1KEpilog1<16,true> >;
+      table.occluded16  = (VCI::Occluded16Ty)  &Outer16::template occluded_n <Inner16, Occluded1KEpilog1<16,true> >;
+#endif
+      return table;
+    }
+    
+    // Returns a function-pointer table whose slots address the intersect_h /
+    // occluded_h member templates of CurveNiIntersector1<N> / CurveNiIntersectorK<N,K>,
+    // instantiated with RibbonCurve1Intersector* <Curve> and the *EpilogMU<VSIZEX,...>
+    // epilogs. OuterK names the class owning the member template; InnerK names
+    // the type passed as its first template argument. The 8- and 16-suffixed
+    // slots are assigned only when __AVX__ / __AVX512F__ are defined.
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors HermiteRibbonNiIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef CurveNiIntersector1<N>            Outer1;
+      typedef CurveNiIntersectorK<N,4>          Outer4;
+      typedef RibbonCurve1Intersector1<Curve>   Inner1;
+      typedef RibbonCurve1IntersectorK<Curve,4> Inner4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Outer1::template intersect_h<Inner1, Intersect1EpilogMU<VSIZEX,true> >;
+      table.occluded1  = (VCI::Occluded1Ty)  &Outer1::template occluded_h <Inner1, Occluded1EpilogMU<VSIZEX,true> >;
+      table.intersect4 = (VCI::Intersect4Ty) &Outer4::template intersect_h<Inner4, Intersect1KEpilogMU<VSIZEX,4,true> >;
+      table.occluded4  = (VCI::Occluded4Ty)  &Outer4::template occluded_h <Inner4, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+      typedef CurveNiIntersectorK<N,8>          Outer8;
+      typedef RibbonCurve1IntersectorK<Curve,8> Inner8;
+      table.intersect8 = (VCI::Intersect8Ty) &Outer8::template intersect_h<Inner8, Intersect1KEpilogMU<VSIZEX,8,true> >;
+      table.occluded8  = (VCI::Occluded8Ty)  &Outer8::template occluded_h <Inner8, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+      typedef CurveNiIntersectorK<N,16>          Outer16;
+      typedef RibbonCurve1IntersectorK<Curve,16> Inner16;
+      table.intersect16 = (VCI::Intersect16Ty) &Outer16::template intersect_h<Inner16, Intersect1KEpilogMU<VSIZEX,16,true> >;
+      table.occluded16  = (VCI::Occluded16Ty)  &Outer16::template occluded_h <Inner16, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+      return table;
+    }
+    
+    // Returns a function-pointer table whose slots address the intersect_h /
+    // occluded_h member templates of CurveNiMBIntersector1<N> / CurveNiMBIntersectorK<N,K>,
+    // instantiated with RibbonCurve1Intersector* <Curve> and the *EpilogMU<VSIZEX,...>
+    // epilogs. OuterK names the class owning the member template; InnerK names
+    // the type passed as its first template argument. The 8- and 16-suffixed
+    // slots are assigned only when __AVX__ / __AVX512F__ are defined.
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors HermiteRibbonNiMBIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef CurveNiMBIntersector1<N>          Outer1;
+      typedef CurveNiMBIntersectorK<N,4>        Outer4;
+      typedef RibbonCurve1Intersector1<Curve>   Inner1;
+      typedef RibbonCurve1IntersectorK<Curve,4> Inner4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Outer1::template intersect_h<Inner1, Intersect1EpilogMU<VSIZEX,true> >;
+      table.occluded1  = (VCI::Occluded1Ty)  &Outer1::template occluded_h <Inner1, Occluded1EpilogMU<VSIZEX,true> >;
+      table.intersect4 = (VCI::Intersect4Ty) &Outer4::template intersect_h<Inner4, Intersect1KEpilogMU<VSIZEX,4,true> >;
+      table.occluded4  = (VCI::Occluded4Ty)  &Outer4::template occluded_h <Inner4, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+      typedef CurveNiMBIntersectorK<N,8>        Outer8;
+      typedef RibbonCurve1IntersectorK<Curve,8> Inner8;
+      table.intersect8 = (VCI::Intersect8Ty) &Outer8::template intersect_h<Inner8, Intersect1KEpilogMU<VSIZEX,8,true> >;
+      table.occluded8  = (VCI::Occluded8Ty)  &Outer8::template occluded_h <Inner8, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+      typedef CurveNiMBIntersectorK<N,16>        Outer16;
+      typedef RibbonCurve1IntersectorK<Curve,16> Inner16;
+      table.intersect16 = (VCI::Intersect16Ty) &Outer16::template intersect_h<Inner16, Intersect1KEpilogMU<VSIZEX,16,true> >;
+      table.occluded16  = (VCI::Occluded16Ty)  &Outer16::template occluded_h <Inner16, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+      return table;
+    }
+    
+    // Returns a function-pointer table whose slots address the intersect_h /
+    // occluded_h member templates of CurveNiIntersector1<N> / CurveNiIntersectorK<N,K>,
+    // instantiated with SweepCurve1Intersector* <Curve> and the *Epilog1<...,true>
+    // epilogs. OuterK names the class owning the member template; InnerK names
+    // the type passed as its first template argument. The 8- and 16-suffixed
+    // slots are assigned only when __AVX__ / __AVX512F__ are defined.
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors HermiteCurveNiIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef CurveNiIntersector1<N>           Outer1;
+      typedef CurveNiIntersectorK<N,4>         Outer4;
+      typedef SweepCurve1Intersector1<Curve>   Inner1;
+      typedef SweepCurve1IntersectorK<Curve,4> Inner4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Outer1::template intersect_h<Inner1, Intersect1Epilog1<true> >;
+      table.occluded1  = (VCI::Occluded1Ty)  &Outer1::template occluded_h <Inner1, Occluded1Epilog1<true> >;
+      table.intersect4 = (VCI::Intersect4Ty) &Outer4::template intersect_h<Inner4, Intersect1KEpilog1<4,true> >;
+      table.occluded4  = (VCI::Occluded4Ty)  &Outer4::template occluded_h <Inner4, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      typedef CurveNiIntersectorK<N,8>         Outer8;
+      typedef SweepCurve1IntersectorK<Curve,8> Inner8;
+      table.intersect8 = (VCI::Intersect8Ty) &Outer8::template intersect_h<Inner8, Intersect1KEpilog1<8,true> >;
+      table.occluded8  = (VCI::Occluded8Ty)  &Outer8::template occluded_h <Inner8, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      typedef CurveNiIntersectorK<N,16>         Outer16;
+      typedef SweepCurve1IntersectorK<Curve,16> Inner16;
+      table.intersect16 = (VCI::Intersect16Ty) &Outer16::template intersect_h<Inner16, Intersect1KEpilog1<16,true> >;
+      table.occluded16  = (VCI::Occluded16Ty)  &Outer16::template occluded_h <Inner16, Occluded1KEpilog1<16,true> >;
+#endif
+      return table;
+    }
+    
+    // Returns a function-pointer table whose slots address the intersect_h /
+    // occluded_h member templates of CurveNiMBIntersector1<N> / CurveNiMBIntersectorK<N,K>,
+    // instantiated with SweepCurve1Intersector* <Curve> and the *Epilog1<...,true>
+    // epilogs. OuterK names the class owning the member template; InnerK names
+    // the type passed as its first template argument. The 8- and 16-suffixed
+    // slots are assigned only when __AVX__ / __AVX512F__ are defined.
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors HermiteCurveNiMBIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef CurveNiMBIntersector1<N>         Outer1;
+      typedef CurveNiMBIntersectorK<N,4>       Outer4;
+      typedef SweepCurve1Intersector1<Curve>   Inner1;
+      typedef SweepCurve1IntersectorK<Curve,4> Inner4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Outer1::template intersect_h<Inner1, Intersect1Epilog1<true> >;
+      table.occluded1  = (VCI::Occluded1Ty)  &Outer1::template occluded_h <Inner1, Occluded1Epilog1<true> >;
+      table.intersect4 = (VCI::Intersect4Ty) &Outer4::template intersect_h<Inner4, Intersect1KEpilog1<4,true> >;
+      table.occluded4  = (VCI::Occluded4Ty)  &Outer4::template occluded_h <Inner4, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      typedef CurveNiMBIntersectorK<N,8>       Outer8;
+      typedef SweepCurve1IntersectorK<Curve,8> Inner8;
+      table.intersect8 = (VCI::Intersect8Ty) &Outer8::template intersect_h<Inner8, Intersect1KEpilog1<8,true> >;
+      table.occluded8  = (VCI::Occluded8Ty)  &Outer8::template occluded_h <Inner8, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      typedef CurveNiMBIntersectorK<N,16>       Outer16;
+      typedef SweepCurve1IntersectorK<Curve,16> Inner16;
+      table.intersect16 = (VCI::Intersect16Ty) &Outer16::template intersect_h<Inner16, Intersect1KEpilog1<16,true> >;
+      table.occluded16  = (VCI::Occluded16Ty)  &Outer16::template occluded_h <Inner16, Occluded1KEpilog1<16,true> >;
+#endif
+      return table;
+    }
+    
+    // Returns a function-pointer table whose slots address the intersect_hn /
+    // occluded_hn member templates of CurveNiIntersector1<N> / CurveNiIntersectorK<N,K>,
+    // instantiated with OrientedCurve1Intersector* <Curve> and the *Epilog1<...,true>
+    // epilogs. OuterK names the class owning the member template; InnerK names
+    // the type passed as its first template argument. The 8- and 16-suffixed
+    // slots are assigned only when __AVX__ / __AVX512F__ are defined.
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors HermiteOrientedCurveNiIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef CurveNiIntersector1<N>              Outer1;
+      typedef CurveNiIntersectorK<N,4>            Outer4;
+      typedef OrientedCurve1Intersector1<Curve>   Inner1;
+      typedef OrientedCurve1IntersectorK<Curve,4> Inner4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Outer1::template intersect_hn<Inner1, Intersect1Epilog1<true> >;
+      table.occluded1  = (VCI::Occluded1Ty)  &Outer1::template occluded_hn <Inner1, Occluded1Epilog1<true> >;
+      table.intersect4 = (VCI::Intersect4Ty) &Outer4::template intersect_hn<Inner4, Intersect1KEpilog1<4,true> >;
+      table.occluded4  = (VCI::Occluded4Ty)  &Outer4::template occluded_hn <Inner4, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      typedef CurveNiIntersectorK<N,8>            Outer8;
+      typedef OrientedCurve1IntersectorK<Curve,8> Inner8;
+      table.intersect8 = (VCI::Intersect8Ty) &Outer8::template intersect_hn<Inner8, Intersect1KEpilog1<8,true> >;
+      table.occluded8  = (VCI::Occluded8Ty)  &Outer8::template occluded_hn <Inner8, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      typedef CurveNiIntersectorK<N,16>            Outer16;
+      typedef OrientedCurve1IntersectorK<Curve,16> Inner16;
+      table.intersect16 = (VCI::Intersect16Ty) &Outer16::template intersect_hn<Inner16, Intersect1KEpilog1<16,true> >;
+      table.occluded16  = (VCI::Occluded16Ty)  &Outer16::template occluded_hn <Inner16, Occluded1KEpilog1<16,true> >;
+#endif
+      return table;
+    }
+    
+    // Returns a function-pointer table whose slots address the intersect_hn /
+    // occluded_hn member templates of CurveNiMBIntersector1<N> / CurveNiMBIntersectorK<N,K>,
+    // instantiated with OrientedCurve1Intersector* <Curve> and the *Epilog1<...,true>
+    // epilogs. OuterK names the class owning the member template; InnerK names
+    // the type passed as its first template argument. The 8- and 16-suffixed
+    // slots are assigned only when __AVX__ / __AVX512F__ are defined.
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors HermiteOrientedCurveNiMBIntersectors()
+    {
+      typedef VirtualCurveIntersector VCI;
+      typedef CurveNiMBIntersector1<N>            Outer1;
+      typedef CurveNiMBIntersectorK<N,4>          Outer4;
+      typedef OrientedCurve1Intersector1<Curve>   Inner1;
+      typedef OrientedCurve1IntersectorK<Curve,4> Inner4;
+      VCI::Intersectors table;
+      table.intersect1 = (VCI::Intersect1Ty) &Outer1::template intersect_hn<Inner1, Intersect1Epilog1<true> >;
+      table.occluded1  = (VCI::Occluded1Ty)  &Outer1::template occluded_hn <Inner1, Occluded1Epilog1<true> >;
+      table.intersect4 = (VCI::Intersect4Ty) &Outer4::template intersect_hn<Inner4, Intersect1KEpilog1<4,true> >;
+      table.occluded4  = (VCI::Occluded4Ty)  &Outer4::template occluded_hn <Inner4, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      typedef CurveNiMBIntersectorK<N,8>          Outer8;
+      typedef OrientedCurve1IntersectorK<Curve,8> Inner8;
+      table.intersect8 = (VCI::Intersect8Ty) &Outer8::template intersect_hn<Inner8, Intersect1KEpilog1<8,true> >;
+      table.occluded8  = (VCI::Occluded8Ty)  &Outer8::template occluded_hn <Inner8, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      typedef CurveNiMBIntersectorK<N,16>          Outer16;
+      typedef OrientedCurve1IntersectorK<Curve,16> Inner16;
+      table.intersect16 = (VCI::Intersect16Ty) &Outer16::template intersect_hn<Inner16, Intersect1KEpilog1<16,true> >;
+      table.occluded16  = (VCI::Occluded16Ty)  &Outer16::template occluded_hn <Inner16, Occluded1KEpilog1<16,true> >;
+#endif
+      return table;
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bezier_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bezier_curve.h
new file mode 100644
index 0000000000..69cf612275
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bezier_curve.h
@@ -0,0 +1,21 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /* Declarations only: the Bezier-curve registration hooks for a
+     * VirtualCurveIntersector, which each function receives by mutable
+     * reference. The definitions are not part of this header.
+     * The 4/8 in the names and the i / v / iMB endings presumably select the
+     * SIMD width and the primitive layout -- TODO confirm against the definitions.
+     * NOTE(review): "Interector" looks like a misspelling of "Intersector", but
+     * it is part of the symbol names and has to match the definitions. */
+    void AddVirtualCurveBezierCurveInterector4i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveBezierCurveInterector4v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveBezierCurveInterector4iMB(VirtualCurveIntersector &prim);
+#if defined(__AVX__)
+    /* the ...8... variants are declared only when __AVX__ is defined */
+    void AddVirtualCurveBezierCurveInterector8i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveBezierCurveInterector8v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveBezierCurveInterector8iMB(VirtualCurveIntersector &prim);
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bspline_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bspline_curve.h
new file mode 100644
index 0000000000..d37e41098e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bspline_curve.h
@@ -0,0 +1,21 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /* Declarations only: the B-spline-curve registration hooks for a
+     * VirtualCurveIntersector, which each function receives by mutable
+     * reference. The definitions are not part of this header.
+     * The 4/8 in the names and the i / v / iMB endings presumably select the
+     * SIMD width and the primitive layout -- TODO confirm against the definitions.
+     * NOTE(review): "Interector" looks like a misspelling of "Intersector", but
+     * it is part of the symbol names and has to match the definitions. */
+    void AddVirtualCurveBSplineCurveInterector4i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveBSplineCurveInterector4v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveBSplineCurveInterector4iMB(VirtualCurveIntersector &prim);
+#if defined(__AVX__)
+    /* the ...8... variants are declared only when __AVX__ is defined */
+    void AddVirtualCurveBSplineCurveInterector8i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveBSplineCurveInterector8v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveBSplineCurveInterector8iMB(VirtualCurveIntersector &prim);
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h
new file mode 100644
index 0000000000..a133a11d63
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h
@@ -0,0 +1,21 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /* Declarations only: the Catmull-Rom-curve registration hooks for a
+     * VirtualCurveIntersector, which each function receives by mutable
+     * reference. The definitions are not part of this header.
+     * The 4/8 in the names and the i / v / iMB endings presumably select the
+     * SIMD width and the primitive layout -- TODO confirm against the definitions.
+     * NOTE(review): "Interector" looks like a misspelling of "Intersector", but
+     * it is part of the symbol names and has to match the definitions. */
+    void AddVirtualCurveCatmullRomCurveInterector4i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveCatmullRomCurveInterector4v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveCatmullRomCurveInterector4iMB(VirtualCurveIntersector &prim);
+#if defined(__AVX__)
+    /* the ...8... variants are declared only when __AVX__ is defined */
+    void AddVirtualCurveCatmullRomCurveInterector8i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveCatmullRomCurveInterector8v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveCatmullRomCurveInterector8iMB(VirtualCurveIntersector &prim);
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_hermite_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_hermite_curve.h
new file mode 100644
index 0000000000..9aec35da45
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_hermite_curve.h
@@ -0,0 +1,21 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /* Declarations only: the Hermite-curve registration hooks for a
+     * VirtualCurveIntersector, which each function receives by mutable
+     * reference. The definitions are not part of this header.
+     * The 4/8 in the names and the i / v / iMB endings presumably select the
+     * SIMD width and the primitive layout -- TODO confirm against the definitions.
+     * NOTE(review): "Interector" looks like a misspelling of "Intersector", but
+     * it is part of the symbol names and has to match the definitions. */
+    void AddVirtualCurveHermiteCurveInterector4i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveHermiteCurveInterector4v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveHermiteCurveInterector4iMB(VirtualCurveIntersector &prim);
+#if defined(__AVX__)
+    /* the ...8... variants are declared only when __AVX__ is defined */
+    void AddVirtualCurveHermiteCurveInterector8i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveHermiteCurveInterector8v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveHermiteCurveInterector8iMB(VirtualCurveIntersector &prim);
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_linear_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_linear_curve.h
new file mode 100644
index 0000000000..dd37d194f5
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_linear_curve.h
@@ -0,0 +1,21 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /* Declarations only: the linear-curve registration hooks for a
+     * VirtualCurveIntersector, which each function receives by mutable
+     * reference. The definitions are not part of this header.
+     * The 4/8 in the names and the i / v / iMB endings presumably select the
+     * SIMD width and the primitive layout -- TODO confirm against the definitions.
+     * NOTE(review): "Interector" looks like a misspelling of "Intersector", but
+     * it is part of the symbol names and has to match the definitions. */
+    void AddVirtualCurveLinearCurveInterector4i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveLinearCurveInterector4v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveLinearCurveInterector4iMB(VirtualCurveIntersector &prim);
+#if defined(__AVX__)
+    /* the ...8... variants are declared only when __AVX__ is defined */
+    void AddVirtualCurveLinearCurveInterector8i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveLinearCurveInterector8v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveLinearCurveInterector8iMB(VirtualCurveIntersector &prim);
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_point.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_point.h
new file mode 100644
index 0000000000..fe5ceed840
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_point.h
@@ -0,0 +1,22 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /* Declarations only: the point-primitive registration hooks for a
+     * VirtualCurveIntersector, which each function receives by mutable
+     * reference. The definitions are not part of this header.
+     * The 4/8 in the names and the i / v / iMB endings presumably select the
+     * SIMD width and the primitive layout -- TODO confirm against the definitions.
+     * NOTE(review): "Interector" looks like a misspelling of "Intersector", but
+     * it is part of the symbol names and has to match the definitions. */
+    void AddVirtualCurvePointInterector4i(VirtualCurveIntersector &prim);
+    void AddVirtualCurvePointInterector4v(VirtualCurveIntersector &prim);
+    void AddVirtualCurvePointInterector4iMB(VirtualCurveIntersector &prim);
+
+#if defined (__AVX__)
+    /* the ...8... variants are declared only when __AVX__ is defined */
+    void AddVirtualCurvePointInterector8i(VirtualCurveIntersector &prim);
+    void AddVirtualCurvePointInterector8v(VirtualCurveIntersector &prim);
+    void AddVirtualCurvePointInterector8iMB(VirtualCurveIntersector &prim);
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/cylinder.h b/thirdparty/embree-aarch64/kernels/geometry/cylinder.h
new file mode 100644
index 0000000000..39a582864c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/cylinder.h
@@ -0,0 +1,223 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Cylinder around the axis through p0 and p1, stored together with its
+     *  squared radius. intersect() never tests the hit position against the
+     *  two end points, so the surface is handled as unbounded along the axis
+     *  (see test 0 in verify(), which expects a hit outside [p0,p1]). */
+    struct Cylinder
+    {
+      const Vec3fa p0;  //!< start location
+      const Vec3fa p1;  //!< end position
+      const float rr;   //!< squared radius of cylinder
+
+      /*! constructs the cylinder from a plain radius r, which is squared here */
+      __forceinline Cylinder(const Vec3fa& p0, const Vec3fa& p1, const float r) 
+        : p0(p0), p1(p1), rr(sqr(r)) {}
+
+      /*! constructs the cylinder from an already squared radius rr; the unnamed
+       *  bool is unused and only distinguishes this overload from the one above */
+      __forceinline Cylinder(const Vec3fa& p0, const Vec3fa& p1, const float rr, bool) 
+        : p0(p0), p1(p1), rr(rr) {}
+
+      /*! Intersects the line org + t*dir with the cylinder.
+       *
+       *  No range test is applied to t, so negative parameters are reported too.
+       *
+       *  \param org    origin of the line
+       *  \param dir    direction of the line (its squared length enters A, so it
+       *                is not required to be normalized)
+       *  \param t_o    receives [t0,t1] for a regular hit, (pos_inf,neg_inf) for
+       *                a miss, and (neg_inf,pos_inf) when the line is parallel
+       *                to the axis and lies inside the cylinder
+       *  \param u0_o   axis parameter of the near hit (distance along the axis
+       *                from p0, scaled by rl)
+       *  \param Ng0_o  unnormalized vector Pr-Pl for the near hit
+       *  \param u1_o   same as u0_o for the far hit
+       *  \param Ng1_o  same as Ng0_o for the far hit
+       *  \return true on a hit. u*_o and Ng*_o are written only on the regular
+       *          (non-parallel) hit path and are left untouched otherwise. */
+      __forceinline bool intersect(const Vec3fa& org,
+                                   const Vec3fa& dir, 
+                                   BBox1f& t_o, 
+                                   float& u0_o, Vec3fa& Ng0_o,
+                                   float& u1_o, Vec3fa& Ng1_o) const
+      {
+        /* calculate quadratic equation to solve */
+        const float rl = rcp_length(p1-p0); // presumably 1/|p1-p0| -- TODO confirm helper semantics
+        const Vec3fa P0 = p0, dP = (p1-p0)*rl;
+        const Vec3fa O = org-P0, dO = dir;
+        
+        const float dOdO = dot(dO,dO);
+        const float OdO = dot(dO,O);
+        const float OO = dot(O,O);
+        const float dOz = dot(dP,dO); // component of the direction along the axis
+        const float Oz = dot(dP,O);   // component of the relative origin along the axis
+        
+        /* coefficients of A*t^2 + B*t + C = 0: squared distance to the axis minus rr */
+        const float A = dOdO - sqr(dOz);
+        const float B = 2.0f * (OdO - dOz*Oz);
+        const float C = OO - sqr(Oz) - rr;
+        
+        /* we miss the cylinder if the discriminant is smaller than zero */
+        const float D = B*B - 4.0f*A*C;
+        if (D < 0.0f) {
+          t_o = BBox1f(pos_inf,neg_inf);
+          return false;
+        }
+        
+        /* special case for rays that are parallel to the cylinder */
+        /* eps is relative to the two terms whose difference forms A */
+        const float eps = 16.0f*float(ulp)*max(abs(dOdO),abs(sqr(dOz)));
+        if (abs(A) < eps) 
+        {
+          /* C <= 0 means the origin is within the radius of the axis */
+          if (C <= 0.0f) {
+            t_o = BBox1f(neg_inf,pos_inf);
+            return true;
+          } else {
+            t_o = BBox1f(pos_inf,neg_inf);
+            return false;
+          }
+        }
+        
+        /* standard case for rays that are not parallel to the cylinder */
+        const float Q = sqrt(D);
+        const float rcp_2A = rcp(2.0f*A);
+        const float t0 = (-B-Q)*rcp_2A;
+        const float t1 = (-B+Q)*rcp_2A;
+        
+        /* calculates u and Ng for near hit */
+        /* NOTE(review): Pr = t0*dir does not add org, while Pl is an absolute
+         * position on the axis; Pr-Pl is the offset from the axis only if org
+         * is the origin -- confirm that callers pass org == 0. */
+        {
+          u0_o = madd(t0,dOz,Oz)*rl;
+          const Vec3fa Pr = t0*dir;
+          const Vec3fa Pl = madd(u0_o,p1-p0,p0);
+          Ng0_o = Pr-Pl;
+        }
+
+        /* calculates u and Ng for far hit */
+        /* (same remark about the missing org term as for the near hit) */
+        {
+          u1_o = madd(t1,dOz,Oz)*rl;
+          const Vec3fa Pr = t1*dir;
+          const Vec3fa Pl = madd(u1_o,p1-p0,p0);
+          Ng1_o = Pr-Pl;
+        }
+
+        t_o.lower = t0;
+        t_o.upper = t1;
+        return true;
+      }
+
+      /*! convenience overload that only reports the t interval; the u and Ng
+       *  results of the full version are computed into locals and discarded */
+      __forceinline bool intersect(const Vec3fa& org_i, const Vec3fa& dir, BBox1f& t_o) const
+      {
+        float u0_o; Vec3fa Ng0_o;
+        float u1_o; Vec3fa Ng1_o;
+        return intersect(org_i,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o);
+      }
+
+      /*! Runs one self-test case: intersects ray.org/ray.dir with the cylinder
+       *  and compares hit status and interval with the expectation.
+       *  t0/t1 are only checked when shouldhit is true; finite values are
+       *  compared with an absolute tolerance of 0.001, infinite ones exactly.
+       *  Prints a diagnostic and returns false on mismatch. */
+      static bool verify(const size_t id, const Cylinder& cylinder, const RayHit& ray, bool shouldhit, const float t0, const float t1)
+      {
+        float eps = 0.001f;
+        BBox1f t; bool hit;
+        hit = cylinder.intersect(ray.org,ray.dir,t);
+
+        bool failed = hit != shouldhit;
+        if (shouldhit) failed |= std::isinf(t0) ? t0 != t.lower : abs(t0-t.lower) > eps;
+        if (shouldhit) failed |= std::isinf(t1) ? t1 != t.upper : abs(t1-t.upper) > eps;
+        if (!failed) return true;
+        embree_cout << "Cylinder test " << id << " failed: cylinder = " << cylinder << ", ray = " << ray << ", hit = " << hit << ", t = " << t << embree_endl; 
+        return false;
+      }
+
+      /* verify cylinder class */
+      /* Seven fixed rays against a radius-1 cylinder along the x axis: crossing
+       * rays (0,1), a ray passing beside it (2), axis-parallel rays inside
+       * (3,4) and axis-parallel rays outside (5,6). All cases are always run;
+       * the result is the conjunction of the individual outcomes. */
+      static bool verify()
+      {
+        bool passed = true;
+        const Cylinder cylinder(Vec3fa(0.0f,0.0f,0.0f),Vec3fa(1.0f,0.0f,0.0f),1.0f);
+        passed &= verify(0,cylinder,RayHit(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f);
+        passed &= verify(1,cylinder,RayHit(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f);
+        passed &= verify(2,cylinder,RayHit(Vec3fa(+2.0f,1.0f,2.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f);
+        passed &= verify(3,cylinder,RayHit(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf);
+        passed &= verify(4,cylinder,RayHit(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf);
+        passed &= verify(5,cylinder,RayHit(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf);
+        passed &= verify(6,cylinder,RayHit(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf);
+        return passed;
+      }
+
+      /*! output operator */
+      /* prints the radius as sqrtf(rr), not the stored squared value */
+      friend __forceinline embree_ostream operator<<(embree_ostream cout, const Cylinder& c) {
+        return cout << "Cylinder { p0 = " << c.p0 << ", p1 = " << c.p1 << ", r = " << sqrtf(c.rr) << "}";
+      }
+    };
+
+    /*! N cylinders stored in structure-of-arrays form: one lane per cylinder
+     *  for the axis end points and the squared radius. As in the scalar
+     *  Cylinder, intersect() applies no test against the end points. */
+    template<int N>
+      struct CylinderN
+    { 
+      const Vec3vf<N> p0;     //!< start location
+      const Vec3vf<N> p1;     //!< end position
+      const vfloat<N> rr;   //!< squared radius of cylinder
+
+      /*! constructs the cylinders from plain radii r, which are squared here */
+      __forceinline CylinderN(const Vec3vf<N>& p0, const Vec3vf<N>& p1, const vfloat<N>& r)
+        : p0(p0), p1(p1), rr(sqr(r)) {}
+
+      /*! constructs the cylinders from already squared radii rr; the unnamed
+       *  bool is unused and only distinguishes this overload from the one above */
+      __forceinline CylinderN(const Vec3vf<N>& p0, const Vec3vf<N>& p1, const vfloat<N>& rr, bool)
+        : p0(p0), p1(p1), rr(rr) {}
+
+     
+      /*! Intersects the single line org + t*dir with all N cylinders.
+       *
+       *  \param org, dir   the line, shared by all lanes
+       *  \param t_o        per lane [t0,t1] on a hit, (pos_inf,neg_inf) on a
+       *                    miss, (neg_inf,pos_inf) for a parallel line inside
+       *  \param u0_o,Ng0_o axis parameter and unnormalized Pr-Pl of the near hit
+       *  \param u1_o,Ng1_o same for the far hit
+       *  \return mask of the lanes that are hit
+       *
+       *  When no lane passes the discriminant test the function returns early
+       *  and leaves the u/Ng outputs untouched. Otherwise u/Ng are written for
+       *  every lane without masking, so lanes that are not set in the returned
+       *  mask (and parallel lanes, where A is close to zero) hold values that
+       *  must not be used. */
+      __forceinline vbool<N> intersect(const Vec3fa& org, const Vec3fa& dir, 
+                                       BBox<vfloat<N>>& t_o, 
+                                       vfloat<N>& u0_o, Vec3vf<N>& Ng0_o,
+                                       vfloat<N>& u1_o, Vec3vf<N>& Ng1_o) const
+      {
+        /* calculate quadratic equation to solve */
+        const vfloat<N> rl = rcp_length(p1-p0); // presumably 1/|p1-p0| per lane -- TODO confirm helper semantics
+        const Vec3vf<N> P0 = p0, dP = (p1-p0)*rl;
+        const Vec3vf<N> O = Vec3vf<N>(org)-P0, dO = dir;
+        
+        const vfloat<N> dOdO = dot(dO,dO);
+        const vfloat<N> OdO = dot(dO,O);
+        const vfloat<N> OO = dot(O,O);
+        const vfloat<N> dOz = dot(dP,dO); // component of the direction along each axis
+        const vfloat<N> Oz = dot(dP,O);   // component of the relative origin along each axis
+        
+        /* coefficients of A*t^2 + B*t + C = 0: squared distance to the axis minus rr */
+        const vfloat<N> A = dOdO - sqr(dOz);
+        const vfloat<N> B = 2.0f * (OdO - dOz*Oz);
+        const vfloat<N> C = OO - sqr(Oz) - rr;
+        
+        /* we miss the cylinder if the discriminant is smaller than zero */
+        const vfloat<N> D = B*B - 4.0f*A*C;
+        vbool<N> valid = D >= 0.0f;
+        if (none(valid)) {
+          t_o = BBox<vfloat<N>>(empty); // presumably the (pos_inf,neg_inf) interval -- TODO confirm
+          return valid;
+        }
+
+        /* standard case for rays that are not parallel to the cylinder */
+        const vfloat<N> Q = sqrt(D);
+        const vfloat<N> rcp_2A = rcp(2.0f*A);
+        const vfloat<N> t0 = (-B-Q)*rcp_2A;
+        const vfloat<N> t1 = (-B+Q)*rcp_2A;
+        
+        /* calculates u and Ng for near hit */
+        /* NOTE(review): Pr = t0*dir does not add org, while Pl is an absolute
+         * position on the axis -- confirm that callers pass org == 0. */
+        {
+          u0_o = madd(t0,dOz,Oz)*rl;
+          const Vec3vf<N> Pr = t0*Vec3vf<N>(dir);
+          const Vec3vf<N> Pl = madd(u0_o,p1-p0,p0);
+          Ng0_o = Pr-Pl;
+        }
+        
+        /* calculates u and Ng for far hit */
+        /* (same remark about the missing org term as for the near hit) */
+        {
+          u1_o = madd(t1,dOz,Oz)*rl;
+          const Vec3vf<N> Pr = t1*Vec3vf<N>(dir);
+          const Vec3vf<N> Pl = madd(u1_o,p1-p0,p0);
+          Ng1_o = Pr-Pl;
+        }
+
+        /* lanes that failed the discriminant test get the miss interval */
+        t_o.lower = select(valid, t0, vfloat<N>(pos_inf));
+        t_o.upper = select(valid, t1, vfloat<N>(neg_inf));
+
+        /* special case for rays that are parallel to the cylinder */
+        /* handled after the regular path by overwriting t_o in the affected lanes */
+        const vfloat<N> eps = 16.0f*float(ulp)*max(abs(dOdO),abs(sqr(dOz)));
+        vbool<N> validt = valid & (abs(A) < eps); 
+        if (unlikely(any(validt))) 
+        {
+          vbool<N> inside = C <= 0.0f;
+          t_o.lower = select(validt,select(inside,vfloat<N>(neg_inf),vfloat<N>(pos_inf)),t_o.lower);
+          t_o.upper = select(validt,select(inside,vfloat<N>(pos_inf),vfloat<N>(neg_inf)),t_o.upper);
+          /* a parallel lane stays valid only if the origin is inside the radius */
+          valid &= !validt | inside;
+        }
+        return valid;
+      }
+
+      /*! convenience overload that only reports the t intervals; the u and Ng
+       *  results of the full version are computed into locals and discarded */
+      __forceinline vbool<N> intersect(const Vec3fa& org_i, const Vec3fa& dir, BBox<vfloat<N>>& t_o) const
+      {
+        vfloat<N> u0_o; Vec3vf<N> Ng0_o;
+        vfloat<N> u1_o; Vec3vf<N> Ng1_o;
+        return intersect(org_i,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o);
+      }
+    };
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/geometry/disc_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/disc_intersector.h
new file mode 100644
index 0000000000..e8305780e5
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/disc_intersector.h
@@ -0,0 +1,216 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/scene_points.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Hit record for M disc candidates held in SIMD lanes: the u/v
+     *  coordinates, the ray parameter t and the geometry normal per lane.
+     *  The accessors extract the values of a single lane i. */
+    template<int M>
+    struct DiscIntersectorHitM
+    {
+      /* per-lane hit data */
+      vfloat<M> vu;
+      vfloat<M> vv;
+      vfloat<M> vt;
+      Vec3vf<M> vNg;
+
+      /*! leaves all members uninitialized */
+      __forceinline DiscIntersectorHitM() {}
+
+      /*! stores the given per-lane u, v, t and normal */
+      __forceinline DiscIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng)
+          : vu(u), vv(v), vt(t), vNg(Ng) {}
+
+      /*! nothing to post-process for disc hits */
+      __forceinline void finalize() {}
+
+      /*! u/v coordinates of lane i */
+      __forceinline Vec2f uv(const size_t i) const { return Vec2f(vu[i], vv[i]); }
+
+      /*! ray parameter of lane i */
+      __forceinline float t(const size_t i) const { return vt[i]; }
+
+      /*! geometry normal of lane i */
+      __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]); }
+    };
+
+    /*! Intersects one ray with M discs held in SIMD lanes. Two overloads:
+     *  the first handles discs that face the ray (reported normal is
+     *  -ray_dir), the second handles discs with an explicit per-lane normal. */
+    template<int M>
+    struct DiscIntersector1
+    {
+      typedef CurvePrecalculations1 Precalculations;
+
+      /*! Ray-facing discs.
+       *
+       *  The hit parameter is the ray parameter of the disc centre projected
+       *  onto the ray; a lane hits when that parameter lies in
+       *  [ray.tnear(), ray.tfar] and the centre is no further than the radius
+       *  from the ray.
+       *
+       *  \param valid_i lanes to test
+       *  \param v0i     per-lane centre (xyz) and radius (w), before
+       *                 enlargeRadiusToMinWidth is applied
+       *  \param epilog  called as epilog(valid, hit) with u = v = 0,
+       *                 t = projected parameter and Ng = -ray_dir
+       *  \return false if no lane survives the tests, otherwise the result of
+       *          the epilog */
+      template<typename Epilog>
+      static __forceinline bool intersect(
+          const vbool<M>& valid_i,
+          Ray& ray,
+          IntersectContext* context,
+          const Points* geom,
+          const Precalculations& pre,
+          const Vec4vf<M>& v0i,
+          const Epilog& epilog)
+      {
+        vbool<M> valid = valid_i;
+
+        const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z);
+        const Vec3vf<M> ray_dir(ray.dir.x, ray.dir.y, ray.dir.z);
+        const vfloat<M> rd2    = rcp(dot(ray_dir, ray_dir)); // 1/|dir|^2, so dir need not be normalized
+
+        /* presumably widens the radius to a minimum width -- TODO confirm helper semantics */
+        const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+        const Vec3vf<M> center = v0.xyz();
+        const vfloat<M> radius = v0.w;
+
+        /* ray parameter of the point on the ray closest to the centre */
+        const Vec3vf<M> c0     = center - ray_org;
+        const vfloat<M> projC0 = dot(c0, ray_dir) * rd2;
+
+        valid &= (vfloat<M>(ray.tnear()) <= projC0) & (projC0 <= vfloat<M>(ray.tfar));
+        if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
+          valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale;  // ignore self intersections
+        if (unlikely(none(valid)))
+          return false;
+        
+        /* squared distance between the centre and the ray against the squared radius */
+        const Vec3vf<M> perp   = c0 - projC0 * ray_dir;
+        const vfloat<M> l2     = dot(perp, perp);
+        const vfloat<M> r2     = radius * radius;
+        valid &= (l2 <= r2);
+        if (unlikely(none(valid)))
+          return false;
+
+        DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir);
+        return epilog(valid, hit);
+      }
+
+      /*! Oriented discs: each lane has its own plane normal.
+       *
+       *  The ray is intersected with the plane through the centre; a lane hits
+       *  when the ray is not parallel to the plane, t lies in
+       *  [ray.tnear(), ray.tfar] and the plane hit point is closer to the
+       *  centre than the radius. pre is not used by this overload.
+       *
+       *  NOTE(review): unlike the ray-facing overload, this one has no
+       *  EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR test and uses a strict
+       *  '<' for the radius test -- confirm both differences are intended.
+       *
+       *  \param normal  per-lane disc normal, also reported as Ng of the hit
+       *  \return false if no lane survives the tests, otherwise the result of
+       *          the epilog */
+      template<typename Epilog>
+      static __forceinline bool intersect(const vbool<M>& valid_i,
+                                          Ray& ray,
+                                          IntersectContext* context,
+                                          const Points* geom,
+                                          const Precalculations& pre,
+                                          const Vec4vf<M>& v0i,
+                                          const Vec3vf<M>& normal,
+                                          const Epilog& epilog)
+      {
+        vbool<M> valid         = valid_i;
+        const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z);
+
+        /* presumably widens the radius to a minimum width -- TODO confirm helper semantics */
+        const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+        const Vec3vf<M> center = v0.xyz();
+        const vfloat<M> radius = v0.w;
+
+        vfloat<M> divisor       = dot(Vec3vf<M>((Vec3fa)ray.dir), normal);
+        const vbool<M> parallel = divisor == vfloat<M>(0.f);
+        valid &= !parallel;
+        divisor = select(parallel, 1.f, divisor);  // prevent divide by zero
+
+        /* ray parameter of the intersection with the disc plane */
+        vfloat<M> t = dot(center - Vec3vf<M>((Vec3fa)ray.org), Vec3vf<M>(normal)) / divisor;
+
+        valid &= (vfloat<M>(ray.tnear()) <= t) & (t <= vfloat<M>(ray.tfar));
+        if (unlikely(none(valid)))
+          return false;
+
+        /* squared distance of the plane hit point from the centre */
+        Vec3vf<M> intersection = Vec3vf<M>((Vec3fa)ray.org) + Vec3vf<M>((Vec3fa)ray.dir) * t;
+        vfloat<M> dist2        = dot(intersection - center, intersection - center);
+        valid &= dist2 < radius * radius;
+        if (unlikely(none(valid)))
+          return false;
+
+        DiscIntersectorHitM<M> hit(zero, zero, t, normal);
+        return epilog(valid, hit);
+      }
+    };
+
+    /*! Intersects ray k of a K-wide ray packet with M discs held in SIMD
+     *  lanes. Mirrors DiscIntersector1: the first overload handles discs that
+     *  face the ray, the second discs with an explicit per-lane normal. */
+    template<int M, int K>
+    struct DiscIntersectorK
+    {
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      /*! Ray-facing discs.
+       *
+       *  The hit parameter is the ray parameter of the disc centre projected
+       *  onto ray k; a lane hits when that parameter lies in
+       *  [ray.tnear()[k], ray.tfar[k]] and the centre is no further than the
+       *  radius from the ray.
+       *
+       *  \param valid_i lanes to test
+       *  \param k       index of the ray inside the packet
+       *  \param v0i     per-lane centre (xyz) and radius (w), before
+       *                 enlargeRadiusToMinWidth is applied
+       *  \param epilog  called as epilog(valid, hit) with u = v = 0,
+       *                 t = projected parameter and Ng = -ray_dir
+       *  \return false if no lane survives the tests, otherwise the result of
+       *          the epilog */
+      template<typename Epilog>
+      static __forceinline bool intersect(const vbool<M>& valid_i,
+                                          RayK<K>& ray,
+                                          size_t k,
+                                          IntersectContext* context,
+                                          const Points* geom,
+                                          const Precalculations& pre,
+                                          const Vec4vf<M>& v0i,
+                                          const Epilog& epilog)
+      {
+        vbool<M> valid = valid_i;
+
+        /* broadcast origin and direction of ray k to all M lanes */
+        const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+        const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+        const vfloat<M> rd2    = rcp(dot(ray_dir, ray_dir)); // 1/|dir|^2, so dir need not be normalized
+
+        /* presumably widens the radius to a minimum width -- TODO confirm helper semantics */
+        const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+        const Vec3vf<M> center = v0.xyz();
+        const vfloat<M> radius = v0.w;
+
+        /* ray parameter of the point on the ray closest to the centre */
+        const Vec3vf<M> c0     = center - ray_org;
+        const vfloat<M> projC0 = dot(c0, ray_dir) * rd2;
+
+        valid &= (vfloat<M>(ray.tnear()[k]) <= projC0) & (projC0 <= vfloat<M>(ray.tfar[k]));
+        if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
+          valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale[k];  // ignore self intersections
+        if (unlikely(none(valid)))
+          return false;
+
+        /* squared distance between the centre and the ray against the squared radius */
+        const Vec3vf<M> perp   = c0 - projC0 * ray_dir;
+        const vfloat<M> l2     = dot(perp, perp);
+        const vfloat<M> r2     = radius * radius;
+        valid &= (l2 <= r2);
+        if (unlikely(none(valid)))
+          return false;
+
+        DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir);
+        return epilog(valid, hit);
+      }
+
+      /*! Oriented discs: each lane has its own plane normal.
+       *
+       *  Ray k is intersected with the plane through the centre; a lane hits
+       *  when the ray is not parallel to the plane, t lies in
+       *  [ray.tnear()[k], ray.tfar[k]] and the plane hit point is closer to the
+       *  centre than the radius. pre is not used by this overload.
+       *
+       *  NOTE(review): unlike the ray-facing overload, this one has no
+       *  EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR test and uses a strict
+       *  '<' for the radius test -- confirm both differences are intended.
+       *
+       *  \param k       index of the ray inside the packet
+       *  \param normal  per-lane disc normal, also reported as Ng of the hit
+       *  \return false if no lane survives the tests, otherwise the result of
+       *          the epilog */
+      template<typename Epilog>
+      static __forceinline bool intersect(const vbool<M>& valid_i,
+                                          RayK<K>& ray,
+                                          size_t k,
+                                          IntersectContext* context,
+                                          const Points* geom,
+                                          const Precalculations& pre,
+                                          const Vec4vf<M>& v0i,
+                                          const Vec3vf<M>& normal,
+                                          const Epilog& epilog)
+      {
+        vbool<M> valid         = valid_i;
+        /* broadcast origin and direction of ray k to all M lanes */
+        const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+        const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+
+        /* presumably widens the radius to a minimum width -- TODO confirm helper semantics */
+        const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+        const Vec3vf<M> center = v0.xyz();
+        const vfloat<M> radius = v0.w;
+        
+        vfloat<M> divisor       = dot(Vec3vf<M>(ray_dir), normal);
+        const vbool<M> parallel = divisor == vfloat<M>(0.f);
+        valid &= !parallel;
+        divisor = select(parallel, 1.f, divisor);  // prevent divide by zero
+
+        /* ray parameter of the intersection with the disc plane */
+        vfloat<M> t = dot(center - Vec3vf<M>(ray_org), Vec3vf<M>(normal)) / divisor;
+
+        valid &= (vfloat<M>(ray.tnear()[k]) <= t) & (t <= vfloat<M>(ray.tfar[k]));
+        if (unlikely(none(valid)))
+          return false;
+
+        /* squared distance of the plane hit point from the centre */
+        Vec3vf<M> intersection = Vec3vf<M>(ray_org) + Vec3vf<M>(ray_dir) * t;
+        vfloat<M> dist2        = dot(intersection - center, intersection - center);
+        valid &= dist2 < radius * radius;
+        if (unlikely(none(valid)))
+          return false;
+
+        DiscIntersectorHitM<M> hit(zero, zero, t, normal);
+        return epilog(valid, hit);
+      }
+    };
+  }  // namespace isa
+}  // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/disci_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/disci_intersector.h
new file mode 100644
index 0000000000..e1dc3aa98e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/disci_intersector.h
@@ -0,0 +1,277 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "disc_intersector.h"
+#include "intersector_epilog.h"
+#include "pointi.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Single-ray front end for PointMi<M> blocks intersected as ray-facing
+     *  discs: fetches the point data from the scene and forwards it to
+     *  DiscIntersector1<Mx> together with the matching epilog. */
+    template<int M, int Mx, bool filter>
+    struct DiscMiIntersector1
+    {
+      using Primitive       = PointMi<M>;
+      using Precalculations = CurvePrecalculations1;
+
+      /*! closest-hit query; the hit is recorded through Intersect1EpilogM */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+
+        /* per-lane centre (xyz) and radius (w) */
+        Vec4vf<M> centerRadius;
+        Disc.gather(centerRadius, points);
+
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        const Intersect1EpilogM<M, Mx, filter> epilog(ray, context, Disc.geomID(), Disc.primID());
+        DiscIntersector1<Mx>::intersect(active, ray, context, points, pre, centerRadius, epilog);
+      }
+
+      /*! occlusion query; returns the result produced through Occluded1EpilogM */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+
+        /* per-lane centre (xyz) and radius (w) */
+        Vec4vf<M> centerRadius;
+        Disc.gather(centerRadius, points);
+
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        const Occluded1EpilogM<M, Mx, filter> epilog(ray, context, Disc.geomID(), Disc.primID());
+        return DiscIntersector1<Mx>::intersect(active, ray, context, points, pre, centerRadius, epilog);
+      }
+    };
+
+    /*! Single-ray front end for PointMi<M> blocks intersected as ray-facing
+     *  discs with motion blur: the point data is gathered at ray.time() and
+     *  forwarded to DiscIntersector1<Mx> together with the matching epilog. */
+    template<int M, int Mx, bool filter>
+    struct DiscMiMBIntersector1
+    {
+      using Primitive       = PointMi<M>;
+      using Precalculations = CurvePrecalculations1;
+
+      /*! closest-hit query; the hit is recorded through Intersect1EpilogM */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+
+        /* per-lane centre (xyz) and radius (w) at the time of the ray */
+        Vec4vf<M> centerRadius;
+        Disc.gather(centerRadius, points, ray.time());
+
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        const Intersect1EpilogM<M, Mx, filter> epilog(ray, context, Disc.geomID(), Disc.primID());
+        DiscIntersector1<Mx>::intersect(active, ray, context, points, pre, centerRadius, epilog);
+      }
+
+      /*! occlusion query; returns the result produced through Occluded1EpilogM */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+
+        /* per-lane centre (xyz) and radius (w) at the time of the ray */
+        Vec4vf<M> centerRadius;
+        Disc.gather(centerRadius, points, ray.time());
+
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        const Occluded1EpilogM<M, Mx, filter> epilog(ray, context, Disc.geomID(), Disc.primID());
+        return DiscIntersector1<Mx>::intersect(active, ray, context, points, pre, centerRadius, epilog);
+      }
+    };
+
+    /*! Packet front end (ray k of a K-wide packet) for PointMi<M> blocks
+     *  intersected as ray-facing discs: fetches the point data from the scene
+     *  and forwards it to DiscIntersectorK<Mx,K> with the matching epilog. */
+    template<int M, int Mx, int K, bool filter>
+    struct DiscMiIntersectorK
+    {
+      using Primitive       = PointMi<M>;
+      using Precalculations = CurvePrecalculationsK<K>;
+
+      /*! closest-hit query for ray k; the hit is recorded through Intersect1KEpilogM */
+      static __forceinline void intersect(const Precalculations& pre,
+                                          RayHitK<K>& ray,
+                                          size_t k,
+                                          IntersectContext* context,
+                                          const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+
+        /* per-lane centre (xyz) and radius (w) */
+        Vec4vf<M> centerRadius;
+        Disc.gather(centerRadius, points);
+
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        const Intersect1KEpilogM<M, Mx, K, filter> epilog(ray, k, context, Disc.geomID(), Disc.primID());
+        DiscIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, centerRadius, epilog);
+      }
+
+      /*! occlusion query for ray k; returns the result produced through Occluded1KEpilogM */
+      static __forceinline bool occluded(const Precalculations& pre,
+                                         RayK<K>& ray,
+                                         size_t k,
+                                         IntersectContext* context,
+                                         const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+
+        /* per-lane centre (xyz) and radius (w) */
+        Vec4vf<M> centerRadius;
+        Disc.gather(centerRadius, points);
+
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        const Occluded1KEpilogM<M, Mx, K, filter> epilog(ray, k, context, Disc.geomID(), Disc.primID());
+        return DiscIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, centerRadius, epilog);
+      }
+    };
+
+    /*! Packet front end (ray k of a K-wide packet) for PointMi<M> blocks
+     *  intersected as ray-facing discs with motion blur: the point data is
+     *  gathered at ray.time()[k] and forwarded to DiscIntersectorK<Mx,K>. */
+    template<int M, int Mx, int K, bool filter>
+    struct DiscMiMBIntersectorK
+    {
+      using Primitive       = PointMi<M>;
+      using Precalculations = CurvePrecalculationsK<K>;
+
+      /*! closest-hit query for ray k; the hit is recorded through Intersect1KEpilogM */
+      static __forceinline void intersect(const Precalculations& pre,
+                                          RayHitK<K>& ray,
+                                          size_t k,
+                                          IntersectContext* context,
+                                          const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+
+        /* per-lane centre (xyz) and radius (w) at the time of ray k */
+        Vec4vf<M> centerRadius;
+        Disc.gather(centerRadius, points, ray.time()[k]);
+
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        const Intersect1KEpilogM<M, Mx, K, filter> epilog(ray, k, context, Disc.geomID(), Disc.primID());
+        DiscIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, centerRadius, epilog);
+      }
+
+      /*! occlusion query for ray k; returns the result produced through Occluded1KEpilogM */
+      static __forceinline bool occluded(const Precalculations& pre,
+                                         RayK<K>& ray,
+                                         size_t k,
+                                         IntersectContext* context,
+                                         const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+
+        /* per-lane centre (xyz) and radius (w) at the time of ray k */
+        Vec4vf<M> centerRadius;
+        Disc.gather(centerRadius, points, ray.time()[k]);
+
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        const Occluded1KEpilogM<M, Mx, K, filter> epilog(ray, k, context, Disc.geomID(), Disc.primID());
+        return DiscIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, centerRadius, epilog);
+      }
+    };
+
+    /*! Single-ray front end for PointMi<M> blocks intersected as oriented
+     *  discs: fetches point data and normals from the scene and forwards them
+     *  to the normal-taking overload of DiscIntersector1<Mx>. */
+    template<int M, int Mx, bool filter>
+    struct OrientedDiscMiIntersector1
+    {
+      using Primitive       = PointMi<M>;
+      using Precalculations = CurvePrecalculations1;
+
+      /*! closest-hit query; the hit is recorded through Intersect1EpilogM */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+
+        /* per-lane centre (xyz) / radius (w) and disc normal */
+        Vec4vf<M> centerRadius;
+        Vec3vf<M> discNormal;
+        Disc.gather(centerRadius, discNormal, points);
+
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        const Intersect1EpilogM<M, Mx, filter> epilog(ray, context, Disc.geomID(), Disc.primID());
+        DiscIntersector1<Mx>::intersect(active, ray, context, points, pre, centerRadius, discNormal, epilog);
+      }
+
+      /*! occlusion query; returns the result produced through Occluded1EpilogM */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+
+        /* per-lane centre (xyz) / radius (w) and disc normal */
+        Vec4vf<M> centerRadius;
+        Vec3vf<M> discNormal;
+        Disc.gather(centerRadius, discNormal, points);
+
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        const Occluded1EpilogM<M, Mx, filter> epilog(ray, context, Disc.geomID(), Disc.primID());
+        return DiscIntersector1<Mx>::intersect(active, ray, context, points, pre, centerRadius, discNormal, epilog);
+      }
+    };
+
+    /*! Single-ray front end for PointMi<M> blocks intersected as oriented
+     *  discs with motion blur: point data and normals are gathered at
+     *  ray.time() and forwarded to the normal-taking overload of
+     *  DiscIntersector1<Mx>. */
+    template<int M, int Mx, bool filter>
+    struct OrientedDiscMiMBIntersector1
+    {
+      using Primitive       = PointMi<M>;
+      using Precalculations = CurvePrecalculations1;
+
+      /*! closest-hit query; the hit is recorded through Intersect1EpilogM */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+
+        /* per-lane centre (xyz) / radius (w) and disc normal at the time of the ray */
+        Vec4vf<M> centerRadius;
+        Vec3vf<M> discNormal;
+        Disc.gather(centerRadius, discNormal, points, ray.time());
+
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        const Intersect1EpilogM<M, Mx, filter> epilog(ray, context, Disc.geomID(), Disc.primID());
+        DiscIntersector1<Mx>::intersect(active, ray, context, points, pre, centerRadius, discNormal, epilog);
+      }
+
+      /*! occlusion query; returns the result produced through Occluded1EpilogM */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+
+        /* per-lane centre (xyz) / radius (w) and disc normal at the time of the ray */
+        Vec4vf<M> centerRadius;
+        Vec3vf<M> discNormal;
+        Disc.gather(centerRadius, discNormal, points, ray.time());
+
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        const Occluded1EpilogM<M, Mx, filter> epilog(ray, context, Disc.geomID(), Disc.primID());
+        return DiscIntersector1<Mx>::intersect(active, ray, context, points, pre, centerRadius, discNormal, epilog);
+      }
+    };
+
+    template<int M, int Mx, int K, bool filter>
+    struct OrientedDiscMiIntersectorK
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      /*! Intersects ray k of the packet with the gathered discs; hit handling is left to Intersect1KEpilogM. */
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> disc_v;
+        Vec3vf<M> disc_n;
+        Disc.gather(disc_v, disc_n, points);
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        DiscIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, disc_v, disc_n,
+            Intersect1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+
+      /*! Occlusion variant for ray k: same gather, but uses Occluded1KEpilogM and returns the intersector's result. */
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> disc_v;
+        Vec3vf<M> disc_n;
+        Disc.gather(disc_v, disc_n, points);
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        return DiscIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, disc_v, disc_n,
+            Occluded1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+    };
+
+    template<int M, int Mx, int K, bool filter>
+    struct OrientedDiscMiMBIntersectorK
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      /*! Intersects ray k of the packet with the discs gathered at ray.time()[k]; hit handling is left to Intersect1KEpilogM. */
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> disc_v;
+        Vec3vf<M> disc_n;
+        Disc.gather(disc_v, disc_n, points, ray.time()[k]);
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        DiscIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, disc_v, disc_n,
+            Intersect1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+
+      /*! Occlusion variant for ray k: same gather at ray.time()[k], but uses Occluded1KEpilogM and returns the intersector's result. */
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> disc_v;
+        Vec3vf<M> disc_n;
+        Disc.gather(disc_v, disc_n, points, ray.time()[k]);
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        return DiscIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, disc_v, disc_n,
+            Occluded1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+    };
+  }  // namespace isa
+}  // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/filter.h b/thirdparty/embree-aarch64/kernels/geometry/filter.h
new file mode 100644
index 0000000000..4cdf7a395a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/filter.h
@@ -0,0 +1,204 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/geometry.h"
+#include "../common/ray.h"
+#include "../common/hit.h"
+#include "../common/context.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Single-hit intersection filter chain: the geometry filter runs first, then the context filter.
+     *  Returns false as soon as a filter clears args->valid[0]; otherwise copies the hit into the ray and returns true. */
+    __forceinline bool runIntersectionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+    {
+      /* filter callback stored on the geometry */
+      if (geometry->intersectionFilterN) {
+        assert(context->scene->hasGeometryFilterFunction());
+        geometry->intersectionFilterN(args);
+        if (!args->valid[0]) return false;
+      }
+
+      /* filter callback carried by the user context */
+      if (context->user->filter) {
+        assert(context->scene->hasContextFilterFunction());
+        context->user->filter(args);
+        if (!args->valid[0]) return false;
+      }
+
+      /* the hit passed every filter that was present */
+      copyHitToRay(*(RayHit*)args->ray,*(Hit*)args->hit);
+      return true;
+    }
+    
+    /*! Wraps one ray/hit pair into RTCFilterFunctionNArguments (N = 1, valid flag preset to -1) and runs the intersection filter chain. */
+    __forceinline bool runIntersectionFilter1(const Geometry* const geometry, RayHit& ray, IntersectContext* context, Hit& hit)
+    {
+      int valid_flag = -1;
+      RTCFilterFunctionNArguments args;
+      args.N = 1;
+      args.valid = &valid_flag;
+      args.context = context->user;
+      args.geometryUserPtr = geometry->userPtr;
+      args.ray = (RTCRayN*)&ray;  args.hit = (RTCHitN*)&hit;
+      return runIntersectionFilter1Helper(&args,geometry,context);
+    }
+
+    /*! Calls the geometry filter and then the context filter on filter_args; the valid-flag early-out between them is commented out. Empty unless EMBREE_FILTER_FUNCTION is defined. */
+    __forceinline void reportIntersection1(IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args)
+    {
+#if defined(EMBREE_FILTER_FUNCTION)
+      const Geometry* const geom = args->geometry;
+      IntersectContext* MAYBE_UNUSED ctx = args->internal_context;
+      if (geom->intersectionFilterN) {
+        assert(ctx->scene->hasGeometryFilterFunction());
+        geom->intersectionFilterN(filter_args);
+      }
+      //if (args->valid[0] == 0)
+      //  return;
+
+      if (ctx->user->filter) {
+        assert(ctx->scene->hasContextFilterFunction());
+        ctx->user->filter(filter_args);
+      }
+#endif
+    }
+    
+    /*! Single-ray occlusion filter chain: the geometry filter runs first, then the context filter.
+     *  Returns false as soon as a filter clears args->valid[0], true otherwise; no hit is copied into the ray here. */
+    __forceinline bool runOcclusionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+    {
+      /* filter callback stored on the geometry */
+      if (geometry->occlusionFilterN) {
+        assert(context->scene->hasGeometryFilterFunction());
+        geometry->occlusionFilterN(args);
+        if (!args->valid[0]) return false;
+      }
+
+      /* filter callback carried by the user context */
+      if (context->user->filter) {
+        assert(context->scene->hasContextFilterFunction());
+        context->user->filter(args);
+        if (!args->valid[0]) return false;
+      }
+
+      return true;
+    }
+
+    /*! Wraps one ray/hit pair into RTCFilterFunctionNArguments (N = 1, valid flag preset to -1) and runs the occlusion filter chain. */
+    __forceinline bool runOcclusionFilter1(const Geometry* const geometry, Ray& ray, IntersectContext* context, Hit& hit)
+    {
+      int valid_flag = -1;
+      RTCFilterFunctionNArguments args;
+      args.N = 1;
+      args.valid = &valid_flag;
+      args.context = context->user;
+      args.geometryUserPtr = geometry->userPtr;
+      args.ray = (RTCRayN*)&ray;  args.hit = (RTCHitN*)&hit;
+      return runOcclusionFilter1Helper(&args,geometry,context);
+    }
+
+    /*! Calls the geometry filter and then the context filter on filter_args; the valid-flag early-out between them is commented out. Empty unless EMBREE_FILTER_FUNCTION is defined. */
+    __forceinline void reportOcclusion1(OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args)
+    {
+#if defined(EMBREE_FILTER_FUNCTION)
+      const Geometry* const geom = args->geometry;
+      IntersectContext* MAYBE_UNUSED ctx = args->internal_context;
+      if (geom->occlusionFilterN) {
+        assert(ctx->scene->hasGeometryFilterFunction());
+        geom->occlusionFilterN(filter_args);
+      }
+      //if (args->valid[0] == 0)
+      //  return false;
+
+      if (ctx->user->filter) {
+        assert(ctx->scene->hasContextFilterFunction());
+        ctx->user->filter(filter_args);
+      }
+#endif
+    }
+
+    /*! K-wide intersection filter chain: runs the geometry and context filters, returns the lanes whose valid mask is still non-zero and copies the hit into the ray for those lanes. */
+    template<int K>
+      __forceinline vbool<K> runIntersectionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+    {
+      vint<K>* lane_mask = (vint<K>*) args->valid;
+      if (geometry->intersectionFilterN) {
+        assert(context->scene->hasGeometryFilterFunction());
+        geometry->intersectionFilterN(args);
+      }
+
+      /* stop early when no lane is left after the geometry filter */
+      vbool<K> valid_o = *lane_mask != vint<K>(zero);
+      if (none(valid_o)) return valid_o;
+
+      if (context->user->filter) {
+        assert(context->scene->hasContextFilterFunction());
+        context->user->filter(args);
+      }
+      /* re-evaluate the mask after the context filter */
+      valid_o = *lane_mask != vint<K>(zero);
+      if (none(valid_o)) return valid_o;
+      copyHitToRay(valid_o,*(RayHitK<K>*)args->ray,*(HitK<K>*)args->hit);
+      return valid_o;
+    }
+    
+    /*! Wraps a K-wide ray/hit packet into RTCFilterFunctionNArguments (N = K, valid taken from valid.mask32()) and runs the intersection filter chain. */
+    template<int K>
+    __forceinline vbool<K> runIntersectionFilter(const vbool<K>& valid, const Geometry* const geometry, RayHitK<K>& ray, IntersectContext* context, HitK<K>& hit)
+    {
+      vint<K> lane_mask = valid.mask32();
+      RTCFilterFunctionNArguments args;
+      args.N = K;
+      args.valid = (int*)&lane_mask;
+      args.context = context->user;
+      args.geometryUserPtr = geometry->userPtr;
+      args.ray = (RTCRayN*)&ray;  args.hit = (RTCHitN*)&hit;
+      return runIntersectionFilterHelper<K>(&args,geometry,context);
+    }
+
+    /*! K-wide occlusion filter chain: runs the geometry and context filters on the packet and returns the lanes whose
+     *  valid mask is still non-zero. Unless the geometry filter already cleared every lane, tfar of the returned lanes is set to neg_inf. */
+    template<int K>
+      __forceinline vbool<K> runOcclusionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+    {
+      vint<K>* lane_mask = (vint<K>*) args->valid;
+      if (geometry->occlusionFilterN) {
+        assert(context->scene->hasGeometryFilterFunction());
+        geometry->occlusionFilterN(args);
+      }
+
+      vbool<K> valid_o = *lane_mask != vint<K>(zero);
+      if (none(valid_o)) return valid_o;
+
+      if (context->user->filter) {
+        assert(context->scene->hasContextFilterFunction());
+        context->user->filter(args);
+      }
+      valid_o = *lane_mask != vint<K>(zero);
+
+      /* lanes that are still valid get tfar = neg_inf, all other lanes keep their tfar */
+      RayK<K>* packet = (RayK<K>*) args->ray;
+      packet->tfar = select(valid_o, vfloat<K>(neg_inf), packet->tfar);
+      return valid_o;
+    }
+
+    /*! Wraps a K-wide ray packet and hit into RTCFilterFunctionNArguments (N = K, valid taken from valid.mask32()) and runs the occlusion filter chain. */
+    template<int K>
+      __forceinline vbool<K> runOcclusionFilter(const vbool<K>& valid, const Geometry* const geometry, RayK<K>& ray, IntersectContext* context, HitK<K>& hit)
+    {
+      vint<K> lane_mask = valid.mask32();
+      RTCFilterFunctionNArguments args;
+      args.N = K;
+      args.valid = (int*)&lane_mask;
+      args.context = context->user;
+      args.geometryUserPtr = geometry->userPtr;
+      args.ray = (RTCRayN*)&ray;  args.hit = (RTCHitN*)&hit;
+      return runOcclusionFilterHelper<K>(&args,geometry,context);
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/grid_intersector.h
new file mode 100644
index 0000000000..46a0af0827
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/grid_intersector.h
@@ -0,0 +1,99 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "grid_soa.h"
+#include "grid_soa_intersector1.h"
+#include "grid_soa_intersector_packet.h"
+#include "../common/ray.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Thin wrapper deriving from T; its constructor forwards (ray, ptr) unchanged to T. */
+    template<typename T>
+    class SubdivPatch1Precalculations : public T
+    {
+    public:
+      __forceinline SubdivPatch1Precalculations (const Ray& ray, const void* ptr) : T(ray,ptr) {}
+    };
+
+    /*! Thin wrapper deriving from T; its constructor forwards (valid, ray) unchanged to T. */
+    template<int K, typename T>
+    class SubdivPatch1PrecalculationsK : public T
+    {
+    public:
+      __forceinline SubdivPatch1PrecalculationsK (const vbool<K>& valid, RayK<K>& ray) : T(valid,ray) {}
+    };
+
+    /*! Single-ray intersector for GridSOA primitives; ray tests are forwarded to GridSOAIntersector1, the ty/ty0 arguments are ignored. */
+    class Grid1Intersector1
+    {
+    public:
+      typedef GridSOA Primitive;
+      typedef Grid1Precalculations<GridSOAIntersector1::Precalculations> Precalculations; // NOTE(review): Grid1Precalculations is not declared in this header - confirm an include provides it
+
+      /*! Intersect a ray with the primitive. */
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) {
+        GridSOAIntersector1::intersect(pre,ray,context,prim,lazy_node);
+      }
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) {
+        intersect(pre,ray,context,prim,ty,lazy_node);
+      }
+
+      /*! Test if the ray is occluded by the primitive; returns the result of GridSOAIntersector1::occluded (previously dropped, leaving the return value undefined). */
+      static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) {
+        return GridSOAIntersector1::occluded(pre,ray,context,prim,lazy_node);
+      }
+      static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) {
+        return occluded(pre,ray,context,prim,ty,lazy_node);
+      }
+
+      /*! Point queries are not implemented: both overloads trip an assert (when assertions are enabled) and return false. */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) {
+        assert(false && "not implemented");
+        return false;
+      }
+
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) {
+        assert(false && "not implemented");
+        return false;
+      }
+    };
+
+    /*! Packet (K-wide) intersector for GridSOA primitives; every call is forwarded to GridSOAIntersectorK<K>, the ty argument is ignored. */
+    template <int K>
+      struct GridIntersectorK
+    {
+      typedef GridSOA Primitive;
+      typedef SubdivPatch1PrecalculationsK<K,typename GridSOAIntersectorK<K>::Precalculations> Precalculations;
+
+      /*! Intersect the rays selected by valid with the primitive. */
+      static __forceinline void intersect(const vbool<K>& valid, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) {
+        GridSOAIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node);
+      }
+
+      /*! Occlusion test for the rays selected by valid; returns the mask reported by GridSOAIntersectorK<K>::occluded (previously dropped). */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) {
+        return GridSOAIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node);
+      }
+
+      /*! Intersect ray k of the packet with the primitive. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) {
+        GridSOAIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node);
+      }
+
+      /*! Occlusion test for ray k of the packet; returns the result of GridSOAIntersectorK<K>::occluded (previously dropped). */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) {
+        return GridSOAIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node);
+      }
+    };
+
+    typedef Grid1IntersectorK<4>  SubdivPatch1Intersector4;  // NOTE(review): the template defined in this header is GridIntersectorK, not Grid1IntersectorK -
+    typedef Grid1IntersectorK<8>  SubdivPatch1Intersector8;  //   confirm that an included header declares Grid1IntersectorK
+    typedef Grid1IntersectorK<16> SubdivPatch1Intersector16;
+
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_soa.h b/thirdparty/embree-aarch64/kernels/geometry/grid_soa.h
new file mode 100644
index 0000000000..d3b275586c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/grid_soa.h
@@ -0,0 +1,275 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/scene_subdiv_mesh.h"
+#include "../bvh/bvh.h"
+#include "../subdiv/tessellation.h"
+#include "../subdiv/tessellation_cache.h"
+#include "subdivpatch1.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    class GridSOA
+    {
+    public:
+
+      /*! GridSOA constructor */
+      GridSOA(const SubdivPatch1Base* patches, const unsigned time_steps,
+              const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight,
+              const SubdivMesh* const geom, const size_t totalBvhBytes, const size_t gridBytes, BBox3fa* bounds_o = nullptr);
+
+      /*! Subgrid creation: sizes the BVH, grid and root storage for the range (x0..x1, y0..y1), allocates it through alloc and placement-constructs the GridSOA in that memory */
+      template<typename Allocator>
+        static GridSOA* create(const SubdivPatch1Base* patches, const unsigned time_steps,
+                               unsigned x0, unsigned x1, unsigned y0, unsigned y1, 
+                               const Scene* scene, Allocator& alloc, BBox3fa* bounds_o = nullptr)
+      {
+        const unsigned width = x1-x0+1;  
+        const unsigned height = y1-y0+1; 
+        const GridRange range(0,width-1,0,height-1);
+        size_t bvhBytes = 0;
+        if (time_steps == 1) 
+          bvhBytes = getBVHBytes(range,sizeof(BVH4::AABBNode),0);
+        else {
+          bvhBytes = (time_steps-1)*getBVHBytes(range,sizeof(BVH4::AABBNodeMB),0);
+          bvhBytes += getTemporalBVHBytes(make_range(0,int(time_steps-1)),sizeof(BVH4::AABBNodeMB4D));
+        }
+        const size_t gridBytes = 4*size_t(width)*size_t(height)*sizeof(float);  // four floats per grid vertex, for one time step
+        size_t rootBytes = time_steps*sizeof(BVH4::NodeRef);
+#if !defined(__X86_64__) && !defined(__aarch64__)
+        rootBytes += 4; // We read 2 elements behind the grid. As we store at least 8 root bytes after the grid we are fine in 64 bit mode. But in 32 bit mode we have to do additional padding.
+#endif
+        void* data = alloc(offsetof(GridSOA,data)+bvhBytes+time_steps*gridBytes+rootBytes);
+        assert(data);
+        return new (data) GridSOA(patches,time_steps,x0,x1,y0,y1,patches->grid_u_res,patches->grid_v_res,scene->get<SubdivMesh>(patches->geomID()),bvhBytes,gridBytes,bounds_o);
+      }
+
+      /*! Grid creation: builds a GridSOA over the full patch range (0..grid_u_res-1, 0..grid_v_res-1) */
+      template<typename Allocator>
+        static GridSOA* create(const SubdivPatch1Base* const patches, const unsigned time_steps,
+                               const Scene* scene, const Allocator& alloc, BBox3fa* bounds_o = nullptr) 
+      {
+        return create(patches,time_steps,0,patches->grid_u_res-1,0,patches->grid_v_res-1,scene,alloc,bounds_o);
+      }
+
+      /*! returns reference to the root stored for time step t */
+      __forceinline       BVH4::NodeRef& root(size_t t = 0)       { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; }
+      __forceinline const BVH4::NodeRef& root(size_t t = 0) const { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; }
+
+      /*! returns pointer to BVH array */
+      __forceinline       int8_t* bvhData()       { return &data[0]; }
+      __forceinline const int8_t* bvhData() const { return &data[0]; }
+
+      /*! returns pointer to Grid array of time step t */
+      __forceinline       float* gridData(size_t t = 0)       { return (float*) &data[gridOffset + t*gridBytes]; }
+      __forceinline const float* gridData(size_t t = 0) const { return (float*) &data[gridOffset + t*gridBytes]; }
+      /*! leaf encoding: vertex (u,v) is stored as the pointer value 16*(v*width+u+1); decodeLeaf maps such a value back to a pointer into the grid of time step t */
+      __forceinline void* encodeLeaf(size_t u, size_t v) {
+        return (void*) (16*(v * width + u + 1)); // +1 to not create empty leaf
+      }
+      __forceinline float* decodeLeaf(size_t t, const void* ptr) {
+        return gridData(t) + (((size_t) (ptr) >> 4) - 1);
+      }
+
+      /*! returns the size of the BVH over the grid in bytes */
+      static size_t getBVHBytes(const GridRange& range, const size_t nodeBytes, const size_t leafBytes);
+
+      /*! returns the size of the temporal BVH over the time range BVHs */
+      static size_t getTemporalBVHBytes(const range<int> time_range, const size_t nodeBytes);
+
+      /*! calculates bounding box of grid range */
+      __forceinline BBox3fa calculateBounds(size_t time, const GridRange& range) const
+      {
+        const float* const grid_array = gridData(time);
+        const float* const grid_x_array = grid_array + 0 * dim_offset;
+        const float* const grid_y_array = grid_array + 1 * dim_offset;
+        const float* const grid_z_array = grid_array + 2 * dim_offset;
+        
+        /* compute the bounds just for the range! */
+        BBox3fa bounds( empty );
+        for (unsigned v = range.v_start; v<=range.v_end; v++) 
+        {
+          for (unsigned u = range.u_start; u<=range.u_end; u++)
+          {
+            const float x = grid_x_array[ v * width + u];
+            const float y = grid_y_array[ v * width + u];
+            const float z = grid_z_array[ v * width + u];
+            bounds.extend( Vec3fa(x,y,z) );
+          }
+        }
+        assert(is_finite(bounds));
+        return bounds;
+      }
+
+      /*! Evaluates grid over patch and builds BVH4 tree over the grid. */
+      std::pair<BVH4::NodeRef,BBox3fa> buildBVH(BBox3fa* bounds_o);
+      
+      /*! Create BVH4 tree over grid. */
+      std::pair<BVH4::NodeRef,BBox3fa> buildBVH(const GridRange& range, size_t& allocator);
+
+      /*! Evaluates grid over patch and builds MSMBlur BVH4 tree over the grid. */
+      std::pair<BVH4::NodeRef,LBBox3fa> buildMSMBlurBVH(const range<int> time_range, BBox3fa* bounds_o);
+      
+      /*! Create MBlur BVH4 tree over grid. */
+      std::pair<BVH4::NodeRef,LBBox3fa> buildMBlurBVH(size_t time, const GridRange& range, size_t& allocator);
+
+      /*! Create MSMBlur BVH4 tree over grid. */
+      std::pair<BVH4::NodeRef,LBBox3fa> buildMSMBlurBVH(const range<int> time_range, size_t& allocator, BBox3fa* bounds_o);
+      /*! Functor that replaces the incoming (u,v) by u*uv1 + v*uv2 + (1-u-v)*uv0, where uv0/uv1/uv2 are decoded from the values the Loader gathers from grid_uv */
+      template<typename Loader>
+        struct MapUV
+      {
+        typedef typename Loader::vfloat vfloat;
+        const float* const grid_uv;
+        size_t line_offset;
+        size_t lines;
+
+        __forceinline MapUV(const float* const grid_uv, size_t line_offset, const size_t lines)
+          : grid_uv(grid_uv), line_offset(line_offset), lines(lines) {}
+
+        __forceinline void operator() (vfloat& u, vfloat& v) const {
+          const Vec3<vfloat> tri_v012_uv = Loader::gather(grid_uv,line_offset,lines);	
+          const Vec2<vfloat> uv0 = GridSOA::decodeUV(tri_v012_uv[0]);
+          const Vec2<vfloat> uv1 = GridSOA::decodeUV(tri_v012_uv[1]);
+          const Vec2<vfloat> uv2 = GridSOA::decodeUV(tri_v012_uv[2]);        
+          const Vec2<vfloat> uv = u * uv1 + v * uv2 + (1.0f-u-v) * uv0;        
+          u = uv[0];v = uv[1]; 
+        }
+      };
+      /*! 4-wide loader: reads two consecutive grid lines and arranges them into the three vectors (v0,v1,v2) returned by gather */
+      struct Gather2x3
+      {
+        enum { M = 4 };
+        typedef vbool4 vbool;
+        typedef vint4 vint;
+        typedef vfloat4 vfloat;
+        
+        static __forceinline const Vec3vf4 gather(const float* const grid, const size_t line_offset, const size_t lines)
+        {
+          vfloat4 r0 = vfloat4::loadu(grid + 0*line_offset);
+          vfloat4 r1 = vfloat4::loadu(grid + 1*line_offset); // this accesses 2 elements too much in case of 2x2 grid, but this is ok as we ensure enough padding after the grid
+          if (unlikely(line_offset == 2))
+          {
+            r0 = shuffle<0,1,1,1>(r0);
+            r1 = shuffle<0,1,1,1>(r1);
+          }
+          return Vec3vf4(unpacklo(r0,r1),       // r00, r10, r01, r11
+                         shuffle<1,1,2,2>(r0),  // r01, r01, r02, r02
+                         shuffle<0,1,1,2>(r1)); // r10, r11, r11, r12
+        }
+
+        static __forceinline void gather(const float* const grid_x, 
+                                         const float* const grid_y, 
+                                         const float* const grid_z, 
+                                         const size_t line_offset,
+                                         const size_t lines,
+                                         Vec3vf4& v0_o,
+                                         Vec3vf4& v1_o,
+                                         Vec3vf4& v2_o)
+        {
+          const Vec3vf4 tri_v012_x = gather(grid_x,line_offset,lines);
+          const Vec3vf4 tri_v012_y = gather(grid_y,line_offset,lines);
+          const Vec3vf4 tri_v012_z = gather(grid_z,line_offset,lines);
+          v0_o = Vec3vf4(tri_v012_x[0],tri_v012_y[0],tri_v012_z[0]);
+          v1_o = Vec3vf4(tri_v012_x[1],tri_v012_y[1],tri_v012_z[1]);
+          v2_o = Vec3vf4(tri_v012_x[2],tri_v012_y[2],tri_v012_z[2]);
+        }
+      };
+      /*! 8-wide loader (compiled only with __AVX__): combines the line pairs (0,1) and (1,2); when lines <= 2 the second line is reused as the third */
+#if defined (__AVX__)
+      struct Gather3x3
+      {
+        enum { M = 8 };
+        typedef vbool8 vbool;
+        typedef vint8 vint;
+        typedef vfloat8 vfloat;
+        
+        static __forceinline const Vec3vf8 gather(const float* const grid, const size_t line_offset, const size_t lines)
+        {
+          vfloat4 ra = vfloat4::loadu(grid + 0*line_offset);
+          vfloat4 rb = vfloat4::loadu(grid + 1*line_offset); // this accesses 2 elements too much in case of 2x2 grid, but this is ok as we ensure enough padding after the grid
+          vfloat4 rc;
+          if (likely(lines > 2)) 
+            rc = vfloat4::loadu(grid + 2*line_offset);
+          else                   
+            rc = rb;
+
+          if (unlikely(line_offset == 2))
+          {
+            ra = shuffle<0,1,1,1>(ra);
+            rb = shuffle<0,1,1,1>(rb);
+            rc = shuffle<0,1,1,1>(rc);
+          }
+          
+          const vfloat8 r0 = vfloat8(ra,rb);
+          const vfloat8 r1 = vfloat8(rb,rc);
+          return Vec3vf8(unpacklo(r0,r1),         // r00, r10, r01, r11, r10, r20, r11, r21
+                         shuffle<1,1,2,2>(r0),    // r01, r01, r02, r02, r11, r11, r12, r12
+                         shuffle<0,1,1,2>(r1));   // r10, r11, r11, r12, r20, r21, r21, r22
+        }
+
+        static __forceinline void gather(const float* const grid_x, 
+                                         const float* const grid_y, 
+                                         const float* const grid_z, 
+                                         const size_t line_offset,
+                                         const size_t lines,
+                                         Vec3vf8& v0_o,
+                                         Vec3vf8& v1_o,
+                                         Vec3vf8& v2_o)
+        {
+          const Vec3vf8 tri_v012_x = gather(grid_x,line_offset,lines);
+          const Vec3vf8 tri_v012_y = gather(grid_y,line_offset,lines);
+          const Vec3vf8 tri_v012_z = gather(grid_z,line_offset,lines);
+          v0_o = Vec3vf8(tri_v012_x[0],tri_v012_y[0],tri_v012_z[0]);
+          v1_o = Vec3vf8(tri_v012_x[1],tri_v012_y[1],tri_v012_z[1]);
+          v2_o = Vec3vf8(tri_v012_x[2],tri_v012_y[2],tri_v012_z[2]);
+        }
+      };
+#endif
+      /*! unpacks u from the low 16 bits of the bit pattern of uv and v from that pattern shifted right by 16 (srl), scaling both by 8.0f/0x10000 */
+      template<typename vfloat>
+      static __forceinline Vec2<vfloat> decodeUV(const vfloat& uv)
+      {
+        typedef typename vfloat::Int vint;
+        const vint iu  = asInt(uv) & 0xffff;
+        const vint iv  = srl(asInt(uv),16);
+	const vfloat u = (vfloat)iu * vfloat(8.0f/0x10000);
+	const vfloat v = (vfloat)iv * vfloat(8.0f/0x10000);
+	return Vec2<vfloat>(u,v);
+      }
+      
+      __forceinline unsigned int geomID() const  {
+        return _geomID;
+      } 
+      
+      __forceinline unsigned int primID() const  {
+        return _primID;
+      } 
+
+    public:
+      BVH4::NodeRef troot;
+#if !defined(__X86_64__) && !defined(__aarch64__)
+      unsigned align1;
+#endif
+      unsigned time_steps;
+      unsigned width;
+
+      unsigned height;
+      unsigned dim_offset; //!< offset in floats between the x, y and z arrays of one grid (see calculateBounds)
+      unsigned _geomID;
+      unsigned _primID;
+
+      unsigned align2;
+      unsigned gridOffset; //!< byte offset into data of the grid of time step 0 (see gridData)
+      unsigned gridBytes;  //!< byte stride between the grids of consecutive time steps (see gridData)
+      unsigned rootOffset; //!< byte offset into data of the first root reference (see root)
+
+      int8_t data[1];      //!< after the struct we first store the BVH, then the grid, and finally the roots
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector1.h b/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector1.h
new file mode 100644
index 0000000000..2ed922a5ae
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector1.h
@@ -0,0 +1,207 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "grid_soa.h"
+#include "../common/ray.h"
+#include "triangle_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    class GridSOAIntersector1
+    {
+    public:
+      typedef void Primitive;
+      
+      class Precalculations
+      { 
+      public:
+        __forceinline Precalculations (const Ray& ray, const void* ptr) // neither argument is used by this constructor
+          : grid(nullptr), itime(0), ftime(0.0f) {} // defined start state: no grid, time index 0, weight 0
+        
+      public:
+        GridSOA* grid;  // dereferenced unconditionally by the intersect/occluded methods, so it must be set before they run
+        int itime;      // time index GridSOAMBIntersector1 hands to GridSOA::decodeLeaf()
+        float ftime;    // interpolation weight GridSOAMBIntersector1 hands to lerp()
+      };
+      
+      template<typename Loader> // Loader provides vfloat, M and a static gather(); callers use GridSOA::Gather2x3 or Gather3x3
+        static __forceinline void intersect(RayHit& ray, // single-ray intersection against the triangles gathered at grid_x
+                                            IntersectContext* context, 
+                                            const float* const grid_x, // start of the x coordinates to gather from
+                                            const size_t line_offset,  // forwarded unchanged to Loader::gather and MapUV
+                                            const size_t lines,        // forwarded unchanged to Loader::gather and MapUV
+                                            Precalculations& pre)      // pre.grid is dereferenced for dim_offset, geomID() and primID()
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const float* const grid_y  = grid_x + 1 * dim_offset; // y, z and uv arrays follow x at multiples of dim_offset floats
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+        Vec3<vfloat> v0, v1, v2;
+        Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2); // fills v0, v1, v2 from the three coordinate arrays
+        GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines);
+        PlueckerIntersector1<Loader::M> intersector(ray,nullptr);
+        intersector.intersect(ray,v0,v1,v2,mapUV,Intersect1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); // the epilog is given the grid's geomID/primID
+      };
+      
+      template<typename Loader> // Loader provides vfloat, M and a static gather(); callers use GridSOA::Gather2x3 or Gather3x3
+        static __forceinline bool occluded(Ray& ray, // single-ray occlusion test; returns whatever the Pluecker intersector returns
+                                           IntersectContext* context, 
+                                           const float* const grid_x, // start of the x coordinates to gather from
+                                           const size_t line_offset,  // forwarded unchanged to Loader::gather and MapUV
+                                           const size_t lines,        // forwarded unchanged to Loader::gather and MapUV
+                                           Precalculations& pre)      // pre.grid is dereferenced for dim_offset, geomID() and primID()
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const float* const grid_y  = grid_x + 1 * dim_offset; // y, z and uv arrays follow x at multiples of dim_offset floats
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        Vec3<vfloat> v0, v1, v2;
+        Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2); // fills v0, v1, v2 from the three coordinate arrays
+        
+        GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines);
+        PlueckerIntersector1<Loader::M> intersector(ray,nullptr);
+        return intersector.intersect(ray,v0,v1,v2,mapUV,Occluded1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+      }
+      
+      /*! Intersect a ray with the primitive. Uses one Gather3x3 pass when __AVX__ is defined, otherwise one or two Gather2x3 passes. lazy_node is not touched here. */
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) 
+      {
+        const size_t line_offset   = pre.grid->width;  // grid width doubles as the stride between lines
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(0,prim); // time index 0: this is the non-motion-blur variant
+        
+#if defined(__AVX__)
+        intersect<GridSOA::Gather3x3>( ray, context, grid_x, line_offset, lines, pre);
+#else
+        intersect<GridSOA::Gather2x3>(ray, context, grid_x            , line_offset, lines, pre);
+        if (likely(lines > 2)) // second pass starts one line further down and only runs for grids with more than 2 lines
+          intersect<GridSOA::Gather2x3>(ray, context, grid_x+line_offset, line_offset, lines, pre);
+#endif
+      }
+      
+      /*! Test if the ray is occluded by the primitive. Returns true as soon as one gather pass reports occlusion. lazy_node is not touched here. */
+      static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t line_offset   = pre.grid->width;  // grid width doubles as the stride between lines
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(0,prim); // time index 0: this is the non-motion-blur variant
+        
+#if defined(__AVX__)
+        return occluded<GridSOA::Gather3x3>( ray, context, grid_x, line_offset, lines, pre);
+#else
+        if (occluded<GridSOA::Gather2x3>(ray, context, grid_x            , line_offset, lines, pre)) return true;
+        if (likely(lines > 2)) // second pass starts one line further down and only runs for grids with more than 2 lines
+          if (occluded<GridSOA::Gather2x3>(ray, context, grid_x+line_offset, line_offset, lines, pre)) return true;
+#endif
+        return false; // only reached in the non-AVX build, after every pass reported no occlusion
+      }      
+    };
+
+    class GridSOAMBIntersector1
+    {
+    public:
+      typedef void Primitive;
+      typedef GridSOAIntersector1::Precalculations Precalculations;
+      
+      template<typename Loader> // Loader provides vfloat, M and a static gather(); callers use GridSOA::Gather2x3 or Gather3x3
+        static __forceinline void intersect(RayHit& ray, const float ftime, // ftime is the lerp weight between the two gathered vertex sets
+                                            IntersectContext* context, 
+                                            const float* const grid_x, // start of the x coordinates of the first vertex set
+                                            const size_t line_offset,  // forwarded unchanged to Loader::gather and MapUV
+                                            const size_t lines,        // forwarded unchanged to Loader::gather and MapUV
+                                            Precalculations& pre)      // pre.grid is dereferenced for dim_offset, gridBytes, geomID() and primID()
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const size_t grid_offset   = pre.grid->gridBytes >> 2; // byte count divided by 4, then used as an offset on float pointers
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        Vec3<vfloat> a0, a1, a2; // vertices gathered at grid_x
+        Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2);
+
+        Vec3<vfloat> b0, b1, b2; // vertices gathered grid_offset floats further on
+        Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2);
+
+        Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime)); // blend the two vertex sets with weight ftime
+        Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime));
+        Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime));
+
+        GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines); // uv is taken from the first set only (no grid_offset applied)
+        PlueckerIntersector1<Loader::M> intersector(ray,nullptr);
+        intersector.intersect(ray,v0,v1,v2,mapUV,Intersect1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+      };
+      
+      template<typename Loader> // Loader provides vfloat, M and a static gather(); callers use GridSOA::Gather2x3 or Gather3x3
+        static __forceinline bool occluded(Ray& ray, const float ftime, // ftime is the lerp weight between the two gathered vertex sets
+                                           IntersectContext* context, 
+                                           const float* const grid_x, // start of the x coordinates of the first vertex set
+                                           const size_t line_offset,  // forwarded unchanged to Loader::gather and MapUV
+                                           const size_t lines,        // forwarded unchanged to Loader::gather and MapUV
+                                           Precalculations& pre)      // pre.grid is dereferenced for dim_offset, gridBytes, geomID() and primID()
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const size_t grid_offset   = pre.grid->gridBytes >> 2; // byte count divided by 4, then used as an offset on float pointers
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        Vec3<vfloat> a0, a1, a2; // vertices gathered at grid_x
+        Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2);
+
+        Vec3<vfloat> b0, b1, b2; // vertices gathered grid_offset floats further on
+        Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2);
+       
+        Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime)); // blend the two vertex sets with weight ftime
+        Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime));
+        Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime));
+        
+        GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines); // uv is taken from the first set only (no grid_offset applied)
+        PlueckerIntersector1<Loader::M> intersector(ray,nullptr);
+        return intersector.intersect(ray,v0,v1,v2,mapUV,Occluded1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+      }
+      
+      /*! Intersect a ray with the primitive. Reads pre.itime and pre.ftime, which the caller must have set; lazy_node is not touched here. */
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) 
+      { 
+        const size_t line_offset   = pre.grid->width;  // grid width doubles as the stride between lines
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(pre.itime,prim); // leaf data for the time index stored in pre
+        
+#if defined(__AVX__)
+        intersect<GridSOA::Gather3x3>( ray, pre.ftime, context, grid_x, line_offset, lines, pre);
+#else
+        intersect<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x, line_offset, lines, pre);
+        if (likely(lines > 2)) // second pass starts one line further down and only runs for grids with more than 2 lines
+          intersect<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x+line_offset, line_offset, lines, pre);
+#endif
+      }
+      
+      /*! Test if the ray is occluded by the primitive. Reads pre.itime and pre.ftime, which the caller must have set; lazy_node is not touched here. */
+      static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t line_offset   = pre.grid->width;  // grid width doubles as the stride between lines
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(pre.itime,prim); // leaf data for the time index stored in pre
+        
+#if defined(__AVX__)
+        return occluded<GridSOA::Gather3x3>( ray, pre.ftime, context, grid_x, line_offset, lines, pre);
+#else
+        if (occluded<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x            , line_offset, lines, pre)) return true;
+        if (likely(lines > 2)) // second pass starts one line further down and only runs for grids with more than 2 lines
+          if (occluded<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x+line_offset, line_offset, lines, pre)) return true;
+#endif
+        return false; // only reached in the non-AVX build, after every pass reported no occlusion
+      }      
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector_packet.h b/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector_packet.h
new file mode 100644
index 0000000000..41d66e1e28
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector_packet.h
@@ -0,0 +1,445 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "grid_soa.h"
+#include "../common/ray.h"
+#include "triangle_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int K>
+      struct MapUV0
+    {
+      const float* const grid_uv;         // packed uv values, indexed by the offsets below
+      size_t ofs00, ofs01, ofs10, ofs11;  // offsets of the four quad corners; ofs11 is stored but not read by this functor
+      // Functor that replaces triangle-local (u,v) by the uv interpolated from quad corners 00, 01 and 10.
+      __forceinline MapUV0(const float* const grid_uv, size_t ofs00, size_t ofs01, size_t ofs10, size_t ofs11)
+        : grid_uv(grid_uv), ofs00(ofs00), ofs01(ofs01), ofs10(ofs10), ofs11(ofs11) {}
+      
+      __forceinline void operator() (vfloat<K>& u, vfloat<K>& v) const { // u and v are updated in place
+        const vfloat<K> uv00(grid_uv[ofs00]);
+        const vfloat<K> uv01(grid_uv[ofs01]);
+        const vfloat<K> uv10(grid_uv[ofs10]);
+        // corner 11 does not belong to this triangle, so grid_uv[ofs11] is not loaded
+        const Vec2vf<K> uv0 = GridSOA::decodeUV(uv00);
+        const Vec2vf<K> uv1 = GridSOA::decodeUV(uv01);
+        const Vec2vf<K> uv2 = GridSOA::decodeUV(uv10);
+        const Vec2vf<K> uv = madd(u,uv1,madd(v,uv2,(1.0f-u-v)*uv0)); // combines uv0, uv1, uv2 with weights (1-u-v), u, v
+        u = uv[0]; v = uv[1];
+      }
+    };
+    
+    template<int K>
+      struct MapUV1
+    {
+      const float* const grid_uv;         // packed uv values, indexed by the offsets below
+      size_t ofs00, ofs01, ofs10, ofs11;  // offsets of the four quad corners; ofs00 is stored but not read by this functor
+      // Functor that replaces triangle-local (u,v) by the uv interpolated from quad corners 10, 01 and 11.
+      __forceinline MapUV1(const float* const grid_uv, size_t ofs00, size_t ofs01, size_t ofs10, size_t ofs11)
+        : grid_uv(grid_uv), ofs00(ofs00), ofs01(ofs01), ofs10(ofs10), ofs11(ofs11) {}
+      
+      __forceinline void operator() (vfloat<K>& u, vfloat<K>& v) const { // u and v are updated in place
+        // corner 00 does not belong to this triangle, so grid_uv[ofs00] is not loaded
+        const vfloat<K> uv01(grid_uv[ofs01]);
+        const vfloat<K> uv10(grid_uv[ofs10]);
+        const vfloat<K> uv11(grid_uv[ofs11]);
+        const Vec2vf<K> uv0 = GridSOA::decodeUV(uv10);
+        const Vec2vf<K> uv1 = GridSOA::decodeUV(uv01);
+        const Vec2vf<K> uv2 = GridSOA::decodeUV(uv11);
+        const Vec2vf<K> uv = madd(u,uv1,madd(v,uv2,(1.0f-u-v)*uv0)); // combines uv0, uv1, uv2 with weights (1-u-v), u, v
+        u = uv[0]; v = uv[1];
+      }
+    };
+    
+    template<int K>
+      class GridSOAIntersectorK
+    {
+    public:
+      typedef void Primitive;
+
+      class Precalculations
+      {
+#if defined(__AVX__)
+        static const int M = 8; // first template argument of the PlueckerIntersectorK member below
+#else
+        static const int M = 4; // first template argument of the PlueckerIntersectorK member below
+#endif
+
+      public:
+        __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray)
+          : grid(nullptr), intersector(valid,ray) {} // no grid attached yet; the intersector is built from the packet
+
+      public:
+        GridSOA* grid; // dereferenced unconditionally by the intersect/occluded methods, so it must be set before they run
+        PlueckerIntersectorK<M,K> intersector; // FIXME: use quad intersector
+      };
+
+      /*! Intersect a ray packet with the primitive: walks the grid quad by quad and tests two triangles per quad. lazy_node is not touched here. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const size_t line_offset   = pre.grid->width;
+        const float* const grid_x  = pre.grid->decodeLeaf(0,prim); // time index 0: this is the non-motion-blur variant
+        const float* const grid_y  = grid_x + 1 * dim_offset; // y, z and uv arrays follow x at multiples of dim_offset floats
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        const size_t max_x = pre.grid->width  == 2 ? 1 : 2; // one quad column for width 2, otherwise two
+        const size_t max_y = pre.grid->height == 2 ? 1 : 2; // one quad row for height 2, otherwise two
+        for (size_t y=0; y<max_y; y++)        
+        {
+          for (size_t x=0; x<max_x; x++)
+          {
+            const size_t ofs00 = (y+0)*line_offset+(x+0); // offsets of the four corners of quad (x,y)
+            const size_t ofs01 = (y+0)*line_offset+(x+1);
+            const size_t ofs10 = (y+1)*line_offset+(x+0);
+            const size_t ofs11 = (y+1)*line_offset+(x+1);
+            const Vec3vf<K> p00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+            const Vec3vf<K> p01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+            const Vec3vf<K> p10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+            const Vec3vf<K> p11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+
+            pre.intersector.intersectK(valid_i,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); // triangle (00,01,10)
+            pre.intersector.intersectK(valid_i,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); // triangle (10,01,11)
+          }
+        }
+      }
+
+      /*! Test if the rays of a packet are occluded by the primitive. Returns the negation of the working mask 'valid'. lazy_node is not touched here. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const size_t line_offset   = pre.grid->width;
+        const float* const grid_x  = pre.grid->decodeLeaf(0,prim); // time index 0: this is the non-motion-blur variant
+        const float* const grid_y  = grid_x + 1 * dim_offset; // y, z and uv arrays follow x at multiples of dim_offset floats
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        vbool<K> valid = valid_i; // working copy handed to the epilog; presumably the epilog clears occluded lanes - confirm
+        const size_t max_x = pre.grid->width  == 2 ? 1 : 2; // one quad column for width 2, otherwise two
+        const size_t max_y = pre.grid->height == 2 ? 1 : 2; // one quad row for height 2, otherwise two
+        for (size_t y=0; y<max_y; y++)        
+        {
+          for (size_t x=0; x<max_x; x++)
+          {
+            const size_t ofs00 = (y+0)*line_offset+(x+0); // offsets of the four corners of quad (x,y)
+            const size_t ofs01 = (y+0)*line_offset+(x+1);
+            const size_t ofs10 = (y+1)*line_offset+(x+0);
+            const size_t ofs11 = (y+1)*line_offset+(x+1);
+            const Vec3vf<K> p00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+            const Vec3vf<K> p01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+            const Vec3vf<K> p10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+            const Vec3vf<K> p11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+
+            pre.intersector.intersectK(valid,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); // triangle (00,01,10)
+            if (none(valid)) break; // NOTE(review): leaves only the inner x loop; the y loop still advances
+            pre.intersector.intersectK(valid,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); // triangle (10,01,11)
+            if (none(valid)) break; // NOTE(review): leaves only the inner x loop; the y loop still advances
+          }
+        }
+        return !valid;
+      }
+
+      template<typename Loader> // Loader provides vfloat, M and a static gather(); callers use GridSOA::Gather2x3 or Gather3x3
+        static __forceinline void intersect(RayHitK<K>& ray, size_t k, // k selects the ray of the packet to intersect
+                                            IntersectContext* context,
+                                            const float* const grid_x, // start of the x coordinates to gather from
+                                            const size_t line_offset,  // forwarded unchanged to Loader::gather and MapUV
+                                            const size_t lines,        // forwarded unchanged to Loader::gather and MapUV
+                                            Precalculations& pre)      // supplies the grid and the packet intersector
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const float* const grid_y  = grid_x + 1 * dim_offset; // y, z and uv arrays follow x at multiples of dim_offset floats
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+        Vec3<vfloat> v0, v1, v2; Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2);
+        pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Intersect1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID()));
+      };
+
+      template<typename Loader> // Loader provides vfloat, M and a static gather(); callers use GridSOA::Gather2x3 or Gather3x3
+        static __forceinline bool occluded(RayK<K>& ray, size_t k, // k selects the ray of the packet to test
+                                           IntersectContext* context,
+                                           const float* const grid_x, // start of the x coordinates to gather from
+                                           const size_t line_offset,  // forwarded unchanged to Loader::gather and MapUV
+                                           const size_t lines,        // forwarded unchanged to Loader::gather and MapUV
+                                           Precalculations& pre)      // supplies the grid and the packet intersector
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const float* const grid_y  = grid_x + 1 * dim_offset; // y, z and uv arrays follow x at multiples of dim_offset floats
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+        Vec3<vfloat> v0, v1, v2; Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2);
+        return pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Occluded1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID()));
+      }
+
+      /*! Intersect ray k of the packet with the primitive. Uses one Gather3x3 pass when __AVX__ is defined, otherwise one or two Gather2x3 passes. lazy_node is not touched here. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t line_offset   = pre.grid->width;  // grid width doubles as the stride between lines
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(0,prim); // time index 0: this is the non-motion-blur variant
+#if defined(__AVX__)
+        intersect<GridSOA::Gather3x3>( ray, k, context, grid_x, line_offset, lines, pre);
+#else
+        intersect<GridSOA::Gather2x3>(ray, k, context, grid_x            , line_offset, lines, pre);
+        if (likely(lines > 2)) // second pass starts one line further down and only runs for grids with more than 2 lines
+          intersect<GridSOA::Gather2x3>(ray, k, context, grid_x+line_offset, line_offset, lines, pre);
+#endif
+      }
+
+      /*! Test if ray k of the packet is occluded by the primitive. Returns true as soon as one gather pass reports occlusion. lazy_node is not touched here. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t line_offset   = pre.grid->width;  // grid width doubles as the stride between lines
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(0,prim); // time index 0: this is the non-motion-blur variant
+
+#if defined(__AVX__)
+        return occluded<GridSOA::Gather3x3>( ray, k, context, grid_x, line_offset, lines, pre);
+#else
+        if (occluded<GridSOA::Gather2x3>(ray, k, context, grid_x            , line_offset, lines, pre)) return true;
+        if (likely(lines > 2)) // second pass starts one line further down and only runs for grids with more than 2 lines
+          if (occluded<GridSOA::Gather2x3>(ray, k, context, grid_x+line_offset, line_offset, lines, pre)) return true;
+#endif
+        return false; // only reached in the non-AVX build, after every pass reported no occlusion
+      }
+    };
+
+    template<int K>
+    class GridSOAMBIntersectorK
+    {
+    public:
+      typedef void Primitive;
+      typedef typename GridSOAIntersectorK<K>::Precalculations Precalculations;
+
+      /*! Intersect a ray packet with the primitive. Rays are processed in groups that share the same time index. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        vfloat<K> vftime; // filled by getTimeSegment below
+        vint<K> vitime = getTimeSegment(ray.time(), vfloat<K>((float)(pre.grid->time_steps-1)), vftime);
+
+        vbool<K> valid1 = valid_i; // lanes that still have to be processed
+        while (any(valid1)) {
+          const size_t j = bsf(movemask(valid1)); // index of one remaining lane
+          const int itime = vitime[j];
+          const vbool<K> valid2 = valid1 & (itime == vitime); // every remaining lane with the same time index as lane j
+          valid1 = valid1 & !valid2; // remove that group from the work list
+          intersect(valid2,pre,ray,vftime,itime,context,prim,lazy_node);
+        }
+      }
+
+      /*! Intersect the rays in valid_i, which all use time index itime, with the primitive; vertices are blended with the per-lane weight ftime. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, const vfloat<K>& ftime, int itime, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t grid_offset   = pre.grid->gridBytes >> 2; // byte count divided by 4, then used as an index offset on float arrays
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const size_t line_offset   = pre.grid->width;
+        const float* const grid_x  = pre.grid->decodeLeaf(itime,prim);
+        const float* const grid_y  = grid_x + 1 * dim_offset; // y, z and uv arrays follow x at multiples of dim_offset floats
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        const size_t max_x = pre.grid->width  == 2 ? 1 : 2; // one quad column for width 2, otherwise two
+        const size_t max_y = pre.grid->height == 2 ? 1 : 2; // one quad row for height 2, otherwise two
+        for (size_t y=0; y<max_y; y++)        
+        {
+          for (size_t x=0; x<max_x; x++)
+          {
+            size_t ofs00 = (y+0)*line_offset+(x+0); // offsets of the four corners of quad (x,y)
+            size_t ofs01 = (y+0)*line_offset+(x+1);
+            size_t ofs10 = (y+1)*line_offset+(x+0);
+            size_t ofs11 = (y+1)*line_offset+(x+1);
+            const Vec3vf<K> a00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); // a**: corners read at the unshifted offsets
+            const Vec3vf<K> a01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+            const Vec3vf<K> a10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+            const Vec3vf<K> a11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+            ofs00 += grid_offset; // from here on the offsets address the data grid_offset floats further on
+            ofs01 += grid_offset;
+            ofs10 += grid_offset;
+            ofs11 += grid_offset;
+            const Vec3vf<K> b00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); // b**: corners read at the shifted offsets
+            const Vec3vf<K> b01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+            const Vec3vf<K> b10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+            const Vec3vf<K> b11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+            const Vec3vf<K> p00 = lerp(a00,b00,ftime); // blend both corner sets with weight ftime
+            const Vec3vf<K> p01 = lerp(a01,b01,ftime);
+            const Vec3vf<K> p10 = lerp(a10,b10,ftime);
+            const Vec3vf<K> p11 = lerp(a11,b11,ftime);
+
+            pre.intersector.intersectK(valid_i,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); // triangle (00,01,10); the uv offsets already include grid_offset
+            pre.intersector.intersectK(valid_i,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); // triangle (10,01,11); the uv offsets already include grid_offset
+          }
+        }
+      }
+
+      /*! Test if the rays of a packet are occluded by the primitive. Rays are processed in groups that share the same time index; returns the negation of valid_o. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        vfloat<K> vftime; // filled by getTimeSegment below
+        vint<K> vitime = getTimeSegment(ray.time(), vfloat<K>((float)(pre.grid->time_steps-1)), vftime);
+
+        vbool<K> valid_o = valid_i; // accumulates, per group, the mask returned by the per-group overload
+        vbool<K> valid1 = valid_i;  // lanes that still have to be processed
+        while (any(valid1)) {
+          const int j = int(bsf(movemask(valid1))); // index of one remaining lane
+          const int itime = vitime[j];
+          const vbool<K> valid2 = valid1 & (itime == vitime); // every remaining lane with the same time index as lane j
+          valid1 = valid1 & !valid2; // remove that group from the work list
+          valid_o &= !valid2 | occluded(valid2,pre,ray,vftime,itime,context,prim,lazy_node); // lanes outside the group pass through unchanged
+        }
+        return !valid_o;
+      }
+
+      /*! Occlusion test for the rays in valid_i, which all use time index itime. Returns the working mask 'valid' as is (NOT negated, unlike GridSOAIntersectorK::occluded); the packet-level caller combines and negates it. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, const vfloat<K>& ftime, int itime, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t grid_offset   = pre.grid->gridBytes >> 2; // byte count divided by 4, then used as an index offset on float arrays
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const size_t line_offset   = pre.grid->width;
+        const float* const grid_x  = pre.grid->decodeLeaf(itime,prim);
+        const float* const grid_y  = grid_x + 1 * dim_offset; // y, z and uv arrays follow x at multiples of dim_offset floats
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        vbool<K> valid = valid_i; // working copy handed to the epilog; presumably the epilog clears occluded lanes - confirm
+        const size_t max_x = pre.grid->width  == 2 ? 1 : 2; // one quad column for width 2, otherwise two
+        const size_t max_y = pre.grid->height == 2 ? 1 : 2; // one quad row for height 2, otherwise two
+        for (size_t y=0; y<max_y; y++)        
+        {
+          for (size_t x=0; x<max_x; x++)
+          {
+            size_t ofs00 = (y+0)*line_offset+(x+0); // offsets of the four corners of quad (x,y)
+            size_t ofs01 = (y+0)*line_offset+(x+1);
+            size_t ofs10 = (y+1)*line_offset+(x+0);
+            size_t ofs11 = (y+1)*line_offset+(x+1);
+            const Vec3vf<K> a00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); // a**: corners read at the unshifted offsets
+            const Vec3vf<K> a01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+            const Vec3vf<K> a10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+            const Vec3vf<K> a11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+            ofs00 += grid_offset; // from here on the offsets address the data grid_offset floats further on
+            ofs01 += grid_offset;
+            ofs10 += grid_offset;
+            ofs11 += grid_offset;
+            const Vec3vf<K> b00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); // b**: corners read at the shifted offsets
+            const Vec3vf<K> b01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+            const Vec3vf<K> b10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+            const Vec3vf<K> b11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+            const Vec3vf<K> p00 = lerp(a00,b00,ftime); // blend both corner sets with weight ftime
+            const Vec3vf<K> p01 = lerp(a01,b01,ftime);
+            const Vec3vf<K> p10 = lerp(a10,b10,ftime);
+            const Vec3vf<K> p11 = lerp(a11,b11,ftime);
+
+            pre.intersector.intersectK(valid,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); // triangle (00,01,10)
+            if (none(valid)) break; // NOTE(review): leaves only the inner x loop; the y loop still advances
+            pre.intersector.intersectK(valid,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); // triangle (10,01,11)
+            if (none(valid)) break; // NOTE(review): leaves only the inner x loop; the y loop still advances
+          }
+        }
+        return valid; // not negated: the packet-level occluded() ANDs this into valid_o and negates at the end
+      }
+
+      template<typename Loader> // Loader provides vfloat, M and a static gather(); callers use GridSOA::Gather2x3 or Gather3x3
+        static __forceinline void intersect(RayHitK<K>& ray, size_t k, // k selects the ray of the packet to intersect
+                                            const float ftime,         // lerp weight between the two gathered vertex sets
+                                            IntersectContext* context,
+                                            const float* const grid_x, // start of the x coordinates of the first vertex set
+                                            const size_t line_offset,  // forwarded unchanged to Loader::gather and MapUV
+                                            const size_t lines,        // forwarded unchanged to Loader::gather and MapUV
+                                            Precalculations& pre)      // supplies the grid and the packet intersector
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t grid_offset   = pre.grid->gridBytes >> 2; // byte count divided by 4, then used as an offset on float pointers
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        Vec3<vfloat> a0, a1, a2; // vertices gathered at grid_x
+        Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2);
+
+        Vec3<vfloat> b0, b1, b2; // vertices gathered grid_offset floats further on
+        Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2);
+
+        Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime)); // blend the two vertex sets with weight ftime
+        Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime));
+        Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime));
+
+        pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Intersect1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID()));
+      };
+
+      template<typename Loader> // Loader provides vfloat, M and a static gather(); callers use GridSOA::Gather2x3 or Gather3x3
+        static __forceinline bool occluded(RayK<K>& ray, size_t k, // k selects the ray of the packet to test
+                                           const float ftime,         // lerp weight between the two gathered vertex sets
+                                           IntersectContext* context,
+                                           const float* const grid_x, // start of the x coordinates of the first vertex set
+                                           const size_t line_offset,  // forwarded unchanged to Loader::gather and MapUV
+                                           const size_t lines,        // forwarded unchanged to Loader::gather and MapUV
+                                           Precalculations& pre)      // supplies the grid and the packet intersector
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t grid_offset   = pre.grid->gridBytes >> 2; // byte count divided by 4, then used as an offset on float pointers
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        Vec3<vfloat> a0, a1, a2; // vertices gathered at grid_x
+        Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2);
+
+        Vec3<vfloat> b0, b1, b2; // vertices gathered grid_offset floats further on
+        Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2);
+
+        Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime)); // blend the two vertex sets with weight ftime
+        Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime));
+        Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime));
+
+        return pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Occluded1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID()));
+      }
+
+      /*! Intersect ray k of the packet with the primitive. The time index and lerp weight are derived here from ray.time()[k]; lazy_node is not touched. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      { 
+        float ftime; // filled by getTimeSegment below
+        int itime = getTimeSegment(ray.time()[k], float(pre.grid->time_steps-1), ftime);
+
+        const size_t line_offset   = pre.grid->width;  // grid width doubles as the stride between lines
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(itime,prim);
+
+#if defined(__AVX__)
+        intersect<GridSOA::Gather3x3>( ray, k, ftime, context, grid_x, line_offset, lines, pre);
+#else
+        intersect<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x, line_offset, lines, pre);
+        if (likely(lines > 2)) // second pass starts one line further down and only runs for grids with more than 2 lines
+          intersect<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x+line_offset, line_offset, lines, pre);
+#endif
+      }
+
+      /*! Test if ray k of the packet is occluded by the primitive. The time index and lerp weight are derived here from ray.time()[k]; lazy_node is not touched. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        float ftime; // filled by getTimeSegment below
+        int itime = getTimeSegment(ray.time()[k], float(pre.grid->time_steps-1), ftime);
+
+        const size_t line_offset   = pre.grid->width;  // grid width doubles as the stride between lines
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(itime,prim);
+
+#if defined(__AVX__)
+        return occluded<GridSOA::Gather3x3>( ray, k, ftime, context, grid_x, line_offset, lines, pre);
+#else
+        if (occluded<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x, line_offset, lines, pre)) return true;
+        if (likely(lines > 2)) // second pass starts one line further down and only runs for grids with more than 2 lines
+          if (occluded<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x+line_offset, line_offset, lines, pre)) return true;
+#endif
+        return false; // only reached in the non-AVX build, after every pass reported no occlusion
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/instance.h b/thirdparty/embree-aarch64/kernels/geometry/instance.h
new file mode 100644
index 0000000000..66893d581f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/instance.h
@@ -0,0 +1,78 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../common/scene_instance.h"
+
+namespace embree
+{
+  struct InstancePrimitive
+  {
+    /* Type descriptor of this primitive; its member functions are only declared here */
+    struct Type : public PrimitiveType
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* this primitive may span several time segments */
+    static const bool singleTimeSegment = false;
+
+    /* Number of primitives a single block can hold */
+    static __forceinline size_t max_size() { return 1; }
+
+    /* Number of blocks needed for N primitives (one primitive per block) */
+    static __forceinline size_t blocks(size_t N) { return N; }
+
+  public:
+
+    /* Stores the instance pointer and the ID it was built with */
+    InstancePrimitive (const Instance* instance, unsigned int instID)
+      : instance(instance), instID_(instID) {}
+
+    /* Rebuilds this primitive in place from the single reference prims[i]; i is advanced by one */
+    __forceinline void fill(const PrimRef* prims, size_t& i, size_t end, Scene* scene)
+    {
+      assert(end-i == 1);
+      const unsigned int id = prims[i++].geomID();
+      new (this) InstancePrimitive(scene->get<Instance>(id), id);
+    }
+
+    /* Same as fill, and additionally returns the instance's linearBounds(0,itime) */
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& i, size_t end, Scene* scene, size_t itime)
+    {
+      assert(end-i == 1);
+      const unsigned int id = prims[i++].geomID();
+      const Instance* const inst = scene->get<Instance>(id);
+      new (this) InstancePrimitive(inst, id);
+      return inst->linearBounds(0,itime);
+    }
+
+    /* Same as fill, and additionally returns the instance's linearBounds(0,time_range) */
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& i, size_t end, Scene* scene, const BBox1f time_range)
+    {
+      assert(end-i == 1);
+      const unsigned int id = prims[i++].geomID();
+      const Instance* const inst = scene->get<Instance>(id);
+      new (this) InstancePrimitive(inst, id);
+      return inst->linearBounds(0,time_range);
+    }
+
+    /* Returns bounds(0) of the passed instance; the stored members are not touched */
+    __forceinline BBox3fa update(Instance* instance) {
+      return instance->bounds(0);
+    }
+
+  public:
+    /* referenced instance and its ID; instID_ defaults to the largest unsigned value */
+    const Instance* instance;
+    const unsigned int instID_ = std::numeric_limits<unsigned int>::max ();
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/instance_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/instance_intersector.h
new file mode 100644
index 0000000000..91731a39c5
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/instance_intersector.h
@@ -0,0 +1,84 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "instance.h"
+#include "../common/ray.h"
+#include "../common/point_query.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct InstanceIntersector1
+    {
+      using Primitive = InstancePrimitive;
+
+      /* nothing is precomputed: the constructor ignores both arguments */
+      struct Precalculations { __forceinline Precalculations (const Ray& ray, const void *ptr) {} };
+
+      /* single-ray entry points; only declared in this header */
+      static void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim);
+      static bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim);
+      static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim);
+    };
+
+    struct InstanceIntersector1MB
+    {
+      using Primitive = InstancePrimitive;
+
+      /* nothing is precomputed: the constructor ignores both arguments */
+      struct Precalculations { __forceinline Precalculations (const Ray& ray, const void *ptr) {} };
+
+      /* single-ray entry points; only declared in this header */
+      static void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim);
+      static bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim);
+      static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim);
+    };
+
+    template<int K>
+    struct InstanceIntersectorK {
+      using Primitive = InstancePrimitive;
+      struct Precalculations { __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {} };
+
+      static void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim);
+      static vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim);
+
+      /* ray k alone: forwards to the packet overload with the mask built from 1<<k */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+        const vbool<K> lane(1<<int(k));
+        intersect(lane,pre,ray,context,prim);
+      }
+
+      /* ray k alone: the packet result is dropped, the answer is read back as ray.tfar[k] < 0 */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+        const vbool<K> lane(1<<int(k));
+        occluded(lane,pre,ray,context,prim);
+        return ray.tfar[k] < 0.0f;
+      }
+    };
+
+    template<int K>
+    struct InstanceIntersectorKMB {
+      using Primitive = InstancePrimitive;
+      struct Precalculations { __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {} };
+
+      static void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim);
+      static vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim);
+
+      /* ray k alone: forwards to the packet overload with the mask built from 1<<k */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+        const vbool<K> lane(1<<int(k));
+        intersect(lane,pre,ray,context,prim);
+      }
+
+      /* ray k alone: the packet result is dropped, the answer is read back as ray.tfar[k] < 0 */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+        const vbool<K> lane(1<<int(k));
+        occluded(lane,pre,ray,context,prim);
+        return ray.tfar[k] < 0.0f;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/intersector_epilog.h b/thirdparty/embree-aarch64/kernels/geometry/intersector_epilog.h
new file mode 100644
index 0000000000..0df49dd6e9
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/intersector_epilog.h
@@ -0,0 +1,1074 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/context.h"
+#include "filter.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M> struct UVIdentity
+    {
+      __forceinline void operator() (vfloat<M>& u, vfloat<M>& v) const { /* no-op: u and v stay exactly as passed in */ }
+    };
+
+
+    template<bool filter>
+    struct Intersect1Epilog1
+    {
+      /* epilog functor for one RayHit and one candidate hit with fixed geomID/primID */
+      RayHit& ray;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      /* binds the ray, the context and the IDs that are written for a committed hit */
+      __forceinline Intersect1Epilog1(RayHit& ray, IntersectContext* context,
+                                      const unsigned int geomID, const unsigned int primID)
+        : ray(ray), context(context), geomID(geomID), primID(primID) {}
+
+      /* returns true when the hit was written to the ray or accepted by the intersection filter */
+      template<typename Hit>
+      __forceinline bool operator() (Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        /* reject the hit when geometry mask and ray mask share no bit */
+        const bool masked_out = (geometry->mask & ray.mask) == 0;
+        if (masked_out) return false;
+#endif
+        hit.finalize();
+
+#if defined(EMBREE_FILTER_FUNCTION)
+        /* a context or geometry filter is present: tfar is set tentatively and restored on rejection */
+        if (filter && unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+          HitK<1> candidate(context->user,geomID,primID,hit.u,hit.v,hit.Ng);
+          const float saved_tfar = ray.tfar;
+          ray.tfar = hit.t;
+          const bool accepted = runIntersectionFilter1(geometry,ray,context,candidate);
+          if (!accepted) ray.tfar = saved_tfar;
+          return accepted;
+        }
+#endif
+
+        /* no filter call was made: write the hit into the ray */
+        ray.tfar = hit.t;
+        ray.Ng = hit.Ng;
+        ray.u = hit.u;
+        ray.v = hit.v;
+        ray.primID = primID;
+        ray.geomID = geomID;
+        instance_id_stack::copy(context->user->instID, ray.instID);
+        return true;
+      }
+    };
+
+    template<bool filter>
+    struct Occluded1Epilog1
+    {
+      /* occlusion epilog functor for one Ray and one candidate hit with fixed geomID/primID */
+      Ray& ray;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      /* binds the ray, the context and the IDs handed to the occlusion filter */
+      __forceinline Occluded1Epilog1(Ray& ray, IntersectContext* context,
+                                     const unsigned int geomID, const unsigned int primID)
+        : ray(ray), context(context), geomID(geomID), primID(primID) {}
+
+      /* returns true when the candidate passes the mask test and, if one is called, the occlusion filter */
+      template<typename Hit>
+      __forceinline bool operator() (Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        /* reject the hit when geometry mask and ray mask share no bit */
+        const bool masked_out = (geometry->mask & ray.mask) == 0;
+        if (masked_out) return false;
+#endif
+        hit.finalize();
+
+#if defined(EMBREE_FILTER_FUNCTION)
+        /* a context or geometry filter is present: tfar is set tentatively and restored on rejection */
+        if (filter && unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) {
+          HitK<1> candidate(context->user,geomID,primID,hit.u,hit.v,hit.Ng);
+          const float saved_tfar = ray.tfar;
+          ray.tfar = hit.t;
+          const bool accepted = runOcclusionFilter1(geometry,ray,context,candidate);
+          if (!accepted) ray.tfar = saved_tfar;
+          return accepted;
+        }
+#endif
+
+        /* no filter call was made: the candidate counts as accepted, the ray is left unmodified */
+        return true;
+      }
+    };
+
+    template<int K, bool filter>
+    struct Intersect1KEpilog1
+    {
+      /* epilog functor for ray k of a RayHitK packet and one candidate hit */
+      RayHitK<K>& ray;
+      size_t k;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      /* binds the packet, the ray index, the context and the IDs written for a committed hit */
+      __forceinline Intersect1KEpilog1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                       const unsigned int geomID, const unsigned int primID)
+        : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
+
+      /* returns true when the hit was written to ray k or accepted by the intersection filter */
+      template<typename Hit>
+      __forceinline bool operator() (Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        /* reject the hit when geometry mask and the mask of ray k share no bit */
+        const bool masked_out = (geometry->mask & ray.mask[k]) == 0;
+        if (masked_out) return false;
+#endif
+        hit.finalize();
+
+#if defined(EMBREE_FILTER_FUNCTION)
+        /* a context or geometry filter is present: tfar[k] is set tentatively and restored on rejection */
+        if (filter && unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+          HitK<K> candidate(context->user,geomID,primID,hit.u,hit.v,hit.Ng);
+          const float saved_tfar = ray.tfar[k];
+          ray.tfar[k] = hit.t;
+          const vbool<K> lane(1<<k);
+          const bool accepted = any(runIntersectionFilter(lane,geometry,ray,context,candidate));
+          if (!accepted) ray.tfar[k] = saved_tfar;
+          return accepted;
+        }
+#endif
+
+        /* no filter call was made: write the hit into component k of the packet */
+        ray.tfar[k] = hit.t;
+        ray.Ng.x[k] = hit.Ng.x;
+        ray.Ng.y[k] = hit.Ng.y;
+        ray.Ng.z[k] = hit.Ng.z;
+        ray.u[k] = hit.u;
+        ray.v[k] = hit.v;
+        ray.primID[k] = primID;
+        ray.geomID[k] = geomID;
+        instance_id_stack::copy<const unsigned*, vuint<K>*, const size_t&>(context->user->instID, ray.instID, k);
+        return true;
+      }
+    };
+    
+    template<int K, bool filter>
+    struct Occluded1KEpilog1
+    {
+      /* occlusion epilog functor for ray k of a RayK packet and one candidate hit */
+      RayK<K>& ray;
+      size_t k;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      /* binds the packet, the ray index, the context and the IDs handed to the occlusion filter */
+      __forceinline Occluded1KEpilog1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                      const unsigned int geomID, const unsigned int primID)
+        : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
+
+      /* returns true when the candidate passes the mask test and, if one is called, the occlusion filter */
+      template<typename Hit>
+      __forceinline bool operator() (Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        /* reject the hit when geometry mask and the mask of ray k share no bit */
+        const bool masked_out = (geometry->mask & ray.mask[k]) == 0;
+        if (masked_out) return false;
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION)
+        /* the hit is finalized only on this path; tfar[k] is set tentatively and restored on rejection */
+        if (filter && unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) {
+          hit.finalize();
+          HitK<K> candidate(context->user,geomID,primID,hit.u,hit.v,hit.Ng);
+          const float saved_tfar = ray.tfar[k];
+          ray.tfar[k] = hit.t;
+          const vbool<K> lane(1<<k);
+          const bool accepted = any(runOcclusionFilter(lane,geometry,ray,context,candidate));
+          if (!accepted) ray.tfar[k] = saved_tfar;
+          return accepted;
+        }
+#endif
+        return true;
+      }
+    };
+    
+    template<int M, int Mx, bool filter>
+    struct Intersect1EpilogM // epilog functor for one RayHit tested against a vector of candidate hits
+    {
+      RayHit& ray;
+      IntersectContext* context;
+      const vuint<M>& geomIDs;
+      const vuint<M>& primIDs;
+      /* the ray and both ID vectors are held by reference, the context by pointer */
+      __forceinline Intersect1EpilogM(RayHit& ray,
+                                      IntersectContext* context,
+                                      const vuint<M>& geomIDs,
+                                      const vuint<M>& primIDs)
+        : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
+      /* returns true if a candidate was written to the ray or accepted by an intersection filter */
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<Mx>& valid_i, Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+        vbool<Mx> valid = valid_i;
+        if (Mx > M) valid &= (1<<M)-1; // mask is wider than M: AND it with the low-M-bits pattern
+        hit.finalize();
+        size_t i = select_min(valid,hit.vt); // first candidate, chosen by select_min over hit.vt
+        unsigned int geomID = geomIDs[i];
+
+        /* ray mask and intersection filter tests; a rejected candidate is cleared from valid and the next one is selected */
+#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK)
+        bool foundhit = false;
+        goto entry; // candidate i was selected above, so the re-selection at the loop head is skipped
+        while (true)
+        {
+          if (unlikely(none(valid))) return foundhit; // no candidate left: report whether a filter accepted one
+          i = select_min(valid,hit.vt);
+
+          geomID = geomIDs[i];
+        entry:
+          Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+          /* goto next hit if mask test fails */
+          if ((geometry->mask & ray.mask) == 0) {
+            clear(valid,i);
+            continue;
+          }
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION) 
+          /* call intersection filter function */
+          if (filter) {
+            if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+              const Vec2f uv = hit.uv(i);
+              HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i));
+              const float old_t = ray.tfar; // kept so tfar can be restored if the filter rejects
+              ray.tfar = hit.t(i);
+              const bool found = runIntersectionFilter1(geometry,ray,context,h);
+              if (!found) ray.tfar = old_t;
+              foundhit |= found;
+              clear(valid,i);
+              valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value
+              continue;
+            }
+          }
+#endif
+          break; // candidate i did not go through the filter branch: commit it below
+        }
+#endif
+
+        /* update hit information */
+        const Vec2f uv = hit.uv(i);
+        ray.tfar = hit.vt[i];
+        ray.Ng.x = hit.vNg.x[i];
+        ray.Ng.y = hit.vNg.y[i];
+        ray.Ng.z = hit.vNg.z[i];
+        ray.u = uv.x;
+        ray.v = uv.y;
+        ray.primID = primIDs[i];
+        ray.geomID = geomID;
+        instance_id_stack::copy(context->user->instID, ray.instID);
+        return true;
+
+      }
+    };
+
+#if 0 && defined(__AVX512F__) // do not enable, this reduced frequency for BVH4 (block is compiled out by the leading 0)
+    template<int M, bool filter>
+    struct Intersect1EpilogM<M,16,filter> // partial specialization for a mask width of 16
+    {
+      static const size_t Mx = 16;
+      RayHit& ray;
+      IntersectContext* context;
+      const vuint<M>& geomIDs;
+      const vuint<M>& primIDs;
+
+      __forceinline Intersect1EpilogM(RayHit& ray,
+                                      IntersectContext* context,
+                                      const vuint<M>& geomIDs,
+                                      const vuint<M>& primIDs)
+        : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
+
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<Mx>& valid_i, Hit& hit) const
+      {
+        Scene* MAYBE_UNUSED scene = context->scene;
+        vbool<Mx> valid = valid_i;
+        if (Mx > M) valid &= (1<<M)-1;
+        hit.finalize();
+        size_t i = select_min(valid,hit.vt);
+        unsigned int geomID = geomIDs[i];
+
+        /* ray mask and intersection filter tests; a rejected candidate is cleared from valid and the next one is selected */
+#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK)
+        bool foundhit = false;
+        goto entry; // candidate i was selected above, so the re-selection at the loop head is skipped
+        while (true)
+        {
+          if (unlikely(none(valid))) return foundhit;
+          i = select_min(valid,hit.vt);
+
+          geomID = geomIDs[i];
+        entry:
+          Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+          /* goto next hit if mask test fails */
+          if ((geometry->mask & ray.mask) == 0) {
+            clear(valid,i);
+            continue;
+          }
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION) 
+          /* call intersection filter function */
+          if (filter) {
+            if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+              const Vec2f uv = hit.uv(i);
+              HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i));
+              const float old_t = ray.tfar;
+              ray.tfar = hit.t(i);
+              const bool found = runIntersectionFilter1(geometry,ray,context,h);
+              if (!found) ray.tfar = old_t;
+              foundhit |= found;
+              clear(valid,i);
+              valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value
+              continue;
+            }
+          }
+#endif
+          break;
+        }
+#endif
+        /* commit candidate i through ray.update with a one-bit mask, then copy the instance IDs level by level */
+        vbool<Mx> finalMask(((unsigned int)1 << i));
+        ray.update(finalMask,hit.vt,hit.vu,hit.vv,hit.vNg.x,hit.vNg.y,hit.vNg.z,geomID,primIDs);
+        instance_id_stack::foreach([&](unsigned level) // the lambda returns false once an invalid ID was copied
+        {
+          ray.instID[level] = context->user->instID[level];
+          return (context->user->instID[level] != RTC_INVALID_GEOMETRY_ID);
+        });
+        return true;
+
+      }
+    };
+#endif    
+    
+    template<int M, int Mx, bool filter>
+    struct Occluded1EpilogM // occlusion epilog functor for one Ray tested against a vector of candidate hits
+    {
+      Ray& ray;
+      IntersectContext* context;
+      const vuint<M>& geomIDs;
+      const vuint<M>& primIDs;
+      /* the ray and both ID vectors are held by reference, the context by pointer */
+      __forceinline Occluded1EpilogM(Ray& ray,
+                                     IntersectContext* context,
+                                     const vuint<M>& geomIDs,
+                                     const vuint<M>& primIDs)
+        : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
+      /* returns true as soon as one candidate passes the enabled tests, false when all were rejected */
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<Mx>& valid_i, Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+        /* ray mask and occlusion filter tests */
+#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK)
+        if (unlikely(filter))
+          hit.finalize(); /* called only once */
+
+        vbool<Mx> valid = valid_i;
+        if (Mx > M) valid &= (1<<M)-1; // mask is wider than M: AND it with the low-M-bits pattern
+        size_t m=movemask(valid); // m: bit set of the candidates that still have to be tested
+        goto entry; // the first pass uses m as-is and skips the emptiness check at the loop head
+        while (true)
+        {
+          if (unlikely(m == 0)) return false; // every candidate was rejected
+        entry:
+          size_t i=bsf(m);
+
+          const unsigned int geomID = geomIDs[i];
+          Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+          /* goto next hit if mask test fails */
+          if ((geometry->mask & ray.mask) == 0) {
+            m=btc(m,i);
+            continue;
+          }
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION)
+          /* if we have no filter then the test passed */
+          if (filter) {
+            if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+            {
+              const Vec2f uv = hit.uv(i);
+              HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i));
+              const float old_t = ray.tfar; // kept so tfar can be restored if the filter rejects
+              ray.tfar = hit.t(i);
+              if (runOcclusionFilter1(geometry,ray,context,h)) return true;
+              ray.tfar = old_t;
+              m=btc(m,i);
+              continue;
+            }
+          }
+#endif
+          break; // candidate i did not go through the filter branch: it counts as accepted
+        }
+#endif
+
+        return true;
+      }
+    };
+
+    template<int M, bool filter>
+    struct Intersect1EpilogMU // epilog functor for one RayHit and several candidate hits sharing one geomID/primID
+    {
+      RayHit& ray;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+      /* NOTE: the filter template parameter is not consulted anywhere in this struct */
+      __forceinline Intersect1EpilogMU(RayHit& ray,
+                                       IntersectContext* context,
+                                       const unsigned int geomID,
+                                       const unsigned int primID)
+        : ray(ray), context(context), geomID(geomID), primID(primID) {}
+      /* returns true if a candidate was written to the ray or accepted by an intersection filter */
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const
+      {
+        /* ray mask test */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        if ((geometry->mask & ray.mask) == 0) return false;
+#endif
+
+        vbool<M> valid = valid_i;
+        hit.finalize();
+
+        size_t i = select_min(valid,hit.vt); // first candidate, chosen by select_min over hit.vt
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter()))
+        {
+          bool foundhit = false;
+          while (true)
+          {
+            /* call intersection filter function */
+            Vec2f uv = hit.uv(i);
+            const float old_t = ray.tfar; // kept so tfar can be restored if the filter rejects
+            ray.tfar = hit.t(i);
+            HitK<1> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i));
+            const bool found = runIntersectionFilter1(geometry,ray,context,h);
+            if (!found) ray.tfar = old_t;
+            foundhit |= found;
+            clear(valid,i);
+            valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value
+            if (unlikely(none(valid))) break;
+            i = select_min(valid,hit.vt);
+          }
+          return foundhit; // true if the filter accepted at least one candidate
+        }
+#endif
+
+        /* update hit information */
+        const Vec2f uv = hit.uv(i);
+        const Vec3fa Ng = hit.Ng(i);
+        ray.tfar = hit.t(i);
+        ray.Ng.x = Ng.x;
+        ray.Ng.y = Ng.y;
+        ray.Ng.z = Ng.z;
+        ray.u = uv.x;
+        ray.v = uv.y;
+        ray.primID = primID;
+        ray.geomID = geomID;
+        instance_id_stack::copy(context->user->instID, ray.instID);
+        return true;
+      }
+    };
+    
+    template<int M, bool filter>
+    struct Occluded1EpilogMU // occlusion epilog functor for one Ray and several candidate hits sharing one geomID/primID
+    {
+      Ray& ray;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+      /* NOTE: the filter template parameter is not consulted anywhere in this struct */
+      __forceinline Occluded1EpilogMU(Ray& ray,
+                                      IntersectContext* context,
+                                      const unsigned int geomID,
+                                      const unsigned int primID)
+        : ray(ray), context(context), geomID(geomID), primID(primID) {}
+      /* returns true if the mask test passes and, when a filter is present, it accepts one candidate */
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<M>& valid, Hit& hit) const
+      {
+        /* ray mask test */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        if ((geometry->mask & ray.mask) == 0) return false;
+#endif
+
+        /* occlusion filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+        {
+          hit.finalize();
+          for (size_t m=movemask(valid), i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) // one pass per bit of the mask, in bsf order
+          {
+            const Vec2f uv = hit.uv(i);
+            const float old_t = ray.tfar; // kept so tfar can be restored if the filter rejects
+            ray.tfar = hit.t(i);
+            HitK<1> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i));
+            if (runOcclusionFilter1(geometry,ray,context,h)) return true;
+            ray.tfar = old_t;
+          }
+          return false; // the filter rejected every candidate
+        }
+#endif
+        return true;
+      }
+    };
+        
+    template<int M, int K, bool filter>
+    struct IntersectKEpilogM // epilog functor for a RayHitK packet against entry i of the ID vectors
+    {
+      RayHitK<K>& ray;
+      IntersectContext* context;
+      const vuint<M>& geomIDs;
+      const vuint<M>& primIDs;
+      const size_t i; // index into geomIDs/primIDs used by operator()
+
+      __forceinline IntersectKEpilogM(RayHitK<K>& ray,
+                                      IntersectContext* context,
+                                     const vuint<M>& geomIDs,
+                                     const vuint<M>& primIDs,
+                                     size_t i)
+        : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs), i(i) {}
+      /* returns the mask of rays whose hit was written, or the mask returned by the intersection filter */
+      template<typename Hit>
+      __forceinline vbool<K> operator() (const vbool<K>& valid_i, const Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+
+        vfloat<K> u, v, t;
+        Vec3vf<K> Ng;
+        vbool<K> valid = valid_i;
+
+        std::tie(u,v,t,Ng) = hit(); // hit() yields the tuple (u,v,t,Ng)
+
+        const unsigned int geomID = geomIDs[i];
+        const unsigned int primID = primIDs[i];
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+        /* ray masking test */
+#if defined(EMBREE_RAY_MASK)
+        valid &= (geometry->mask & ray.mask) != 0;
+        if (unlikely(none(valid))) return false;
+#endif
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+            HitK<K> h(context->user,geomID,primID,u,v,Ng);
+            const vfloat<K> old_t = ray.tfar; // kept so rejected lanes get their previous tfar back
+            ray.tfar = select(valid,t,ray.tfar);
+            const vbool<K> m_accept = runIntersectionFilter(valid,geometry,ray,context,h);
+            ray.tfar = select(m_accept,ray.tfar,old_t);
+            return m_accept;
+          }
+        }
+#endif
+
+        /* update hit information; every store is passed the valid mask */
+        vfloat<K>::store(valid,&ray.tfar,t);
+        vfloat<K>::store(valid,&ray.Ng.x,Ng.x);
+        vfloat<K>::store(valid,&ray.Ng.y,Ng.y);
+        vfloat<K>::store(valid,&ray.Ng.z,Ng.z);
+        vfloat<K>::store(valid,&ray.u,u);
+        vfloat<K>::store(valid,&ray.v,v);
+        vuint<K>::store(valid,&ray.primID,primID);
+        vuint<K>::store(valid,&ray.geomID,geomID);
+        instance_id_stack::copy<const unsigned*, vuint<K>*, const vbool<K>&>(context->user->instID, ray.instID, valid);
+        return valid;
+      }
+    };
+    
+    template<int M, int K, bool filter>
+    struct OccludedKEpilogM // occlusion epilog functor for a RayK packet against entry i of the ID vectors
+    {
+      vbool<K>& valid0; // caller-owned mask that operator() updates
+      RayK<K>& ray;
+      IntersectContext* context;
+      const vuint<M>& geomIDs;
+      const vuint<M>& primIDs;
+      const size_t i; // index into geomIDs/primIDs used by operator()
+
+      __forceinline OccludedKEpilogM(vbool<K>& valid0,
+                                     RayK<K>& ray,
+                                     IntersectContext* context,
+                                     const vuint<M>& geomIDs,
+                                     const vuint<M>& primIDs,
+                                     size_t i)
+        : valid0(valid0), ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs), i(i) {}
+      /* returns the mask of rays that passed the enabled tests; the same mask is applied to valid0 */
+      template<typename Hit>
+      __forceinline vbool<K> operator() (const vbool<K>& valid_i, const Hit& hit) const
+      {
+        vbool<K> valid = valid_i;
+
+        /* ray masking test */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        const unsigned int geomID = geomIDs[i];
+        const unsigned int primID = primIDs[i];
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        valid &= (geometry->mask & ray.mask) != 0;
+        if (unlikely(none(valid))) return valid;
+#endif
+
+        /* occlusion filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+          {
+            vfloat<K> u, v, t;
+            Vec3vf<K> Ng;
+            std::tie(u,v,t,Ng) = hit(); // hit() is only evaluated on this filter path
+            HitK<K> h(context->user,geomID,primID,u,v,Ng);
+            const vfloat<K> old_t = ray.tfar; // kept so rejected lanes get their previous tfar back
+            ray.tfar = select(valid,t,ray.tfar);
+            valid = runOcclusionFilter(valid,geometry,ray,context,h);
+            ray.tfar = select(valid,ray.tfar,old_t);
+          }
+        }
+#endif
+
+        /* update occlusion: valid0 is ANDed with the negation of valid */
+        valid0 = valid0 & !valid;
+        return valid;
+      }
+    };
+    
+    template<int M, int K, bool filter>
+    struct IntersectKEpilogMU // epilog functor for a RayHitK packet and a hit with one fixed geomID/primID
+    {
+      RayHitK<K>& ray;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+      /* NOTE: the template parameter M is not referenced in this struct */
+      __forceinline IntersectKEpilogMU(RayHitK<K>& ray,
+                                       IntersectContext* context,
+                                       const unsigned int geomID,
+                                       const unsigned int primID)
+        : ray(ray), context(context), geomID(geomID), primID(primID) {}
+      /* returns the mask of rays whose hit was written, or the mask returned by the intersection filter */
+      template<typename Hit>
+      __forceinline vbool<K> operator() (const vbool<K>& valid_org, const Hit& hit) const
+      {
+        vbool<K> valid = valid_org;
+        vfloat<K> u, v, t;
+        Vec3vf<K> Ng;
+        std::tie(u,v,t,Ng) = hit(); // hit() yields the tuple (u,v,t,Ng)
+
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+        /* ray masking test */
+#if defined(EMBREE_RAY_MASK)
+        valid &= (geometry->mask & ray.mask) != 0;
+        if (unlikely(none(valid))) return false;
+#endif
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+            HitK<K> h(context->user,geomID,primID,u,v,Ng);
+            const vfloat<K> old_t = ray.tfar; // kept so rejected lanes get their previous tfar back
+            ray.tfar = select(valid,t,ray.tfar);
+            const vbool<K> m_accept = runIntersectionFilter(valid,geometry,ray,context,h);
+            ray.tfar = select(m_accept,ray.tfar,old_t);
+            return m_accept;
+          }
+        }
+#endif
+
+        /* update hit information; every store is passed the valid mask */
+        vfloat<K>::store(valid,&ray.tfar,t);
+        vfloat<K>::store(valid,&ray.Ng.x,Ng.x);
+        vfloat<K>::store(valid,&ray.Ng.y,Ng.y);
+        vfloat<K>::store(valid,&ray.Ng.z,Ng.z);
+        vfloat<K>::store(valid,&ray.u,u);
+        vfloat<K>::store(valid,&ray.v,v);
+        vuint<K>::store(valid,&ray.primID,primID);
+        vuint<K>::store(valid,&ray.geomID,geomID);
+        instance_id_stack::copy<const unsigned*, vuint<K>*, const vbool<K>&>(context->user->instID, ray.instID, valid);
+
+        return valid;
+      }
+    };
+    
+    template<int M, int K, bool filter>
+    struct OccludedKEpilogMU // occlusion epilog functor for a RayK packet and a hit with one fixed geomID/primID
+    {
+      vbool<K>& valid0; // caller-owned mask that operator() updates
+      RayK<K>& ray;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+      /* NOTE: the template parameter M is not referenced in this struct */
+      __forceinline OccludedKEpilogMU(vbool<K>& valid0,
+                                      RayK<K>& ray,
+                                      IntersectContext* context,
+                                      const unsigned int geomID,
+                                      const unsigned int primID)
+        : valid0(valid0), ray(ray), context(context), geomID(geomID), primID(primID) {}
+      /* returns the mask of rays that passed the enabled tests; the same mask is applied to valid0 */
+      template<typename Hit>
+      __forceinline vbool<K> operator() (const vbool<K>& valid_i, const Hit& hit) const
+      {
+        vbool<K> valid = valid_i;
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+        /* ray masking test */
+#if defined(EMBREE_RAY_MASK)
+        valid &= (geometry->mask & ray.mask) != 0;
+        if (unlikely(none(valid))) return false;
+#endif
+
+        /* occlusion filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+          {
+            vfloat<K> u, v, t;
+            Vec3vf<K> Ng;
+            std::tie(u,v,t,Ng) = hit(); // hit() is only evaluated on this filter path
+            HitK<K> h(context->user,geomID,primID,u,v,Ng);
+            const vfloat<K> old_t = ray.tfar; // kept so rejected lanes get their previous tfar back
+            ray.tfar = select(valid,t,ray.tfar);
+            valid = runOcclusionFilter(valid,geometry,ray,context,h);
+            ray.tfar = select(valid,ray.tfar,old_t);
+          }
+        }
+#endif
+
+        /* update occlusion: valid0 is ANDed with the negation of valid */
+        valid0 = valid0 & !valid;
+        return valid;
+      }
+    };
+    
+    template<int M, int Mx, int K, bool filter>
+    struct Intersect1KEpilogM // intersection epilog for lane k of a K-wide packet against M candidates with per-candidate IDs
+    {
+      RayHitK<K>& ray;
+      size_t k;                  // lane of ray that is read and written
+      IntersectContext* context;
+      const vuint<M>& geomIDs;   // geometry ID per candidate
+      const vuint<M>& primIDs;   // primitive ID per candidate
+
+      __forceinline Intersect1KEpilogM(RayHitK<K>& ray, size_t k,
+                                       IntersectContext* context,
+                                       const vuint<M>& geomIDs,
+                                       const vuint<M>& primIDs)
+        : ray(ray), k(k), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
+
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<Mx>& valid_i, Hit& hit) const
+      { /* Returns true when a hit was committed to lane k (directly below, or through an accepting filter call). */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        vbool<Mx> valid = valid_i;
+        hit.finalize();
+        if (Mx > M) valid &= (1<<M)-1; // keep only the low M lanes when the vector is wider than M
+        size_t i = select_min(valid,hit.vt); // first candidate; presumably the valid lane with the smallest hit.vt
+        assert(i<M);
+        unsigned int geomID = geomIDs[i];
+
+        /* ray mask and intersection filter tests; rejected candidates are removed and the next one is tried */
+#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK)
+        bool foundhit = false;
+        goto entry; // first pass: i and geomID are already set, so skip the re-selection at the loop top
+        while (true)
+        {
+          if (unlikely(none(valid))) return foundhit; // no candidate left
+          i = select_min(valid,hit.vt);
+          assert(i<M);
+          geomID = geomIDs[i];
+        entry:
+          Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+          /* goto next hit if mask test fails */
+          if ((geometry->mask & ray.mask[k]) == 0) {
+            clear(valid,i);
+            continue;
+          }
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION) 
+          /* call intersection filter function */
+          if (filter) {
+            if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+              assert(i<M);
+              const Vec2f uv = hit.uv(i);
+              HitK<K> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i));
+              const float old_t = ray.tfar[k];
+              ray.tfar[k] = hit.t(i); // candidate t is visible to the filter through the ray
+              const bool found = any(runIntersectionFilter(vbool<K>(1<<k),geometry,ray,context,h)); // only lane k is active
+              if (!found) ray.tfar[k] = old_t; // rejected: restore the previous tfar
+              foundhit = foundhit | found;
+              clear(valid,i);
+              valid &= hit.vt <= ray.tfar[k]; // intersection filters may modify tfar value
+              continue; // the filter path never falls through to the field update below
+            }
+          }
+#endif
+          break; // candidate i needs no filter call: commit it below
+        }
+#endif
+        assert(i<M);
+        /* update hit information */
+#if 0 && defined(__AVX512F__) // do not enable, this reduced frequency for BVH4
+        ray.updateK(i,k,hit.vt,hit.vu,hit.vv,vfloat<Mx>(hit.vNg.x),vfloat<Mx>(hit.vNg.y),vfloat<Mx>(hit.vNg.z),geomID,vuint<Mx>(primIDs));
+#else
+        const Vec2f uv = hit.uv(i);
+        ray.tfar[k] = hit.t(i);
+        ray.Ng.x[k] = hit.vNg.x[i];
+        ray.Ng.y[k] = hit.vNg.y[i];
+        ray.Ng.z[k] = hit.vNg.z[i];
+        ray.u[k] = uv.x;
+        ray.v[k] = uv.y;
+        ray.primID[k] = primIDs[i];
+        ray.geomID[k] = geomID;
+        instance_id_stack::copy<const unsigned*, vuint<K>*, const size_t&>(context->user->instID, ray.instID, k); // instance IDs from the context go to lane k
+#endif
+        return true;
+      }
+    };
+    
+    template<int M, int Mx, int K, bool filter>
+    struct Occluded1KEpilogM // occlusion epilog for lane k of a K-wide packet against M candidates with per-candidate IDs
+    {
+      RayK<K>& ray;
+      size_t k;                  // lane of ray that is tested
+      IntersectContext* context;
+      const vuint<M>& geomIDs;   // geometry ID per candidate
+      const vuint<M>& primIDs;   // primitive ID per candidate
+
+      __forceinline Occluded1KEpilogM(RayK<K>& ray, size_t k,
+                                      IntersectContext* context,
+                                      const vuint<M>& geomIDs,
+                                      const vuint<M>& primIDs)
+        : ray(ray), k(k), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
+
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<Mx>& valid_i, Hit& hit) const
+      { /* Returns true as soon as one candidate passes the mask test and, where one applies, the occlusion filter. */
+        Scene* scene MAYBE_UNUSED = context->scene;
+
+        /* ray mask and occlusion filter tests; with neither feature compiled in the function just returns true */
+#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK)
+        if (unlikely(filter))
+          hit.finalize(); /* called only once */
+
+        vbool<Mx> valid = valid_i;
+        if (Mx > M) valid &= (1<<M)-1; // keep only the low M lanes when the vector is wider than M
+        size_t m=movemask(valid); // remaining candidates as a bit mask
+        goto entry; // first pass: skip the emptiness check at the loop top
+        while (true)
+        {
+          if (unlikely(m == 0)) return false; // every candidate was rejected
+        entry:
+          size_t i=bsf(m); // next candidate; presumably the index of the lowest set bit
+
+          const unsigned int geomID = geomIDs[i];
+          Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+          /* goto next hit if mask test fails */
+          if ((geometry->mask & ray.mask[k]) == 0) {
+            m=btc(m,i); // remove candidate i (btc presumably toggles bit i)
+            continue;
+          }
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION)
+          /* execute occlusion filter */
+          if (filter) {
+            if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+            {
+              const Vec2f uv = hit.uv(i);
+              const float old_t = ray.tfar[k];
+              ray.tfar[k] = hit.t(i); // candidate t is visible to the filter through the ray
+              HitK<K> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i));
+              if (any(runOcclusionFilter(vbool<K>(1<<k),geometry,ray,context,h))) return true; // tfar[k] is not restored on acceptance
+              ray.tfar[k] = old_t; // rejected: restore the previous tfar
+              m=btc(m,i);
+              continue;
+            }
+          }
+#endif
+          break; // candidate i needs no filter call and counts as an occluder
+        }
+#endif
+        return true;
+      }
+    };
+    
+    template<int M, int K, bool filter>
+    struct Intersect1KEpilogMU // intersection epilog for lane k of a K-wide packet against M candidates sharing one (geomID, primID)
+    {
+      RayHitK<K>& ray;
+      size_t k;                  // lane of ray that is read and written
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline Intersect1KEpilogMU(RayHitK<K>& ray, size_t k,
+                                        IntersectContext* context,
+                                        const unsigned int geomID,
+                                        const unsigned int primID)
+        : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
+
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const
+      { /* Returns true when a hit was committed to lane k; false when the mask test fails or the filter accepts no candidate. */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        /* ray mask test (one geometry for all candidates, so a single check before any hit work) */
+        if ((geometry->mask & ray.mask[k]) == 0)
+          return false;
+#endif
+
+        /* finalize hit calculation */
+        vbool<M> valid = valid_i;
+        hit.finalize();
+        size_t i = select_min(valid,hit.vt); // first candidate; presumably the valid lane with the smallest hit.vt
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter()))
+          {
+            bool foundhit = false;
+            while (true) // offer candidates to the filter one at a time until none remains
+            {
+              const Vec2f uv = hit.uv(i);
+              const float old_t = ray.tfar[k];
+              ray.tfar[k] = hit.t(i); // candidate t is visible to the filter through the ray
+              HitK<K> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i));
+              const bool found = any(runIntersectionFilter(vbool<K>(1<<k),geometry,ray,context,h)); // only lane k is active
+              if (!found) ray.tfar[k] = old_t; // rejected: restore the previous tfar
+              foundhit = foundhit | found;
+              clear(valid,i);
+              valid &= hit.vt <= ray.tfar[k]; // intersection filters may modify tfar value
+              if (unlikely(none(valid))) break;
+              i = select_min(valid,hit.vt);
+            }
+            return foundhit; // the field update below is skipped on the filter path
+          }
+        }
+#endif
+
+        /* update hit information */
+#if 0 && defined(__AVX512F__) // do not enable, this reduced frequency for BVH4
+        const Vec3fa Ng = hit.Ng(i);
+        ray.updateK(i,k,hit.vt,hit.vu,hit.vv,vfloat<M>(Ng.x),vfloat<M>(Ng.y),vfloat<M>(Ng.z),geomID,vuint<M>(primID));
+#else
+        const Vec2f uv = hit.uv(i);
+        const Vec3fa Ng = hit.Ng(i);
+        ray.tfar[k] = hit.t(i);
+        ray.Ng.x[k] = Ng.x;
+        ray.Ng.y[k] = Ng.y;
+        ray.Ng.z[k] = Ng.z;
+        ray.u[k] = uv.x;
+        ray.v[k] = uv.y;
+        ray.primID[k] = primID;
+        ray.geomID[k] = geomID;
+        instance_id_stack::copy<const unsigned*, vuint<K>*, const size_t&>(context->user->instID, ray.instID, k); // instance IDs from the context go to lane k
+#endif
+        return true;
+      }
+    };
+    
+    template<int M, int K, bool filter>
+    struct Occluded1KEpilogMU // occlusion epilog for lane k of a K-wide packet against M candidates sharing one (geomID, primID)
+    {
+      RayK<K>& ray;
+      size_t k;                  // lane of ray that is tested
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline Occluded1KEpilogMU(RayK<K>& ray, size_t k,
+                                       IntersectContext* context,
+                                       const unsigned int geomID,
+                                       const unsigned int primID)
+        : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
+
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const
+      { /* Returns true when lane k counts as occluded; if no filter call is needed, passing the mask test is enough. */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        /* ray mask test */
+        if ((geometry->mask & ray.mask[k]) == 0)
+          return false;
+#endif
+
+        /* occlusion filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+          {
+            hit.finalize(); // only needed here: the unfiltered path never reads the hit
+            for (size_t m=movemask(valid_i), i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) // NOTE(review): i=bsf(m) is evaluated before the m!=0 check, so bsf can see m==0 - confirm that is benign
+            {
+              const Vec2f uv = hit.uv(i);
+              const float old_t = ray.tfar[k];
+              ray.tfar[k] = hit.t(i); // candidate t is visible to the filter through the ray
+              HitK<K> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i));
+              if (any(runOcclusionFilter(vbool<K>(1<<k),geometry,ray,context,h))) return true; // tfar[k] is not restored on acceptance
+              ray.tfar[k] = old_t; // rejected: restore the previous tfar
+            }
+            return false; // the filter rejected every candidate
+          }
+        }
+#endif 
+        return true;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/intersector_iterators.h b/thirdparty/embree-aarch64/kernels/geometry/intersector_iterators.h
new file mode 100644
index 0000000000..5c1ba5cb61
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/intersector_iterators.h
@@ -0,0 +1,172 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/scene.h"
+#include "../common/ray.h"
+#include "../common/point_query.h"
+#include "../bvh/node_intersector1.h"
+#include "../bvh/node_intersector_packet.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename Intersector>
+    struct ArrayIntersector1 // drives a single-ray Intersector over an array of num primitives
+    {
+      using Primitive = typename Intersector::Primitive;
+      using Precalculations = typename Intersector::Precalculations;
+
+      template<int N, int Nx, bool robust>
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        for (const Primitive* p = prim; p != prim+num; ++p)
+          Intersector::intersect(pre,ray,context,*p);
+      }
+      /* stops at the first primitive that reports occlusion */
+      template<int N, int Nx, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        for (const Primitive* p = prim; p != prim+num; ++p) {
+          if (Intersector::occluded(pre,ray,context,*p))
+            return true;
+        }
+        return false;
+      }
+      /* visits every primitive and reports whether any pointQuery call returned true */
+      template<int N>
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node)
+      {
+        bool anyChanged = false;
+        for (const Primitive* p = prim; p != prim+num; ++p)
+          anyChanged |= Intersector::pointQuery(query, context, *p);
+        return anyChanged;
+      }
+      /* packet entry point: intentionally empty for the single-ray adapter */
+      template<int K>
+      static __forceinline void intersectK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+      {
+      }
+      /* packet entry point: performs no test and hands the input mask back */
+      template<int K>
+      static __forceinline vbool<K> occludedK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+      {
+        return valid;
+      }
+    };
+
+    template<int K, typename Intersector>
+    struct ArrayIntersectorK_1 // drives a packet-capable Intersector over an array of num primitives
+    {
+      using Primitive = typename Intersector::Primitive;
+      using Precalculations = typename Intersector::Precalculations;
+
+      template<bool robust>
+      static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      {
+        for (const Primitive* p = prim; p != prim+num; ++p) {
+          Intersector::intersect(valid,pre,ray,context,*p);
+        }
+      }
+      /* packet occlusion: returns the complement of the lanes still unoccluded, so lanes inactive in valid are set as well */
+      template<bool robust>
+      static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      {
+        vbool<K> pending = valid;
+        for (const Primitive* p = prim; p != prim+num; ++p) {
+          pending &= !Intersector::occluded(pending,pre,ray,context,*p);
+          if (none(pending)) break;
+        }
+        return !pending;
+      }
+      /* single lane k of the packet against every primitive */
+      template<int N, int Nx, bool robust>
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        for (const Primitive* p = prim; p != prim+num; ++p) {
+          Intersector::intersect(pre,ray,k,context,*p);
+        }
+      }
+      /* single lane k: stops at the first primitive that reports occlusion */
+      template<int N, int Nx, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        for (const Primitive* p = prim; p != prim+num; ++p) {
+          if (Intersector::occluded(pre,ray,k,context,*p))
+            return true;
+        }
+        return false;
+      }
+    };
+
+    // =============================================================================================
+
+    template<int K, typename IntersectorK>
+    struct ArrayIntersectorKStream // drives a packet IntersectorK over a primitive array and builds the precalculations itself
+    {
+      typedef typename IntersectorK::Primitive PrimitiveK;
+      typedef typename IntersectorK::Precalculations PrecalculationsK;
+
+      static __forceinline void intersectK(const vbool<K>& valid, const Accel::Intersectors* This, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      { /* packet intersect: every primitive is tested with the same lane mask */
+        PrecalculationsK pre(valid,ray); // FIXME: might cause trouble
+
+        for (size_t i=0; i<num; i++) {
+          IntersectorK::intersect(valid,pre,ray,context,prim[i]);
+        }
+      }
+
+      static __forceinline vbool<K> occludedK(const vbool<K>& valid, const Accel::Intersectors* This, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      { /* packet occlusion: returns the complement of the lanes still unoccluded, so lanes inactive in valid are set as well */
+        PrecalculationsK pre(valid,ray); // FIXME: might cause trouble
+        vbool<K> valid0 = valid;
+        for (size_t i=0; i<num; i++) {
+          valid0 &= !IntersectorK::occluded(valid0,pre,ray,context,prim[i]);
+          if (none(valid0)) break; // every lane is already occluded
+        }
+        return !valid0;
+      }
+
+      static __forceinline void intersect(const Accel::Intersectors* This, RayHitK<K>& ray, size_t k, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      { /* single lane k; the precalculation mask is the set of lanes with tnear <= tfar */
+        PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble
+        for (size_t i=0; i<num; i++) {
+          IntersectorK::intersect(pre,ray,k,context,prim[i]);
+        }
+      }
+
+      static __forceinline bool occluded(const Accel::Intersectors* This, RayK<K>& ray, size_t k, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      { /* single lane k: stops at the first primitive that reports occlusion */
+        PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble
+        for (size_t i=0; i<num; i++) {
+          if (IntersectorK::occluded(pre,ray,k,context,prim[i]))
+            return true;
+        }
+        return false;
+      }
+
+      static __forceinline size_t occluded(const Accel::Intersectors* This, size_t cur_mask, RayK<K>** __restrict__ inputPackets, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      { /* stream variant: bit rayID of cur_mask selects lane rayID%K of packet rayID/K; returns the bits of the rays found occluded */
+        size_t m_occluded = 0;
+        for (size_t i=0; i<num; i++) {
+          size_t bits = cur_mask & (~m_occluded); // rays already occluded are not tested again
+          for (; bits!=0; )
+          {
+            const size_t rayID = bscf(bits); // presumably returns the lowest set bit index and clears it in bits
+            RayK<K>& ray = *inputPackets[rayID / K]; // packets are RayK<K>; a RayHitK<K>& cannot bind to one without a cast
+            const size_t k = rayID % K;
+            PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble
+            if (IntersectorK::occluded(pre,ray,k,context,prim[i]))
+            {
+              m_occluded |= (size_t)1 << rayID;
+              ray.tfar[k] = neg_inf; // mark the lane as occluded in the ray itself
+            }
+          }
+        }
+        return m_occluded;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/line_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/line_intersector.h
new file mode 100644
index 0000000000..eef5b0b1fd
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/line_intersector.h
@@ -0,0 +1,141 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+      struct LineIntersectorHitM
+      {
+        /* per-lane hit data for M line segments; all members are public (struct default) */
+        vfloat<M> vu;
+        vfloat<M> vv;
+        vfloat<M> vt;
+        Vec3vf<M> vNg;
+
+        __forceinline LineIntersectorHitM() {}
+        __forceinline LineIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng)
+          : vu(u), vv(v), vt(t), vNg(Ng) {}
+
+        __forceinline void finalize() {} /* no deferred work for this hit type */
+
+        /* scalar accessors for lane i */
+        __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+        __forceinline float t  (const size_t i) const { return vt[i]; }
+        __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+      };
+    
+    template<int M>
+      struct FlatLinearCurveIntersector1 // single-ray test of M line segments, evaluated in the ray space supplied by pre
+      {
+        typedef CurvePrecalculations1 Precalculations;
+        
+        template<typename Epilog>
+        static __forceinline bool intersect(const vbool<M>& valid_i,
+                                            Ray& ray,
+                                            IntersectContext* context,
+                                            const LineSegments* geom,
+                                            const Precalculations& pre,
+                                            const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+                                            const Epilog& epilog)
+        {
+          /* transform end points into ray space (xyz = position, w = radius) */
+          vbool<M> valid = valid_i;
+          vfloat<M> depth_scale = pre.depth_scale;
+          LinearSpace3<Vec3vf<M>> ray_space = pre.ray_space;
+
+          const Vec3vf<M> ray_org ((Vec3fa)ray.org);
+          const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+          const Vec4vf<M> v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i);
+          
+          Vec4vf<M> p0(xfmVector(ray_space,v0.xyz()-ray_org), v0.w);
+          Vec4vf<M> p1(xfmVector(ray_space,v1.xyz()-ray_org), v1.w);
+          
+          /* approximate intersection with cone */
+          const Vec4vf<M> v = p1-p0; // segment in ray space
+          const Vec4vf<M> w = -p0;   // from p0 to the ray-space origin
+          const vfloat<M> d0 = madd(w.x,v.x,w.y*v.y); // xy-plane dot(w,v), assuming madd(a,b,c) = a*b+c
+          const vfloat<M> d1 = madd(v.x,v.x,v.y*v.y); // xy-plane dot(v,v)
+          const vfloat<M> u = clamp(d0*rcp(d1),vfloat<M>(zero),vfloat<M>(one)); // segment parameter clamped to [0,1]
+          const Vec4vf<M> p = madd(u,v,p0); // point on the segment at parameter u (w = radius there)
+          const vfloat<M> t = p.z; // ray-space z is used as the hit distance
+          const vfloat<M> d2 = madd(p.x,p.x,p.y*p.y); // squared xy distance of p from the origin
+          const vfloat<M> r = p.w;
+          const vfloat<M> r2 = r*r;
+          valid &= (d2 <= r2) & (vfloat<M>(ray.tnear()) <= t) & (t <= vfloat<M>(ray.tfar)); // within the radius and inside [tnear, tfar]
+          if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) 
+            valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; // ignore self intersections
+          if (unlikely(none(valid))) return false;
+          
+          /* ignore degenerate segments (end points coincide in x, y and z) */
+          const Vec3vf<M> T = v1.xyz()-v0.xyz();
+          valid &= (T.x != vfloat<M>(zero)) | (T.y != vfloat<M>(zero)) | (T.z != vfloat<M>(zero));
+          if (unlikely(none(valid))) return false;
+          
+          /* update hit information: v is reported as zero and the segment vector T serves as Ng */
+          LineIntersectorHitM<M> hit(u,zero,t,T);
+          return epilog(valid,hit);
+        }
+      };
+    
+    template<int M, int K>
+      struct FlatLinearCurveIntersectorK // tests lane k of a K-wide packet against M line segments, in the ray space supplied by pre
+      {
+        typedef CurvePrecalculationsK<K> Precalculations;
+        
+        template<typename Epilog>
+        static __forceinline bool intersect(const vbool<M>& valid_i,
+                                            RayK<K>& ray, size_t k,
+                                            IntersectContext* context,
+                                            const LineSegments* geom,
+                                            const Precalculations& pre,
+                                            const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+                                            const Epilog& epilog)
+        {
+          /* transform end points into ray space (xyz = position, w = radius) */
+          vbool<M> valid = valid_i;
+          vfloat<M> depth_scale = pre.depth_scale[k];
+          LinearSpace3<Vec3vf<M>> ray_space = pre.ray_space[k];
+          const Vec3vf<M> ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]);
+
+          /* only the ray origin is needed below; the direction enters through ray_space */
+          const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+          const Vec4vf<M> v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i);
+          
+          Vec4vf<M> p0(xfmVector(ray_space,v0.xyz()-ray_org), v0.w);
+          Vec4vf<M> p1(xfmVector(ray_space,v1.xyz()-ray_org), v1.w);
+          
+          /* approximate intersection with cone */
+          const Vec4vf<M> v = p1-p0; // segment in ray space
+          const Vec4vf<M> w = -p0;   // from p0 to the ray-space origin
+          const vfloat<M> d0 = madd(w.x,v.x,w.y*v.y); // xy-plane dot(w,v), assuming madd(a,b,c) = a*b+c
+          const vfloat<M> d1 = madd(v.x,v.x,v.y*v.y); // xy-plane dot(v,v)
+          const vfloat<M> u = clamp(d0*rcp(d1),vfloat<M>(zero),vfloat<M>(one)); // segment parameter clamped to [0,1]
+          const Vec4vf<M> p = madd(u,v,p0); // point on the segment at parameter u (w = radius there)
+          const vfloat<M> t = p.z; // ray-space z is used as the hit distance
+          const vfloat<M> d2 = madd(p.x,p.x,p.y*p.y); // squared xy distance of p from the origin
+          const vfloat<M> r = p.w;
+          const vfloat<M> r2 = r*r;
+          valid &= (d2 <= r2) & (vfloat<M>(ray.tnear()[k]) <= t) & (t <= vfloat<M>(ray.tfar[k])); // within the radius and inside lane k's [tnear, tfar]
+          if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) 
+            valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; // ignore self intersections
+          if (unlikely(none(valid))) return false;
+          
+          /* ignore degenerate segments (end points coincide in x, y and z) */
+          const Vec3vf<M> T = v1.xyz()-v0.xyz();
+          valid &= (T.x != vfloat<M>(zero)) | (T.y != vfloat<M>(zero)) | (T.z != vfloat<M>(zero));
+          if (unlikely(none(valid))) return false;
+          
+          /* update hit information: v is reported as zero and the segment vector T serves as Ng */
+          LineIntersectorHitM<M> hit(u,zero,t,T);
+          return epilog(valid,hit);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/linei.h b/thirdparty/embree-aarch64/kernels/geometry/linei.h
new file mode 100644
index 0000000000..a72029ca53
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/linei.h
@@ -0,0 +1,709 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  template<int M> // M = number of line segments stored per block (see max_size())
+  struct LineMi
+  {
+    /* Virtual interface to query information about the line segment type */
+    struct Type : public PrimitiveType
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;      
+    };
+    static Type type;
+
+  public:
+
+    /* primitive supports multiple time segments */
+    static const bool singleTimeSegment = false;
+
+    /* Returns maximum number of stored line segments */
+    static __forceinline size_t max_size() { return M; }
+
+    /* Returns required number of primitive blocks for N line segments (N/M rounded up) */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+    /* Returns required number of bytes for N line segments (whole blocks only) */
+    static __forceinline size_t bytes(size_t N) { return blocks(N)*sizeof(LineMi); }
+
+  public:
+
+    /* Default constructor; performs no explicit member initialization */
+    __forceinline LineMi() {  }
+
+    /* Construction from start-vertex indices, neighbor bitmasks and IDs; m counts primIDs != -1, sharedGeomID is geomIDs[0] */
+    __forceinline LineMi(const vuint<M>& v0, unsigned short leftExists, unsigned short rightExists, const vuint<M>& geomIDs, const vuint<M>& primIDs, Geometry::GType gtype)
+      : gtype((unsigned char)gtype), m((unsigned char)popcnt(vuint<M>(primIDs) != vuint<M>(-1))), sharedGeomID(geomIDs[0]), leftExists (leftExists), rightExists(rightExists), v0(v0), primIDs(primIDs)
+    {
+      assert(all(vuint<M>(geomID()) == geomIDs)); // every lane must carry the same geometry ID
+    }
+
+    /* Returns a mask that tells which line segments are valid (primID != -1) */
+    __forceinline vbool<M> valid() const { return primIDs != vuint<M>(-1); }
+
+    /* Same validity mask as above, computed at vector width Mx */
+    template<int Mx>
+    __forceinline vbool<Mx> valid() const { return vuint<Mx>(primIDs) != vuint<Mx>(-1); }
+
+    /* Returns if the specified line segment is valid */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return primIDs[i] != -1; }
+
+    /* Returns the number of stored line segments (index of the first invalid lane; fill() packs valid lanes first) */
+    __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+    /* Returns the geometry IDs */
+    //template<class T>
+    //static __forceinline T unmask(T &index) { return index & 0x3fffffff; }
+
+    __forceinline     unsigned int geomID(unsigned int i = 0) const { return sharedGeomID; } // i is ignored: one ID for all lanes
+    //__forceinline       vuint<M> geomID()       { return unmask(geomIDs); }
+    //__forceinline const vuint<M> geomID() const { return unmask(geomIDs); }
+    //__forceinline unsigned int geomID(const size_t i) const { assert(i<M); return unmask(geomIDs[i]); }
+
+    /* Returns the primitive IDs */
+    __forceinline       vuint<M>& primID()       { return primIDs; }
+    __forceinline const vuint<M>& primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* gather start (p0) and end (p1) vertices of the line segments: static, at time step itime, or at float time */
+    __forceinline void gather(Vec4vf<M>& p0,
+                              Vec4vf<M>& p1,
+                              const LineSegments* geom) const;
+
+    __forceinline void gatheri(Vec4vf<M>& p0,
+                               Vec4vf<M>& p1,
+                               const LineSegments* geom,
+                               const int itime) const;
+
+    __forceinline void gather(Vec4vf<M>& p0,
+                              Vec4vf<M>& p1,
+                              const LineSegments* geom,
+                              float time) const;
+
+    /* gather the line segments with lateral info: pL/pR are the neighbor vertices (v0-1 / v0+2), or inf where none exists */
+    __forceinline void gather(Vec4vf<M>& p0,
+                              Vec4vf<M>& p1,
+                              Vec4vf<M>& pL,
+                              Vec4vf<M>& pR,
+                              const LineSegments* geom) const;
+
+    __forceinline void gatheri(Vec4vf<M>& p0,
+                               Vec4vf<M>& p1,
+                               Vec4vf<M>& pL,
+                               Vec4vf<M>& pR,
+                               const LineSegments* geom,
+                               const int itime) const;
+
+    __forceinline void gather(Vec4vf<M>& p0,
+                              Vec4vf<M>& p1,
+                              Vec4vf<M>& pL,
+                              Vec4vf<M>& pR,
+                              const LineSegments* geom,
+                              float time) const;
+
+    __forceinline void gather(Vec4vf<M>& p0, // cL/cR variants: also return the negated leftExists/rightExists masks
+                              Vec4vf<M>& p1,
+                              vbool<M>& cL,
+                              vbool<M>& cR,
+                              const LineSegments* geom) const;
+
+    __forceinline void gatheri(Vec4vf<M>& p0,
+                               Vec4vf<M>& p1,
+                               vbool<M>& cL,
+                               vbool<M>& cR,
+                               const LineSegments* geom,
+                               const int itime) const;
+
+    __forceinline void gather(Vec4vf<M>& p0,
+                              Vec4vf<M>& p1,
+                              vbool<M>& cL,
+                              vbool<M>& cR,
+                              const LineSegments* geom,
+                              float time) const;
+
+    /* Calculate the bounds of the line segments at time step itime, enlarged by the larger endpoint w (presumably the radius) */
+    __forceinline const BBox3fa bounds(const Scene* scene, size_t itime = 0) const
+    {
+      BBox3fa bounds = empty;
+      for (size_t i=0; i<M && valid(i); i++) // stops at the first invalid lane
+      {
+        const LineSegments* geom = scene->get<LineSegments>(geomID(i));
+        const Vec3ff& p0 = geom->vertex(v0[i]+0,itime);
+        const Vec3ff& p1 = geom->vertex(v0[i]+1,itime);
+        BBox3fa b = merge(BBox3fa(p0),BBox3fa(p1));
+        b = enlarge(b,Vec3fa(max(p0.w,p1.w)));
+        bounds.extend(b);
+      }
+      return bounds;
+    }
+
+    /* Calculate the linear bounds of the primitive from the bounds at time steps itime and itime+1 */
+    __forceinline LBBox3fa linearBounds(const Scene* scene, size_t itime) {
+      return LBBox3fa(bounds(scene,itime+0), bounds(scene,itime+1));
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) { // union of the per-segment linear bounds reported by the geometry
+      LBBox3fa allBounds = empty;
+      for (size_t i=0; i<M && valid(i); i++)
+      {
+        const LineSegments* geom = scene->get<LineSegments>(geomID(i));
+        allBounds.extend(geom->linearBounds(primID(i), itime, numTimeSteps));
+      }
+      return allBounds;
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) 
+    {
+      LBBox3fa allBounds = empty;
+      for (size_t i=0; i<M && valid(i); i++)
+      {
+        const LineSegments* geom = scene->get<LineSegments>(geomID((unsigned int)i));
+        allBounds.extend(geom->linearBounds(primID(i), time_range));
+      }
+      return allBounds;
+    }
+
+    /* Fill this block with up to M segments from prims[begin,end); advances begin; unused lanes get primID -1 and repeat the previous lane's geomID/v0 */
+    template<typename PrimRefT>
+    __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      Geometry::GType gty = scene->get(prims[begin].geomID())->getType(); // type is taken from the first primitive only
+      vuint<M> geomID, primID;
+      vuint<M> v0;
+      unsigned short leftExists = 0;
+      unsigned short rightExists = 0;
+      const PrimRefT* prim = &prims[begin];
+
+      for (size_t i=0; i<M; i++)
+      {
+        const LineSegments* geom = scene->get<LineSegments>(prim->geomID());
+        if (begin<end) {
+          geomID[i] = prim->geomID();
+          primID[i] = prim->primID();
+          v0[i] = geom->segment(prim->primID());
+          leftExists |= geom->segmentLeftExists(primID[i]) << i; // bit i records the neighbor flag of lane i
+          rightExists |= geom->segmentRightExists(primID[i]) << i;         
+          begin++;
+        } else {
+          assert(i); // lane 0 must always receive a primitive
+          if (i>0) {
+            geomID[i] = geomID[i-1];
+            primID[i] = -1;
+            v0[i] = v0[i-1];
+          }
+        }
+        if (begin<end) prim = &prims[begin]; // FIXME: remove this line
+      }
+      new (this) LineMi(v0,leftExists,rightExists,geomID,primID,gty); // FIXME: use non temporal store
+    }
+
+     template<typename BVH, typename Allocator> // allocates blocks(set.size()) LineMi blocks, fills them from prims and returns the encoded leaf
+      __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range<size_t>& set, const Allocator& alloc)
+    {
+      size_t start = set.begin();
+      size_t items = LineMi::blocks(set.size());
+      size_t numbytes = LineMi::bytes(set.size());
+      LineMi* accel = (LineMi*) alloc.malloc1(numbytes,M*sizeof(float));
+      for (size_t i=0; i<items; i++) {
+        accel[i].fill(prims,start,set.end(),bvh->scene);
+      }
+      return bvh->encodeLeaf((char*)accel,items);
+    };
+    
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) // fill(), then linear bounds over time steps itime and itime+1
+    {
+      fill(prims,begin,end,scene);
+      return linearBounds(scene,itime);
+    }
+
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) // fill(), then linear bounds over time_range
+    {
+      fill(prims,begin,end,scene);
+      return linearBounds(scene,time_range);
+    }
+
+      template<typename BVH, typename SetMB, typename Allocator> // motion-blur leaf: fills the blocks and accumulates their linear bounds over prims.time_range
+    __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc)
+    {
+      size_t start = prims.begin();
+      size_t end   = prims.end();
+      size_t items = LineMi::blocks(prims.size());
+      size_t numbytes = LineMi::bytes(prims.size());
+      LineMi* accel = (LineMi*) alloc.malloc1(numbytes,M*sizeof(float));
+      const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,items);
+      
+      LBBox3fa bounds = empty;
+      for (size_t i=0; i<items; i++)
+        bounds.extend(accel[i].fillMB(prims.prims->data(),start,end,bvh->scene,prims.time_range));
+      
+      return typename BVH::NodeRecordMB4D(node,bounds,prims.time_range);
+    };
+
+    /* Recomputes and returns the bounds from geom's vertices; the stored block data is not modified */
+    __forceinline BBox3fa update(LineSegments* geom)
+    {
+      BBox3fa bounds = empty;
+      for (size_t i=0; i<M && valid(i); i++)
+      {
+        const Vec3ff& p0 = geom->vertex(v0[i]+0);
+        const Vec3ff& p1 = geom->vertex(v0[i]+1);
+        BBox3fa b = merge(BBox3fa(p0),BBox3fa(p1));
+        b = enlarge(b,Vec3fa(max(p0.w,p1.w)));
+        bounds.extend(b);
+      }
+      return bounds;
+    }
+
+    /*! output operator */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const LineMi& line) {
+      return cout << "Line" << M << "i {" << line.v0 << ", " << line.geomID() << ", " << line.primID() << "}";
+    }
+    
+  public:
+    unsigned char gtype; // Geometry::GType of the block, narrowed to one byte
+    unsigned char m; // number of lanes whose primID != -1 (set by the constructor)
+    unsigned int sharedGeomID; // geometry ID shared by all lanes
+    unsigned short leftExists, rightExists; // bit i = segmentLeftExists / segmentRightExists of lane i
+    vuint<M> v0;      // index of start vertex
+  private:
+    vuint<M> primIDs; // primitive ID
+  };
+
+  template<> // 4-wide: fetch start (p0) and end (p1) vertices of the four segments from geom
+    __forceinline void LineMi<4>::gather(Vec4vf4& p0,
+                                         Vec4vf4& p1,
+                                         const LineSegments* geom) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); // one unaligned 4-float load per lane
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1]));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2]));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3]));
+    transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); // per-lane vertices -> per-component x/y/z/w vectors
+
+    const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); // end vertex has index v0[i]+1
+    const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1));
+    const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1));
+    const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1));
+    transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w);
+  }
+
+  template<> // 4-wide: like the static gather, but reads the vertices of time step itime
+  __forceinline void LineMi<4>::gatheri(Vec4vf4& p0,
+                                       Vec4vf4& p1,
+                                       const LineSegments* geom,
+                                       const int itime) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); // start vertex of each lane at time step itime
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime));
+    transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); // per-lane vertices -> per-component x/y/z/w vectors
+
+    const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); // end vertex has index v0[i]+1
+    const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime));
+    const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime));
+    const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime));
+    transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w);
+  }
+
+  template<> // 4-wide: vertices at a float time, interpolated between two adjacent time steps
+    __forceinline void LineMi<4>::gather(Vec4vf4& p0,
+                                         Vec4vf4& p1,
+                                         const LineSegments* geom,
+                                         float time) const
+  {
+    float ftime; // written by timeSegment(); used below as the lerp weight
+    const int itime = geom->timeSegment(time, ftime);
+
+    Vec4vf4 a0,a1;
+    gatheri(a0,a1,geom,itime); // vertices at time step itime
+    Vec4vf4 b0,b1;
+    gatheri(b0,b1,geom,itime+1); // vertices at the following time step
+    p0 = lerp(a0,b0,vfloat4(ftime));
+    p1 = lerp(a1,b1,vfloat4(ftime));
+  }
+
+  template<> // 4-wide: static gather plus the negated neighbor masks cL/cR
+    __forceinline void LineMi<4>::gather(Vec4vf4& p0,
+                                         Vec4vf4& p1,
+                                         vbool4&  cL,
+                                         vbool4&  cR,
+                                         const LineSegments* geom) const
+  {
+    gather(p0,p1,geom);
+    cL = !vbool4(leftExists); // presumably: lane set where the segment has no left neighbor (mask from leftExists bits, negated)
+    cR = !vbool4(rightExists); // same for the right neighbor
+  }
+
+  template<> // 4-wide: gather at time step itime plus the negated neighbor masks cL/cR
+    __forceinline void LineMi<4>::gatheri(Vec4vf4& p0,
+                                          Vec4vf4& p1,
+                                          vbool4&  cL,
+                                          vbool4&  cR,
+                                          const LineSegments* geom,
+                                          const int itime) const
+  {
+    gatheri(p0,p1,geom,itime);
+    cL = !vbool4(leftExists); // presumably: lane set where the segment has no left neighbor (mask from leftExists bits, negated)
+    cR = !vbool4(rightExists); // same for the right neighbor
+  }
+
+  template<> // 4-wide: time-interpolated vertices plus the negated neighbor masks cL/cR
+    __forceinline void LineMi<4>::gather(Vec4vf4& p0,
+                                         Vec4vf4& p1,
+                                         vbool4&  cL,
+                                         vbool4&  cR,
+                                         const LineSegments* geom,
+                                         float time) const
+  {
+    float ftime; // written by timeSegment(); used below as the lerp weight
+    const int itime = geom->timeSegment(time, ftime);
+    
+    Vec4vf4 a0,a1;
+    gatheri(a0,a1,geom,itime); // vertices at time step itime
+    Vec4vf4 b0,b1;
+    gatheri(b0,b1,geom,itime+1); // vertices at the following time step
+    p0 = lerp(a0,b0,vfloat4(ftime));
+    p1 = lerp(a1,b1,vfloat4(ftime));
+    cL = !vbool4(leftExists); // masks do not depend on time
+    cR = !vbool4(rightExists);
+  }
+
+  template<> // 4-wide: segment vertices p0/p1 plus left/right neighbor vertices pL/pR
+    __forceinline void LineMi<4>::gather(Vec4vf4& p0,
+                                              Vec4vf4& p1,
+                                              Vec4vf4& pL,
+                                              Vec4vf4& pR,
+                                              const LineSegments* geom) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); // start vertex of each lane
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1]));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2]));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3]));
+    transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); // per-lane vertices -> per-component x/y/z/w vectors
+
+    const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); // end vertex has index v0[i]+1
+    const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1));
+    const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1));
+    const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1));
+    transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w);
+    
+    const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1)) : vfloat4(inf); // vertex v0[i]-1 if bit i of leftExists is set, else inf
+    const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1)) : vfloat4(inf);
+    const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1)) : vfloat4(inf);
+    const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1)) : vfloat4(inf);
+    transpose(l0,l1,l2,l3,pL.x,pL.y,pL.z,pL.w);
+    
+    const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2)) : vfloat4(inf); // vertex v0[i]+2 if bit i of rightExists is set, else inf
+    const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2)) : vfloat4(inf);
+    const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2)) : vfloat4(inf);
+    const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2)) : vfloat4(inf);
+    transpose(r0,r1,r2,r3,pR.x,pR.y,pR.z,pR.w);
+  }
+  
+  template<> // 4-wide: segment and neighbor vertices read at time step itime
+    __forceinline void LineMi<4>::gatheri(Vec4vf4& p0,
+                                              Vec4vf4& p1,
+                                              Vec4vf4& pL,
+                                              Vec4vf4& pR,
+                                              const LineSegments* geom,
+                                              const int itime) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); // start vertex of each lane
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime));
+    transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); // per-lane vertices -> per-component x/y/z/w vectors
+    
+    const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); // end vertex has index v0[i]+1
+    const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime));
+    const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime));
+    const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime));
+    transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w);
+    
+    const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1,itime)) : vfloat4(inf); // vertex v0[i]-1 if bit i of leftExists is set, else inf
+    const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1,itime)) : vfloat4(inf);
+    const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1,itime)) : vfloat4(inf);
+    const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1,itime)) : vfloat4(inf);
+    transpose(l0,l1,l2,l3,pL.x,pL.y,pL.z,pL.w);
+    
+    const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2,itime)) : vfloat4(inf); // vertex v0[i]+2 if bit i of rightExists is set, else inf
+    const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2,itime)) : vfloat4(inf);
+    const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2,itime)) : vfloat4(inf);
+    const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2,itime)) : vfloat4(inf);
+    transpose(r0,r1,r2,r3,pR.x,pR.y,pR.z,pR.w);
+  }
+  
+  template<> // 4-wide: segment and neighbor vertices at a float time, interpolated between two adjacent time steps
+    __forceinline void LineMi<4>::gather(Vec4vf4& p0,
+                                              Vec4vf4& p1,
+                                              Vec4vf4& pL,
+                                              Vec4vf4& pR,
+                                              const LineSegments* geom,
+                                              float time) const
+  {
+    float ftime; // written by timeSegment(); used below as the lerp weight
+    const int itime = geom->timeSegment(time, ftime);
+    
+    Vec4vf4 a0,a1,aL,aR;
+    gatheri(a0,a1,aL,aR,geom,itime); // data at time step itime
+    Vec4vf4 b0,b1,bL,bR;
+    gatheri(b0,b1,bL,bR,geom,itime+1); // data at the following time step
+    p0 = lerp(a0,b0,vfloat4(ftime));
+    p1 = lerp(a1,b1,vfloat4(ftime));
+    pL = lerp(aL,bL,vfloat4(ftime));
+    pR = lerp(aR,bR,vfloat4(ftime));
+  }
+
+#if defined(__AVX__)
+
+  template<> // 8-wide: fetch start (p0) and end (p1) vertices of the eight segments from geom
+    __forceinline void LineMi<8>::gather(Vec4vf8& p0,
+                                         Vec4vf8& p1,
+                                         const LineSegments* geom) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); // one unaligned 4-float load per lane
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1]));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2]));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3]));
+    const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4]));
+    const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5]));
+    const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6]));
+    const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7]));
+    transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); // eight per-lane vertices -> per-component x/y/z/w vectors
+
+    const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); // end vertex has index v0[i]+1
+    const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1));
+    const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1));
+    const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1));
+    const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1));
+    const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1));
+    const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1));
+    const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1));
+    transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w);
+  }
+
+  template<> // 8-wide: like the static gather, but reads the vertices of time step itime
+  __forceinline void LineMi<8>::gatheri(Vec4vf8& p0,
+                                       Vec4vf8& p1,
+                                       const LineSegments* geom,
+                                       const int itime) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); // start vertex of each lane at time step itime
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime));
+    const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4],itime));
+    const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5],itime));
+    const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6],itime));
+    const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7],itime));
+    transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); // eight per-lane vertices -> per-component x/y/z/w vectors
+
+    const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); // end vertex has index v0[i]+1
+    const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime));
+    const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime));
+    const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime));
+    const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1,itime));
+    const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1,itime));
+    const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1,itime));
+    const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1,itime));
+    transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w);
+  }
+
+  template<> // 8-wide: vertices at a float time, interpolated between two adjacent time steps
+    __forceinline void LineMi<8>::gather(Vec4vf8& p0,
+                                         Vec4vf8& p1,
+                                         const LineSegments* geom,
+                                         float time) const
+  {
+    float ftime; // written by timeSegment(); used below as the lerp weight
+    const int itime = geom->timeSegment(time, ftime);
+
+    Vec4vf8 a0,a1;
+    gatheri(a0,a1,geom,itime); // vertices at time step itime
+    Vec4vf8 b0,b1;
+    gatheri(b0,b1,geom,itime+1); // vertices at the following time step
+    p0 = lerp(a0,b0,vfloat8(ftime));
+    p1 = lerp(a1,b1,vfloat8(ftime));
+  }
+  
+  template<> // 8-wide: segment vertices p0/p1 plus left/right neighbor vertices pL/pR
+    __forceinline void LineMi<8>::gather(Vec4vf8& p0,
+                                              Vec4vf8& p1,
+                                              Vec4vf8& pL,
+                                              Vec4vf8& pR,
+                                              const LineSegments* geom) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); // start vertex of each lane
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1]));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2]));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3]));
+    const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4]));
+    const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5]));
+    const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6]));
+    const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7]));
+    transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); // eight per-lane vertices -> per-component x/y/z/w vectors
+    
+    const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); // end vertex has index v0[i]+1
+    const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1));
+    const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1));
+    const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1));
+    const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1));
+    const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1));
+    const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1));
+    const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1));
+    transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w);
+    
+    const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1)) : vfloat4(inf); // vertex v0[i]-1 if bit i of leftExists is set, else inf
+    const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1)) : vfloat4(inf);
+    const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1)) : vfloat4(inf);
+    const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1)) : vfloat4(inf);
+    const vfloat4 l4 = (leftExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]-1)) : vfloat4(inf);
+    const vfloat4 l5 = (leftExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]-1)) : vfloat4(inf);
+    const vfloat4 l6 = (leftExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]-1)) : vfloat4(inf);
+    const vfloat4 l7 = (leftExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]-1)) : vfloat4(inf);
+    transpose(l0,l1,l2,l3,l4,l5,l6,l7,pL.x,pL.y,pL.z,pL.w);
+    
+    const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2)) : vfloat4(inf); // vertex v0[i]+2 if bit i of rightExists is set, else inf
+    const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2)) : vfloat4(inf);
+    const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2)) : vfloat4(inf);
+    const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2)) : vfloat4(inf);
+    const vfloat4 r4 = (rightExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]+2)) : vfloat4(inf);
+    const vfloat4 r5 = (rightExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]+2)) : vfloat4(inf);
+    const vfloat4 r6 = (rightExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]+2)) : vfloat4(inf);
+    const vfloat4 r7 = (rightExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]+2)) : vfloat4(inf);
+    transpose(r0,r1,r2,r3,r4,r5,r6,r7,pR.x,pR.y,pR.z,pR.w);
+  }
+  
+  template<> // 8-wide: segment and neighbor vertices read at time step itime
+    __forceinline void LineMi<8>::gatheri(Vec4vf8& p0,
+                                              Vec4vf8& p1,
+                                              Vec4vf8& pL,
+                                              Vec4vf8& pR,
+                                              const LineSegments* geom,
+                                              const int itime) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); // start vertex of each lane
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime));
+    const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4],itime));
+    const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5],itime));
+    const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6],itime));
+    const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7],itime));
+    transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); // eight per-lane vertices -> per-component x/y/z/w vectors
+    
+    const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); // end vertex has index v0[i]+1
+    const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime));
+    const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime));
+    const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime));
+    const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1,itime));
+    const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1,itime));
+    const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1,itime));
+    const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1,itime));
+    transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w);
+    
+    const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1,itime)) : vfloat4(inf); // vertex v0[i]-1 if bit i of leftExists is set, else inf
+    const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1,itime)) : vfloat4(inf);
+    const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1,itime)) : vfloat4(inf);
+    const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1,itime)) : vfloat4(inf);
+    const vfloat4 l4 = (leftExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]-1,itime)) : vfloat4(inf);
+    const vfloat4 l5 = (leftExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]-1,itime)) : vfloat4(inf);
+    const vfloat4 l6 = (leftExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]-1,itime)) : vfloat4(inf);
+    const vfloat4 l7 = (leftExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]-1,itime)) : vfloat4(inf);
+    transpose(l0,l1,l2,l3,l4,l5,l6,l7,pL.x,pL.y,pL.z,pL.w);
+    
+    const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2,itime)) : vfloat4(inf); // vertex v0[i]+2 if bit i of rightExists is set, else inf
+    const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2,itime)) : vfloat4(inf);
+    const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2,itime)) : vfloat4(inf);
+    const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2,itime)) : vfloat4(inf);
+    const vfloat4 r4 = (rightExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]+2,itime)) : vfloat4(inf);
+    const vfloat4 r5 = (rightExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]+2,itime)) : vfloat4(inf);
+    const vfloat4 r6 = (rightExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]+2,itime)) : vfloat4(inf);
+    const vfloat4 r7 = (rightExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]+2,itime)) : vfloat4(inf);
+    transpose(r0,r1,r2,r3,r4,r5,r6,r7,pR.x,pR.y,pR.z,pR.w);
+  }
+  
+  template<> // 8-wide: segment and neighbor vertices at a float time, interpolated between two adjacent time steps
+    __forceinline void LineMi<8>::gather(Vec4vf8& p0,
+                                              Vec4vf8& p1,
+                                              Vec4vf8& pL,
+                                              Vec4vf8& pR,
+                                              const LineSegments* geom,
+                                              float time) const
+  {
+    float ftime; // written by timeSegment(); used below as the lerp weight
+    const int itime = geom->timeSegment(time, ftime);
+    
+    Vec4vf8 a0,a1,aL,aR;
+    gatheri(a0,a1,aL,aR,geom,itime); // data at time step itime
+    Vec4vf8 b0,b1,bL,bR;
+    gatheri(b0,b1,bL,bR,geom,itime+1); // data at the following time step
+    p0 = lerp(a0,b0,vfloat8(ftime));
+    p1 = lerp(a1,b1,vfloat8(ftime));
+    pL = lerp(aL,bL,vfloat8(ftime));
+    pR = lerp(aR,bR,vfloat8(ftime));
+  }
+
+  template<> // 8-wide: static gather plus the negated neighbor masks cL/cR
+    __forceinline void LineMi<8>::gather(Vec4vf8& p0,
+                                         Vec4vf8& p1,
+                                         vbool8& cL,
+                                         vbool8& cR,
+                                         const LineSegments* geom) const
+  {
+    gather(p0,p1,geom);
+    cL = !vbool8(leftExists); // presumably: lane set where the segment has no left neighbor (mask from leftExists bits, negated)
+    cR = !vbool8(rightExists); // same for the right neighbor
+  }
+  
+  template<> // 8-wide: gather at time step itime plus the negated neighbor masks cL/cR
+    __forceinline void LineMi<8>::gatheri(Vec4vf8& p0,
+                                              Vec4vf8& p1,
+                                              vbool8& cL,
+                                              vbool8& cR,
+                                              const LineSegments* geom,
+                                              const int itime) const
+  {
+    gatheri(p0,p1,geom,itime);
+    cL = !vbool8(leftExists); // presumably: lane set where the segment has no left neighbor (mask from leftExists bits, negated)
+    cR = !vbool8(rightExists); // same for the right neighbor
+  }
+  
+  template<> // 8-wide: time-interpolated vertices plus the negated neighbor masks cL/cR
+    __forceinline void LineMi<8>::gather(Vec4vf8& p0,
+                                              Vec4vf8& p1,
+                                              vbool8& cL,
+                                              vbool8& cR,
+                                              const LineSegments* geom,
+                                              float time) const
+  {
+    float ftime; // written by timeSegment(); used below as the lerp weight
+    const int itime = geom->timeSegment(time, ftime);
+    
+    Vec4vf8 a0,a1;
+    gatheri(a0,a1,geom,itime); // vertices at time step itime
+    Vec4vf8 b0,b1;
+    gatheri(b0,b1,geom,itime+1); // vertices at the following time step
+    p0 = lerp(a0,b0,vfloat8(ftime));
+    p1 = lerp(a1,b1,vfloat8(ftime));
+    cL = !vbool8(leftExists); // masks do not depend on time
+    cR = !vbool8(rightExists);
+  }
+  
+#endif
+  
+  template<int M> // out-of-class definition of the static member LineMi<M>::type
+  typename LineMi<M>::Type LineMi<M>::type;
+
+  typedef LineMi<4> Line4i; // block of up to 4 line segments
+  typedef LineMi<8> Line8i; // block of up to 8 line segments
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/linei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/linei_intersector.h
new file mode 100644
index 0000000000..a431796a88
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/linei_intersector.h
@@ -0,0 +1,124 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "linei.h"
+#include "line_intersector.h"
+#include "intersector_epilog.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M, int Mx, bool filter>
+    struct FlatLinearCurveMiIntersector1 // single-ray front end for LineMi<M> blocks: gathers the vertices, then forwards to FlatLinearCurveIntersector1<Mx>
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) // hits are handed to an Intersect1EpilogM<M,Mx,filter>
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); // geometry this block refers to
+        Vec4vf<M> v0,v1; line.gather(v0,v1,geom); // v0/v1: the two gathered vertex sets, M lanes each
+        const vbool<Mx> valid = line.template valid<Mx>(); // Mx-wide lane mask supplied by the block (presumably the populated slots -- confirm in LineMi)
+        FlatLinearCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,Intersect1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) // same pipeline with an Occluded1EpilogM; returns the kernel's result
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; line.gather(v0,v1,geom);
+        const vbool<Mx> valid = line.template valid<Mx>();
+        return FlatLinearCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,Occluded1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) // forwards to the generic PrimitivePointQuery1 helper
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+      }
+    };
+
+    template<int M, int Mx, bool filter>
+    struct FlatLinearCurveMiMBIntersector1 // like FlatLinearCurveMiIntersector1, but the vertices are gathered at ray.time() ("MB" presumably = motion blur)
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) // hits are handed to an Intersect1EpilogM<M,Mx,filter>
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); // geometry this block refers to
+        Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time()); // time-dependent gather overload
+        const vbool<Mx> valid = line.template valid<Mx>(); // Mx-wide lane mask supplied by the block
+        FlatLinearCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,Intersect1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) // same pipeline with an Occluded1EpilogM; returns the kernel's result
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time());
+        const vbool<Mx> valid = line.template valid<Mx>();
+        return FlatLinearCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,Occluded1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) // forwards to the generic PrimitivePointQuery1 helper
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+      }
+    };
+
+    template<int M, int Mx, int K, bool filter>
+    struct FlatLinearCurveMiIntersectorK // handles ray k of a K-wide ray packet against a LineMi<M> block via FlatLinearCurveIntersectorK<Mx,K>
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) // hits are handed to an Intersect1KEpilogM<M,Mx,K,filter>
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); // geometry this block refers to
+        Vec4vf<M> v0,v1; line.gather(v0,v1,geom); // v0/v1: the two gathered vertex sets, M lanes each
+        const vbool<Mx> valid = line.template valid<Mx>(); // Mx-wide lane mask supplied by the block
+        FlatLinearCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) // same pipeline with an Occluded1KEpilogM; returns the kernel's result
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; line.gather(v0,v1,geom);
+        const vbool<Mx> valid = line.template valid<Mx>();
+        return FlatLinearCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+    };
+
+    template<int M, int Mx, int K, bool filter>
+    struct FlatLinearCurveMiMBIntersectorK // like FlatLinearCurveMiIntersectorK, but the vertices are gathered at the time of ray k
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context,  const Primitive& line) // hits are handed to an Intersect1KEpilogM<M,Mx,K,filter>
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); // geometry this block refers to
+        Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time()[k]); // ray.time()[k]: time value of lane k of the packet
+        const vbool<Mx> valid = line.template valid<Mx>(); // Mx-wide lane mask supplied by the block
+        FlatLinearCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) // same pipeline with an Occluded1KEpilogM; returns the kernel's result
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time()[k]);
+        const vbool<Mx> valid = line.template valid<Mx>();
+        return FlatLinearCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/object.h b/thirdparty/embree-aarch64/kernels/geometry/object.h
new file mode 100644
index 0000000000..f26391de52
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/object.h
@@ -0,0 +1,84 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  struct Object // leaf primitive that stores nothing but a (geomID, primID) pair
+  {
+    struct Type : public PrimitiveType 
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* primitive supports multiple time segments */
+    static const bool singleTimeSegment = false;
+
+    /* Returns maximum number of stored primitives */
+    static __forceinline size_t max_size() { return 1; }
+
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return N; }
+
+  public:
+
+    /*! constructs an object primitive from its geometry ID and primitive ID */
+    Object (unsigned geomID, unsigned primID) 
+    : _geomID(geomID), _primID(primID) {}
+
+    __forceinline unsigned geomID() const {
+      return _geomID;
+    }
+
+    __forceinline unsigned primID() const {
+      return _primID;
+    }
+
+    /*! re-initializes this Object from prims[i] and advances i; end and scene are not used */
+    __forceinline void fill(const PrimRef* prims, size_t& i, size_t end, Scene* scene)
+    {
+      const PrimRef& prim = prims[i]; i++;
+      new (this) Object(prim.geomID(), prim.primID()); // placement new: construct in the storage of *this
+    }
+
+    /*! same as fill(), and returns accel->linearBounds(primID,itime) of the referenced geometry; end is not used */
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& i, size_t end, Scene* scene, size_t itime)
+    {
+      const PrimRef& prim = prims[i]; i++;
+      const unsigned geomID = prim.geomID();
+      const unsigned primID = prim.primID();
+      new (this) Object(geomID, primID);
+      AccelSet* accel = (AccelSet*) scene->get(geomID); // unchecked cast: the geometry is assumed to be an AccelSet
+      return accel->linearBounds(primID,itime);
+    }
+
+    /*! same as fill() for PrimRefMB input, and returns accel->linearBounds(primID,time_range); end is not used */
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& i, size_t end, Scene* scene, const BBox1f time_range)
+    {
+      const PrimRefMB& prim = prims[i]; i++;
+      const unsigned geomID = prim.geomID();
+      const unsigned primID = prim.primID();
+      new (this) Object(geomID, primID);
+      AccelSet* accel = (AccelSet*) scene->get(geomID); // unchecked cast: the geometry is assumed to be an AccelSet
+      return accel->linearBounds(primID,time_range);
+    }
+
+    /* Returns the bounds that mesh reports for this primitive ID; the stored IDs are not modified */
+    __forceinline BBox3fa update(AccelSet* mesh) {
+      return mesh->bounds(primID());
+    }
+
+  private:
+    unsigned int _geomID;  //!< geometry ID
+    unsigned int _primID;  //!< primitive ID
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/object_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/object_intersector.h
new file mode 100644
index 0000000000..97882e0e59
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/object_intersector.h
@@ -0,0 +1,127 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "object.h"
+#include "../common/ray.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<bool mblur>
+    struct ObjectIntersector1 // single-ray intersector for Object primitives: forwards each query to the AccelSet that owns the primitive (mblur is not referenced in this struct)
+    {
+      typedef Object Primitive;
+     
+      static const bool validIntersectorK = false;
+
+      struct Precalculations { // empty: nothing is precomputed per ray
+        __forceinline Precalculations() {}
+        __forceinline Precalculations (const Ray& ray, const void *ptr) {}
+      };
+      
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) 
+      {
+        AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID()); // unchecked cast: the geometry is assumed to be an AccelSet
+
+        /* perform ray mask test */
+#if defined(EMBREE_RAY_MASK)
+        if ((ray.mask & accel->mask) == 0) 
+          return;
+#endif
+
+        accel->intersect(ray,prim.geomID(),prim.primID(),context,reportIntersection1); // reportIntersection1 is passed on as the report callback
+      }
+      
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID());
+        /* perform ray mask test */
+#if defined(EMBREE_RAY_MASK)
+        if ((ray.mask & accel->mask) == 0) 
+          return false;
+#endif
+
+        accel->occluded(ray,prim.geomID(),prim.primID(),context,&reportOcclusion1);
+        return ray.tfar < 0.0f; // true iff tfar is negative after the call (presumably how a hit is flagged -- confirm against the Embree API)
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim)
+      {
+        AccelSet* accel = (AccelSet*)context->scene->get(prim.geomID());
+        context->geomID = prim.geomID(); // publish the IDs in the context before the query runs
+        context->primID = prim.primID();
+        return accel->pointQuery(query, context);
+      }
+      
+      template<int K>
+      static __forceinline void intersectK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+      {
+        assert(false); // not implemented here (validIntersectorK is false above)
+      }
+
+      template<int K>
+      static __forceinline vbool<K> occludedK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+      {
+        assert(false); // not implemented here (validIntersectorK is false above)
+        return valid;
+      }
+    };
+
+    template<int K, bool mblur>
+    struct ObjectIntersectorK // K-wide packet intersector for Object primitives: forwards to the owning AccelSet (mblur is not referenced in this struct)
+    {
+      typedef Object Primitive;
+      
+      struct Precalculations { // empty: nothing is precomputed per packet
+        __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {}
+      };
+      
+      static __forceinline void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vbool<K> valid = valid_i; // local copy, narrowed by the mask test below when EMBREE_RAY_MASK is defined
+        AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID()); // unchecked cast: the geometry is assumed to be an AccelSet
+        
+        /* perform ray mask test */
+#if defined(EMBREE_RAY_MASK)
+        valid &= (ray.mask & accel->mask) != 0;
+        if (none(valid)) return;
+#endif
+        accel->intersect(valid,ray,prim.geomID(),prim.primID(),context,&reportIntersection1);
+      }
+
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vbool<K> valid = valid_i;
+        AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID());
+        
+        /* perform ray mask test */
+#if defined(EMBREE_RAY_MASK)
+        valid &= (ray.mask & accel->mask) != 0;
+        if (none(valid)) return false;
+#endif
+        accel->occluded(valid,ray,prim.geomID(),prim.primID(),context,&reportOcclusion1);
+        return ray.tfar < 0.0f; // per-lane result: set where tfar is negative after the call
+      }
+      
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) { // ray k only
+        intersect(vbool<K>(1<<int(k)),pre,ray,context,prim); // mask built from the integer 1<<k (presumably selects lane k only -- confirm vbool's int constructor)
+      }
+      
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) { // ray k only
+        occluded(vbool<K>(1<<int(k)),pre,ray,context,prim); // returned mask is ignored; lane k is read back from ray.tfar below
+        return ray.tfar[k] < 0.0f; 
+      }
+    };
+
+    typedef ObjectIntersectorK<4,false>  ObjectIntersector4;   // packet widths 4, 8 and 16 with mblur = false
+    typedef ObjectIntersectorK<8,false>  ObjectIntersector8;
+    typedef ObjectIntersectorK<16,false> ObjectIntersector16;
+
+    typedef ObjectIntersectorK<4,true>  ObjectIntersector4MB;  // the same widths with mblur = true
+    typedef ObjectIntersectorK<8,true>  ObjectIntersector8MB;
+    typedef ObjectIntersectorK<16,true> ObjectIntersector16MB;
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/plane.h b/thirdparty/embree-aarch64/kernels/geometry/plane.h
new file mode 100644
index 0000000000..ebe45db558
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/plane.h
@@ -0,0 +1,57 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct HalfPlane
+    {
+      const Vec3fa P;  //!< origin point of the plane
+      const Vec3fa N;  //!< normal of the plane
+
+      __forceinline HalfPlane(const Vec3fa& P, const Vec3fa& N)
+        : P(P), N(N) {}
+      /*! returns the ray parameter range [lower,upper] for which dot(ray_org + t*ray_dir - P, N) >= 0 */
+      __forceinline BBox1f intersect(const Vec3fa& ray_org, const Vec3fa& ray_dir) const
+      {
+        const Vec3fa rel_org = Vec3fa(ray_org) - P;
+        const Vec3fa dir = Vec3fa(ray_dir);
+        const float org_dot_n = dot(rel_org,N);
+        const float dir_dot_n = dot(dir,N);
+        const float t_plane = -org_dot_n*rcp(dir_dot_n);
+        const bool degenerate = abs(dir_dot_n) < min_rcp_input;  // direction (almost) parallel to the plane: range is left unbounded
+        const float t_min = select(degenerate || dir_dot_n < 0.0f, float(neg_inf), t_plane);
+        const float t_max = select(degenerate || dir_dot_n > 0.0f, float(pos_inf), t_plane);
+        return BBox1f(t_min,t_max);
+      }
+    };
+
+    template<int M>
+      struct HalfPlaneN
+      {
+        const Vec3vf<M> P;  //!< origin points of the M planes
+        const Vec3vf<M> N;  //!< normals of the M planes
+
+        __forceinline HalfPlaneN(const Vec3vf<M>& P, const Vec3vf<M>& N)
+          : P(P), N(N) {}
+        /*! per lane: the ray parameter range [lower,upper] for which dot(ray_org + t*ray_dir - P, N) >= 0 */
+        __forceinline BBox<vfloat<M>> intersect(const Vec3fa& ray_org, const Vec3fa& ray_dir) const
+        {
+          const Vec3vf<M> rel_org = Vec3vf<M>((Vec3fa)ray_org) - P;
+          const Vec3vf<M> dir = Vec3vf<M>((Vec3fa)ray_dir);
+          const vfloat<M> org_dot_n = dot(rel_org,N);
+          const vfloat<M> dir_dot_n = dot(dir,N);
+          const vfloat<M> t_plane = -org_dot_n*rcp(dir_dot_n);
+          const vbool<M> degenerate = abs(dir_dot_n) < min_rcp_input;  // lanes whose plane is (almost) parallel to the ray
+          const vfloat<M> t_min = select(degenerate | (dir_dot_n < 0.0f), vfloat<M>(neg_inf), t_plane);
+          const vfloat<M> t_max = select(degenerate | (dir_dot_n > 0.0f), vfloat<M>(pos_inf), t_plane);
+          return BBox<vfloat<M>>(t_min,t_max);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/pointi.h b/thirdparty/embree-aarch64/kernels/geometry/pointi.h
new file mode 100644
index 0000000000..4ba298e86b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/pointi.h
@@ -0,0 +1,417 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  template<int M>
+  struct PointMi  // block of up to M point primitives that share one geometry ID; only IDs are stored, vertex data is gathered from the Points geometry on demand
+  {
+    /* Virtual interface to query information about the point primitive type */
+    struct Type : public PrimitiveType
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+   public:
+    /* primitive supports multiple time segments */
+    static const bool singleTimeSegment = false;
+
+    /* Returns maximum number of stored points */
+    static __forceinline size_t max_size()
+    {
+      return M;
+    }
+
+    /* Returns required number of primitive blocks for N points (N / M, rounded up) */
+    static __forceinline size_t blocks(size_t N)
+    {
+      return (N + max_size() - 1) / max_size();
+    }
+
+    /* Returns required number of bytes for N points */
+    static __forceinline size_t bytes(size_t N)
+    {
+      return blocks(N) * sizeof(PointMi);
+    }
+
+   public:
+    /* Default constructor; leaves all members uninitialized */
+    __forceinline PointMi() {}
+
+    /* Construction from IDs, geometry type and point count; only geomIDs[0] is stored, so all geomIDs must be equal (asserted) */
+    __forceinline PointMi(const vuint<M>& geomIDs, const vuint<M>& primIDs, Geometry::GType gtype, uint32_t numPrimitives)
+        : gtype((unsigned char)gtype),
+          numPrimitives(numPrimitives),
+          sharedGeomID(geomIDs[0]),
+          primIDs(primIDs)
+    {
+      assert(all(vuint<M>(geomID()) == geomIDs));
+    }
+
+    /* Returns a mask that tells which points are valid */
+    __forceinline vbool<M> valid() const {
+      return vint<M>(step) < vint<M>(numPrimitives);  // lane i set iff i < numPrimitives, assuming step expands to 0,1,2,...
+    }
+
+    /* Returns the same validity mask, Mx lanes wide */
+    template<int Mx> __forceinline vbool<Mx> valid() const {
+      return vint<Mx>(step) < vint<Mx>(numPrimitives);
+    }
+
+    /* Returns if the specified point is valid */
+    __forceinline bool valid(const size_t i) const
+    {
+      assert(i < M);
+      return i < numPrimitives;
+    }
+
+    /* Returns the number of stored points */
+    __forceinline size_t size() const {
+      return numPrimitives;
+    }
+
+    __forceinline unsigned int geomID(unsigned int i = 0) const {  // i is ignored: every slot shares sharedGeomID
+      return sharedGeomID;
+    }
+
+    __forceinline vuint<M>& primID() {  // all M primitive IDs, mutable
+      return primIDs;
+    }
+    __forceinline const vuint<M>& primID() const {  // all M primitive IDs, read-only
+      return primIDs;
+    }
+    __forceinline unsigned int primID(const size_t i) const {  // primitive ID stored in slot i
+      assert(i < M);
+      return primIDs[i];
+    }
+
+    /* gather the point vertices (and optionally normals); only declared here */
+    __forceinline void gather(Vec4vf<M>& p0, const Points* geom) const;
+    __forceinline void gather(Vec4vf<M>& p0, Vec3vf<M>& n0, const Points* geom) const;
+
+    __forceinline void gatheri(Vec4vf<M>& p0, const Points* geom, const int itime) const;
+    __forceinline void gatheri(Vec4vf<M>& p0, Vec3vf<M>& n0, const Points* geom, const int itime) const;
+
+    __forceinline void gather(Vec4vf<M>& p0, const Points* geom, float time) const;
+    __forceinline void gather(Vec4vf<M>& p0, Vec3vf<M>& n0, const Points* geom, float time) const;
+
+    /* Calculate the bounds of the valid points for itime */
+    __forceinline const BBox3fa bounds(const Scene* scene, size_t itime = 0) const
+    {
+      BBox3fa bounds = empty;
+      for (size_t i = 0; i < M && valid(i); i++) {
+        const Points* geom = scene->get<Points>(geomID(i));
+        bounds.extend(geom->bounds(primID(i),itime));
+      }
+      return bounds;
+    }
+
+    /* Calculate the linear bounds of the primitive from the bounds at itime and itime+1 */
+    __forceinline LBBox3fa linearBounds(const Scene* scene, size_t itime) {
+      return LBBox3fa(bounds(scene, itime + 0), bounds(scene, itime + 1));
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene* const scene, size_t itime, size_t numTimeSteps)  // union of the per-point linear bounds reported by the geometry
+    {
+      LBBox3fa allBounds = empty;
+      for (size_t i = 0; i < M && valid(i); i++) {
+        const Points* geom = scene->get<Points>(geomID(i));
+        allBounds.extend(geom->linearBounds(primID(i), itime, numTimeSteps));
+      }
+      return allBounds;
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene* const scene, const BBox1f time_range)  // union of the per-point linear bounds for time_range
+    {
+      LBBox3fa allBounds = empty;
+      for (size_t i = 0; i < M && valid(i); i++) {
+        const Points* geom = scene->get<Points>(geomID((unsigned int)i));
+        allBounds.extend(geom->linearBounds(primID(i), time_range));
+      }
+      return allBounds;
+    }
+
+    /* Fill this block with up to M primitive references taken from prims[begin..end) and advance begin */
+    template<typename PrimRefT>
+    __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      Geometry::GType gty = scene->get(prims[begin].geomID())->getType();
+      vuint<M> geomID, primID;
+      /* slots beyond the available references are padded with a copy of the previous slot */
+      const PrimRefT* prim = &prims[begin];
+
+      int numPrimitives = 0;
+      for (size_t i = 0; i < M; i++) {
+        if (begin < end) {
+          geomID[i] = prim->geomID();
+          primID[i] = prim->primID();
+          begin++;
+          numPrimitives++;
+        } else {
+          assert(i);
+          if (i > 0) {
+            geomID[i] = geomID[i - 1];
+            primID[i] = primID[i - 1];
+          }
+        }
+        if (begin < end)
+          prim = &prims[begin];  // FIXME: remove this line
+      }
+      new (this) PointMi(geomID, primID, gty, numPrimitives);  // FIXME: use non temporal store
+    }
+
+    template<typename BVH, typename Allocator>
+    __forceinline static typename BVH::NodeRef createLeaf(BVH* bvh,  // allocates enough blocks for the set, fills them in order and returns the encoded leaf
+                                                          const PrimRef* prims,
+                                                          const range<size_t>& set,
+                                                          const Allocator& alloc)
+    {
+      size_t start    = set.begin();
+      size_t items    = PointMi::blocks(set.size());
+      size_t numbytes = PointMi::bytes(set.size());
+      PointMi* accel  = (PointMi*)alloc.malloc1(numbytes, M * sizeof(float));
+      for (size_t i = 0; i < items; i++) {
+        accel[i].fill(prims, start, set.end(), bvh->scene);  // fill() advances start past the references it consumed
+      }
+      return bvh->encodeLeaf((char*)accel, items);
+    };
+
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime)  // fill(), then the linear bounds for itime
+    {
+      fill(prims, begin, end, scene);
+      return linearBounds(scene, itime);
+    }
+
+    __forceinline LBBox3fa fillMB(  // fill(), then the linear bounds for time_range
+        const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range)
+    {
+      fill(prims, begin, end, scene);
+      return linearBounds(scene, time_range);
+    }
+
+    template<typename BVH, typename SetMB, typename Allocator>
+    __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc)  // like createLeaf, and also merges the linear bounds of all blocks
+    {
+      size_t start                     = prims.object_range.begin();
+      size_t end                       = prims.object_range.end();
+      size_t items                     = PointMi::blocks(prims.object_range.size());
+      size_t numbytes                  = PointMi::bytes(prims.object_range.size());
+      PointMi* accel                   = (PointMi*)alloc.malloc1(numbytes, M * sizeof(float));
+      const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel, items);
+
+      LBBox3fa bounds = empty;
+      for (size_t i = 0; i < items; i++)
+        bounds.extend(accel[i].fillMB(prims.prims->data(), start, end, bvh->scene, prims.time_range));
+
+      return typename BVH::NodeRecordMB4D(node, bounds, prims.time_range);
+    };
+
+    /*! output operator; prints the shared geometry ID and the primitive IDs (no vertex data is stored in the block) */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const PointMi& point)
+    {
+      return cout << "Point" << M << "i {" << point.geomID() << ", " << point.primID() << "}";
+    }
+
+   public:
+    unsigned char gtype;          // Geometry::GType of the geometry, narrowed to one byte
+    unsigned char numPrimitives;  // number of populated slots
+    unsigned int sharedGeomID;    // geometry ID shared by all slots
+
+   private:
+    vuint<M> primIDs;  // primitive ID
+  };
+
+  template<>  // 4-wide gather of the vertices referenced by the four primitive IDs
+  __forceinline void PointMi<4>::gather(Vec4vf4& p0, const Points* geom) const
+  {
+    const vfloat4 a0   = vfloat4::loadu(geom->vertexPtr(primID(0)));  // one unaligned vfloat4 load per slot; all four slots are read, valid or not
+    const vfloat4 a1   = vfloat4::loadu(geom->vertexPtr(primID(1)));
+    const vfloat4 a2   = vfloat4::loadu(geom->vertexPtr(primID(2)));
+    const vfloat4 a3   = vfloat4::loadu(geom->vertexPtr(primID(3)));
+    transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w);  // per-point rows -> per-component lanes
+  }
+
+  template<>  // 4-wide gather of vertices and normals
+  __forceinline void PointMi<4>::gather(Vec4vf4& p0, Vec3vf4& n0, const Points* geom) const
+  {
+    const vfloat4 a0   = vfloat4::loadu(geom->vertexPtr(primID(0)));  // one unaligned vfloat4 load per slot
+    const vfloat4 a1   = vfloat4::loadu(geom->vertexPtr(primID(1)));
+    const vfloat4 a2   = vfloat4::loadu(geom->vertexPtr(primID(2)));
+    const vfloat4 a3   = vfloat4::loadu(geom->vertexPtr(primID(3)));
+    transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w);  // per-point rows -> per-component lanes
+    const vfloat4 b0 = vfloat4(geom->normal(primID(0)));  // normals are converted to vfloat4 from the value geom->normal() returns
+    const vfloat4 b1 = vfloat4(geom->normal(primID(1)));
+    const vfloat4 b2 = vfloat4(geom->normal(primID(2)));
+    const vfloat4 b3 = vfloat4(geom->normal(primID(3)));
+    transpose(b0, b1, b2, b3, n0.x, n0.y, n0.z);  // only three output components for the normals
+  }
+
+  template<>  // 4-wide vertex gather that passes itime through to vertexPtr
+  __forceinline void PointMi<4>::gatheri(Vec4vf4& p0, const Points* geom, const int itime) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime));  // one unaligned vfloat4 load per slot
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime));
+    transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w);  // per-point rows -> per-component lanes
+  }
+
+  template<>  // 4-wide gather of vertices and normals that passes itime through to vertexPtr/normal
+  __forceinline void PointMi<4>::gatheri(Vec4vf4& p0, Vec3vf4& n0, const Points* geom, const int itime) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime));  // one unaligned vfloat4 load per slot
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime));
+    transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w);  // per-point rows -> per-component lanes
+    const vfloat4 b0 = vfloat4(geom->normal(primID(0), itime));
+    const vfloat4 b1 = vfloat4(geom->normal(primID(1), itime));
+    const vfloat4 b2 = vfloat4(geom->normal(primID(2), itime));
+    const vfloat4 b3 = vfloat4(geom->normal(primID(3), itime));
+    transpose(b0, b1, b2, b3, n0.x, n0.y, n0.z);  // only three output components for the normals
+  }
+
+  template<>  // 4-wide gather at a float time: interpolates between the two gatheri results of the enclosing time segment
+  __forceinline void PointMi<4>::gather(Vec4vf4& p0, const Points* geom, float time) const
+  {
+    float frac;  // interpolation weight, written by timeSegment
+    const int seg = geom->timeSegment(time, frac);
+    Vec4vf4 seg_begin, seg_end;
+    gatheri(seg_begin, geom, seg);
+    gatheri(seg_end, geom, seg + 1);
+    p0 = lerp(seg_begin, seg_end, vfloat4(frac));
+  }
+
+  template<>  // 4-wide gather of vertices and normals at a float time
+  __forceinline void PointMi<4>::gather(Vec4vf4& p0, Vec3vf4& n0, const Points* geom, float time) const
+  {
+    float frac;  // interpolation weight, written by timeSegment
+    const int seg = geom->timeSegment(time, frac);
+    const vfloat4 weight(frac);
+    /* vertices and normals at both ends of the time segment */
+    Vec4vf4 pos_begin, pos_end; Vec3vf4 nrm_begin, nrm_end;
+    gatheri(pos_begin, nrm_begin, geom, seg);
+    gatheri(pos_end, nrm_end, geom, seg + 1);
+    p0 = lerp(pos_begin, pos_end, weight);
+    n0 = lerp(nrm_begin, nrm_end, weight);
+  }
+
+#if defined(__AVX__)
+
+  template<>  // 8-wide gather of the vertices referenced by the eight primitive IDs (compiled only when __AVX__ is defined)
+  __forceinline void PointMi<8>::gather(Vec4vf8& p0, const Points* geom) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0)));  // one unaligned vfloat4 load per slot; all eight slots are read, valid or not
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1)));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2)));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3)));
+    const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4)));
+    const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5)));
+    const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6)));
+    const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7)));
+    transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w);  // eight per-point rows -> four 8-wide component lanes
+  }
+
+  template<>  // 8-wide gather of vertices and normals
+  __forceinline void PointMi<8>::gather(Vec4vf8& p0, Vec3vf8& n0, const Points* geom) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0)));  // one unaligned vfloat4 load per slot
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1)));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2)));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3)));
+    const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4)));
+    const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5)));
+    const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6)));
+    const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7)));
+    transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w);  // eight per-point rows -> four 8-wide component lanes
+    const vfloat4 b0 = vfloat4(geom->normal(primID(0)));  // normals are converted to vfloat4 from the value geom->normal() returns
+    const vfloat4 b1 = vfloat4(geom->normal(primID(1)));
+    const vfloat4 b2 = vfloat4(geom->normal(primID(2)));
+    const vfloat4 b3 = vfloat4(geom->normal(primID(3)));
+    const vfloat4 b4 = vfloat4(geom->normal(primID(4)));
+    const vfloat4 b5 = vfloat4(geom->normal(primID(5)));
+    const vfloat4 b6 = vfloat4(geom->normal(primID(6)));
+    const vfloat4 b7 = vfloat4(geom->normal(primID(7)));
+    transpose(b0, b1, b2, b3, b4, b5, b6, b7, n0.x, n0.y, n0.z);  // only three output components for the normals
+  }
+
+  template<>  // 8-wide vertex gather that passes itime through to vertexPtr
+  __forceinline void PointMi<8>::gatheri(Vec4vf8& p0, const Points* geom, const int itime) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime));  // one unaligned vfloat4 load per slot
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime));
+    const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4), itime));
+    const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5), itime));
+    const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6), itime));
+    const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7), itime));
+    transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w);  // eight per-point rows -> four 8-wide component lanes
+  }
+
+  template<>  // 8-wide gather of vertices and normals that passes itime through to vertexPtr/normal
+  __forceinline void PointMi<8>::gatheri(Vec4vf8& p0, Vec3vf8& n0, const Points* geom, const int itime) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime));  // one unaligned vfloat4 load per slot
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime));
+    const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4), itime));
+    const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5), itime));
+    const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6), itime));
+    const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7), itime));
+    transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w);  // eight per-point rows -> four 8-wide component lanes
+    const vfloat4 b0 = vfloat4(geom->normal(primID(0), itime));
+    const vfloat4 b1 = vfloat4(geom->normal(primID(1), itime));
+    const vfloat4 b2 = vfloat4(geom->normal(primID(2), itime));
+    const vfloat4 b3 = vfloat4(geom->normal(primID(3), itime));
+    const vfloat4 b4 = vfloat4(geom->normal(primID(4), itime));
+    const vfloat4 b5 = vfloat4(geom->normal(primID(5), itime));
+    const vfloat4 b6 = vfloat4(geom->normal(primID(6), itime));
+    const vfloat4 b7 = vfloat4(geom->normal(primID(7), itime));
+    transpose(b0, b1, b2, b3, b4, b5, b6, b7, n0.x, n0.y, n0.z);  // only three output components for the normals
+  }
+
+  template<>  // 8-wide gather at a float time: interpolates between the two gatheri results of the enclosing time segment
+  __forceinline void PointMi<8>::gather(Vec4vf8& p0, const Points* geom, float time) const
+  {
+    /* timeSegment returns the segment index and writes the interpolation weight to frac */
+    float frac;
+    const int seg = geom->timeSegment(time, frac);
+    const vfloat8 weight(frac);
+    Vec4vf8 seg_begin, seg_end;
+    gatheri(seg_begin, geom, seg);
+    gatheri(seg_end, geom, seg + 1);
+    p0 = lerp(seg_begin, seg_end, weight);
+  }
+
+  template<>  // 8-wide gather of vertices and normals at a float time
+  __forceinline void PointMi<8>::gather(Vec4vf8& p0, Vec3vf8& n0, const Points* geom, float time) const
+  {
+    float frac;  // interpolation weight, written by timeSegment
+    const int seg = geom->timeSegment(time, frac);
+    const vfloat8 weight(frac);
+    /* vertices and normals at both ends of the time segment */
+    Vec4vf8 pos_begin, pos_end; Vec3vf8 nrm_begin, nrm_end;
+    gatheri(pos_begin, nrm_begin, geom, seg);
+    gatheri(pos_end, nrm_end, geom, seg + 1);
+    p0 = lerp(pos_begin, pos_end, weight);
+    n0 = lerp(nrm_begin, nrm_end, weight);
+  }
+#endif
+
+  template<int M>
+  typename PointMi<M>::Type PointMi<M>::type;  // out-of-class definition of the static member PointMi<M>::type
+
+  typedef PointMi<4> Point4i;  // 4-wide block
+  typedef PointMi<8> Point8i;  // 8-wide block
+  
+}  // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/primitive.h b/thirdparty/embree-aarch64/kernels/geometry/primitive.h
new file mode 100644
index 0000000000..41e5b2b304
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/primitive.h
@@ -0,0 +1,49 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/scene.h"
+#include "../../common/simd/simd.h"
+#include "../common/primref.h"
+#include "../common/primref_mb.h"
+
+namespace embree
+{
+  struct PrimitiveType  // abstract interface: every member below is pure virtual
+  {
+    /*! Returns the name of this primitive type. */
+    virtual const char* name() const = 0;
+    
+    /*! Returns the number of stored active primitives in the block pointed to by This. */
+    virtual size_t sizeActive(const char* This) const = 0;
+
+    /*! Returns the number of stored active and inactive primitives in the block pointed to by This. */
+    virtual size_t sizeTotal(const char* This) const = 0;
+
+    /*! Returns the number of bytes of the block pointed to by This. */
+    virtual size_t getBytes(const char* This) const = 0;
+  };
+  
+  template<typename Primitive>
+  struct PrimitivePointQuery1
+  {
+    /*! Calls pointQuery on the accel of each leading valid item of prim; returns true if any call returned true. */
+    static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim)
+    {
+      bool anyChanged = false;
+      for (size_t item = 0; item < Primitive::max_size() && prim.valid(item); ++item)  // stops at the first invalid item
+      {
+        STAT3(point_query.trav_prims,1,1,1);
+        AccelSet* const accelSet = (AccelSet*)context->scene->get(prim.geomID(item));
+        context->geomID = prim.geomID(item);
+        context->primID = prim.primID(item);
+        if (accelSet->pointQuery(query, context)) anyChanged = true;
+      }
+      return anyChanged;
+    }
+    /*! Counterpart with an empty body: all arguments are ignored. */
+    static __forceinline void pointQueryNoop(PointQuery* query, PointQueryContext* context, const Primitive& prim) { }
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/primitive4.cpp b/thirdparty/embree-aarch64/kernels/geometry/primitive4.cpp
new file mode 100644
index 0000000000..f93574c9c8
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/primitive4.cpp
@@ -0,0 +1,379 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "primitive.h"
+#include "curveNv.h"
+#include "curveNi.h"
+#include "curveNi_mb.h"
+#include "linei.h"
+#include "triangle.h"
+#include "trianglev.h"
+#include "trianglev_mb.h"
+#include "trianglei.h"
+#include "quadv.h"
+#include "quadi.h"
+#include "subdivpatch1.h"
+#include "object.h"
+#include "instance.h"
+#include "subgrid.h"
+
+namespace embree
+{
+  /********************** Curve4v **************************/
+
+  template<>
+  const char* Curve4v::Type::name () const {
+    return "curve4v";  // identifier reported for this primitive type
+  }
+
+  template<>
+  size_t Curve4v::Type::sizeActive(const char* This) const
+  {
+    /* the first byte of the block, masked with GTY_BASIS_MASK, tells which layout is stored */
+    const bool linear = (*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR;
+    if (!linear) return ((Curve4v*)This)->N;
+    return ((Line4i*)This)->size();
+  }
+
+  template<>
+  size_t Curve4v::Type::sizeTotal(const char* This) const
+  {
+    /* linear blocks always report 4 entries, all other blocks report their N field */
+    const bool linear = (*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR;
+    if (!linear) return ((Curve4v*)This)->N;
+    return 4;
+  }
+
+  template<>
+  size_t Curve4v::Type::getBytes(const char* This) const
+  {
+    /* the byte count is computed by the layout that matches the basis */
+    const bool linear = (*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR;
+    if (!linear) return Curve4v::bytes(sizeActive(This));
+    return Line4i::bytes(sizeActive(This));
+  }
+
+  /********************** Curve4i **************************/
+
+  template<>
+  const char* Curve4i::Type::name () const {
+    return "curve4i";  // identifier reported for this primitive type
+  }
+
+  template<>
+  size_t Curve4i::Type::sizeActive(const char* This) const
+  {
+    /* the first byte of the block, masked with GTY_BASIS_MASK, tells which layout is stored */
+    const bool linear = (*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR;
+    if (!linear) return ((Curve4i*)This)->N;
+    return ((Line4i*)This)->size();
+  }
+
+  template<>
+  size_t Curve4i::Type::sizeTotal(const char* This) const
+  {
+    /* linear blocks always report 4 entries, all other blocks report their N field */
+    const bool linear = (*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR;
+    if (!linear) return ((Curve4i*)This)->N;
+    return 4;
+  }
+
+  template<>
+  size_t Curve4i::Type::getBytes(const char* This) const
+  {
+    /* the byte count is computed by the layout that matches the basis */
+    const bool linear = (*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR;
+    if (!linear) return Curve4i::bytes(sizeActive(This));
+    return Line4i::bytes(sizeActive(This));
+  }
+
+  /********************** Curve4iMB **************************/
+
+  template<>
+  const char* Curve4iMB::Type::name () const {
+    return "curve4imb";  // identifier reported for this primitive type
+  }
+
+  template<>
+  size_t Curve4iMB::Type::sizeActive(const char* This) const
+  {
+    /* the first byte of the block, masked with GTY_BASIS_MASK, tells which layout is stored */
+    const bool linear = (*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR;
+    if (!linear) return ((Curve4iMB*)This)->N;
+    return ((Line4i*)This)->size();
+  }
+
+  template<>
+  size_t Curve4iMB::Type::sizeTotal(const char* This) const
+  {
+    /* linear blocks always report 4 entries, all other blocks report their N field */
+    const bool linear = (*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR;
+    if (!linear) return ((Curve4iMB*)This)->N;
+    return 4;
+  }
+
+  template<>
+  size_t Curve4iMB::Type::getBytes(const char* This) const
+  {
+    /* the byte count is computed by the layout that matches the basis */
+    const bool linear = (*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR;
+    if (!linear) return Curve4iMB::bytes(sizeActive(This));
+    return Line4i::bytes(sizeActive(This));
+  }
+
+  /********************** Line4i **************************/
+
+  template<>
+  const char* Line4i::Type::name () const {
+    return "line4i";  // identifier reported for this primitive type
+  }
+
+  template<>
+  size_t Line4i::Type::sizeActive(const char* This) const {
+    return ((Line4i*)This)->size();  // treat the raw block as a Line4i and forward to its size()
+  }
+
+  template<>
+  size_t Line4i::Type::sizeTotal(const char* This) const {
+    return 4;  // constant; This is not inspected
+  }
+
+  template<>
+  size_t Line4i::Type::getBytes(const char* This) const {
+    return sizeof(Line4i);  // fixed-size block; This is not inspected
+  }
+
+  /********************** Triangle4 **************************/
+
+  template<>
+  const char* Triangle4::Type::name () const {
+    return "triangle4";  // identifier reported for this primitive type
+  }
+
+  template<>
+  size_t Triangle4::Type::sizeActive(const char* This) const {
+    return ((Triangle4*)This)->size();  // treat the raw block as a Triangle4 and forward to its size()
+  }
+
+  template<>
+  size_t Triangle4::Type::sizeTotal(const char* This) const {
+    return 4;  // constant; This is not inspected
+  }
+
+  template<>
+  size_t Triangle4::Type::getBytes(const char* This) const {
+    return sizeof(Triangle4);  // fixed-size block; This is not inspected
+  }
+
+  /********************** Triangle4v **************************/
+
+  template<>
+  const char* Triangle4v::Type::name () const {
+    return "triangle4v";  // identifier reported for this primitive type
+  }
+
+  template<>
+  size_t Triangle4v::Type::sizeActive(const char* This) const {
+    return ((Triangle4v*)This)->size();  // treat the raw block as a Triangle4v and forward to its size()
+  }
+
+  template<>
+  size_t Triangle4v::Type::sizeTotal(const char* This) const {
+    return 4;  // constant; This is not inspected
+  }
+
+  template<>
+  size_t Triangle4v::Type::getBytes(const char* This) const {
+    return sizeof(Triangle4v);  // fixed-size block; This is not inspected
+  }
+
+  /********************** Triangle4i **************************/
+
+  template<>
+  const char* Triangle4i::Type::name () const {
+    return "triangle4i";  // identifier reported for this primitive type
+  }
+
+  template<>
+  size_t Triangle4i::Type::sizeActive(const char* This) const {
+    return ((Triangle4i*)This)->size();  // treat the raw block as a Triangle4i and forward to its size()
+  }
+
+  template<>
+  size_t Triangle4i::Type::sizeTotal(const char* This) const {
+    return 4;  // constant; This is not inspected
+  }
+
+  template<>
+  size_t Triangle4i::Type::getBytes(const char* This) const {
+    return sizeof(Triangle4i);  // fixed-size block; This is not inspected
+  }
+
+  /********************** Triangle4vMB **************************/
+
+  template<>
+  const char* Triangle4vMB::Type::name () const {
+    return  "triangle4vmb";  // identifier reported for this primitive type
+  }
+
+  template<>
+  size_t Triangle4vMB::Type::sizeActive(const char* This) const {
+    return ((Triangle4vMB*)This)->size();  // treat the raw block as a Triangle4vMB and forward to its size()
+  }
+
+  template<>
+  size_t Triangle4vMB::Type::sizeTotal(const char* This) const {
+    return 4;  // constant; This is not inspected
+  }
+
+  template<>
+  size_t Triangle4vMB::Type::getBytes(const char* This) const {
+    return sizeof(Triangle4vMB);  // fixed-size block; This is not inspected
+  }
+
+  /********************** Quad4v **************************/
+
+  template<>
+  const char* Quad4v::Type::name () const {
+    return "quad4v";  // identifier reported for this primitive type
+  }
+
+  template<>
+  size_t Quad4v::Type::sizeActive(const char* This) const {
+    return ((Quad4v*)This)->size();  // treat the raw block as a Quad4v and forward to its size()
+  }
+
+  template<>
+  size_t Quad4v::Type::sizeTotal(const char* This) const {
+    return 4;  // constant; This is not inspected
+  }
+
+  template<>
+  size_t Quad4v::Type::getBytes(const char* This) const {
+    return sizeof(Quad4v);  // fixed-size block; This is not inspected
+  }
+
+  /********************** Quad4i **************************/
+
+  template<>
+  const char* Quad4i::Type::name () const {
+    return "quad4i";  // identifier reported for this primitive type
+  }
+
+  template<>
+  size_t Quad4i::Type::sizeActive(const char* This) const {
+    return ((Quad4i*)This)->size();  // treat the raw block as a Quad4i and forward to its size()
+  }
+
+  template<>
+  size_t Quad4i::Type::sizeTotal(const char* This) const {
+    return 4;  // constant; This is not inspected
+  }
+
+  template<>
+  size_t Quad4i::Type::getBytes(const char* This) const {
+    return sizeof(Quad4i);  // fixed-size block; This is not inspected
+  }
+
+  /********************** SubdivPatch1 **************************/
+
+  const char* SubdivPatch1::Type::name () const {
+    return "subdivpatch1";  // identifier reported for this primitive type
+  }
+
+  size_t SubdivPatch1::Type::sizeActive(const char* This) const {
+    return 1;  // constant; This is not inspected
+  }
+
+  size_t SubdivPatch1::Type::sizeTotal(const char* This) const {
+    return 1;  // constant; This is not inspected
+  }
+
+  size_t SubdivPatch1::Type::getBytes(const char* This) const {
+    return sizeof(SubdivPatch1);  // fixed size; This is not inspected
+  }
+
+  SubdivPatch1::Type SubdivPatch1::type;  // definition of the static member 'type'
+
+  /********************** Virtual Object **************************/
+
+  const char* Object::Type::name () const {
+    return "object";  // identifier reported for this primitive type
+  }
+
+  size_t Object::Type::sizeActive(const char* This) const {
+    return 1;  // constant; This is not inspected
+  }
+
+  size_t Object::Type::sizeTotal(const char* This) const {
+    return 1;  // constant; This is not inspected
+  }
+
+  size_t Object::Type::getBytes(const char* This) const {
+    return sizeof(Object);  // fixed size; This is not inspected
+  }
+
+  Object::Type Object::type;  // definition of the static member 'type'
+
+  /********************** Instance **************************/
+
+  const char* InstancePrimitive::Type::name () const {
+    return "instance";  // identifier reported for this primitive type
+  }
+
+  size_t InstancePrimitive::Type::sizeActive(const char* This) const {
+    return 1;  // constant; This is not inspected
+  }
+
+  size_t InstancePrimitive::Type::sizeTotal(const char* This) const {
+    return 1;  // constant; This is not inspected
+  }
+
+  size_t InstancePrimitive::Type::getBytes(const char* This) const {
+    return sizeof(InstancePrimitive);  // fixed size; This is not inspected
+  }
+
+  InstancePrimitive::Type InstancePrimitive::type;  // definition of the static member 'type'
+
+  /********************** SubGrid **************************/
+
+  const char* SubGrid::Type::name () const {
+    return "subgrid";  // identifier reported for this primitive type
+  }
+
+  size_t SubGrid::Type::sizeActive(const char* This) const {
+    return 1;  // constant; This is not inspected
+  }
+
+  size_t SubGrid::Type::sizeTotal(const char* This) const {
+    return 1;  // constant; This is not inspected
+  }
+
+  size_t SubGrid::Type::getBytes(const char* This) const {
+    return sizeof(SubGrid);  // fixed size; This is not inspected
+  }
+
+  SubGrid::Type SubGrid::type;  // definition of the static member 'type'
+  
+  /********************** SubGridQBVH4 **************************/
+
+  template<>
+  const char* SubGridQBVH4::Type::name () const {
+    return "SubGridQBVH4";  // identifier reported for this primitive type
+  }
+
+  template<>
+  size_t SubGridQBVH4::Type::sizeActive(const char* This) const {
+    return 1;  // constant; This is not inspected
+  }
+
+  template<>
+  size_t SubGridQBVH4::Type::sizeTotal(const char* This) const {
+    return 1;  // constant; This is not inspected
+  }
+
+  template<>
+  size_t SubGridQBVH4::Type::getBytes(const char* This) const {
+    return sizeof(SubGridQBVH4);  // fixed size; This is not inspected
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quad_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector.h
new file mode 100644
index 0000000000..57ff4e60e5
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector.h
@@ -0,0 +1,76 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Intersects a ray with a quad with backface culling
+     *  enabled. The quad v0,v1,v2,v3 is split into two triangles
+     *  v0,v1,v3 and v2,v3,v1. The shared diagonal v1,v3 decides which
+     *  of the two triangles gets intersected. */
+    template<int N>
+    __forceinline vbool<N> intersect_quad_backface_culling(const vbool<N>& valid0,
+                                                           const Vec3fa& ray_org,
+                                                           const Vec3fa& ray_dir,
+                                                           const float ray_tnear,
+                                                           const float ray_tfar,
+                                                           const Vec3vf<N>& quad_v0,
+                                                           const Vec3vf<N>& quad_v1,
+                                                           const Vec3vf<N>& quad_v2,
+                                                           const Vec3vf<N>& quad_v3,
+                                                           vfloat<N>& u_o,
+                                                           vfloat<N>& v_o,
+                                                           vfloat<N>& t_o)
+    {
+      /* calculate vertices relative to ray origin */
+      vbool<N> valid = valid0;
+      const Vec3vf<N> O = Vec3vf<N>(ray_org);
+      const Vec3vf<N> D = Vec3vf<N>(ray_dir);
+      const Vec3vf<N> va = quad_v0-O;
+      const Vec3vf<N> vb = quad_v1-O;
+      const Vec3vf<N> vc = quad_v2-O;
+      const Vec3vf<N> vd = quad_v3-O;
+
+      const Vec3vf<N> edb = vb-vd;  // diagonal from v3 to v1, shared by both triangles
+      const vfloat<N> WW = dot(cross(vd,edb),D);  // edge test of the ray against that diagonal
+      const Vec3vf<N> v0 = select(WW <= 0.0f,va,vc);  // per lane, the test WW <= 0 chooses between the
+      const Vec3vf<N> v1 = select(WW <= 0.0f,vb,vd);  // vertex triple (va,vb,vd) and the vertex
+      const Vec3vf<N> v2 = select(WW <= 0.0f,vd,vb);  // triple (vc,vd,vb)
+
+      /* calculate edges */
+      const Vec3vf<N> e0 = v2-v0;
+      const Vec3vf<N> e1 = v0-v1;
+
+      /* perform edge tests */
+      const vfloat<N> U = dot(cross(v0,e0),D);
+      const vfloat<N> V = dot(cross(v1,e1),D);
+      valid &= max(U,V) <= 0.0f;  // a lane stays valid only if both U and V are <= 0
+      if (unlikely(none(valid))) return false;
+
+      /* calculate geometry normal and denominator */
+      const Vec3vf<N> Ng = cross(e1,e0);
+      const vfloat<N> den = dot(Ng,D);
+      const vfloat<N> rcpDen = rcp(den);  // computed before the den != 0 test; such lanes are masked out below
+
+      /* perform depth test */
+      const vfloat<N> t = rcpDen*dot(v0,Ng);
+      valid &= vfloat<N>(ray_tnear) <= t & t <= vfloat<N>(ray_tfar);  // '<=' binds tighter than '&': (tnear <= t) & (t <= tfar)
+      if (unlikely(none(valid))) return false;
+
+      /* avoid division by 0 */
+      valid &= den != vfloat<N>(zero);
+      if (unlikely(none(valid))) return false;
+
+      /* update hit information */
+      t_o = t;  // outputs are written for all lanes; the returned mask tells which ones are hits
+      u_o = U * rcpDen;
+      v_o = V * rcpDen;
+      u_o = select(WW <= 0.0f,u_o,1.0f-u_o);  // per lane, the same WW test chooses between u and 1-u
+      v_o = select(WW <= 0.0f,v_o,1.0f-v_o);  // per lane, the same WW test chooses between v and 1-v
+      return valid;
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_moeller.h b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_moeller.h
new file mode 100644
index 0000000000..74e8c7720c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_moeller.h
@@ -0,0 +1,566 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "quadv.h"
+#include "triangle_intersector_moeller.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+    struct QuadHitM  // M-wide hit record; finalize() turns the stored U, V, T, absDen into vu, vv, vt, vNg
+    {
+      __forceinline QuadHitM() {}
+
+      __forceinline QuadHitM(const vbool<M>& valid,
+                             const vfloat<M>& U,
+                             const vfloat<M>& V,
+                             const vfloat<M>& T,
+                             const vfloat<M>& absDen,
+                             const Vec3vf<M>& Ng,
+                             const vbool<M>& flags)
+        : U(U), V(V), T(T), absDen(absDen), tri_Ng(Ng), valid(valid), flags(flags) {}
+
+      __forceinline void finalize()
+      {
+        const vfloat<M> rcpAbsDen = rcp(absDen);
+        vt = T * rcpAbsDen;  // T, U and V are all scaled by the same 1/absDen
+        const vfloat<M> u = min(U * rcpAbsDen,1.0f);  // limited to at most 1
+        const vfloat<M> v = min(V * rcpAbsDen,1.0f);  // limited to at most 1
+        const vfloat<M> u1 = vfloat<M>(1.0f) - u;
+        const vfloat<M> v1 = vfloat<M>(1.0f) - v;
+#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING)
+        vu = select(flags,u1,u);  // per lane, flags chooses between 1-u and u
+        vv = select(flags,v1,v);  // per lane, flags chooses between 1-v and v
+        vNg = Vec3vf<M>(tri_Ng.x,tri_Ng.y,tri_Ng.z);
+#else
+        const vfloat<M> flip = select(flags,vfloat<M>(-1.0f),vfloat<M>(1.0f));  // per lane -1 or +1, chosen by flags
+        vv = select(flags,u1,v);  // note the swap in this branch: vv chooses between 1-u and v
+        vu = select(flags,v1,u);  // and vu chooses between 1-v and u
+        vNg = Vec3vf<M>(flip*tri_Ng.x,flip*tri_Ng.y,flip*tri_Ng.z);  // normal scaled by flip
+#endif
+      }
+
+      __forceinline Vec2f uv(const size_t i)  // reads lane i of vu/vv, which finalize() assigns
+      {
+        const float u = vu[i];
+        const float v = vv[i];
+        return Vec2f(u,v);
+      }
+
+      __forceinline float   t(const size_t i) { return vt[i]; }  // lane i of vt
+      __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }  // lane i of vNg
+
+    private:
+      vfloat<M> U;  // inputs consumed by finalize()
+      vfloat<M> V;
+      vfloat<M> T;
+      vfloat<M> absDen;
+      Vec3vf<M> tri_Ng;
+
+    public:
+      vbool<M> valid;
+      vfloat<M> vu;  // vu, vv, vt and vNg are assigned in finalize()
+      vfloat<M> vv;
+      vfloat<M> vt;
+      Vec3vf<M> vNg;
+
+    public:
+      const vbool<M> flags;  // per-lane selector used by finalize(); the callers in this file pass false for triangle (v0,v1,v3) and true for (v2,v3,v1)
+    };
+
+    template<int K>
+    struct QuadHitK
+    {
+      __forceinline QuadHitK(const vfloat<K>& U,
+                             const vfloat<K>& V,
+                             const vfloat<K>& T,
+                             const vfloat<K>& absDen,
+                             const Vec3vf<K>& Ng,
+                             const vbool<K>& flags)
+        : U(U), V(V), T(T), absDen(absDen), flags(flags), tri_Ng(Ng) {}
+
+      __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+      {
+        const vfloat<K> invDen = rcp(absDen);
+        const vfloat<K> uClamped = min(U * invDen,1.0f);
+        const vfloat<K> vClamped = min(V * invDen,1.0f);
+        /* per lane, flags chooses between the coordinate x and its mirror 1-x */
+        const vfloat<K> uOut = select(flags,vfloat<K>(1.0f) - uClamped,uClamped);
+        const vfloat<K> vOut = select(flags,vfloat<K>(1.0f) - vClamped,vClamped);
+        const vfloat<K> tOut = T * invDen;
+        const Vec3vf<K> normal(tri_Ng.x,tri_Ng.y,tri_Ng.z);
+        /* tuple order is (u, v, t, Ng) */
+        return std::make_tuple(uOut,vOut,tOut,normal);
+      }
+
+    private:
+      const vfloat<K> U;
+      const vfloat<K> V;
+      const vfloat<K> T;
+      const vfloat<K> absDen;
+      const vbool<K> flags;
+      const Vec3vf<K> tri_Ng;
+    };
+
+    /* ----------------------------- */
+    /* -- single ray intersectors -- */
+    /* ----------------------------- */
+
+
+    template<int M, bool filter>
+    struct QuadMIntersector1MoellerTrumbore;
+
+    /*! Intersects M quads with 1 ray: each quad is tested as the two triangles (v0,v1,v3) and (v2,v3,v1) */
+    template<int M, bool filter>
+    struct QuadMIntersector1MoellerTrumbore
+    {
+      __forceinline QuadMIntersector1MoellerTrumbore() {}
+
+      __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}
+
+      __forceinline void intersect(RayHit& ray, IntersectContext* context,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                   const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        MoellerTrumboreHitM<M> triHit;
+        MoellerTrumboreIntersector1<M> triTest(ray,nullptr);
+        Intersect1EpilogM<M,M,filter> commit(ray,context,geomID,primID);
+
+        /* triangle (v0,v1,v3): its hit goes to the epilog unchanged */
+        if (triTest.intersect(ray,v0,v1,v3,triHit))
+          commit(triHit.valid,triHit);
+
+        /* triangle (v2,v3,v1): nothing left to do on a miss */
+        if (!triTest.intersect(ray,v2,v3,v1,triHit)) return;
+
+        /* U and V are replaced by absDen-U and absDen-V before the hit is reported */
+        triHit.U = triHit.absDen - triHit.U;
+        triHit.V = triHit.absDen - triHit.V;
+        commit(triHit.valid,triHit);
+      }
+
+      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                  const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                  const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        MoellerTrumboreHitM<M> triHit;
+        MoellerTrumboreIntersector1<M> triTest(ray,nullptr);
+        Occluded1EpilogM<M,M,filter> accept(ray,context,geomID,primID);
+
+        /* triangle (v0,v1,v3): finished as soon as the epilog accepts its hit */
+        if (triTest.intersect(ray,v0,v1,v3,triHit))
+        {
+          if (accept(triHit.valid,triHit)) return true;
+        }
+
+        /* triangle (v2,v3,v1): a miss here means neither triangle produced an accepted hit */
+        if (!triTest.intersect(ray,v2,v3,v1,triHit))
+          return false;
+
+        /* U and V are replaced by absDen-U and absDen-V before the epilog is asked */
+        triHit.U = triHit.absDen - triHit.U;
+        triHit.V = triHit.absDen - triHit.V;
+        if (accept(triHit.valid,triHit))
+          return true;
+        return false;
+      }
+    };
+
+#if defined(__AVX512ER__) // KNL
+
+    /*! Intersects 4 quads with 1 ray using AVX512; the vertex data is widened to 16 lanes and tested with a single 16-wide triangle intersection */
+    template<bool filter>
+    struct QuadMIntersector1MoellerTrumbore<4,filter>
+    {
+      __forceinline QuadMIntersector1MoellerTrumbore() {}
+
+      __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}  // both arguments are unused
+
+      template<typename Epilog>
+      __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)),  // lane mask 0x0f0f mixes v0 and v2 into one 16-wide vector
+                            select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)),
+                            select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z)));
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z));  // without culling v1 and v3 are used as they are in all lanes
+        const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z));
+#else
+        const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)),  // with culling v1 and v3 are mixed with the same lane mask,
+                            select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)),
+                            select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z)));
+        const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)),  // in opposite order for vtx2
+                            select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)),
+                            select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z)));
+#endif
+        const vbool16 flags(0xf0f0);  // complement of the 0x0f0f mask used above
+
+        MoellerTrumboreHitM<16> hit;
+        MoellerTrumboreIntersector1<16> intersector(ray,nullptr);
+        if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,hit))) 
+        {
+          vfloat16 U = hit.U, V = hit.V, absDen = hit.absDen;  // copies taken because hit.U and hit.V are overwritten below
+#if !defined(EMBREE_BACKFACE_CULLING)
+          hit.U = select(flags,absDen-V,U);  // per lane, flags chooses between absDen-V and U (note the U/V swap)
+          hit.V = select(flags,absDen-U,V);  // per lane, flags chooses between absDen-U and V
+          hit.vNg *= select(flags,vfloat16(-1.0f),vfloat16(1.0f)); // FIXME: use XOR
+#else
+          hit.U = select(flags,absDen-U,U);  // per lane, flags chooses between absDen-U and U
+          hit.V = select(flags,absDen-V,V);  // per lane, flags chooses between absDen-V and V
+#endif
+          if (likely(epilog(hit.valid,hit)))
+            return true;
+        }
+        return false;
+      }
+      
+      __forceinline bool intersect(RayHit& ray, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID)));  // forwards to the templated overload with an intersect epilog
+      }
+      
+      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                  const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                  const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID)));  // same test, with an occlusion epilog
+      }
+    };
+
+#elif defined(__AVX__)
+
+    /*! Intersects 4 quads with 1 ray using AVX; both triangles of each quad are tested by a single 8-wide triangle intersection */
+    template<bool filter>
+    struct QuadMIntersector1MoellerTrumbore<4,filter>
+    {
+      __forceinline QuadMIntersector1MoellerTrumbore() {}
+
+      __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}  // both arguments are unused
+      
+      template<typename Epilog>
+      __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z));  // v0 and v2 combined into one 8-wide vector
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z));  // without culling v1 and v3 are used as they are
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));        
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z));  // with culling the two halves get (v1,v3)
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));  // and (v3,v1) respectively
+#endif
+        MoellerTrumboreHitM<8> hit;
+        MoellerTrumboreIntersector1<8> intersector(ray,nullptr);
+        const vbool8 flags(0,0,0,0,1,1,1,1);  // four cleared entries followed by four set entries
+        if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,hit)))
+        {
+          vfloat8 U = hit.U, V = hit.V, absDen = hit.absDen;  // copies taken because hit.U and hit.V are overwritten below
+
+#if !defined(EMBREE_BACKFACE_CULLING)
+          hit.U = select(flags,absDen-V,U);  // per lane, flags chooses between absDen-V and U (note the U/V swap)
+          hit.V = select(flags,absDen-U,V);  // per lane, flags chooses between absDen-U and V
+          hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); // FIXME: use XOR
+#else
+          hit.U = select(flags,absDen-U,U);  // per lane, flags chooses between absDen-U and U
+          hit.V = select(flags,absDen-V,V);  // per lane, flags chooses between absDen-V and V
+#endif
+          if (unlikely(epilog(hit.valid,hit)))
+            return true;
+        }
+        return false;
+      }
+      
+      __forceinline bool intersect(RayHit& ray, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID)));  // forwards to the templated overload with an intersect epilog
+      }
+      
+      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                  const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                  const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID)));  // same test, with an occlusion epilog
+      }
+    };
+
+#endif
+
+    /* ----------------------------- */
+    /* -- ray packet intersectors -- */
+    /* ----------------------------- */
+
+
+    struct MoellerTrumboreIntersector1KTriangleM  // static helpers only; 'flags' is passed through unchanged into the QuadHitM
+    {
+      /*! Intersect k'th ray from ray packet of size K with M triangles. */
+      template<int M, int K, typename Epilog>
+      static  __forceinline bool intersect(RayK<K>& ray,
+                                           size_t k,
+                                           const Vec3vf<M>& tri_v0,
+                                           const Vec3vf<M>& tri_e1,
+                                           const Vec3vf<M>& tri_e2,
+                                           const Vec3vf<M>& tri_Ng,
+                                           const vbool<M>& flags,
+                                           const Epilog& epilog)
+      {
+        /* calculate denominator */
+        const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k);  // origin of ray k replicated to all M lanes
+        const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k);  // direction of ray k replicated to all M lanes
+        const Vec3vf<M> C = Vec3vf<M>(tri_v0) - O;
+        const Vec3vf<M> R = cross(C,D);
+        const vfloat<M> den = dot(Vec3vf<M>(tri_Ng),D);
+        const vfloat<M> absDen = abs(den);
+        const vfloat<M> sgnDen = signmsk(den);  // XOR-ed into U, V and T below
+        
+        /* perform edge tests */
+        const vfloat<M> U = dot(R,Vec3vf<M>(tri_e2)) ^ sgnDen;
+        const vfloat<M> V = dot(R,Vec3vf<M>(tri_e1)) ^ sgnDen;
+        
+        /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+        vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#else
+        vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#endif
+        if (likely(none(valid))) return false;
+        
+        /* perform depth test */
+        const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+        valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k]));  // T is compared against tnear/tfar scaled by absDen, so no division is needed here
+        if (likely(none(valid))) return false;
+        
+        /* calculate hit information */
+        QuadHitM<M> hit(valid,U,V,T,absDen,tri_Ng,flags);
+        return epilog(valid,hit);
+      }
+      
+      template<int M, int K, typename Epilog>
+      static __forceinline bool intersect1(RayK<K>& ray,  // vertex-based entry point: derives e1, e2 and Ng, then forwards to intersect()
+                                           size_t k,
+                                           const Vec3vf<M>& v0,
+                                           const Vec3vf<M>& v1,
+                                           const Vec3vf<M>& v2,
+                                           const vbool<M>& flags,
+                                           const Epilog& epilog)
+      {
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v2-v0;
+        const Vec3vf<M> Ng = cross(e2,e1);
+        return intersect(ray,k,v0,e1,e2,Ng,flags,epilog);
+      }
+    };
+
+    template<int M, int K, bool filter>
+    struct QuadMIntersectorKMoellerTrumboreBase
+    {
+      __forceinline QuadMIntersectorKMoellerTrumboreBase(const vbool<K>& valid, const RayK<K>& ray) {}  // both arguments are unused
+            
+      /*! Intersects K rays with one of M triangles, given by vertex, edges and normal. */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_e1,
+                                        const Vec3vf<K>& tri_e2,
+                                        const Vec3vf<K>& tri_Ng,
+                                        const vbool<K>& flags,
+                                        const Epilog& epilog) const
+      { 
+        /* calculate denominator */
+        vbool<K> valid = valid0;
+        const Vec3vf<K> C = tri_v0 - ray.org;
+        const Vec3vf<K> R = cross(C,ray.dir);
+        const vfloat<K> den = dot(tri_Ng,ray.dir);
+        const vfloat<K> absDen = abs(den);
+        const vfloat<K> sgnDen = signmsk(den);  // XOR-ed into U, V and T below
+        
+        /* test against edge p2 p0 */
+        const vfloat<K> U = dot(R,tri_e2) ^ sgnDen;
+        valid &= U >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* test against edge p0 p1 */
+        const vfloat<K> V = dot(R,tri_e1) ^ sgnDen;
+        valid &= V >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* test against edge p1 p2 */
+        const vfloat<K> W = absDen-U-V;
+        valid &= W >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* perform depth test */
+        const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen;
+        valid &= (absDen*ray.tnear() < T) & (T <= absDen*ray.tfar);  // T is compared against tnear/tfar scaled by absDen
+        if (unlikely(none(valid))) return false;
+        
+        /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+        valid &= den < vfloat<K>(zero);
+        if (unlikely(none(valid))) return false;
+#else
+        valid &= den != vfloat<K>(zero);
+        if (unlikely(none(valid))) return false;
+#endif
+        
+        /* calculate hit information */
+        QuadHitK<K> hit(U,V,T,absDen,tri_Ng,flags);
+        return epilog(valid,hit);
+      }
+      
+      /*! Intersects K rays with one of M triangles, given by its three vertices. */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0, 
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_v1,
+                                        const Vec3vf<K>& tri_v2,
+                                        const vbool<K>& flags,
+                                        const Epilog& epilog) const
+      {
+        const Vec3vf<K> e1 = tri_v0-tri_v1;
+        const Vec3vf<K> e2 = tri_v2-tri_v0;
+        const Vec3vf<K> Ng = cross(e2,e1);
+        return intersectK(valid0,ray,tri_v0,e1,e2,Ng,flags,epilog);  // forwards to the edge/normal overload
+      }
+
+      /*! Intersects K rays with one of M quads. */
+      template<typename Epilog>
+      __forceinline bool intersectK(const vbool<K>& valid0, 
+                                    RayK<K>& ray,
+                                    const Vec3vf<K>& v0,
+                                    const Vec3vf<K>& v1,
+                                    const Vec3vf<K>& v2,
+                                    const Vec3vf<K>& v3,
+                                    const Epilog& epilog) const
+      {
+        intersectK(valid0,ray,v0,v1,v3,vbool<K>(false),epilog);  // triangle (v0,v1,v3); the returned mask is discarded
+        if (none(valid0)) return true;  // NOTE(review): valid0 is a const reference here, so it presumably aliases a mask the epilog updates -- confirm against the epilog
+        intersectK(valid0,ray,v2,v3,v1,vbool<K>(true ),epilog);  // triangle (v2,v3,v1); the returned mask is discarded
+        return none(valid0);
+      }
+    };
+
+    template<int M, int K, bool filter>
+    struct QuadMIntersectorKMoellerTrumbore : public QuadMIntersectorKMoellerTrumboreBase<M,K,filter>
+    {
+      __forceinline QuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
+        : QuadMIntersectorKMoellerTrumboreBase<M,K,filter>(valid,ray) {}
+      /*! Intersects ray k of the packet with M quads, testing triangle (v0,v1,v3) and then (v2,v3,v1) with one shared epilog. */
+      __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                    const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        Intersect1KEpilogM<M,M,K,filter> hitEpilog(ray,k,context,geomID,primID);
+        MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool<M>(false),hitEpilog);
+        MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool<M>(true),hitEpilog);
+      }
+      /*! Occlusion query for ray k: true as soon as one of the two triangle tests returns true. */
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                   const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        Occluded1KEpilogM<M,M,K,filter> shadowEpilog(ray,k,context,geomID,primID);
+        /* || short-circuits: the second triangle is only tested when the first call returns false */
+        return MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool<M>(false),shadowEpilog)
+            || MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool<M>(true),shadowEpilog);
+      }
+    };
+
+
+#if defined(__AVX512ER__) // KNL
+
+    /*! Specialization for M=4 on AVX512 (KNL): tests both triangles of 4 quads against the k'th ray of a packet with one call on 16-lane vectors. */
+    template<int K, bool filter>
+    struct QuadMIntersectorKMoellerTrumbore<4,K,filter> : public QuadMIntersectorKMoellerTrumboreBase<4,K,filter>
+    {
+      __forceinline QuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
+        : QuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {}
+      /*! Builds 16-lane triangle vertices from the quad vertices and runs a single triangle test with the given epilog. */
+      template<typename Epilog>
+      __forceinline bool intersect1(RayK<K>& ray, size_t k,
+                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)), // first vertex: v0 or v2 per lane, selected by mask 0x0f0f
+                            select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)),
+                            select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z)));
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z)); // without culling every lane uses v1 ...
+        const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z)); // ... and v3 in the same roles
+#else
+        const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)), // with culling v1 and v3 trade places between the two lane groups
+                            select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)),
+                            select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z)));
+        const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)),
+                            select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)),
+                            select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z)));
+#endif
+        const vbool16 flags(0xf0f0); // 16-bit complement of the 0x0f0f vertex mask; presumably marks the lanes built from v2 - confirm select() semantics
+        return MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog);
+      }
+      /*! Intersection query: forwards to the templated intersect1 with an Intersect1KEpilogM<8,16,K,filter>; geomID/primID are converted to vuint8. */
+      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                    const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+      /*! Occlusion query: same packing as above, but with an Occluded1KEpilogM<8,16,K,filter>. */
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+    };
+
+#elif defined(__AVX__)
+
+    /*! Specialization for M=4 on AVX: tests both triangles of 4 quads against the k'th ray of a packet with one call on 8-lane vectors. */
+    template<int K, bool filter>
+    struct QuadMIntersectorKMoellerTrumbore<4,K,filter> : public QuadMIntersectorKMoellerTrumboreBase<4,K,filter>
+    {
+      __forceinline QuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
+        : QuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {}
+      /*! Builds 8-lane triangle vertices from the quad vertices and runs a single triangle test with the given epilog. */
+      template<typename Epilog>
+      __forceinline bool intersect1(RayK<K>& ray, size_t k,
+                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); // first vertex: v0 and v2 combined; presumably v0 in the lower and v2 in the upper four lanes - confirm
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); // without culling every lane uses v1 ...
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); // ... and v3 in the same roles
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); // with culling v1 and v3 trade places between the two halves
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+        const vbool8 flags(0,0,0,0,1,1,1,1); // four lanes clear, four set; presumably the set lanes are the ones built from v2 - confirm lane order
+        return MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog); 
+      }
+      /*! Intersection query: forwards to the templated intersect1 with an Intersect1KEpilogM<8,8,K,filter>; geomID/primID are converted to vuint8. */
+      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                    const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+      /*! Occlusion query: same packing as above, but with an Occluded1KEpilogM<8,8,K,filter>. */
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+    };
+
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_pluecker.h b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_pluecker.h
new file mode 100644
index 0000000000..7ca3aed0a0
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_pluecker.h
@@ -0,0 +1,529 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "quad_intersector_moeller.h"
+
+/*! Modified Pluecker ray/triangle intersector. The test first shifts
+ *  the ray origin into the origin of the coordinate system and then
+ *  uses Pluecker coordinates for the intersection. Due to the shift,
+ *  the Pluecker coordinate calculation simplifies and the tests get
+ *  numerically stable. The edge equations are watertight along the
+ *  edge for neighboring triangles. */
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Hit record for one ray against M quad triangles (Pluecker test); finalize() turns the stored edge-test values into per-lane u, v and normal. */
+    template<int M>
+    struct QuadHitPlueckerM
+    {
+      __forceinline QuadHitPlueckerM() {}
+
+      __forceinline QuadHitPlueckerM(const vbool<M>& valid,
+                                     const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& UVW,
+                                     const vfloat<M>& t, const Vec3vf<M>& Ng, const vbool<M>& flags)
+        : U(U), V(V), UVW(UVW), tri_Ng(Ng), valid(valid), vt(t), flags(flags) {}
+
+      /*! Fills vu, vv and vNg from U, V, UVW and tri_Ng. */
+      __forceinline void finalize()
+      {
+        /* reciprocal of UVW, forced to zero where |UVW| is below min_rcp_input */
+        const vbool<M> tooSmall = abs(UVW) < min_rcp_input;
+        const vfloat<M> invUVW = select(tooSmall,vfloat<M>(0.0f),rcp(UVW));
+
+        /* coordinates clamped to at most 1, plus their complements */
+        const vfloat<M> uTri = min(U * invUVW,1.0f);
+        const vfloat<M> uFlipped = vfloat<M>(1.0f) - uTri;
+        const vfloat<M> vTri = min(V * invUVW,1.0f);
+        const vfloat<M> vFlipped = vfloat<M>(1.0f) - vTri;
+#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING)
+        /* flagged lanes report the complemented coordinates; the normal is taken as is */
+        vu = select(flags,uFlipped,uTri);
+        vv = select(flags,vFlipped,vTri);
+        vNg = Vec3vf<M>(tri_Ng.x,tri_Ng.y,tri_Ng.z);
+#else
+        /* flagged lanes swap and complement the coordinates and negate the normal */
+        const vfloat<M> normalSign = select(flags,vfloat<M>(-1.0f),vfloat<M>(1.0f));
+        vu = select(flags,vFlipped,uTri);
+        vv = select(flags,uFlipped,vTri);
+        vNg = Vec3vf<M>(normalSign*tri_Ng.x,normalSign*tri_Ng.y,normalSign*tri_Ng.z);
+#endif
+      }
+
+      /*! u/v of lane i; vu and vv are only assigned by finalize(). */
+      __forceinline Vec2f uv(const size_t i) { return Vec2f(vu[i],vv[i]); }
+
+      __forceinline float   t(const size_t i) { return vt[i]; }
+      __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+
+    private:
+      /* raw inputs consumed by finalize() */
+      vfloat<M> U;
+      vfloat<M> V;
+      vfloat<M> UVW;
+      Vec3vf<M> tri_Ng;
+
+    public:
+      vbool<M> valid;
+      vfloat<M> vu;
+      vfloat<M> vv;
+      vfloat<M> vt;
+      Vec3vf<M> vNg;
+
+    public:
+      const vbool<M> flags;
+    };
+
+    /*! Deferred hit record for K rays against one triangle of a quad: keeps the raw edge-test values and computes u, v, t and the normal when invoked. */
+    template<int K>
+    struct QuadHitPlueckerK
+    {
+      __forceinline QuadHitPlueckerK(const vfloat<K>& U, const vfloat<K>& V, const vfloat<K>& UVW,
+                                     const vfloat<K>& t, const Vec3vf<K>& Ng, const vbool<K>& flags)
+        : U(U), V(V), UVW(UVW), t(t), flags(flags), tri_Ng(Ng) {}
+
+      /*! Returns the tuple (u, v, t, Ng); lanes with flags set report 1-u and 1-v. */
+      __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+      {
+        /* reciprocal of UVW, forced to zero where |UVW| is below min_rcp_input */
+        const vbool<K> tooSmall = abs(UVW) < min_rcp_input;
+        const vfloat<K> invUVW = select(tooSmall,vfloat<K>(0.0f),rcp(UVW));
+        /* coordinates clamped to at most 1, plus their complements */
+        const vfloat<K> uTri = min(U * invUVW,1.0f);
+        const vfloat<K> uFlipped = vfloat<K>(1.0f) - uTri;
+        const vfloat<K> vTri = min(V * invUVW,1.0f);
+        const vfloat<K> vFlipped = vfloat<K>(1.0f) - vTri;
+        const vfloat<K> u = select(flags,uFlipped,uTri);
+        const vfloat<K> v = select(flags,vFlipped,vTri);
+        const Vec3vf<K> Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z);
+        return std::make_tuple(u,v,t,Ng);
+      }
+
+    private:
+      const vfloat<K> U;
+      const vfloat<K> V;
+      const vfloat<K> UVW;
+      const vfloat<K> t;
+      const vbool<K> flags;
+      const Vec3vf<K> tri_Ng;
+    };
+
+    struct PlueckerIntersectorTriangle1 // tests one ray against M triangles (the halves of quads) using edge tests on origin-relative vertices
+    {
+      template<int M, typename Epilog> // returns false when no lane survives the tests, otherwise the result of epilog(valid,hit)
+      static __forceinline bool intersect(Ray& ray,
+                                          const Vec3vf<M>& tri_v0,
+                                          const Vec3vf<M>& tri_v1,
+                                          const Vec3vf<M>& tri_v2,
+                                          const vbool<M>& flags,
+                                          const Epilog& epilog)
+      {
+        /* calculate vertices relative to ray origin (O and D hold the single ray's origin and direction as M-wide vectors) */
+        const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org);
+        const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir);
+        const Vec3vf<M> v0 = tri_v0-O;
+        const Vec3vf<M> v1 = tri_v1-O;
+        const Vec3vf<M> v2 = tri_v2-O;
+
+        /* calculate triangle edges */
+        const Vec3vf<M> e0 = v2-v0;
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v1-v2;
+
+        /* perform edge tests; eps is a tolerance proportional to |U+V+W| */
+        const vfloat<M> U = dot(cross(e0,v2+v0),D);
+        const vfloat<M> V = dot(cross(e1,v0+v1),D);
+        const vfloat<M> W = dot(cross(e2,v1+v2),D);
+        const vfloat<M> UVW = U+V+W;
+        const vfloat<M> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+        vbool<M> valid = max(U,V,W) <= eps; // culling: only lanes where all three values are <= eps are accepted
+#else
+        vbool<M> valid =  (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); // accept when all three are >= -eps or all three are <= eps
+#endif
+        if (unlikely(none(valid))) return false;
+
+        /* calculate geometry normal and denominator */
+        const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+        const vfloat<M> den = twice(dot(Ng,D));
+
+         /* perform depth test: t = T/den must lie in [tnear, tfar] */
+        const vfloat<M> T = twice(dot(v0,Ng));
+        const vfloat<M> t = rcp(den)*T; // evaluated for every lane; lanes with den == 0 are masked out two lines below
+        valid &= vfloat<M>(ray.tnear()) <= t & t <= vfloat<M>(ray.tfar); // <= binds tighter than &, i.e. (tnear <= t) & (t <= tfar)
+        valid &= den != vfloat<M>(zero);
+        if (unlikely(none(valid))) return false;
+
+        /* update hit information; u/v are derived later by QuadHitPlueckerM::finalize() */
+        QuadHitPlueckerM<M> hit(valid,U,V,UVW,t,Ng,flags);
+        return epilog(valid,hit);
+      }
+    };
+
+    /*! Single-ray intersector for M quads; every quad is tested as the triangles (v0,v1,v3) and (v2,v3,v1). */
+    template<int M, bool filter>
+    struct QuadMIntersector1Pluecker
+    {
+      __forceinline QuadMIntersector1Pluecker() {}
+
+      __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {}
+      /*! Intersection query: runs both triangle tests with one shared Intersect1EpilogM. */
+      __forceinline void intersect(RayHit& ray, IntersectContext* context,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                   const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        Intersect1EpilogM<M,M,filter> hitEpilog(ray,context,geomID,primID);
+        PlueckerIntersectorTriangle1::intersect(ray,v0,v1,v3,vbool<M>(false),hitEpilog);
+        PlueckerIntersectorTriangle1::intersect(ray,v2,v3,v1,vbool<M>(true),hitEpilog);
+      }
+      /*! Occlusion query: true as soon as one of the two triangle tests returns true. */
+      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                  const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                  const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        Occluded1EpilogM<M,M,filter> shadowEpilog(ray,context,geomID,primID);
+        /* || short-circuits: the second triangle is only tested when the first call returns false */
+        return PlueckerIntersectorTriangle1::intersect(ray,v0,v1,v3,vbool<M>(false),shadowEpilog)
+            || PlueckerIntersectorTriangle1::intersect(ray,v2,v3,v1,vbool<M>(true),shadowEpilog);
+      }
+    };
+
+#if defined(__AVX512ER__) // KNL
+
+    /*! Specialization for M=4 on AVX512 (KNL): tests both triangles of 4 quads against one ray with a single call on 16-lane vectors. */
+    template<bool filter>
+    struct QuadMIntersector1Pluecker<4,filter>
+    {
+      __forceinline QuadMIntersector1Pluecker() {}
+
+      __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} // both arguments are unused
+      /*! Builds 16-lane triangle vertices from the quad vertices and runs a single triangle test with the given epilog. */
+      template<typename Epilog>
+      __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)), // first vertex: v0 or v2 per lane, selected by mask 0x0f0f
+                            select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)),
+                            select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z)));
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z)); // without culling every lane uses v1 ...
+        const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z)); // ... and v3 in the same roles
+#else
+        const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)), // with culling v1 and v3 trade places between the two lane groups
+                            select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)),
+                            select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z)));
+        const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)),
+                            select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)),
+                            select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z)));
+#endif
+        const vbool16 flags(0xf0f0); // 16-bit complement of the 0x0f0f vertex mask; presumably marks the lanes built from v2 - confirm select() semantics
+        return PlueckerIntersectorTriangle1::intersect(ray,vtx0,vtx1,vtx2,flags,epilog);
+      }
+      /*! Intersection query: forwards to the templated intersect with an Intersect1EpilogM<8,16,filter>; geomID/primID are converted to vuint8. */
+      __forceinline bool intersect(RayHit& ray, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+      }
+      /*! Occlusion query: same packing as above, but with an Occluded1EpilogM<8,16,filter>. */
+      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                  const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                  const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+      }
+    };
+
+#elif defined(__AVX__)
+
+    /*! Specialization for M=4 on AVX: tests both triangles of 4 quads against one ray with a single call on 8-lane vectors. */
+    template<bool filter>
+    struct QuadMIntersector1Pluecker<4,filter>
+    {
+      __forceinline QuadMIntersector1Pluecker() {}
+
+      __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} // both arguments are unused
+      /*! Builds 8-lane triangle vertices from the quad vertices and runs a single triangle test with the given epilog. */
+      template<typename Epilog>
+      __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); // first vertex: v0 and v2 combined; presumably v0 in the lower and v2 in the upper four lanes - confirm
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); // without culling every lane uses v1 ...
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); // ... and v3 in the same roles
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); // with culling v1 and v3 trade places between the two halves
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+        const vbool8 flags(0,0,0,0,1,1,1,1); // four lanes clear, four set; presumably the set lanes are the ones built from v2 - confirm lane order
+        return PlueckerIntersectorTriangle1::intersect(ray,vtx0,vtx1,vtx2,flags,epilog); 
+      }
+      /*! Intersection query: forwards to the templated intersect with an Intersect1EpilogM<8,8,filter>; geomID/primID are converted to vuint8. */
+      __forceinline bool intersect(RayHit& ray, IntersectContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+      }
+      /*! Occlusion query: same packing as above, but with an Occluded1EpilogM<8,8,filter>. */
+      __forceinline bool occluded(Ray& ray, IntersectContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+                                  const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+      }
+    };
+
+#endif
+
+
+    /* ----------------------------- */
+    /* -- ray packet intersectors -- */
+    /* ----------------------------- */
+
+    struct PlueckerIntersector1KTriangleM // tests a single ray taken from a packet against M triangles (the halves of quads)
+    {
+      /*! Intersect k'th ray from ray packet of size K with M triangles. Returns false when no lane survives, otherwise the result of epilog(valid,hit). */
+      template<int M, int K, typename Epilog>
+      static  __forceinline bool intersect1(RayK<K>& ray,
+                                            size_t k,
+                                            const Vec3vf<M>& tri_v0,
+                                            const Vec3vf<M>& tri_v1,
+                                            const Vec3vf<M>& tri_v2,
+                                            const vbool<M>& flags,
+                                            const Epilog& epilog)
+      {
+        /* calculate vertices relative to ray origin (O and D are origin and direction of ray k as M-wide vectors) */
+          const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k);
+          const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k);
+          const Vec3vf<M> v0 = tri_v0-O;
+          const Vec3vf<M> v1 = tri_v1-O;
+          const Vec3vf<M> v2 = tri_v2-O;
+          
+          /* calculate triangle edges */
+          const Vec3vf<M> e0 = v2-v0;
+          const Vec3vf<M> e1 = v0-v1;
+          const Vec3vf<M> e2 = v1-v2;
+          
+          /* perform edge tests; eps is a tolerance proportional to |U+V+W| */
+          const vfloat<M> U = dot(cross(e0,v2+v0),D);
+          const vfloat<M> V = dot(cross(e1,v0+v1),D);
+          const vfloat<M> W = dot(cross(e2,v1+v2),D);
+          const vfloat<M> UVW = U+V+W;
+          const vfloat<M> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+          vbool<M> valid = max(U,V,W) <= eps; // culling: only lanes where all three values are <= eps are accepted
+#else
+          vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); // accept when all three are >= -eps or all three are <= eps
+#endif
+          if (unlikely(none(valid))) return false;
+          
+          /* calculate geometry normal and denominator */
+          const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+          const vfloat<M> den = twice(dot(Ng,D));
+
+          /* perform depth test: t = T/den must lie in [tnear, tfar] of ray k */
+          const vfloat<M> T = twice(dot(v0,Ng));
+          const vfloat<M> t = rcp(den)*T; // evaluated for every lane; lanes with den == 0 are rejected further down
+          valid &= vfloat<M>(ray.tnear()[k]) <= t & t <= vfloat<M>(ray.tfar[k]); // <= binds tighter than &, i.e. (tnear <= t) & (t <= tfar)
+          if (unlikely(none(valid))) return false;
+          
+          /* avoid division by 0: drop the lanes whose denominator was zero */
+          valid &= den != vfloat<M>(zero);
+          if (unlikely(none(valid))) return false;
+          
+          /* update hit information; u/v are derived later by QuadHitPlueckerM::finalize() */
+          QuadHitPlueckerM<M> hit(valid,U,V,UVW,t,Ng,flags);
+          return epilog(valid,hit);
+      }
+    };
+
+    template<int M, int K, bool filter>
+    struct QuadMIntersectorKPlueckerBase // packet (K rays) Pluecker-style quad intersection shared by the QuadMIntersectorKPluecker variants
+    {
+      __forceinline QuadMIntersectorKPlueckerBase(const vbool<K>& valid, const RayK<K>& ray) {} // both arguments are unused
+            
+      /*! Intersects K rays with one of M triangles. Lanes outside valid0 are ignored; the result is false (as vbool<K>) or whatever epilog(valid,hit) returns. */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_v1,
+                                        const Vec3vf<K>& tri_v2,
+                                        const vbool<K>& flags,
+                                        const Epilog& epilog) const
+      {
+        /* calculate vertices relative to ray origin (one ray per lane) */
+          vbool<K> valid = valid0;
+          const Vec3vf<K> O = ray.org;
+          const Vec3vf<K> D = ray.dir;
+          const Vec3vf<K> v0 = tri_v0-O;
+          const Vec3vf<K> v1 = tri_v1-O;
+          const Vec3vf<K> v2 = tri_v2-O;
+          
+          /* calculate triangle edges */
+          const Vec3vf<K> e0 = v2-v0;
+          const Vec3vf<K> e1 = v0-v1;
+          const Vec3vf<K> e2 = v1-v2;
+           
+          /* perform edge tests; eps is a tolerance proportional to |U+V+W| */
+          const vfloat<K> U = dot(Vec3vf<K>(cross(e0,v2+v0)),D);
+          const vfloat<K> V = dot(Vec3vf<K>(cross(e1,v0+v1)),D);
+          const vfloat<K> W = dot(Vec3vf<K>(cross(e2,v1+v2)),D);
+          const vfloat<K> UVW = U+V+W;
+          const vfloat<K> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+          valid &= max(U,V,W) <= eps; // culling: only lanes where all three values are <= eps are accepted
+#else
+          valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); // accept when all three are >= -eps or all three are <= eps
+#endif
+          if (unlikely(none(valid))) return false; // the bool converts to the vbool<K> return type
+          
+           /* calculate geometry normal and denominator */
+          const Vec3vf<K> Ng = stable_triangle_normal(e0,e1,e2);
+          const vfloat<K> den = twice(dot(Vec3vf<K>(Ng),D));
+
+          /* perform depth test: t = T/den must lie in [tnear, tfar] of each ray */
+          const vfloat<K> T = twice(dot(v0,Vec3vf<K>(Ng)));
+          const vfloat<K> t = rcp(den)*T; // evaluated for every lane; lanes with den == 0 are masked out two lines below
+          valid &= ray.tnear() <= t & t <= ray.tfar; // <= binds tighter than &, i.e. (tnear <= t) & (t <= tfar)
+          valid &= den != vfloat<K>(zero);
+          if (unlikely(none(valid))) return false;
+          
+          /* calculate hit information; u/v are computed when the epilog invokes the hit object */
+          QuadHitPlueckerK<K> hit(U,V,UVW,t,Ng,flags);
+          return epilog(valid,hit);
+      }
+      
+      /*! Intersects K rays with one of M quads, handled as the triangles (v0,v1,v3) and (v2,v3,v1). Returns none(valid0) as observed after the last test performed. */
+      template<typename Epilog>
+      __forceinline bool intersectK(const vbool<K>& valid0, 
+                                    RayK<K>& ray,
+                                    const Vec3vf<K>& v0,
+                                    const Vec3vf<K>& v1,
+                                    const Vec3vf<K>& v2,
+                                    const Vec3vf<K>& v3,
+                                    const Epilog& epilog) const
+      {
+        intersectK(valid0,ray,v0,v1,v3,vbool<K>(false),epilog);
+        if (none(valid0)) return true; // NOTE(review): valid0 is const here, presumably the epilog updates the mask it refers to - confirm
+        intersectK(valid0,ray,v2,v3,v1,vbool<K>(true ),epilog);
+        return none(valid0);
+      }
+    };
+
+    template<int M, int K, bool filter>
+      struct QuadMIntersectorKPluecker : public QuadMIntersectorKPlueckerBase<M,K,filter>
+    {
+      __forceinline QuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray)
+        : QuadMIntersectorKPlueckerBase<M,K,filter>(valid,ray) {}
+      /*! Intersects ray k of the packet with M quads, testing triangle (v0,v1,v3) and then (v2,v3,v1) with one shared epilog. */
+      __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                    const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        Intersect1KEpilogM<M,M,K,filter> hitEpilog(ray,k,context,geomID,primID);
+        PlueckerIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool<M>(false),hitEpilog);
+        PlueckerIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool<M>(true),hitEpilog);
+      }
+      /*! Occlusion query for ray k: true as soon as one of the two triangle tests returns true. */
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                   const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        Occluded1KEpilogM<M,M,K,filter> shadowEpilog(ray,k,context,geomID,primID);
+        /* || short-circuits: the second triangle is only tested when the first call returns false */
+        return PlueckerIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool<M>(false),shadowEpilog)
+            || PlueckerIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool<M>(true),shadowEpilog);
+      }
+    };
+
+#if defined(__AVX512ER__) // KNL
+
+    /*! Specialization for M=4 on AVX512 (KNL): tests both triangles of 4 quads against the k'th ray of a packet with one call on 16-lane vectors. */
+    template<int K, bool filter>
+    struct QuadMIntersectorKPluecker<4,K,filter> : public QuadMIntersectorKPlueckerBase<4,K,filter>
+    {
+      __forceinline QuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray)
+        : QuadMIntersectorKPlueckerBase<4,K,filter>(valid,ray) {}
+      /*! Builds 16-lane triangle vertices from the quad vertices and runs a single triangle test with the given epilog. */
+      template<typename Epilog>
+      __forceinline bool intersect1(RayK<K>& ray, size_t k, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)), // first vertex: v0 or v2 per lane, selected by mask 0x0f0f
+                            select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)),
+                            select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z)));
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z)); // without culling every lane uses v1 ...
+        const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z)); // ... and v3 in the same roles
+#else
+        const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)), // with culling v1 and v3 trade places between the two lane groups
+                            select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)),
+                            select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z)));
+        const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)),
+                            select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)),
+                            select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z)));
+#endif
+        /* 0xf0f0 is the 16-bit complement of the 0x0f0f vertex mask; presumably it marks the lanes built from v2 - confirm select() semantics */
+        const vbool16 flags(0xf0f0);
+        return PlueckerIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog);
+      }
+      /*! Intersection query: forwards to the templated intersect1 with an Intersect1KEpilogM<8,16,K,filter>; geomID/primID are converted to vuint8. */
+      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context, 
+                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                    const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+      /*! Occlusion query: same packing as above, but with an Occluded1KEpilogM<8,16,K,filter>. */
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+    };
+
+#elif defined(__AVX__)
+
+    /*! Specialization for M=4 on AVX: tests both triangles of 4 quads against the k'th ray of a packet with one call on 8-lane vectors. */
+    template<int K, bool filter>
+    struct QuadMIntersectorKPluecker<4,K,filter> : public QuadMIntersectorKPlueckerBase<4,K,filter>
+    {
+      __forceinline QuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray)
+        : QuadMIntersectorKPlueckerBase<4,K,filter>(valid,ray) {}
+      /*! Builds 8-lane triangle vertices from the quad vertices and runs a single triangle test with the given epilog. */
+      template<typename Epilog>
+      __forceinline bool intersect1(RayK<K>& ray, size_t k, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); // first vertex: v0 and v2 combined; presumably v0 in the lower and v2 in the upper four lanes - confirm
+        const vbool8 flags(0,0,0,0,1,1,1,1); // four lanes clear, four set; presumably the set lanes are the ones built from v2 - confirm lane order
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); // without culling every lane uses v1 ...
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); // ... and v3 in the same roles
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); // with culling v1 and v3 trade places between the two halves
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+        return PlueckerIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog); 
+      }
+      /*! Intersection query: forwards to the templated intersect1 with an Intersect1KEpilogM<8,8,K,filter>; geomID/primID are converted to vuint8. */
+      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                    const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+      /*! Occlusion query: same packing as above, but with an Occluded1KEpilogM<8,8,K,filter>. */
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+    };
+
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadi.h b/thirdparty/embree-aarch64/kernels/geometry/quadi.h
new file mode 100644
index 0000000000..741ec519ab
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quadi.h
@@ -0,0 +1,483 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../common/scene.h"
+
+namespace embree
+{
+  /* Stores M quads from an indexed face set */
+  template <int M>
+  struct QuadMi
+  {
+    /* Virtual interface to query information about the quad type */
+    struct Type : public PrimitiveType
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* primitive supports multiple time segments */
+    static const bool singleTimeSegment = false;
+
+    /* Returns maximum number of stored quads */
+    static __forceinline size_t max_size() { return M; }
+
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+  public:
+
+    /* Default constructor */
+    __forceinline QuadMi() {  }
+
+    /* Construction from per-quad vertex offsets (v0..v3) and geometry/primitive IDs */
+    __forceinline QuadMi(const vuint<M>& v0,
+                         const vuint<M>& v1,
+                         const vuint<M>& v2,
+                         const vuint<M>& v3,
+                         const vuint<M>& geomIDs,
+                         const vuint<M>& primIDs)
+#if defined(EMBREE_COMPACT_POLYS)
+      : geomIDs(geomIDs), primIDs(primIDs) {} // compact layout: v0..v3 are ignored, only the IDs are stored
+#else
+     : v0_(v0),v1_(v1), v2_(v2), v3_(v3), geomIDs(geomIDs), primIDs(primIDs) {}
+#endif
+
+    /* Returns a mask that tells which quads are valid (a lane is valid when its primID is not -1) */
+    __forceinline vbool<M> valid() const { return primIDs != vuint<M>(-1); }
+
+    /* Returns true if quad i is valid, i.e. its primID is not the -1 sentinel (asserts i<M) */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return primIDs[i] != -1; }
+
+    /* Returns the number of stored quads, computed as bsf over the inverted valid() mask */
+    __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+    /* Returns the geometry IDs: the whole vector, or lane i (the lane accessor asserts i<M and ID != -1) */
+    __forceinline       vuint<M>& geomID()       { return geomIDs; }
+    __forceinline const vuint<M>& geomID() const { return geomIDs; }
+    __forceinline unsigned int geomID(const size_t i) const { assert(i<M); assert(geomIDs[i] != -1); return geomIDs[i]; }
+
+    /* Returns the primitive IDs: the whole vector, or lane i (a lane value of -1 marks an invalid slot, see valid()) */
+    __forceinline       vuint<M>& primID()       { return primIDs; }
+    __forceinline const vuint<M>& primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* Calculate the merged bounds of the stored quads at time step itime (defaults to 0) */
+    __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const
+    {
+      BBox3fa bounds = empty;
+      for (size_t i=0; i<M && valid(i); i++) { // stops at the first invalid slot
+        const QuadMesh* mesh = scene->get<QuadMesh>(geomID(i));
+        bounds.extend(mesh->bounds(primID(i),itime)); // per-quad bounds are delegated to the mesh
+      }
+      return bounds;
+    }
+
+    /* Calculate the linear bounds of the primitive from its bounds at time steps itime and itime+1 */
+    __forceinline LBBox3fa linearBounds(const Scene* const scene, const size_t itime) {
+      return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1));
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) // merged linear bounds of all valid quads
+    {
+      LBBox3fa allBounds = empty;
+      for (size_t i=0; i<M && valid(i); i++) // stops at the first invalid slot
+      {
+        const QuadMesh* mesh = scene->get<QuadMesh>(geomID(i));
+        allBounds.extend(mesh->linearBounds(primID(i), itime, numTimeSteps)); // per-quad linear bounds come from the mesh
+      }
+      return allBounds;
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) // merged linear bounds of all valid quads over time_range
+    {
+      LBBox3fa allBounds = empty;
+      for (size_t i=0; i<M && valid(i); i++) // stops at the first invalid slot
+      {
+        const QuadMesh* mesh = scene->get<QuadMesh>(geomID(i));
+        allBounds.extend(mesh->linearBounds(primID(i), time_range)); // per-quad linear bounds come from the mesh
+      }
+      return allBounds;
+    }
+
+    /* Fill up to M quads from prims[begin..end), advancing begin; remaining slots are padded as invalid */
+    template<typename PrimRefT>
+    __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      vuint<M> geomID = -1, primID = -1;
+      const PrimRefT* prim = &prims[begin];
+      vuint<M> v0 = zero, v1 = zero, v2 = zero, v3 = zero;
+
+      for (size_t i=0; i<M; i++)
+      {
+        if (begin<end) {
+          geomID[i] = prim->geomID();
+          primID[i] = prim->primID();
+#if !defined(EMBREE_COMPACT_POLYS)
+          const QuadMesh* mesh = scene->get<QuadMesh>(prim->geomID());
+          const QuadMesh::Quad& q = mesh->quad(prim->primID());
+          unsigned int_stride = mesh->vertices0.getStride()/4; // presumably a byte stride converted to 4-byte units (cf. "4 byte offset" members)
+          v0[i] = q.v[0] * int_stride;
+          v1[i] = q.v[1] * int_stride;
+          v2[i] = q.v[2] * int_stride;
+          v3[i] = q.v[3] * int_stride;
+#endif
+          begin++;
+        } else {
+          assert(i); // slot 0 is expected to always receive a real primitive
+          if (likely(i > 0)) {
+            geomID[i] = geomID[0]; // always valid geomIDs
+            primID[i] = -1;        // indicates invalid data
+            v0[i] = v0[0];         // padding: all four offsets reuse offset v0 of slot 0
+            v1[i] = v0[0];
+            v2[i] = v0[0];
+            v3[i] = v0[0];
+          }
+        }
+        if (begin<end) prim = &prims[begin]; // only re-point prim while input remains
+      }
+      new (this) QuadMi(v0,v1,v2,v3,geomID,primID); // FIXME: use non temporal store
+    }
+
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) // fill, then return linear bounds for itime
+    {
+      fill(prims, begin, end, scene);
+      return linearBounds(scene, itime);
+    }
+
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) // fill, then return linear bounds over time_range
+    {
+      fill(prims, begin, end, scene);
+      return linearBounds(scene, time_range);
+    }
+
+    friend embree_ostream operator<<(embree_ostream cout, const QuadMi& quad) { // debug printing of the stored lanes
+      return cout << "QuadMi<" << M << ">( "
+#if !defined(EMBREE_COMPACT_POLYS)
+                  << "v0 = " << quad.v0_ << ", v1 = " << quad.v1_ << ", v2 = " << quad.v2_ << ", v3 = " << quad.v3_ << ", " // offsets exist only in the non-compact layout
+#endif
+                  << "geomID = " << quad.geomIDs << ", primID = " << quad.primIDs << " )";
+    }
+
+  protected:
+#if !defined(EMBREE_COMPACT_POLYS)
+    vuint<M> v0_;         // 4 byte offset of 1st vertex
+    vuint<M> v1_;         // 4 byte offset of 2nd vertex
+    vuint<M> v2_;         // 4 byte offset of 3rd vertex
+    vuint<M> v3_;         // 4 byte offset of 4th vertex
+#endif
+    vuint<M> geomIDs;    // geometry ID of mesh
+    vuint<M> primIDs;    // primitive ID of primitive inside mesh
+  };
+
+  namespace isa
+  {
+    
+  template<int M>
+    struct QuadMi : public embree::QuadMi<M>
+  {
+#if !defined(EMBREE_COMPACT_POLYS)
+    using embree::QuadMi<M>::v0_;
+    using embree::QuadMi<M>::v1_;
+    using embree::QuadMi<M>::v2_;
+    using embree::QuadMi<M>::v3_;
+#endif
+    using embree::QuadMi<M>::geomIDs;
+    using embree::QuadMi<M>::primIDs;
+    using embree::QuadMi<M>::geomID;
+    using embree::QuadMi<M>::primID;
+    using embree::QuadMi<M>::valid;
+    
+    template<int vid>
+    __forceinline Vec3f getVertex(const size_t index, const Scene *const scene) const // vertex number vid of quad `index`
+    {
+#if defined(EMBREE_COMPACT_POLYS)
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+      const QuadMesh::Quad& quad = mesh->quad(primID(index));
+      return (Vec3f) mesh->vertices[0][quad.v[vid]]; // compact layout: look the vertex up through the mesh, vertex buffer 0
+#else
+      const vuint<M>& v = getVertexOffset<vid>();
+      const float* vertices = scene->vertices[geomID(index)];
+      return (Vec3f&) vertices[v[index]]; // the stored offset indexes the float array directly
+#endif
+    }
+
+    template<int vid, typename T>
+    __forceinline Vec3<T> getVertex(const size_t index, const Scene *const scene, const size_t itime, const T& ftime) const // vertex vid of quad `index`, interpolated in time
+    {
+#if defined(EMBREE_COMPACT_POLYS)
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+      const QuadMesh::Quad& quad = mesh->quad(primID(index));
+      const Vec3fa v0 = mesh->vertices[itime+0][quad.v[vid]];
+      const Vec3fa v1 = mesh->vertices[itime+1][quad.v[vid]];
+#else
+      const vuint<M>& v = getVertexOffset<vid>();
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+      const float* vertices0 = (const float*) mesh->vertexPtr(0,itime+0);
+      const float* vertices1 = (const float*) mesh->vertexPtr(0,itime+1);
+      const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]); // vertex at time step itime
+      const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]); // vertex at time step itime+1
+#endif
+      const Vec3<T> p0(v0.x,v0.y,v0.z);
+      const Vec3<T> p1(v1.x,v1.y,v1.z);
+      return lerp(p0,p1,ftime); // blend the two time steps by ftime
+    }
+
+    template<int vid, int K, typename T>
+    __forceinline Vec3<T> getVertex(const vbool<K>& valid, const size_t index, const Scene *const scene, const vint<K>& itime, const T& ftime) const // per-lane time segments
+    {
+      Vec3<T> p0, p1; // only the lanes set in `valid` are written below
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+
+      for (size_t mask=movemask(valid), i=bsf(mask); mask; mask=btc(mask,i), i=bsf(mask)) // visit each active lane i once
+      {
+#if defined(EMBREE_COMPACT_POLYS)
+        const QuadMesh::Quad& quad = mesh->quad(primID(index));
+        const Vec3fa v0 = mesh->vertices[itime[i]+0][quad.v[vid]];
+        const Vec3fa v1 = mesh->vertices[itime[i]+1][quad.v[vid]];
+#else
+        const vuint<M>& v = getVertexOffset<vid>();
+        const float* vertices0 = (const float*) mesh->vertexPtr(0,itime[i]+0);
+        const float* vertices1 = (const float*) mesh->vertexPtr(0,itime[i]+1);
+        const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]);
+        const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]);
+#endif
+        p0.x[i] = v0.x; p0.y[i] = v0.y; p0.z[i] = v0.z; // vertex at lane i's time step
+        p1.x[i] = v1.x; p1.y[i] = v1.y; p1.z[i] = v1.z; // vertex at lane i's next time step
+      }
+      return (T(one)-ftime)*p0 + ftime*p1; // linear blend of the two time steps by ftime
+    }
+
+    struct Quad { // the four vertices of one quad, as returned by loadQuad
+      vfloat4 v0,v1,v2,v3;
+    };
+
+#if defined(EMBREE_COMPACT_POLYS)
+    
+    __forceinline Quad loadQuad(const int i, const Scene* const scene) const 
+    {
+      const unsigned int geomID = geomIDs[i];
+      const unsigned int primID = primIDs[i];
+      if (unlikely(primID == -1)) return { zero, zero, zero, zero }; // invalid slot: return an all-zero quad
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID);
+      const QuadMesh::Quad& quad = mesh->quad(primID);
+      const vfloat4 v0 = (vfloat4) mesh->vertices0[quad.v[0]]; // compact layout: vertices are looked up through the mesh
+      const vfloat4 v1 = (vfloat4) mesh->vertices0[quad.v[1]];
+      const vfloat4 v2 = (vfloat4) mesh->vertices0[quad.v[2]];
+      const vfloat4 v3 = (vfloat4) mesh->vertices0[quad.v[3]];
+      return { v0, v1, v2, v3 };
+    }
+
+    __forceinline Quad loadQuad(const int i, const int itime, const Scene* const scene) const 
+    {
+      const unsigned int geomID = geomIDs[i];
+      const unsigned int primID = primIDs[i];
+      if (unlikely(primID == -1)) return { zero, zero, zero, zero }; // invalid slot: return an all-zero quad
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID);
+      const QuadMesh::Quad& quad = mesh->quad(primID);
+      const vfloat4 v0 = (vfloat4) mesh->vertices[itime][quad.v[0]]; // vertex buffer of time step itime
+      const vfloat4 v1 = (vfloat4) mesh->vertices[itime][quad.v[1]];
+      const vfloat4 v2 = (vfloat4) mesh->vertices[itime][quad.v[2]];
+      const vfloat4 v3 = (vfloat4) mesh->vertices[itime][quad.v[3]];
+      return { v0, v1, v2, v3 };
+    }
+    
+#else
+
+    __forceinline Quad loadQuad(const int i, const Scene* const scene) const 
+    {
+      const float* vertices = scene->vertices[geomID(i)]; // float array of the geometry owning quad i
+      const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); // unaligned load at the stored offset
+      const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]);
+      const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]);
+      const vfloat4 v3 = vfloat4::loadu(vertices + v3_[i]);
+      return { v0, v1, v2, v3 };
+    }
+
+    __forceinline Quad loadQuad(const int i, const int itime, const Scene* const scene) const 
+    {
+      const unsigned int geomID = geomIDs[i];
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID);
+      const float* vertices = (const float*) mesh->vertexPtr(0,itime); // base pointer obtained for time step itime
+      const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); // unaligned load at the stored offset
+      const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]);
+      const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]);
+      const vfloat4 v3 = vfloat4::loadu(vertices + v3_[i]);
+      return { v0, v1, v2, v3 };
+    }
+    
+#endif
+
+    /* Gather the quads */
+    __forceinline void gather(Vec3vf<M>& p0,
+                              Vec3vf<M>& p1,
+                              Vec3vf<M>& p2,
+                              Vec3vf<M>& p3,
+                              const Scene *const scene) const;
+
+#if defined(__AVX512F__)
+    __forceinline void gather(Vec3vf16& p0,
+                              Vec3vf16& p1,
+                              Vec3vf16& p2,
+                              Vec3vf16& p3,
+                              const Scene *const scene) const;
+#endif
+
+    template<int K>
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 2000) // workaround for compiler bug in ICC 2019
+    __noinline
+#else
+    __forceinline
+#endif
+    void gather(const vbool<K>& valid, // gathers the four vertices of quad `index` for K rays with per-ray times
+      Vec3vf<K>& p0,
+      Vec3vf<K>& p1,
+      Vec3vf<K>& p2,
+      Vec3vf<K>& p3,
+      const size_t index,
+      const Scene* const scene,
+      const vfloat<K>& time) const
+    {
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+
+      vfloat<K> ftime; // written by timeSegment below
+      const vint<K> itime = mesh->timeSegment(time, ftime);
+
+      const size_t first = bsf(movemask(valid)); // index of the first active lane
+      if (likely(all(valid,itime[first] == itime))) // fast path: every active lane shares one time segment
+      {
+        p0 = getVertex<0>(index, scene, itime[first], ftime);
+        p1 = getVertex<1>(index, scene, itime[first], ftime);
+        p2 = getVertex<2>(index, scene, itime[first], ftime);
+        p3 = getVertex<3>(index, scene, itime[first], ftime);
+      }
+      else // slow path: fetch the vertices lane by lane
+      {
+        p0 = getVertex<0>(valid, index, scene, itime, ftime);
+        p1 = getVertex<1>(valid, index, scene, itime, ftime);
+        p2 = getVertex<2>(valid, index, scene, itime, ftime);
+        p3 = getVertex<3>(valid, index, scene, itime, ftime);
+      }
+    }
+
+    __forceinline void gather(Vec3vf<M>& p0,
+                              Vec3vf<M>& p1,
+                              Vec3vf<M>& p2,
+                              Vec3vf<M>& p3,
+                              const QuadMesh* mesh,
+                              const Scene *const scene,
+                              const int itime) const;
+
+    __forceinline void gather(Vec3vf<M>& p0,
+                              Vec3vf<M>& p1,
+                              Vec3vf<M>& p2,
+                              Vec3vf<M>& p3,
+                              const Scene *const scene,
+                              const float time) const;
+
+    /* Recomputes and returns the bounds of the valid quads from the mesh's vertices; no member is written here */
+    __forceinline BBox3fa update(QuadMesh* mesh)
+    {
+      BBox3fa bounds = empty;
+      for (size_t i=0; i<M; i++)
+      {
+        if (!valid(i)) break; // stop at the first invalid slot
+        const unsigned primId = primID(i);
+        const QuadMesh::Quad& q = mesh->quad(primId);
+        const Vec3fa p0 = mesh->vertex(q.v[0]);
+        const Vec3fa p1 = mesh->vertex(q.v[1]);
+        const Vec3fa p2 = mesh->vertex(q.v[2]);
+        const Vec3fa p3 = mesh->vertex(q.v[3]);
+        bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2),BBox3fa(p3))); // grow by the box around the four corners
+      }
+      return bounds;
+    }
+
+  private:
+#if !defined(EMBREE_COMPACT_POLYS)
+    template<int N> const vuint<M>& getVertexOffset() const;
+#endif
+  };
+
+#if !defined(EMBREE_COMPACT_POLYS)
+  template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<0>() const { return v0_; } // offsets of the 1st vertex
+  template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<1>() const { return v1_; } // offsets of the 2nd vertex
+  template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<2>() const { return v2_; } // offsets of the 3rd vertex
+  template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<3>() const { return v3_; } // offsets of the 4th vertex
+#endif
+
+  template<>
+  __forceinline void QuadMi<4>::gather(Vec3vf4& p0, // gathers the four vertices of all 4 quads (no motion blur)
+                                       Vec3vf4& p1,
+                                       Vec3vf4& p2,
+                                       Vec3vf4& p3,
+                                       const Scene *const scene) const
+  {
+    prefetchL1(((char*)this)+0*64); // prefetch the first 128 bytes of this primitive block
+    prefetchL1(((char*)this)+1*64);
+    const Quad tri0 = loadQuad(0,scene); // despite the "tri" names, each local holds one quad
+    const Quad tri1 = loadQuad(1,scene);
+    const Quad tri2 = loadQuad(2,scene);
+    const Quad tri3 = loadQuad(3,scene);
+    transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); // regroup vertex 0 of the four quads into x/y/z lanes
+    transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z);
+    transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z);
+    transpose(tri0.v3,tri1.v3,tri2.v3,tri3.v3,p3.x,p3.y,p3.z);
+  }
+
+  template<>
+  __forceinline void QuadMi<4>::gather(Vec3vf4& p0, // gathers the four vertices of all 4 quads at time step itime
+                                       Vec3vf4& p1,
+                                       Vec3vf4& p2,
+                                       Vec3vf4& p3,
+                                       const QuadMesh* mesh, // not referenced in the body; loadQuad resolves the mesh per quad
+                                       const Scene *const scene,
+                                       const int itime) const
+  {
+    // FIXME: for trianglei all geometries are identical; is this the case here too?
+    
+    const Quad tri0 = loadQuad(0,itime,scene); // despite the "tri" names, each local holds one quad
+    const Quad tri1 = loadQuad(1,itime,scene);
+    const Quad tri2 = loadQuad(2,itime,scene);
+    const Quad tri3 = loadQuad(3,itime,scene);
+    transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); // regroup vertex 0 of the four quads into x/y/z lanes
+    transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z);
+    transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z);
+    transpose(tri0.v3,tri1.v3,tri2.v3,tri3.v3,p3.x,p3.y,p3.z);
+  }
+
+  template<>
+  __forceinline void QuadMi<4>::gather(Vec3vf4& p0, // gathers the four vertices of all 4 quads, interpolated at `time`
+                                       Vec3vf4& p1,
+                                       Vec3vf4& p2,
+                                       Vec3vf4& p3,
+                                       const Scene *const scene,
+                                       const float time) const
+  {
+    const QuadMesh* mesh = scene->get<QuadMesh>(geomID(0)); // in mblur mode all geometries are identical
+
+    float ftime; // written by timeSegment below
+    const int itime = mesh->timeSegment(time, ftime);
+
+    Vec3vf4 a0,a1,a2,a3; gather(a0,a1,a2,a3,mesh,scene,itime);   // vertices at time step itime
+    Vec3vf4 b0,b1,b2,b3; gather(b0,b1,b2,b3,mesh,scene,itime+1); // vertices at time step itime+1
+    p0 = lerp(a0,b0,vfloat4(ftime)); // blend the two time steps by ftime
+    p1 = lerp(a1,b1,vfloat4(ftime));
+    p2 = lerp(a2,b2,vfloat4(ftime));
+    p3 = lerp(a3,b3,vfloat4(ftime));
+  }
+  }
+
+  template<int M>
+  typename QuadMi<M>::Type QuadMi<M>::type;
+
+  typedef QuadMi<4> Quad4i;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadi_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/quadi_intersector.h
new file mode 100644
index 0000000000..96cf7f1ca2
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quadi_intersector.h
@@ -0,0 +1,350 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "quadi.h"
+#include "quad_intersector_moeller.h"
+#include "quad_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Intersects M quads with 1 ray, using the Moeller-Trumbore precalculations */
+    template<int M, bool filter>
+    struct QuadMiIntersector1Moeller
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations;
+
+      /*! Intersects a ray with the M quads and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); // fetch the vertices of all M quads
+        pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Tests if the ray is occluded by one of the M quads. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); // fetch the vertices of all M quads
+        return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad); // forwards to PrimitivePointQuery1
+      }
+    };
+
+    /*! Intersects M quads with K rays, using the Moeller-Trumbore precalculations. */
+    template<int M, int K, bool filter>
+    struct QuadMiIntersectorKMoeller
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations;
+
+      /*! Intersects K rays with M quads. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        Scene* scene = context->scene;
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break; // stop at the first invalid slot
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene);
+          const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene);
+          const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene);
+          const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene);
+          pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M quads. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        Scene* scene = context->scene;
+        vbool<K> valid0 = valid_i; // local working copy of the input mask
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break; // stop at the first invalid slot
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene);
+          const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene);
+          const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene);
+          const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene);
+          if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+            break; // intersectK returned true: skip the remaining quads
+        }
+        return !valid0;
+      }
+      
+      /*! Intersects ray k of the packet with the M quads and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); // NOTE(review): Vec3vf4 hard-codes M == 4 here
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Tests if ray k of the packet is occluded by one of the M quads. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); // NOTE(review): Vec3vf4 hard-codes M == 4 here
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+    };
+
+    /*! Intersects M quads with 1 ray, using the Pluecker precalculations */
+    template<int M, bool filter>
+    struct QuadMiIntersector1Pluecker
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersector1Pluecker<M,filter> Precalculations;
+
+      /*! Intersects a ray with the M quads and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); // fetch the vertices of all M quads
+        pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Tests if the ray is occluded by one of the M quads. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); // fetch the vertices of all M quads
+        return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad); // forwards to PrimitivePointQuery1
+      }
+    };
+
+    /*! Intersects M quads with K rays, using the Pluecker precalculations. */
+    template<int M, int K, bool filter>
+    struct QuadMiIntersectorKPluecker
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations;
+
+      /*! Intersects K rays with M quads. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        Scene* scene = context->scene;
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break; // stop at the first invalid slot
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene);
+          const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene);
+          const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene);
+          const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene);
+          pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M quads. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        Scene* scene = context->scene;
+        vbool<K> valid0 = valid_i; // local working copy of the input mask
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break; // stop at the first invalid slot
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene);
+          const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene);
+          const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene);
+          const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene);
+          if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+            break; // intersectK returned true: skip the remaining quads
+        }
+        return !valid0;
+      }
+      
+      /*! Intersects ray k of the packet with the M quads and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); // NOTE(review): Vec3vf4 hard-codes M == 4 here
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Tests if ray k of the packet is occluded by one of the M quads. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); // NOTE(review): Vec3vf4 hard-codes M == 4 here
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+    };
+
+    /*! Intersects M motion blur quads with 1 ray, using the Moeller-Trumbore precalculations */
+    template<int M, bool filter>
+    struct QuadMiMBIntersector1Moeller
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations;
+
+      /*! Intersects a ray with the M quads and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); // vertices gathered at the ray's time
+        pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Tests if the ray is occluded by one of the M quads. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); // vertices gathered at the ray's time
+        return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad); // forwards to PrimitivePointQuery1
+      }
+    };
+
+    /*! Intersects M motion blur quads with K rays, using the Moeller-Trumbore precalculations. */
+    template<int M, int K, bool filter>
+    struct QuadMiMBIntersectorKMoeller
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations;
+
+      /*! Intersects K rays with M quads. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break; // stop at the first invalid slot
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          Vec3vf<K> v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); // per-ray times
+          pre.intersectK(valid_i,ray,v0,v1,v2,v3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M quads. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        vbool<K> valid0 = valid_i; // local working copy of the input mask
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break; // stop at the first invalid slot
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          Vec3vf<K> v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); // NOTE(review): gathers with valid_i while the test below uses valid0 - confirm intended
+          if (pre.intersectK(valid0,ray,v0,v1,v2,v3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+            break; // intersectK returned true: skip the remaining quads
+        }
+        return !valid0;
+      }
+      
+      /*! Intersects ray k of the packet with the M quads and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); // vertices gathered at the time of ray k
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Tests if ray k of the packet is occluded by one of the M quads. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); // vertices gathered at the time of ray k
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+    };
+
+    /*! Intersects M motion blur quads with 1 ray, using the Pluecker precalculations */
+    template<int M, bool filter>
+    struct QuadMiMBIntersector1Pluecker
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersector1Pluecker<M,filter> Precalculations;
+
+      /*! Intersects a ray with the M quads and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); // vertices gathered at the ray's time
+        pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Tests if the ray is occluded by one of the M quads. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); // vertices gathered at the ray's time
+        return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad); // forwards to PrimitivePointQuery1
+      }
+    };
+
+    /*! Intersects M motion blur quads with K rays, using the Pluecker precalculations. */
+    template<int M, int K, bool filter>
+    struct QuadMiMBIntersectorKPluecker
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations;
+
+      /*! Intersects K rays with M quads. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break; // stop at the first invalid slot
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          Vec3vf<K> v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); // per-ray times
+          pre.intersectK(valid_i,ray,v0,v1,v2,v3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M quads. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        vbool<K> valid0 = valid_i; // local working copy of the input mask
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break; // stop at the first invalid slot
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          Vec3vf<K> v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); // NOTE(review): gathers with valid_i while the test below uses valid0 - confirm intended
+          if (pre.intersectK(valid0,ray,v0,v1,v2,v3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+            break; // intersectK returned true: skip the remaining quads
+        }
+        return !valid0;
+      }
+      
+      /*! Intersects ray k of the packet with the M quads and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); // vertices gathered at the time of ray k
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Tests if ray k of the packet is occluded by one of the M quads. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); // vertices gathered at the time of ray k
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadv.h b/thirdparty/embree-aarch64/kernels/geometry/quadv.h
new file mode 100644
index 0000000000..0a1fe4d128
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quadv.h
@@ -0,0 +1,165 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  /* Stores the vertices of M quads in struct of array layout, together with their geometry and primitive IDs; a geometry ID of -1 marks an unused slot */
+  template <int M>
+  struct QuadMv
+  { 
+  public:
+    struct Type : public PrimitiveType 
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* Returns maximum number of stored quads */
+    static __forceinline size_t max_size() { return M; }
+    
+    /* Returns required number of primitive blocks for N primitives (N divided by max_size(), rounded up) */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+   
+  public:
+
+    /* Default constructor; initializes no member explicitly */
+    __forceinline QuadMv() {}
+
+    /* Construction from vertices and IDs */
+    __forceinline QuadMv(const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const vuint<M>& geomIDs, const vuint<M>& primIDs)
+      : v0(v0), v1(v1), v2(v2), v3(v3), geomIDs(geomIDs), primIDs(primIDs) {}
+    
+    /* Returns a mask that tells which quads are valid (geometry ID different from -1) */
+    __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); }
+
+    /* Returns true if the specified quad is valid */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; }
+
+    /* Returns the number of stored quads, computed as the position of the first invalid lane (assumes valid quads are stored contiguously from lane 0) */
+    __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+    /* Returns the geometry IDs */
+    __forceinline       vuint<M>& geomID()       { return geomIDs; }
+    __forceinline const vuint<M>& geomID() const { return geomIDs; }
+    __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+    /* Returns the primitive IDs (the vector overloads return a copy, unlike geomID()) */
+    __forceinline       vuint<M> primID()       { return primIDs; }
+    __forceinline const vuint<M> primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* Calculate the bounds of the quads */
+    __forceinline BBox3fa bounds() const 
+    {
+      Vec3vf<M> lower = min(v0,v1,v2,v3);
+      Vec3vf<M> upper = max(v0,v1,v2,v3);
+      vbool<M> mask = valid(); // invalid lanes get +inf/-inf below so they cannot influence the reductions
+      lower.x = select(mask,lower.x,vfloat<M>(pos_inf));
+      lower.y = select(mask,lower.y,vfloat<M>(pos_inf));
+      lower.z = select(mask,lower.z,vfloat<M>(pos_inf));
+      upper.x = select(mask,upper.x,vfloat<M>(neg_inf));
+      upper.y = select(mask,upper.y,vfloat<M>(neg_inf));
+      upper.z = select(mask,upper.z,vfloat<M>(neg_inf));
+      return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)),
+                     Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z)));
+    }
+    
+    /* Non temporal store of all vertex components and both ID vectors of src into dst */
+    __forceinline static void store_nt(QuadMv* dst, const QuadMv& src)
+    {
+      vfloat<M>::store_nt(&dst->v0.x,src.v0.x);
+      vfloat<M>::store_nt(&dst->v0.y,src.v0.y);
+      vfloat<M>::store_nt(&dst->v0.z,src.v0.z);
+      vfloat<M>::store_nt(&dst->v1.x,src.v1.x);
+      vfloat<M>::store_nt(&dst->v1.y,src.v1.y);
+      vfloat<M>::store_nt(&dst->v1.z,src.v1.z);
+      vfloat<M>::store_nt(&dst->v2.x,src.v2.x);
+      vfloat<M>::store_nt(&dst->v2.y,src.v2.y);
+      vfloat<M>::store_nt(&dst->v2.z,src.v2.z);
+      vfloat<M>::store_nt(&dst->v3.x,src.v3.x);
+      vfloat<M>::store_nt(&dst->v3.y,src.v3.y);
+      vfloat<M>::store_nt(&dst->v3.z,src.v3.z);
+      vuint<M>::store_nt(&dst->geomIDs,src.geomIDs);
+      vuint<M>::store_nt(&dst->primIDs,src.primIDs);
+    }
+
+    /* Fills this block with up to M quads taken from prims[begin..end) and advances begin; unused slots keep ID -1 and zero vertices */
+    __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      vuint<M> vgeomID = -1, vprimID = -1;
+      Vec3vf<M> v0 = zero, v1 = zero, v2 = zero, v3 = zero;
+      
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+        const PrimRef& prim = prims[begin];
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const QuadMesh* __restrict__ const mesh = scene->get<QuadMesh>(geomID);
+        const QuadMesh::Quad& quad = mesh->quad(primID);
+        const Vec3fa& p0 = mesh->vertex(quad.v[0]);
+        const Vec3fa& p1 = mesh->vertex(quad.v[1]);
+        const Vec3fa& p2 = mesh->vertex(quad.v[2]);
+        const Vec3fa& p3 = mesh->vertex(quad.v[3]);
+        vgeomID [i] = geomID;
+        vprimID [i] = primID;
+        v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+        v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+        v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+        v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z;
+      }
+      QuadMv::store_nt(this,QuadMv(v0,v1,v2,v3,vgeomID,vprimID));
+    }
+
+    /* Updates the primitive: re-reads the vertices of every stored quad from mesh, rebuilds this block in place and returns the merged bounds of those vertices */
+    __forceinline BBox3fa update(QuadMesh* mesh)
+    {
+      BBox3fa bounds = empty;
+      vuint<M> vgeomID = -1, vprimID = -1;
+      Vec3vf<M> v0 = zero, v1 = zero, v2 = zero, v3 = zero; // all four are locals; without a local v3 the loop would write into this->v3 and the constructor below would copy the member from itself
+
+      for (size_t i=0; i<M; i++)
+      {
+        if (primID(i) == -1) break; // the first slot with primitive ID -1 ends the stored quads
+        const unsigned geomId = geomID(i);
+        const unsigned primId = primID(i);
+        const QuadMesh::Quad& quad = mesh->quad(primId);
+        const Vec3fa p0 = mesh->vertex(quad.v[0]);
+        const Vec3fa p1 = mesh->vertex(quad.v[1]);
+        const Vec3fa p2 = mesh->vertex(quad.v[2]);
+        const Vec3fa p3 = mesh->vertex(quad.v[3]);
+        bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2),BBox3fa(p3)));
+        vgeomID [i] = geomId;
+        vprimID [i] = primId;
+        v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+        v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+        v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+        v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z;
+      }
+      new (this) QuadMv(v0,v1,v2,v3,vgeomID,vprimID); // rebuild in place from the locals only
+      return bounds;
+    }
+   
+  public:
+    Vec3vf<M> v0;      // 1st vertex of the quads
+    Vec3vf<M> v1;      // 2nd vertex of the quads
+    Vec3vf<M> v2;      // 3rd vertex of the quads
+    Vec3vf<M> v3;      // 4th vertex of the quads
+  private:
+    vuint<M> geomIDs; // geometry ID
+    vuint<M> primIDs; // primitive ID
+  };
+
+  template<int M>
+  typename QuadMv<M>::Type QuadMv<M>::type;
+
+  typedef QuadMv<4> Quad4v;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadv_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/quadv_intersector.h
new file mode 100644
index 0000000000..30a24b291a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quadv_intersector.h
@@ -0,0 +1,181 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "quadv.h"
+#include "quad_intersector_moeller.h"
+#include "quad_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Intersects M quads with 1 ray; forwards to QuadMIntersector1MoellerTrumbore using the vertices stored in the QuadMv block */
+    template<int M, bool filter>
+    struct QuadMvIntersector1Moeller
+    {
+      typedef QuadMv<M> Primitive;
+      typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations;
+        
+      /*! Intersect a ray with the M quads and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect(ray,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+        
+      /*! Test if the ray is occluded by one of M quads; returns the result of pre.occluded(). */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.occluded(ray,context, quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) // forwards to PrimitivePointQuery1<Primitive>
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+      }
+    };
+
+    /*! Intersects M quads with K rays; forwards to QuadMIntersectorKMoellerTrumbore using the vertices stored in the QuadMv block */
+    template<int M, int K, bool filter>
+    struct QuadMvIntersectorKMoeller
+    {
+      typedef QuadMv<M> Primitive;
+      typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations;
+
+      /*! Intersects K rays with M quads, one quad at a time, stopping at the first invalid quad. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        for (size_t i=0; i<QuadMv<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i); // vertices of quad i as K-wide vectors
+          const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i);
+          const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i);
+          const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i);
+          pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M quads; returns !valid0. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        vbool<K> valid0 = valid_i; // working copy of the entry mask; presumably updated through the epilog below - TODO confirm
+
+        for (size_t i=0; i<QuadMv<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i); // vertices of quad i as K-wide vectors
+          const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i);
+          const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i);
+          const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i);
+          if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+            break; // early out as soon as intersectK() reports true
+        }
+        return !valid0;
+      }
+      
+      /*! Intersect ray k of the packet with the M quads and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if ray k of the packet is occluded by one of the M quads. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.occluded1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+    };
+
+    /*! Intersects M quads with 1 ray; forwards to QuadMIntersector1Pluecker using the vertices stored in the QuadMv block */
+    template<int M, bool filter>
+    struct QuadMvIntersector1Pluecker
+    {
+      typedef QuadMv<M> Primitive;
+      typedef QuadMIntersector1Pluecker<M,filter> Precalculations;
+        
+      /*! Intersect a ray with the M quads and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect(ray,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+        
+      /*! Test if the ray is occluded by one of M quads; returns the result of pre.occluded(). */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.occluded(ray,context, quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) // forwards to PrimitivePointQuery1<Primitive>
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+      }
+    };
+
+    /*! Intersects M quads with K rays; forwards to QuadMIntersectorKPluecker using the vertices stored in the QuadMv block */
+    template<int M, int K, bool filter>
+    struct QuadMvIntersectorKPluecker
+    {
+      typedef QuadMv<M> Primitive;
+      typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations;
+
+      /*! Intersects K rays with M quads, one quad at a time, stopping at the first invalid quad. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        for (size_t i=0; i<QuadMv<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i); // vertices of quad i as K-wide vectors
+          const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i);
+          const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i);
+          const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i);
+          pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M quads; returns !valid0. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        vbool<K> valid0 = valid_i; // working copy of the entry mask; presumably updated through the epilog below - TODO confirm
+
+        for (size_t i=0; i<QuadMv<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i); // vertices of quad i as K-wide vectors
+          const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i);
+          const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i);
+          const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i);
+          if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+            break; // early out as soon as intersectK() reports true
+        }
+        return !valid0;
+      }
+      
+      /*! Intersect ray k of the packet with the M quads and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if ray k of the packet is occluded by one of the M quads. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.occluded1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+    };
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/geometry/roundline_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/roundline_intersector.h
new file mode 100644
index 0000000000..cdf68f486b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/roundline_intersector.h
@@ -0,0 +1,710 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+
+
+/*
+  
+  This file implements the intersection of a ray with a round linear
+  curve segment. We define the geometry of such a round linear curve
+  segment from point p0 with radius r0 to point p1 with radius r1
+  using the cone that touches spheres p0/r0 and p1/r1 tangentially
+  plus the sphere p1/r1. We denote the tangentially touching cone from
+  p0/r0 to p1/r1 with cone(p0,r0,p1,r1) and the cone plus the ending
+  sphere with cone_sphere(p0,r0,p1,r1).
+
+  For multiple connected round linear curve segments this construction
+  yields a proper shape when viewed from the outside. Using the
+  following CSG we can also handle the interior in most common cases:
+
+     round_linear_curve(pl,rl,p0,r0,p1,r1,pr,rr) =
+       cone_sphere(p0,r0,p1,r1) - cone(pl,rl,p0,r0) - cone(p1,r1,pr,rr)
+
+  Thus by subtracting the neighboring cone geometries, we cut away
+  parts of the center cone_sphere surface which lie inside the
+  combined curve. This approach works as long as geometry of the
+  current cone_sphere penetrates into direct neighbor segments only,
+  and not into segments further away.
+  
+  To construct a cone that touches two spheres at p0 and p1 with radii
+  r0 and r1, one has to increase the cone radii at p0 and p1 to obtain
+  larger radii w0 and w1, such that the infinite cone properly touches
+  the spheres.  From the paper "Ray Tracing Generalized Tube
+  Primitives: Method and Applications"
+  (https://www.researchgate.net/publication/334378683_Ray_Tracing_Generalized_Tube_Primitives_Method_and_Applications)
+  one can derive the following equations for these increased
+  radii (with dr = r1-r0):
+
+     sr = 1.0f / sqrt(1-sqr(dr)/sqr(p1-p0))
+     w0 = sr*r0
+     w1 = sr*r1
+
+  Further, we want the cone to start where it touches the sphere at p0
+  and to end where it touches sphere at p1.  Therefore, we need to
+  construct clipping locations y0 and y1 for the start and end of the
+  cone. These start and end clipping location of the cone can get
+  calculated as:
+
+     Y0 =               - r0 * (r1-r0) / length(p1-p0)
+     Y1 = length(p1-p0) - r1 * (r1-r0) / length(p1-p0)
+
+  Here the cone starts at distance Y0 and ends at distance Y1 from
+  point p0, measured along the cone center line. The distance
+  dY = Y1-Y0 between the two clipping locations can be calculated as:
+
+    dY = length(p1-p0) - (r1-r0)^2 / length(p1-p0)
+
+  In the code below, Y will always be scaled by length(p1-p0) to
+  obtain y and you will find the terms r0*(r1-r0) and
+  (p1-p0)^2-(r1-r0)^2.
+
+ */
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+      struct RoundLineIntersectorHitM // hit record for M lanes: u, v, t and geometry normal, each stored per lane
+      {
+        __forceinline RoundLineIntersectorHitM() {} // default constructor; sets no member explicitly
+        
+        __forceinline RoundLineIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng)
+          : vu(u), vv(v), vt(t), vNg(Ng) {}
+	
+        __forceinline void finalize() {} // intentionally empty: nothing is post-processed for this hit type
+	
+        __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } // (u,v) of lane i
+        __forceinline float t  (const size_t i) const { return vt[i]; } // t of lane i
+        __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } // geometry normal of lane i
+	
+      public:
+        vfloat<M> vu;  // u coordinate per lane
+        vfloat<M> vv;  // v coordinate per lane
+        vfloat<M> vt;  // t value per lane
+        Vec3vf<M> vNg; // geometry normal per lane
+      };
+    
+    namespace __roundline_internal
+    {
+      template<int M>
+        struct ConeGeometry // precomputed per-lane terms of the cone between spheres p0/r0 and p1/r1, plus point-classification tests
+        {
+          ConeGeometry (const Vec4vf<M>& a, const Vec4vf<M>& b) // a = (p0,r0), b = (p1,r1): xyz is the sphere center, w the radius
+          : p0(a.xyz()), p1(b.xyz()), dP(p1-p0), dPdP(dot(dP,dP)), r0(a.w), sqr_r0(sqr(r0)), r1(b.w), dr(r1-r0), drdr(dr*dr), r0dr (r0*dr), g(dPdP - drdr) {}
+          
+          /* 
+             
+             This function tests if a point is accepted by the first cone
+             clipping plane.
+
+             First, we need to project the point onto the line p0->p1:
+             
+               Y = (p-p0)*(p1-p0)/length(p1-p0)
+             
+             This value Y is the distance to the projection point from
+             p0. The clip distances are calculated as:
+             
+               Y0 =               - r0 * (r1-r0) / length(p1-p0)
+               Y1 = length(p1-p0) - r1 * (r1-r0) / length(p1-p0)
+             
+             Thus to test if the point p is accepted by the first
+             clipping plane we need to test Y > Y0 and to test if it
+             is accepted by the second clipping plane we need to test
+             Y < Y1.
+             
+             By multiplying the calculations with length(p1-p0) these
+             calculations can get simplified to:
+             
+               y = (p-p0)*(p1-p0)
+               y0 =           - r0 * (r1-r0)
+               y1 = (p1-p0)^2 - r1 * (r1-r0)
+
+             and the test y > y0 and y < y1.
+             Only y > y0 is tested below; lanes with p0.x or p1.x equal to inf, or not set in valid_i, yield false.
+          */
+          
+          __forceinline vbool<M> isClippedByPlane (const vbool<M>& valid_i, const Vec3vf<M>& p) const
+          {
+            const Vec3vf<M> p0p = p - p0;
+            const vfloat<M> y = dot(p0p,dP);
+            const vfloat<M> cap0 = -r0dr; // y0 from the comment above
+            const vbool<M> inside_cone = y > cap0;
+            return valid_i & (p0.x != vfloat<M>(inf)) & (p1.x != vfloat<M>(inf)) & inside_cone;
+          }
+          
+          /* 
+             
+             This function tests whether a point lies inside the capped cone
+             tangential to its ending spheres.
+
+             Therefore one has to check if the point is inside the
+             region defined by the cone clipping planes, which is
+             performed similarly to the previous function.
+             
+             To perform the inside cone test we need to project the
+             point onto the line p0->p1:
+             
+               dP = p1-p0
+               Y = (p-p0)*dP/length(dP)
+                           
+             This value Y is the distance to the projection point from
+             p0. To obtain a parameter value u going from 0 to 1 along
+             the line p0->p1 we calculate:
+             
+               U = Y/length(dP)
+             
+             The radii to use at points p0 and p1 are:
+             
+               w0 = sr * r0
+               w1 = sr * r1
+               dw = w1-w0
+             
+             Using these radii and u one can directly test if the point
+             lies inside the cone using the formula py*py < wy*wy with:
+             
+               wy = w0 + u*dw
+               py = p0 + u*dP - p
+                          
+             By multiplying the calculations with length(p1-p0) and
+             inserting the definition of w one can obtain simpler equations:
+             
+               y = (p-p0)*dP
+               ry = r0 + y/dP^2 * dr
+               wy = sr*ry        
+               py = p0 + y/dP^2*dP - p
+               y0 =      - r0 * dr
+               y1 = dP^2 - r1 * dr
+             
+             Thus for the in-cone test we get:
+             
+                    py^2 < wy^2
+               <=>  py^2 < sr^2 * ry^2
+               <=>  py^2 * ( dP^2 - dr^2 ) < dP^2 * ry^2
+             
+             This can further get simplified to:
+             
+               (p0-p)^2 * (dP^2 - dr^2) - y^2 < dP^2 * r0^2 + 2.0f*r0*dr*y;            
+                      
+          */
+          
+          __forceinline vbool<M> isInsideCappedCone (const vbool<M>& valid_i, const Vec3vf<M>& p) const
+          {
+            const Vec3vf<M> p0p = p - p0;
+            const vfloat<M> y = dot(p0p,dP);
+            const vfloat<M> cap0 = -r0dr+vfloat<M>(ulp); // y0, shifted by ulp
+            const vfloat<M> cap1 = -r1*dr + dPdP;        // y1
+            
+            vbool<M> inside_cone = valid_i & (p0.x != vfloat<M>(inf)) & (p1.x != vfloat<M>(inf));
+            inside_cone &= y > cap0;  // start clipping plane
+            inside_cone &= y < cap1;  // end clipping plane 
+            inside_cone &= sqr(p0p)*g - sqr(y) < dPdP * sqr_r0 + 2.0f*r0dr*y; // in cone test
+            return inside_cone;
+          }
+          
+        protected:
+          Vec3vf<M> p0;     // center of the start sphere (a.xyz())
+          Vec3vf<M> p1;     // center of the end sphere (b.xyz())
+          Vec3vf<M> dP;     // p1-p0
+          vfloat<M> dPdP;   // dot(dP,dP)
+          vfloat<M> r0;     // radius of the start sphere (a.w)
+          vfloat<M> sqr_r0; // r0^2
+          vfloat<M> r1;     // radius of the end sphere (b.w)
+          vfloat<M> dr;     // r1-r0
+          vfloat<M> drdr;   // dr^2
+          vfloat<M> r0dr;   // r0*dr
+          vfloat<M> g;      // dPdP - drdr
+        };
+      
+      template<int M>
+        struct ConeGeometryIntersector : public ConeGeometry<M>
+      {
+        using ConeGeometry<M>::p0;
+        using ConeGeometry<M>::p1;
+        using ConeGeometry<M>::dP;
+        using ConeGeometry<M>::dPdP;
+        using ConeGeometry<M>::r0;
+        using ConeGeometry<M>::sqr_r0;
+        using ConeGeometry<M>::r1;
+        using ConeGeometry<M>::dr;
+        using ConeGeometry<M>::r0dr;
+        using ConeGeometry<M>::g;
+        
+        ConeGeometryIntersector (const Vec3vf<M>& ray_org, const Vec3vf<M>& ray_dir, const vfloat<M>& dOdO, const vfloat<M>& rcp_dOdO, const Vec4vf<M>& a, const Vec4vf<M>& b)
+          : ConeGeometry<M>(a,b), org(ray_org), O(ray_org-p0), dO(ray_dir),  dOdO(dOdO), rcp_dOdO(rcp_dOdO), OdP(dot(dP,O)), dOdP(dot(dP,dO)),  yp(OdP + r0dr) {} // O is the ray origin relative to p0; yp = O*dP + r0*dr is reused by the cone and sphere tests; dOdO and rcp_dOdO are stored as passed in
+        
+        /*
+          
+          This function intersects a ray with a cone that touches a
+          start sphere p0/r0 and end sphere p1/r1.
+          
+          To find this ray/cone intersections one could just
+          calculate radii w0 and w1 as described above and use a
+          standard ray/cone intersection routine with these
+          radii. However, it turns out that calculations can get
+          simplified when deriving a specialized ray/cone
+          intersection for this special case. We perform
+          calculations relative to the cone origin p0 and define:
+            
+            O  = ray_org - p0
+            dO = ray_dir
+            dP = p1-p0
+            dr = r1-r0
+            dw = w1-w0
+            
+          For some t we can compute the potential hit point h = O + t*dO and
+          project it onto the cone vector dP to obtain u = (h*dP)/(dP*dP). In
+          case of an intersection, the squared distance from the hit point
+          projected onto the cone center line to the hit point should be equal
+          to the squared cone radius at u:
+            
+            (u*dP - h)^2 = (w0 + u*dw)^2
+           
+          Inserting the definition of h, u, w0, and dw into this formula, then
+          factoring out all terms, and sorting by t^2, t^1, and t^0 terms
+          yields a quadratic equation to solve.
+            
+          Inserting u:
+            ( (h*dP)*dP/dP^2 - h )^2 = ( w0 + (h*dP)*dw/dP^2 )^2
+            
+          Multiplying by dP^4:
+            ( (h*dP)*dP - h*dP^2 )^2 = ( w0*dP^2 + (h*dP)*dw )^2
+            
+          Inserting w0 and dw:
+            ( (h*dP)*dP - h*dP^2 )^2 = ( r0*dP^2 + (h*dP)*dr )^2 / (1-dr^2/dP^2)
+            ( (h*dP)*dP - h*dP^2 )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + (h*dP)*dr )^2
+            
+          Now one can insert the definition of h, factor out, and presort by t:
+            ( ((O + t*dO)*dP)*dP - (O + t*dO)*dP^2 )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + ((O + t*dO)*dP)*dr )^2
+            ( (O*dP)*dP-O*dP^2 + t*( (dO*dP)*dP - dO*dP^2 ) )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + (O*dP)*dr + t*(dO*dP)*dr )^2
+            
+          Factoring out further and sorting by t^2, t^1 and t^0 yields:
+            
+            0 =   t^2 * [ ((dO*dP)*dP - dO*dP^2)^2 * (dP^2 - dr^2) - dP^2*(dO*dP)^2*dr^2 ]
+              + 2*t^1 * [ ((O*dP)*dP - O*dP^2) * ((dO*dP)*dP - dO*dP^2) * (dP^2 - dr^2) - dP^2*(r0*dP^2 + (O*dP)*dr)*(dO*dP)*dr ]
+              +   t^0 * [ ( (O*dP)*dP - O*dP^2)^2 * (dP^2-dr^2) - dP^2*(r0*dP^2 + (O*dP)*dr)^2 ]
+            
+          This can be simplified to:
+            
+             0 =   t^2 * [ (dP^2 - dr^2)*dO^2 - (dO*dP)^2 ]
+               + 2*t^1 * [ (dP^2 - dr^2)*(O*dO) - (dO*dP)*(O*dP + r0*dr) ]
+               +   t^0 * [ (dP^2 - dr^2)*O^2 - (O*dP)^2 - r0^2*dP^2 - 2.0f*r0*dr*(O*dP) ]
+            
+          Solving this quadratic equation yields the values for t at which the
+          ray intersects the cone.
+          
+        */
+        
+        __forceinline bool intersectCone(vbool<M>& valid, vfloat<M>& lower, vfloat<M>& upper) // narrows valid; returns false when no lane remains valid
+        {
+          /* return no hit by default */
+          lower = pos_inf;
+          upper = neg_inf;
+          
+          /* compute quadratic equation A*t^2 + B*t + C = 0 */
+          const vfloat<M> OO = dot(O,O);
+          const vfloat<M> OdO = dot(dO,O);
+          const vfloat<M> A = g * dOdO - sqr(dOdP);
+          const vfloat<M> B = 2.0f * (g*OdO - dOdP*yp);
+          const vfloat<M> C = g*OO - sqr(OdP) - sqr_r0*dPdP - 2.0f*r0dr*OdP;
+          
+          /* we miss the cone if the discriminant is smaller than zero */
+          const vfloat<M> D = B*B - 4.0f*A*C;
+          valid &= (D >= 0.0f & g > 0.0f);  // if g <= 0 then the cone is inside a sphere end
+          
+          /* When rays are parallel to the cone surface, then the
+           * ray may be inside or outside the cone. We just assume a
+           * miss in that case, which is fine as rays inside the
+           * cone would anyway hit the ending spheres in that
+           * case. */
+          valid &= abs(A) > min_rcp_input;
+          if (unlikely(none(valid))) {
+            return false;
+          }
+          
+          /* compute distance to front and back hit; a hit is kept only if its y value lies in (-ulp, g] */
+          const vfloat<M> Q = sqrt(D);
+          const vfloat<M> rcp_2A = rcp(2.0f*A);
+          t_cone_front = (-B-Q)*rcp_2A;
+          y_cone_front = yp + t_cone_front*dOdP;
+          lower = select( (y_cone_front > -(float)ulp) & (y_cone_front <= g) & (g > 0.0f), t_cone_front, vfloat<M>(pos_inf));
+#if !defined (EMBREE_BACKFACE_CULLING_CURVES)
+          t_cone_back = (-B+Q)*rcp_2A;
+          y_cone_back  = yp + t_cone_back *dOdP;
+          upper = select( (y_cone_back  > -(float)ulp) & (y_cone_back  <= g) & (g > 0.0f), t_cone_back , vfloat<M>(neg_inf));
+#endif          
+          return true;
+        }
+        
+        /* 
+           This function intersects the ray with the end sphere at
+           p1. We already clip away hits that are inside the
+           neighboring cone segment coneR. Rejected hits are reported
+           as lower = +inf and upper = -inf.
+        */
+        
+        __forceinline void intersectEndSphere(vbool<M>& valid, 
+                                              const ConeGeometry<M>& coneR, 
+                                              vfloat<M>& lower, vfloat<M>& upper)
+        {
+          /* calculate front and back hit with end sphere (center p1, radius r1); h2 < 0 means no real solution */
+          const Vec3vf<M> O1 = org - p1;
+          const vfloat<M> O1dO = dot(O1,dO);
+          const vfloat<M> h2 = sqr(O1dO) - dOdO*(sqr(O1) - sqr(r1));
+          const vfloat<M> rhs1 = select( h2 >= 0.0f, sqrt(h2), vfloat<M>(neg_inf) );
+          
+          /* clip away front hit if it is inside next cone segment; NOTE(review): valid only enters through the negated isClippedByPlane() term, unlike intersectBeginSphere - confirm callers mask the result */
+          t_sph1_front = (-O1dO - rhs1)*rcp_dOdO;
+          const Vec3vf<M> hit_front = org + t_sph1_front*dO;
+          vbool<M> valid_sph1_front = h2 >= 0.0f & yp + t_sph1_front*dOdP > g & !coneR.isClippedByPlane (valid, hit_front);
+          lower = select(valid_sph1_front, t_sph1_front, vfloat<M>(pos_inf));
+          
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+          /* clip away back hit if it is inside next cone segment */
+          t_sph1_back  = (-O1dO + rhs1)*rcp_dOdO;
+          const Vec3vf<M> hit_back = org + t_sph1_back*dO;
+          vbool<M> valid_sph1_back  = h2 >= 0.0f & yp + t_sph1_back*dOdP > g & !coneR.isClippedByPlane (valid, hit_back);
+          upper = select(valid_sph1_back, t_sph1_back,  vfloat<M>(neg_inf));
+#else
+          upper = vfloat<M>(neg_inf);
+#endif
+        }
+
+        __forceinline void intersectBeginSphere(const vbool<M>& valid, 
+                                                vfloat<M>& lower, vfloat<M>& upper)
+        {
+          /* calculate front and back hit with begin sphere (center p0, radius r0); h2 < 0 means no real solution */
+          const Vec3vf<M> O1 = org - p0;
+          const vfloat<M> O1dO = dot(O1,dO);
+          const vfloat<M> h2 = sqr(O1dO) - dOdO*(sqr(O1) - sqr(r0));
+          const vfloat<M> rhs1 = select( h2 >= 0.0f, sqrt(h2), vfloat<M>(neg_inf) );
+          
+          /* keep the front hit only for valid lanes where yp + t*dOdP < 0; otherwise report +inf */
+          t_sph0_front = (-O1dO - rhs1)*rcp_dOdO;
+          vbool<M> valid_sph1_front = valid & h2 >= 0.0f & yp + t_sph0_front*dOdP < 0;
+          lower = select(valid_sph1_front, t_sph0_front, vfloat<M>(pos_inf));
+
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+          /* keep the back hit only for valid lanes where yp + t*dOdP < 0; otherwise report -inf */
+          t_sph0_back  = (-O1dO + rhs1)*rcp_dOdO;
+          vbool<M> valid_sph1_back  = valid & h2 >= 0.0f & yp + t_sph0_back*dOdP < 0;
+          upper = select(valid_sph1_back, t_sph0_back,  vfloat<M>(neg_inf));
+#else   
+          upper = vfloat<M>(neg_inf);
+#endif
+        }
+        
+        /* 
+           
+           This function calculates the geometry normal of some cone hit.
+           
+           For a given hit point h (relative to p0) with a cone
+           starting at p0 with radius w0 and ending at p1 with
+           radius w1 one normally calculates the geometry normal by
+           first calculating the parametric u hit location along the
+           cone:
+           
+             u = dot(h,dP)/dP^2
+           
+           Using this value one can now directly calculate the
+           geometry normal by bending the connection vector (h-u*dP)
+           from hit to projected hit with some cone dependent value
+           dw/sqrt(dP^2) * normalize(dP):
+           
+             Ng = normalize(h-u*dP) - dw/length(dP) * normalize(dP)
+           
+           The length of the vector (h-u*dP) can also get calculated
+           by interpolating the radii as w0+u*dw which yields:
+           
+             Ng = (h-u*dP)/(w0+u*dw) - dw/dP^2 * dP
+           
+           Multiplying with (w0+u*dw) yields a scaled Ng':
+           
+             Ng' = (h-u*dP) - (w0+u*dw)*dw/dP^2*dP
+           
+           Inserting the definition of w0 and dw and refactoring
+           yields a further scaled Ng'':
+           
+             Ng'' = (dP^2 - dr^2) (h-q) - (r0+u*dr)*dr*dP
+           
+           Now inserting the definition of u and multiplying
+           with the denominator yields:
+           
+             Ng''' = (dP^2-dr^2)*(dP^2*h-dot(h,dP)*dP) - (dP^2*r0+dot(h,dP)*dr)*dr*dP
+           
+           Factoring out, cancelling terms, dividing by dP^2, and
+           factoring again yields finally:
+           
+             Ng'''' = (dP^2-dr^2)*h - dP*(dot(h,dP) + r0*dr)
+           
+        */
+        
+        __forceinline Vec3vf<M> Ng_cone(const vbool<M>& front_hit) const
+        { /* scaled (unnormalized) cone normal g*h - dP*y; front_hit picks front or back hit data per lane */
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+          const vfloat<M> tHit = select(front_hit, t_cone_front, t_cone_back);
+          const vfloat<M> yHit = select(front_hit, y_cone_front, y_cone_back);
+#else
+          const vfloat<M> tHit = t_cone_front;
+          const vfloat<M> yHit = y_cone_front;
+#endif
+          const Vec3vf<M> hitRel = O + tHit*dO;
+          return g*hitRel-dP*yHit;
+        }
+        
+        /* compute geometry normal of sphere hit as the difference
+         * vector from sphere center to hit point */
+        
+        __forceinline Vec3vf<M> Ng_sphere1(const vbool<M>& front_hit) const
+        { /* normal on the sphere around p1: hit point minus p1 (not normalized) */
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+          const vfloat<M> tHit = select(front_hit, t_sph1_front, t_sph1_back);
+#else 
+          const vfloat<M> tHit = t_sph1_front;
+#endif
+          return org+tHit*dO-p1;
+        }
+
+        __forceinline Vec3vf<M> Ng_sphere0(const vbool<M>& front_hit) const
+        { /* normal on the sphere around p0: hit point minus p0 (not normalized) */
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+          const vfloat<M> tHit = select(front_hit, t_sph0_front, t_sph0_back);
+#else
+          const vfloat<M> tHit = t_sph0_front;
+#endif
+          return org+tHit*dO-p0;
+        }
+        
+        /* 
+           This function calculates the u coordinate of a
+           hit. Therefore we use the hit distance y (which is zero
+           at the first cone clipping plane) and divide by distance
+           g between the clipping planes.
+           
+        */
+        
+        __forceinline vfloat<M> u_cone(const vbool<M>& front_hit) const
+        { /* u = y/g passed through clamp(); front_hit picks the front or back y per lane */
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+          const vfloat<M> yHit = select(front_hit, y_cone_front, y_cone_back);
+#else
+          const vfloat<M> yHit = y_cone_front;
+#endif
+          return clamp(yHit*rcp(g));
+        }
+        
+      private:
+        Vec3vf<M> org;        // ray origin used by the sphere tests (hit point = org + t*dO)
+        Vec3vf<M> O;          // base point of cone hits (h = O + t*dO); presumably org relative to p0 - confirm in constructor
+        Vec3vf<M> dO;         // ray direction
+        vfloat<M> dOdO;       // presumably dot(dO,dO) - confirm in constructor
+        vfloat<M> rcp_dOdO;   // presumably 1/dOdO - confirm in constructor
+        vfloat<M> OdP;        // presumably dot(O,dP) - confirm in constructor
+        vfloat<M> dOdP;       // presumably dot(dO,dP); scales t in the tests yp + t*dOdP
+        
+        /* for ray/cone intersection */
+      private:
+        vfloat<M> yp;            // offset at t = 0 in the tests yp + t*dOdP
+        vfloat<M> y_cone_front;  // y of the front cone hit, read by Ng_cone() and u_cone()
+        vfloat<M> t_cone_front;  // ray parameter of the front cone hit, read by Ng_cone()
+#if !defined (EMBREE_BACKFACE_CULLING_CURVES)
+        vfloat<M> y_cone_back;   // back-hit counterparts; compiled out with backface culling
+        vfloat<M> t_cone_back;
+#endif
+        
+        /* for ray/sphere intersection */
+      private:
+        vfloat<M> t_sph1_front;  // front hit with the sphere around p1, read by Ng_sphere1()
+        vfloat<M> t_sph0_front;  // front hit with the sphere around p0, written by intersectBeginSphere()
+#if !defined (EMBREE_BACKFACE_CULLING_CURVES)
+        vfloat<M> t_sph1_back;   // back hits of the two spheres; compiled out with backface culling
+        vfloat<M> t_sph0_back;
+#endif
+      };
+      /* Intersects a ray with the union of the cone v0->v1 and its end sphere(s), ignoring cone/sphere hits that fall inside the
+         neighboring cones (v0,vL) and (v1,vR). Candidate hits go to epilog; returns the OR of the epilog results (false if none). */
+      template<int M, typename Epilog, typename ray_tfar_func>
+        static __forceinline bool intersectConeSphere(const vbool<M>& valid_i,
+                                                      const Vec3vf<M>& ray_org_in, const Vec3vf<M>& ray_dir, 
+                                                      const vfloat<M>& ray_tnear, const ray_tfar_func& ray_tfar,
+                                                      const Vec4vf<M>& v0, const Vec4vf<M>& v1,
+                                                      const Vec4vf<M>& vL, const Vec4vf<M>& vR,
+                                                      const Epilog& epilog)
+      {         
+        vbool<M> valid = valid_i; /* working lane mask, narrowed as lanes are rejected */
+        
+        /* move ray origin closer to make calculations numerically stable */
+        const vfloat<M> dOdO = sqr(ray_dir);
+        const vfloat<M> rcp_dOdO = rcp(dOdO);
+        const Vec3vf<M> center = vfloat<M>(0.5f)*(v0.xyz()+v1.xyz());
+        const vfloat<M> dt = dot(center-ray_org_in,ray_dir)*rcp_dOdO; /* ray parameter of the point closest to the segment midpoint */
+        const Vec3vf<M> ray_org = ray_org_in + dt*ray_dir; /* all t values below are relative to this origin; dt is added back when comparing/reporting */
+        
+        /* intersect with cone from v0 to v1 */
+        vfloat<M> t_cone_lower, t_cone_upper;
+        ConeGeometryIntersector<M> cone (ray_org, ray_dir, dOdO, rcp_dOdO, v0, v1);
+        vbool<M> validCone = valid;
+        cone.intersectCone(validCone, t_cone_lower, t_cone_upper);
+
+        valid &= (validCone | (cone.g <= 0.0f));  // if cone is entirely in sphere end - check sphere
+        if (unlikely(none(valid)))
+          return false;
+        
+        /* cone hits inside the neighboring capped cones are inside the geometry and thus ignored */
+        const ConeGeometry<M> coneL (v0, vL);
+        const ConeGeometry<M> coneR (v1, vR);
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+        const Vec3vf<M> hit_lower = ray_org + t_cone_lower*ray_dir;
+        const Vec3vf<M> hit_upper = ray_org + t_cone_upper*ray_dir;
+        t_cone_lower = select (!coneL.isInsideCappedCone (validCone, hit_lower) & !coneR.isInsideCappedCone (validCone, hit_lower), t_cone_lower, vfloat<M>(pos_inf));
+        t_cone_upper = select (!coneL.isInsideCappedCone (validCone, hit_upper) & !coneR.isInsideCappedCone (validCone, hit_upper), t_cone_upper, vfloat<M>(neg_inf));
+#endif
+
+        /* intersect ending sphere */
+        vfloat<M> t_sph1_lower, t_sph1_upper;
+        vfloat<M> t_sph0_lower = vfloat<M>(pos_inf); /* +inf/-inf mean "no hit" and drop out of the min/max below */
+        vfloat<M> t_sph0_upper = vfloat<M>(neg_inf);
+        cone.intersectEndSphere(valid, coneR, t_sph1_lower, t_sph1_upper);
+
+        const vbool<M> isBeginPoint = valid & (vL[0] == vfloat<M>(pos_inf)); /* vL[0] == +inf is treated as the marker for a begin point */
+        if (unlikely(any(isBeginPoint))) {
+          cone.intersectBeginSphere (isBeginPoint, t_sph0_lower, t_sph0_upper);
+        }
+        
+        /* CSG union of cone and end sphere */
+        vfloat<M> t_sph_lower = min(t_sph0_lower, t_sph1_lower);
+        vfloat<M> t_cone_sphere_lower = min(t_cone_lower, t_sph_lower);
+#if !defined (EMBREE_BACKFACE_CULLING_CURVES)
+        vfloat<M> t_sph_upper = max(t_sph0_upper, t_sph1_upper);
+        vfloat<M> t_cone_sphere_upper = max(t_cone_upper, t_sph_upper);
+        
+        /* filter out hits that are not in tnear/tfar range */
+        const vbool<M> valid_lower = valid & ray_tnear <= dt+t_cone_sphere_lower & dt+t_cone_sphere_lower <= ray_tfar() & t_cone_sphere_lower != vfloat<M>(pos_inf);
+        const vbool<M> valid_upper = valid & ray_tnear <= dt+t_cone_sphere_upper & dt+t_cone_sphere_upper <= ray_tfar() & t_cone_sphere_upper != vfloat<M>(neg_inf);
+        
+        /* check if there is a first hit */
+        const vbool<M> valid_first = valid_lower | valid_upper;
+        if (unlikely(none(valid_first)))
+          return false;
+        
+        /* construct first hit */
+        const vfloat<M> t_first = select(valid_lower, t_cone_sphere_lower, t_cone_sphere_upper); /* lower hit where usable, otherwise the upper hit */
+        const vbool<M> cone_hit_first = t_first == t_cone_lower | t_first == t_cone_upper; /* exact equality works: t_first was picked from these values via min/max/select */
+        const vbool<M> sph0_hit_first = t_first == t_sph0_lower | t_first == t_sph0_upper;
+        const Vec3vf<M> Ng_first = select(cone_hit_first, cone.Ng_cone(valid_lower), select (sph0_hit_first, cone.Ng_sphere0(valid_lower), cone.Ng_sphere1(valid_lower)));
+        const vfloat<M> u_first  = select(cone_hit_first, cone.u_cone(valid_lower), select (sph0_hit_first, vfloat<M>(zero), vfloat<M>(one)));
+
+        /* invoke intersection filter for first hit */
+        RoundLineIntersectorHitM<M> hit(u_first,zero,dt+t_first,Ng_first);
+        const bool is_hit_first = epilog(valid_first, hit);
+        
+        /* check for possible second hits before potentially accepted hit */
+        const vfloat<M> t_second = t_cone_sphere_upper;
+        const vbool<M> valid_second = valid_lower & valid_upper & (dt+t_cone_sphere_upper <= ray_tfar()); /* ray_tfar() is evaluated again after the first epilog call */
+        if (unlikely(none(valid_second)))
+          return is_hit_first;
+        
+        /* invoke intersection filter for second hit */
+        const vbool<M> cone_hit_second = t_second == t_cone_lower | t_second == t_cone_upper;
+        const vbool<M> sph0_hit_second = t_second == t_sph0_lower | t_second == t_sph0_upper;
+        const Vec3vf<M> Ng_second = select(cone_hit_second, cone.Ng_cone(false), select (sph0_hit_second, cone.Ng_sphere0(false), cone.Ng_sphere1(false))); /* false selects the back-hit data */
+        const vfloat<M> u_second  = select(cone_hit_second, cone.u_cone(false), select (sph0_hit_second, vfloat<M>(zero), vfloat<M>(one)));
+
+        hit = RoundLineIntersectorHitM<M>(u_second,zero,dt+t_second,Ng_second);
+        const bool is_hit_second = epilog(valid_second, hit);
+        
+        return is_hit_first | is_hit_second;
+#else
+        /* filter out hits that are not in tnear/tfar range */
+        const vbool<M> valid_lower = valid & ray_tnear <= dt+t_cone_sphere_lower & dt+t_cone_sphere_lower <= ray_tfar() & t_cone_sphere_lower != vfloat<M>(pos_inf);
+        
+        /* check if there is a valid hit */
+        if (unlikely(none(valid_lower)))
+          return false;
+        
+        /* construct first hit */
+        const vbool<M> cone_hit_first = t_cone_sphere_lower == t_cone_lower | t_cone_sphere_lower == t_cone_upper;
+        const vbool<M> sph0_hit_first = t_cone_sphere_lower == t_sph0_lower | t_cone_sphere_lower == t_sph0_upper;
+        const Vec3vf<M> Ng_first = select(cone_hit_first, cone.Ng_cone(valid_lower), select (sph0_hit_first, cone.Ng_sphere0(valid_lower), cone.Ng_sphere1(valid_lower)));
+        const vfloat<M> u_first  = select(cone_hit_first, cone.u_cone(valid_lower), select (sph0_hit_first, vfloat<M>(zero), vfloat<M>(one)));
+
+        /* invoke intersection filter for first hit */
+        RoundLineIntersectorHitM<M> hit(u_first,zero,dt+t_cone_sphere_lower,Ng_first);
+        const bool is_hit_first = epilog(valid_lower, hit);
+        
+        return is_hit_first; /* with backface culling only the front (lower) hit is reported */
+#endif
+      }
+      
+    } // end namespace __roundline_internal
+    
+    template<int M>
+      struct RoundLinearCurveIntersector1
+      { /* tests one Ray against M round line segments by forwarding to intersectConeSphere */
+        typedef CurvePrecalculations1 Precalculations;
+        
+        struct ray_tfar { /* callable that returns the ray's tfar as it is at the time of the call */
+          Ray& ray;
+          __forceinline ray_tfar(Ray& ray) : ray(ray) {}
+          __forceinline vfloat<M> operator() () const { return ray.tfar; }
+        };
+
+        template<typename Epilog>
+        static __forceinline bool intersect(const vbool<M>& valid_i,
+                                            Ray& ray,
+                                            IntersectContext* context,
+                                            const LineSegments* geom,
+                                            const Precalculations& pre,
+                                            const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+                                            const Vec4vf<M>& vLi, const Vec4vf<M>& vRi,
+                                            const Epilog& epilog)
+        { /* pre is not used here; all four vertices go through enlargeRadiusToMinWidth first */
+          const Vec3vf<M> org(ray.org.x, ray.org.y, ray.org.z);
+          const Vec3vf<M> dir(ray.dir.x, ray.dir.y, ray.dir.z);
+          const Vec4vf<M> p0 = enlargeRadiusToMinWidth(context,geom,org,v0i);
+          const Vec4vf<M> p1 = enlargeRadiusToMinWidth(context,geom,org,v1i);
+          const Vec4vf<M> pL = enlargeRadiusToMinWidth(context,geom,org,vLi);
+          const Vec4vf<M> pR = enlargeRadiusToMinWidth(context,geom,org,vRi);
+          const vfloat<M> tnear(ray.tnear());
+          return __roundline_internal::intersectConeSphere(valid_i,org,dir,tnear,ray_tfar(ray),p0,p1,pL,pR,epilog);
+        }
+      };
+    
+    template<int M, int K>
+      struct RoundLinearCurveIntersectorK
+      { /* tests ray k of a RayK<K> packet against M round line segments via intersectConeSphere */
+        typedef CurvePrecalculationsK<K> Precalculations;
+        
+        struct ray_tfar { /* callable that returns tfar of ray k as it is at the time of the call */
+          RayK<K>& ray;
+          size_t k;
+          __forceinline ray_tfar(RayK<K>& ray, size_t k) : ray(ray), k(k) {}
+          __forceinline vfloat<M> operator() () const { return ray.tfar[k]; }
+        };
+        
+        template<typename Epilog>
+        static __forceinline bool intersect(const vbool<M>& valid_i,
+                                            RayK<K>& ray, size_t k,
+                                            IntersectContext* context,
+                                            const LineSegments* geom,
+                                            const Precalculations& pre,
+                                            const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+                                            const Vec4vf<M>& vLi, const Vec4vf<M>& vRi,
+                                            const Epilog& epilog)
+        { /* pre is not used here; all four vertices go through enlargeRadiusToMinWidth first */
+          const Vec3vf<M> org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+          const Vec3vf<M> dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+          const vfloat<M> tnear = ray.tnear()[k];
+          const Vec4vf<M> p0 = enlargeRadiusToMinWidth(context,geom,org,v0i);
+          const Vec4vf<M> p1 = enlargeRadiusToMinWidth(context,geom,org,v1i);
+          const Vec4vf<M> pL = enlargeRadiusToMinWidth(context,geom,org,vLi);
+          const Vec4vf<M> pR = enlargeRadiusToMinWidth(context,geom,org,vRi);
+          return __roundline_internal::intersectConeSphere(valid_i,org,dir,tnear,ray_tfar(ray,k),p0,p1,pL,pR,epilog);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/roundlinei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/roundlinei_intersector.h
new file mode 100644
index 0000000000..079817335e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/roundlinei_intersector.h
@@ -0,0 +1,136 @@
+// ======================================================================== //
+// Copyright 2009-2020 Intel Corporation                                    //
+//                                                                          //
+// Licensed under the Apache License, Version 2.0 (the "License");          //
+// you may not use this file except in compliance with the License.         //
+// You may obtain a copy of the License at                                  //
+//                                                                          //
+//     http://www.apache.org/licenses/LICENSE-2.0                           //
+//                                                                          //
+// Unless required by applicable law or agreed to in writing, software      //
+// distributed under the License is distributed on an "AS IS" BASIS,        //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and      //
+// limitations under the License.                                           //
+// ======================================================================== //
+
+#pragma once
+
+#include "roundline_intersector.h"
+#include "intersector_epilog.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M, int Mx, bool filter>
+    struct RoundLinearCurveMiIntersector1
+    { /* connects LineMi<M> primitives to RoundLinearCurveIntersector1 for single rays */
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      { /* hits are handed to Intersect1EpilogM; the intersector's result is not returned */
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* const segments = context->scene->get<LineSegments>(line.geomID());
+        const vbool<Mx> active = line.template valid<Mx>();
+        Vec4vf<M> p0,p1,pL,pR; line.gather(p0,p1,pL,pR,segments);
+        RoundLinearCurveIntersector1<Mx>::intersect(active,ray,context,segments,pre,p0,p1,pL,pR,Intersect1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      { /* same flow with Occluded1EpilogM; returns the intersector's result */
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* const segments = context->scene->get<LineSegments>(line.geomID());
+        const vbool<Mx> active = line.template valid<Mx>();
+        Vec4vf<M> p0,p1,pL,pR; line.gather(p0,p1,pL,pR,segments);
+        return RoundLinearCurveIntersector1<Mx>::intersect(active,ray,context,segments,pre,p0,p1,pL,pR,Occluded1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+      { /* delegates to the generic per-primitive point query */
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+      }
+    };
+
+    template<int M, int Mx, bool filter>
+    struct RoundLinearCurveMiMBIntersector1
+    { /* like RoundLinearCurveMiIntersector1, but vertices are gathered at ray.time() */
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      { /* hits are handed to Intersect1EpilogM; the intersector's result is not returned */
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* const segments = context->scene->get<LineSegments>(line.geomID());
+        const vbool<Mx> active = line.template valid<Mx>();
+        Vec4vf<M> p0,p1,pL,pR; line.gather(p0,p1,pL,pR,segments,ray.time());
+        RoundLinearCurveIntersector1<Mx>::intersect(active,ray,context,segments,pre,p0,p1,pL,pR,Intersect1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      { /* same flow with Occluded1EpilogM; returns the intersector's result */
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* const segments = context->scene->get<LineSegments>(line.geomID());
+        const vbool<Mx> active = line.template valid<Mx>();
+        Vec4vf<M> p0,p1,pL,pR; line.gather(p0,p1,pL,pR,segments,ray.time());
+        return RoundLinearCurveIntersector1<Mx>::intersect(active,ray,context,segments,pre,p0,p1,pL,pR,Occluded1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+      { /* delegates to the generic per-primitive point query */
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+      }
+    };
+
+    template<int M, int Mx, int K, bool filter>
+    struct RoundLinearCurveMiIntersectorK
+    { /* connects LineMi<M> primitives to RoundLinearCurveIntersectorK for ray k of a packet */
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      { /* hits are handed to Intersect1KEpilogM; the intersector's result is not returned */
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* const segments = context->scene->get<LineSegments>(line.geomID());
+        const vbool<Mx> active = line.template valid<Mx>();
+        Vec4vf<M> p0,p1,pL,pR; line.gather(p0,p1,pL,pR,segments);
+        RoundLinearCurveIntersectorK<Mx,K>::intersect(active,ray,k,context,segments,pre,p0,p1,pL,pR,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      { /* same flow with Occluded1KEpilogM; returns the intersector's result */
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* const segments = context->scene->get<LineSegments>(line.geomID());
+        const vbool<Mx> active = line.template valid<Mx>();
+        Vec4vf<M> p0,p1,pL,pR; line.gather(p0,p1,pL,pR,segments);
+        return RoundLinearCurveIntersectorK<Mx,K>::intersect(active,ray,k,context,segments,pre,p0,p1,pL,pR,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+    };
+
+    template<int M, int Mx, int K, bool filter>
+    struct RoundLinearCurveMiMBIntersectorK
+    { /* like RoundLinearCurveMiIntersectorK, but vertices are gathered at ray.time()[k] */
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context,  const Primitive& line)
+      { /* hits are handed to Intersect1KEpilogM; the intersector's result is not returned */
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* const segments = context->scene->get<LineSegments>(line.geomID());
+        const vbool<Mx> active = line.template valid<Mx>();
+        Vec4vf<M> p0,p1,pL,pR; line.gather(p0,p1,pL,pR,segments,ray.time()[k]);
+        RoundLinearCurveIntersectorK<Mx,K>::intersect(active,ray,k,context,segments,pre,p0,p1,pL,pR,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      { /* same flow with Occluded1KEpilogM; returns the intersector's result */
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* const segments = context->scene->get<LineSegments>(line.geomID());
+        const vbool<Mx> active = line.template valid<Mx>();
+        Vec4vf<M> p0,p1,pL,pR; line.gather(p0,p1,pL,pR,segments,ray.time()[k]);
+        return RoundLinearCurveIntersectorK<Mx,K>::intersect(active,ray,k,context,segments,pre,p0,p1,pL,pR,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/sphere_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/sphere_intersector.h
new file mode 100644
index 0000000000..3ab90c29ef
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/sphere_intersector.h
@@ -0,0 +1,183 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/scene_points.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+    struct SphereIntersectorHitM
+    { /* M-wide sphere hit record: one ray parameter and one geometry normal per lane */
+      __forceinline SphereIntersectorHitM() {}
+
+      __forceinline SphereIntersectorHitM(const vfloat<M>& t_hit, const Vec3vf<M>& normal)
+        : vt(t_hit), vNg(normal) {}
+
+      __forceinline void finalize() {} /* intentionally empty */
+
+      /* uv is always (0,0), independent of the lane index */
+      __forceinline Vec2f uv(const size_t i) const { return Vec2f(0.0f, 0.0f); }
+      /* ray parameter stored for lane i */
+      __forceinline float t(const size_t i) const { return vt[i]; }
+      /* geometry normal stored for lane i */
+      __forceinline Vec3fa Ng(const size_t i) const {
+        return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]);
+      }
+
+     public:
+      /* per-lane hit data */
+      vfloat<M> vt;
+      Vec3vf<M> vNg;
+    };
+
+    template<int M>
+    struct SphereIntersector1
+    {
+      typedef CurvePrecalculations1 Precalculations;
+
+      /* Tests one ray against M spheres (v0.xyz() = center, v0.w = radius). Up to two hits per lane
+         (front, then back) are passed to epilog; returns the OR of the epilog results. pre is unused. */
+      template<typename Epilog>
+      static __forceinline bool intersect(
+          const vbool<M>& valid_i, Ray& ray,
+          const Precalculations& pre, const Vec4vf<M>& v0, const Epilog& epilog)
+      {
+        const vfloat<M> inv_dir2 = rcp(dot(ray.dir, ray.dir));
+        const Vec3vf<M> org(ray.org.x, ray.org.y, ray.org.z);
+        const Vec3vf<M> dir(ray.dir.x, ray.dir.y, ray.dir.z);
+        const Vec3vf<M> sphere_pos = v0.xyz();
+        const vfloat<M> sphere_rad = v0.w;
+
+        /* split origin->center into its projection onto the ray and the perpendicular rest */
+        const Vec3vf<M> to_center = sphere_pos - org;
+        const vfloat<M> t_closest = dot(to_center, dir) * inv_dir2;
+        const Vec3vf<M> offset    = to_center - t_closest * dir;
+        const vfloat<M> dist2     = dot(offset, offset);
+        const vfloat<M> rad2      = sphere_rad * sphere_rad;
+        const vbool<M> active     = valid_i & (dist2 <= rad2);
+        if (unlikely(none(active)))
+          return false;
+
+        /* distance in t from the closest point on the ray to either surface crossing */
+        const vfloat<M> half_span  = sqrt((rad2 - dist2) * inv_dir2);
+        const vfloat<M> t_near_hit = t_closest - half_span;
+        const vfloat<M> t_far_hit  = t_closest + half_span;
+
+        const vbool<M> near_ok = active & (ray.tnear() <= t_near_hit) & (t_near_hit <= ray.tfar);
+        const vbool<M> far_ok  = active & (ray.tnear() <= t_far_hit ) & (t_far_hit  <= ray.tfar);
+
+        /* stop if no lane has a hit inside [tnear, tfar] */
+        const vbool<M> any_ok = near_ok | far_ok;
+        if (unlikely(none(any_ok)))
+          return false;
+
+        /* first hit: the front hit where it is in range, otherwise the back hit;
+           the normal is hit point minus center, expressed as (+/-half_span)*dir - offset */
+        const vfloat<M> t_first  = select(near_ok, t_near_hit, t_far_hit);
+        const Vec3vf<M> Ng_first = select(near_ok, -half_span, half_span) * dir - offset;
+        SphereIntersectorHitM<M> hit(t_first, Ng_first);
+
+        /* run the epilog on the first hit before looking at the second one */
+        const bool first_accepted = epilog(any_ok, hit);
+
+        /* a second hit needs both hits in range; ray.tfar is read again after the epilog call */
+        const vbool<M> second_ok = near_ok & far_ok & (t_far_hit <= ray.tfar);
+        if (unlikely(none(second_ok)))
+          return first_accepted;
+
+        /* report the back hit as well */
+        const Vec3vf<M> Ng_second = half_span * dir - offset;
+        hit = SphereIntersectorHitM<M> (t_far_hit, Ng_second);
+        const bool second_accepted = epilog(second_ok, hit);
+
+        return first_accepted | second_accepted;
+      }
+      /* Overload that first passes v0i through enlargeRadiusToMinWidth and then runs the test above. */
+      template<typename Epilog>
+      static __forceinline bool intersect(
+        const vbool<M>& valid_i, Ray& ray, IntersectContext* context, const Points* geom,
+        const Precalculations& pre, const Vec4vf<M>& v0i, const Epilog& epilog)
+      {
+        const Vec3vf<M> org(ray.org.x, ray.org.y, ray.org.z);
+        const Vec4vf<M> adjusted = enlargeRadiusToMinWidth(context,geom,org,v0i);
+        return intersect(valid_i,ray,pre,adjusted,epilog);
+      }
+    };
+
+    template<int M, int K>
+    struct SphereIntersectorK
+    {
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      /* Tests ray k of a packet against M spheres (center xyz, radius w, after enlargeRadiusToMinWidth).
+         Up to two hits per lane are passed to epilog; returns the OR of the epilog results. pre is unused. */
+      template<typename Epilog>
+      static __forceinline bool intersect(const vbool<M>& valid_i,
+                                          RayK<K>& ray, size_t k,
+                                          IntersectContext* context,
+                                          const Points* geom,
+                                          const Precalculations& pre,
+                                          const Vec4vf<M>& v0i,
+                                          const Epilog& epilog)
+      {
+        const Vec3vf<M> org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+        const Vec3vf<M> dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+        const vfloat<M> inv_dir2 = rcp(dot(dir, dir));
+
+        const Vec4vf<M> adjusted = enlargeRadiusToMinWidth(context,geom,org,v0i);
+        const Vec3vf<M> sphere_pos = adjusted.xyz();
+        const vfloat<M> sphere_rad = adjusted.w;
+
+        /* split origin->center into its projection onto the ray and the perpendicular rest */
+        const Vec3vf<M> to_center = sphere_pos - org;
+        const vfloat<M> t_closest = dot(to_center, dir) * inv_dir2;
+        const Vec3vf<M> offset    = to_center - t_closest * dir;
+        const vfloat<M> dist2     = dot(offset, offset);
+        const vfloat<M> rad2      = sphere_rad * sphere_rad;
+        const vbool<M> active     = valid_i & (dist2 <= rad2);
+        if (unlikely(none(active)))
+          return false;
+
+        /* distance in t from the closest point on the ray to either surface crossing */
+        const vfloat<M> half_span  = sqrt((rad2 - dist2) * inv_dir2);
+        const vfloat<M> t_near_hit = t_closest - half_span;
+        const vfloat<M> t_far_hit  = t_closest + half_span;
+
+        const vbool<M> near_ok = active & (ray.tnear()[k] <= t_near_hit) & (t_near_hit <= ray.tfar[k]);
+        const vbool<M> far_ok  = active & (ray.tnear()[k] <= t_far_hit ) & (t_far_hit  <= ray.tfar[k]);
+
+        /* stop if no lane has a hit inside [tnear, tfar] */
+        const vbool<M> any_ok = near_ok | far_ok;
+        if (unlikely(none(any_ok)))
+          return false;
+
+        /* first hit: the front hit where it is in range, otherwise the back hit;
+           the normal is hit point minus center, expressed as (+/-half_span)*dir - offset */
+        const vfloat<M> t_first  = select(near_ok, t_near_hit, t_far_hit);
+        const Vec3vf<M> Ng_first = select(near_ok, -half_span, half_span) * dir - offset;
+        SphereIntersectorHitM<M> hit(t_first, Ng_first);
+
+        /* run the epilog on the first hit before looking at the second one */
+        const bool first_accepted = epilog(any_ok, hit);
+
+        /* a second hit needs both hits in range; ray.tfar[k] is read again after the epilog call */
+        const vbool<M> second_ok = near_ok & far_ok & (t_far_hit <= ray.tfar[k]);
+        if (unlikely(none(second_ok)))
+          return first_accepted;
+
+        /* report the back hit as well */
+        const Vec3vf<M> Ng_second = half_span * dir - offset;
+        hit = SphereIntersectorHitM<M> (t_far_hit, Ng_second);
+        const bool second_accepted = epilog(second_ok, hit);
+
+        return first_accepted | second_accepted;
+      }
+    };
+  }  // namespace isa
+}  // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/spherei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/spherei_intersector.h
new file mode 100644
index 0000000000..1146847602
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/spherei_intersector.h
@@ -0,0 +1,156 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "intersector_epilog.h"
+#include "pointi.h"
+#include "sphere_intersector.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M, int Mx, bool filter>
+    struct SphereMiIntersector1
+    {
+      using Primitive       = PointMi<M>;
+      using Precalculations = CurvePrecalculations1;
+
+      /* Gathers the block's sphere data and runs SphereIntersector1 with an Intersect1EpilogM. */
+      static __forceinline void intersect(
+          const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> spheres;
+        sphere.gather(spheres, points);
+        const vbool<Mx> active = sphere.template valid<Mx>();
+        Intersect1EpilogM<M, Mx, filter> epilog(ray, context, sphere.geomID(), sphere.primID());
+        SphereIntersector1<Mx>::intersect(active, ray, context, points, pre, spheres, epilog);
+      }
+
+      /* Same setup as intersect(), but with an Occluded1EpilogM; returns the intersector's result. */
+      static __forceinline bool occluded(
+          const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> spheres;
+        sphere.gather(spheres, points);
+        const vbool<Mx> active = sphere.template valid<Mx>();
+        Occluded1EpilogM<M, Mx, filter> epilog(ray, context, sphere.geomID(), sphere.primID());
+        return SphereIntersector1<Mx>::intersect(active, ray, context, points, pre, spheres, epilog);
+      }
+
+      /* Point queries are passed through unchanged to PrimitivePointQuery1. */
+      static __forceinline bool pointQuery(
+          PointQuery* query, PointQueryContext* context, const Primitive& sphere)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, sphere);
+      }
+    };
+
+    template<int M, int Mx, bool filter>
+    struct SphereMiMBIntersector1
+    {
+      using Primitive       = PointMi<M>;
+      using Precalculations = CurvePrecalculations1;
+
+      /* Gathers the block's sphere data at ray.time() and runs SphereIntersector1 with an Intersect1EpilogM. */
+      static __forceinline void intersect(
+          const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> spheres;
+        sphere.gather(spheres, points, ray.time());
+        const vbool<Mx> active = sphere.template valid<Mx>();
+        Intersect1EpilogM<M, Mx, filter> epilog(ray, context, sphere.geomID(), sphere.primID());
+        SphereIntersector1<Mx>::intersect(active, ray, context, points, pre, spheres, epilog);
+      }
+
+      /* Same setup as intersect(), but with an Occluded1EpilogM; returns the intersector's result. */
+      static __forceinline bool occluded(
+          const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> spheres;
+        sphere.gather(spheres, points, ray.time());
+        const vbool<Mx> active = sphere.template valid<Mx>();
+        Occluded1EpilogM<M, Mx, filter> epilog(ray, context, sphere.geomID(), sphere.primID());
+        return SphereIntersector1<Mx>::intersect(active, ray, context, points, pre, spheres, epilog);
+      }
+
+      /* Point queries are passed through unchanged to PrimitivePointQuery1. */
+      static __forceinline bool pointQuery(
+          PointQuery* query, PointQueryContext* context, const Primitive& sphere)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, sphere);
+      }
+    };
+
+    template<int M, int Mx, int K, bool filter>
+    struct SphereMiIntersectorK
+    {
+      using Primitive       = PointMi<M>;
+      using Precalculations = CurvePrecalculationsK<K>;
+
+      /* Gathers the block's sphere data and runs SphereIntersectorK for index k with an Intersect1KEpilogM. */
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> spheres;
+        sphere.gather(spheres, points);
+        const vbool<Mx> active = sphere.template valid<Mx>();
+        Intersect1KEpilogM<M, Mx, K, filter> epilog(ray, k, context, sphere.geomID(), sphere.primID());
+        SphereIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, spheres, epilog);
+      }
+
+      /* Same setup as intersect(), but with an Occluded1KEpilogM; returns the intersector's result. */
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> spheres;
+        sphere.gather(spheres, points);
+        const vbool<Mx> active = sphere.template valid<Mx>();
+        Occluded1KEpilogM<M, Mx, K, filter> epilog(ray, k, context, sphere.geomID(), sphere.primID());
+        return SphereIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, spheres, epilog);
+      }
+    };
+
+    template<int M, int Mx, int K, bool filter>
+    struct SphereMiMBIntersectorK
+    {
+      using Primitive       = PointMi<M>;
+      using Precalculations = CurvePrecalculationsK<K>;
+
+      /* Gathers the block's sphere data at ray.time()[k] and runs SphereIntersectorK with an Intersect1KEpilogM. */
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> spheres;
+        sphere.gather(spheres, points, ray.time()[k]);
+        const vbool<Mx> active = sphere.template valid<Mx>();
+        Intersect1KEpilogM<M, Mx, K, filter> epilog(ray, k, context, sphere.geomID(), sphere.primID());
+        SphereIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, spheres, epilog);
+      }
+
+      /* Same setup as intersect(), but with an Occluded1KEpilogM; returns the intersector's result. */
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> spheres;
+        sphere.gather(spheres, points, ray.time()[k]);
+        const vbool<Mx> active = sphere.template valid<Mx>();
+        Occluded1KEpilogM<M, Mx, K, filter> epilog(ray, k, context, sphere.geomID(), sphere.primID());
+        return SphereIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, spheres, epilog);
+      }
+    };
+  }  // namespace isa
+}  // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1.h b/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1.h
new file mode 100644
index 0000000000..94ad46ad87
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1.h
@@ -0,0 +1,38 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../geometry/primitive.h"
+#include "../subdiv/subdivpatch1base.h"
+
+namespace embree
+{
+
+  struct __aligned(64) SubdivPatch1 : public SubdivPatch1Base
+  {
+    /* Primitive type descriptor; its member functions are only declared here. */
+    struct Type : public PrimitiveType
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+
+    static Type type;
+
+    /*! Constructor for cached subdiv patch: hands every argument, unchanged
+     *  and in the same order, to the SubdivPatch1Base constructor. */
+    SubdivPatch1(const unsigned int gID,
+                 const unsigned int pID,
+                 const unsigned int subPatch,
+                 const SubdivMesh* const mesh,
+                 const size_t time,
+                 const Vec2f uv[4],
+                 const float edge_level[4],
+                 const int subdiv[4],
+                 const int simd_width)
+      : SubdivPatch1Base(gID, pID, subPatch, mesh, time, uv, edge_level, subdiv, simd_width) {}
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1_intersector.h
new file mode 100644
index 0000000000..74ec1de258
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1_intersector.h
@@ -0,0 +1,237 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subdivpatch1.h"
+#include "grid_soa.h"
+#include "grid_soa_intersector1.h"
+#include "grid_soa_intersector_packet.h"
+#include "../common/ray.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename T>
+    class SubdivPatch1Precalculations : public T
+    {
+    public:
+      /* Passes the ray and the opaque pointer straight to the wrapped precalculation type T. */
+      __forceinline SubdivPatch1Precalculations(const Ray& ray, const void* ptr) : T(ray, ptr) {}
+    };
+
+    template<int K, typename T>
+    class SubdivPatch1PrecalculationsK : public T
+    {
+    public:
+      /* Passes the valid mask and the ray packet straight to the wrapped precalculation type T. */
+      __forceinline SubdivPatch1PrecalculationsK(const vbool<K>& valid, RayK<K>& ray) : T(valid, ray) {}
+    };
+
+    class SubdivPatch1Intersector1 // single-ray entry points for primitives stored as GridSOA
+    {
+    public:
+      typedef GridSOA Primitive;
+      typedef SubdivPatch1Precalculations<GridSOAIntersector1::Precalculations> Precalculations;
+      /*! Lazy-node path: writes prim->root(0) to lazy_node, records the grid in pre.grid, and always returns false. */
+      static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        lazy_node = prim->root(0);
+        pre.grid = (Primitive*)prim; // the C-style cast drops the const qualifier of prim
+        return false;
+      }
+
+      /*! Intersect a ray with the primitive: ty == 0 calls GridSOAIntersector1::intersect, any other ty takes the lazy-node path. */
+      template<int N, int Nx, bool robust>
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) 
+      {
+        if (likely(ty == 0)) GridSOAIntersector1::intersect(pre,ray,context,prim,lazy_node);
+        else                 processLazyNode(pre,context,prim,lazy_node);
+      }
+      /*! Overload with an extra ty0 argument, which is not used; forwards to the overload above. */
+      template<int N, int Nx, bool robust>
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+        intersect(This,pre,ray,context,prim,ty,tray,lazy_node);
+      }
+      
+      /*! Test if the ray is occluded by the primitive: ty == 0 returns GridSOAIntersector1::occluded, any other ty returns processLazyNode (false). */
+      template<int N, int Nx, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) return GridSOAIntersector1::occluded(pre,ray,context,prim,lazy_node);
+        else                 return processLazyNode(pre,context,prim,lazy_node);
+      }
+      /*! Overload with an extra ty0 argument, which is not used; forwards to the overload above. */
+      template<int N, int Nx, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+        return occluded(This,pre,ray,context,prim,ty,tray,lazy_node);
+      }
+      /*! Point query placeholder: not implemented; trips the assert when assertions are enabled and otherwise returns false. */
+      template<int N>
+        static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) 
+      {
+          // TODO: PointQuery implement
+          assert(false && "not implemented");
+          return false;
+      }
+      /*! Overload with an extra ty0 argument, which is not used; forwards to the placeholder above. */
+      template<int N>
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) {
+        return pointQuery(This,query,context,prim,ty,tquery,lazy_node);
+      }
+    };
+
+    class SubdivPatch1MBIntersector1
+    {
+    public:
+      typedef SubdivPatch1 Primitive;
+      typedef GridSOAMBIntersector1::Precalculations Precalculations;
+
+      /*! Lazy-node path: sets pre.itime via getTimeSegment(ray.time(), ...), writes grid->root(pre.itime) to lazy_node, records the grid in pre.grid; always returns false. */
+      static __forceinline bool processLazyNode(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim_i, size_t& lazy_node)
+      {
+        Primitive* prim = (Primitive*) prim_i;
+        GridSOA* grid = (GridSOA*) prim->root_ref.get();
+        pre.itime = getTimeSegment(ray.time(), float(grid->time_steps-1), pre.ftime);
+        lazy_node = grid->root(pre.itime);
+        pre.grid = grid;
+        return false;
+      }
+
+      /*! Intersect a ray with the primitive: ty == 0 calls GridSOAMBIntersector1::intersect, any other ty takes the lazy-node path. */
+      template<int N, int Nx, bool robust>
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) GridSOAMBIntersector1::intersect(pre,ray,context,prim,lazy_node);
+        else                 processLazyNode(pre,ray,context,prim,lazy_node);
+      }
+      /*! Overload with an extra ty0 argument, which is not used; forwards to the overload above. */
+      template<int N, int Nx, bool robust>
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+        intersect(This,pre,ray,context,prim,ty,tray,lazy_node);
+      }
+
+      /*! Test if the ray is occluded by the primitive: ty == 0 returns GridSOAMBIntersector1::occluded, any other ty returns processLazyNode (false). */
+      template<int N, int Nx, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) return GridSOAMBIntersector1::occluded(pre,ray,context,prim,lazy_node);
+        else                 return processLazyNode(pre,ray,context,prim,lazy_node);
+      }
+      /*! Overload with an extra ty0 argument, which is not used; forwards to the overload above. */
+      template<int N, int Nx, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+        return occluded(This,pre,ray,context,prim,ty,tray,lazy_node);
+      }
+      /*! Point query placeholder: not implemented; trips the assert when assertions are enabled and otherwise returns false. */
+      template<int N>
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node)
+      {
+        // TODO: PointQuery implement
+        assert(false && "not implemented");
+        return false;
+      }
+      /*! ty0 overload. Nx and robust occur in no function parameter, so they are defaulted to let N be deduced from tquery; explicit <N,Nx,robust> still compiles. */
+      template<int N, int Nx = N, bool robust = false>
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) {
+        return pointQuery(This,query,context,prim,ty,tquery,lazy_node);
+      }
+    };
+
+    template <int K>
+      struct SubdivPatch1IntersectorK // ray-packet (width K) entry points for primitives stored as GridSOA
+    {
+      typedef GridSOA Primitive;
+      typedef SubdivPatch1PrecalculationsK<K,typename GridSOAIntersectorK<K>::Precalculations> Precalculations;
+      /*! Lazy-node path: writes prim->root(0) to lazy_node, records the grid in pre.grid, and always returns false. */
+      static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        lazy_node = prim->root(0);
+        pre.grid = (Primitive*)prim;
+        return false;
+      }
+      /*! Packet intersect: ty == 0 calls GridSOAIntersectorK<K>::intersect with the valid mask, any other ty takes the lazy-node path. */
+      template<bool robust>        
+      static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) GridSOAIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node);
+        else                 processLazyNode(pre,context,prim,lazy_node);
+      }
+      /*! Packet occluded: same dispatch; on the lazy-node path the bool from processLazyNode (false) is converted to the vbool<K> return type. */
+      template<bool robust>        
+      static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) return GridSOAIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node);
+        else                 return processLazyNode(pre,context,prim,lazy_node);
+      }
+      /*! Intersect for index k of the packet: ty == 0 calls the k-indexed GridSOAIntersectorK<K>::intersect, any other ty takes the lazy-node path. */
+      template<int N, int Nx, bool robust>              
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) GridSOAIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node);
+        else                 processLazyNode(pre,context,prim,lazy_node);
+      }
+      /*! Occlusion test for index k of the packet: ty == 0 returns the k-indexed GridSOAIntersectorK<K>::occluded, any other ty returns processLazyNode (false). */
+      template<int N, int Nx, bool robust>              
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) return GridSOAIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node);
+        else                 return processLazyNode(pre,context,prim,lazy_node);
+      }
+    };
+
+    using SubdivPatch1Intersector4  = SubdivPatch1IntersectorK<4>;   // K = 4
+    using SubdivPatch1Intersector8  = SubdivPatch1IntersectorK<8>;   // K = 8
+    using SubdivPatch1Intersector16 = SubdivPatch1IntersectorK<16>;  // K = 16
+
+    template <int K>
+      struct SubdivPatch1MBIntersectorK // ray-packet (width K) entry points that dispatch to GridSOAMBIntersectorK<K>
+    {
+      typedef SubdivPatch1 Primitive;
+      //typedef GridSOAMBIntersectorK<K>::Precalculations Precalculations;
+      typedef SubdivPatch1PrecalculationsK<K,typename GridSOAMBIntersectorK<K>::Precalculations> Precalculations;
+      /*! Lazy-node path: fetches the GridSOA from prim->root_ref, writes grid->troot to lazy_node, records the grid in pre.grid; always returns false. */
+      static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim_i, size_t& lazy_node)
+      {
+        Primitive* prim = (Primitive*) prim_i;
+        GridSOA* grid = (GridSOA*) prim->root_ref.get();
+        lazy_node = grid->troot;
+        pre.grid = grid;
+        return false;
+      }
+      /*! Packet intersect: ty == 0 calls GridSOAMBIntersectorK<K>::intersect with the valid mask, any other ty takes the lazy-node path. */
+      template<bool robust>
+      static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) GridSOAMBIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node);
+        else                 processLazyNode(pre,context,prim,lazy_node);
+      }
+      /*! Packet occluded: same dispatch; on the lazy-node path the bool from processLazyNode (false) is converted to the vbool<K> return type. */
+      template<bool robust>
+      static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) return GridSOAMBIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node);
+        else                 return processLazyNode(pre,context,prim,lazy_node);
+      }
+      /*! Intersect for index k of the packet: ty == 0 calls the k-indexed GridSOAMBIntersectorK<K>::intersect, any other ty takes the lazy-node path. */
+      template<int N, int Nx, bool robust>      
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) GridSOAMBIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node);
+        else                 processLazyNode(pre,context,prim,lazy_node);
+      }
+      /*! Occlusion test for index k of the packet: ty == 0 returns the k-indexed GridSOAMBIntersectorK<K>::occluded, any other ty returns processLazyNode (false). */
+      template<int N, int Nx, bool robust>      
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) return GridSOAMBIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node);
+        else                 return processLazyNode(pre,context,prim,lazy_node);
+      }
+    };
+
+    using SubdivPatch1MBIntersector4  = SubdivPatch1MBIntersectorK<4>;   // K = 4
+    using SubdivPatch1MBIntersector8  = SubdivPatch1MBIntersectorK<8>;   // K = 8
+    using SubdivPatch1MBIntersector16 = SubdivPatch1MBIntersectorK<16>;  // K = 16
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid.h
new file mode 100644
index 0000000000..39fa6fb0f0
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subgrid.h
@@ -0,0 +1,517 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/scene_grid_mesh.h"
+#include "../bvh/bvh.h"
+
+namespace embree
+{
+    /* References a block of up to 3x3 vertices (up to four quads) within one grid of a grid mesh */
+      struct SubGrid
+      {
+        /* Virtual interface to query information about the quad type */
+        struct Type : public PrimitiveType
+        {
+          const char* name() const;
+          size_t sizeActive(const char* This) const;
+          size_t sizeTotal(const char* This) const;
+          size_t getBytes(const char* This) const;
+        };
+        static Type type;
+
+      public:
+
+        /* primitive supports multiple time segments */
+        static const bool singleTimeSegment = false;
+
+        /* Returns maximum number of stored quads */
+        static __forceinline size_t max_size() { return 1; }
+
+        /* Returns required number of primitive blocks for N primitives */
+        static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+      public:
+
+        /* Default constructor */
+        __forceinline SubGrid() {  }
+
+        /* Construction from vertices and IDs */
+        __forceinline SubGrid(const unsigned int x,
+                              const unsigned int y,
+                              const unsigned int geomID,
+                              const unsigned int primID)
+          : _x(x), _y(y), _geomID(geomID), _primID(primID)
+        {
+        }
+
+        __forceinline bool invalid3x3X() const { return (unsigned int)_x & (1<<15); }
+        __forceinline bool invalid3x3Y() const { return (unsigned int)_y & (1<<15); }
+
+        /* Gather the quads */
+        __forceinline void gather(Vec3vf4& p0,
+                                  Vec3vf4& p1,
+                                  Vec3vf4& p2,
+                                  Vec3vf4& p3,
+                                  const GridMesh* const mesh,
+                                  const GridMesh::Grid &g) const
+        {
+          /* first quad always valid */
+          const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset;
+          const size_t vtxID01 = vtxID00 + 1;
+          const vfloat4 vtx00  = vfloat4::loadu(mesh->vertexPtr(vtxID00));
+          const vfloat4 vtx01  = vfloat4::loadu(mesh->vertexPtr(vtxID01));
+          const size_t vtxID10 = vtxID00 + g.lineVtxOffset;
+          const size_t vtxID11 = vtxID01 + g.lineVtxOffset;
+          const vfloat4 vtx10  = vfloat4::loadu(mesh->vertexPtr(vtxID10));
+          const vfloat4 vtx11  = vfloat4::loadu(mesh->vertexPtr(vtxID11));
+
+          /* deltaX => vtx02, vtx12 */
+          const size_t deltaX  = invalid3x3X() ? 0 : 1;
+          const size_t vtxID02 = vtxID01 + deltaX;       
+          const vfloat4 vtx02  = vfloat4::loadu(mesh->vertexPtr(vtxID02));
+          const size_t vtxID12 = vtxID11 + deltaX;       
+          const vfloat4 vtx12  = vfloat4::loadu(mesh->vertexPtr(vtxID12));
+
+          /* deltaY => vtx20, vtx21 */
+          const size_t deltaY  = invalid3x3Y() ? 0 : g.lineVtxOffset;
+          const size_t vtxID20 = vtxID10 + deltaY;
+          const size_t vtxID21 = vtxID11 + deltaY;
+          const vfloat4 vtx20  = vfloat4::loadu(mesh->vertexPtr(vtxID20));
+          const vfloat4 vtx21  = vfloat4::loadu(mesh->vertexPtr(vtxID21));
+
+          /* deltaX/deltaY => vtx22 */
+          const size_t vtxID22 = vtxID11 + deltaX + deltaY;       
+          const vfloat4 vtx22  = vfloat4::loadu(mesh->vertexPtr(vtxID22));
+
+          transpose(vtx00,vtx01,vtx11,vtx10,p0.x,p0.y,p0.z);
+          transpose(vtx01,vtx02,vtx12,vtx11,p1.x,p1.y,p1.z);
+          transpose(vtx11,vtx12,vtx22,vtx21,p2.x,p2.y,p2.z);
+          transpose(vtx10,vtx11,vtx21,vtx20,p3.x,p3.y,p3.z);                    
+        }
+
+        template<typename T>
+        __forceinline vfloat4 getVertexMB(const GridMesh* const mesh, const size_t offset, const size_t itime, const float ftime) const
+        {
+          const T v0 = T::loadu(mesh->vertexPtr(offset,itime+0));
+          const T v1 = T::loadu(mesh->vertexPtr(offset,itime+1));
+          return lerp(v0,v1,ftime);
+        }
+
+        /* Gather the quads */
+        __forceinline void gatherMB(Vec3vf4& p0,
+                                    Vec3vf4& p1,
+                                    Vec3vf4& p2,
+                                    Vec3vf4& p3,
+                                    const GridMesh* const mesh,
+                                    const GridMesh::Grid &g,
+                                    const size_t itime, 
+                                    const float ftime) const
+        {
+          /* first quad always valid */
+          const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset;
+          const size_t vtxID01 = vtxID00 + 1;
+          const vfloat4 vtx00  = getVertexMB<vfloat4>(mesh,vtxID00,itime,ftime);
+          const vfloat4 vtx01  = getVertexMB<vfloat4>(mesh,vtxID01,itime,ftime);
+          const size_t vtxID10 = vtxID00 + g.lineVtxOffset;
+          const size_t vtxID11 = vtxID01 + g.lineVtxOffset;
+          const vfloat4 vtx10  = getVertexMB<vfloat4>(mesh,vtxID10,itime,ftime);
+          const vfloat4 vtx11  = getVertexMB<vfloat4>(mesh,vtxID11,itime,ftime);
+
+          /* deltaX => vtx02, vtx12 */
+          const size_t deltaX  = invalid3x3X() ? 0 : 1;
+          const size_t vtxID02 = vtxID01 + deltaX;       
+          const vfloat4 vtx02  = getVertexMB<vfloat4>(mesh,vtxID02,itime,ftime);
+          const size_t vtxID12 = vtxID11 + deltaX;       
+          const vfloat4 vtx12  = getVertexMB<vfloat4>(mesh,vtxID12,itime,ftime);
+
+          /* deltaY => vtx20, vtx21 */
+          const size_t deltaY  = invalid3x3Y() ? 0 : g.lineVtxOffset;
+          const size_t vtxID20 = vtxID10 + deltaY;
+          const size_t vtxID21 = vtxID11 + deltaY;
+          const vfloat4 vtx20  = getVertexMB<vfloat4>(mesh,vtxID20,itime,ftime);
+          const vfloat4 vtx21  = getVertexMB<vfloat4>(mesh,vtxID21,itime,ftime);
+
+          /* deltaX/deltaY => vtx22 */
+          const size_t vtxID22 = vtxID11 + deltaX + deltaY;       
+          const vfloat4 vtx22  = getVertexMB<vfloat4>(mesh,vtxID22,itime,ftime);
+
+          transpose(vtx00,vtx01,vtx11,vtx10,p0.x,p0.y,p0.z);
+          transpose(vtx01,vtx02,vtx12,vtx11,p1.x,p1.y,p1.z);
+          transpose(vtx11,vtx12,vtx22,vtx21,p2.x,p2.y,p2.z);
+          transpose(vtx10,vtx11,vtx21,vtx20,p3.x,p3.y,p3.z);                    
+        }
+
+
+
+        /* Gather the quads */
+        __forceinline void gather(Vec3vf4& p0,
+                                  Vec3vf4& p1,
+                                  Vec3vf4& p2,
+                                  Vec3vf4& p3,
+                                  const Scene *const scene) const
+        {
+          const GridMesh* const mesh = scene->get<GridMesh>(geomID());
+          const GridMesh::Grid &g    = mesh->grid(primID());
+          gather(p0,p1,p2,p3,mesh,g);
+        }
+
+        /* Gather the quads in the motion blur case */
+        __forceinline void gatherMB(Vec3vf4& p0,
+                                    Vec3vf4& p1,
+                                    Vec3vf4& p2,
+                                    Vec3vf4& p3,
+                                    const Scene *const scene,
+                                    const size_t itime, 
+                                    const float ftime) const
+        {
+          const GridMesh* const mesh = scene->get<GridMesh>(geomID());
+          const GridMesh::Grid &g    = mesh->grid(primID());
+          gatherMB(p0,p1,p2,p3,mesh,g,itime,ftime);
+        }
+
+        /* Gather the quads */
+        __forceinline void gather(Vec3fa vtx[16], const Scene *const scene) const
+        {
+          const GridMesh* mesh     = scene->get<GridMesh>(geomID());
+          const GridMesh::Grid &g  = mesh->grid(primID());
+
+          /* first quad always valid */
+          const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset;
+          const size_t vtxID01 = vtxID00 + 1;
+          const Vec3fa vtx00  = Vec3fa::loadu(mesh->vertexPtr(vtxID00));
+          const Vec3fa vtx01  = Vec3fa::loadu(mesh->vertexPtr(vtxID01));
+          const size_t vtxID10 = vtxID00 + g.lineVtxOffset;
+          const size_t vtxID11 = vtxID01 + g.lineVtxOffset;
+          const Vec3fa vtx10  = Vec3fa::loadu(mesh->vertexPtr(vtxID10));
+          const Vec3fa vtx11  = Vec3fa::loadu(mesh->vertexPtr(vtxID11));
+
+          /* deltaX => vtx02, vtx12 */
+          const size_t deltaX  = invalid3x3X() ? 0 : 1;
+          const size_t vtxID02 = vtxID01 + deltaX;       
+          const Vec3fa vtx02  = Vec3fa::loadu(mesh->vertexPtr(vtxID02));
+          const size_t vtxID12 = vtxID11 + deltaX;       
+          const Vec3fa vtx12  = Vec3fa::loadu(mesh->vertexPtr(vtxID12));
+
+          /* deltaY => vtx20, vtx21 */
+          const size_t deltaY  = invalid3x3Y() ? 0 : g.lineVtxOffset;
+          const size_t vtxID20 = vtxID10 + deltaY;
+          const size_t vtxID21 = vtxID11 + deltaY;
+          const Vec3fa vtx20  = Vec3fa::loadu(mesh->vertexPtr(vtxID20));
+          const Vec3fa vtx21  = Vec3fa::loadu(mesh->vertexPtr(vtxID21));
+
+          /* deltaX/deltaY => vtx22 */
+          const size_t vtxID22 = vtxID11 + deltaX + deltaY;       
+          const Vec3fa vtx22  = Vec3fa::loadu(mesh->vertexPtr(vtxID22));
+
+          vtx[ 0] = vtx00; vtx[ 1] = vtx01; vtx[ 2] = vtx11; vtx[ 3] = vtx10;
+          vtx[ 4] = vtx01; vtx[ 5] = vtx02; vtx[ 6] = vtx12; vtx[ 7] = vtx11;
+          vtx[ 8] = vtx10; vtx[ 9] = vtx11; vtx[10] = vtx21; vtx[11] = vtx20;
+          vtx[12] = vtx11; vtx[13] = vtx12; vtx[14] = vtx22; vtx[15] = vtx21;
+        }
+
+        /* Gather the 3x3 vertex block of this subgrid through getVertexMB(itime,ftime) as four quads */
+        __forceinline void gatherMB(vfloat4 vtx[16], const Scene *const scene, const size_t itime, const float ftime) const
+        {
+          const GridMesh* mesh    = scene->get<GridMesh>(geomID());
+          const GridMesh::Grid &g = mesh->grid(primID());
+
+          /* step towards the third column/row; 0 (vertex reused) when invalid3x3X()/invalid3x3Y() is set */
+          const size_t stepX = invalid3x3X() ? 0 : 1;
+          const size_t stepY = invalid3x3Y() ? 0 : g.lineVtxOffset;
+
+          /* vertex indices of rows 0 and 1 (the first quad is always valid) */
+          const size_t id00 = g.startVtxID + x() + y() * g.lineVtxOffset;
+          const size_t id01 = id00 + 1;
+          const size_t id02 = id01 + stepX;
+          const size_t id10 = id00 + g.lineVtxOffset;
+          const size_t id11 = id01 + g.lineVtxOffset;
+          const size_t id12 = id11 + stepX;
+
+          const size_t id20 = id10 + stepY;
+          const size_t id21 = id11 + stepY;
+          const size_t id22 = id11 + stepX + stepY;
+
+          /* fetch the nine vertices */
+          const vfloat4 p00 = getVertexMB<vfloat4>(mesh,id00,itime,ftime);
+          const vfloat4 p01 = getVertexMB<vfloat4>(mesh,id01,itime,ftime);
+          const vfloat4 p02 = getVertexMB<vfloat4>(mesh,id02,itime,ftime);
+          const vfloat4 p10 = getVertexMB<vfloat4>(mesh,id10,itime,ftime);
+          const vfloat4 p11 = getVertexMB<vfloat4>(mesh,id11,itime,ftime);
+          const vfloat4 p12 = getVertexMB<vfloat4>(mesh,id12,itime,ftime);
+          const vfloat4 p20 = getVertexMB<vfloat4>(mesh,id20,itime,ftime);
+          const vfloat4 p21 = getVertexMB<vfloat4>(mesh,id21,itime,ftime);
+          const vfloat4 p22 = getVertexMB<vfloat4>(mesh,id22,itime,ftime);
+
+          /* store the quads with corners in (v00,v01,v11,v10) order */
+          vtx[ 0] = p00; vtx[ 1] = p01; vtx[ 2] = p11; vtx[ 3] = p10;
+          vtx[ 4] = p01; vtx[ 5] = p02; vtx[ 6] = p12; vtx[ 7] = p11;
+          vtx[ 8] = p10; vtx[ 9] = p11; vtx[10] = p21; vtx[11] = p20;
+          vtx[12] = p11; vtx[13] = p12; vtx[14] = p22; vtx[15] = p21;
+        }
+          
+
+        /* Bounds of the subgrid: placeholder that invokes FATAL("not implemented yet") */
+        __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const
+        {
+          BBox3fa box = empty; // value returned should FATAL return (presumably it does not -- confirm)
+          FATAL("not implemented yet");
+          return box;
+        }
+
+        /* Linear bounds between time steps itime and itime+1, built from two bounds() calls */
+        __forceinline LBBox3fa linearBounds(const Scene* const scene, const size_t itime) {
+          const BBox3fa first = bounds(scene,itime+0), second = bounds(scene,itime+1);
+          return LBBox3fa(first,second);
+        }
+
+        /* Placeholder overload taking a time-step range: invokes FATAL, the arguments are unused */
+        __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) {
+          LBBox3fa result = empty;
+          FATAL("not implemented yet");
+          return result;
+        }
+
+        /* Placeholder overload taking a BBox1f time range: invokes FATAL, the arguments are unused */
+        __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) {
+          LBBox3fa result = empty;
+          FATAL("not implemented yet");
+          return result;
+        }
+
+        /* Debug print of the subgrid; every field is written as "name = value" */
+        friend embree_ostream operator<<(embree_ostream cout, const SubGrid& sg) {
+          return cout << "SubGrid ( x = " << sg.x() << ", y = " << sg.y() << ", geomID = " << sg.geomID() << ", primID = " << sg.primID() << " )";
+        }
+        /* Accessors; x() and y() mask the stored unsigned short down to its low 15 bits */
+        __forceinline unsigned int geomID() const { return _geomID; }
+        __forceinline unsigned int primID() const { return _primID; }
+        __forceinline unsigned int x() const { return static_cast<unsigned int>(_x) & 0x7fff; }
+        __forceinline unsigned int y() const { return static_cast<unsigned int>(_y) & 0x7fff; }
+
+      private:
+        unsigned short _x;       // x() reads the low 15 bits; use of the top bit is not visible here
+        unsigned short _y;       // y() reads the low 15 bits; use of the top bit is not visible here
+        unsigned int _geomID;    // geometry ID of mesh
+        unsigned int _primID;    // primitive ID of primitive inside mesh
+      };
+      /* Plain record holding the (x,y) position and the primID of one subgrid */
+      struct SubGridID {
+        unsigned short x;
+        unsigned short y;
+        unsigned int primID;
+        __forceinline SubGridID() {} // members stay uninitialized
+
+        __forceinline SubGridID(const unsigned int x, const unsigned int y, const unsigned int primID) :
+        x((unsigned short)x), y((unsigned short)y), primID(primID) {} // x,y are narrowed to unsigned short
+      };
+
+      /* Leaf of up to N subgrids of one mesh, stored next to a QuantizedBaseNode of their bounds */
+      template<int N>
+      struct SubGridQBVHN
+      {
+        /* Virtual interface to query information about the subgrid leaf type */
+        struct Type : public PrimitiveType
+        {
+          const char* name() const;
+          size_t sizeActive(const char* This) const;
+          size_t sizeTotal(const char* This) const;
+          size_t getBytes(const char* This) const;
+        };
+        static Type type;
+
+      public:
+        /* Number of used slots: index of the first slot whose primID is (unsigned int)-1, or N */
+        __forceinline size_t size() const
+        {
+          for (size_t i=0;i<N;i++)
+            if (primID(i) == (unsigned int)-1) return i;
+          return N;
+        }
+
+        /* Mark every slot as unused (primID = (unsigned int)-1) and clear the quantized node */
+        __forceinline void clear() {
+          for (size_t i=0;i<N;i++)
+            subgridIDs[i] = SubGridID(0,0,(unsigned int)-1);
+          qnode.clear();
+        }
+
+        /* Default constructor: does not call clear(), so subgridIDs and _geomID stay uninitialized */
+        __forceinline SubGridQBVHN() {  }
+
+        /* Construction from positions, primIDs and bounds; only the first 'items' (<= N) entries are read */
+        __forceinline SubGridQBVHN(const unsigned int x[N],
+                                   const unsigned int y[N],
+                                   const unsigned int primID[N],
+                                   const BBox3fa * const subGridBounds,
+                                   const unsigned int geomID,
+                                   const unsigned int items)
+        {
+          assert(items <= N); // subgridIDs holds exactly N slots
+          clear();
+          _geomID = geomID;
+
+          /* collect the bounds in a temporary AABBNode that qnode is then initialized from */
+          __aligned(64) typename BVHN<N>::AABBNode node;
+          node.clear();
+          for (size_t i=0;i<items;i++)
+          {
+            subgridIDs[i] = SubGridID(x[i],y[i],primID[i]);
+            node.setBounds(i,subGridBounds[i]);
+          }
+          qnode.init_dim(node);
+        }
+
+        __forceinline unsigned int geomID() const { return _geomID; }
+        __forceinline unsigned int primID(const size_t i) const { assert(i < N); return subgridIDs[i].primID; }
+        __forceinline unsigned int x(const size_t i) const { assert(i < N); return subgridIDs[i].x; }
+        __forceinline unsigned int y(const size_t i) const { assert(i < N); return subgridIDs[i].y; }
+
+        /* SubGrid descriptor for slot i, which must be in use */
+        __forceinline SubGrid subgrid(const size_t i) const {
+          assert(i < N);
+          assert(primID(i) != (unsigned int)-1);
+          return SubGrid(x(i),y(i),geomID(),primID(i));
+        }
+
+      public:
+        SubGridID subgridIDs[N];                    // per-slot position and primID
+        typename BVHN<N>::QuantizedBaseNode qnode;  // initialized from the slot bounds in the constructor
+        unsigned int _geomID;    // geometry ID of mesh
+
+        friend embree_ostream operator<<(embree_ostream cout, const SubGridQBVHN& sg) {
+          cout << "SubGridQBVHN " << embree_endl;
+          for (size_t i=0;i<N;i++)
+            cout << i << " ( x = " << sg.subgridIDs[i].x << ", y = " << sg.subgridIDs[i].y << ", primID = " << sg.subgridIDs[i].primID << " )" << embree_endl;
+          cout << "geomID " << sg._geomID << embree_endl;
+          cout << "lowerX " << sg.qnode.dequantizeLowerX() << embree_endl;
+          cout << "upperX " << sg.qnode.dequantizeUpperX() << embree_endl;
+          cout << "lowerY " << sg.qnode.dequantizeLowerY() << embree_endl;
+          cout << "upperY " << sg.qnode.dequantizeUpperY() << embree_endl;
+          cout << "lowerZ " << sg.qnode.dequantizeLowerZ() << embree_endl;
+          cout << "upperZ " << sg.qnode.dequantizeUpperZ() << embree_endl;
+          return cout;
+        }
+      };
+
+      template<int N>
+        typename SubGridQBVHN<N>::Type SubGridQBVHN<N>::type; // out-of-class definition of the static 'type' member
+
+      typedef SubGridQBVHN<4> SubGridQBVH4; // instantiation with N = 4
+      typedef SubGridQBVHN<8> SubGridQBVH8; // instantiation with N = 8
+
+
+      /* Leaf of up to N subgrids of one mesh with two quantized bound sets (qnode.node0/node1) and a time mapping */
+      template<int N>
+      struct SubGridMBQBVHN
+      {
+        /* Virtual interface to query information about the subgrid leaf type */
+        struct Type : public PrimitiveType
+        {
+          const char* name() const;
+          size_t sizeActive(const char* This) const;
+          size_t sizeTotal(const char* This) const;
+          size_t getBytes(const char* This) const;
+        };
+        static Type type;
+
+      public:
+        /* Number of used slots: index of the first slot whose primID is (unsigned int)-1, or N */
+        __forceinline size_t size() const
+        {
+          for (size_t i=0;i<N;i++)
+            if (primID(i) == (unsigned int)-1) return i;
+          return N;
+        }
+
+        /* Mark every slot as unused (primID = (unsigned int)-1) and clear the quantized node */
+        __forceinline void clear() {
+          for (size_t i=0;i<N;i++)
+            subgridIDs[i] = SubGridID(0,0,(unsigned int)-1);
+          qnode.clear();
+        }
+
+        /* Default constructor: does not call clear(); subgridIDs, the time mapping and _geomID stay uninitialized */
+        __forceinline SubGridMBQBVHN() {  }
+
+        /* Construction from positions, primIDs and two bound sets; only the first 'items' (<= N) entries are read */
+        __forceinline SubGridMBQBVHN(const unsigned int x[N],
+                                     const unsigned int y[N],
+                                     const unsigned int primID[N],
+                                     const BBox3fa * const subGridBounds0,
+                                     const BBox3fa * const subGridBounds1,
+                                     const unsigned int geomID,
+                                     const float toffset,
+                                     const float tscale,
+                                     const unsigned int items)
+        {
+          assert(items <= N); // subgridIDs holds exactly N slots
+          clear();
+          _geomID = geomID;
+          time_offset = toffset;
+          time_scale  = tscale;
+
+          __aligned(64) typename BVHN<N>::AABBNode node0,node1; // temporaries that qnode.node0/node1 are initialized from
+          node0.clear();
+          node1.clear();
+          for (size_t i=0;i<items;i++)
+          {
+            subgridIDs[i] = SubGridID(x[i],y[i],primID[i]);
+            node0.setBounds(i,subGridBounds0[i]);
+            node1.setBounds(i,subGridBounds1[i]);
+          }
+          qnode.node0.init_dim(node0);
+          qnode.node1.init_dim(node1);
+        }
+
+        __forceinline unsigned int geomID() const { return _geomID; }
+        __forceinline unsigned int primID(const size_t i) const { assert(i < N); return subgridIDs[i].primID; }
+        __forceinline unsigned int x(const size_t i) const { assert(i < N); return subgridIDs[i].x; }
+        __forceinline unsigned int y(const size_t i) const { assert(i < N); return subgridIDs[i].y; }
+
+        /* SubGrid descriptor for slot i, which must be in use */
+        __forceinline SubGrid subgrid(const size_t i) const {
+          assert(i < N);
+          assert(primID(i) != (unsigned int)-1);
+          return SubGrid(x(i),y(i),geomID(),primID(i));
+        }
+
+        /* Map a time t to time_scale * (t - time_offset); scalar and K-wide variants */
+        __forceinline float adjustTime(const float t) const { return time_scale * (t-time_offset); }
+
+        template<int K>
+        __forceinline vfloat<K> adjustTime(const vfloat<K> &t) const { return time_scale * (t-time_offset); }
+
+      public:
+        SubGridID subgridIDs[N];                      // per-slot position and primID
+        typename BVHN<N>::QuantizedBaseNodeMB qnode;  // node0/node1 are initialized from subGridBounds0/1
+        float time_offset;
+        float time_scale;
+        unsigned int _geomID;    // geometry ID of mesh
+
+        friend embree_ostream operator<<(embree_ostream cout, const SubGridMBQBVHN& sg) {
+          cout << "SubGridMBQBVHN " << embree_endl;
+          for (size_t i=0;i<N;i++)
+            cout << i << " ( x = " << sg.subgridIDs[i].x << ", y = " << sg.subgridIDs[i].y << ", primID = " << sg.subgridIDs[i].primID << " )" << embree_endl;
+          cout << "geomID      " << sg._geomID << embree_endl;
+          cout << "time_offset " << sg.time_offset << embree_endl;
+          cout << "time_scale  " << sg.time_scale << embree_endl;
+          cout << "lowerX " << sg.qnode.node0.dequantizeLowerX() << embree_endl; // first six lines: qnode.node0
+          cout << "upperX " << sg.qnode.node0.dequantizeUpperX() << embree_endl;
+          cout << "lowerY " << sg.qnode.node0.dequantizeLowerY() << embree_endl;
+          cout << "upperY " << sg.qnode.node0.dequantizeUpperY() << embree_endl;
+          cout << "lowerZ " << sg.qnode.node0.dequantizeLowerZ() << embree_endl;
+          cout << "upperZ " << sg.qnode.node0.dequantizeUpperZ() << embree_endl;
+          cout << "lowerX " << sg.qnode.node1.dequantizeLowerX() << embree_endl; // next six lines: qnode.node1
+          cout << "upperX " << sg.qnode.node1.dequantizeUpperX() << embree_endl;
+          cout << "lowerY " << sg.qnode.node1.dequantizeLowerY() << embree_endl;
+          cout << "upperY " << sg.qnode.node1.dequantizeUpperY() << embree_endl;
+          cout << "lowerZ " << sg.qnode.node1.dequantizeLowerZ() << embree_endl;
+          cout << "upperZ " << sg.qnode.node1.dequantizeUpperZ() << embree_endl;
+          return cout;
+        }
+      };
+
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector.h
new file mode 100644
index 0000000000..045eee4329
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector.h
@@ -0,0 +1,518 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subgrid.h"
+#include "subgrid_intersector_moeller.h"
+#include "subgrid_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+
+    // =======================================================================================
+    // =================================== SubGridIntersectors ===============================
+    // =======================================================================================
+
+
+    /* Intersector for SubGridQBVHN<N> leaves on Ray/RayHit; quads go through SubGridQuadMIntersector1MoellerTrumbore */
+    template<int N, bool filter>
+    struct SubGridIntersector1Moeller
+    {
+      typedef SubGridQBVHN<N> Primitive;
+      typedef SubGridQuadMIntersector1MoellerTrumbore<4,filter> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); // corner vertices of the subgrid's quads
+        pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid);
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); // corner vertices of the subgrid's quads
+        return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid);
+      }
+
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid)
+      {
+        STAT3(point_query.trav_prims,1,1,1);
+        AccelSet* accel = (AccelSet*)context->scene->get(subgrid.geomID());
+        assert(accel);
+        context->geomID = subgrid.geomID(); // the query is forwarded with the subgrid's IDs in the context
+        context->primID = subgrid.primID();
+        return accel->pointQuery(query, context);
+      }
+
+      /* Leaf-level entry points: test each primitive's quantized node first, then visit only the subgrids whose box was hit */
+      template<int Nx, bool robust>
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<Nx> dist;
+          size_t mask = isec1.intersect(&prim[i].qnode,tray,dist);
+#if defined(__AVX__)
+          STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1);
+#endif
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask);
+            assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+            if (unlikely(dist[ID] > ray.tfar)) continue; // box lies beyond the current ray.tfar
+            intersect(pre,ray,context,prim[i].subgrid(ID));
+          }
+        }
+      }
+
+      template<int Nx, bool robust>
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<Nx> dist;
+          size_t mask = isec1.intersect(&prim[i].qnode,tray,dist);
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask);
+            assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+            if (occluded(pre,ray,context,prim[i].subgrid(ID)))
+              return true; // first occluding subgrid ends the search
+          }
+        }
+        return false;
+      }
+
+      /* Returns true if any visited subgrid's pointQuery returned true; boxes are pre-filtered by the sphere or AABB test */
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node)
+      {
+        bool changed = false;
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<N> dist;
+          size_t mask;
+          if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) {
+            mask = BVHNQuantizedBaseNodePointQuerySphere1<N>::pointQuery(&prim[i].qnode,tquery,dist);
+          } else {
+            mask = BVHNQuantizedBaseNodePointQueryAABB1<N>::pointQuery(&prim[i].qnode,tquery,dist);
+          }
+#if defined(__AVX__)
+          STAT3(point_query.trav_hit_boxes[popcnt(mask)],1,1,1); // same statistic SubGridIntersector1Pluecker records
+#endif
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask);
+            assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+            changed |= pointQuery(query, context, prim[i].subgrid(ID));
+          }
+        }
+        return changed;
+      }
+    };
+
+    /* Intersector for SubGridQBVHN<N> leaves on Ray/RayHit; quads go through SubGridQuadMIntersector1Pluecker */
+    template<int N, bool filter>
+    struct SubGridIntersector1Pluecker
+    {
+      typedef SubGridQBVHN<N> Primitive;
+      typedef SubGridQuadMIntersector1Pluecker<4,filter> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); // corner vertices of the subgrid's quads
+        pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid);
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); // corner vertices of the subgrid's quads
+        return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid);
+      }
+
+      /* Forward the query to the geometry (as an AccelSet) with the subgrid's geomID/primID written into the context */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid)
+      {
+        STAT3(point_query.trav_prims,1,1,1);
+        AccelSet* accel = (AccelSet*)context->scene->get(subgrid.geomID());
+        assert(accel); // same check SubGridIntersector1Moeller::pointQuery performs before dereferencing
+        context->geomID = subgrid.geomID();
+        context->primID = subgrid.primID();
+        return accel->pointQuery(query, context);
+      }
+
+      /* Leaf-level intersect: test each primitive's quantized node first, then visit only the subgrids whose box was hit */
+      template<int Nx, bool robust>
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<Nx> dist;
+          size_t mask = isec1.intersect(&prim[i].qnode,tray,dist);
+#if defined(__AVX__)
+          STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1);
+#endif
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask);
+            assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+            if (unlikely(dist[ID] > ray.tfar)) continue; // box lies beyond the current ray.tfar
+            intersect(pre,ray,context,prim[i].subgrid(ID));
+          }
+        }
+      }
+
+      /* Leaf-level occluded: returns true as soon as one visited subgrid reports occlusion */
+      template<int Nx, bool robust>
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<Nx> dist;
+          size_t mask = isec1.intersect(&prim[i].qnode,tray,dist);
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask);
+            assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+            if (occluded(pre,ray,context,prim[i].subgrid(ID)))
+              return true;
+          }
+        }
+        return false;
+      }
+
+      /* Returns true if any visited subgrid's pointQuery returned true; boxes are pre-filtered by the sphere or AABB test */
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node)
+      {
+        bool changed = false;
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<N> dist;
+          size_t mask;
+          if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) {
+            mask = BVHNQuantizedBaseNodePointQuerySphere1<N>::pointQuery(&prim[i].qnode,tquery,dist);
+          } else {
+            mask = BVHNQuantizedBaseNodePointQueryAABB1<N>::pointQuery(&prim[i].qnode,tquery,dist);
+          }
+#if defined(__AVX__)
+          STAT3(point_query.trav_hit_boxes[popcnt(mask)],1,1,1);
+#endif
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask);
+            assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+            changed |= pointQuery(query, context, prim[i].subgrid(ID));
+          }
+        }
+        return changed;
+      }
+    };
+
+    template<int N, int K, bool filter>
+    struct SubGridIntersectorKMoeller
+    { // Intersector for SubGridQBVHN<N> leaves on RayK<K>/RayHitK<K>: overloads for all lanes of a mask and for a single lane k
+      typedef SubGridQBVHN<N> Primitive;
+      typedef SubGridQuadMIntersectorKMoellerTrumbore<4,K,filter> Precalculations;
+
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      { // lanes in valid_i against the four quads of one subgrid
+        Vec3fa vtx[16]; // 4 quads x 4 corners, filled by gather() below
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        subgrid.gather(vtx,context->scene);
+        for (unsigned int i=0; i<4; i++)
+        {
+          const Vec3vf<K> p0 = vtx[i*4+0];
+          const Vec3vf<K> p1 = vtx[i*4+1];
+          const Vec3vf<K> p2 = vtx[i*4+2];
+          const Vec3vf<K> p3 = vtx[i*4+3];
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          pre.intersectK(valid_i,ray,p0,p1,p2,p3,g,subgrid,i,IntersectKEpilogM<4,K,filter>(ray,context,subgrid.geomID(),subgrid.primID(),i));
+        }
+      }
+
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      { // returns the complement of valid0 once the quads have been tested
+        vbool<K> valid0 = valid_i; // handed to OccludedKEpilogM, which presumably removes occluded lanes -- confirm
+        Vec3fa vtx[16];
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        subgrid.gather(vtx,context->scene);
+        for (unsigned int i=0; i<4; i++)
+        {
+          const Vec3vf<K> p0 = vtx[i*4+0];
+          const Vec3vf<K> p1 = vtx[i*4+1];
+          const Vec3vf<K> p2 = vtx[i*4+2];
+          const Vec3vf<K> p3 = vtx[i*4+3];
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i)))
+            break; // remaining quads are skipped when intersectK returns true
+        }
+        return !valid0;
+      }
+      
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      { // single lane k of the packet against one subgrid
+        STAT3(normal.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+      }
+
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      { // single lane k of the packet against one subgrid
+        STAT3(shadow.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+      }
+
+        template<bool robust>
+          static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        { // leaf level, all lanes: walk the valid slots of each primitive's quantized node
+          BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+          for (size_t j=0;j<num;j++)
+          {
+            size_t m_valid = movemask(prim[j].qnode.validMask());
+            vfloat<K> dist;
+            while(m_valid)
+            {
+              const size_t i = bscf(m_valid);
+              if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue; // no active lane hits box i
+              intersect(valid,pre,ray,context,prim[j].subgrid(i));
+            }
+          }
+        }
+
+        template<bool robust>        
+        static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        { // leaf level, all lanes: returns the complement of the lanes left in valid0
+          BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+          vbool<K> valid0 = valid;
+          for (size_t j=0;j<num;j++)
+          {
+            size_t m_valid = movemask(prim[j].qnode.validMask());
+            vfloat<K> dist;
+            while(m_valid)
+            {
+              const size_t i = bscf(m_valid);
+              if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue;
+              valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i)); // drop the lanes reported occluded
+              if (none(valid0)) break; // NOTE: leaves only the inner while loop; the outer for loop keeps iterating
+            }
+          }
+          return !valid0;
+        }
+        
+        template<int Nx, bool robust>        
+          static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+        { // leaf level, single lane k
+          BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+          for (size_t i=0;i<num;i++)
+          {
+            vfloat<Nx> dist;
+            size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); 
+            while(mask != 0)
+            {
+              const size_t ID = bscf(mask); 
+              assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+              if (unlikely(dist[ID] > ray.tfar[k])) continue; // box lies beyond lane k's current tfar
+              intersect(pre,ray,k,context,prim[i].subgrid(ID));
+            }
+          }
+        }
+        
+        template<int Nx, bool robust>
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+        { // leaf level, single lane k: true as soon as one subgrid reports occlusion
+          BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+          for (size_t i=0;i<num;i++)
+          {
+            vfloat<Nx> dist;
+            size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); 
+            while(mask != 0)
+            {
+              const size_t ID = bscf(mask); 
+              assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+              if (occluded(pre,ray,k,context,prim[i].subgrid(ID)))
+                return true;
+            }
+          }
+          return false;
+        }
+    };
+
+
+    template<int N, int K, bool filter>
+    struct SubGridIntersectorKPluecker
+    { // Intersector for SubGridQBVHN<N> leaves on RayK<K>/RayHitK<K>: overloads for all lanes of a mask and for a single lane k
+      typedef SubGridQBVHN<N> Primitive;
+      typedef SubGridQuadMIntersectorKPluecker<4,K,filter> Precalculations;
+
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      { // lanes in valid_i against the four quads of one subgrid
+        Vec3fa vtx[16]; // 4 quads x 4 corners, filled by gather() below
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        subgrid.gather(vtx,context->scene);
+        for (unsigned int i=0; i<4; i++)
+        {
+          const Vec3vf<K> p0 = vtx[i*4+0];
+          const Vec3vf<K> p1 = vtx[i*4+1];
+          const Vec3vf<K> p2 = vtx[i*4+2];
+          const Vec3vf<K> p3 = vtx[i*4+3];
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          pre.intersectK(valid_i,ray,p0,p1,p2,p3,g,subgrid,i,IntersectKEpilogM<4,K,filter>(ray,context,subgrid.geomID(),subgrid.primID(),i));
+        }
+      }
+
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      { // returns the complement of valid0 once the quads have been tested
+        vbool<K> valid0 = valid_i; // handed to OccludedKEpilogM, which presumably removes occluded lanes -- confirm
+        Vec3fa vtx[16];
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        subgrid.gather(vtx,context->scene);
+        for (unsigned int i=0; i<4; i++)
+        {
+          const Vec3vf<K> p0 = vtx[i*4+0];
+          const Vec3vf<K> p1 = vtx[i*4+1];
+          const Vec3vf<K> p2 = vtx[i*4+2];
+          const Vec3vf<K> p3 = vtx[i*4+3];
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i)))
+            break; // remaining quads are skipped when intersectK returns true
+        }
+        return !valid0;
+      }
+      
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      { // single lane k of the packet against one subgrid
+        STAT3(normal.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+      }
+
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      { // single lane k of the packet against one subgrid
+        STAT3(shadow.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+      }
+      
+        template<bool robust>
+          static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        { // leaf level, all lanes: walk the valid slots of each primitive's quantized node
+          BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+          for (size_t j=0;j<num;j++)
+          {
+            size_t m_valid = movemask(prim[j].qnode.validMask());
+            vfloat<K> dist;
+            while(m_valid)
+            {
+              const size_t i = bscf(m_valid);
+              if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue; // no active lane hits box i
+              intersect(valid,pre,ray,context,prim[j].subgrid(i));
+            }
+          }
+        }
+
+        template<bool robust>        
+        static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        { // leaf level, all lanes: returns the complement of the lanes left in valid0
+          BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+          vbool<K> valid0 = valid;
+          for (size_t j=0;j<num;j++)
+          {
+            size_t m_valid = movemask(prim[j].qnode.validMask());
+            vfloat<K> dist;
+            while(m_valid)
+            {
+              const size_t i = bscf(m_valid);
+              if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue;
+              valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i)); // drop the lanes reported occluded
+              if (none(valid0)) break; // NOTE: leaves only the inner while loop; the outer for loop keeps iterating
+            }
+          }
+          return !valid0;
+        }
+        
+        template<int Nx, bool robust>        
+          static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+        { // leaf level, single lane k
+          BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+          for (size_t i=0;i<num;i++)
+          {
+            vfloat<Nx> dist;
+            size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); 
+            while(mask != 0)
+            {
+              const size_t ID = bscf(mask); 
+              assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+              if (unlikely(dist[ID] > ray.tfar[k])) continue; // box lies beyond lane k's current tfar
+              intersect(pre,ray,k,context,prim[i].subgrid(ID));
+            }
+          }
+        }
+        
+        template<int Nx, bool robust>
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+        { // leaf level, single lane k: true as soon as one subgrid reports occlusion
+          BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+          for (size_t i=0;i<num;i++)
+          {
+            vfloat<Nx> dist;
+            size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); 
+            while(mask != 0)
+            {
+              const size_t ID = bscf(mask); 
+              assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+              if (occluded(pre,ray,k,context,prim[i].subgrid(ID)))
+                return true;
+            }
+          }
+          return false;
+        }
+    };
+
+
+
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_moeller.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_moeller.h
new file mode 100644
index 0000000000..f65b4abf61
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_moeller.h
@@ -0,0 +1,493 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subgrid.h"
+#include "quad_intersector_moeller.h"
+
+namespace embree
+{
+  namespace isa
+  {
+
+    /* ----------------------------- */
+    /* -- single ray intersectors -- */
+    /* ----------------------------- */
+
+    template<int M>
+      __forceinline void interpolateUV(MoellerTrumboreHitM<M> &hit,const GridMesh::Grid &g, const SubGrid& subgrid) 
+    {
+      /* remap the per-quad U,V of each hit to coordinates spanning the whole grid; */
+      /* the integer quad offsets are scaled by hit.absDen before being added to hit.U/hit.V */
+      const vint<M> quadX = vint<M>((int)subgrid.x()) + vint<M>(0,1,1,0);
+      const vint<M> quadY = vint<M>((int)subgrid.y()) + vint<M>(0,0,1,1);
+      /* reciprocal of (resolution-1) along each grid axis */
+      const float rcpSpanX = rcp((float)((int)g.resX-1));
+      const float rcpSpanY = rcp((float)((int)g.resY-1));
+      hit.U = (hit.U + (vfloat<M>)quadX * hit.absDen) * rcpSpanX;
+      hit.V = (hit.V + (vfloat<M>)quadY * hit.absDen) * rcpSpanY;
+    }
+    
+    template<int M, bool filter>
+      struct SubGridQuadMIntersector1MoellerTrumbore;
+
+    template<int M, bool filter> /* single-ray Moeller-Trumbore test of M subgrid quads, each handled as two triangles */
+      struct SubGridQuadMIntersector1MoellerTrumbore
+      {
+        __forceinline SubGridQuadMIntersector1MoellerTrumbore() {}
+
+        __forceinline SubGridQuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}
+
+        __forceinline void intersect(RayHit& ray, IntersectContext* context,
+                                     const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                     const GridMesh::Grid &g, const SubGrid& subgrid) const
+        {
+          MoellerTrumboreHitM<M> triHit;
+          MoellerTrumboreIntersector1<M> triIsec(ray,nullptr);
+          Intersect1EpilogMU<M,filter> commit(ray,context,subgrid.geomID(),subgrid.primID());
+
+          const bool hitFirst = triIsec.intersect(ray,v0,v1,v3,triHit); /* triangle (v0,v1,v3) */
+          if (hitFirst)
+          {
+            interpolateUV<M>(triHit,g,subgrid);
+            commit(triHit.valid,triHit);
+          }
+
+          const bool hitSecond = triIsec.intersect(ray,v2,v3,v1,triHit); /* triangle (v2,v3,v1) */
+          if (hitSecond)
+          {
+            triHit.U = triHit.absDen - triHit.U; /* mirror U,V (still scaled by absDen) for the second triangle */
+            triHit.V = triHit.absDen - triHit.V;
+            interpolateUV<M>(triHit,g,subgrid);
+            commit(triHit.valid,triHit);
+          }
+        }
+      
+        __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                    const GridMesh::Grid &g, const SubGrid& subgrid) const
+        {
+          MoellerTrumboreHitM<M> triHit;
+          MoellerTrumboreIntersector1<M> triIsec(ray,nullptr);
+          Occluded1EpilogMU<M,filter> commit(ray,context,subgrid.geomID(),subgrid.primID());
+          
+          const bool hitFirst = triIsec.intersect(ray,v0,v1,v3,triHit); /* triangle (v0,v1,v3) */
+          if (hitFirst)
+          {
+            interpolateUV<M>(triHit,g,subgrid);
+            const bool blocked = commit(triHit.valid,triHit);
+            if (blocked) return true;
+          }
+
+          const bool hitSecond = triIsec.intersect(ray,v2,v3,v1,triHit); /* triangle (v2,v3,v1) */
+          if (hitSecond)
+          {
+            triHit.U = triHit.absDen - triHit.U; /* mirror U,V (still scaled by absDen) for the second triangle */
+            triHit.V = triHit.absDen - triHit.V;
+            interpolateUV<M>(triHit,g,subgrid);
+            const bool blocked = commit(triHit.valid,triHit);
+            if (blocked) return true;
+          }
+          return false;
+        }
+      };
+
+#if defined (__AVX__)
+
+    /*! Intersects 4 quads with 1 ray using AVX: both triangles of every quad go through one 8-wide Moeller-Trumbore test */
+    template<bool filter>
+      struct SubGridQuadMIntersector1MoellerTrumbore<4,filter>
+    {
+      __forceinline SubGridQuadMIntersector1MoellerTrumbore() {}
+
+      __forceinline SubGridQuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {} // both arguments are unused
+      
+      template<typename Epilog> // shared worker: returns true only when the epilog reports true for the packed hit
+        __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid& subgrid, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); // v0 (first triangle) and v2 (second triangle) packed side by side
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); // same v1 for both triangles; the U/V and normal fix-up below compensates
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));        
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); // with culling the second triangle is passed as (v2,v3,v1)
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+        MoellerTrumboreHitM<8> hit;
+        MoellerTrumboreIntersector1<8> intersector(ray,nullptr);
+        const vbool8 flags(0,0,0,0,1,1,1,1); // selects the lanes that receive the second-triangle fix-up (presumably those holding v2)
+        if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,hit)))
+        {
+          vfloat8 U = hit.U, V = hit.V, absDen = hit.absDen;
+
+#if !defined(EMBREE_BACKFACE_CULLING)
+          hit.U = select(flags,absDen-V,U); // flagged lanes: U and V swap roles and are mirrored
+          hit.V = select(flags,absDen-U,V);
+          hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); // flagged lanes: negate the geometry normal
+#else
+          hit.U = select(flags,absDen-U,U); // flagged lanes: mirror U and V without swapping
+          hit.V = select(flags,absDen-V,V);
+#endif
+          /* correct U,V interpolation across the entire grid (quad offsets are scaled by absDen like U and V) */
+          const vint8 sx((int)subgrid.x());
+          const vint8 sy((int)subgrid.y());
+          const vint8 sx8(sx + vint8(0,1,1,0,0,1,1,0)); // same per-quad offsets repeated for both halves
+          const vint8 sy8(sy + vint8(0,0,1,1,0,0,1,1));
+          const float inv_resX = rcp((float)((int)g.resX-1));
+          const float inv_resY = rcp((float)((int)g.resY-1));          
+          hit.U = (hit.U + (vfloat8)sx8 * absDen) * inv_resX;
+          hit.V = (hit.V + (vfloat8)sy8 * absDen) * inv_resY;          
+
+          if (unlikely(epilog(hit.valid,hit)))
+            return true;
+        }
+        return false;
+      }
+      
+      __forceinline bool intersect(RayHit& ray, IntersectContext* context, // runs the worker with an Intersect1EpilogMU<8,filter>
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const GridMesh::Grid &g, const SubGrid& subgrid) const
+      {
+          return intersect(ray,v0,v1,v2,v3,g,subgrid,Intersect1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID()));
+      }
+      
+      __forceinline bool occluded(Ray& ray, IntersectContext* context, // runs the worker with an Occluded1EpilogMU<8,filter>
+                                  const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                  const GridMesh::Grid &g, const SubGrid& subgrid) const
+      {
+          return intersect(ray,v0,v1,v2,v3,g,subgrid,Occluded1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID()));
+      }
+    };
+
+#endif
+
+    // ============================================================================================================================
+    // ============================================================================================================================
+    // ============================================================================================================================
+
+
+    /* ----------------------------- */
+    /* -- ray packet intersectors -- */
+    /* ----------------------------- */
+
+    template<int K> /* deferred hit record for a ray packet: final u,v,t,Ng are only computed when operator() is invoked */
+      struct SubGridQuadHitK
+      {
+        __forceinline SubGridQuadHitK(const vfloat<K>& U,
+                                      const vfloat<K>& V,
+                                      const vfloat<K>& T,
+                                      const vfloat<K>& absDen,
+                                      const Vec3vf<K>& Ng,
+                                      const vbool<K>& flags,
+                                      const GridMesh::Grid &g, 
+                                      const SubGrid& subgrid,
+                                      const unsigned int i)
+        : U(U), V(V), T(T), absDen(absDen), flags(flags), tri_Ng(Ng), g(g), subgrid(subgrid), i(i) {}
+
+        __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+        {
+          /* divide U, V and T by absDen; the resulting u and v are clamped to at most 1 */
+          const vfloat<K> invDen = rcp(absDen);
+          const vfloat<K> tHit = T * invDen;
+          const vfloat<K> baryU = min(U * invDen,1.0f);
+          const vfloat<K> baryV = min(V * invDen,1.0f);
+          /* lanes with 'flags' set use the mirrored coordinates 1-u and 1-v */
+          const vfloat<K> quadU = select(flags,vfloat<K>(1.0f) - baryU,baryU);
+          const vfloat<K> quadV = select(flags,vfloat<K>(1.0f) - baryV,baryV);
+          /* quad i is offset by (i%2) in x and (i>>1) in y from the subgrid origin */
+          const unsigned int col = subgrid.x() + (unsigned int)(i % 2);
+          const unsigned int row = subgrid.y() + (unsigned int)(i >>1);
+          const float rcpSpanX = rcp((float)(int)(g.resX-1));
+          const float rcpSpanY = rcp((float)(int)(g.resY-1));
+          const vfloat<K> gridU = (quadU + (float)(int)col) * rcpSpanX;
+          const vfloat<K> gridV = (quadV + (float)(int)row) * rcpSpanY;
+          return std::make_tuple(gridU,gridV,tHit,Vec3vf<K>(tri_Ng.x,tri_Ng.y,tri_Ng.z));
+        }
+
+      private:
+        const vfloat<K> U;
+        const vfloat<K> V;
+        const vfloat<K> T;
+        const vfloat<K> absDen;
+        const vbool<K> flags;
+        const Vec3vf<K> tri_Ng;
+
+        const GridMesh::Grid &g;
+        const SubGrid& subgrid;
+        const size_t i;
+      };
+
+    template<int M, int K, bool filter> // Moeller-Trumbore kernels shared by the packet intersectors: K rays vs one triangle, and ray k vs M triangles
+      struct SubGridQuadMIntersectorKMoellerTrumboreBase
+      {
+        __forceinline SubGridQuadMIntersectorKMoellerTrumboreBase(const vbool<K>& valid, const RayK<K>& ray) {} // both arguments are unused
+            
+        template<typename Epilog> // K rays against one triangle given as vertex, two edges and normal; returns what the epilog returns
+        __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                          RayK<K>& ray,
+                                          const Vec3vf<K>& tri_v0,
+                                          const Vec3vf<K>& tri_e1,
+                                          const Vec3vf<K>& tri_e2,
+                                          const Vec3vf<K>& tri_Ng,
+                                          const vbool<K>& flags,
+                                          const GridMesh::Grid &g, 
+                                          const SubGrid &subgrid,
+                                          const unsigned int i,
+                                          const Epilog& epilog) const
+        { 
+          /* calculate denominator; its sign bit (sgnDen) is xor-ed into U, V and T below */
+          vbool<K> valid = valid0;
+          const Vec3vf<K> C = tri_v0 - ray.org;
+          const Vec3vf<K> R = cross(C,ray.dir);
+          const vfloat<K> den = dot(tri_Ng,ray.dir);
+          const vfloat<K> absDen = abs(den);
+          const vfloat<K> sgnDen = signmsk(den);
+        
+          /* test against edge p2 p0 */
+          const vfloat<K> U = dot(R,tri_e2) ^ sgnDen;
+          valid &= U >= 0.0f;
+          if (likely(none(valid))) return false; // early out once no ray is left
+        
+          /* test against edge p0 p1 */
+          const vfloat<K> V = dot(R,tri_e1) ^ sgnDen;
+          valid &= V >= 0.0f;
+          if (likely(none(valid))) return false;
+        
+          /* test against edge p1 p2 */
+          const vfloat<K> W = absDen-U-V;
+          valid &= W >= 0.0f;
+          if (likely(none(valid))) return false;
+        
+          /* perform depth test; T is compared against tnear/tfar multiplied by absDen, so no division is needed here */
+          const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen;
+          valid &= (absDen*ray.tnear() < T) & (T <= absDen*ray.tfar);
+          if (unlikely(none(valid))) return false;
+        
+          /* perform backface culling; without culling only a zero denominator is rejected */
+#if defined(EMBREE_BACKFACE_CULLING)
+          valid &= den < vfloat<K>(zero);
+          if (unlikely(none(valid))) return false;
+#else
+          valid &= den != vfloat<K>(zero);
+          if (unlikely(none(valid))) return false;
+#endif
+        
+          /* calculate hit information (deferred: the hit object holds the unnormalized values) */
+          SubGridQuadHitK<K> hit(U,V,T,absDen,tri_Ng,flags,g,subgrid,i);
+          return epilog(valid,hit);
+        }
+      
+        template<typename Epilog> // triangle given by three vertices: derives edges and normal, then forwards to the overload above
+        __forceinline vbool<K> intersectK(const vbool<K>& valid0, 
+                                          RayK<K>& ray,
+                                          const Vec3vf<K>& tri_v0,
+                                          const Vec3vf<K>& tri_v1,
+                                          const Vec3vf<K>& tri_v2,
+                                          const vbool<K>& flags,
+                                          const GridMesh::Grid &g, 
+                                          const SubGrid &subgrid,
+                                          const unsigned int i,
+                                          const Epilog& epilog) const
+        {
+          const Vec3vf<K> e1 = tri_v0-tri_v1;
+          const Vec3vf<K> e2 = tri_v2-tri_v0;
+          const Vec3vf<K> Ng = cross(e2,e1);
+          return intersectK(valid0,ray,tri_v0,e1,e2,Ng,flags,g,subgrid,i,epilog);
+        }
+
+        template<typename Epilog> // quad given by four vertices: tests triangles (v0,v1,v3) and (v2,v3,v1)
+        __forceinline bool intersectK(const vbool<K>& valid0, 
+                                      RayK<K>& ray,
+                                      const Vec3vf<K>& v0,
+                                      const Vec3vf<K>& v1,
+                                      const Vec3vf<K>& v2,
+                                      const Vec3vf<K>& v3,
+                                      const GridMesh::Grid &g, 
+                                      const SubGrid &subgrid,
+                                      const unsigned int i,
+                                      const Epilog& epilog) const
+        {
+          intersectK(valid0,ray,v0,v1,v3,vbool<K>(false),g,subgrid,i,epilog); // result mask is discarded
+          if (none(valid0)) return true; // NOTE(review): valid0 is a const reference the calls here do not modify, so this tests the caller's input mask
+          intersectK(valid0,ray,v2,v3,v1,vbool<K>(true ),g,subgrid,i,epilog); // flags=true marks the second triangle
+          return none(valid0);
+        }
+
+        static  __forceinline bool intersect1(RayK<K>& ray, // ray k of the packet against M triangles; fills 'hit' and returns true if any lane hits
+                                              size_t k,
+                                              const Vec3vf<M>& tri_v0,
+                                              const Vec3vf<M>& tri_e1,
+                                              const Vec3vf<M>& tri_e2,
+                                              const Vec3vf<M>& tri_Ng,
+                                              MoellerTrumboreHitM<M> &hit)
+        {
+          /* calculate denominator (origin and direction of ray k are broadcast to all M lanes) */
+          const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k);
+          const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k);
+          const Vec3vf<M> C = Vec3vf<M>(tri_v0) - O;
+          const Vec3vf<M> R = cross(C,D);
+          const vfloat<M> den = dot(Vec3vf<M>(tri_Ng),D);
+          const vfloat<M> absDen = abs(den);
+          const vfloat<M> sgnDen = signmsk(den);
+        
+          /* perform edge tests */
+          const vfloat<M> U = dot(R,Vec3vf<M>(tri_e2)) ^ sgnDen;
+          const vfloat<M> V = dot(R,Vec3vf<M>(tri_e1)) ^ sgnDen;
+        
+          /* perform backface culling; the third edge test is folded in as U+V<=absDen */
+#if defined(EMBREE_BACKFACE_CULLING)
+          vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#else
+          vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#endif
+          if (likely(none(valid))) return false;
+        
+          /* perform depth test against tnear/tfar of ray k, both multiplied by absDen */
+          const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+          valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k]));
+          if (likely(none(valid))) return false;
+        
+          /* calculate hit information (placement-new constructs the result in the caller's 'hit') */
+          new (&hit) MoellerTrumboreHitM<M>(valid,U,V,T,absDen,tri_Ng);
+          return true;
+        }
+
+        static __forceinline bool intersect1(RayK<K>& ray, // triangles given by three vertices: derives edges and normal, then forwards
+                                             size_t k,
+                                             const Vec3vf<M>& v0,
+                                             const Vec3vf<M>& v1,
+                                             const Vec3vf<M>& v2,
+                                             MoellerTrumboreHitM<M> &hit)
+        {
+          const Vec3vf<M> e1 = v0-v1;
+          const Vec3vf<M> e2 = v2-v0;
+          const Vec3vf<M> Ng = cross(e2,e1);
+          return intersect1(ray,k,v0,e1,e2,Ng,hit);
+        }
+
+      };
+
+    template<int M, int K, bool filter> /* packet variant: tests ray k of a K-wide packet against M subgrid quads (two triangles each) */
+      struct SubGridQuadMIntersectorKMoellerTrumbore : public SubGridQuadMIntersectorKMoellerTrumboreBase<M,K,filter>
+    {
+      __forceinline SubGridQuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
+        : SubGridQuadMIntersectorKMoellerTrumboreBase<M,K,filter>(valid,ray) {}
+      typedef SubGridQuadMIntersectorKMoellerTrumboreBase<M,K,filter> Base; /* the M-wide base, matching the Vec3vf<M> arguments below */
+      __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+        Intersect1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID());
+
+        MoellerTrumboreHitM<M> hit; /* width follows M instead of a hard-coded 4 */
+        if (Base::intersect1(ray,k,v0,v1,v3,hit)) /* first triangle (v0,v1,v3) */
+        {
+          interpolateUV<M>(hit,g,subgrid);
+          epilog(hit.valid,hit);
+        }
+
+        if (Base::intersect1(ray,k,v2,v3,v1,hit)) /* second triangle (v2,v3,v1) */
+        {
+          hit.U = hit.absDen - hit.U; /* mirror U,V (still scaled by absDen) for the second triangle */
+          hit.V = hit.absDen - hit.V;
+          interpolateUV<M>(hit,g,subgrid);
+          epilog(hit.valid,hit);
+        }
+
+      }
+      
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+        Occluded1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID());
+
+        MoellerTrumboreHitM<M> hit; /* width follows M instead of a hard-coded 4 */
+        if (Base::intersect1(ray,k,v0,v1,v3,hit)) /* first triangle (v0,v1,v3) */
+        {
+          interpolateUV<M>(hit,g,subgrid);
+          if (epilog(hit.valid,hit)) return true;
+        }
+
+        if (Base::intersect1(ray,k,v2,v3,v1,hit)) /* second triangle (v2,v3,v1) */
+        {
+          hit.U = hit.absDen - hit.U; /* mirror U,V (still scaled by absDen) for the second triangle */
+          hit.V = hit.absDen - hit.V;
+          interpolateUV<M>(hit,g,subgrid);
+          if (epilog(hit.valid,hit)) return true;
+        }
+        return false;
+      }
+    };
+
+
+#if defined (__AVX__)
+
+    /*! Intersects 4 quads with 1 ray (ray k of a packet) using AVX: both triangles of every quad go through one 8-wide test */
+    template<int K, bool filter>
+      struct SubGridQuadMIntersectorKMoellerTrumbore<4,K,filter> : public SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>
+    {
+      __forceinline SubGridQuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
+        : SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {}
+      
+      template<typename Epilog> // shared worker: returns true only when the epilog reports true for the packed hit
+        __forceinline bool intersect1(RayK<K>& ray, size_t k,const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                      const GridMesh::Grid &g, const SubGrid &subgrid, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); // v0 (first triangle) and v2 (second triangle) packed side by side
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); // same v1 for both triangles; the U/V and normal fix-up below compensates
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); // with culling the second triangle is passed as (v2,v3,v1)
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+        const vbool8 flags(0,0,0,0,1,1,1,1); // selects the lanes that receive the second-triangle fix-up (presumably those holding v2)
+
+        MoellerTrumboreHitM<8> hit;
+        if (SubGridQuadMIntersectorKMoellerTrumboreBase<8,K,filter>::intersect1(ray,k,vtx0,vtx1,vtx2,hit)) // uses the 8-wide base kernel, not this struct's 4-wide base
+        {
+          vfloat8 U = hit.U, V = hit.V, absDen = hit.absDen;
+#if !defined(EMBREE_BACKFACE_CULLING)
+          hit.U = select(flags,absDen-V,U); // flagged lanes: U and V swap roles and are mirrored
+          hit.V = select(flags,absDen-U,V);
+          hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); // flagged lanes: negate the geometry normal
+#else
+          hit.U = select(flags,absDen-U,U); // flagged lanes: mirror U and V without swapping
+          hit.V = select(flags,absDen-V,V);
+#endif
+
+          /* correct U,V interpolation across the entire grid (quad offsets are scaled by absDen like U and V) */
+          const vint8 sx((int)subgrid.x());
+          const vint8 sy((int)subgrid.y());
+          const vint8 sx8(sx + vint8(0,1,1,0,0,1,1,0)); // same per-quad offsets repeated for both halves
+          const vint8 sy8(sy + vint8(0,0,1,1,0,0,1,1));
+          const float inv_resX = rcp((float)((int)g.resX-1));
+          const float inv_resY = rcp((float)((int)g.resY-1));          
+          hit.U = (hit.U + (vfloat8)sx8 * absDen) * inv_resX;
+          hit.V = (hit.V + (vfloat8)sy8 * absDen) * inv_resY;          
+          if (unlikely(epilog(hit.valid,hit)))
+            return true;
+
+        }
+        return false;
+      }
+      
+      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context, // runs the worker with an Intersect1KEpilogMU<8,K,filter>
+                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Intersect1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID()));
+      }
+      
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context, // runs the worker with an Occluded1KEpilogMU<8,K,filter>
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Occluded1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID()));
+      }
+    };
+
+#endif
+
+
+
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_pluecker.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_pluecker.h
new file mode 100644
index 0000000000..1cd88aa799
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_pluecker.h
@@ -0,0 +1,508 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subgrid.h"
+#include "quad_intersector_moeller.h"
+#include "quad_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+
+    template<int M> // M-wide hit record of the Pluecker test; u, v and the normal are finalized in the constructor
+    struct SubGridQuadHitPlueckerM
+    {
+      __forceinline SubGridQuadHitPlueckerM() {} // leaves all members unset; filled later via placement-new
+
+      __forceinline SubGridQuadHitPlueckerM(const vbool<M>& valid,
+                                            const vfloat<M>& U,
+                                            const vfloat<M>& V,
+                                            const vfloat<M>& UVW,
+                                            const vfloat<M>& t,
+                                            const Vec3vf<M>& Ng,
+                                            const vbool<M>& flags) : valid(valid), vt(t)
+      {
+        const vbool<M> invalid = abs(UVW) < min_rcp_input; // lanes whose |UVW| is below min_rcp_input get a reciprocal of 0
+        const vfloat<M> rcpUVW = select(invalid,vfloat<M>(0.0f),rcp(UVW));
+        const vfloat<M> u = min(U * rcpUVW,1.0f); // normalized and clamped to at most 1
+        const vfloat<M> v = min(V * rcpUVW,1.0f);
+        const vfloat<M> u1 = vfloat<M>(1.0f) - u;
+        const vfloat<M> v1 = vfloat<M>(1.0f) - v;
+#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING)
+        vu = select(flags,u1,u); // flagged lanes: mirrored coordinates, normal unchanged
+        vv = select(flags,v1,v);
+        vNg = Vec3vf<M>(Ng.x,Ng.y,Ng.z);
+#else
+        const vfloat<M> flip = select(flags,vfloat<M>(-1.0f),vfloat<M>(1.0f));
+        vv = select(flags,u1,v); // flagged lanes: u and v swap roles, are mirrored, and the normal is negated
+        vu = select(flags,v1,u);
+        vNg = Vec3vf<M>(flip*Ng.x,flip*Ng.y,flip*Ng.z);
+#endif
+      }
+
+      __forceinline void finalize() // intentionally empty: everything is computed in the constructor
+      {
+      }
+
+      __forceinline Vec2f uv(const size_t i) // (u,v) of lane i
+      {
+        const float u = vu[i];
+        const float v = vv[i];
+        return Vec2f(u,v);
+      }
+
+      __forceinline float   t(const size_t i) { return vt[i]; } // hit distance of lane i
+      __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } // geometry normal of lane i
+
+    public:
+      vbool<M> valid; // lanes holding a hit
+      vfloat<M> vu;
+      vfloat<M> vv;
+      vfloat<M> vt;
+      Vec3vf<M> vNg;
+    };
+
+    template<int M>
+      __forceinline void interpolateUV(SubGridQuadHitPlueckerM<M> &hit,const GridMesh::Grid &g, const SubGrid& subgrid, const vint<M> &stepX, const vint<M> &stepY) 
+    {
+      /* shift the per-quad u,v of every lane by its quad position (subgrid origin plus the */
+      /* caller-provided per-lane step) so that the result spans the entire grid */
+      const vint<M> quadX = vint<M>((int)subgrid.x()) + stepX;
+      const vint<M> quadY = vint<M>((int)subgrid.y()) + stepY;
+      /* reciprocal of (resolution-1) along each grid axis */
+      const float rcpSpanX = rcp((float)((int)g.resX-1));
+      const float rcpSpanY = rcp((float)((int)g.resY-1));
+      hit.vu = (hit.vu + vfloat<M>(quadX)) * rcpSpanX;
+      hit.vv = (hit.vv + vfloat<M>(quadY)) * rcpSpanY;
+    }
+
+    template<int M> // single ray against M triangles via edge tests relative to the ray origin; fills 'hit' and returns true if any lane hits
+    __forceinline static bool intersectPluecker(Ray& ray,
+                                                const Vec3vf<M>& tri_v0,
+                                                const Vec3vf<M>& tri_v1,
+                                                const Vec3vf<M>& tri_v2,
+                                                const vbool<M>& flags,
+                                                SubGridQuadHitPlueckerM<M> &hit)
+    {
+        /* calculate vertices relative to ray origin */
+      const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org);
+      const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir);
+        const Vec3vf<M> v0 = tri_v0-O;
+        const Vec3vf<M> v1 = tri_v1-O;
+        const Vec3vf<M> v2 = tri_v2-O;
+
+        /* calculate triangle edges */
+        const Vec3vf<M> e0 = v2-v0;
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v1-v2;
+
+        /* perform edge tests; eps is a tolerance proportional to |U+V+W| */
+        const vfloat<M> U = dot(cross(e0,v2+v0),D);
+        const vfloat<M> V = dot(cross(e1,v0+v1),D);
+        const vfloat<M> W = dot(cross(e2,v1+v2),D);
+        const vfloat<M> UVW = U+V+W;
+        const vfloat<M> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+        vbool<M> valid = max(U,V,W) <= eps; // culling: only the all-non-positive case (within eps) passes
+#else
+        vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); // all three edge values must agree in sign (within eps)
+#endif
+        if (unlikely(none(valid))) return false;
+
+        /* calculate geometry normal and denominator */
+        const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+        const vfloat<M> den = twice(dot(Ng,D));
+
+        /* perform depth test; lanes with a zero denominator are rejected right after */
+        const vfloat<M> T = twice(dot(v0,Ng));
+        const vfloat<M> t = rcp(den)*T;
+        valid &= vfloat<M>(ray.tnear()) <= t & t <= vfloat<M>(ray.tfar); // parses as (tnear <= t) & (t <= tfar) since <= binds tighter than &
+        valid &= den != vfloat<M>(zero);
+        if (unlikely(none(valid))) return false;
+
+        /* update hit information (placement-new constructs the result in the caller's 'hit') */
+        new (&hit) SubGridQuadHitPlueckerM<M>(valid,U,V,UVW,t,Ng,flags);
+        return true;
+      }
+
+    template<int M, bool filter>
+      struct SubGridQuadMIntersector1Pluecker;
+
+    template<int M, bool filter> /* single-ray Pluecker test of M subgrid quads, each handled as two triangles */
+      struct SubGridQuadMIntersector1Pluecker
+      {
+        __forceinline SubGridQuadMIntersector1Pluecker() {}
+
+        __forceinline SubGridQuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {}
+
+        __forceinline void intersect(RayHit& ray, IntersectContext* context,
+                                     const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                     const GridMesh::Grid &g, const SubGrid& subgrid) const
+        {
+          SubGridQuadHitPlueckerM<M> quadHit;
+          Intersect1EpilogMU<M,filter> commit(ray,context,subgrid.geomID(),subgrid.primID());
+          const vint<M> stepX(0,1,1,0); /* per-lane quad offsets inside the subgrid */
+          const vint<M> stepY(0,0,1,1);
+          if (intersectPluecker(ray,v0,v1,v3,vbool<M>(false),quadHit)) /* first triangle (v0,v1,v3) */
+          {
+            interpolateUV<M>(quadHit,g,subgrid,stepX,stepY);
+            commit(quadHit.valid,quadHit);
+          }
+
+          /* second triangle (v2,v3,v1); flags=true selects the second-triangle u,v handling in the hit constructor */
+          if (intersectPluecker(ray,v2,v3,v1,vbool<M>(true),quadHit)) 
+          {
+            interpolateUV<M>(quadHit,g,subgrid,stepX,stepY);
+            commit(quadHit.valid,quadHit);
+          }
+        }
+      
+        __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                    const GridMesh::Grid &g, const SubGrid& subgrid) const
+        {
+          SubGridQuadHitPlueckerM<M> quadHit;
+          Occluded1EpilogMU<M,filter> commit(ray,context,subgrid.geomID(),subgrid.primID());
+          const vint<M> stepX(0,1,1,0); /* per-lane quad offsets inside the subgrid */
+          const vint<M> stepY(0,0,1,1);
+          if (intersectPluecker(ray,v0,v1,v3,vbool<M>(false),quadHit)) /* first triangle (v0,v1,v3) */
+          {
+            interpolateUV<M>(quadHit,g,subgrid,stepX,stepY);
+            const bool blocked = commit(quadHit.valid,quadHit);
+            if (blocked) return true;
+          }
+
+          /* second triangle (v2,v3,v1); flags=true selects the second-triangle u,v handling in the hit constructor */
+          if (intersectPluecker(ray,v2,v3,v1,vbool<M>(true),quadHit)) 
+          {
+            interpolateUV<M>(quadHit,g,subgrid,stepX,stepY);
+            const bool blocked = commit(quadHit.valid,quadHit);
+            if (blocked) return true;
+          }
+
+          return false;
+        }
+      };
+
+#if defined (__AVX__)
+
+    /*! Intersects 4 quads with 1 ray using AVX: both triangles of every quad go through one 8-wide Pluecker test */
+    template<bool filter>
+      struct SubGridQuadMIntersector1Pluecker<4,filter>
+    {
+      __forceinline SubGridQuadMIntersector1Pluecker() {}
+
+      __forceinline SubGridQuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} // both arguments are unused
+      
+      template<typename Epilog> // shared worker: returns true only when the epilog reports true for the packed hit
+        __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid& subgrid, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); // v0 (first triangle) and v2 (second triangle) packed side by side
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); // same v1 for both triangles; the hit constructor swaps u,v and negates Ng on flagged lanes
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));        
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); // with culling the second triangle is passed as (v2,v3,v1)
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+        SubGridQuadHitPlueckerM<8> hit;
+        const vbool8 flags(0,0,0,0,1,1,1,1); // selects the lanes treated as second triangles (presumably those holding v2)
+        if (unlikely(intersectPluecker(ray,vtx0,vtx1,vtx2,flags,hit)))
+        {
+          /* correct U,V interpolation across the entire grid; the same per-quad steps are repeated for both halves */
+          interpolateUV<8>(hit,g,subgrid,vint<8>(0,1,1,0,0,1,1,0),vint<8>(0,0,1,1,0,0,1,1));            
+          if (unlikely(epilog(hit.valid,hit)))
+            return true;
+        }
+        return false;
+      }
+      
+      __forceinline bool intersect(RayHit& ray, IntersectContext* context, // runs the worker with an Intersect1EpilogMU<8,filter>
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const GridMesh::Grid &g, const SubGrid& subgrid) const
+      {
+          return intersect(ray,v0,v1,v2,v3,g,subgrid,Intersect1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID()));
+      }
+      
+      __forceinline bool occluded(Ray& ray, IntersectContext* context, // runs the worker with an Occluded1EpilogMU<8,filter>
+                                  const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                  const GridMesh::Grid &g, const SubGrid& subgrid) const
+      {
+          return intersect(ray,v0,v1,v2,v3,g,subgrid,Occluded1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID()));
+      }
+    };
+
+#endif
+
+
+    /* ----------------------------- */
+    /* -- ray packet intersectors -- */
+    /* ----------------------------- */
+
+    /*! Deferred Pluecker hit record for a ray packet: final u,v,t,Ng are only computed when operator() is invoked */
+    template<int K>
+      struct SubGridQuadHitPlueckerK
+      {
+         __forceinline SubGridQuadHitPlueckerK(const vfloat<K>& U,
+                                               const vfloat<K>& V,
+                                               const vfloat<K>& UVW,
+                                               const vfloat<K>& t,
+                                               const Vec3vf<K>& Ng,
+                                               const vbool<K>& flags,
+                                               const GridMesh::Grid &g, 
+                                               const SubGrid& subgrid,
+                                               const unsigned int i)
+         : U(U), V(V), UVW(UVW), t(t), flags(flags), tri_Ng(Ng), g(g), subgrid(subgrid), i(i) {}
+
+        __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+        {
+          const vbool<K> invalid = abs(UVW) < min_rcp_input; /* lanes whose |UVW| is below min_rcp_input get a reciprocal of 0 */
+          const vfloat<K> rcpUVW = select(invalid,vfloat<K>(0.0f),rcp(UVW));
+          const vfloat<K> u0 = min(U * rcpUVW,1.0f); /* normalized and clamped to at most 1 */
+          const vfloat<K> v0 = min(V * rcpUVW,1.0f);
+          const vfloat<K> u1 = vfloat<K>(1.0f) - u0;
+          const vfloat<K> v1 = vfloat<K>(1.0f) - v0;
+          const vfloat<K> uu = select(flags,u1,u0); /* lanes with 'flags' set use the mirrored coordinates */
+          const vfloat<K> vv = select(flags,v1,v0);
+          const unsigned int sx = subgrid.x() + (unsigned int)(i % 2); /* quad i is offset by (i%2, i>>1) from the subgrid origin */
+          const unsigned int sy = subgrid.y() + (unsigned int)(i >>1);
+          const float inv_resX = rcp((float)(int)(g.resX-1));
+          const float inv_resY = rcp((float)(int)(g.resY-1));
+          const vfloat<K> u = (uu + (float)(int)sx) * inv_resX;
+          const vfloat<K> v = (vv + (float)(int)sy) * inv_resY;
+          const Vec3vf<K> Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z);
+          return std::make_tuple(u,v,t,Ng);
+        }
+
+      private:
+        const vfloat<K> U;
+        const vfloat<K> V;
+        const vfloat<K> UVW;
+        const vfloat<K> t;
+        const vbool<K> flags;
+        const Vec3vf<K> tri_Ng;
+
+        const GridMesh::Grid &g;
+        const SubGrid& subgrid;
+        const size_t i;
+      };
+
+
+    template<int M, int K, bool filter>
+      struct SubGridQuadMIntersectorKPlueckerBase
+      {
+        __forceinline SubGridQuadMIntersectorKPlueckerBase(const vbool<K>& valid, const RayK<K>& ray) {} // both arguments are ignored
+        /* Tests one triangle against the K rays of the packet; lanes passing the edge and depth tests are handed to epilog. */
+        template<typename Epilog>
+        __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                          RayK<K>& ray,
+                                          const Vec3vf<K>& tri_v0,
+                                          const Vec3vf<K>& tri_v1,
+                                          const Vec3vf<K>& tri_v2,
+                                          const Vec3vf<K>& tri_Ng,
+                                          const vbool<K>& flags,
+                                          const GridMesh::Grid &g, 
+                                          const SubGrid &subgrid,
+                                          const unsigned int i,
+                                          const Epilog& epilog) const
+        { 
+          /* start from the caller's lane mask; every test below can only clear lanes */
+          /* calculate vertices relative to ray origin */
+          vbool<K> valid = valid0;
+          const Vec3vf<K> O = ray.org;
+          const Vec3vf<K> D = ray.dir;
+          const Vec3vf<K> v0 = tri_v0-O;
+          const Vec3vf<K> v1 = tri_v1-O;
+          const Vec3vf<K> v2 = tri_v2-O;
+          
+          /* calculate triangle edges */
+          const Vec3vf<K> e0 = v2-v0;
+          const Vec3vf<K> e1 = v0-v1;
+          const Vec3vf<K> e2 = v1-v2;
+           
+          /* perform edge tests */
+          const vfloat<K> U = dot(Vec3vf<K>(cross(e0,v2+v0)),D);
+          const vfloat<K> V = dot(Vec3vf<K>(cross(e1,v0+v1)),D);
+          const vfloat<K> W = dot(Vec3vf<K>(cross(e2,v1+v2)),D);
+          const vfloat<K> UVW = U+V+W;
+          const vfloat<K> eps = float(ulp)*abs(UVW); // tolerance scales with |U+V+W|
+#if defined(EMBREE_BACKFACE_CULLING)
+          valid &= max(U,V,W) <= eps;
+#else
+          valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps);
+#endif
+          if (unlikely(none(valid))) return false;
+          
+          /* calculate geometry normal and denominator */
+          const Vec3vf<K> Ng = stable_triangle_normal(e0,e1,e2);
+          const vfloat<K> den = twice(dot(Vec3vf<K>(Ng),D));
+
+          /* perform depth test */
+          const vfloat<K> T = twice(dot(v0,Vec3vf<K>(Ng)));
+          const vfloat<K> t = rcp(den)*T;
+          valid &= ray.tnear() <= t & t <= ray.tfar; // parses as (tnear <= t) & (t <= tfar)
+          valid &= den != vfloat<K>(zero);
+          if (unlikely(none(valid))) return false;
+          
+          /* calculate hit information */
+          SubGridQuadHitPlueckerK<K> hit(U,V,UVW,t,tri_Ng,flags,g,subgrid,i); // the hit carries the caller-supplied tri_Ng, not the Ng computed above
+          return epilog(valid,hit);
+        }
+        /* Same test; the normal handed to the hit is derived from the vertices as cross(v2-v0,v0-v1). */
+        template<typename Epilog>
+        __forceinline vbool<K> intersectK(const vbool<K>& valid0, 
+                                          RayK<K>& ray,
+                                          const Vec3vf<K>& v0,
+                                          const Vec3vf<K>& v1,
+                                          const Vec3vf<K>& v2,
+                                          const vbool<K>& flags,
+                                          const GridMesh::Grid &g, 
+                                          const SubGrid &subgrid,
+                                          const unsigned int i,
+                                          const Epilog& epilog) const
+        {
+          const Vec3vf<K> e1 = v0-v1;
+          const Vec3vf<K> e2 = v2-v0;
+          const Vec3vf<K> Ng = cross(e2,e1);
+          return intersectK(valid0,ray,v0,v1,v2,Ng,flags,g,subgrid,i,epilog);
+        }
+        /* Quad test: triangle (v0,v1,v3) with flags=false, then (v2,v3,v1) with flags=true; returns none(valid0). */
+        template<typename Epilog>
+        __forceinline bool intersectK(const vbool<K>& valid0, 
+                                      RayK<K>& ray,
+                                      const Vec3vf<K>& v0,
+                                      const Vec3vf<K>& v1,
+                                      const Vec3vf<K>& v2,
+                                      const Vec3vf<K>& v3,
+                                      const GridMesh::Grid &g, 
+                                      const SubGrid &subgrid,
+                                      const unsigned int i,
+                                      const Epilog& epilog) const
+        {
+          intersectK(valid0,ray,v0,v1,v3,vbool<K>(false),g,subgrid,i,epilog);
+          if (none(valid0)) return true; // NOTE(review): valid0 is a const reference here, so this can only change if the epilog clears the same mask through an alias - confirm
+          intersectK(valid0,ray,v2,v3,v1,vbool<K>(true ),g,subgrid,i,epilog);
+          return none(valid0);
+        }
+        /* Tests M triangles against ray k of the packet; on success constructs the result in hit and returns true. */
+        static  __forceinline bool intersect1(RayK<K>& ray,
+                                              size_t k,
+                                              const Vec3vf<M>& tri_v0,
+                                              const Vec3vf<M>& tri_v1,
+                                              const Vec3vf<M>& tri_v2,
+                                              const Vec3vf<M>& tri_Ng,
+                                              const vbool<M>& flags,
+                                              SubGridQuadHitPlueckerM<M> &hit)
+        {
+          /* calculate vertices relative to ray origin */
+          const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k);
+          const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k);
+          const Vec3vf<M> v0 = tri_v0-O;
+          const Vec3vf<M> v1 = tri_v1-O;
+          const Vec3vf<M> v2 = tri_v2-O;
+          
+          /* calculate triangle edges */
+          const Vec3vf<M> e0 = v2-v0;
+          const Vec3vf<M> e1 = v0-v1;
+          const Vec3vf<M> e2 = v1-v2;
+          
+          /* perform edge tests */
+          const vfloat<M> U = dot(cross(e0,v2+v0),D);
+          const vfloat<M> V = dot(cross(e1,v0+v1),D);
+          const vfloat<M> W = dot(cross(e2,v1+v2),D);
+          const vfloat<M> UVW = U+V+W;
+          const vfloat<M> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+          vbool<M> valid = max(U,V,W) <= eps ;
+#else
+          vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps);
+#endif
+          if (unlikely(none(valid))) return false;
+          
+          /* calculate geometry normal and denominator */
+          const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+          const vfloat<M> den = twice(dot(Ng,D));
+
+          /* perform depth test */
+          const vfloat<M> T = twice(dot(v0,Ng));
+          const vfloat<M> t = rcp(den)*T;
+          valid &= vfloat<M>(ray.tnear()[k]) <= t & t <= vfloat<M>(ray.tfar[k]); // parses as (tnear <= t) & (t <= tfar)
+          if (unlikely(none(valid))) return false;
+          
+          /* avoid division by 0 */
+          valid &= den != vfloat<M>(zero);
+          if (unlikely(none(valid))) return false;
+          
+          /* update hit information */
+          new (&hit) SubGridQuadHitPlueckerM<M>(valid,U,V,UVW,t,tri_Ng,flags); // placement-new over the caller's hit object
+          return true;
+        }
+        /* Same test; the normal handed to the hit is derived from the vertices as cross(v2-v0,v0-v1). */
+        static __forceinline bool intersect1(RayK<K>& ray,
+                                             size_t k,
+                                             const Vec3vf<M>& v0,
+                                             const Vec3vf<M>& v1,
+                                             const Vec3vf<M>& v2,
+                                             const vbool<M>& flags,
+                                             SubGridQuadHitPlueckerM<M> &hit)
+        {
+          const Vec3vf<M> e1 = v0-v1;
+          const Vec3vf<M> e2 = v2-v0;
+          const Vec3vf<M> Ng = cross(e2,e1); // FIXME: optimize!!!
+          return intersect1(ray,k,v0,v1,v2,Ng,flags,hit);
+        }
+
+      };
+
+    template<int M, int K, bool filter>
+      struct SubGridQuadMIntersectorKPluecker : public SubGridQuadMIntersectorKPlueckerBase<M,K,filter>
+    {
+      __forceinline SubGridQuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray)
+        : SubGridQuadMIntersectorKPlueckerBase<M,K,filter>(valid,ray) {}
+      /* Intersects ray k with the M quads (v0..v3), each split into triangles (v0,v1,v3) and (v2,v3,v1); hits go to an Intersect1KEpilogMU. */
+      __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+        Intersect1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID());
+        /* the hit and base types follow M; the corner tables passed to interpolateUV are still written out for 4 lanes */
+        SubGridQuadHitPlueckerM<M> hit;
+        if (SubGridQuadMIntersectorKPlueckerBase<M,K,filter>::intersect1(ray,k,v0,v1,v3,vbool<M>(false),hit))
+        {
+          interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+          epilog(hit.valid,hit);
+        }
+
+        if (SubGridQuadMIntersectorKPlueckerBase<M,K,filter>::intersect1(ray,k,v2,v3,v1,vbool<M>(true),hit))
+        {
+          interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+          epilog(hit.valid,hit);
+        }
+
+      }
+      /* Occlusion variant: same two triangle tests, returns true as soon as the Occluded1KEpilogMU accepts a hit. */
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+        Occluded1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID());
+
+        SubGridQuadHitPlueckerM<M> hit;
+        if (SubGridQuadMIntersectorKPlueckerBase<M,K,filter>::intersect1(ray,k,v0,v1,v3,vbool<M>(false),hit))
+        {
+          interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+          if (epilog(hit.valid,hit)) return true;
+        }
+
+        if (SubGridQuadMIntersectorKPlueckerBase<M,K,filter>::intersect1(ray,k,v2,v3,v1,vbool<M>(true),hit))
+        {
+          interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+          if (epilog(hit.valid,hit)) return true;
+        }
+        return false;
+      }
+    };
+
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid_mb_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid_mb_intersector.h
new file mode 100644
index 0000000000..400a88b985
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subgrid_mb_intersector.h
@@ -0,0 +1,236 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subgrid_intersector.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N, bool filter>
+    struct SubGridMBIntersector1Pluecker
+    {
+      typedef SubGridMBQBVHN<N> Primitive;
+      typedef SubGridQuadMIntersector1Pluecker<4,filter> Precalculations;
+      /* Intersects one ray with one subgrid, using the vertices gathered for the ray's time. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        float ftime; // output of timeSegment() below
+        const int itime = mesh->timeSegment(ray.time(), ftime);
+        Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime,ftime);
+        pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid);
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        float ftime; // output of timeSegment() below
+        const int itime = mesh->timeSegment(ray.time(), ftime);
+
+        Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime,ftime);
+        return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid);
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, subgrid);
+      }
+      /* For each of the num primitives: intersects the ray with prim[i].qnode, then with the subgrid of every child set in the returned mask. */
+      template<int Nx, bool robust>
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<Nx> dist;
+          const float time = prim[i].adjustTime(ray.time());
+
+          assert(time <= 1.0f);
+          size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); 
+#if defined(__AVX__)
+          STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1);
+#endif
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask); // presumably extracts and clears the lowest set bit of mask - confirm against bscf
+            if (unlikely(dist[ID] > ray.tfar)) continue; // child ID lies beyond the current ray.tfar
+            intersect(pre,ray,context,prim[i].subgrid(ID));
+          }
+        }
+      }
+      /* Occlusion variant of the loop above: returns true at the first occluding subgrid; no dist/tfar rejection is done here. */
+      template<int Nx, bool robust>        
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+        for (size_t i=0;i<num;i++)
+        {
+          const float time = prim[i].adjustTime(ray.time());
+          assert(time <= 1.0f);
+          vfloat<Nx> dist;
+          size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); 
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask); 
+            if (occluded(pre,ray,context,prim[i].subgrid(ID)))
+              return true;
+          }
+        }
+        return false;
+      }
+      
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node)
+      {
+        assert(false && "not implemented"); // compiled out when NDEBUG is defined; the function then just returns false
+        return false;
+      }
+    };
+
+
+    template<int N, int K, bool filter>
+    struct SubGridMBIntersectorKPluecker
+    {
+      typedef SubGridMBQBVHN<N> Primitive;
+      typedef SubGridQuadMIntersectorKPluecker<4,K,filter> Precalculations;
+      /* Packet entry point: runs the single-ray intersect() below for every lane set in valid_i. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        size_t m_valid = movemask(valid_i);
+        while(m_valid)
+        {
+          size_t ID = bscf(m_valid);
+          intersect(pre,ray,ID,context,subgrid);
+        }
+      }
+      /* Packet occlusion: returns !valid0, so the result is set for occluded lanes and also for lanes that were inactive in valid_i. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        vbool<K> valid0 = valid_i;
+        size_t m_valid = movemask(valid_i);
+        while(m_valid)
+        {
+          size_t ID = bscf(m_valid);
+          if (occluded(pre,ray,ID,context,subgrid))
+            clear(valid0,ID);
+        }
+        return !valid0;
+      }
+      /* Single-ray (lane k) intersection with one subgrid, using the vertices gathered for that lane's time. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+ 
+        vfloat<K> ftime;
+        const vint<K> itime = mesh->timeSegment(ray.time(), ftime); // evaluated for all K lanes although only lane k is consumed below
+        Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime[k],ftime[k]);
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+      }
+
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        vfloat<K> ftime;
+        const vint<K> itime = mesh->timeSegment(ray.time(), ftime); // evaluated for all K lanes although only lane k is consumed below
+        Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime[k],ftime[k]);
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+      }
+
+        template<bool robust>
+          static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+          for (size_t j=0;j<num;j++)
+          {
+            size_t m_valid = movemask(prim[j].qnode.validMask());
+            const vfloat<K> time = prim[j].adjustTime(ray.time());
+
+            vfloat<K> dist;
+            while(m_valid)
+            {
+              const size_t i = bscf(m_valid);
+              if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,time,dist))) continue; // no active ray hits child i
+              intersect(valid,pre,ray,context,prim[j].subgrid(i)); // every lane of valid is tested, not only those that hit the box
+            }
+          }
+        }
+
+        template<bool robust>        
+        static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+
+          vbool<K> valid0 = valid;
+          for (size_t j=0;j<num;j++)
+          {
+            size_t m_valid = movemask(prim[j].qnode.validMask());
+            const vfloat<K> time = prim[j].adjustTime(ray.time());
+            vfloat<K> dist;
+            while(m_valid)
+            {
+              const size_t i = bscf(m_valid);
+              if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,time,dist))) continue;
+              valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i));
+              if (none(valid0)) break; // leaves only the inner loop; later prims are still visited and skipped by the none() test above
+            }
+          }
+          return !valid0;
+        }
+        
+        template<int Nx, bool robust>        
+          static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+          for (size_t i=0;i<num;i++)
+          {
+            vfloat<N> dist; // NOTE(review): SubGridMBIntersector1Pluecker declares this as vfloat<Nx> for the same isec1 call - confirm which width isec1.intersect expects
+            const float time = prim[i].adjustTime(ray.time()[k]);
+            assert(time <= 1.0f);
+
+            size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); 
+            while(mask != 0)
+            {
+              const size_t ID = bscf(mask); 
+              if (unlikely(dist[ID] > ray.tfar[k])) continue; // child ID lies beyond lane k's current tfar
+              intersect(pre,ray,k,context,prim[i].subgrid(ID));
+            }
+          }
+        }
+        
+        template<int Nx, bool robust>
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+          
+          for (size_t i=0;i<num;i++)
+          {
+            vfloat<N> dist; // NOTE(review): vfloat<N> here vs vfloat<Nx> in SubGridMBIntersector1Pluecker - confirm
+            const float time = prim[i].adjustTime(ray.time()[k]);
+            assert(time <= 1.0f);
+
+            size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); 
+            while(mask != 0)
+            {
+              const size_t ID = bscf(mask); 
+              if (occluded(pre,ray,k,context,prim[i].subgrid(ID)))
+                return true;
+            }
+          }
+          return false;
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle.h b/thirdparty/embree-aarch64/kernels/geometry/triangle.h
new file mode 100644
index 0000000000..0dedf6dc4c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/triangle.h
@@ -0,0 +1,162 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  /* Precalculated representation for M triangles. Stores for each
+     triangle a base vertex and two edges (no geometry normal is
+     stored), plus the geometry and primitive IDs */
+  template<int M>
+  struct TriangleM
+  {
+  public:
+    struct Type : public PrimitiveType 
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+    
+  public:
+
+    /* Returns maximum number of stored triangles */
+    static __forceinline size_t max_size() { return M; }
+    
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+  public:
+
+    /* Default constructor (leaves all members uninitialized) */
+    __forceinline TriangleM() {}
+
+    /* Construction from vertices and IDs; stores v0 and the edges v0-v1 and v2-v0 */
+    __forceinline TriangleM(const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const vuint<M>& geomIDs, const vuint<M>& primIDs)
+      : v0(v0), e1(v0-v1), e2(v2-v0), geomIDs(geomIDs), primIDs(primIDs) {}
+
+    /* Returns a mask that tells which triangles are valid (geomID != -1) */
+    __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); }
+
+    /* Returns true if the specified triangle is valid; compares against the unsigned invalid ID, matching valid() */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != (unsigned int)(-1); }
+    
+    /* Returns the number of stored triangles (position of the first invalid slot in the valid mask) */
+    __forceinline size_t size() const { return bsf(~movemask(valid()));  }
+
+    /* Returns the geometry IDs */
+    __forceinline       vuint<M>& geomID()       { return geomIDs;  }
+    __forceinline const vuint<M>& geomID() const { return geomIDs;  }
+    __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+    /* Returns the primitive IDs */
+    __forceinline       vuint<M>& primID()       { return primIDs; }
+    __forceinline const vuint<M>& primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* Calculate the bounds of the triangle */
+    __forceinline BBox3fa bounds() const 
+    {
+      Vec3vf<M> p0 = v0;
+      Vec3vf<M> p1 = v0-e1; // e1 = v0-v1, so this recovers v1
+      Vec3vf<M> p2 = v0+e2; // e2 = v2-v0, so this recovers v2
+      Vec3vf<M> lower = min(p0,p1,p2);
+      Vec3vf<M> upper = max(p0,p1,p2);
+      vbool<M> mask = valid();
+      lower.x = select(mask,lower.x,vfloat<M>(pos_inf)); // invalid lanes become +inf/-inf so they cannot win the reductions below
+      lower.y = select(mask,lower.y,vfloat<M>(pos_inf));
+      lower.z = select(mask,lower.z,vfloat<M>(pos_inf));
+      upper.x = select(mask,upper.x,vfloat<M>(neg_inf));
+      upper.y = select(mask,upper.y,vfloat<M>(neg_inf));
+      upper.z = select(mask,upper.z,vfloat<M>(neg_inf));
+      return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)),
+                     Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z)));
+    }
+
+    /* Non temporal store */
+    __forceinline static void store_nt(TriangleM* dst, const TriangleM& src)
+    {
+      vfloat<M>::store_nt(&dst->v0.x,src.v0.x);
+      vfloat<M>::store_nt(&dst->v0.y,src.v0.y);
+      vfloat<M>::store_nt(&dst->v0.z,src.v0.z);
+      vfloat<M>::store_nt(&dst->e1.x,src.e1.x);
+      vfloat<M>::store_nt(&dst->e1.y,src.e1.y);
+      vfloat<M>::store_nt(&dst->e1.z,src.e1.z);
+      vfloat<M>::store_nt(&dst->e2.x,src.e2.x);
+      vfloat<M>::store_nt(&dst->e2.y,src.e2.y);
+      vfloat<M>::store_nt(&dst->e2.z,src.e2.z);
+      vuint<M>::store_nt(&dst->geomIDs,src.geomIDs);
+      vuint<M>::store_nt(&dst->primIDs,src.primIDs);
+    }
+
+    /* Fill triangle from triangle list; consumes up to M entries of prims[begin,end) and advances begin */
+    __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      vuint<M> vgeomID = -1, vprimID = -1; // slots that are not filled keep ID -1 and zero vertices
+      Vec3vf<M> v0 = zero, v1 = zero, v2 = zero;
+      
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+        const PrimRef& prim = prims[begin];
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const TriangleMesh* __restrict__ const mesh = scene->get<TriangleMesh>(geomID);
+        const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+        const Vec3fa& p0 = mesh->vertex(tri.v[0]);
+        const Vec3fa& p1 = mesh->vertex(tri.v[1]);
+        const Vec3fa& p2 = mesh->vertex(tri.v[2]);
+        vgeomID [i] = geomID;
+        vprimID [i] = primID;
+        v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+        v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+        v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+      }
+      TriangleM::store_nt(this,TriangleM(v0,v1,v2,vgeomID,vprimID));
+    }
+
+    /* Updates the primitive: re-reads the vertices of the stored triangles from mesh and returns their merged bounds */
+    __forceinline BBox3fa update(TriangleMesh* mesh)
+    {
+      BBox3fa bounds = empty;
+      vuint<M> vgeomID = -1, vprimID = -1;
+      Vec3vf<M> v0 = zero, v1 = zero, v2 = zero;
+
+      for (size_t i=0; i<M; i++)
+      {
+        const unsigned geomId = geomID(i);
+        if (unlikely(geomId == (unsigned)(-1))) break; // first invalid slot ends the block
+        const unsigned primId = primID(i);
+        const TriangleMesh::Triangle& tri = mesh->triangle(primId);
+        const Vec3fa p0 = mesh->vertex(tri.v[0]);
+        const Vec3fa p1 = mesh->vertex(tri.v[1]);
+        const Vec3fa p2 = mesh->vertex(tri.v[2]);
+        bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2)));
+        vgeomID [i] = geomId;
+        vprimID [i] = primId;
+        v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+        v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+        v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+      }
+      TriangleM::store_nt(this,TriangleM(v0,v1,v2,vgeomID,vprimID));
+      return bounds;
+    }
+
+  public:
+    Vec3vf<M> v0;      // base vertex of the triangles
+    Vec3vf<M> e1;      // 1st edge of the triangles (v0-v1)
+    Vec3vf<M> e2;      // 2nd edge of the triangles (v2-v0)
+  private:
+    vuint<M> geomIDs; // geometry IDs
+    vuint<M> primIDs; // primitive IDs
+  };
+
+  template<int M>
+  typename TriangleM<M>::Type TriangleM<M>::type; // out-of-class definition of the static member `type` declared in TriangleM
+
+  typedef TriangleM<4> Triangle4; // TriangleM instantiated for M = 4
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector.h
new file mode 100644
index 0000000000..125a42c5fe
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector.h
@@ -0,0 +1,96 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "triangle_intersector_moeller.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Intersects M triangles with 1 ray */
+    template<int M, int Mx, bool filter>
+    struct TriangleMIntersector1Moeller
+    {
+      typedef TriangleM<M> Primitive;
+      typedef MoellerTrumboreIntersector1<Mx> Precalculations;
+
+      /*! Intersect a ray with the M triangles and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleM<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersectEdge(ray,tri.v0,tri.e1,tri.e2,Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID())); // passes the stored base vertex and edges; the epilog is a temporary
+      }
+
+      /*! Test if the ray is occluded by one of M triangles. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleM<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.intersectEdge(ray,tri.v0,tri.e1,tri.e2,Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID())); // same test as intersect(); only the epilog type differs
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      { // forwards unchanged to PrimitivePointQuery1<Primitive>
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+      
+    };
+
+    /*! Intersects M triangles with K rays. */
+    template<int M, int Mx, int K, bool filter>
+    struct TriangleMIntersectorKMoeller
+    {
+      typedef TriangleM<M> Primitive;
+      typedef MoellerTrumboreIntersectorK<Mx,K> Precalculations;
+
+      /*! Intersects K rays with M triangles. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleM<M>& tri)
+      {
+        STAT_USER(0,TriangleM<M>::max_size());
+        for (size_t i=0; i<TriangleM<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break; // the first invalid slot ends the loop
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> p0 = broadcast<vfloat<K>>(tri.v0,i); // triangle i replicated across the K ray lanes
+          const Vec3vf<K> e1 = broadcast<vfloat<K>>(tri.e1,i);
+          const Vec3vf<K> e2 = broadcast<vfloat<K>>(tri.e2,i);
+          pre.intersectEdgeK(valid_i,ray,p0,e1,e2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M triangles. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleM<M>& tri)
+      {
+        vbool<K> valid0 = valid_i;
+
+        for (size_t i=0; i<TriangleM<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break; // the first invalid slot ends the loop
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          const Vec3vf<K> p0 = broadcast<vfloat<K>>(tri.v0,i);
+          const Vec3vf<K> e1 = broadcast<vfloat<K>>(tri.e1,i);
+          const Vec3vf<K> e2 = broadcast<vfloat<K>>(tri.e2,i);
+          pre.intersectEdgeK(valid0,ray,p0,e1,e2,OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i)); // the epilog is constructed with valid0 itself; presumably it clears occluded lanes there - confirm
+          if (none(valid0)) break;
+        }
+        return !valid0; // set for lanes no longer in valid0, which includes lanes that were inactive in valid_i
+      }
+      
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleM<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersectEdge(ray,k,tri.v0,tri.e1,tri.e2,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID())); // k selects the ray of the packet
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleM<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.intersectEdge(ray,k,tri.v0,tri.e1,tri.e2,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID())); // k selects the ray of the packet
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_moeller.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_moeller.h
new file mode 100644
index 0000000000..b5a8519236
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_moeller.h
@@ -0,0 +1,403 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "intersector_epilog.h"
+
+/*! This intersector implements a modified version of the Moeller
+ *  Trumbore intersector from the paper "Fast, Minimum Storage
+ *  Ray-Triangle Intersection". In contrast to the paper we
+ *  precalculate some factors and factor the calculations differently
+ *  to allow precalculating the cross product e1 x e2. The resulting
+ *  algorithm is similar to the fastest one of the paper "Optimizing
+ *  Ray-Triangle Intersection via Automated Search". */
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+    struct MoellerTrumboreHitM // hit record for M triangles; keeps the unscaled U, V, T and absDen until finalize() is called
+    {
+      __forceinline MoellerTrumboreHitM() {}
+      /*! Stores the raw test results and the normal; vu, vv and vt are not set here, only by finalize(). */
+      __forceinline MoellerTrumboreHitM(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& T, const vfloat<M>& absDen, const Vec3vf<M>& Ng)
+        : U(U), V(V), T(T), absDen(absDen), valid(valid), vNg(Ng) {}
+      /*! Scales U, V and T by the reciprocal of absDen and stores the results in vu, vv and vt. */
+      __forceinline void finalize() 
+      {
+        const vfloat<M> invAbsDen = rcp(absDen);
+        vu = U * invAbsDen;
+        vv = V * invAbsDen;
+        vt = T * invAbsDen;
+      }
+      /*! Per-lane accessors; uv() and t() read the values written by finalize(). */
+      __forceinline Vec2f  uv(const size_t lane) const { return Vec2f(vu[lane],vv[lane]); }
+      __forceinline float  t (const size_t lane) const { return vt[lane]; }
+      __forceinline Vec3fa Ng(const size_t lane) const { return Vec3fa(vNg.x[lane],vNg.y[lane],vNg.z[lane]); }
+      
+    public:
+      vfloat<M> U;      // raw values as passed to the constructor
+      vfloat<M> V;
+      vfloat<M> T;
+      vfloat<M> absDen;
+      
+    public:
+      vbool<M> valid;   // lane mask passed to the constructor
+      vfloat<M> vu;     // written by finalize()
+      vfloat<M> vv;
+      vfloat<M> vt;
+      Vec3vf<M> vNg;    // normal passed to the constructor
+    };
+    
+    template<int M>
+    struct MoellerTrumboreIntersector1 // Moeller-Trumbore test of a single ray against M triangles
+    {
+      __forceinline MoellerTrumboreIntersector1() {}
+
+      __forceinline MoellerTrumboreIntersector1(const Ray& ray, const void* ptr) {}
+      /*! Core test on v0, both edges and the normal for the lanes in valid0; on success constructs the result in hit and returns true. */
+      __forceinline bool intersect(const vbool<M>& valid0,
+                                   Ray& ray,
+                                   const Vec3vf<M>& tri_v0,
+                                   const Vec3vf<M>& tri_e1,
+                                   const Vec3vf<M>& tri_e2,
+                                   const Vec3vf<M>& tri_Ng,
+                                   MoellerTrumboreHitM<M>& hit) const
+      {
+        /* calculate denominator */
+        vbool<M> valid = valid0;
+        const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org);
+        const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir);
+        const Vec3vf<M> C = Vec3vf<M>(tri_v0) - O;
+        const Vec3vf<M> R = cross(C,D);
+        const vfloat<M> den = dot(Vec3vf<M>(tri_Ng),D);
+
+        const vfloat<M> absDen = abs(den);
+        const vfloat<M> sgnDen = signmsk(den);
+        
+        /* perform edge tests; U and V are xor-ed with the sign mask of den so they can be compared against absDen */
+        const vfloat<M> U = dot(R,Vec3vf<M>(tri_e2)) ^ sgnDen;
+        const vfloat<M> V = dot(R,Vec3vf<M>(tri_e1)) ^ sgnDen;
+        
+        /* combine the edge tests with backface culling (den < 0) or, without culling, rejection of den == 0 */
+#if defined(EMBREE_BACKFACE_CULLING)
+        valid &= (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#else
+        valid &= (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#endif
+        if (likely(none(valid))) return false;
+
+        /* perform depth test on the unscaled T, i.e. against tnear and tfar multiplied by absDen */
+        const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+        valid &= (absDen*vfloat<M>(ray.tnear()) < T) & (T <= absDen*vfloat<M>(ray.tfar));
+        if (likely(none(valid))) return false;
+   
+        
+        /* update hit information; the record is constructed in place over the caller's object */
+        new (&hit) MoellerTrumboreHitM<M>(valid,U,V,T,absDen,tri_Ng);
+
+        return true;
+      }
+      /*! Unmasked test on v0 and both edges; the normal is computed as cross(e2,e1). */
+      __forceinline bool intersectEdge(Ray& ray,
+                                       const Vec3vf<M>& tri_v0,
+                                       const Vec3vf<M>& tri_e1,
+                                       const Vec3vf<M>& tri_e2,
+                                       MoellerTrumboreHitM<M>& hit) const
+      {
+        vbool<M> valid = true;
+        const Vec3<vfloat<M>> tri_Ng = cross(tri_e2,tri_e1);
+        return intersect(valid,ray,tri_v0,tri_e1,tri_e2,tri_Ng,hit);
+      }
+      /*! Unmasked test on three vertices; derives e1 = v0-v1 and e2 = v2-v0. */
+      __forceinline bool intersect(Ray& ray,
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   MoellerTrumboreHitM<M>& hit) const
+      {
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v2-v0;
+        return intersectEdge(ray,v0,e1,e2,hit);
+      }
+      /*! Masked test on three vertices; forwards to the core test because no masked intersectEdge overload exists. */
+      __forceinline bool intersect(const vbool<M>& valid,
+                                   Ray& ray,
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   MoellerTrumboreHitM<M>& hit) const
+      {
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v2-v0;
+        return intersect(valid,ray,v0,e1,e2,cross(e2,e1),hit); // same normal as intersectEdge: cross(e2,e1)
+      }
+      /*! Edge-based test; on a hit the epilog is invoked with the hit mask and the hit record. */
+      template<typename Epilog>
+      __forceinline bool intersectEdge(Ray& ray,
+                                       const Vec3vf<M>& v0,
+                                       const Vec3vf<M>& e1,
+                                       const Vec3vf<M>& e2,
+                                       const Epilog& epilog) const
+      {
+        MoellerTrumboreHitM<M> hit;
+        if (likely(intersectEdge(ray,v0,e1,e2,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+      /*! Vertex-based test; on a hit the epilog is invoked with the hit mask and the hit record. */
+      template<typename Epilog>
+        __forceinline bool intersect(Ray& ray,
+                                     const Vec3vf<M>& v0,
+                                     const Vec3vf<M>& v1,
+                                     const Vec3vf<M>& v2,
+                                     const Epilog& epilog) const
+      {
+        MoellerTrumboreHitM<M> hit;
+        if (likely(intersect(ray,v0,v1,v2,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+      /*! Masked vertex-based test; on a hit the epilog is invoked with the hit mask and the hit record. */
+      template<typename Epilog>
+      __forceinline bool intersect(const vbool<M>& valid,
+                                   Ray& ray,
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   const Epilog& epilog) const
+      {
+        MoellerTrumboreHitM<M> hit;
+        if (likely(intersect(valid,ray,v0,v1,v2,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+    };
+    
+    template<int K>
+    struct MoellerTrumboreHitK // deferred hit evaluation for a packet of K rays
+    {
+      __forceinline MoellerTrumboreHitK(const vfloat<K>& U, const vfloat<K>& V, const vfloat<K>& T, const vfloat<K>& absDen, const Vec3vf<K>& Ng)
+        : U(U), V(V), T(T), absDen(absDen), Ng(Ng) {}
+      /*! Returns the tuple (u, v, t, Ng), where u, v and t are U, V and T multiplied by rcp(absDen). */
+      __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+      {
+        const vfloat<K> invAbsDen = rcp(absDen);
+        const vfloat<K> u = U * invAbsDen;
+        const vfloat<K> v = V * invAbsDen;
+        const vfloat<K> t = T * invAbsDen;
+        return std::make_tuple(u,v,t,Ng);
+      }
+      
+    private:
+      const vfloat<K> U;      // raw values, fixed at construction
+      const vfloat<K> V;
+      const vfloat<K> T;
+      const vfloat<K> absDen;
+      const Vec3vf<K> Ng;
+    };
+    
+    template<int M, int K>
+    struct MoellerTrumboreIntersectorK // Moeller-Trumbore tests for ray packets of size K and blocks of M triangles
+    {
+      __forceinline MoellerTrumboreIntersectorK(const vbool<K>& valid, const RayK<K>& ray) {}
+      
+      /*! Intersects K rays with one of M triangles, given as v0, both edges and the normal; surviving lanes go to the epilog. */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                        //RayK<K>& ray,
+                                        const Vec3vf<K>& ray_org,
+                                        const Vec3vf<K>& ray_dir,
+                                        const vfloat<K>& ray_tnear,
+                                        const vfloat<K>& ray_tfar,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_e1,
+                                        const Vec3vf<K>& tri_e2,
+                                        const Vec3vf<K>& tri_Ng,
+                                        const Epilog& epilog) const
+      { 
+        /* calculate denominator */
+        vbool<K> valid = valid0;
+        const Vec3vf<K> C = tri_v0 - ray_org;
+        const Vec3vf<K> R = cross(C,ray_dir);
+        const vfloat<K> den = dot(tri_Ng,ray_dir);
+        const vfloat<K> absDen = abs(den);
+        const vfloat<K> sgnDen = signmsk(den);
+        
+        /* test against edge p2 p0 */
+        const vfloat<K> U = dot(tri_e2,R) ^ sgnDen;
+        valid &= U >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* test against edge p0 p1 */
+        const vfloat<K> V = dot(tri_e1,R) ^ sgnDen;
+        valid &= V >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* test against edge p1 p2 */
+        const vfloat<K> W = absDen-U-V;
+        valid &= W >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* perform depth test on the unscaled T, i.e. against tnear and tfar multiplied by absDen */
+        const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen;
+        valid &= (absDen*ray_tnear < T) & (T <= absDen*ray_tfar);
+        if (unlikely(none(valid))) return false;
+        
+        /* perform backface culling (den < 0) or, without culling, reject den == 0 */
+#if defined(EMBREE_BACKFACE_CULLING)
+        valid &= den < vfloat<K>(zero);
+        if (unlikely(none(valid))) return false;
+#else
+        valid &= den != vfloat<K>(zero);
+        if (unlikely(none(valid))) return false;
+#endif
+        
+        /* calculate hit information; the division by absDen is deferred to MoellerTrumboreHitK::operator() */
+        MoellerTrumboreHitK<K> hit(U,V,T,absDen,tri_Ng);
+        return epilog(valid,hit);
+      }
+      
+      /*! Intersects K rays with one of M triangles given by its vertices; derives e1 = v0-v1, e2 = v2-v0 and Ng = cross(e2,e1). */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0, 
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_v1,
+                                        const Vec3vf<K>& tri_v2,
+                                        const Epilog& epilog) const
+      {
+        const Vec3vf<K> e1 = tri_v0-tri_v1;
+        const Vec3vf<K> e2 = tri_v2-tri_v0;
+        const Vec3vf<K> Ng = cross(e2,e1);
+        return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,epilog);
+      }
+
+      /*! Intersects K rays with one of M triangles given by v0 and both edges; derives Ng = cross(e2,e1). */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectEdgeK(const vbool<K>& valid0, 
+                                            RayK<K>& ray,
+                                            const Vec3vf<K>& tri_v0, 
+                                            const Vec3vf<K>& tri_e1, 
+                                            const Vec3vf<K>& tri_e2, 
+                                            const Epilog& epilog) const
+      {
+        const Vec3vf<K> tri_Ng = cross(tri_e2,tri_e1);
+        return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,tri_e1,tri_e2,tri_Ng,epilog);
+      }
+      
+      /*! Intersect k'th ray from ray packet of size K with M triangles; on success constructs the result in hit and returns true. */
+      __forceinline bool intersectEdge(RayK<K>& ray,
+                                       size_t k,
+                                       const Vec3vf<M>& tri_v0,
+                                       const Vec3vf<M>& tri_e1,
+                                       const Vec3vf<M>& tri_e2,
+                                       MoellerTrumboreHitM<M>& hit) const
+      {
+        /* calculate denominator */
+        typedef Vec3vf<M> Vec3vfM;
+        const Vec3vf<M> tri_Ng = cross(tri_e2,tri_e1);
+
+        const Vec3vfM O = broadcast<vfloat<M>>(ray.org,k);
+        const Vec3vfM D = broadcast<vfloat<M>>(ray.dir,k);
+        const Vec3vfM C = Vec3vfM(tri_v0) - O;
+        const Vec3vfM R = cross(C,D);
+        const vfloat<M> den = dot(Vec3vfM(tri_Ng),D);
+        const vfloat<M> absDen = abs(den);
+        const vfloat<M> sgnDen = signmsk(den);
+        
+        /* perform edge tests; U and V are xor-ed with the sign mask of den so they can be compared against absDen */
+        const vfloat<M> U = dot(Vec3vf<M>(tri_e2),R) ^ sgnDen;
+        const vfloat<M> V = dot(Vec3vf<M>(tri_e1),R) ^ sgnDen;
+        
+        /* combine the edge tests with backface culling (den < 0) or, without culling, rejection of den == 0 */
+#if defined(EMBREE_BACKFACE_CULLING)
+        vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#else
+        vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#endif
+        if (likely(none(valid))) return false;
+        
+        /* perform depth test against the k'th ray's tnear and tfar */
+        const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+        valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k]));
+        if (likely(none(valid))) return false;
+        
+        /* calculate hit information; the record is constructed in place over the caller's object */
+        new (&hit) MoellerTrumboreHitM<M>(valid,U,V,T,absDen,tri_Ng);
+        return true;
+      }
+      /*! As the edge test above, then keeps only lanes with time_range.lower <= ray.time[k] < time_range.upper. */
+      __forceinline bool intersectEdge(RayK<K>& ray,
+                                       size_t k,
+                                       const BBox<vfloat<M>>& time_range,
+                                       const Vec3vf<M>& tri_v0, 
+                                       const Vec3vf<M>& tri_e1, 
+                                       const Vec3vf<M>& tri_e2, 
+                                       MoellerTrumboreHitM<M>& hit) const
+      {
+        if (likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,hit))) // arguments are edges plus a hit record, so the edge overload is the one to call
+        {
+          hit.valid &= time_range.lower <= vfloat<M>(ray.time[k]); // NOTE(review): time is indexed as a data member while tnear() is an accessor; confirm against RayK
+          hit.valid &= vfloat<M>(ray.time[k]) < time_range.upper;
+          return any(hit.valid);
+        }
+        return false;
+      }
+      /*! Edge-based test for the k'th ray; on a hit the epilog is invoked with the hit mask and the hit record. */
+      template<typename Epilog>
+      __forceinline bool intersectEdge(RayK<K>& ray,
+                                       size_t k,
+                                       const Vec3vf<M>& tri_v0, 
+                                       const Vec3vf<M>& tri_e1, 
+                                       const Vec3vf<M>& tri_e2, 
+                                       const Epilog& epilog) const
+      {
+        MoellerTrumboreHitM<M> hit;
+        if (likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+      /*! Time-range variant of the edge-based test for the k'th ray; on a hit the epilog is invoked. */
+      template<typename Epilog>
+      __forceinline bool intersectEdge(RayK<K>& ray,
+                                       size_t k,                           
+                                       const BBox<vfloat<M>>& time_range,
+                                       const Vec3vf<M>& tri_v0, 
+                                       const Vec3vf<M>& tri_e1, 
+                                       const Vec3vf<M>& tri_e2, 
+                                       const Epilog& epilog) const
+      {
+        MoellerTrumboreHitM<M> hit;
+        if (likely(intersectEdge(ray,k,time_range,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+      /*! Vertex-based test for the k'th ray; derives e1 = v0-v1 and e2 = v2-v0. */
+      template<typename Epilog>
+      __forceinline bool intersect(RayK<K>& ray,
+                                   size_t k,
+                                   const Vec3vf<M>& v0, 
+                                   const Vec3vf<M>& v1, 
+                                   const Vec3vf<M>& v2, 
+                                   const Epilog& epilog) const      
+      {
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v2-v0;
+        return intersectEdge(ray,k,v0,e1,e2,epilog);
+      }
+      /*! Time-range variant of the vertex-based test for the k'th ray; derives e1 = v0-v1 and e2 = v2-v0. */
+      template<typename Epilog>
+      __forceinline bool intersect(RayK<K>& ray,
+                                   size_t k,
+                                   const BBox<vfloat<M>>& time_range,
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   const Epilog& epilog) const
+      {
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v2-v0;
+        return intersectEdge(ray,k,time_range,v0,e1,e2,epilog);
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_pluecker.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_pluecker.h
new file mode 100644
index 0000000000..f1de99d208
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_pluecker.h
@@ -0,0 +1,247 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "trianglev.h"
+#include "trianglev_mb.h"
+#include "intersector_epilog.h"
+
+/*! Modified Pluecker ray/triangle intersector. The test first shifts
+ *  the ray origin into the origin of the coordinate system and then
+ *  uses Pluecker coordinates for the intersection. Due to the shift,
+ *  the Pluecker coordinate calculation simplifies and the tests become
+ *  numerically stable. The edge equations are watertight along the
+ *  edge for neighboring triangles. */
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M, typename UVMapper>
+    struct PlueckerHitM // hit record of the Pluecker test for M triangles; u and v are produced by finalize()
+    {
+      __forceinline PlueckerHitM(const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& UVW, const vfloat<M>& t, const Vec3vf<M>& Ng, const UVMapper& mapUV)
+        : U(U), V(V), UVW(UVW), mapUV(mapUV), vt(t), vNg(Ng) {}
+      /*! Divides U and V by UVW (factor 0 where |UVW| < min_rcp_input) and passes the results through mapUV. */
+      __forceinline void finalize() 
+      {
+        const vbool<M> tooSmall = abs(UVW) < min_rcp_input;
+        const vfloat<M> invUVW = select(tooSmall,vfloat<M>(0.0f),rcp(UVW));
+        vu = U * invUVW;
+        vv = V * invUVW;
+        mapUV(vu,vv);
+      }
+      /*! Per-lane accessors; uv() reads the values written by finalize(). */
+      __forceinline Vec2f  uv(const size_t lane) const { return Vec2f(vu[lane],vv[lane]); }
+      __forceinline float  t (const size_t lane) const { return vt[lane]; }
+      __forceinline Vec3fa Ng(const size_t lane) const { return Vec3fa(vNg.x[lane],vNg.y[lane],vNg.z[lane]); }
+      
+    private:
+      const vfloat<M> U;      // raw values, fixed at construction
+      const vfloat<M> V;
+      const vfloat<M> UVW;
+      const UVMapper& mapUV;  // held by reference, not copied
+      
+    public:
+      vfloat<M> vu;           // written by finalize()
+      vfloat<M> vv;
+      vfloat<M> vt;           // set from t in the constructor
+      Vec3vf<M> vNg;          // set from Ng in the constructor
+    };
+
+    template<int M>
+    struct PlueckerIntersector1 // Pluecker-coordinate test of a single ray against M triangles
+    {
+      __forceinline PlueckerIntersector1() {}
+
+      __forceinline PlueckerIntersector1(const Ray& ray, const void* ptr) {}
+      /*! Tests the ray against M triangles given by their vertices; surviving lanes are handed to epilog as a PlueckerHitM. */
+      template<typename UVMapper, typename Epilog>
+      __forceinline bool intersect(Ray& ray,
+                                   const Vec3vf<M>& tri_v0,
+                                   const Vec3vf<M>& tri_v1,
+                                   const Vec3vf<M>& tri_v2,
+                                   const UVMapper& mapUV,
+                                   const Epilog& epilog) const
+      {
+        /* express the vertices relative to the ray origin */
+        const Vec3vf<M> rayOrg = Vec3vf<M>((Vec3fa)ray.org);
+        const Vec3vf<M> rayDir = Vec3vf<M>((Vec3fa)ray.dir);
+        const Vec3vf<M> p0 = tri_v0-rayOrg;
+        const Vec3vf<M> p1 = tri_v1-rayOrg;
+        const Vec3vf<M> p2 = tri_v2-rayOrg;
+
+        /* edges of the shifted triangle */
+        const Vec3vf<M> edge0 = p2-p0;
+        const Vec3vf<M> edge1 = p0-p1;
+        const Vec3vf<M> edge2 = p1-p2;
+
+        /* edge tests: U, V, W must all be <= tolerance, or (without backface culling) all >= -tolerance */
+        const vfloat<M> U = dot(cross(edge0,p2+p0),rayDir);
+        const vfloat<M> V = dot(cross(edge1,p0+p1),rayDir);
+        const vfloat<M> W = dot(cross(edge2,p1+p2),rayDir);
+        const vfloat<M> UVW = U+V+W;
+        const vfloat<M> tolerance = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+        vbool<M> valid = max(U,V,W) <= tolerance;
+#else
+        vbool<M> valid = (min(U,V,W) >= -tolerance) | (max(U,V,W) <= tolerance);
+#endif
+        if (unlikely(none(valid))) return false;
+
+        /* geometry normal and the denominator of the hit distance */
+        const Vec3vf<M> geomNormal = stable_triangle_normal(edge0,edge1,edge2);
+        const vfloat<M> denom = twice(dot(geomNormal,rayDir));
+        
+        /* depth test against [tnear,tfar]; lanes with a zero denominator are rejected as well */
+        const vfloat<M> numer = twice(dot(p0,geomNormal));
+        const vfloat<M> dist = rcp(denom)*numer;
+        valid &= (vfloat<M>(ray.tnear()) <= dist) & (dist <= vfloat<M>(ray.tfar));
+        valid &= denom != vfloat<M>(zero);
+        if (unlikely(none(valid))) return false;
+
+        /* build the hit record and let the epilog process it */
+        PlueckerHitM<M,UVMapper> result(U,V,UVW,dist,geomNormal,mapUV);
+        return epilog(valid,result);
+      }
+    };
+
+    template<int K, typename UVMapper>
+    struct PlueckerHitK // deferred hit evaluation of the Pluecker test for a packet of K rays
+    {
+      __forceinline PlueckerHitK(const vfloat<K>& U, const vfloat<K>& V, const vfloat<K>& UVW, const vfloat<K>& t, const Vec3vf<K>& Ng, const UVMapper& mapUV)
+        : U(U), V(V), UVW(UVW), t(t), Ng(Ng), mapUV(mapUV) {}
+      /*! Returns (u, v, t, Ng): U and V divided by UVW (factor 0 where |UVW| < min_rcp_input) and passed through mapUV. */
+      __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+      {
+        const vbool<K> tooSmall = abs(UVW) < min_rcp_input;
+        const vfloat<K> invUVW = select(tooSmall,vfloat<K>(0.0f),rcp(UVW));
+        vfloat<K> mappedU = U * invUVW;
+        vfloat<K> mappedV = V * invUVW;
+        mapUV(mappedU,mappedV);
+        return std::make_tuple(mappedU,mappedV,t,Ng);
+      }
+      
+    private:
+      const vfloat<K> U;      // raw values, fixed at construction
+      const vfloat<K> V;
+      const vfloat<K> UVW;
+      const vfloat<K> t;
+      const Vec3vf<K> Ng;
+      const UVMapper& mapUV;  // held by reference, not copied
+    };
+    
+    template<int M, int K>
+    struct PlueckerIntersectorK // Pluecker-coordinate tests for ray packets of size K
+    {
+      __forceinline PlueckerIntersectorK(const vbool<K>& valid, const RayK<K>& ray) {}
+
+      /*! Intersects K rays with one of M triangles; surviving lanes are handed to epilog as a PlueckerHitK. */
+      template<typename UVMapper, typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_v1,
+                                        const Vec3vf<K>& tri_v2,
+                                        const UVMapper& mapUV,
+                                        const Epilog& epilog) const
+      {
+        /* express the vertices relative to the ray origins */
+        vbool<K> lanes = valid0;
+        const Vec3vf<K> rayOrg = ray.org;
+        const Vec3vf<K> rayDir = ray.dir;
+        const Vec3vf<K> p0 = tri_v0-rayOrg;
+        const Vec3vf<K> p1 = tri_v1-rayOrg;
+        const Vec3vf<K> p2 = tri_v2-rayOrg;
+
+        /* edges of the shifted triangle */
+        const Vec3vf<K> edge0 = p2-p0;
+        const Vec3vf<K> edge1 = p0-p1;
+        const Vec3vf<K> edge2 = p1-p2;
+
+        /* edge tests: U, V, W must all be <= tolerance, or (without backface culling) all >= -tolerance */
+        const vfloat<K> U = dot(Vec3vf<K>(cross(edge0,p2+p0)),rayDir);
+        const vfloat<K> V = dot(Vec3vf<K>(cross(edge1,p0+p1)),rayDir);
+        const vfloat<K> W = dot(Vec3vf<K>(cross(edge2,p1+p2)),rayDir);
+        const vfloat<K> UVW = U+V+W;
+        const vfloat<K> tolerance = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+        lanes &= max(U,V,W) <= tolerance;
+#else
+        lanes &= (min(U,V,W) >= -tolerance) | (max(U,V,W) <= tolerance);
+#endif
+        if (unlikely(none(lanes))) return false;
+
+        /* geometry normal and the denominator of the hit distance */
+        const Vec3vf<K> geomNormal = stable_triangle_normal(edge0,edge1,edge2);
+        const vfloat<K> denom = twice(dot(Vec3vf<K>(geomNormal),rayDir));
+
+        /* depth test against [tnear,tfar]; lanes with a zero denominator are rejected as well */
+        const vfloat<K> numer = twice(dot(p0,Vec3vf<K>(geomNormal)));
+        const vfloat<K> dist = rcp(denom)*numer;
+        lanes &= (ray.tnear() <= dist) & (dist <= ray.tfar);
+        lanes &= denom != vfloat<K>(zero);
+        if (unlikely(none(lanes))) return false;
+        
+        /* build the hit record and let the epilog process it */
+        PlueckerHitK<K,UVMapper> result(U,V,UVW,dist,geomNormal,mapUV);
+        return epilog(lanes,result);
+      }
+
+      /*! Intersect k'th ray from ray packet of size K with M triangles; surviving lanes are handed to epilog as a PlueckerHitM. */
+      template<typename UVMapper, typename Epilog>
+      __forceinline bool intersect(RayK<K>& ray, size_t k,
+                                   const Vec3vf<M>& tri_v0,
+                                   const Vec3vf<M>& tri_v1,
+                                   const Vec3vf<M>& tri_v2,
+                                   const UVMapper& mapUV,
+                                   const Epilog& epilog) const
+      {
+        /* express the vertices relative to the origin of the k'th ray */
+        const Vec3vf<M> rayOrg = broadcast<vfloat<M>>(ray.org,k);
+        const Vec3vf<M> rayDir = broadcast<vfloat<M>>(ray.dir,k);
+        const Vec3vf<M> p0 = tri_v0-rayOrg;
+        const Vec3vf<M> p1 = tri_v1-rayOrg;
+        const Vec3vf<M> p2 = tri_v2-rayOrg;
+
+        /* edges of the shifted triangles */
+        const Vec3vf<M> edge0 = p2-p0;
+        const Vec3vf<M> edge1 = p0-p1;
+        const Vec3vf<M> edge2 = p1-p2;
+
+        /* edge tests: U, V, W must all be <= tolerance, or (without backface culling) all >= -tolerance */
+        const vfloat<M> U = dot(cross(edge0,p2+p0),rayDir);
+        const vfloat<M> V = dot(cross(edge1,p0+p1),rayDir);
+        const vfloat<M> W = dot(cross(edge2,p1+p2),rayDir);
+        const vfloat<M> UVW = U+V+W;
+        const vfloat<M> tolerance = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+        vbool<M> lanes = max(U,V,W) <= tolerance;
+#else
+        vbool<M> lanes = (min(U,V,W) >= -tolerance) | (max(U,V,W) <= tolerance);
+#endif
+        if (unlikely(none(lanes))) return false;
+
+        /* geometry normal and the denominator of the hit distance */
+        const Vec3vf<M> geomNormal = stable_triangle_normal(edge0,edge1,edge2);
+        const vfloat<M> denom = twice(dot(geomNormal,rayDir));
+        
+        /* depth test against the k'th ray's [tnear,tfar] */
+        const vfloat<M> numer = twice(dot(p0,geomNormal));
+        const vfloat<M> dist = rcp(denom)*numer;
+        lanes &= (vfloat<M>(ray.tnear()[k]) <= dist) & (dist <= vfloat<M>(ray.tfar[k]));
+        if (unlikely(none(lanes))) return false;
+
+        /* reject lanes whose denominator is zero */
+        lanes &= denom != vfloat<M>(zero);
+        if (unlikely(none(lanes))) return false;
+
+        /* build the hit record and let the epilog process it */
+        PlueckerHitM<M,UVMapper> result(U,V,UVW,dist,geomNormal,mapUV);
+        return epilog(lanes,result);
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_woop.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_woop.h
new file mode 100644
index 0000000000..63e649d8fb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_woop.h
@@ -0,0 +1,418 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "intersector_epilog.h"
+
+/*! This intersector implements a modified version of Woop's ray-triangle intersection test. */
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+    struct WoopHitM // hit record of the Woop test for M triangles
+    {
+      __forceinline WoopHitM() {}
+      /*! Stores the raw results and the normal; vu, vv and vt are not set here, only by finalize(). */
+      __forceinline WoopHitM(const vbool<M>& valid, 
+                             const vfloat<M>& U, 
+                             const vfloat<M>& V, 
+                             const vfloat<M>& T, 
+                             const vfloat<M>& inv_det,                              
+                             const Vec3vf<M>& Ng)
+        : U(U), V(V), T(T), inv_det(inv_det), valid(valid), vNg(Ng) {}
+      /*! Copies T to vt unchanged and scales U and V by inv_det. */
+      __forceinline void finalize() 
+      {
+        vt = T;
+        vu = U*inv_det;
+        vv = V*inv_det;
+      }
+
+      __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+      __forceinline float t  (const size_t i) const { return vt[i]; }
+      __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+      /* Members are non-const (as in MoellerTrumboreHitM): the record is re-initialised in place with placement new. */
+    private:
+      vfloat<M> U;
+      vfloat<M> V;
+      vfloat<M> T;
+      vfloat<M> inv_det;
+      
+    public:
+      vbool<M> valid; // lane mask passed to the constructor
+      vfloat<M> vu;   // written by finalize()
+      vfloat<M> vv;
+      vfloat<M> vt;
+      Vec3vf<M> vNg;  // normal passed to the constructor
+    };
+
+    template<int M>
+    struct WoopPrecalculations1 // per-ray constants of the Woop test: axis permutation, shear/scale factors, permuted origin
+    {
+      unsigned int kx,ky,kz;
+      Vec3vf<M> org;
+      Vec3fa S;
+      __forceinline WoopPrecalculations1() {}
+      /*! kz = maxDim(abs(ray.dir)); kx and ky are the two other axes, swapped for a negative dir[kz] to keep the triangle winding. */
+      __forceinline WoopPrecalculations1(const Ray& ray, const void* ptr)
+      {
+        kz = maxDim(abs(ray.dir));
+        kx = (kz+1) % 3;
+        ky = (kx+1) % 3;
+        const float inv_dir_kz = rcp(ray.dir[kz]);
+        if (ray.dir[kz] < 0.0f) std::swap(kx,ky); // sign test; a plain truthiness test would swap for every non-zero component
+        S.x = ray.dir[kx] * inv_dir_kz; // shear factors along kx and ky
+        S.y = ray.dir[ky] * inv_dir_kz;
+        S.z = inv_dir_kz;               // scale factor along kz
+        org = Vec3vf<M>(ray.org[kx],ray.org[ky],ray.org[kz]); // ray origin with components permuted to (kx,ky,kz)
+      }
+    };
+
+    
+    template<int M>
+    struct WoopIntersector1
+    {
+
+        typedef WoopPrecalculations1<M> Precalculations;
+
+      __forceinline WoopIntersector1() {}
+
+      __forceinline WoopIntersector1(const Ray& ray, const void* ptr) {}
+
+      static __forceinline bool intersect(const vbool<M>& valid0,
+                                          Ray& ray,
+                                          const Precalculations& pre,
+                                          const Vec3vf<M>& tri_v0,
+                                          const Vec3vf<M>& tri_v1,
+                                          const Vec3vf<M>& tri_v2,
+                                          WoopHitM<M>& hit)
+      {       
+        vbool<M> valid = valid0;
+
+        /* vertices relative to ray origin */
+        const Vec3vf<M> org = Vec3vf<M>(pre.org.x,pre.org.y,pre.org.z);
+        const Vec3vf<M> A = Vec3vf<M>(tri_v0[pre.kx],tri_v0[pre.ky],tri_v0[pre.kz]) - org;
+        const Vec3vf<M> B = Vec3vf<M>(tri_v1[pre.kx],tri_v1[pre.ky],tri_v1[pre.kz]) - org;
+        const Vec3vf<M> C = Vec3vf<M>(tri_v2[pre.kx],tri_v2[pre.ky],tri_v2[pre.kz]) - org;
+
+        /* shear and scale vertices */
+        const vfloat<M> Ax = nmadd(A.z,pre.S.x,A.x);
+        const vfloat<M> Ay = nmadd(A.z,pre.S.y,A.y);
+        const vfloat<M> Bx = nmadd(B.z,pre.S.x,B.x);
+        const vfloat<M> By = nmadd(B.z,pre.S.y,B.y);
+        const vfloat<M> Cx = nmadd(C.z,pre.S.x,C.x);
+        const vfloat<M> Cy = nmadd(C.z,pre.S.y,C.y);
+
+        /* scaled barycentric */
+        const vfloat<M> U0 = Cx*By;
+        const vfloat<M> U1 = Cy*Bx;
+        const vfloat<M> V0 = Ax*Cy;
+        const vfloat<M> V1 = Ay*Cx;
+        const vfloat<M> W0 = Bx*Ay;
+        const vfloat<M> W1 = By*Ax;
+#if !defined(__AVX512F__)
+        valid &= (U0 >= U1) & (V0 >= V1) & (W0 >= W1) |
+          (U0 <= U1) & (V0 <= V1) & (W0 <= W1);
+#else
+        valid &= ge(ge(U0 >= U1,V0,V1),W0,W1) | le(le(U0 <= U1,V0,V1),W0,W1);
+#endif
+
+        if (likely(none(valid))) return false;
+        const vfloat<M> U = U0-U1;
+        const vfloat<M> V = V0-V1;
+        const vfloat<M> W = W0-W1;
+
+        const vfloat<M> det = U+V+W;
+
+        valid &= det != 0.0f;
+        const vfloat<M> inv_det = rcp(det);
+
+        const vfloat<M> Az = pre.S.z * A.z;
+        const vfloat<M> Bz = pre.S.z * B.z;
+        const vfloat<M> Cz = pre.S.z * C.z;
+        const vfloat<M> T  = madd(U,Az,madd(V,Bz,W*Cz)); 
+        const vfloat<M> t  = T * inv_det;
+        /* perform depth test */
+        valid &= (vfloat<M>(ray.tnear()) < t) & (t <= vfloat<M>(ray.tfar));
+        if (likely(none(valid))) return false;
+        
+        const Vec3vf<M> tri_Ng = cross(tri_v2-tri_v0,tri_v0-tri_v1);
+
+        /* update hit information */
+        new (&hit) WoopHitM<M>(valid,U,V,t,inv_det,tri_Ng);
+        return true;
+      }
+      
+      static __forceinline bool intersect(Ray& ray,
+                                   const Precalculations& pre,
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   WoopHitM<M>& hit)
+      {
+        vbool<M> valid = true;
+        return intersect(valid,ray,pre,v0,v1,v2,hit);
+      }
+
+
+      template<typename Epilog>
+      static __forceinline bool intersect(Ray& ray, // runs the hit-producing overload, then hands any hit to the epilog
+                                     const Precalculations& pre,
+                                     const Vec3vf<M>& v0,
+                                     const Vec3vf<M>& v1,
+                                     const Vec3vf<M>& v2,
+                                     const Epilog& epilog) // called as epilog(hit.valid,hit) when a hit is reported
+      {
+        WoopHitM<M> hit;
+        if (likely(intersect(ray,pre,v0,v1,v2,hit))) return epilog(hit.valid,hit); // the epilog's result is the return value
+        return false; // the intersection test reported no hit
+      }
+
+      template<typename Epilog>
+      static __forceinline bool intersect(const vbool<M>& valid, // masked variant: `valid` is forwarded to the hit-producing overload
+                                   Ray& ray,
+                                   const Precalculations& pre,
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   const Epilog& epilog) // called as epilog(hit.valid,hit) when a hit is reported
+      {
+        WoopHitM<M> hit;
+        if (likely(intersect(valid,ray,pre,v0,v1,v2,hit))) return epilog(hit.valid,hit); // the epilog's result is the return value
+        return false; // the intersection test reported no hit
+      }
+    };
+    
+#if 0
+    template<int K>
+    struct WoopHitK // holds unnormalized hit values; operator() divides them by absDen on demand
+    {
+      __forceinline WoopHitK(const vfloat<K>& U, const vfloat<K>& V, const vfloat<K>& T, const vfloat<K>& absDen, const Vec3vf<K>& Ng)
+        : U(U), V(V), T(T), absDen(absDen), Ng(Ng) {}
+      
+      __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const // returns the tuple (u,v,t,Ng)
+      {
+        const vfloat<K> rcpAbsDen = rcp(absDen); // one reciprocal shared by the three scalings below
+        const vfloat<K> t = T * rcpAbsDen;
+        const vfloat<K> u = U * rcpAbsDen;
+        const vfloat<K> v = V * rcpAbsDen;
+        return std::make_tuple(u,v,t,Ng); // Ng is passed through unchanged
+      }
+      
+    private:
+      const vfloat<K> U; // scaled by 1/absDen into u
+      const vfloat<K> V; // scaled by 1/absDen into v
+      const vfloat<K> T; // scaled by 1/absDen into t
+      const vfloat<K> absDen; // common denominator of U, V and T
+      const Vec3vf<K> Ng; // returned as-is by operator()
+    };
+    
+    template<int M, int K>
+    struct WoopIntersectorK
+    {
+      __forceinline WoopIntersectorK(const vbool<K>& valid, const RayK<K>& ray) {} // empty body: neither argument is used
+      
+      /*! Intersects K rays with one triangle given as vertex tri_v0, edges tri_e1/tri_e2 and normal tri_Ng; surviving lanes go to the epilog. */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                        //RayK<K>& ray,
+                                        const Vec3vf<K>& ray_org,
+                                        const Vec3vf<K>& ray_dir,
+                                        const vfloat<K>& ray_tnear,
+                                        const vfloat<K>& ray_tfar,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_e1,
+                                        const Vec3vf<K>& tri_e2,
+                                        const Vec3vf<K>& tri_Ng,
+                                        const Epilog& epilog) const
+      { 
+        /* calculate denominator den = dot(Ng,dir) together with its magnitude and sign mask */
+        vbool<K> valid = valid0;
+        const Vec3vf<K> C = tri_v0 - ray_org;
+        const Vec3vf<K> R = cross(C,ray_dir);
+        const vfloat<K> den = dot(tri_Ng,ray_dir);
+        const vfloat<K> absDen = abs(den);
+        const vfloat<K> sgnDen = signmsk(den); // XORed into U, V and T below
+        
+        /* test against edge p2 p0 */
+        const vfloat<K> U = dot(tri_e2,R) ^ sgnDen;
+        valid &= U >= 0.0f;
+        if (likely(none(valid))) return false; // early out; `false` is converted to the vbool<K> return type
+        
+        /* test against edge p0 p1 */
+        const vfloat<K> V = dot(tri_e1,R) ^ sgnDen;
+        valid &= V >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* test against edge p1 p2 */
+        const vfloat<K> W = absDen-U-V;
+        valid &= W >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* perform depth test in scaled form: T is compared against absDen*tnear and absDen*tfar, so no division is needed */
+        const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen;
+        valid &= (absDen*ray_tnear < T) & (T <= absDen*ray_tfar);
+        if (unlikely(none(valid))) return false;
+        
+        /* perform backface culling (without culling, only lanes with den == 0 are rejected) */
+#if defined(EMBREE_BACKFACE_CULLING)
+        valid &= den < vfloat<K>(zero);
+        if (unlikely(none(valid))) return false;
+#else
+        valid &= den != vfloat<K>(zero);
+        if (unlikely(none(valid))) return false;
+#endif
+        
+        /* hand the unnormalized hit (U,V,T,absDen,Ng) to the epilog; WoopHitK divides by absDen on demand */
+        WoopHitK<K> hit(U,V,T,absDen,tri_Ng);
+        return epilog(valid,hit);
+      }
+      
+      /*! Intersects K rays with one triangle given by its three vertices; derives edges and normal, then forwards. */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0, 
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_v1,
+                                        const Vec3vf<K>& tri_v2,
+                                        const Epilog& epilog) const
+      {
+        const Vec3vf<K> e1 = tri_v0-tri_v1; // edge convention used by the edge-based overload: e1 = v0-v1
+        const Vec3vf<K> e2 = tri_v2-tri_v0; // e2 = v2-v0
+        const Vec3vf<K> Ng = cross(e2,e1);
+        return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,epilog);
+      }
+
+      /*! Intersects K rays with one triangle given by vertex tri_v0 and edges tri_e1, tri_e2; computes the normal, then forwards. */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectEdgeK(const vbool<K>& valid0, 
+                                            RayK<K>& ray,
+                                            const Vec3vf<K>& tri_v0, 
+                                            const Vec3vf<K>& tri_e1, 
+                                            const Vec3vf<K>& tri_e2, 
+                                            const Epilog& epilog) const
+      {
+        const Vec3vf<K> tri_Ng = cross(tri_e2,tri_e1); // same normal formula as the vertex-based intersectK
+        return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,tri_e1,tri_e2,tri_Ng,epilog);
+      }
+      
+      /*! Intersect k'th ray from ray packet of size K with M triangles given as vertex v0 plus edges e1,e2; fills `hit` on success. */
+      __forceinline bool intersectEdge(RayK<K>& ray,
+                                       size_t k,
+                                       const Vec3vf<M>& tri_v0,
+                                       const Vec3vf<M>& tri_e1,
+                                       const Vec3vf<M>& tri_e2,
+                                       WoopHitM<M>& hit) const
+      {
+        /* calculate denominator */
+        typedef Vec3vf<M> Vec3vfM;
+        const Vec3vf<M> tri_Ng = cross(tri_e2,tri_e1);
+
+        const Vec3vfM O = broadcast<vfloat<M>>(ray.org,k); // origin of ray k, broadcast for use against all M triangles
+        const Vec3vfM D = broadcast<vfloat<M>>(ray.dir,k); // direction of ray k, broadcast likewise
+        const Vec3vfM C = Vec3vfM(tri_v0) - O;
+        const Vec3vfM R = cross(C,D);
+        const vfloat<M> den = dot(Vec3vfM(tri_Ng),D);
+        const vfloat<M> absDen = abs(den);
+        const vfloat<M> sgnDen = signmsk(den); // XORed into U, V and T below
+        
+        /* perform edge tests */
+        const vfloat<M> U = dot(Vec3vf<M>(tri_e2),R) ^ sgnDen;
+        const vfloat<M> V = dot(Vec3vf<M>(tri_e1),R) ^ sgnDen;
+        
+        /* perform backface culling (without culling, only lanes with den == 0 are rejected) */
+#if defined(EMBREE_BACKFACE_CULLING)
+        vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#else
+        vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#endif
+        if (likely(none(valid))) return false;
+        
+        /* perform depth test in scaled form: T is compared against absDen*tnear and absDen*tfar of ray k */
+        const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+        valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k]));
+        if (likely(none(valid))) return false;
+        
+        /* construct the hit record in place in the caller-provided `hit` (placement new) */
+        new (&hit) WoopHitM<M>(valid,U,V,T,absDen,tri_Ng);
+        return true;
+      }
+
+      __forceinline bool intersectEdge(RayK<K>& ray, // time-range variant: edge-based test followed by a per-lane time filter
+                                       size_t k,
+                                       const BBox<vfloat<M>>& time_range,
+                                       const Vec3vf<M>& tri_v0, 
+                                       const Vec3vf<M>& tri_e1, 
+                                       const Vec3vf<M>& tri_e2, 
+                                       WoopHitM<M>& hit) const
+      {
+        if (likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,hit))) // edge-based overload: the arguments are edges, not vertices
+        {
+          hit.valid &= time_range.lower <= vfloat<M>(ray.time[k]); // keep lanes with lower <= ray time
+          hit.valid &= vfloat<M>(ray.time[k]) < time_range.upper; // ... and ray time < upper (half-open range)
+          return any(hit.valid); // true only if some lane survived the time filter
+        }
+        return false;
+      }
+
+      template<typename Epilog>
+      __forceinline bool intersectEdge(RayK<K>& ray, // epilog variant: runs the hit-producing overload, then calls the epilog
+                                       size_t k,
+                                       const Vec3vf<M>& tri_v0, 
+                                       const Vec3vf<M>& tri_e1, 
+                                       const Vec3vf<M>& tri_e2, 
+                                       const Epilog& epilog) const
+      {
+        WoopHitM<M> hit;
+        if (likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit); // the epilog's result is the return value
+        return false;
+      }
+
+      template<typename Epilog>
+      __forceinline bool intersectEdge(RayK<K>& ray, // epilog variant of the time-range overload
+                                       size_t k,                           
+                                       const BBox<vfloat<M>>& time_range,
+                                       const Vec3vf<M>& tri_v0, 
+                                       const Vec3vf<M>& tri_e1, 
+                                       const Vec3vf<M>& tri_e2, 
+                                       const Epilog& epilog) const
+      {
+        WoopHitM<M> hit;
+        if (likely(intersectEdge(ray,k,time_range,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit); // the epilog's result is the return value
+        return false;
+      }
+      
+      template<typename Epilog>
+      __forceinline bool intersect(RayK<K>& ray, // vertex-based entry point: converts vertices to edges and forwards to intersectEdge
+                                   size_t k,
+                                   const Vec3vf<M>& v0, 
+                                   const Vec3vf<M>& v1, 
+                                   const Vec3vf<M>& v2, 
+                                   const Epilog& epilog) const      
+      {
+        const Vec3vf<M> e1 = v0-v1; // edge convention expected by intersectEdge: e1 = v0-v1
+        const Vec3vf<M> e2 = v2-v0; // e2 = v2-v0
+        return intersectEdge(ray,k,v0,e1,e2,epilog);
+      }
+
+      template<typename Epilog>
+      __forceinline bool intersect(RayK<K>& ray, // vertex-based entry point with time range: converts vertices to edges and forwards
+                                   size_t k,
+                                   const BBox<vfloat<M>>& time_range,
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   const Epilog& epilog) const
+      {
+        const Vec3vf<M> e1 = v0-v1; // edge convention expected by intersectEdge: e1 = v0-v1
+        const Vec3vf<M> e2 = v2-v0; // e2 = v2-v0
+        return intersectEdge(ray,k,time_range,v0,e1,e2,epilog);
+      }
+    };
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_triangle_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_triangle_intersector.h
new file mode 100644
index 0000000000..91b35c36f3
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_triangle_intersector.h
@@ -0,0 +1,132 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "primitive.h"
+
+namespace embree
+{ 
+  namespace isa
+  {
+    struct TriangleTriangleIntersector // static helpers that test two triangles for intersection (2D and 3D overloads)
+    {
+      __forceinline static float T(float pa0, float pa1, float da0, float da1) { // interpolates pa to the point where d, interpolated from da0 to da1, is zero
+        return pa0 + (pa1-pa0)*da0/(da0-da1); // divides by (da0-da1); the callers in this struct guard with abs(da0-da1) > 0
+      }
+      
+      __forceinline static bool point_line_side(const Vec2f& p, const Vec2f& a0, const Vec2f& a1) { // side of p relative to the line a0-a1, as a bool
+        return det(p-a0,a0-a1) >= 0.0f; // a zero determinant counts as the `true` side
+      }
+      
+      __forceinline static bool point_inside_triangle(const Vec2f& p, const Vec2f& a, const Vec2f& b, const Vec2f& c) 
+      {
+        const bool pab = point_line_side(p,a,b); 
+        const bool pbc = point_line_side(p,b,c);
+        const bool pca = point_line_side(p,c,a);
+        return pab == pbc && pab == pca; // inside when all three edge tests agree, whichever side that is
+      }
+      
+      __forceinline static bool intersect_line_line(const Vec2f& a0, const Vec2f& a1, const Vec2f& b0, const Vec2f& b1)
+      {
+        const bool different_sides0 = point_line_side(b0,a0,a1) != point_line_side(b1,a0,a1); // b0 and b1 straddle line a
+        const bool different_sides1 = point_line_side(a0,b0,b1) != point_line_side(a1,b0,b1); // a0 and a1 straddle line b
+        return different_sides0 && different_sides1;
+      }
+      
+      __forceinline static bool intersect_triangle_triangle (const Vec2f& a0, const Vec2f& a1, const Vec2f& a2, 
+                                                             const Vec2f& b0, const Vec2f& b1, const Vec2f& b2)
+      {
+        const bool a01_b01 = intersect_line_line(a0,a1,b0,b1); 
+        if (a01_b01) return true; // any of the 9 edge pairs crossing means the triangles intersect
+        const bool a01_b12 = intersect_line_line(a0,a1,b1,b2);
+        if (a01_b12) return true;
+        const bool a01_b20 = intersect_line_line(a0,a1,b2,b0);
+        if (a01_b20) return true;
+        const bool a12_b01 = intersect_line_line(a1,a2,b0,b1);
+        if (a12_b01) return true;
+        const bool a12_b12 = intersect_line_line(a1,a2,b1,b2);
+        if (a12_b12) return true;
+        const bool a12_b20 = intersect_line_line(a1,a2,b2,b0);
+        if (a12_b20) return true;
+        const bool a20_b01 = intersect_line_line(a2,a0,b0,b1);
+        if (a20_b01) return true;
+        const bool a20_b12 = intersect_line_line(a2,a0,b1,b2);
+        if (a20_b12) return true;
+        const bool a20_b20 = intersect_line_line(a2,a0,b2,b0);
+        if (a20_b20) return true;
+        
+        bool a_in_b = point_inside_triangle(a0,b0,b1,b2) && point_inside_triangle(a1,b0,b1,b2) && point_inside_triangle(a2,b0,b1,b2); // no edges cross: A entirely inside B?
+        if (a_in_b) return true;
+        
+        bool b_in_a = point_inside_triangle(b0,a0,a1,a2) && point_inside_triangle(b1,a0,a1,a2) && point_inside_triangle(b2,a0,a1,a2); // ... or B entirely inside A?
+        if (b_in_a) return true;
+        
+        return false;
+      }
+      
+      static bool intersect_triangle_triangle (const Vec3fa& a0, const Vec3fa& a1, const Vec3fa& a2,
+                                               const Vec3fa& b0, const Vec3fa& b1, const Vec3fa& b2)
+      {
+        const float eps = 1E-5f; // tolerance applied to the unnormalized plane distances below
+        
+        /* calculate triangle planes as dot(N,x) = C, with unnormalized normals */
+        const Vec3fa Na = cross(a1-a0,a2-a0);
+        const float  Ca = dot(Na,a0);
+        const Vec3fa Nb = cross(b1-b0,b2-b0);
+        const float  Cb = dot(Nb,b0);
+        
+        /* signed unnormalized distances of A's vertices to plane B; reject if all lie beyond eps on one side */
+        const float da0 = dot(Nb,a0)-Cb;
+        const float da1 = dot(Nb,a1)-Cb;
+        const float da2 = dot(Nb,a2)-Cb;
+        if (max(da0,da1,da2) < -eps) return false;
+        if (min(da0,da1,da2) > +eps) return false;
+        //CSTAT(bvh_collide_prim_intersections4++);
+        
+        /* signed unnormalized distances of B's vertices to plane A; reject if all lie beyond eps on one side */
+        const float db0 = dot(Na,b0)-Ca;
+        const float db1 = dot(Na,b1)-Ca;
+        const float db2 = dot(Na,b2)-Ca;
+        if (max(db0,db1,db2) < -eps) return false;
+        if (min(db0,db1,db2) > +eps) return false;
+        //CSTAT(bvh_collide_prim_intersections5++);
+        
+        if (unlikely((std::fabs(da0) < eps && std::fabs(da1) < eps && std::fabs(da2) < eps) ||
+                     (std::fabs(db0) < eps && std::fabs(db1) < eps && std::fabs(db2) < eps))) // (near-)coplanar case: fall back to the 2D test
+        {
+          const size_t dz = maxDim(Na); // axis dropped by the projection (presumably the dominant axis of Na)
+          const size_t dx = (dz+1)%3;
+          const size_t dy = (dx+1)%3;
+          const Vec2f A0(a0[dx],a0[dy]);
+          const Vec2f A1(a1[dx],a1[dy]);
+          const Vec2f A2(a2[dx],a2[dy]);
+          const Vec2f B0(b0[dx],b0[dy]);
+          const Vec2f B1(b1[dx],b1[dy]);
+          const Vec2f B2(b2[dx],b2[dy]);
+          return intersect_triangle_triangle(A0,A1,A2,B0,B1,B2);
+        }
+        
+        const Vec3fa D = cross(Na,Nb); // direction of the line shared by both planes
+        const float pa0 = dot(D,a0); // vertex positions projected onto D
+        const float pa1 = dot(D,a1);
+        const float pa2 = dot(D,a2);
+        const float pb0 = dot(D,b0);
+        const float pb1 = dot(D,b1);
+        const float pb2 = dot(D,b2);
+        
+        BBox1f ba = empty; // interval along D covered by A's edges that cross plane B
+        if (min(da0,da1) <= 0.0f && max(da0,da1) >= 0.0f && abs(da0-da1) > 0.0f) ba.extend(T(pa0,pa1,da0,da1));
+        if (min(da1,da2) <= 0.0f && max(da1,da2) >= 0.0f && abs(da1-da2) > 0.0f) ba.extend(T(pa1,pa2,da1,da2));
+        if (min(da2,da0) <= 0.0f && max(da2,da0) >= 0.0f && abs(da2-da0) > 0.0f) ba.extend(T(pa2,pa0,da2,da0));
+        
+        BBox1f bb = empty; // interval along D covered by B's edges that cross plane A
+        if (min(db0,db1) <= 0.0f && max(db0,db1) >= 0.0f && abs(db0-db1) > 0.0f) bb.extend(T(pb0,pb1,db0,db1));
+        if (min(db1,db2) <= 0.0f && max(db1,db2) >= 0.0f && abs(db1-db2) > 0.0f) bb.extend(T(pb1,pb2,db1,db2));
+        if (min(db2,db0) <= 0.0f && max(db2,db0) >= 0.0f && abs(db2-db0) > 0.0f) bb.extend(T(pb2,pb0,db2,db0));
+        
+        return conjoint(ba,bb); // presumably true when the two intervals overlap - confirm against BBox
+      }
+    };
+  }
+}
+
+  
diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglei.h b/thirdparty/embree-aarch64/kernels/geometry/trianglei.h
new file mode 100644
index 0000000000..4f3118cc0c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/trianglei.h
@@ -0,0 +1,442 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../common/scene.h"
+
+namespace embree
+{
+  /* Stores up to M triangles of an indexed face set as geometry/primitive IDs (plus vertex offsets unless EMBREE_COMPACT_POLYS) */
+  template <int M>
+  struct TriangleMi
+  {
+    /* Virtual interface to query information about the triangle type */
+    struct Type : public PrimitiveType
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* primitive supports multiple time segments */
+    static const bool singleTimeSegment = false;
+
+    /* Returns maximum number of stored triangles */
+    static __forceinline size_t max_size() { return M; }
+
+    /* Returns required number of primitive blocks for N primitives (N/M rounded up) */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+  public:
+
+    /* Default constructor (leaves all members uninitialized) */
+    __forceinline TriangleMi() {  }
+
+    /* Construction from vertices and IDs (the vertex offsets are ignored under EMBREE_COMPACT_POLYS) */
+    __forceinline TriangleMi(const vuint<M>& v0,
+                             const vuint<M>& v1,
+                             const vuint<M>& v2,
+                             const vuint<M>& geomIDs,
+                             const vuint<M>& primIDs)
+#if defined(EMBREE_COMPACT_POLYS)
+      : geomIDs(geomIDs), primIDs(primIDs) {}
+#else
+    : v0_(v0), v1_(v1), v2_(v2), geomIDs(geomIDs), primIDs(primIDs) {}
+#endif
+
+    /* Returns a mask that tells which triangles are valid (primID != -1) */
+    __forceinline vbool<M> valid() const { return primIDs != vuint<M>(-1); }
+
+    /* Returns if the specified triangle is valid */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return primIDs[i] != -1; }
+
+    /* Returns the number of stored triangles (bsf of the inverted valid mask, presumably the index of the first invalid slot) */
+    __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+    /* Returns the geometry IDs */
+    __forceinline vuint<M> geomID() const { return geomIDs; }
+    __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+    /* Returns the primitive IDs */
+    __forceinline vuint<M> primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* Calculate the bounds of the triangles at time step itime */
+    __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const
+    {
+      BBox3fa bounds = empty;
+      for (size_t i=0; i<M && valid(i); i++) { // stops at the first invalid slot
+        const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(i));
+        bounds.extend(mesh->bounds(primID(i),itime));
+      }
+      return bounds;
+    }
+
+    /* Calculate the linear bounds of the primitive from the bounds at time steps itime and itime+1 */
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime) {
+      return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1));
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) // merges the per-triangle linear bounds reported by each mesh
+    {
+      LBBox3fa allBounds = empty;
+      for (size_t i=0; i<M && valid(i); i++) // stops at the first invalid slot
+      {
+        const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(i));
+        allBounds.extend(mesh->linearBounds(primID(i), itime, numTimeSteps));
+      }
+      return allBounds;
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) // same as above, but for a time range
+    {
+      LBBox3fa allBounds = empty;
+      for (size_t i=0; i<M && valid(i); i++) // stops at the first invalid slot
+      {
+        const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(i));
+        allBounds.extend(mesh->linearBounds(primID(i), time_range));
+      }
+      return allBounds;
+    }
+    
+    /* Non-temporal store of every member of src into dst */
+    __forceinline static void store_nt(TriangleMi* dst, const TriangleMi& src)
+    {
+#if !defined(EMBREE_COMPACT_POLYS)
+      vuint<M>::store_nt(&dst->v0_,src.v0_);
+      vuint<M>::store_nt(&dst->v1_,src.v1_);
+      vuint<M>::store_nt(&dst->v2_,src.v2_);
+#endif
+      vuint<M>::store_nt(&dst->geomIDs,src.geomIDs);
+      vuint<M>::store_nt(&dst->primIDs,src.primIDs);
+    }
+
+    /* Fill triangle from triangle list: consumes up to M entries of prims[begin,end), advancing begin */
+    template<typename PrimRefT>
+    __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      vuint<M> v0 = zero, v1 = zero, v2 = zero;
+      vuint<M> geomID = -1, primID = -1;
+      const PrimRefT* prim = &prims[begin];
+
+      for (size_t i=0; i<M; i++)
+      {
+        if (begin<end) {
+          geomID[i] = prim->geomID();
+          primID[i] = prim->primID();
+#if !defined(EMBREE_COMPACT_POLYS)
+          const TriangleMesh* mesh = scene->get<TriangleMesh>(prim->geomID());
+          const TriangleMesh::Triangle& tri = mesh->triangle(prim->primID());
+          unsigned int int_stride = mesh->vertices0.getStride()/4; // vertex stride divided by 4, matching the "4 byte offset" members
+          v0[i] = tri.v[0] * int_stride;
+          v1[i] = tri.v[1] * int_stride;
+          v2[i] = tri.v[2] * int_stride;
+#endif
+          begin++;
+        } else {
+          assert(i); // slot 0 must have been filled, i.e. the input range was not empty
+          if (likely(i > 0)) {
+            geomID[i] = geomID[0]; // padding slot: copies geomID of slot 0
+            primID[i] = -1; // ... and is marked invalid
+            v0[i] = v0[0]; // all three offsets reuse v0[0], so the padding slot refers to a single vertex
+            v1[i] = v0[0];
+            v2[i] = v0[0];
+          }
+        }
+        if (begin<end) prim = &prims[begin]; // advance only while entries remain
+      }
+      new (this) TriangleMi(v0,v1,v2,geomID,primID); // FIXME: use non temporal store
+    }
+
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) // fill, then return the linear bounds for itime
+    {
+      fill(prims, begin, end, scene);
+      return linearBounds(scene, itime);
+    }
+
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) // fill, then return the linear bounds for time_range
+    {
+      fill(prims, begin, end, scene);
+      return linearBounds(scene, time_range);
+    }
+
+    /* Recomputes and returns the bounds of the stored triangles from the given mesh (no member is modified) */
+    __forceinline BBox3fa update(TriangleMesh* mesh)
+    {
+      BBox3fa bounds = empty;
+      for (size_t i=0; i<M; i++)
+      {
+        if (primID(i) == -1) break; // stops at the first invalid slot
+        const unsigned int primId = primID(i);
+        const TriangleMesh::Triangle& tri = mesh->triangle(primId);
+        const Vec3fa p0 = mesh->vertex(tri.v[0]);
+        const Vec3fa p1 = mesh->vertex(tri.v[1]);
+        const Vec3fa p2 = mesh->vertex(tri.v[2]);
+        bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2)));
+      }
+      return bounds;
+    }
+
+  protected:
+#if !defined(EMBREE_COMPACT_POLYS)
+    vuint<M> v0_;         // 4 byte offset of 1st vertex
+    vuint<M> v1_;         // 4 byte offset of 2nd vertex
+    vuint<M> v2_;         // 4 byte offset of 3rd vertex
+#endif
+    vuint<M> geomIDs;    // geometry ID of mesh
+    vuint<M> primIDs;    // primitive ID of primitive inside mesh (-1 marks an unused slot)
+  };
+
+  namespace isa
+  {
+    
+  template<int M>
+    struct TriangleMi : public embree::TriangleMi<M> // ISA-specific extension adding vertex loading and gather helpers
+  {
+#if !defined(EMBREE_COMPACT_POLYS)
+    using embree::TriangleMi<M>::v0_;
+    using embree::TriangleMi<M>::v1_;
+    using embree::TriangleMi<M>::v2_;
+#endif
+    using embree::TriangleMi<M>::geomIDs;
+    using embree::TriangleMi<M>::primIDs;
+    using embree::TriangleMi<M>::geomID;
+    using embree::TriangleMi<M>::primID;
+    using embree::TriangleMi<M>::valid;
+        
+    /* loads a single vertex (vertex number vid of the triangle in slot index) */
+    template<int vid>
+    __forceinline Vec3f getVertex(const size_t index, const Scene *const scene) const
+    {
+#if defined(EMBREE_COMPACT_POLYS)
+      const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index));
+      const TriangleMesh::Triangle& tri = mesh->triangle(primID(index));
+      return (Vec3f) mesh->vertices[0][tri.v[vid]];
+#else
+      const vuint<M>& v = getVertexOffset<vid>();
+      const float* vertices = scene->vertices[geomID(index)];
+      return (Vec3f&) vertices[v[index]]; // reinterprets the floats starting at the stored offset as a Vec3f
+#endif
+    }
+
+    template<int vid, typename T>
+    __forceinline Vec3<T> getVertex(const size_t index, const Scene *const scene, const size_t itime, const T& ftime) const // vertex interpolated between time steps itime and itime+1
+    {
+#if defined(EMBREE_COMPACT_POLYS)
+      const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index));
+      const TriangleMesh::Triangle& tri = mesh->triangle(primID(index));
+      const Vec3fa v0 = mesh->vertices[itime+0][tri.v[vid]];
+      const Vec3fa v1 = mesh->vertices[itime+1][tri.v[vid]];
+#else
+      const vuint<M>& v = getVertexOffset<vid>();
+      const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index));
+      const float* vertices0 = (const float*) mesh->vertexPtr(0,itime+0);
+      const float* vertices1 = (const float*) mesh->vertexPtr(0,itime+1);
+      const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]);
+      const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]);
+#endif
+      const Vec3<T> p0(v0.x,v0.y,v0.z);
+      const Vec3<T> p1(v1.x,v1.y,v1.z);
+      return lerp(p0,p1,ftime);
+    }
+
+    template<int vid, int K, typename T>
+    __forceinline Vec3<T> getVertex(const vbool<K>& valid, const size_t index, const Scene *const scene, const vint<K>& itime, const T& ftime) const // per-lane variant: each active lane uses its own itime[i]
+    {
+      Vec3<T> p0, p1; // only the lanes set in `valid` are written below
+      const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index));
+      
+      for (size_t mask=movemask(valid), i=bsf(mask); mask; mask=btc(mask,i), i=bsf(mask)) // visits each lane whose bit is set in the mask
+      {
+#if defined(EMBREE_COMPACT_POLYS)
+        const TriangleMesh::Triangle& tri = mesh->triangle(primID(index));
+        const Vec3fa v0 = mesh->vertices[itime[i]+0][tri.v[vid]];
+        const Vec3fa v1 = mesh->vertices[itime[i]+1][tri.v[vid]];
+#else
+        const vuint<M>& v = getVertexOffset<vid>();
+        const float* vertices0 = (const float*) mesh->vertexPtr(0,itime[i]+0);
+        const float* vertices1 = (const float*) mesh->vertexPtr(0,itime[i]+1);
+        const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]);
+        const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]);
+#endif
+        p0.x[i] = v0.x; p0.y[i] = v0.y; p0.z[i] = v0.z;
+        p1.x[i] = v1.x; p1.y[i] = v1.y; p1.z[i] = v1.z;
+      }
+      return (T(one)-ftime)*p0 + ftime*p1; // linear interpolation written out explicitly
+    }
+
+    struct Triangle { // the three vertices of one triangle, each loaded as a vfloat4
+      vfloat4 v0,v1,v2;
+    };
+    
+#if defined(EMBREE_COMPACT_POLYS)
+    
+    __forceinline Triangle loadTriangle(const int i, const Scene* const scene) const 
+    {
+      const unsigned int geomID = geomIDs[i];
+      const unsigned int primID = primIDs[i];
+      if (unlikely(primID == -1)) return { zero, zero, zero }; // unused slot: return an all-zero triangle instead of indexing the mesh
+      const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID);
+      const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+      const vfloat4 v0 = (vfloat4) mesh->vertices0[tri.v[0]];
+      const vfloat4 v1 = (vfloat4) mesh->vertices0[tri.v[1]];
+      const vfloat4 v2 = (vfloat4) mesh->vertices0[tri.v[2]];
+      return { v0, v1, v2 };
+    }
+
+    __forceinline Triangle loadTriangle(const int i, const int itime, const TriangleMesh* const mesh) const 
+    {
+      const unsigned int primID = primIDs[i];
+      if (unlikely(primID == -1)) return { zero, zero, zero }; // unused slot: return an all-zero triangle instead of indexing the mesh
+      const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+      const vfloat4 v0 = (vfloat4) mesh->vertices[itime][tri.v[0]];
+      const vfloat4 v1 = (vfloat4) mesh->vertices[itime][tri.v[1]];
+      const vfloat4 v2 = (vfloat4) mesh->vertices[itime][tri.v[2]];
+      return { v0, v1, v2 };
+    }
+    
+#else
+
+    __forceinline Triangle loadTriangle(const int i, const Scene* const scene) const 
+    {
+      const float* vertices = scene->vertices[geomID(i)];
+      const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); // loads 4 floats starting at the stored offset
+      const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]);
+      const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]);
+      return { v0, v1, v2 };
+    }
+
+    __forceinline Triangle loadTriangle(const int i, const int itime, const TriangleMesh* const mesh) const 
+    {
+      const float* vertices = (const float*) mesh->vertexPtr(0,itime);
+      const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); // loads 4 floats starting at the stored offset
+      const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]);
+      const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]);
+      return { v0, v1, v2 };
+    }
+    
+#endif
+
+    /* Gather the triangles: writes the vertices of all M slots into p0, p1, p2 */
+    __forceinline void gather(Vec3vf<M>& p0, Vec3vf<M>& p1, Vec3vf<M>& p2, const Scene* const scene) const;
+
+    template<int K>
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 2000) // workaround for compiler bug in ICC 2019
+    __noinline
+#else
+    __forceinline
+#endif
+    void gather(const vbool<K>& valid,
+                Vec3vf<K>& p0,
+                Vec3vf<K>& p1,
+                Vec3vf<K>& p2,
+                const size_t index,
+                const Scene* const scene,
+                const vfloat<K>& time) const
+    {
+      const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index));
+
+      vfloat<K> ftime;
+      const vint<K> itime = mesh->timeSegment(time, ftime); // per-lane time segment index; ftime is written by the call
+
+      const size_t first = bsf(movemask(valid));
+      if (likely(all(valid,itime[first] == itime))) // all active lanes share one time segment: use the single-segment getVertex
+      {
+        p0 = getVertex<0>(index, scene, itime[first], ftime);
+        p1 = getVertex<1>(index, scene, itime[first], ftime);
+        p2 = getVertex<2>(index, scene, itime[first], ftime);
+      } else { // segments differ between lanes: use the per-lane getVertex
+        p0 = getVertex<0>(valid, index, scene, itime, ftime);
+        p1 = getVertex<1>(valid, index, scene, itime, ftime);
+        p2 = getVertex<2>(valid, index, scene, itime, ftime);
+      }
+    }
+
+    __forceinline void gather(Vec3vf<M>& p0,
+                              Vec3vf<M>& p1,
+                              Vec3vf<M>& p2,
+                              const TriangleMesh* mesh,
+                              const Scene *const scene,
+                              const int itime) const;
+
+    __forceinline void gather(Vec3vf<M>& p0,
+                              Vec3vf<M>& p1,
+                              Vec3vf<M>& p2,
+                              const Scene *const scene,
+                              const float time) const;
+
+
+#if !defined(EMBREE_COMPACT_POLYS)
+    template<int N> const vuint<M>& getVertexOffset() const; // returns the offsets of vertex N; specialized outside the class
+#endif
+  };
+
+#if !defined(EMBREE_COMPACT_POLYS)
+  template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<0>() const { return v0_; } // offsets of the 1st vertex
+  template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<1>() const { return v1_; } // offsets of the 2nd vertex
+  template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<2>() const { return v2_; } // offsets of the 3rd vertex
+#endif
+  
+  template<>
+  __forceinline void TriangleMi<4>::gather(Vec3vf4& p0, // M=4 specialization: loads the 4 triangles and transposes them into p0,p1,p2
+                                           Vec3vf4& p1,
+                                           Vec3vf4& p2,
+                                           const Scene* const scene) const
+  {
+    const Triangle tri0 = loadTriangle(0,scene);
+    const Triangle tri1 = loadTriangle(1,scene);
+    const Triangle tri2 = loadTriangle(2,scene);
+    const Triangle tri3 = loadTriangle(3,scene);
+    transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); // four first vertices -> x, y, z lanes of p0
+    transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z); // four second vertices -> p1
+    transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z); // four third vertices -> p2
+  }
+
+  template<>
+  __forceinline void TriangleMi<4>::gather(Vec3vf4& p0, // M=4 specialization for one time step: all 4 triangles are read from `mesh`
+                                           Vec3vf4& p1,
+                                           Vec3vf4& p2,
+                                           const TriangleMesh* mesh,
+                                           const Scene *const scene,
+                                           const int itime) const
+  {
+    const Triangle tri0 = loadTriangle(0,itime,mesh); // `scene` is not used in this body
+    const Triangle tri1 = loadTriangle(1,itime,mesh);
+    const Triangle tri2 = loadTriangle(2,itime,mesh);
+    const Triangle tri3 = loadTriangle(3,itime,mesh);
+    transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); // four first vertices -> x, y, z lanes of p0
+    transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z); // four second vertices -> p1
+    transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z); // four third vertices -> p2
+  }
+
+  template<>
+  __forceinline void TriangleMi<4>::gather(Vec3vf4& p0, // M=4 specialization for a float time: interpolates between two time steps
+                                           Vec3vf4& p1,
+                                           Vec3vf4& p2,
+                                           const Scene *const scene,
+                                           const float time) const
+  {
+    const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(0)); // in mblur mode all geometries are identical
+
+    float ftime;
+    const int itime = mesh->timeSegment(time, ftime); // time segment index; ftime is written by the call
+
+    Vec3vf4 a0,a1,a2; gather(a0,a1,a2,mesh,scene,itime); // vertices at time step itime
+    Vec3vf4 b0,b1,b2; gather(b0,b1,b2,mesh,scene,itime+1); // vertices at time step itime+1
+    p0 = lerp(a0,b0,vfloat4(ftime));
+    p1 = lerp(a1,b1,vfloat4(ftime));
+    p2 = lerp(a2,b2,vfloat4(ftime));
+  }
+  }
+
+  template<int M>
+  typename TriangleMi<M>::Type TriangleMi<M>::type; // out-of-class definition of the static `type` member
+
+  typedef TriangleMi<4> Triangle4i; // alias for the 4-wide instantiation
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/trianglei_intersector.h
new file mode 100644
index 0000000000..e2f106a62c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/trianglei_intersector.h
@@ -0,0 +1,336 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "trianglei.h"
+#include "triangle_intersector_moeller.h"
+#include "triangle_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Intersects M triangles with 1 ray. Vertices are gathered through context->scene on every call and tested via MoellerTrumboreIntersector1. */
+    template<int M, int Mx, bool filter>
+    struct TriangleMiIntersector1Moeller
+    {
+      typedef TriangleMi<M> Primitive; // triangle block type accepted by the functions below
+      typedef MoellerTrumboreIntersector1<Mx> Precalculations; // object whose intersect() member performs the test
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      { // closest-hit variant: result handling is delegated to Intersect1EpilogM (defined elsewhere)
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); // fetch the block's three vertex sets
+        pre.intersect(ray,v0,v1,v2,/*UVIdentity<Mx>(),*/Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      { // occlusion variant: returns whatever pre.intersect() reports when run with Occluded1EpilogM
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); // fetch the block's three vertex sets
+        return pre.intersect(ray,v0,v1,v2,/*UVIdentity<Mx>(),*/Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      { // pure forwarder to the generic point-query helper
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+    /*! Intersects M triangles with K rays. Offers packet entry points (valid_i mask) and entry points for a single ray k of a packet. */
+    template<int M, int Mx, int K, bool filter>
+    struct TriangleMiIntersectorKMoeller
+    {
+      typedef TriangleMi<M> Primitive; // triangle block type accepted by the functions below
+      typedef MoellerTrumboreIntersectorK<Mx,K> Precalculations; // provides intersectK() and intersect(ray,k,...)
+
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+      { // packet closest-hit: tests the ray packet against one triangle slot at a time
+        const Scene* scene = context->scene;
+        for (size_t i=0; i<Primitive::max_size(); i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(normal.trav_prims,1,popcnt(valid_i),RayHitK<K>::size());
+          const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene); // vertices of slot i as K-wide vectors
+          const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene);
+          const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene);
+          pre.intersectK(valid_i,ray,v0,v1,v2,/*UVIdentity<K>(),*/IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+      { // packet occlusion: returns the negation of the working mask valid0
+        vbool<K> valid0 = valid_i; // working copy; handed to intersectK and to the epilog
+        const Scene* scene = context->scene;
+
+        for (size_t i=0; i<Primitive::max_size(); i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(shadow.trav_prims,1,popcnt(valid_i),RayHitK<K>::size());
+          const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene); // vertices of slot i as K-wide vectors
+          const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene);
+          const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene);
+          pre.intersectK(valid0,ray,v0,v1,v2,/*UVIdentity<K>(),*/OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break; // no lane left in valid0: remaining slots are skipped
+        }
+        return !valid0; // NOTE(review): presumably the epilog clears lanes of occluded rays in valid0 - confirm in OccludedKEpilogM
+      }
+      
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      { // closest-hit for ray k of the packet against the whole block at once
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); // fetch the block's three vertex sets
+        pre.intersect(ray,k,v0,v1,v2,/*UVIdentity<Mx>(),*/Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      { // occlusion test for ray k of the packet against the whole block at once
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); // fetch the block's three vertex sets
+        return pre.intersect(ray,k,v0,v1,v2,/*UVIdentity<Mx>(),*/Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+
+    /*! Intersects M triangles with 1 ray. Same structure as the Moeller variant, but tests via PlueckerIntersector1 and passes a UVIdentity<Mx> argument. */
+    template<int M, int Mx, bool filter>
+    struct TriangleMiIntersector1Pluecker
+    {
+      typedef TriangleMi<M> Primitive; // triangle block type accepted by the functions below
+      typedef PlueckerIntersector1<Mx> Precalculations; // object whose intersect() member performs the test
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      { // closest-hit variant: result handling is delegated to Intersect1EpilogM (defined elsewhere)
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); // fetch the block's three vertex sets
+        pre.intersect(ray,v0,v1,v2,UVIdentity<Mx>(),Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      { // occlusion variant: returns whatever pre.intersect() reports when run with Occluded1EpilogM
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); // fetch the block's three vertex sets
+        return pre.intersect(ray,v0,v1,v2,UVIdentity<Mx>(),Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      { // pure forwarder to the generic point-query helper
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+    /*! Intersects M triangles with K rays via PlueckerIntersectorK. Offers packet entry points (valid_i mask) and entry points for a single ray k of a packet. */
+    template<int M, int Mx, int K, bool filter>
+    struct TriangleMiIntersectorKPluecker
+    {
+      typedef TriangleMi<M> Primitive; // triangle block type accepted by the functions below
+      typedef PlueckerIntersectorK<Mx,K> Precalculations; // provides intersectK() and intersect(ray,k,...)
+
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+      { // packet closest-hit: tests the ray packet against one triangle slot at a time
+        const Scene* scene = context->scene;
+        for (size_t i=0; i<Primitive::max_size(); i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(normal.trav_prims,1,popcnt(valid_i),RayHitK<K>::size());
+          const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene); // vertices of slot i as K-wide vectors
+          const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene);
+          const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene);
+          pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+      { // packet occlusion: returns the negation of the working mask valid0
+        vbool<K> valid0 = valid_i; // working copy; handed to intersectK and to the epilog
+        const Scene* scene = context->scene;
+
+        for (size_t i=0; i<Primitive::max_size(); i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(shadow.trav_prims,1,popcnt(valid_i),RayHitK<K>::size());
+          const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene); // vertices of slot i as K-wide vectors
+          const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene);
+          const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene);
+          pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break; // no lane left in valid0: remaining slots are skipped
+        }
+        return !valid0; // NOTE(review): presumably the epilog clears lanes of occluded rays in valid0 - confirm in OccludedKEpilogM
+      }
+
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      { // closest-hit for ray k of the packet against the whole block at once
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); // fetch the block's three vertex sets
+        pre.intersect(ray,k,v0,v1,v2,UVIdentity<Mx>(),Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      { // occlusion test for ray k of the packet against the whole block at once
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); // fetch the block's three vertex sets
+        return pre.intersect(ray,k,v0,v1,v2,UVIdentity<Mx>(),Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+
+    /*! Intersects M motion blur triangles with 1 ray. Differs from TriangleMiIntersector1Moeller only in passing ray.time() to gather(). */
+    template<int M, int Mx, bool filter>
+    struct TriangleMiMBIntersector1Moeller
+    {
+      typedef TriangleMi<M> Primitive; // triangle block type accepted by the functions below
+      typedef MoellerTrumboreIntersector1<Mx> Precalculations; // object whose intersect() member performs the test
+
+      /*! Intersect a ray with the M triangles and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); // vertices evaluated at the ray's time
+        pre.intersect(ray,v0,v1,v2,/*UVIdentity<Mx>(),*/Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of M triangles. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); // vertices evaluated at the ray's time
+        return pre.intersect(ray,v0,v1,v2,/*UVIdentity<Mx>(),*/Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      { // pure forwarder to the generic point-query helper
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+    /*! Intersects M motion blur triangles with K rays. Vertices are gathered per call using the ray time(s). */
+    template<int M, int Mx, int K, bool filter>
+    struct TriangleMiMBIntersectorKMoeller
+    {
+      typedef TriangleMi<M> Primitive; // triangle block type (the signatures below spell it out as TriangleMi<M>)
+      typedef MoellerTrumboreIntersectorK<Mx,K> Precalculations; // provides intersectK() and intersect(ray,k,...)
+
+      /*! Intersects K rays with M triangles. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        for (size_t i=0; i<TriangleMi<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          Vec3vf<K> v0,v1,v2; tri.gather(valid_i,v0,v1,v2,i,context->scene,ray.time()); // slot i at the packet's ray times
+          pre.intersectK(valid_i,ray,v0,v1,v2,/*UVIdentity<K>(),*/IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M triangles. Returns the negation of the working mask valid0. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        vbool<K> valid0 = valid_i; // working copy; handed to intersectK and to the epilog
+        for (size_t i=0; i<TriangleMi<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          Vec3vf<K> v0,v1,v2; tri.gather(valid_i,v0,v1,v2,i,context->scene,ray.time()); // note: gathers with the original mask valid_i
+          pre.intersectK(valid0,ray,v0,v1,v2,/*UVIdentity<K>(),*/OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break; // no lane left in valid0: remaining slots are skipped
+        }
+        return !valid0; // NOTE(review): presumably the epilog clears lanes of occluded rays in valid0 - confirm in OccludedKEpilogM
+      }
+
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); // whole block at the time of ray k
+        pre.intersect(ray,k,v0,v1,v2,/*UVIdentity<Mx>(),*/Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); // whole block at the time of ray k
+        return pre.intersect(ray,k,v0,v1,v2,/*UVIdentity<Mx>(),*/Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+
+    /*! Intersects M motion blur triangles with 1 ray via PlueckerIntersector1; vertices are gathered at ray.time(). */
+    template<int M, int Mx, bool filter>
+    struct TriangleMiMBIntersector1Pluecker
+    {
+      typedef TriangleMi<M> Primitive; // triangle block type accepted by the functions below
+      typedef PlueckerIntersector1<Mx> Precalculations; // object whose intersect() member performs the test
+
+      /*! Intersect a ray with the M triangles and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); // vertices evaluated at the ray's time
+        pre.intersect(ray,v0,v1,v2,UVIdentity<Mx>(),Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of M triangles. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); // vertices evaluated at the ray's time
+        return pre.intersect(ray,v0,v1,v2,UVIdentity<Mx>(),Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      { // pure forwarder to the generic point-query helper
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+    /*! Intersects M motion blur triangles with K rays via PlueckerIntersectorK. Vertices are gathered per call using the ray time(s). */
+    template<int M, int Mx, int K, bool filter>
+    struct TriangleMiMBIntersectorKPluecker
+    {
+      typedef TriangleMi<M> Primitive; // triangle block type (the signatures below spell it out as TriangleMi<M>)
+      typedef PlueckerIntersectorK<Mx,K> Precalculations; // provides intersectK() and intersect(ray,k,...)
+
+      /*! Intersects K rays with M triangles. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        for (size_t i=0; i<TriangleMi<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          Vec3vf<K> v0,v1,v2; tri.gather(valid_i,v0,v1,v2,i,context->scene,ray.time()); // slot i at the packet's ray times
+          pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M triangles. Returns the negation of the working mask valid0. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        vbool<K> valid0 = valid_i; // working copy; handed to intersectK and to the epilog
+        for (size_t i=0; i<TriangleMi<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          Vec3vf<K> v0,v1,v2; tri.gather(valid_i,v0,v1,v2,i,context->scene,ray.time()); // note: gathers with the original mask valid_i
+          pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break; // no lane left in valid0: remaining slots are skipped
+        }
+        return !valid0; // NOTE(review): presumably the epilog clears lanes of occluded rays in valid0 - confirm in OccludedKEpilogM
+      }
+      
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); // whole block at the time of ray k
+        pre.intersect(ray,k,v0,v1,v2,UVIdentity<Mx>(),Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); // whole block at the time of ray k
+        return pre.intersect(ray,k,v0,v1,v2,UVIdentity<Mx>(),Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev.h
new file mode 100644
index 0000000000..19af389e73
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/trianglev.h
@@ -0,0 +1,157 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  /* Stores the vertices of M triangles in struct of array layout, together with one geometry ID and one primitive ID per triangle */
+  template <int M>
+  struct TriangleMv
+  { 
+  public:
+    struct Type : public PrimitiveType 
+    {
+      const char* name() const; // the four members are only declared here; their definitions are outside this header excerpt
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type; // defined after the struct
+
+  public:
+
+    /* Returns maximum number of stored triangles */
+    static __forceinline size_t max_size() { return M; }
+    
+    /* Returns required number of primitive blocks for N primitives (N divided by max_size(), rounded up) */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+   
+  public:
+
+    /* Default constructor; initializes no member explicitly */
+    __forceinline TriangleMv() {}
+
+    /* Construction from vertices and IDs */
+    __forceinline TriangleMv(const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const vuint<M>& geomIDs, const vuint<M>& primIDs)
+      : v0(v0), v1(v1), v2(v2), geomIDs(geomIDs), primIDs(primIDs) {}
+    
+    /* Returns a mask that tells which triangles are valid; a slot is invalid when its geometry ID equals -1 */
+    __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); }
+
+    /* Returns true if the specified triangle is valid */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; }
+
+    /* Returns the number of stored triangles (NOTE(review): assumes bsf finds the lowest set bit, i.e. the first invalid slot - confirm) */
+    __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+    /* Returns the geometry IDs */
+    __forceinline       vuint<M>& geomID()       { return geomIDs; }
+    __forceinline const vuint<M>& geomID() const { return geomIDs; }
+    __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+    /* Returns the primitive IDs */
+    __forceinline       vuint<M>& primID()       { return primIDs; }
+    __forceinline const vuint<M>& primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* Calculate the bounds of the triangles */
+    __forceinline BBox3fa bounds() const 
+    {
+      Vec3vf<M> lower = min(v0,v1,v2);
+      Vec3vf<M> upper = max(v0,v1,v2);
+      vbool<M> mask = valid();
+      lower.x = select(mask,lower.x,vfloat<M>(pos_inf)); // invalid slots get +inf / -inf so they cannot win the reductions below
+      lower.y = select(mask,lower.y,vfloat<M>(pos_inf));
+      lower.z = select(mask,lower.z,vfloat<M>(pos_inf));
+      upper.x = select(mask,upper.x,vfloat<M>(neg_inf));
+      upper.y = select(mask,upper.y,vfloat<M>(neg_inf));
+      upper.z = select(mask,upper.z,vfloat<M>(neg_inf));
+      return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)),
+                     Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z)));
+    }
+    
+    /* Non temporal store: copies every member of src to dst through the store_nt helpers */
+    __forceinline static void store_nt(TriangleMv* dst, const TriangleMv& src)
+    {
+      vfloat<M>::store_nt(&dst->v0.x,src.v0.x);
+      vfloat<M>::store_nt(&dst->v0.y,src.v0.y);
+      vfloat<M>::store_nt(&dst->v0.z,src.v0.z);
+      vfloat<M>::store_nt(&dst->v1.x,src.v1.x);
+      vfloat<M>::store_nt(&dst->v1.y,src.v1.y);
+      vfloat<M>::store_nt(&dst->v1.z,src.v1.z);
+      vfloat<M>::store_nt(&dst->v2.x,src.v2.x);
+      vfloat<M>::store_nt(&dst->v2.y,src.v2.y);
+      vfloat<M>::store_nt(&dst->v2.z,src.v2.z);
+      vuint<M>::store_nt(&dst->geomIDs,src.geomIDs);
+      vuint<M>::store_nt(&dst->primIDs,src.primIDs);
+    }
+
+    /* Fill triangle from triangle list: consumes up to M entries of prims[begin..end) and advances begin past them */
+    __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      vuint<M> vgeomID = -1, vprimID = -1; // slots that are not filled keep ID -1 (invalid)
+      Vec3vf<M> v0 = zero, v1 = zero, v2 = zero; // and zero vertices
+      
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+	const PrimRef& prim = prims[begin];
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const TriangleMesh* __restrict__ const mesh = scene->get<TriangleMesh>(geomID);
+        const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+        const Vec3fa& p0 = mesh->vertex(tri.v[0]); // look up the three corner positions through the index triple
+        const Vec3fa& p1 = mesh->vertex(tri.v[1]);
+        const Vec3fa& p2 = mesh->vertex(tri.v[2]);
+        vgeomID [i] = geomID;
+        vprimID [i] = primID;
+        v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; // scatter into lane i of the struct-of-array vectors
+        v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+        v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+      }
+      TriangleMv::store_nt(this,TriangleMv(v0,v1,v2,vgeomID,vprimID)); // write the assembled block over *this
+    }
+
+    /* Updates the primitive: re-reads the vertices of every stored triangle from mesh and returns their merged bounds */
+    __forceinline BBox3fa update(TriangleMesh* mesh)
+    {
+      BBox3fa bounds = empty;
+      vuint<M> vgeomID = -1, vprimID = -1; // slots past the last stored triangle stay invalid
+      Vec3vf<M> v0 = zero, v1 = zero, v2 = zero;
+      
+      for (size_t i=0; i<M; i++)
+      {
+        if (primID(i) == -1) break; // first slot with primitive ID -1 ends the stored triangles
+        const unsigned geomId = geomID(i);
+        const unsigned primId = primID(i);
+        const TriangleMesh::Triangle& tri = mesh->triangle(primId);
+        const Vec3fa p0 = mesh->vertex(tri.v[0]);
+        const Vec3fa p1 = mesh->vertex(tri.v[1]);
+        const Vec3fa p2 = mesh->vertex(tri.v[2]);
+        bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2)));
+        vgeomID [i] = geomId;
+        vprimID [i] = primId;
+        v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+        v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+        v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+      }
+      new (this) TriangleMv(v0,v1,v2,vgeomID,vprimID); // rebuild the block in place
+      return bounds;
+    }
+   
+  public:
+    Vec3vf<M> v0;      // 1st vertex of the triangles
+    Vec3vf<M> v1;      // 2nd vertex of the triangles
+    Vec3vf<M> v2;      // 3rd vertex of the triangles
+  private:
+    vuint<M> geomIDs; // geometry ID per triangle; -1 marks an invalid slot
+    vuint<M> primIDs; // primitive ID per triangle
+  };
+
+  template<int M> // out-of-class definition of the static member TriangleMv<M>::type
+  typename TriangleMv<M>::Type TriangleMv<M>::type;
+
+  typedef TriangleMv<4> Triangle4v; // shorthand for the M=4 instantiation
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev_intersector.h
new file mode 100644
index 0000000000..6af0d5a11c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/trianglev_intersector.h
@@ -0,0 +1,206 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "triangle_intersector_pluecker.h"
+#include "triangle_intersector_moeller.h"
+#include "triangle_intersector_woop.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Intersects M triangles with 1 ray. The vertices stored in the TriangleMv block are passed straight to MoellerTrumboreIntersector1. */
+    template<int M, int Mx, bool filter>
+    struct TriangleMvIntersector1Moeller
+    {
+      typedef TriangleMv<M> Primitive; // triangle block type accepted by the functions below
+      typedef MoellerTrumboreIntersector1<Mx> Precalculations; // object whose intersect() member performs the test
+
+      /*! Intersect a ray with M triangles and updates the hit. pre is taken by const reference, matching occluded() which makes the same pre.intersect() call. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect(ray,tri.v0,tri.v1,tri.v2,/*UVIdentity<Mx>(),*/Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.intersect(ray,tri.v0,tri.v1,tri.v2,/*UVIdentity<Mx>(),*/Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      { // pure forwarder to the generic point-query helper
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+
+    template<int M, int Mx, bool filter> // 1-ray intersector for TriangleMv blocks that calls the static WoopIntersector1 functions
+    struct TriangleMvIntersector1Woop
+    {
+      typedef TriangleMv<M> Primitive; // triangle block type accepted by the functions below
+      typedef WoopIntersector1<Mx> intersec; // its static intersect() is called below with (ray, pre, vertices, epilog)
+      typedef WoopPrecalculations1<M> Precalculations; // passed to intersec::intersect() as second argument
+
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        intersec::intersect(ray,pre,tri.v0,tri.v1,tri.v2,Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return intersec::intersect(ray,pre,tri.v0,tri.v1,tri.v2,Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      { // pure forwarder to the generic point-query helper
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+
+    /*! Intersects M triangles with K rays. Packet entry points broadcast one stored triangle at a time; single-ray entry points test the whole block. */
+    template<int M, int Mx, int K, bool filter>
+    struct TriangleMvIntersectorKMoeller
+    {
+      typedef TriangleMv<M> Primitive; // triangle block type accepted by the functions below
+      typedef MoellerTrumboreIntersectorK<Mx,K> Precalculations; // provides intersectK() and intersect(ray,k,...)
+
+      /*! Intersects K rays with M triangles. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        for (size_t i=0; i<M; i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i); // vertex of slot i as a K-wide vector
+          const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i);
+          const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i);
+          pre.intersectK(valid_i,ray,v0,v1,v2,/*UVIdentity<K>(),*/IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M triangles. Returns the negation of the working mask valid0. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        vbool<K> valid0 = valid_i; // working copy; handed to intersectK and to the epilog
+
+        for (size_t i=0; i<M; i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(shadow.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i); // vertex of slot i as a K-wide vector
+          const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i);
+          const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i);
+          pre.intersectK(valid0,ray,v0,v1,v2,/*UVIdentity<K>(),*/OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break; // no lane left in valid0: remaining slots are skipped
+        }
+        return !valid0; // NOTE(review): presumably the epilog clears lanes of occluded rays in valid0 - confirm in OccludedKEpilogM
+      }
+      
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,/*UVIdentity<Mx>(),*/Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,/*UVIdentity<Mx>(),*/Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx
+      }
+    };
+
+    /*! Intersects M triangles with 1 ray. The vertices stored in the TriangleMv block are passed straight to PlueckerIntersector1 together with UVIdentity<Mx>. */
+    template<int M, int Mx, bool filter>
+    struct TriangleMvIntersector1Pluecker
+    {
+      typedef TriangleMv<M> Primitive; // triangle block type accepted by the functions below
+      typedef PlueckerIntersector1<Mx> Precalculations; // object whose intersect() member performs the test
+
+      /*! Intersect a ray with M triangles and updates the hit. pre is taken by const reference, matching occluded() which makes the same pre.intersect() call. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect(ray,tri.v0,tri.v1,tri.v2,UVIdentity<Mx>(),Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.intersect(ray,tri.v0,tri.v1,tri.v2,UVIdentity<Mx>(),Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      { // pure forwarder to the generic point-query helper
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+    /*! Intersects M triangles with K rays via PlueckerIntersectorK. Packet entry points broadcast one stored triangle at a time; single-ray entry points test the whole block. */
+    template<int M, int Mx, int K, bool filter>
+    struct TriangleMvIntersectorKPluecker
+    {
+      typedef TriangleMv<M> Primitive; // triangle block type accepted by the functions below
+      typedef PlueckerIntersectorK<Mx,K> Precalculations; // provides intersectK() and intersect(ray,k,...)
+
+      /*! Intersects K rays with M triangles. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        for (size_t i=0; i<M; i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i); // vertex of slot i as a K-wide vector
+          const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i);
+          const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i);
+          pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M triangles. Returns the negation of the working mask valid0. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        vbool<K> valid0 = valid_i; // working copy; handed to intersectK and to the epilog
+
+        for (size_t i=0; i<M; i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(shadow.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i); // vertex of slot i as a K-wide vector
+          const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i);
+          const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i);
+          pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break; // no lane left in valid0: remaining slots are skipped
+        }
+        return !valid0; // NOTE(review): presumably the epilog clears lanes of occluded rays in valid0 - confirm in OccludedKEpilogM
+      }
+      
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,UVIdentity<Mx>(),Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,UVIdentity<Mx>(),Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb.h
new file mode 100644
index 0000000000..63137aee16
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb.h
@@ -0,0 +1,201 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  /* Motion blur triangle block: stores the t0 vertices of M triangles plus per-vertex difference vectors to t1, in struct of array layout */
+  template<int M>
+  struct TriangleMvMB
+  {
+  public:
+    struct Type : public PrimitiveType // primitive type descriptor; the member functions are only declared here
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+
+    static Type type;
+
+  public:
+
+    /* primitive supports single time segments */
+    static const bool singleTimeSegment = true;
+
+    /* Returns maximum number of stored triangles */
+    static __forceinline size_t max_size() { return M; }
+    
+    /* Returns required number of primitive blocks for N primitives (N divided by max_size(), rounded up) */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+   
+  public:
+
+    /* Default constructor (empty body) */
+    __forceinline TriangleMvMB() {}
+
+    /* Construction from vertices and IDs: a0,b0,c0 are the three vertices at t0 and a1,b1,c1 at t1; only the differences (a1-a0 etc.) are stored for t1 */
+    __forceinline TriangleMvMB(const Vec3vf<M>& a0, const Vec3vf<M>& a1,
+                               const Vec3vf<M>& b0, const Vec3vf<M>& b1,
+                               const Vec3vf<M>& c0, const Vec3vf<M>& c1,
+                               const vuint<M>& geomIDs, const vuint<M>& primIDs)
+      : v0(a0), v1(b0), v2(c0), dv0(a1-a0), dv1(b1-b0), dv2(c1-c0), geomIDs(geomIDs), primIDs(primIDs) {}
+
+    /* Returns a mask that tells which triangles are valid (a slot is valid when its geomID differs from -1) */
+    __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); }
+
+    /* Returns if the specified triangle is valid (same geomID != -1 test for a single slot) */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; }
+
+    /* Returns the number of stored triangles, computed as bsf of the inverted valid mask (presumably the index of the first invalid slot - confirm bsf semantics) */
+    __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+    /* Returns the geometry IDs (whole vector, or the entry of slot i) */
+    __forceinline       vuint<M>& geomID()       { return geomIDs; }
+    __forceinline const vuint<M>& geomID() const { return geomIDs; }
+    __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+    /* Returns the primitive IDs (whole vector, or the entry of slot i) */
+    __forceinline       vuint<M>& primID()       { return primIDs; }
+    __forceinline const vuint<M>& primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* Calculate the bounds of the triangles at t0; invalid slots are replaced by +inf/-inf before the reductions */
+    __forceinline BBox3fa bounds0() const 
+    {
+      Vec3vf<M> lower = min(v0,v1,v2);
+      Vec3vf<M> upper = max(v0,v1,v2);
+      const vbool<M> mask = valid();
+      lower.x = select(mask,lower.x,vfloat<M>(pos_inf));
+      lower.y = select(mask,lower.y,vfloat<M>(pos_inf));
+      lower.z = select(mask,lower.z,vfloat<M>(pos_inf));
+      upper.x = select(mask,upper.x,vfloat<M>(neg_inf));
+      upper.y = select(mask,upper.y,vfloat<M>(neg_inf));
+      upper.z = select(mask,upper.z,vfloat<M>(neg_inf));
+      return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)),
+		     Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z)));
+    }
+
+    /* Calculate the bounds of the triangles at t1 (vertices reconstructed as v+dv); invalid slots are masked as in bounds0 */
+    __forceinline BBox3fa bounds1() const 
+    {
+      const Vec3vf<M> p0 = v0+dv0;
+      const Vec3vf<M> p1 = v1+dv1;
+      const Vec3vf<M> p2 = v2+dv2;
+      Vec3vf<M> lower = min(p0,p1,p2);
+      Vec3vf<M> upper = max(p0,p1,p2);
+      const vbool<M> mask = valid();
+      lower.x = select(mask,lower.x,vfloat<M>(pos_inf));
+      lower.y = select(mask,lower.y,vfloat<M>(pos_inf));
+      lower.z = select(mask,lower.z,vfloat<M>(pos_inf));
+      upper.x = select(mask,upper.x,vfloat<M>(neg_inf));
+      upper.y = select(mask,upper.y,vfloat<M>(neg_inf));
+      upper.z = select(mask,upper.z,vfloat<M>(neg_inf));
+      return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)),
+		     Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z)));
+    }
+
+    /* Calculate the linear bounds of the primitive from the bounds at t0 and at t1 */
+    __forceinline LBBox3fa linearBounds() const {
+      return LBBox3fa(bounds0(),bounds1());
+    }
+
+    /* Fills this block with up to M triangles from prims[begin,end) using time steps itime and itime+1; advances begin and returns the bounds at both time steps */
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime)
+    {
+      vuint<M> vgeomID = -1, vprimID = -1; // unused slots keep ID -1, which valid() treats as invalid
+      Vec3vf<M> va0 = zero, vb0 = zero, vc0 = zero;
+      Vec3vf<M> va1 = zero, vb1 = zero, vc1 = zero;
+
+      BBox3fa bounds0 = empty;
+      BBox3fa bounds1 = empty;
+      
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+	const PrimRef& prim = prims[begin];
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const TriangleMesh* __restrict__ const mesh = scene->get<TriangleMesh>(geomID);
+        const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+        const Vec3fa& a0 = mesh->vertex(tri.v[0],itime+0); bounds0.extend(a0);
+        const Vec3fa& a1 = mesh->vertex(tri.v[0],itime+1); bounds1.extend(a1);
+        const Vec3fa& b0 = mesh->vertex(tri.v[1],itime+0); bounds0.extend(b0);
+        const Vec3fa& b1 = mesh->vertex(tri.v[1],itime+1); bounds1.extend(b1);
+        const Vec3fa& c0 = mesh->vertex(tri.v[2],itime+0); bounds0.extend(c0);
+        const Vec3fa& c1 = mesh->vertex(tri.v[2],itime+1); bounds1.extend(c1);
+        vgeomID [i] = geomID;
+        vprimID [i] = primID;
+        va0.x[i] = a0.x; va0.y[i] = a0.y; va0.z[i] = a0.z;
+	va1.x[i] = a1.x; va1.y[i] = a1.y; va1.z[i] = a1.z;
+	vb0.x[i] = b0.x; vb0.y[i] = b0.y; vb0.z[i] = b0.z;
+	vb1.x[i] = b1.x; vb1.y[i] = b1.y; vb1.z[i] = b1.z;
+	vc0.x[i] = c0.x; vc0.y[i] = c0.y; vc0.z[i] = c0.z;
+	vc1.x[i] = c1.x; vc1.y[i] = c1.y; vc1.z[i] = c1.z;
+      }
+      new (this) TriangleMvMB(va0,va1,vb0,vb1,vc0,vc1,vgeomID,vprimID); // construct this block in place from the gathered lanes
+      return LBBox3fa(bounds0,bounds1);
+    }
+
+    /* Fills this block with up to M triangles from prims[begin,end) for the given time_range; advances begin and returns the merged linear bounds of the stored triangles */
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range)
+    {
+      vuint<M> vgeomID = -1, vprimID = -1; // unused slots keep ID -1, which valid() treats as invalid
+      Vec3vf<M> va0 = zero, vb0 = zero, vc0 = zero;
+      Vec3vf<M> va1 = zero, vb1 = zero, vc1 = zero;
+
+      LBBox3fa allBounds = empty;
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+        const PrimRefMB& prim = prims[begin];
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const TriangleMesh* const mesh = scene->get<TriangleMesh>(geomID);
+        const range<int> itime_range = mesh->timeSegmentRange(time_range);
+        assert(itime_range.size() == 1); // exactly one time segment is expected here
+        const int ilower = itime_range.begin();
+        const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+        allBounds.extend(mesh->linearBounds(primID, time_range));
+        const Vec3fa& a0 = mesh->vertex(tri.v[0],ilower+0);
+        const Vec3fa& a1 = mesh->vertex(tri.v[0],ilower+1);
+        const Vec3fa& b0 = mesh->vertex(tri.v[1],ilower+0);
+        const Vec3fa& b1 = mesh->vertex(tri.v[1],ilower+1);
+        const Vec3fa& c0 = mesh->vertex(tri.v[2],ilower+0);
+        const Vec3fa& c1 = mesh->vertex(tri.v[2],ilower+1);
+        const BBox1f time_range_v(mesh->timeStep(ilower+0),mesh->timeStep(ilower+1));
+        auto a01 = globalLinear(std::make_pair(a0,a1),time_range_v); // NOTE(review): presumably re-expresses the segment endpoints over the global time range - confirm in globalLinear
+        auto b01 = globalLinear(std::make_pair(b0,b1),time_range_v);
+        auto c01 = globalLinear(std::make_pair(c0,c1),time_range_v);
+        vgeomID [i] = geomID;
+        vprimID [i] = primID;
+        va0.x[i] = a01.first .x; va0.y[i] = a01.first .y; va0.z[i] = a01.first .z;
+	va1.x[i] = a01.second.x; va1.y[i] = a01.second.y; va1.z[i] = a01.second.z;
+	vb0.x[i] = b01.first .x; vb0.y[i] = b01.first .y; vb0.z[i] = b01.first .z;
+	vb1.x[i] = b01.second.x; vb1.y[i] = b01.second.y; vb1.z[i] = b01.second.z;
+	vc0.x[i] = c01.first .x; vc0.y[i] = c01.first .y; vc0.z[i] = c01.first .z;
+	vc1.x[i] = c01.second.x; vc1.y[i] = c01.second.y; vc1.z[i] = c01.second.z;
+      }
+      new (this) TriangleMvMB(va0,va1,vb0,vb1,vc0,vc1,vgeomID,vprimID); // construct this block in place from the gathered lanes
+      return allBounds;
+    }
+
+  public:
+    Vec3vf<M> v0;      // 1st vertex of the triangles
+    Vec3vf<M> v1;      // 2nd vertex of the triangles
+    Vec3vf<M> v2;      // 3rd vertex of the triangles
+    Vec3vf<M> dv0;     // difference vector between time steps t0 and t1 for first vertex
+    Vec3vf<M> dv1;     // difference vector between time steps t0 and t1 for second vertex
+    Vec3vf<M> dv2;     // difference vector between time steps t0 and t1 for third vertex
+  private:
+    vuint<M> geomIDs; // geometry ID (-1 marks an unused slot)
+    vuint<M> primIDs; // primitive ID
+  };
+
+  template<int M>
+  typename TriangleMvMB<M>::Type TriangleMvMB<M>::type;
+
+  typedef TriangleMvMB<4> Triangle4vMB;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb_intersector.h
new file mode 100644
index 0000000000..35a260d826
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb_intersector.h
@@ -0,0 +1,211 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "intersector_epilog.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Single-ray intersector for blocks of M motion blur triangles, built on MoellerTrumboreIntersector1. */
+    template<int M, int Mx, bool filter>
+    struct TriangleMvMBIntersector1Moeller
+    {
+      typedef TriangleMvMB<M> Primitive;
+      typedef MoellerTrumboreIntersector1<Mx> Precalculations;
+
+      /*! Evaluates the vertices at the ray time via madd(time,dv,v), intersects them and passes the result to Intersect1EpilogM. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const Vec3vf<Mx> t(ray.time());
+        const Vec3vf<Mx> p0 = madd(t,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+        const Vec3vf<Mx> p1 = madd(t,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+        const Vec3vf<Mx> p2 = madd(t,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+        pre.intersect(ray,p0,p1,p2,Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Same vertex evaluation as intersect, but returns the result of pre.intersect run with Occluded1EpilogM. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const Vec3vf<Mx> t(ray.time());
+        const Vec3vf<Mx> p0 = madd(t,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+        const Vec3vf<Mx> p1 = madd(t,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+        const Vec3vf<Mx> p2 = madd(t,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+        return pre.intersect(ray,p0,p1,p2,Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      /*! Forwards the point query to PrimitivePointQuery1. */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+    
+    /*! K-ray packet intersector for blocks of M motion blur triangles, built on MoellerTrumboreIntersectorK. */
+    template<int M, int Mx, int K, bool filter>
+    struct TriangleMvMBIntersectorKMoeller
+    {
+      typedef TriangleMvMB<M> Primitive;
+      typedef MoellerTrumboreIntersectorK<Mx,K> Precalculations;
+
+      /*! Intersects the K rays with one triangle of the block at a time; IntersectKEpilogM receives the ray and the IDs. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        for (size_t i=0; i<Primitive::max_size() && tri.valid(i); i++)
+        {
+          /* the loop condition stops at the first invalid triangle slot */
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> t(ray.time());
+          const Vec3vf<K> p0 = madd(t,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i));
+          const Vec3vf<K> p1 = madd(t,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i));
+          const Vec3vf<K> p2 = madd(t,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i));
+          pre.intersectK(valid_i,ray,p0,p1,p2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      /*! Occlusion test for the K rays; returns the complement of the working mask that is shared with OccludedKEpilogM. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        vbool<K> active = valid_i;
+
+        for (size_t i=0; i<Primitive::max_size() && tri.valid(i); i++)
+        {
+          /* the loop condition stops at the first invalid triangle slot */
+          STAT3(shadow.trav_prims,1,popcnt(active),K);
+          const Vec3vf<K> t(ray.time());
+          const Vec3vf<K> p0 = madd(t,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i));
+          const Vec3vf<K> p1 = madd(t,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i));
+          const Vec3vf<K> p2 = madd(t,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i));
+          pre.intersectK(active,ray,p0,p1,p2,OccludedKEpilogM<M,K,filter>(active,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(active)) break;
+        }
+        return !active;
+      }
+      
+      /*! Intersects ray k of the packet with all M triangles at once, using that ray's time. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const Vec3vf<Mx> t(ray.time()[k]);
+        const Vec3vf<Mx> p0 = madd(t,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+        const Vec3vf<Mx> p1 = madd(t,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+        const Vec3vf<Mx> p2 = madd(t,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+        pre.intersect(ray,k,p0,p1,p2,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Occlusion test of ray k of the packet against all M triangles at once, using that ray's time. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const Vec3vf<Mx> t(ray.time()[k]);
+        const Vec3vf<Mx> p0 = madd(t,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+        const Vec3vf<Mx> p1 = madd(t,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+        const Vec3vf<Mx> p2 = madd(t,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+        return pre.intersect(ray,k,p0,p1,p2,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+
+    /*! Single-ray intersector for blocks of M motion blur triangles, built on PlueckerIntersector1. */
+    template<int M, int Mx, bool filter>
+    struct TriangleMvMBIntersector1Pluecker
+    {
+      typedef TriangleMvMB<M> Primitive;
+      typedef PlueckerIntersector1<Mx> Precalculations;
+
+      /*! Evaluates the vertices at the ray time via madd(time,dv,v), intersects them and passes the result to Intersect1EpilogM. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const Vec3vf<Mx> t(ray.time());
+        const Vec3vf<Mx> p0 = madd(t,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+        const Vec3vf<Mx> p1 = madd(t,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+        const Vec3vf<Mx> p2 = madd(t,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+        pre.intersect(ray,p0,p1,p2,UVIdentity<Mx>(),Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Same vertex evaluation as intersect, but returns the result of pre.intersect run with Occluded1EpilogM. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const Vec3vf<Mx> t(ray.time());
+        const Vec3vf<Mx> p0 = madd(t,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+        const Vec3vf<Mx> p1 = madd(t,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+        const Vec3vf<Mx> p2 = madd(t,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+        return pre.intersect(ray,p0,p1,p2,UVIdentity<Mx>(),Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      /*! Forwards the point query to PrimitivePointQuery1. */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+    
+    /*! K-ray packet intersector for blocks of M motion blur triangles, built on PlueckerIntersectorK. */
+    template<int M, int Mx, int K, bool filter>
+    struct TriangleMvMBIntersectorKPluecker
+    {
+      typedef TriangleMvMB<M> Primitive;
+      typedef PlueckerIntersectorK<Mx,K> Precalculations;
+
+      /*! Intersects the K rays with one triangle of the block at a time; IntersectKEpilogM receives the ray and the IDs. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        for (size_t i=0; i<Primitive::max_size() && tri.valid(i); i++)
+        {
+          /* the loop condition stops at the first invalid triangle slot */
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> t(ray.time());
+          const Vec3vf<K> p0 = madd(t,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i));
+          const Vec3vf<K> p1 = madd(t,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i));
+          const Vec3vf<K> p2 = madd(t,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i));
+          pre.intersectK(valid_i,ray,p0,p1,p2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      /*! Occlusion test for the K rays; returns the complement of the working mask that is shared with OccludedKEpilogM. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        vbool<K> active = valid_i;
+
+        for (size_t i=0; i<Primitive::max_size() && tri.valid(i); i++)
+        {
+          /* the loop condition stops at the first invalid triangle slot */
+          STAT3(shadow.trav_prims,1,popcnt(active),K);
+          const Vec3vf<K> t(ray.time());
+          const Vec3vf<K> p0 = madd(t,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i));
+          const Vec3vf<K> p1 = madd(t,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i));
+          const Vec3vf<K> p2 = madd(t,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i));
+          pre.intersectK(active,ray,p0,p1,p2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(active,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(active)) break;
+        }
+        return !active;
+      }
+
+      /*! Intersects ray k of the packet with all M triangles at once, using that ray's time. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const Vec3vf<Mx> t(ray.time()[k]);
+        const Vec3vf<Mx> p0 = madd(t,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+        const Vec3vf<Mx> p1 = madd(t,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+        const Vec3vf<Mx> p2 = madd(t,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+        pre.intersect(ray,k,p0,p1,p2,UVIdentity<Mx>(),Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Occlusion test of ray k of the packet against all M triangles at once, using that ray's time. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const Vec3vf<Mx> t(ray.time()[k]);
+        const Vec3vf<Mx> p0 = madd(t,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+        const Vec3vf<Mx> p1 = madd(t,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+        const Vec3vf<Mx> p2 = madd(t,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+        return pre.intersect(ray,k,p0,p1,p2,UVIdentity<Mx>(),Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/hash.h b/thirdparty/embree-aarch64/kernels/hash.h
new file mode 100644
index 0000000000..4abbe203d6
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/hash.h
@@ -0,0 +1,5 @@
+
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#define RTC_HASH "6ef362f99af80c9dfe8dd2bfc582d9067897edc6"
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bezier_curve.h b/thirdparty/embree-aarch64/kernels/subdiv/bezier_curve.h
new file mode 100644
index 0000000000..c0e78820f8
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/bezier_curve.h
@@ -0,0 +1,669 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/scene_curves.h"
+
+namespace embree
+{
+  class BezierBasis
+  {
+  public:
+    /* The four cubic Bezier basis weights at u: (1-u)^3, 3u(1-u)^2, 3u^2(1-u), u^3 */
+    template<typename T>
+      static __forceinline Vec4<T> eval(const T& u) 
+    {
+      const T s = u;
+      const T r = 1.0f-s;
+      const T w0 = r * r * r;
+      const T w1 = 3.0f * s * (r * r);
+      const T w2 = 3.0f * (s * s) * r;
+      const T w3 = s * s * s;
+      return Vec4<T>(w0,w1,w2,w3);
+    }
+    /* First derivative of the weights with respect to u (assuming madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c) */
+    template<typename T>
+      static __forceinline Vec4<T>  derivative(const T& u)
+    {
+      const T s = u;
+      const T r = 1.0f-s;
+      const T d0 = -(r*r);
+      const T d1 = madd(-2.0f,r*s,r*r);
+      const T d2 = msub(+2.0f,r*s,s*s);
+      const T d3 = +(s*s);
+      return T(3.0f)*Vec4<T>(d0,d1,d2,d3);
+    }
+    /* Second derivative of the weights with respect to u (same madd assumption as above) */
+    template<typename T>
+      static __forceinline Vec4<T>  derivative2(const T& u)
+    {
+      const T s = u;
+      const T r = 1.0f-s;
+      const T dd0 = r;
+      const T dd1 = madd(-2.0f,r,s);
+      const T dd2 = madd(-2.0f,s,r);
+      const T dd3 = s;
+      return T(6.0f)*Vec4<T>(dd0,dd1,dd2,dd3);
+    }
+  };
+  
+  struct PrecomputedBezierBasis // tables of (N+1)x(N+1) floats for the Bezier basis and its derivative
+  {
+    enum { N = 16 }; // every table below has N+1 = 17 rows and columns
+  public:
+    PrecomputedBezierBasis() {} // empty body: the tables are not written here
+    PrecomputedBezierBasis(int shift); // defined elsewhere; NOTE(review): presumably fills the tables - confirm in the definition
+
+    /* basis for bezier evaluation (one table per control point weight) */
+  public:
+    float c0[N+1][N+1];
+    float c1[N+1][N+1];
+    float c2[N+1][N+1];
+    float c3[N+1][N+1];
+    
+    /* basis for bezier derivative evaluation (one table per control point weight) */
+  public:
+    float d0[N+1][N+1];
+    float d1[N+1][N+1];
+    float d2[N+1][N+1];
+    float d3[N+1][N+1];
+  };
+  extern PrecomputedBezierBasis bezier_basis0;
+  extern PrecomputedBezierBasis bezier_basis1;
+
+  
+  template<typename V>
+    struct LinearBezierCurve
+    {
+      V v0,v1;
+      /* constructors: default (empty body), from two control points, and copy */
+      __forceinline LinearBezierCurve () {}
+      __forceinline LinearBezierCurve (const V& v0, const V& v1)
+        : v0(v0), v1(v1) {}
+      __forceinline LinearBezierCurve (const LinearBezierCurve& rhs)
+        : v0(rhs.v0), v1(rhs.v1) {}
+      /* member-wise copy assignment */
+      __forceinline LinearBezierCurve& operator= (const LinearBezierCurve& rhs) {
+        v0 = rhs.v0;
+        v1 = rhs.v1;
+        return *this;
+      }
+      /* first and last control point */
+      __forceinline V begin() const { return v0; }
+      __forceinline V end  () const { return v1; }
+      /* only declared here; a specialization for V = Interval1f follows this struct */
+      bool hasRoot() const;
+      /* prints both control points */
+      friend embree_ostream operator<<(embree_ostream os, const LinearBezierCurve& curve) {
+        return os << "LinearBezierCurve (" << curve.v0 << ", " << curve.v1 << ")";
+      }
+    };
+  
+  template<> __forceinline bool LinearBezierCurve<Interval1f>::hasRoot() const {
+    return numRoots(v0,v1) != 0; // true when numRoots returns a nonzero value
+  }
+  
+  template<typename V>
+    struct QuadraticBezierCurve
+    {
+      V v0,v1,v2; // the three control points
+      
+      __forceinline QuadraticBezierCurve () {}
+      
+      __forceinline QuadraticBezierCurve (const QuadraticBezierCurve& other)
+        : v0(other.v0), v1(other.v1), v2(other.v2) {}
+      
+      __forceinline QuadraticBezierCurve& operator= (const QuadraticBezierCurve& other) {
+        v0 = other.v0; v1 = other.v1; v2 = other.v2; return *this;
+      }
+        
+        __forceinline QuadraticBezierCurve (const V& v0, const V& v1, const V& v2)
+          : v0(v0), v1(v1), v2(v2) {}
+      
+      __forceinline V begin() const { return v0; } // first control point
+      __forceinline V end  () const { return v2; } // last control point
+      /* merge() of the three control points */
+      __forceinline V interval() const {
+        return merge(v0,v1,v2);
+      }
+      /* merge() of the BBox<V> of each control point */
+      __forceinline BBox<V> bounds() const {
+        return merge(BBox<V>(v0),BBox<V>(v1),BBox<V>(v2));
+      }
+      /* Prints the control points. The earlier body also read a.u.lower/a.u.upper, but this struct has no member u, so it could not compile once instantiated. */
+      friend embree_ostream operator<<(embree_ostream cout, const QuadraticBezierCurve& a) {
+        return cout << "QuadraticBezierCurve (" << a.v0 << ", " << a.v1 << ", " << a.v2 << ")";
+      }
+    };
+  
+  
+  typedef QuadraticBezierCurve<float> QuadraticBezierCurve1f;
+  typedef QuadraticBezierCurve<Vec2fa> QuadraticBezierCurve2fa;
+  typedef QuadraticBezierCurve<Vec3fa> QuadraticBezierCurve3fa;
+
+  template<typename Vertex>
+    struct CubicBezierCurve
+    {
+      Vertex v0,v1,v2,v3;
+      
+      __forceinline CubicBezierCurve() {}
+
+      template<typename T1>
+      __forceinline CubicBezierCurve (const CubicBezierCurve<T1>& other)
+      : v0(other.v0), v1(other.v1), v2(other.v2), v3(other.v3) {}
+      
+      __forceinline CubicBezierCurve& operator= (const CubicBezierCurve& other) {
+        v0 = other.v0; v1 = other.v1; v2 = other.v2; v3 = other.v3; return *this;
+      }
+      
+      __forceinline CubicBezierCurve(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3)
+        : v0(v0), v1(v1), v2(v2), v3(v3) {}
+
+      __forceinline Vertex begin() const {
+        return v0;
+      }
+
+      __forceinline Vertex end() const {
+        return v3;
+      }
+
+      __forceinline Vertex center() const {
+        return 0.25f*(v0+v1+v2+v3);
+      }
+
+      __forceinline Vertex begin_direction() const {
+        return v1-v0;
+      }
+
+      __forceinline Vertex end_direction() const {
+        return v3-v2;
+      }
+
+      __forceinline CubicBezierCurve<float> xfm(const Vertex& dx) const {
+        return CubicBezierCurve<float>(dot(v0,dx),dot(v1,dx),dot(v2,dx),dot(v3,dx));
+      }
+      
+      __forceinline CubicBezierCurve<vfloatx> vxfm(const Vertex& dx) const {
+        return CubicBezierCurve<vfloatx>(dot(v0,dx),dot(v1,dx),dot(v2,dx),dot(v3,dx));
+      }
+      
+      __forceinline CubicBezierCurve<float> xfm(const Vertex& dx, const Vertex& p) const {
+        return CubicBezierCurve<float>(dot(v0-p,dx),dot(v1-p,dx),dot(v2-p,dx),dot(v3-p,dx));
+      }
+
+       __forceinline CubicBezierCurve<Vec3fa> xfm(const LinearSpace3fa& space) const
+      {
+        const Vec3fa q0 = xfmVector(space,v0);
+        const Vec3fa q1 = xfmVector(space,v1);
+        const Vec3fa q2 = xfmVector(space,v2);
+        const Vec3fa q3 = xfmVector(space,v3);
+        return CubicBezierCurve<Vec3fa>(q0,q1,q2,q3);
+      }
+      
+      __forceinline CubicBezierCurve<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p) const
+      {
+        const Vec3fa q0 = xfmVector(space,v0-p);
+        const Vec3fa q1 = xfmVector(space,v1-p);
+        const Vec3fa q2 = xfmVector(space,v2-p);
+        const Vec3fa q3 = xfmVector(space,v3-p);
+        return CubicBezierCurve<Vec3fa>(q0,q1,q2,q3);
+      }
+
+      __forceinline CubicBezierCurve<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const
+      {
+        const Vec3ff q0(xfmVector(space,(Vec3fa)v0-p), v0.w);
+        const Vec3ff q1(xfmVector(space,(Vec3fa)v1-p), v1.w);
+        const Vec3ff q2(xfmVector(space,(Vec3fa)v2-p), v2.w);
+        const Vec3ff q3(xfmVector(space,(Vec3fa)v3-p), v3.w);
+        return CubicBezierCurve<Vec3ff>(q0,q1,q2,q3);
+      }
+
+      __forceinline CubicBezierCurve<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p, const float s) const
+      {
+        const Vec3fa q0 = xfmVector(space,s*(v0-p));
+        const Vec3fa q1 = xfmVector(space,s*(v1-p));
+        const Vec3fa q2 = xfmVector(space,s*(v2-p));
+        const Vec3fa q3 = xfmVector(space,s*(v3-p));
+        return CubicBezierCurve<Vec3fa>(q0,q1,q2,q3);
+      }
+      
+      __forceinline int maxRoots() const;
+      
+      __forceinline BBox<Vertex> bounds() const {
+        return merge(BBox<Vertex>(v0),BBox<Vertex>(v1),BBox<Vertex>(v2),BBox<Vertex>(v3));
+      }
+      
+      __forceinline friend CubicBezierCurve operator +( const CubicBezierCurve& a, const CubicBezierCurve& b ) {
+        return CubicBezierCurve(a.v0+b.v0,a.v1+b.v1,a.v2+b.v2,a.v3+b.v3);
+      }
+      
+      __forceinline friend CubicBezierCurve operator -( const CubicBezierCurve& a, const CubicBezierCurve& b ) {
+        return CubicBezierCurve(a.v0-b.v0,a.v1-b.v1,a.v2-b.v2,a.v3-b.v3);
+      }
+      
+      __forceinline friend CubicBezierCurve operator -( const CubicBezierCurve& a, const Vertex& b ) {
+        return CubicBezierCurve(a.v0-b,a.v1-b,a.v2-b,a.v3-b);
+      }
+      
+      __forceinline friend CubicBezierCurve operator *( const Vertex& a, const CubicBezierCurve& b ) {
+        return CubicBezierCurve(a*b.v0,a*b.v1,a*b.v2,a*b.v3);
+      }
+
+      __forceinline friend CubicBezierCurve cmadd( const Vertex& a, const CubicBezierCurve& b,  const CubicBezierCurve& c) {
+        return CubicBezierCurve(madd(a,b.v0,c.v0),madd(a,b.v1,c.v1),madd(a,b.v2,c.v2),madd(a,b.v3,c.v3));
+      }
+      
+      __forceinline friend CubicBezierCurve clerp ( const CubicBezierCurve& a, const CubicBezierCurve& b, const Vertex& t ) {
+        return cmadd((Vertex(1.0f)-t),a,t*b);
+      }
+      
+      __forceinline friend CubicBezierCurve merge ( const CubicBezierCurve& a, const CubicBezierCurve& b ) {
+        return CubicBezierCurve(merge(a.v0,b.v0),merge(a.v1,b.v1),merge(a.v2,b.v2),merge(a.v3,b.v3));
+      }
+      
+      __forceinline void split(CubicBezierCurve& left, CubicBezierCurve& right, const float t = 0.5f) const
+      {
+        const Vertex p00 = v0;
+        const Vertex p01 = v1;
+        const Vertex p02 = v2;
+        const Vertex p03 = v3;
+        
+        const Vertex p10 = lerp(p00,p01,t);
+        const Vertex p11 = lerp(p01,p02,t);
+        const Vertex p12 = lerp(p02,p03,t);
+        const Vertex p20 = lerp(p10,p11,t);
+        const Vertex p21 = lerp(p11,p12,t);
+        const Vertex p30 = lerp(p20,p21,t);
+        
+        new (&left ) CubicBezierCurve(p00,p10,p20,p30);
+        new (&right) CubicBezierCurve(p30,p21,p12,p03);
+      }
+      
+      __forceinline CubicBezierCurve<Vec2vfx> split() const
+      {
+        const float u0 = 0.0f, u1 = 1.0f;
+        const float dscale = (u1-u0)*(1.0f/(3.0f*(VSIZEX-1)));
+        const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(VSIZEX-1)));
+        Vec2vfx P0, dP0du; evalN(vu0,P0,dP0du); dP0du = dP0du * Vec2vfx(dscale);
+        const Vec2vfx P3 = shift_right_1(P0);
+        const Vec2vfx dP3du = shift_right_1(dP0du); 
+        const Vec2vfx P1 = P0 + dP0du; 
+        const Vec2vfx P2 = P3 - dP3du;
+        return CubicBezierCurve<Vec2vfx>(P0,P1,P2,P3);
+      }
+      
+      __forceinline CubicBezierCurve<Vec2vfx> split(const BBox1f& u) const
+      {
+        const float u0 = u.lower, u1 = u.upper;
+        const float dscale = (u1-u0)*(1.0f/(3.0f*(VSIZEX-1)));
+        const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(VSIZEX-1)));
+        Vec2vfx P0, dP0du; evalN(vu0,P0,dP0du); dP0du = dP0du * Vec2vfx(dscale);
+        const Vec2vfx P3 = shift_right_1(P0);
+        const Vec2vfx dP3du = shift_right_1(dP0du); 
+        const Vec2vfx P1 = P0 + dP0du; 
+        const Vec2vfx P2 = P3 - dP3du;
+        return CubicBezierCurve<Vec2vfx>(P0,P1,P2,P3);
+      }
+      
+      __forceinline void eval(float t, Vertex& p, Vertex& dp) const
+      { // Writes the position (p) and first derivative (dp) at t, both from one pass of repeated interpolation.
+        const Vertex a0 = v0;
+        const Vertex a1 = v1;
+        const Vertex a2 = v2;
+        const Vertex a3 = v3;
+        // three rounds of pairwise lerp
+        const Vertex b0 = lerp(a0,a1,t);
+        const Vertex b1 = lerp(a1,a2,t);
+        const Vertex b2 = lerp(a2,a3,t);
+        const Vertex c0 = lerp(b0,b1,t);
+        const Vertex c1 = lerp(b1,b2,t);
+        const Vertex d0 = lerp(c0,c1,t);
+        // last lerp is the point; 3 x the last difference is the derivative
+        p = d0;
+        dp = Vertex(3.0f)*(c1-c0);
+      }
+
+#if 0 // disabled alternative: position via repeated lerp
+      __forceinline Vertex eval(float t) const
+      {
+        const Vertex p00 = v0;
+        const Vertex p01 = v1;
+        const Vertex p02 = v2;
+        const Vertex p03 = v3;
+        // three rounds of pairwise interpolation
+        const Vertex p10 = lerp(p00,p01,t);
+        const Vertex p11 = lerp(p01,p02,t);
+        const Vertex p12 = lerp(p02,p03,t);
+        const Vertex p20 = lerp(p10,p11,t);
+        const Vertex p21 = lerp(p11,p12,t);
+        const Vertex p30 = lerp(p20,p21,t);
+        // result of the last interpolation is the curve point
+        return p30;
+      }
+#else // active variant: position as a weighted sum of the four control points
+      __forceinline Vertex eval(const float t) const 
+      {
+        const Vec4<float> b = BezierBasis::eval(t); // four weights, one per control point
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
+      }
+#endif
+      
+      __forceinline Vertex eval_dt(float t) const
+      { // First derivative at t: repeated lerp over the control-point differences, scaled by 3.
+        const Vertex e0 = v1-v0;
+        const Vertex e1 = v2-v1;
+        const Vertex e2 = v3-v2;
+        const Vertex f0 = lerp(e0,e1,t);
+        const Vertex f1 = lerp(e1,e2,t);
+        const Vertex g0 = lerp(f0,f1,t);
+        return Vertex(3.0f)*g0;
+      }
+
+      __forceinline Vertex eval_du(const float t) const
+      { // Weighted sum of v0..v3 with the weights returned by BezierBasis::derivative(t).
+        const Vec4<float> w = BezierBasis::derivative(t);
+        return madd(w.x,v0,madd(w.y,v1,madd(w.z,v2,w.w*v3)));
+      }
+
+      __forceinline Vertex eval_dudu(const float t) const 
+      { // Weighted sum of v0..v3 with the weights returned by BezierBasis::derivative2(t).
+        const Vec4<float> w = BezierBasis::derivative2(t);
+        return madd(w.x,v0,madd(w.y,v1,madd(w.z,v2,w.w*v3)));
+      }
+      
+      __forceinline void evalN(const vfloatx& t, Vec2vfx& p, Vec2vfx& dp) const
+      { // Like eval(t,p,dp), but t carries one parameter per lane and the control points are converted to Vec2vfx.
+        const Vec2vfx a0 = v0;
+        const Vec2vfx a1 = v1;
+        const Vec2vfx a2 = v2;
+        const Vec2vfx a3 = v3;
+        // first round
+        const Vec2vfx b0 = lerp(a0,a1,t);
+        const Vec2vfx b1 = lerp(a1,a2,t);
+        const Vec2vfx b2 = lerp(a2,a3,t);
+        // second round
+        const Vec2vfx c0 = lerp(b0,b1,t);
+        const Vec2vfx c1 = lerp(b1,b2,t);
+        // third round
+        const Vec2vfx d0 = lerp(c0,c1,t);
+        // point and 3 x last difference
+        p = d0;
+        dp = vfloatx(3.0f)*(c1-c0);
+      }
+
+      __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const
+      { // Writes position (p), first derivative (dp) and second derivative (ddp, via eval_dudu) at t.
+        const Vertex a0 = v0;
+        const Vertex a1 = v1;
+        const Vertex a2 = v2;
+        const Vertex a3 = v3;
+        const Vertex b0 = lerp(a0,a1,t);
+        const Vertex b1 = lerp(a1,a2,t);
+        const Vertex b2 = lerp(a2,a3,t);
+        const Vertex c0 = lerp(b0,b1,t);
+        const Vertex c1 = lerp(b1,b2,t);
+        const Vertex d0 = lerp(c0,c1,t);
+        p = d0;
+        dp = 3.0f*(c1-c0);
+        ddp = eval_dudu(t);
+      }
+      
+      __forceinline CubicBezierCurve clip(const Interval1f& u1) const
+      { // Returns a cubic built from position+derivative at u1.lower and u1.upper (derivatives scaled by length/3).
+        Vertex pa,da; eval(u1.lower,pa,da);
+        Vertex pb,db; eval(u1.upper,pb,db);
+        const float len = u1.upper-u1.lower;
+        return CubicBezierCurve(pa,pa+len*(1.0f/3.0f)*da,pb-len*(1.0f/3.0f)*db,pb);
+      }
+      
+      __forceinline QuadraticBezierCurve<Vertex> derivative() const
+      { // Quadratic curve whose control points are 3 x the differences of consecutive control points.
+        const Vertex d01 = 3.0f*(v1-v0);
+        const Vertex d12 = 3.0f*(v2-v1);
+        const Vertex d23 = 3.0f*(v3-v2);
+        return QuadraticBezierCurve<Vertex>(d01,d12,d23);
+      }
+      
+      __forceinline BBox<Vertex> derivative_bounds(const Interval1f& u1) const
+      { // Box around the three derivative control points of the curve piece over [u1.lower,u1.upper].
+        Vertex pa,da; eval(u1.lower,pa,da);
+        Vertex pb,db; eval(u1.upper,pb,db);
+        const float len = u1.upper-u1.lower;
+        const Vertex in0 = pa+len*(1.0f/3.0f)*da; // inner control points of the clipped piece
+        const Vertex in1 = pb-len*(1.0f/3.0f)*db;
+        const Vertex g0 = len*da;
+        const Vertex g1 = 3.0f*(in1-in0);
+        const Vertex g2 = len*db;
+        return merge(BBox<Vertex>(g0),BBox<Vertex>(g1),BBox<Vertex>(g2));
+      }
+      
+      template<int M>
+      __forceinline Vec4vf<M> veval(const vfloat<M>& t) const 
+      { // M-wide evaluation: control points converted to Vec4vf<M>, weights from BezierBasis::eval(t).
+        const Vec4vf<M> w = BezierBasis::eval(t);
+        return madd(w.x, Vec4vf<M>(v0), madd(w.y, Vec4vf<M>(v1), madd(w.z, Vec4vf<M>(v2), w.w * Vec4vf<M>(v3))));
+      }
+
+      template<int M>
+      __forceinline Vec4vf<M> veval_du(const vfloat<M>& t) const 
+      { // M-wide first derivative: weights from BezierBasis::derivative(t).
+        const Vec4vf<M> w = BezierBasis::derivative(t);
+        return madd(w.x, Vec4vf<M>(v0), madd(w.y, Vec4vf<M>(v1), madd(w.z, Vec4vf<M>(v2), w.w * Vec4vf<M>(v3))));
+      }
+
+      template<int M>
+      __forceinline Vec4vf<M> veval_dudu(const vfloat<M>& t) const 
+      { // M-wide second derivative: weights from BezierBasis::derivative2(t).
+        const Vec4vf<M> w = BezierBasis::derivative2(t);
+        return madd(w.x, Vec4vf<M>(v0), madd(w.y, Vec4vf<M>(v1), madd(w.z, Vec4vf<M>(v2), w.w * Vec4vf<M>(v3))));
+      }
+      
+      template<int M>
+      __forceinline void veval(const vfloat<M>& t, Vec4vf<M>& p, Vec4vf<M>& dp) const
+      { // M-wide position (p) and first derivative (dp) from one pass of repeated interpolation.
+        const Vec4vf<M> a0 = v0;
+        const Vec4vf<M> a1 = v1;
+        const Vec4vf<M> a2 = v2;
+        const Vec4vf<M> a3 = v3;
+        // three rounds of pairwise lerp
+        const Vec4vf<M> b0 = lerp(a0,a1,t);
+        const Vec4vf<M> b1 = lerp(a1,a2,t);
+        const Vec4vf<M> b2 = lerp(a2,a3,t);
+        const Vec4vf<M> c0 = lerp(b0,b1,t);
+        const Vec4vf<M> c1 = lerp(b1,b2,t);
+        const Vec4vf<M> d0 = lerp(c0,c1,t);
+        // point and 3 x last difference
+        p = d0;
+        dp = vfloat<M>(3.0f)*(c1-c0);
+      }
+      
+      template<int M, typename Vec = Vec4vf<M>>
+      __forceinline Vec eval0(const int ofs, const int size) const
+      { // Weighted sum of v0..v3; the M weights per control point are loaded (loadu) from bezier_basis0.c0..c3[size][ofs].
+        assert(size <= PrecomputedBezierBasis::N); // table row must exist
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bezier_basis0.c0[size][ofs]), Vec(v0),
+                    madd(vfloat<M>::loadu(&bezier_basis0.c1[size][ofs]), Vec(v1),
+                         madd(vfloat<M>::loadu(&bezier_basis0.c2[size][ofs]), Vec(v2),
+                              vfloat<M>::loadu(&bezier_basis0.c3[size][ofs]) * Vec(v3))));
+      }
+      
+      template<int M, typename Vec = Vec4vf<M>>
+      __forceinline Vec eval1(const int ofs, const int size) const
+      { // Same as eval0 but reads the weights from the second table, bezier_basis1.c0..c3[size][ofs].
+        assert(size <= PrecomputedBezierBasis::N); // table row must exist
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bezier_basis1.c0[size][ofs]), Vec(v0), 
+                    madd(vfloat<M>::loadu(&bezier_basis1.c1[size][ofs]), Vec(v1),
+                         madd(vfloat<M>::loadu(&bezier_basis1.c2[size][ofs]), Vec(v2),
+                              vfloat<M>::loadu(&bezier_basis1.c3[size][ofs]) * Vec(v3))));
+      }
+      
+      template<int M, typename Vec = Vec4vf<M>>
+      __forceinline Vec derivative0(const int ofs, const int size) const
+      { // Weighted sum of v0..v3 using the d0..d3 coefficient arrays of bezier_basis0 at [size][ofs].
+        assert(size <= PrecomputedBezierBasis::N); // table row must exist
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bezier_basis0.d0[size][ofs]), Vec(v0),
+                    madd(vfloat<M>::loadu(&bezier_basis0.d1[size][ofs]), Vec(v1),
+                         madd(vfloat<M>::loadu(&bezier_basis0.d2[size][ofs]), Vec(v2),
+                              vfloat<M>::loadu(&bezier_basis0.d3[size][ofs]) * Vec(v3))));
+      }
+      
+      template<int M, typename Vec = Vec4vf<M>>
+      __forceinline Vec derivative1(const int ofs, const int size) const
+      { // Same as derivative0 but reads the d0..d3 coefficients from bezier_basis1.
+        assert(size <= PrecomputedBezierBasis::N); // table row must exist
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bezier_basis1.d0[size][ofs]), Vec(v0),
+                    madd(vfloat<M>::loadu(&bezier_basis1.d1[size][ofs]), Vec(v1),
+                         madd(vfloat<M>::loadu(&bezier_basis1.d2[size][ofs]), Vec(v2),
+                              vfloat<M>::loadu(&bezier_basis1.d3[size][ofs]) * Vec(v3))));
+      }
+
+      /* calculates bounds of bezier curve geometry from samples 0..N of the precomputed basis (x,y,z only) */
+      __forceinline BBox3fa accurateBounds() const
+      {
+        const int N = 7;
+        const float scale = 1.0f/(3.0f*(N-1)); // NOTE(review): samples 0..N are taken below, yet N-1 is used here -- confirm intended
+        Vec3vfx pl(pos_inf), pu(neg_inf); // running per-lane minimum / maximum
+        for (int i=0; i<=N; i+=VSIZEX)
+        {
+          vintx vi = vintx(i)+vintx(step); // sample index held by each lane
+          vboolx valid = vi <= vintx(N); // lanes past the last sample are ignored
+          const Vec3vfx p  = eval0<VSIZEX,Vec3vf<VSIZEX>>(i,N);
+          const Vec3vfx dp = derivative0<VSIZEX,Vec3vf<VSIZEX>>(i,N);
+          const Vec3vfx pm = p-Vec3vfx(scale)*select(vi!=vintx(0),dp,Vec3vfx(zero)); // no backward offset at sample 0
+          const Vec3vfx pp = p+Vec3vfx(scale)*select(vi!=vintx(N),dp,Vec3vfx(zero)); // no forward offset at sample N
+          pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min
+          pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked max
+        }
+        const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); // horizontal reduction over lanes
+        const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z));
+        return BBox3fa(lower,upper);
+      }
+      
+      /* calculates bounds of bezier curve geometry, enlarged by the largest absolute w (presumably the radius -- confirm) */
+      __forceinline BBox3fa accurateRoundBounds() const
+      {
+        const int N = 7;
+        const float scale = 1.0f/(3.0f*(N-1)); // NOTE(review): samples 0..N are taken below, yet N-1 is used here -- confirm intended
+        Vec4vfx pl(pos_inf), pu(neg_inf); // running per-lane minimum / maximum, including w
+        for (int i=0; i<=N; i+=VSIZEX)
+        {
+          vintx vi = vintx(i)+vintx(step); // sample index held by each lane
+          vboolx valid = vi <= vintx(N); // lanes past the last sample are ignored
+          const Vec4vfx p  = eval0<VSIZEX>(i,N);
+          const Vec4vfx dp = derivative0<VSIZEX>(i,N);
+          const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero)); // no backward offset at sample 0
+          const Vec4vfx pp = p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero)); // no forward offset at sample N
+          pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min
+          pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked max
+        }
+        const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z));
+        const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z));
+        const float r_min = reduce_min(pl.w);
+        const float r_max = reduce_max(pu.w);
+        const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max))); // largest magnitude of w over all samples
+        return enlarge(BBox3fa(lower,upper),upper_r);
+      }
+      
+      /* calculates bounds when tessellated into N line segments; the box is enlarged by the largest |w| seen */
+      __forceinline BBox3fa accurateFlatBounds(int N) const
+      {
+        if (likely(N == 4)) // fast path: one 4-wide load covers the first four samples
+        {
+          const Vec4vf4 pi = eval0<4>(0,4);
+          const Vec3fa lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z));
+          const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z));
+          const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w)));
+          return enlarge(BBox3fa(min(lower,v3),max(upper,v3)),max(upper_r,Vec3fa(abs(v3.w)))); // v3 is merged in explicitly
+        } 
+        else
+        {
+          Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f); // running min, max and largest |w|
+          for (int i=0; i<N; i+=VSIZEX)
+          {
+            vboolx valid = vintx(i)+vintx(step) < vintx(N); // only samples 0..N-1; v3 is merged after the loop
+            const Vec4vfx pi = eval0<VSIZEX>(i,N);
+            
+            pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min
+            pl.y = select(valid,min(pl.y,pi.y),pl.y); 
+            pl.z = select(valid,min(pl.z,pi.z),pl.z); 
+            
+            pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: use masked max
+            pu.y = select(valid,max(pu.y,pi.y),pu.y); 
+            pu.z = select(valid,max(pu.z,pi.z),pu.z); 
+            
+            ru   = select(valid,max(ru,abs(pi.w)),ru);
+          }
+          const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z));
+          const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z));
+          const Vec3fa upper_r(reduce_max(ru));
+          return enlarge(BBox3fa(min(lower,v3),max(upper,v3)),max(upper_r,Vec3fa(abs(v3.w)))); // v3 is merged in explicitly
+        }
+      }
+      
+      friend __forceinline embree_ostream operator<<(embree_ostream cout, const CubicBezierCurve& curve) { // prints the four control points
+        return cout << "CubicBezierCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }";
+      }
+    };
+
+#if defined(__AVX__)
+  template<>
+    __forceinline CubicBezierCurve<vfloat4> CubicBezierCurve<vfloat4>::clip(const Interval1f& u1) const
+  { // AVX specialization: evaluates both interval ends at once, u1.lower and u1.upper each filling one vfloat4 half.
+    const vfloat8 a0 = vfloat8(v0);
+    const vfloat8 a1 = vfloat8(v1);
+    const vfloat8 a2 = vfloat8(v2);
+    const vfloat8 a3 = vfloat8(v3);
+    // parameter vector built from the two interval ends
+    const vfloat8 uu(vfloat4(u1.lower),vfloat4(u1.upper));
+    const vfloat8 b0 = lerp(a0,a1,uu);
+    const vfloat8 b1 = lerp(a1,a2,uu);
+    const vfloat8 b2 = lerp(a2,a3,uu);
+    const vfloat8 c0 = lerp(b0,b1,uu);
+    const vfloat8 c1 = lerp(b1,b2,uu);
+    const vfloat8 d0 = lerp(c0,c1,uu);
+    // position and derivative for both ends
+    const vfloat8 pos2 = d0;
+    const vfloat8 der2 = vfloat8(3.0f)*(c1-c0);
+    // split the 8-wide results back into the two ends
+    const vfloat4 pa = extract4<0>(pos2), pb = extract4<1>(pos2);
+    const vfloat4 da = extract4<0>(der2), db = extract4<1>(der2);
+    const float len = u1.upper-u1.lower;
+    return CubicBezierCurve(pa,pa+len*(1.0f/3.0f)*da,pb-len*(1.0f/3.0f)*db,pb);
+  }
+#endif
+  
+  template<typename Vertex> using BezierCurveT = CubicBezierCurve<Vertex>; // generic alias for the cubic curve
+  
+  typedef CubicBezierCurve<float> CubicBezierCurve1f; // scalar control values
+  typedef CubicBezierCurve<Vec2fa> CubicBezierCurve2fa; // Vec2fa control points
+  typedef CubicBezierCurve<Vec3fa> CubicBezierCurve3fa; // Vec3fa control points
+  typedef CubicBezierCurve<Vec3fa> BezierCurve3fa; // second name for the same Vec3fa instantiation
+  
+  template<> __forceinline int CubicBezierCurve<float>::maxRoots() const
+  { // Counts, per consecutive control-value pair, a sign change or a value within 1E-4 of zero (result is 0..3).
+    const float tol = 1E-4f;
+    const bool nonpos0 = v0 <= 0.0f; const bool tiny0 = fabs(v0) < tol;
+    const bool nonpos1 = v1 <= 0.0f; const bool tiny1 = fabs(v1) < tol;
+    const bool nonpos2 = v2 <= 0.0f; const bool tiny2 = fabs(v2) < tol;
+    const bool nonpos3 = v3 <= 0.0f; const bool tiny3 = fabs(v3) < tol;
+    return (nonpos0 != nonpos1 || tiny0) + (nonpos1 != nonpos2 || tiny1) + (nonpos2 != nonpos3 || tiny2 || tiny3);
+  }
+  
+  template<> __forceinline int CubicBezierCurve<Interval1f>::maxRoots() const { // interval version: sums numRoots over the three consecutive pairs
+    return numRoots(v0,v1) + numRoots(v1,v2) + numRoots(v2,v3);
+  }
+
+  __forceinline CubicBezierCurve<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const CubicBezierCurve<Vec3ff>& curve)
+  { // Curve-level overload: applies the per-vertex enlargeRadiusToMinWidth to each of the four control points.
+    return CubicBezierCurve<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0),
+                                    enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1),
+                                    enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2),
+                                    enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3));
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bezier_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/bezier_patch.h
new file mode 100644
index 0000000000..d87ed41ccb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/bezier_patch.h
@@ -0,0 +1,372 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_patch.h"
+#include "bezier_curve.h"
+
+namespace embree
+{  
+  template<class T, class S>
+    static __forceinline T deCasteljau(const S& uu, const T& v0, const T& v1, const T& v2, const T& v3)
+  { // Evaluates the cubic with control points v0..v3 at uu by three rounds of pairwise lerp.
+    const T a0 = lerp(v0,v1,uu);
+    const T a1 = lerp(v1,v2,uu);
+    const T a2 = lerp(v2,v3,uu);
+    const T b0 = lerp(a0,a1,uu);
+    const T b1 = lerp(a1,a2,uu);
+    const T c0 = lerp(b0,b1,uu); // final round yields the curve point
+    return c0;
+  }
+  
+  template<class T, class S>
+    static __forceinline T deCasteljau_tangent(const S& uu, const T& v0, const T& v1, const T& v2, const T& v3)
+  { // Tangent of the cubic at uu: two rounds of pairwise lerp, then 3 x the remaining difference.
+    const T a0 = lerp(v0,v1,uu);
+    const T a1 = lerp(v1,v2,uu);
+    const T a2 = lerp(v2,v3,uu);
+    const T b0 = lerp(a0,a1,uu);
+    const T b1 = lerp(a1,a2,uu);
+    return S(3.0f)*(b1-b0);
+  }
+
+  template<typename Vertex>
+    __forceinline Vertex computeInnerBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { // full 3x3 stencil around v[y][x]: weights 16 / 4 / 1, divided by 36
+    return 1.0f / 36.0f * (16.0f * v[y][x] + 4.0f * (v[y-1][x] +  v[y+1][x] + v[y][x-1] + v[y][x+1]) + (v[y-1][x-1] + v[y+1][x+1] + v[y-1][x+1] + v[y+1][x-1]));
+  }
+  
+  template<typename Vertex>
+    __forceinline Vertex computeTopEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { // stencil over rows y and y-1 only: weights 8 / 4 / 2 / 1, divided by 18
+    return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y-1][x] + 2.0f * (v[y][x-1] + v[y][x+1]) + (v[y-1][x-1] + v[y-1][x+1]));
+  }
+
+  template<typename Vertex>
+    __forceinline Vertex computeBottomEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { // stencil over rows y and y+1 only: weights 8 / 4 / 2 / 1, divided by 18
+    return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y+1][x] + 2.0f * (v[y][x-1] + v[y][x+1]) + v[y+1][x-1] + v[y+1][x+1]);
+  }
+  
+  template<typename Vertex>
+    __forceinline Vertex computeLeftEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { // stencil over columns x and x-1 only: weights 8 / 4 / 2 / 1, divided by 18
+    return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y][x-1] + 2.0f * (v[y-1][x] + v[y+1][x]) + v[y-1][x-1] + v[y+1][x-1]);
+  }
+  
+  template<typename Vertex>
+    __forceinline Vertex computeRightEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { // stencil over columns x and x+1 only: weights 8 / 4 / 2 / 1, divided by 18
+    return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y][x+1] + 2.0f * (v[y-1][x] + v[y+1][x]) + v[y-1][x+1] + v[y+1][x+1]);
+  }
+  
+  template<typename Vertex>
+    __forceinline Vertex computeCornerBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x, const ssize_t delta_y, const ssize_t delta_x)
+  { // 2x2 stencil spanned by v[y][x] and its neighbours in direction (delta_y,delta_x): weights 4 / 2 / 2 / 1, divided by 9
+    return 1.0f / 9.0f * (4.0f * v[y][x] + 2.0f * (v[y+delta_y][x] + v[y][x+delta_x]) + v[y+delta_y][x+delta_x]);
+  }
+
+  template<typename Vertex, typename Vertex_t>
+    class __aligned(64) BezierPatchT
+  {
+   public:
+      Vertex matrix[4][4]; // 4x4 control points; the evaluators below weight the second index by u and the first by v
+    
+  public:
+
+    __forceinline BezierPatchT() {} // empty body: matrix is not explicitly initialized here
+    // the following three constructors are only declared in this class body
+    __forceinline BezierPatchT (const HalfEdge* edge, const char* vertices, size_t stride);
+
+    __forceinline BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch);
+
+    __forceinline BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch,
+                               const BezierCurveT<Vertex>* border0,
+                               const BezierCurveT<Vertex>* border1,
+                               const BezierCurveT<Vertex>* border2,
+                               const BezierCurveT<Vertex>* border3);
+                               
+    __forceinline BezierPatchT(const BSplinePatchT<Vertex,Vertex_t>& source)
+    { // Fills all 16 entries of matrix from the 4x4 grid source.v; helper names refer to the stencil shape used.
+      /* patch corners: full 3x3 stencil around each of the four inner source points */
+      matrix[0][0] = computeInnerBezierControlPoint(source.v,1,1);
+      matrix[0][3] = computeInnerBezierControlPoint(source.v,1,2);
+      matrix[3][3] = computeInnerBezierControlPoint(source.v,2,2);
+      matrix[3][0] = computeInnerBezierControlPoint(source.v,2,1);
+      
+      /* compute top edge control points (row 0 of matrix) */
+      matrix[0][1] = computeRightEdgeBezierControlPoint(source.v,1,1);
+      matrix[0][2] = computeLeftEdgeBezierControlPoint(source.v,1,2); 
+      
+      /* compute bottom edge control points (row 3 of matrix) */
+      matrix[3][1] = computeRightEdgeBezierControlPoint(source.v,2,1);
+      matrix[3][2] = computeLeftEdgeBezierControlPoint(source.v,2,2);
+      
+      /* compute left edge control points (column 0 of matrix) */
+      matrix[1][0] = computeBottomEdgeBezierControlPoint(source.v,1,1);
+      matrix[2][0] = computeTopEdgeBezierControlPoint(source.v,2,1);
+      
+      /* compute right edge control points (column 3 of matrix) */
+      matrix[1][3] = computeBottomEdgeBezierControlPoint(source.v,1,2);
+      matrix[2][3] = computeTopEdgeBezierControlPoint(source.v,2,2);
+      
+      /* interior 2x2 control points: 2x2 stencil pointing towards the centre of the source grid */
+      matrix[1][1] = computeCornerBezierControlPoint(source.v,1,1, 1, 1);
+      matrix[1][2] = computeCornerBezierControlPoint(source.v,1,2, 1,-1);
+      matrix[2][2] = computeCornerBezierControlPoint(source.v,2,2,-1,-1);
+      matrix[2][1] = computeCornerBezierControlPoint(source.v,2,1,-1, 1);      
+    }
+
+    static __forceinline Vertex_t bilinear(const Vec4f Bu, const Vertex matrix[4][4], const Vec4f Bv)
+    { // Collapses each matrix row with the weights Bu, then combines the four row results with the weights Bv.
+      const Vertex_t row0 = madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))); 
+      const Vertex_t row1 = madd(Bu.x,matrix[1][0],madd(Bu.y,matrix[1][1],madd(Bu.z,matrix[1][2],Bu.w * matrix[1][3])));
+      const Vertex_t row2 = madd(Bu.x,matrix[2][0],madd(Bu.y,matrix[2][1],madd(Bu.z,matrix[2][2],Bu.w * matrix[2][3])));
+      const Vertex_t row3 = madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])));
+      return madd(Bv.x,row0,madd(Bv.y,row1,madd(Bv.z,row2,Bv.w*row3)));
+    }
+
+    static __forceinline Vertex_t eval(const Vertex matrix[4][4], const float uu, const float vv) 
+    { // Patch position at (uu,vv): value weights in both directions.
+      const Vec4f wu = BezierBasis::eval(uu);
+      const Vec4f wv = BezierBasis::eval(vv);
+      return bilinear(wu,matrix,wv);
+    }
+
+    static __forceinline Vertex_t eval_du(const Vertex matrix[4][4], const float uu, const float vv) 
+    { // First partial in u: derivative weights in u, value weights in v.
+      const Vec4f wu = BezierBasis::derivative(uu);
+      const Vec4f wv = BezierBasis::eval(vv);
+      return bilinear(wu,matrix,wv);
+    }
+
+    static __forceinline Vertex_t eval_dv(const Vertex matrix[4][4], const float uu, const float vv) 
+    { // First partial in v: value weights in u, derivative weights in v.
+      const Vec4f wu = BezierBasis::eval(uu);
+      const Vec4f wv = BezierBasis::derivative(vv);
+      return bilinear(wu,matrix,wv);
+    }
+
+    static __forceinline Vertex_t eval_dudu(const Vertex matrix[4][4], const float uu, const float vv) 
+    { // Second partial in u: second-derivative weights in u, value weights in v.
+      const Vec4f wu = BezierBasis::derivative2(uu);
+      const Vec4f wv = BezierBasis::eval(vv);
+      return bilinear(wu,matrix,wv);
+    }
+
+    static __forceinline Vertex_t eval_dvdv(const Vertex matrix[4][4], const float uu, const float vv) 
+    { // Second partial in v: value weights in u, second-derivative weights in v.
+      const Vec4f wu = BezierBasis::eval(uu);
+      const Vec4f wv = BezierBasis::derivative2(vv);
+      return bilinear(wu,matrix,wv);
+    }
+
+    static __forceinline Vertex_t eval_dudv(const Vertex matrix[4][4], const float uu, const float vv) 
+    { // Mixed partial: derivative weights in both u and v.
+      const Vec4f wu = BezierBasis::derivative(uu);
+      const Vec4f wv = BezierBasis::derivative(vv);
+      return bilinear(wu,matrix,wv);
+    }
+
+    static __forceinline Vertex_t normal(const Vertex matrix[4][4], const float uu, const float vv) 
+    { // Cross product of the two first partials at (uu,vv); the result is not normalized here.
+      const Vertex_t tangent_u = eval_du(matrix,uu,vv);
+      const Vertex_t tangent_v = eval_dv(matrix,uu,vv);
+      return cross(tangent_u,tangent_v);
+    }
+
+    __forceinline Vertex_t normal(const float uu, const float vv) const
+    { // Cross product of the two first partials of this patch at (uu,vv); the result is not normalized here.
+      const Vertex_t dPdu = eval_du(matrix,uu,vv);
+      const Vertex_t dPdv = eval_dv(matrix,uu,vv);
+      return cross(dPdu,dPdv); // now const-qualified like the other member evaluators: reads matrix only
+    }
+
+    __forceinline Vertex_t eval(const float uu, const float vv) const { // position; forwards to the static overload with this->matrix
+      return eval(matrix,uu,vv);
+    }
+
+    __forceinline Vertex_t eval_du(const float uu, const float vv) const { // first partial in u
+      return eval_du(matrix,uu,vv);
+    }
+
+    __forceinline Vertex_t eval_dv(const float uu, const float vv) const { // first partial in v
+      return eval_dv(matrix,uu,vv);
+    }
+
+    __forceinline Vertex_t eval_dudu(const float uu, const float vv) const { // second partial in u
+      return eval_dudu(matrix,uu,vv);
+    }
+    
+    __forceinline Vertex_t eval_dvdv(const float uu, const float vv) const { // second partial in v
+      return eval_dvdv(matrix,uu,vv);
+    }
+
+    __forceinline Vertex_t eval_dudv(const float uu, const float vv) const { // mixed partial
+      return eval_dudv(matrix,uu,vv);
+    }
+
+    __forceinline void eval(const float u, const float v, Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, const float dscale = 1.0f) const
+    { // Writes the requested outputs; output pointers are handled in three groups, each gated by its first pointer.
+      if (P) {
+        *P = eval(u,v); 
+      }
+      if (dPdu) { // first derivatives: dPdv is only asserted, so it must be non-null whenever dPdu is
+        assert(dPdu); *dPdu = eval_du(u,v)*dscale; 
+        assert(dPdv); *dPdv = eval_dv(u,v)*dscale; 
+      }
+      if (ddPdudu) { // second derivatives: ddPdvdv and ddPdudv must be non-null whenever ddPdudu is
+        assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); 
+        assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); 
+        assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); 
+      }
+    }
+
+    template<class vfloat>
+      __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv, const Vec4<vfloat>& u_n, const Vec4<vfloat>& v_n) const
+      { // Component i of the patch for precomputed weights: each column is collapsed with v_n, then combined with u_n (uu/vv are not read).
+        const vfloat col0 = v_n[0] * vfloat(matrix[0][0][i]) + v_n[1] * vfloat(matrix[1][0][i]) + v_n[2] * vfloat(matrix[2][0][i]) + v_n[3] * vfloat(matrix[3][0][i]);
+        const vfloat col1 = v_n[0] * vfloat(matrix[0][1][i]) + v_n[1] * vfloat(matrix[1][1][i]) + v_n[2] * vfloat(matrix[2][1][i]) + v_n[3] * vfloat(matrix[3][1][i]);
+        const vfloat col2 = v_n[0] * vfloat(matrix[0][2][i]) + v_n[1] * vfloat(matrix[1][2][i]) + v_n[2] * vfloat(matrix[2][2][i]) + v_n[3] * vfloat(matrix[3][2][i]);
+        const vfloat col3 = v_n[0] * vfloat(matrix[0][3][i]) + v_n[1] * vfloat(matrix[1][3][i]) + v_n[2] * vfloat(matrix[2][3][i]) + v_n[3] * vfloat(matrix[3][3][i]);
+        return u_n[0] * col0 + u_n[1] * col1 + u_n[2] * col2 + u_n[3] * col3;
+      }
+
+    template<typename vbool, typename vfloat>
+      __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, 
+                              float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv,
+                              const float dscale, const size_t dstride, const size_t N) const
+      { // Vector version: for components 0..N-1, stores each requested output at out+i*dstride under the mask valid.
+        if (P) {
+          const Vec4<vfloat> u_n = BezierBasis::eval(uu); 
+          const Vec4<vfloat> v_n = BezierBasis::eval(vv); 
+          for (size_t i=0; i<N; i++) vfloat::store(valid,P+i*dstride,eval(i,uu,vv,u_n,v_n));
+        }
+        if (dPdu) // first derivatives (scaled by dscale); dPdv must be non-null whenever dPdu is
+        {
+          {
+            assert(dPdu);
+            const Vec4<vfloat> u_n = BezierBasis::derivative(uu);
+            const Vec4<vfloat> v_n = BezierBasis::eval(vv); 
+            for (size_t i=0; i<N; i++) vfloat::store(valid,dPdu+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale);
+          }
+          {
+            assert(dPdv);
+            const Vec4<vfloat> u_n = BezierBasis::eval(uu);
+            const Vec4<vfloat> v_n = BezierBasis::derivative(vv); 
+            for (size_t i=0; i<N; i++) vfloat::store(valid,dPdv+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale);
+          }
+        }
+        if (ddPdudu) // second derivatives (scaled by dscale^2); all three pointers must be non-null together
+        {
+          {
+            assert(ddPdudu);
+            const Vec4<vfloat> u_n = BezierBasis::derivative2(uu);
+            const Vec4<vfloat> v_n = BezierBasis::eval(vv); 
+            for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudu+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale));
+          }
+          {
+            assert(ddPdvdv);
+            const Vec4<vfloat> u_n = BezierBasis::eval(uu);
+            const Vec4<vfloat> v_n = BezierBasis::derivative2(vv); 
+            for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdvdv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale));
+          }
+          {
+            assert(ddPdudv);
+            const Vec4<vfloat> u_n = BezierBasis::derivative(uu);
+            const Vec4<vfloat> v_n = BezierBasis::derivative(vv); 
+            for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale));
+          }
+        }
+      }
+
+    template<typename T>
+      static __forceinline Vec3<T> eval(const Vertex matrix[4][4], const T& uu, const T& vv) 
+    { // Position at (uu,vv) for a generic scalar/vector type T; the x, y and z components are evaluated separately.
+      const T one_minus_uu = 1.0f - uu;
+      const T one_minus_vv = 1.0f - vv;      
+      // cubic Bernstein weights written out explicitly: (1-t)^3, 3(1-t)^2 t, 3(1-t) t^2, t^3
+      const T B0_u = one_minus_uu * one_minus_uu * one_minus_uu;
+      const T B0_v = one_minus_vv * one_minus_vv * one_minus_vv;
+      const T B1_u = 3.0f * (one_minus_uu * uu * one_minus_uu);
+      const T B1_v = 3.0f * (one_minus_vv * vv * one_minus_vv);
+      const T B2_u = 3.0f * (uu * one_minus_uu * uu);
+      const T B2_v = 3.0f * (vv * one_minus_vv * vv);
+      const T B3_u = uu * uu * uu;
+      const T B3_v = vv * vv * vv;
+      // each component: rows are collapsed with the u weights, then combined with the v weights
+      const T x = 
+        madd(B0_v,madd(B0_u,matrix[0][0].x,madd(B1_u,matrix[0][1].x,madd(B2_u,matrix[0][2].x,B3_u*matrix[0][3].x))), 
+        madd(B1_v,madd(B0_u,matrix[1][0].x,madd(B1_u,matrix[1][1].x,madd(B2_u,matrix[1][2].x,B3_u*matrix[1][3].x))),
+        madd(B2_v,madd(B0_u,matrix[2][0].x,madd(B1_u,matrix[2][1].x,madd(B2_u,matrix[2][2].x,B3_u*matrix[2][3].x))),
+             B3_v*madd(B0_u,matrix[3][0].x,madd(B1_u,matrix[3][1].x,madd(B2_u,matrix[3][2].x,B3_u*matrix[3][3].x)))))); 
+
+      const T y = 
+        madd(B0_v,madd(B0_u,matrix[0][0].y,madd(B1_u,matrix[0][1].y,madd(B2_u,matrix[0][2].y,B3_u*matrix[0][3].y))), 
+        madd(B1_v,madd(B0_u,matrix[1][0].y,madd(B1_u,matrix[1][1].y,madd(B2_u,matrix[1][2].y,B3_u*matrix[1][3].y))),
+        madd(B2_v,madd(B0_u,matrix[2][0].y,madd(B1_u,matrix[2][1].y,madd(B2_u,matrix[2][2].y,B3_u*matrix[2][3].y))),
+             B3_v*madd(B0_u,matrix[3][0].y,madd(B1_u,matrix[3][1].y,madd(B2_u,matrix[3][2].y,B3_u*matrix[3][3].y)))))); 
+      
+      const T z = 
+        madd(B0_v,madd(B0_u,matrix[0][0].z,madd(B1_u,matrix[0][1].z,madd(B2_u,matrix[0][2].z,B3_u*matrix[0][3].z))), 
+        madd(B1_v,madd(B0_u,matrix[1][0].z,madd(B1_u,matrix[1][1].z,madd(B2_u,matrix[1][2].z,B3_u*matrix[1][3].z))),
+        madd(B2_v,madd(B0_u,matrix[2][0].z,madd(B1_u,matrix[2][1].z,madd(B2_u,matrix[2][2].z,B3_u*matrix[2][3].z))),
+             B3_v*madd(B0_u,matrix[3][0].z,madd(B1_u,matrix[3][1].z,madd(B2_u,matrix[3][2].z,B3_u*matrix[3][3].z)))))); 
+      
+      return Vec3<T>(x,y,z);
+    }
+
+    template<typename vfloat>
+      __forceinline Vec3<vfloat> eval(const vfloat& uu, const vfloat& vv) const { // forwards to the static overload with this->matrix
+      return eval(matrix,uu,vv);
+    }
+
+    template<class T>
+      static __forceinline Vec3<T> normal(const Vertex matrix[4][4], const T& uu, const T& vv) 
+    { // Cross product of the u and v tangents at (uu,vv), computed with deCasteljau / deCasteljau_tangent; not normalized here.
+      // copy the x,y,z of every control point into Vec3<T>
+      const Vec3<T> matrix_00 = Vec3<T>(matrix[0][0].x,matrix[0][0].y,matrix[0][0].z);
+      const Vec3<T> matrix_01 = Vec3<T>(matrix[0][1].x,matrix[0][1].y,matrix[0][1].z);
+      const Vec3<T> matrix_02 = Vec3<T>(matrix[0][2].x,matrix[0][2].y,matrix[0][2].z);
+      const Vec3<T> matrix_03 = Vec3<T>(matrix[0][3].x,matrix[0][3].y,matrix[0][3].z);
+
+      const Vec3<T> matrix_10 = Vec3<T>(matrix[1][0].x,matrix[1][0].y,matrix[1][0].z);
+      const Vec3<T> matrix_11 = Vec3<T>(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z);
+      const Vec3<T> matrix_12 = Vec3<T>(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z);
+      const Vec3<T> matrix_13 = Vec3<T>(matrix[1][3].x,matrix[1][3].y,matrix[1][3].z);
+
+      const Vec3<T> matrix_20 = Vec3<T>(matrix[2][0].x,matrix[2][0].y,matrix[2][0].z);
+      const Vec3<T> matrix_21 = Vec3<T>(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z);
+      const Vec3<T> matrix_22 = Vec3<T>(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z);
+      const Vec3<T> matrix_23 = Vec3<T>(matrix[2][3].x,matrix[2][3].y,matrix[2][3].z);
+
+      const Vec3<T> matrix_30 = Vec3<T>(matrix[3][0].x,matrix[3][0].y,matrix[3][0].z);
+      const Vec3<T> matrix_31 = Vec3<T>(matrix[3][1].x,matrix[3][1].y,matrix[3][1].z);
+      const Vec3<T> matrix_32 = Vec3<T>(matrix[3][2].x,matrix[3][2].y,matrix[3][2].z);
+      const Vec3<T> matrix_33 = Vec3<T>(matrix[3][3].x,matrix[3][3].y,matrix[3][3].z);
+            
+      /* tangentU: collapse every column at vv, then take the tangent of the resulting curve at uu */
+      const Vec3<T> col0 = deCasteljau(vv, matrix_00, matrix_10, matrix_20, matrix_30);
+      const Vec3<T> col1 = deCasteljau(vv, matrix_01, matrix_11, matrix_21, matrix_31);
+      const Vec3<T> col2 = deCasteljau(vv, matrix_02, matrix_12, matrix_22, matrix_32);
+      const Vec3<T> col3 = deCasteljau(vv, matrix_03, matrix_13, matrix_23, matrix_33);
+      
+      const Vec3<T> tangentU = deCasteljau_tangent(uu, col0, col1, col2, col3);
+      
+      /* tangentV: collapse every row at uu, then take the tangent of the resulting curve at vv */
+      const Vec3<T> row0 = deCasteljau(uu, matrix_00, matrix_01, matrix_02, matrix_03);
+      const Vec3<T> row1 = deCasteljau(uu, matrix_10, matrix_11, matrix_12, matrix_13);
+      const Vec3<T> row2 = deCasteljau(uu, matrix_20, matrix_21, matrix_22, matrix_23);
+      const Vec3<T> row3 = deCasteljau(uu, matrix_30, matrix_31, matrix_32, matrix_33);
+      
+      const Vec3<T> tangentV = deCasteljau_tangent(vv, row0, row1, row2, row3);
+      
+      /* normal = tangentU x tangentV */
+      const Vec3<T> n = cross(tangentU,tangentV);
+      return n;
+    }
+
+    template<typename vfloat>
+      __forceinline Vec3<vfloat> normal(const vfloat& uu, const vfloat& vv) const { // forwards to the static overload with this->matrix
+      return normal(matrix,uu,vv);
+    }
+  };
+
+  typedef BezierPatchT<Vec3fa,Vec3fa_t> BezierPatch3fa; // patch with Vec3fa control points and Vec3fa_t results
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bilinear_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/bilinear_patch.h
new file mode 100644
index 0000000000..35748754bd
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/bilinear_patch.h
@@ -0,0 +1,191 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_patch.h"
+#include "bezier_curve.h"
+
+namespace embree
+{
+  template<typename Vertex, typename Vertex_t = Vertex>
+    class __aligned(64) BilinearPatchT
+    { /* bilinear patch over four corner vertices: evaluates position, derivatives and normal at (u,v) */
+      typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing;
+      typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+      
+    public:
+      Vertex v[4]; /* corner vertices; eval() interpolates v[0]->v[1] and v[3]->v[2] by u, then between both by v */
+      
+    public:
+      /* default constructor: does not explicitly initialize v */
+      __forceinline BilinearPatchT () {}
+      /* loads the four corners of the face starting at `edge` from a vertex buffer view */
+      __forceinline BilinearPatchT (const HalfEdge* edge, const BufferView<Vertex>& vertices) {
+        init(edge,vertices.getPtr(),vertices.getStride());
+      }
+      /* loads the four corners of the face starting at `edge` from a raw vertex array addressed with a byte stride */
+      __forceinline BilinearPatchT (const HalfEdge* edge, const char* vertices, size_t stride) {
+        init(edge,vertices,stride);
+      }
+      /* v[i] = start vertex of the i-th edge reached from `edge` by repeated next(), read with Vertex::loadu */
+      __forceinline void init (const HalfEdge* edge, const char* vertices, size_t stride)
+      {
+        v[0] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next();
+        v[1] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next();
+        v[2] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next();
+        v[3] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next();
+      }
+      /* uses the limit vertex of each of the four rings of a Catmull-Clark patch as corner */
+      __forceinline BilinearPatchT (const CatmullClarkPatch& patch)
+      {
+        v[0] = patch.ring[0].getLimitVertex();
+        v[1] = patch.ring[1].getLimitVertex();
+        v[2] = patch.ring[2].getLimitVertex();
+        v[3] = patch.ring[3].getLimitVertex();
+      }
+      /* bounding box extended over the four corner vertices */
+      __forceinline BBox<Vertex> bounds() const
+      {
+        
+        BBox<Vertex> bounds (v[0]);
+        bounds.extend(v[1]);
+        bounds.extend(v[2]);
+        bounds.extend(v[3]);
+        return bounds;
+      }
+      /* position at (uu,vv): lerp along the v[0]-v[1] and v[3]-v[2] edges by uu, then between both results by vv */
+      __forceinline Vertex eval(const float uu, const float vv) const {
+        return lerp(lerp(v[0],v[1],uu),lerp(v[3],v[2],uu),vv);
+      }
+      /* first derivative in u: the two u-edge vectors blended by vv (uu is not used) */
+      __forceinline Vertex eval_du(const float uu, const float vv) const {
+        return lerp(v[1]-v[0],v[2]-v[3],vv);
+      }
+      /* first derivative in v: the two v-edge vectors blended by uu (vv is not used) */
+      __forceinline Vertex eval_dv(const float uu, const float vv) const {
+        return lerp(v[3]-v[0],v[2]-v[1],uu);
+      }
+      /* second derivative in u: always zero */
+      __forceinline Vertex eval_dudu(const float uu, const float vv) const {
+        return Vertex(zero);
+      }
+      /* second derivative in v: always zero */
+      __forceinline Vertex eval_dvdv(const float uu, const float vv) const {
+        return Vertex(zero);
+      }
+      /* mixed derivative: difference of the two u-edge vectors, independent of (uu,vv) */
+      __forceinline Vertex eval_dudv(const float uu, const float vv) const {
+        return (v[2]-v[3]) - (v[1]-v[0]);
+      }
+      /* cross product of the u and v derivatives; the result is not normalized here */
+      __forceinline Vertex normal(const float uu, const float vv) const {
+        return cross(eval_du(uu,vv),eval_dv(uu,vv));
+      }
+      /* writes requested outputs: P alone; dPdu gates dPdv (scaled by dscale); ddPdudu gates ddPdvdv and ddPdudv (scaled by sqr(dscale)) */
+      __forceinline void eval(const float u, const float v, 
+                              Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv,
+                              const float dscale = 1.0f) const
+      {
+        if (P) {
+          *P = eval(u,v); 
+        }
+        if (dPdu) {
+          assert(dPdu); *dPdu = eval_du(u,v)*dscale; 
+          assert(dPdv); *dPdv = eval_dv(u,v)*dscale; 
+        }
+        if (ddPdudu) {
+          assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); 
+          assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); 
+          assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); 
+        }
+      }
+      /* generic-vfloat position: same interpolation as the scalar eval, done separately for x, y and z */
+      template<class vfloat>
+      __forceinline Vec3<vfloat> eval(const vfloat& uu, const vfloat& vv) const
+      {
+        const vfloat x = lerp(lerp(v[0].x,v[1].x,uu),lerp(v[3].x,v[2].x,uu),vv);
+        const vfloat y = lerp(lerp(v[0].y,v[1].y,uu),lerp(v[3].y,v[2].y,uu),vv);
+        const vfloat z = lerp(lerp(v[0].z,v[1].z,uu),lerp(v[3].z,v[2].z,uu),vv);
+        return Vec3<vfloat>(x,y,z);
+      }
+      /* generic-vfloat first derivative in u, per x/y/z component */
+      template<class vfloat>
+      __forceinline Vec3<vfloat> eval_du(const vfloat& uu, const vfloat& vv) const
+      {
+        const vfloat x = lerp(v[1].x-v[0].x,v[2].x-v[3].x,vv);
+        const vfloat y = lerp(v[1].y-v[0].y,v[2].y-v[3].y,vv);
+        const vfloat z = lerp(v[1].z-v[0].z,v[2].z-v[3].z,vv);
+        return Vec3<vfloat>(x,y,z);
+      }
+      /* generic-vfloat first derivative in v, per x/y/z component */
+      template<class vfloat>
+      __forceinline Vec3<vfloat> eval_dv(const vfloat& uu, const vfloat& vv) const
+      {
+        const vfloat x = lerp(v[3].x-v[0].x,v[2].x-v[1].x,uu);
+        const vfloat y = lerp(v[3].y-v[0].y,v[2].y-v[1].y,uu);
+        const vfloat z = lerp(v[3].z-v[0].z,v[2].z-v[1].z,uu);
+        return Vec3<vfloat>(x,y,z);
+      }
+      /* generic-vfloat normal: cross product of the two derivatives above, not normalized here */
+      template<typename vfloat>
+      __forceinline Vec3<vfloat> normal(const vfloat& uu, const vfloat& vv) const {
+        return cross(eval_du(uu,vv),eval_dv(uu,vv));
+      }
+      /* the overloads below evaluate only component i of each vertex (v[k][i]) */
+       template<class vfloat>
+      __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv) const {
+        return lerp(lerp(v[0][i],v[1][i],uu),lerp(v[3][i],v[2][i],uu),vv);
+      }
+      /* component i of the first derivative in u */
+      template<class vfloat>
+      __forceinline vfloat eval_du(const size_t i, const vfloat& uu, const vfloat& vv) const {
+        return lerp(v[1][i]-v[0][i],v[2][i]-v[3][i],vv);
+      }
+      /* component i of the first derivative in v */
+      template<class vfloat>
+      __forceinline vfloat eval_dv(const size_t i, const vfloat& uu, const vfloat& vv) const {
+        return lerp(v[3][i]-v[0][i],v[2][i]-v[1][i],uu);
+      }
+      /* component i of the second derivative in u: always zero */
+      template<class vfloat>
+      __forceinline vfloat eval_dudu(const size_t i, const vfloat& uu, const vfloat& vv) const {
+        return vfloat(zero);
+      }
+      /* component i of the second derivative in v: always zero */
+      template<class vfloat>
+      __forceinline vfloat eval_dvdv(const size_t i, const vfloat& uu, const vfloat& vv) const {
+        return vfloat(zero);
+      }
+      /* component i of the mixed derivative */
+      template<class vfloat>
+      __forceinline vfloat eval_dudv(const size_t i, const vfloat& uu, const vfloat& vv) const {
+        return (v[2][i]-v[3][i]) - (v[1][i]-v[0][i]);
+      }
+      /* stores N components under mask `valid`, component i at offset i*dstride; pointer gating and scaling as in the scalar eval */
+      template<typename vbool, typename vfloat>
+      __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, 
+                              float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv,
+                              const float dscale, const size_t dstride, const size_t N) const
+      {
+        if (P) {
+          for (size_t i=0; i<N; i++) vfloat::store(valid,P+i*dstride,eval(i,uu,vv));
+        }
+        if (dPdu) {
+          for (size_t i=0; i<N; i++) {
+            assert(dPdu); vfloat::store(valid,dPdu+i*dstride,eval_du(i,uu,vv)*dscale);
+            assert(dPdv); vfloat::store(valid,dPdv+i*dstride,eval_dv(i,uu,vv)*dscale);
+          }
+        }
+        if (ddPdudu) {
+          for (size_t i=0; i<N; i++) {
+            assert(ddPdudu); vfloat::store(valid,ddPdudu+i*dstride,eval_dudu(i,uu,vv)*sqr(dscale));
+            assert(ddPdvdv); vfloat::store(valid,ddPdvdv+i*dstride,eval_dvdv(i,uu,vv)*sqr(dscale));
+            assert(ddPdudv); vfloat::store(valid,ddPdudv+i*dstride,eval_dudv(i,uu,vv)*sqr(dscale));
+          }
+        }
+      }
+    };
+  
+  typedef BilinearPatchT<Vec3fa,Vec3fa_t> BilinearPatch3fa; /* BilinearPatchT with Vertex = Vec3fa and Vertex_t = Vec3fa_t */
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bspline_curve.h b/thirdparty/embree-aarch64/kernels/subdiv/bspline_curve.h
new file mode 100644
index 0000000000..a325667328
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/bspline_curve.h
@@ -0,0 +1,319 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "bezier_curve.h"
+
+namespace embree
+{
+  class BSplineBasis
+  { /* static helpers returning the four cubic B-spline basis weights (and their derivatives) at a parameter u */
+  public:
+    /* basis weights for the four control points at u; the polynomials n0..n3 are scaled by 1/6 on return */
+    template<typename T>
+      static __forceinline Vec4<T> eval(const T& u) 
+    {
+      const T t  = u;
+      const T s  = T(1.0f) - u;
+      const T n0 = s*s*s;
+      const T n1 = (4.0f*(s*s*s)+(t*t*t)) + (12.0f*((s*t)*s) + 6.0f*((t*s)*t));
+      const T n2 = (4.0f*(t*t*t)+(s*s*s)) + (12.0f*((t*s)*t) + 6.0f*((s*t)*s));
+      const T n3 = t*t*t;
+      return T(1.0f/6.0f)*Vec4<T>(n0,n1,n2,n3);
+    }
+    /* first derivative of the basis weights with respect to u; n0..n3 are scaled by 1/2 on return */
+    template<typename T>
+      static __forceinline Vec4<T>  derivative(const T& u)
+    {
+      const T t  =  u;
+      const T s  =  1.0f - u;
+      const T n0 = -s*s;
+      const T n1 = -t*t - 4.0f*(t*s);
+      const T n2 =  s*s + 4.0f*(s*t);
+      const T n3 =  t*t;
+      return T(0.5f)*Vec4<T>(n0,n1,n2,n3);
+    }
+    /* second derivative of the basis weights with respect to u (linear in u, no extra scale factor) */
+    template<typename T>
+      static __forceinline Vec4<T>  derivative2(const T& u)
+    {
+      const T t  =  u;
+      const T s  =  1.0f - u;
+      const T n0 = s;
+      const T n1 = t - 2.0f*s;
+      const T n2 = s - 2.0f*t;
+      const T n3 = t;
+      return Vec4<T>(n0,n1,n2,n3);
+    }
+  };
+  
+  struct PrecomputedBSplineBasis
+  { /* lookup tables of basis weights; BSplineCurveT::eval0/eval1/derivative0/derivative1 read them as [size][ofs] */
+    enum { N = 16 }; /* every table has (N+1) x (N+1) entries */
+  public:
+    PrecomputedBSplineBasis() {} /* leaves the tables without explicit initialization */
+    PrecomputedBSplineBasis(int shift); /* NOTE(review): defined out of line; the meaning of `shift` is not visible here */
+
+    /* basis for bspline evaluation */
+  public:
+    float c0[N+1][N+1];
+    float c1[N+1][N+1];
+    float c2[N+1][N+1];
+    float c3[N+1][N+1];
+    
+    /* basis for bspline derivative evaluation */
+  public:
+    float d0[N+1][N+1];
+    float d1[N+1][N+1];
+    float d2[N+1][N+1];
+    float d3[N+1][N+1];
+  };
+  extern PrecomputedBSplineBasis bspline_basis0; /* tables read by BSplineCurveT::eval0 and derivative0 */
+  extern PrecomputedBSplineBasis bspline_basis1; /* tables read by BSplineCurveT::eval1 and derivative1 */
+
+  template<typename Vertex>
+    struct BSplineCurveT
+    { /* cubic B-spline curve segment described by the four control points v0..v3 */
+      Vertex v0,v1,v2,v3;
+      /* default constructor: does not explicitly initialize the control points */
+      __forceinline BSplineCurveT() {}
+      /* stores the four control points */
+      __forceinline BSplineCurveT(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3)
+        : v0(v0), v1(v1), v2(v2), v3(v3) {}
+      /* start point of the segment: 1/6*v0 + 2/3*v1 + 1/6*v2 */
+      __forceinline Vertex begin() const {
+        return madd(1.0f/6.0f,v0,madd(2.0f/3.0f,v1,1.0f/6.0f*v2));
+      }
+      /* end point of the segment: 1/6*v1 + 2/3*v2 + 1/6*v3 */
+      __forceinline Vertex end() const {
+        return madd(1.0f/6.0f,v1,madd(2.0f/3.0f,v2,1.0f/6.0f*v3));
+      }
+      /* arithmetic mean of the four control points */
+      __forceinline Vertex center() const {
+        return 0.25f*(v0+v1+v2+v3);
+      }
+      /* merged bounding box of the four control points */
+      __forceinline BBox<Vertex> bounds() const {
+        return merge(BBox<Vertex>(v0),BBox<Vertex>(v1),BBox<Vertex>(v2),BBox<Vertex>(v3));
+      }
+      /* subtracts b from every control point */
+      __forceinline friend BSplineCurveT operator -( const BSplineCurveT& a, const Vertex& b ) {
+        return BSplineCurveT(a.v0-b,a.v1-b,a.v2-b,a.v3-b);
+      }
+      /* offsets each control point by -p, applies xfmVector with `space`, and keeps the original w component */
+      __forceinline BSplineCurveT<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const
+      {
+        const Vec3ff q0(xfmVector(space,(Vec3fa)v0-p), v0.w);
+        const Vec3ff q1(xfmVector(space,(Vec3fa)v1-p), v1.w);
+        const Vec3ff q2(xfmVector(space,(Vec3fa)v2-p), v2.w);
+        const Vec3ff q3(xfmVector(space,(Vec3fa)v3-p), v3.w);
+        return BSplineCurveT<Vec3ff>(q0,q1,q2,q3);
+      }
+      /* curve point at t: control points weighted by BSplineBasis::eval(t) */
+      __forceinline Vertex eval(const float t) const 
+      {
+        const Vec4<float> b = BSplineBasis::eval(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
+      }
+      /* first derivative at t: control points weighted by BSplineBasis::derivative(t) */
+      __forceinline Vertex eval_du(const float t) const
+      {
+        const Vec4<float> b = BSplineBasis::derivative(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
+      }
+      /* second derivative at t: control points weighted by BSplineBasis::derivative2(t) */
+      __forceinline Vertex eval_dudu(const float t) const 
+      {
+        const Vec4<float> b = BSplineBasis::derivative2(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
+      }
+      /* writes position, first and second derivative at t into p, dp and ddp */
+      __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const
+      {
+        p = eval(t);
+        dp = eval_du(t);
+        ddp = eval_dudu(t);
+      }
+      /* vfloat<M> variant of eval: each control point is broadcast to a Vec4vf<M> and weighted by the basis at t */
+      template<int M>
+      __forceinline Vec4vf<M> veval(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> b = BSplineBasis::eval(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3))));
+      }
+      /* vfloat<M> variant of eval_du */
+      template<int M>
+      __forceinline Vec4vf<M> veval_du(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> b = BSplineBasis::derivative(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3))));
+      }
+      /* vfloat<M> variant of eval_dudu */
+      template<int M>
+      __forceinline Vec4vf<M> veval_dudu(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> b = BSplineBasis::derivative2(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3))));
+      }
+      /* writes position and first derivative for the parameters in t into p and dp */
+      template<int M>
+      __forceinline void veval(const vfloat<M>& t, Vec4vf<M>& p, Vec4vf<M>& dp) const
+      {
+        p = veval(t);
+        dp = veval_du(t);
+      }
+      /* positions from precomputed weights: loads M weights starting at bspline_basis0.c*[size][ofs] */
+      template<int M>
+      __forceinline Vec4vf<M> eval0(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBSplineBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bspline_basis0.c0[size][ofs]), Vec4vf<M>(v0),
+                    madd(vfloat<M>::loadu(&bspline_basis0.c1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&bspline_basis0.c2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&bspline_basis0.c3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      /* same as eval0 but reads the weights from bspline_basis1 */
+      template<int M>
+      __forceinline Vec4vf<M> eval1(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBSplineBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bspline_basis1.c0[size][ofs]), Vec4vf<M>(v0), 
+                    madd(vfloat<M>::loadu(&bspline_basis1.c1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&bspline_basis1.c2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&bspline_basis1.c3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      /* derivatives from precomputed weights: loads M weights starting at bspline_basis0.d*[size][ofs] */
+      template<int M>
+      __forceinline Vec4vf<M> derivative0(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBSplineBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bspline_basis0.d0[size][ofs]), Vec4vf<M>(v0),
+                    madd(vfloat<M>::loadu(&bspline_basis0.d1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&bspline_basis0.d2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&bspline_basis0.d3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      /* same as derivative0 but reads the weights from bspline_basis1 */
+      template<int M>
+      __forceinline Vec4vf<M> derivative1(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBSplineBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bspline_basis1.d0[size][ofs]), Vec4vf<M>(v0),
+                    madd(vfloat<M>::loadu(&bspline_basis1.d1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&bspline_basis1.d2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&bspline_basis1.d3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      
+      /* calculates bounds of bspline curve geometry: covers the N+1 sample points p and p -/+ scale*dp, then enlarges by the largest |w| */
+      __forceinline BBox3fa accurateRoundBounds() const
+      {
+        const int N = 7;
+        const float scale = 1.0f/(3.0f*(N-1));
+        Vec4vfx pl(pos_inf), pu(neg_inf);
+        for (int i=0; i<=N; i+=VSIZEX)
+        {
+          vintx vi = vintx(i)+vintx(step);
+          vboolx valid = vi <= vintx(N);
+          const Vec4vfx p  = eval0<VSIZEX>(i,N);
+          const Vec4vfx dp = derivative0<VSIZEX>(i,N);
+          const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero));
+          const Vec4vfx pp = p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero));
+          pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min
+          pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked max
+        }
+        const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z));
+        const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z));
+        const float r_min = reduce_min(pl.w);
+        const float r_max = reduce_max(pu.w);
+        const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max)));
+        return enlarge(BBox3fa(lower,upper),upper_r);
+      }
+      
+      /* calculates bounds when tessellated into N line segments; N == 4 takes a 4-wide path that adds end() as last point */
+      __forceinline BBox3fa accurateFlatBounds(int N) const
+      {
+        if (likely(N == 4))
+        {
+          const Vec4vf4 pi = eval0<4>(0,4);
+          const Vec3fa lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z));
+          const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z));
+          const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w)));
+          const Vec3ff pe = end();
+          return enlarge(BBox3fa(min(lower,pe),max(upper,pe)),max(upper_r,Vec3fa(abs(pe.w))));
+        } 
+        else
+        {
+          Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f);
+          for (int i=0; i<=N; i+=VSIZEX)
+          {
+            vboolx valid = vintx(i)+vintx(step) <= vintx(N);
+            const Vec4vfx pi = eval0<VSIZEX>(i,N);
+            
+            pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min
+            pl.y = select(valid,min(pl.y,pi.y),pl.y); 
+            pl.z = select(valid,min(pl.z,pi.z),pl.z); 
+            
+            pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: use masked max
+            pu.y = select(valid,max(pu.y,pi.y),pu.y); 
+            pu.z = select(valid,max(pu.z,pi.z),pu.z); 
+            
+            ru = select(valid,max(ru,abs(pi.w)),ru); 
+          }
+          const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z));
+          const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z));
+          const Vec3fa upper_r(reduce_max(ru));
+          return enlarge(BBox3fa(lower,upper),upper_r);
+        }
+      }
+      /* prints the four control points */
+      friend __forceinline embree_ostream operator<<(embree_ostream cout, const BSplineCurveT& curve) {
+        return cout << "BSplineCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }";
+      }
+    };
+  
+  template<typename Vertex> /* Bezier -> Bezier: plain copy */
+    __forceinline void convert(const BezierCurveT<Vertex>& icurve, BezierCurveT<Vertex>& ocurve) {
+    ocurve = icurve;
+  }
+  
+  template<typename Vertex> /* B-spline -> B-spline: plain copy */
+    __forceinline void convert(const BSplineCurveT<Vertex>& icurve, BSplineCurveT<Vertex>& ocurve) {
+    ocurve = icurve;
+  }
+  
+  template<typename Vertex> /* Bezier -> B-spline: derives four B-spline control points from the Bezier control points */
+    __forceinline void convert(const BezierCurveT<Vertex>& icurve, BSplineCurveT<Vertex>& ocurve)
+  {
+    const Vertex v0 = madd(6.0f,icurve.v0,madd(-7.0f,icurve.v1,2.0f*icurve.v2));
+    const Vertex v1 = msub(2.0f,icurve.v1,icurve.v2);
+    const Vertex v2 = msub(2.0f,icurve.v2,icurve.v1);
+    const Vertex v3 = madd(2.0f,icurve.v1,madd(-7.0f,icurve.v2,6.0f*icurve.v3));
+    ocurve = BSplineCurveT<Vertex>(v0,v1,v2,v3);
+  }
+  
+  template<typename Vertex> /* B-spline -> Bezier: v0 and v3 use the same 1/6, 2/3, 1/6 weights as BSplineCurveT::begin() and end() */
+    __forceinline void convert(const BSplineCurveT<Vertex>& icurve, BezierCurveT<Vertex>& ocurve)
+  {
+    const Vertex v0 = madd(1.0f/6.0f,icurve.v0,madd(2.0f/3.0f,icurve.v1,1.0f/6.0f*icurve.v2));
+    const Vertex v1 = madd(2.0f/3.0f,icurve.v1,1.0f/3.0f*icurve.v2);
+    const Vertex v2 = madd(1.0f/3.0f,icurve.v1,2.0f/3.0f*icurve.v2);
+    const Vertex v3 = madd(1.0f/6.0f,icurve.v1,madd(2.0f/3.0f,icurve.v2,1.0f/6.0f*icurve.v3));
+    ocurve = BezierCurveT<Vertex>(v0,v1,v2,v3);
+  }
+
+  __forceinline BSplineCurveT<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const BSplineCurveT<Vec3ff>& curve)
+  { /* applies the per-control-point enlargeRadiusToMinWidth overload (declared elsewhere) to v0..v3 and rebuilds the curve */
+    return BSplineCurveT<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0),
+                                 enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1),
+                                 enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2),
+                                 enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3));
+  }
+  
+  typedef BSplineCurveT<Vec3fa> BSplineCurve3fa; /* BSplineCurveT with Vec3fa control points */
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bspline_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/bspline_patch.h
new file mode 100644
index 0000000000..9769bc17bd
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/bspline_patch.h
@@ -0,0 +1,449 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_patch.h"
+#include "bspline_curve.h"
+
+namespace embree
+{
+  template<typename Vertex, typename Vertex_t = Vertex>
+    class __aligned(64) BSplinePatchT
+    {
+      typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing;
+      typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+      
+    public:
+      
+      __forceinline BSplinePatchT () {} /* default constructor: performs no explicit initialization */
+
+      __forceinline BSplinePatchT (const CatmullClarkPatch& patch) { /* fills the control points from a Catmull-Clark patch */
+        init(patch);
+      }
+
+      __forceinline BSplinePatchT(const CatmullClarkPatch& patch,
+                                  const BezierCurveT<Vertex>* border0,
+                                  const BezierCurveT<Vertex>* border1,
+                                  const BezierCurveT<Vertex>* border2,
+                                  const BezierCurveT<Vertex>* border3)
+      {
+        init(patch); /* border0..border3 are accepted but not used; behaves like the single-argument constructor */
+      }
+
+      __forceinline BSplinePatchT (const HalfEdge* edge, const char* vertices, size_t stride) { /* fills the control points from half-edge topology and a strided vertex array */
+        init(edge,vertices,stride);
+      }
+
+      __forceinline Vertex hard_corner(const                    Vertex& v01, const Vertex& v02, 
+                                       const Vertex& v10, const Vertex& v11, const Vertex& v12, 
+                                       const Vertex& v20, const Vertex& v21, const Vertex& v22)
+      { /* corner control point for the hard case; only v11, v12, v21 and v22 enter the formula */
+        return 4.0f*v11 - 2.0f*(v12+v21) + v22;
+      }
+
+      __forceinline Vertex soft_convex_corner( const                    Vertex& v01, const Vertex& v02, 
+                                               const Vertex& v10, const Vertex& v11, const Vertex& v12, 
+                                               const Vertex& v20, const Vertex& v21, const Vertex& v22)
+      { /* corner control point for the soft case; only v11, v12, v21 and v22 enter the formula */
+        return -8.0f*v11 + 4.0f*(v12+v21) + v22;
+      }
+
+      __forceinline Vertex convex_corner(const float vertex_crease_weight, 
+                                         const                    Vertex& v01, const Vertex& v02, 
+                                         const Vertex& v10, const Vertex& v11, const Vertex& v12, 
+                                         const Vertex& v20, const Vertex& v21, const Vertex& v22)
+      {
+        const bool is_hard = std::isinf(vertex_crease_weight); /* an infinite crease weight selects the hard corner rule */
+        return is_hard ? hard_corner(v01,v02,v10,v11,v12,v20,v21,v22) : soft_convex_corner(v01,v02,v10,v11,v12,v20,v21,v22);
+      }
+
+      __forceinline Vertex load(const HalfEdge* edge, const char* vertices, size_t stride) { /* reads the start vertex of `edge` at byte offset index*stride */
+        return Vertex_t::loadu(vertices+edge->getStartVertexIndex()*stride);
+      }
+
+      __forceinline void init_border(const CatmullClarkRing& edge0,
+                                     Vertex& v01, Vertex& v02,
+                                     const Vertex& v11, const Vertex& v12,
+                                     const Vertex& v21, const Vertex& v22)
+      {
+        /* without an opposite behind edge 0 the border is extrapolated: v21/v22 reflected through v11/v12 */
+        if (!likely(edge0.has_opposite_back(0))) {
+          v01 = 2.0f*v11-v21;
+          v02 = 2.0f*v12-v22;
+          return;
+        }
+        v01 = edge0.back(2); /* otherwise both border vertices come straight from the ring */
+        v02 = edge0.back(1);
+      }
+
+      __forceinline void init_corner(const CatmullClarkRing& edge0,
+                                     Vertex& v00,       const Vertex& v01, const Vertex& v02, 
+                                     const Vertex& v10, const Vertex& v11, const Vertex& v12, 
+                                     const Vertex& v20, const Vertex& v21, const Vertex& v22)
+      { /* computes the corner control point v00 from the ring `edge0` and the already known neighbours */
+        const bool MAYBE_UNUSED has_back1 = edge0.has_opposite_back(1);
+        const bool has_back0 = edge0.has_opposite_back(0);
+        const bool has_front1 = edge0.has_opposite_front(1);
+        const bool MAYBE_UNUSED has_front2 = edge0.has_opposite_front(2);
+        /* both back(0) and front(1) present: take back(3); only one present: extrapolate linearly; none: convex_corner() */
+        if (likely(has_back0)) {
+          if (likely(has_front1)) { assert(has_back1 && has_front2); v00 = edge0.back(3); }
+          else { assert(!has_back1); v00 = 2.0f*v01-v02; }
+        }
+        else {
+          if (likely(has_front1)) { assert(!has_front2); v00 = 2.0f*v10-v20; }
+          else v00 = convex_corner(edge0.vertex_crease_weight,v01,v02,v10,v11,v12,v20,v21,v22);
+        }
+      }
+      
+      void init(const CatmullClarkPatch& patch)
+      { /* fills v[][] from the four rings of `patch`; the corner step reads the border values written just before it */
+        /* fill inner vertices */
+        const Vertex v11 = v[1][1] = patch.ring[0].vtx;
+        const Vertex v12 = v[1][2] = patch.ring[1].vtx;
+        const Vertex v22 = v[2][2] = patch.ring[2].vtx; 
+        const Vertex v21 = v[2][1] = patch.ring[3].vtx; 
+        
+        /* fill border vertices */
+        init_border(patch.ring[0],v[0][1],v[0][2],v11,v12,v21,v22);
+        init_border(patch.ring[1],v[1][3],v[2][3],v12,v22,v11,v21);
+        init_border(patch.ring[2],v[3][2],v[3][1],v22,v21,v12,v11);
+        init_border(patch.ring[3],v[2][0],v[1][0],v21,v11,v22,v12);
+        
+        /* fill corner vertices */
+        init_corner(patch.ring[0],v[0][0],v[0][1],v[0][2],v[1][0],v11,v12,v[2][0],v21,v22);
+        init_corner(patch.ring[1],v[0][3],v[1][3],v[2][3],v[0][2],v12,v22,v[0][1],v11,v21);
+        init_corner(patch.ring[2],v[3][3],v[3][2],v[3][1],v[2][3],v22,v21,v[1][3],v12,v11);
+        init_corner(patch.ring[3],v[3][0],v[2][0],v[1][0],v[3][1],v21,v11,v[3][2],v22,v12);
+      }
+      
+      void init_border(const HalfEdge* edge0, const char* vertices, size_t stride,
+                                     Vertex& v01, Vertex& v02,
+                                     const Vertex& v11, const Vertex& v12,
+                                     const Vertex& v21, const Vertex& v22)
+      { /* half-edge variant: sets the two border control points v01 and v02 next to edge0 */
+        if (likely(edge0->hasOpposite())) 
+        {
+          const HalfEdge* e = edge0->opposite()->next()->next(); /* two steps along the opposite face */
+          v01 = load(e,vertices,stride); 
+          v02 = load(e->next(),vertices,stride);
+        } else { /* no opposite face: extrapolate by reflecting v21/v22 through v11/v12 */
+          v01 = 2.0f*v11-v21;
+          v02 = 2.0f*v12-v22;
+        }
+      }
+      
+      void init_corner(const HalfEdge* edge0, const char* vertices, size_t stride,
+                       Vertex& v00, const Vertex& v01, const Vertex& v02, 
+                       const Vertex& v10, const Vertex& v11, const Vertex& v12, 
+                       const Vertex& v20, const Vertex& v21, const Vertex& v22)
+      { /* half-edge variant: computes the corner control point v00 next to the start vertex of edge0 */
+        const bool has_back0 = edge0->hasOpposite();
+        const bool has_front1 = edge0->prev()->hasOpposite();
+        /* both opposites present: load the vertex; only one present: extrapolate linearly; none: convex_corner() */
+        if (likely(has_back0))
+        { 
+          const HalfEdge* e = edge0->opposite()->next();
+          if (likely(has_front1))
+          {
+            assert(e->hasOpposite());
+            assert(edge0->prev()->opposite()->prev()->hasOpposite());
+            v00 = load(e->opposite()->prev(),vertices,stride);
+          } 
+          else {
+            assert(!e->hasOpposite());
+            v00 = 2.0f*v01-v02;
+          }
+        }
+        else
+        {
+          if (likely(has_front1)) {
+            assert(!edge0->prev()->opposite()->prev()->hasOpposite());
+            v00 = 2.0f*v10-v20;
+          }
+          else { /* the assert below expects a crease weight of exactly 0 or infinity here */
+            assert(edge0->vertex_crease_weight == 0.0f || std::isinf(edge0->vertex_crease_weight));
+            v00 = convex_corner(edge0->vertex_crease_weight,v01,v02,v10,v11,v12,v20,v21,v22);
+          }
+        }
+      }
+      
+      void init(const HalfEdge* edge0, const char* vertices, size_t stride)
+      { /* fills v[][] for the face starting at edge0; the assert requires edge0->isRegularFace() */
+        assert( edge0->isRegularFace() );
+        
+        /* fill inner vertices */
+        const Vertex v11 = v[1][1] = load(edge0,vertices,stride); const HalfEdge* edge1 = edge0->next();
+        const Vertex v12 = v[1][2] = load(edge1,vertices,stride); const HalfEdge* edge2 = edge1->next();
+        const Vertex v22 = v[2][2] = load(edge2,vertices,stride); const HalfEdge* edge3 = edge2->next();
+        const Vertex v21 = v[2][1] = load(edge3,vertices,stride); assert(edge0  == edge3->next());
+        
+        /* fill border vertices */
+        init_border(edge0,vertices,stride,v[0][1],v[0][2],v11,v12,v21,v22);
+        init_border(edge1,vertices,stride,v[1][3],v[2][3],v12,v22,v11,v21);
+        init_border(edge2,vertices,stride,v[3][2],v[3][1],v22,v21,v12,v11);
+        init_border(edge3,vertices,stride,v[2][0],v[1][0],v21,v11,v22,v12);
+        
+        /* fill corner vertices */
+        init_corner(edge0,vertices,stride,v[0][0],v[0][1],v[0][2],v[1][0],v11,v12,v[2][0],v21,v22);
+        init_corner(edge1,vertices,stride,v[0][3],v[1][3],v[2][3],v[0][2],v12,v22,v[0][1],v11,v21);
+        init_corner(edge2,vertices,stride,v[3][3],v[3][2],v[3][1],v[2][3],v22,v21,v[1][3],v12,v11);
+        init_corner(edge3,vertices,stride,v[3][0],v[2][0],v[1][0],v[3][1],v21,v11,v[3][2],v22,v12);
+      }
+      
+      __forceinline BBox<Vertex> bounds() const
+      {
+        const Vertex* const first = &v[0][0]; /* walk the control points as one flat run of 16 vertices */
+        BBox<Vertex> box (first[0]);
+        size_t k = 1;
+        while (k < 16) box.extend(first[k++]);
+        return box;
+      }
+      
+      __forceinline Vertex eval(const float uu, const float vv) const
+      { /* surface point at (uu,vv): value weights in both directions */
+        const Vec4f v_n = BSplineBasis::eval(vv);
+        const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0])));
+        const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1])));
+        const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2])));
+        const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3])));
+        /* curve0..3 collapsed the first index of v[][] with the vv weights; now blend them with the uu weights */
+        const Vec4f u_n = BSplineBasis::eval(uu);
+        return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3)));
+      }
+      
+      __forceinline Vertex eval_du(const float uu, const float vv) const
+      { /* first derivative in u: value weights for vv, first-derivative weights for uu */
+        const Vec4f v_n = BSplineBasis::eval(vv);
+        const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0])));
+        const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1])));
+        const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2])));
+        const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3])));
+        /* blend the four vv-collapsed results with the derivative weights in uu */
+        const Vec4f u_n = BSplineBasis::derivative(uu);
+        return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3)));
+      }
+      
+      __forceinline Vertex eval_dv(const float uu, const float vv) const
+      { /* first derivative in v: first-derivative weights for vv, value weights for uu */
+        const Vec4f v_n = BSplineBasis::derivative(vv);
+        const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0])));
+        const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1])));
+        const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2])));
+        const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3])));
+        /* blend the four vv-differentiated results with the value weights in uu */
+        const Vec4f u_n = BSplineBasis::eval(uu);
+        return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3)));
+      }
+
+      __forceinline Vertex eval_dudu(const float uu, const float vv) const
+      { /* second derivative in u: value weights for vv, second-derivative weights for uu */
+        const Vec4f v_n = BSplineBasis::eval(vv);
+        const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0])));
+        const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1])));
+        const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2])));
+        const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3])));
+        /* blend the four vv-collapsed results with the second-derivative weights in uu */
+        const Vec4f u_n = BSplineBasis::derivative2(uu);
+        return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3)));
+      }
+
+      __forceinline Vertex eval_dvdv(const float uu, const float vv) const
+      { // Patch evaluation with BSplineBasis::derivative2 weights in v and BSplineBasis::eval weights in u.
+        const Vec4f v_n = BSplineBasis::derivative2(vv); // four derivative2-basis weights taken at vv
+        const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); // column 0: rows 0..3 weighted by v_n
+        const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); // column 1
+        const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); // column 2
+        const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); // column 3
+        // Combine the four per-column results with the value-basis weights taken at uu.
+        const Vec4f u_n = BSplineBasis::eval(uu);
+        return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3)));
+      }
+
+      __forceinline Vertex eval_dudv(const float uu, const float vv) const
+      { // Patch evaluation with BSplineBasis::derivative weights in both u and v (mixed derivative).
+        const Vec4f v_n = BSplineBasis::derivative(vv); // four derivative-basis weights taken at vv
+        const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); // column 0: rows 0..3 weighted by v_n
+        const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); // column 1
+        const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); // column 2
+        const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); // column 3
+        // Combine the four per-column results with the derivative-basis weights taken at uu.
+        const Vec4f u_n = BSplineBasis::derivative(uu);
+        return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3)));
+      }
+      
+      __forceinline Vertex normal(const float uu, const float vv) const
+      { // Returns cross(eval_du, eval_dv) at (uu,vv); the result is not normalized here.
+        const Vertex tangent_u = eval_du(uu,vv);
+        const Vertex tangent_v = eval_dv(uu,vv);
+        return cross(tangent_u,tangent_v);
+      }
+
+      template<typename T>
+      __forceinline Vec3<T> eval(const T& uu, const T& vv, const Vec4<T>& u_n, const Vec4<T>& v_n) const // uu/vv are not read in this body; only the supplied weights u_n/v_n are used
+      { // Weighted sum over the 4x4 control grid v[row][col], computed separately for the x, y and z members.
+        const T curve0_x = madd(v_n[0],T(v[0][0].x),madd(v_n[1],T(v[1][0].x),madd(v_n[2],T(v[2][0].x),v_n[3] * T(v[3][0].x)))); // column 0: rows weighted by v_n
+        const T curve1_x = madd(v_n[0],T(v[0][1].x),madd(v_n[1],T(v[1][1].x),madd(v_n[2],T(v[2][1].x),v_n[3] * T(v[3][1].x))));
+        const T curve2_x = madd(v_n[0],T(v[0][2].x),madd(v_n[1],T(v[1][2].x),madd(v_n[2],T(v[2][2].x),v_n[3] * T(v[3][2].x))));
+        const T curve3_x = madd(v_n[0],T(v[0][3].x),madd(v_n[1],T(v[1][3].x),madd(v_n[2],T(v[2][3].x),v_n[3] * T(v[3][3].x))));
+        const T x = madd(u_n[0],curve0_x,madd(u_n[1],curve1_x,madd(u_n[2],curve2_x,u_n[3] * curve3_x))); // columns weighted by u_n
+                  // y member: same weighting pattern as x.
+        const T curve0_y = madd(v_n[0],T(v[0][0].y),madd(v_n[1],T(v[1][0].y),madd(v_n[2],T(v[2][0].y),v_n[3] * T(v[3][0].y))));
+        const T curve1_y = madd(v_n[0],T(v[0][1].y),madd(v_n[1],T(v[1][1].y),madd(v_n[2],T(v[2][1].y),v_n[3] * T(v[3][1].y))));
+        const T curve2_y = madd(v_n[0],T(v[0][2].y),madd(v_n[1],T(v[1][2].y),madd(v_n[2],T(v[2][2].y),v_n[3] * T(v[3][2].y))));
+        const T curve3_y = madd(v_n[0],T(v[0][3].y),madd(v_n[1],T(v[1][3].y),madd(v_n[2],T(v[2][3].y),v_n[3] * T(v[3][3].y))));
+        const T y = madd(u_n[0],curve0_y,madd(u_n[1],curve1_y,madd(u_n[2],curve2_y,u_n[3] * curve3_y)));
+          // z member: same weighting pattern as x.
+        const T curve0_z = madd(v_n[0],T(v[0][0].z),madd(v_n[1],T(v[1][0].z),madd(v_n[2],T(v[2][0].z),v_n[3] * T(v[3][0].z))));
+        const T curve1_z = madd(v_n[0],T(v[0][1].z),madd(v_n[1],T(v[1][1].z),madd(v_n[2],T(v[2][1].z),v_n[3] * T(v[3][1].z))));
+        const T curve2_z = madd(v_n[0],T(v[0][2].z),madd(v_n[1],T(v[1][2].z),madd(v_n[2],T(v[2][2].z),v_n[3] * T(v[3][2].z))));
+        const T curve3_z = madd(v_n[0],T(v[0][3].z),madd(v_n[1],T(v[1][3].z),madd(v_n[2],T(v[2][3].z),v_n[3] * T(v[3][3].z))));
+        const T z = madd(u_n[0],curve0_z,madd(u_n[1],curve1_z,madd(u_n[2],curve2_z,u_n[3] * curve3_z)));
+        // Assemble the three independently computed members into the result vector.
+        return Vec3<T>(x,y,z);
+      }
+      
+      template<typename T>
+      __forceinline Vec3<T> eval(const T& uu, const T& vv) const
+      { // Uses BSplineBasis::eval weights in both parameter directions.
+        const Vec4<T> weights_u = BSplineBasis::eval(uu);
+        const Vec4<T> weights_v = BSplineBasis::eval(vv);
+        return eval(uu,vv,weights_u,weights_v);
+      }
+
+      template<typename T>
+      __forceinline Vec3<T> eval_du(const T& uu, const T& vv) const
+      { // Uses BSplineBasis::derivative weights in u and BSplineBasis::eval weights in v.
+        const Vec4<T> weights_u = BSplineBasis::derivative(uu);
+        const Vec4<T> weights_v = BSplineBasis::eval(vv);
+        return eval(uu,vv,weights_u,weights_v);
+      }
+      
+      template<typename T>
+      __forceinline Vec3<T> eval_dv(const T& uu, const T& vv) const
+      { // Uses BSplineBasis::eval weights in u and BSplineBasis::derivative weights in v.
+        const Vec4<T> weights_u = BSplineBasis::eval(uu);
+        const Vec4<T> weights_v = BSplineBasis::derivative(vv);
+        return eval(uu,vv,weights_u,weights_v);
+      }
+
+      template<typename T>
+      __forceinline Vec3<T> eval_dudu(const T& uu, const T& vv) const
+      { // Uses BSplineBasis::derivative2 weights in u and BSplineBasis::eval weights in v.
+        const Vec4<T> weights_u = BSplineBasis::derivative2(uu);
+        const Vec4<T> weights_v = BSplineBasis::eval(vv);
+        return eval(uu,vv,weights_u,weights_v);
+      }
+
+      template<typename T>
+      __forceinline Vec3<T> eval_dvdv(const T& uu, const T& vv) const
+      { // Uses BSplineBasis::eval weights in u and BSplineBasis::derivative2 weights in v.
+        const Vec4<T> weights_u = BSplineBasis::eval(uu);
+        const Vec4<T> weights_v = BSplineBasis::derivative2(vv);
+        return eval(uu,vv,weights_u,weights_v);
+      }
+
+      template<typename T>
+      __forceinline Vec3<T> eval_dudv(const T& uu, const T& vv) const
+      { // Uses BSplineBasis::derivative weights in both parameter directions.
+        const Vec4<T> weights_u = BSplineBasis::derivative(uu);
+        const Vec4<T> weights_v = BSplineBasis::derivative(vv);
+        return eval(uu,vv,weights_u,weights_v);
+      }
+      
+      template<typename T>
+      __forceinline Vec3<T> normal(const T& uu, const T& vv) const { // cross product of the u- and v-derivative evaluations at (uu,vv)
+        return cross(eval_du(uu,vv),eval_dv(uu,vv)); // no normalization is applied here
+      }
+
+      void eval(const float u, const float v, 
+                Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, 
+                const float dscale = 1.0f) const // writes the requested outputs at (u,v); first derivatives are multiplied by dscale, second derivatives by sqr(dscale)
+      {
+        if (P) { // position is written only when P is non-null
+          *P = eval(u,v); 
+        }
+        if (dPdu) { // dPdu alone gates both first derivatives; dPdv is only checked by assert
+          assert(dPdu); *dPdu = eval_du(u,v)*dscale; 
+          assert(dPdv); *dPdv = eval_dv(u,v)*dscale; 
+        }
+        if (ddPdudu) { // ddPdudu alone gates all three second derivatives; the other two pointers are only checked by assert
+          assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); 
+          assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); 
+          assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); 
+        }
+      }
+
+      template<class vfloat>
+      __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv, const Vec4<vfloat>& u_n, const Vec4<vfloat>& v_n) const // uu/vv are not read in this body
+      { // Evaluates only component i of the control points (v[row][col][i]) with the supplied weights.
+        const vfloat curve0_x = madd(v_n[0],vfloat(v[0][0][i]),madd(v_n[1],vfloat(v[1][0][i]),madd(v_n[2],vfloat(v[2][0][i]),v_n[3] * vfloat(v[3][0][i])))); // named *_x but holds component i of column 0
+        const vfloat curve1_x = madd(v_n[0],vfloat(v[0][1][i]),madd(v_n[1],vfloat(v[1][1][i]),madd(v_n[2],vfloat(v[2][1][i]),v_n[3] * vfloat(v[3][1][i])))); // column 1
+        const vfloat curve2_x = madd(v_n[0],vfloat(v[0][2][i]),madd(v_n[1],vfloat(v[1][2][i]),madd(v_n[2],vfloat(v[2][2][i]),v_n[3] * vfloat(v[3][2][i])))); // column 2
+        const vfloat curve3_x = madd(v_n[0],vfloat(v[0][3][i]),madd(v_n[1],vfloat(v[1][3][i]),madd(v_n[2],vfloat(v[2][3][i]),v_n[3] * vfloat(v[3][3][i])))); // column 3
+        return madd(u_n[0],curve0_x,madd(u_n[1],curve1_x,madd(u_n[2],curve2_x,u_n[3] * curve3_x))); // columns weighted by u_n
+      }
+        
+      template<typename vbool, typename vfloat>
+      void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, 
+                float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, 
+                const float dscale, const size_t dstride, const size_t N) const // per output: components 0..N-1 are stored at ptr+i*dstride via vfloat::store(valid,...)
+      {
+        if (P) { // position is written only when P is non-null
+          const Vec4<vfloat> u_n = BSplineBasis::eval(uu); 
+          const Vec4<vfloat> v_n = BSplineBasis::eval(vv); 
+          for (size_t i=0; i<N; i++) vfloat::store(valid,P+i*dstride,eval(i,uu,vv,u_n,v_n));
+        }
+        if (dPdu) // dPdu alone gates both first derivatives; dPdv is only checked by assert
+        {
+          { // d/du: derivative weights in u, value weights in v, multiplied by dscale
+            assert(dPdu);
+            const Vec4<vfloat> u_n = BSplineBasis::derivative(uu); 
+            const Vec4<vfloat> v_n = BSplineBasis::eval(vv);
+            for (size_t i=0; i<N; i++) vfloat::store(valid,dPdu+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale);
+          }
+          { // d/dv: value weights in u, derivative weights in v, multiplied by dscale
+            assert(dPdv);
+            const Vec4<vfloat> u_n = BSplineBasis::eval(uu); 
+            const Vec4<vfloat> v_n = BSplineBasis::derivative(vv);
+            for (size_t i=0; i<N; i++) vfloat::store(valid,dPdv+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale);
+          }
+        }
+        if (ddPdudu) // ddPdudu alone gates all three second derivatives; the other two pointers are only checked by assert
+        {
+          { // d2/du2: derivative2 weights in u, value weights in v, multiplied by sqr(dscale)
+            assert(ddPdudu);
+            const Vec4<vfloat> u_n = BSplineBasis::derivative2(uu); 
+            const Vec4<vfloat> v_n = BSplineBasis::eval(vv);
+            for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudu+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale));
+          }
+          { // d2/dv2: value weights in u, derivative2 weights in v, multiplied by sqr(dscale)
+            assert(ddPdvdv);
+            const Vec4<vfloat> u_n = BSplineBasis::eval(uu); 
+            const Vec4<vfloat> v_n = BSplineBasis::derivative2(vv);
+            for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdvdv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale));
+          }
+          { // d2/dudv: derivative weights in both u and v, multiplied by sqr(dscale)
+            assert(ddPdudv);
+            const Vec4<vfloat> u_n = BSplineBasis::derivative(uu); 
+            const Vec4<vfloat> v_n = BSplineBasis::derivative(vv);
+            for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale));
+          }
+        }
+      }
+
+      friend __forceinline embree_ostream operator<<(embree_ostream o, const BSplinePatchT& p)
+      { // Streams all 16 control points, one per line, each prefixed with its two grid indices.
+        for (size_t row=0; row<4; row++)
+          for (size_t col=0; col<4; col++)
+            o << "[" << row << "][" << col << "] " << p.v[row][col] << embree_endl;
+        return o;
+      }
+
+    public:
+      Vertex v[4][4];
+    };
+  
+  typedef BSplinePatchT<Vec3fa,Vec3fa_t> BSplinePatch3fa;
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_coefficients.h b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_coefficients.h
new file mode 100644
index 0000000000..05031cf6b9
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_coefficients.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/geometry.h"
+
+namespace embree
+{
+  static const size_t MAX_PATCH_VALENCE = 16;         //!< maximum number of vertices of a patch
+  static const size_t MAX_RING_FACE_VALENCE = 64;     //!< maximum number of faces per ring; also the largest index of the coefficient tables below
+  static const size_t MAX_RING_EDGE_VALENCE = 2*64;   //!< maximum number of edges per ring (written as 2*64, i.e. twice MAX_RING_FACE_VALENCE)
+
+  class CatmullClarkPrecomputedCoefficients // lookup tables of valence-dependent coefficients, read through the public accessors below
+  {
+  private:
+    // Tables are indexed by n in [0, MAX_RING_FACE_VALENCE]; presumably filled by the constructor (defined elsewhere) - TODO confirm.
+    float table_cos_2PI_div_n[MAX_RING_FACE_VALENCE+1];
+
+    float* table_limittangent_a[MAX_RING_FACE_VALENCE+1]; // one float array per n, read as [n][i] with i < n
+    float* table_limittangent_b[MAX_RING_FACE_VALENCE+1]; // one float array per n, read as [n][i] with i < n
+    float table_limittangent_c[MAX_RING_FACE_VALENCE+1];
+
+    __forceinline float set_cos_2PI_div_n(const size_t n) { // computes cos(2*pi/n) directly
+      if (unlikely(n == 0)) return 1.0f; // avoids the division by zero below
+      return cosf(2.0f*float(pi)/(float)n); 
+    }
+
+    __forceinline float set_limittangent_a(const size_t i, const size_t n)  // computes cos(2*pi*i/n) * c1
+    { 
+      if (unlikely(n == 0)) return 1.0f; // avoids the divisions by zero below
+      const float c0 = 1.0f/(float)n * 1.0f / sqrtf(4.0f + cosf(float(pi)/(float)n)*cosf(float(pi)/(float)n)); // (1/n) / sqrt(4 + cos^2(pi/n))
+      const float c1 = (1.0f/(float)n + cosf(float(pi)/(float)n) * c0); // 1/n + cos(pi/n)*c0
+      return cosf(2.0f*float(pi)*(float)i/(float)n) * c1;
+    }
+
+    __forceinline float set_limittangent_b(const size_t i, const size_t n)  // computes cos((2*pi*i + pi)/n) * c0
+    { 
+      if (unlikely(n == 0)) return 1.0f; // avoids the divisions by zero below
+      const float c0 = 1.0f/(float)n * 1.0f / sqrtf(4.0f + cosf(float(pi)/(float)n)*cosf(float(pi)/(float)n)); // same c0 as in set_limittangent_a
+      return cosf((2.0f*float(pi)*i+float(pi))/(float)n) * c0;
+    }
+
+    __forceinline float set_limittangent_c(const size_t n)  // computes (2/16) * (5 + cos(2*pi/n) + cos(pi/n)*sqrt(18 + 2*cos(2*pi/n)))
+    { 
+      if (unlikely(n == 0)) return 1.0f; // avoids the divisions by zero below
+      return 2.0f/16.0f * (5.0f + cosf(2.0f*float(pi)/(float)n) + cosf(float(pi)/(float)n) * sqrtf(18.0f+2.0f*cosf(2.0f*float(pi)/(float)n)));
+    }
+
+  public:
+
+    __forceinline float cos_2PI_div_n(const size_t n) // table lookup for n <= MAX_RING_FACE_VALENCE, direct computation otherwise
+    {
+      if (likely(n <= MAX_RING_FACE_VALENCE))
+        return table_cos_2PI_div_n[n];
+      else
+        return set_cos_2PI_div_n(n);
+    }
+
+    __forceinline float limittangent_a(const size_t i, const size_t n) // table-only lookup: no fallback computation, bounds are checked by assert only
+    {
+      assert(n <= MAX_RING_FACE_VALENCE);
+      assert(i < n);
+      return table_limittangent_a[n][i];
+    }
+
+    __forceinline float limittangent_b(const size_t i, const size_t n) // table-only lookup: no fallback computation, bounds are checked by assert only
+    {
+      assert(n <= MAX_RING_FACE_VALENCE);
+      assert(i < n);
+      return table_limittangent_b[n][i];
+    }
+
+    __forceinline float limittangent_c(const size_t n) // table-only lookup: no fallback computation, bound is checked by assert only
+    {
+      assert(n <= MAX_RING_FACE_VALENCE);
+      return table_limittangent_c[n];
+    }
+
+    static CatmullClarkPrecomputedCoefficients table; // shared instance, declared here and defined elsewhere
+ 
+    CatmullClarkPrecomputedCoefficients();    
+    ~CatmullClarkPrecomputedCoefficients();    
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_patch.h
new file mode 100644
index 0000000000..ab1d63594a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_patch.h
@@ -0,0 +1,562 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_ring.h"
+#include "bezier_curve.h"
+
+namespace embree
+{
+  template<typename Vertex, typename Vertex_t = Vertex>
+    class __aligned(64) CatmullClarkPatchT // four-cornered patch described by one CatmullClark1RingT per corner
+    {
+    public:
+    typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring;
+    typedef typename CatmullClark1Ring::Type Type;
+    
+    array_t<CatmullClark1RingT<Vertex,Vertex_t>,4> ring; // ring[i] is initialized from half edge first_half_edge+i (see init)
+    
+    public:
+    __forceinline CatmullClarkPatchT () {} // leaves the rings as default-constructed by array_t
+
+    __forceinline CatmullClarkPatchT (const HalfEdge* first_half_edge, const char* vertices, size_t stride) {
+      init(first_half_edge,vertices,stride);
+    }
+    
+    __forceinline CatmullClarkPatchT (const HalfEdge* first_half_edge, const BufferView<Vec3fa>& vertices) {
+      init(first_half_edge,vertices.getPtr(),vertices.getStride());
+    }
+    
+    __forceinline void init (const HalfEdge* first_half_edge, const char* vertices, size_t stride) // reads four consecutive half edges starting at first_half_edge
+    {
+      for (unsigned i=0; i<4; i++)
+        ring[i].init(first_half_edge+i,vertices,stride);
+
+      assert(verify());
+    }
+
+    __forceinline size_t bytes() const { // sum of bytes() over the four rings
+      return ring[0].bytes()+ring[1].bytes()+ring[2].bytes()+ring[3].bytes();
+    }
+
+    __forceinline void serialize(void* ptr, size_t& ofs) const // rings are written in order 0..3; ofs is passed by reference to each ring
+    {
+      for (size_t i=0; i<4; i++)
+        ring[i].serialize((char*)ptr,ofs);
+    }
+
+    __forceinline void deserialize(void* ptr) // mirrors serialize, but always starts at offset 0
+    {
+      size_t ofs = 0;
+      for (size_t i=0; i<4; i++)
+        ring[i].deserialize((char*)ptr,ofs);
+    }
+
+    __forceinline BBox3fa bounds() const // box extended over the bounds() of all four rings
+    {
+      BBox3fa bounds (ring[0].bounds());
+      for (size_t i=1; i<4; i++)
+	bounds.extend(ring[i].bounds());
+      return bounds;
+    }
+    
+    __forceinline Type type() const // combines the four ring types: TYPE_CREASES bits are OR-ed, all other bits are AND-ed
+    {
+      const int ty0 = ring[0].type() ^ CatmullClark1Ring::TYPE_CREASES; // flip the TYPE_CREASES bits so the AND below acts as an OR on them
+      const int ty1 = ring[1].type() ^ CatmullClark1Ring::TYPE_CREASES;
+      const int ty2 = ring[2].type() ^ CatmullClark1Ring::TYPE_CREASES;
+      const int ty3 = ring[3].type() ^ CatmullClark1Ring::TYPE_CREASES;
+      return (Type) ((ty0 & ty1 & ty2 & ty3) ^ CatmullClark1Ring::TYPE_CREASES); // flip the TYPE_CREASES bits back
+    }
+    
+    __forceinline bool isFinalResolution(float res) const { // true only if all four rings report final resolution for res
+      return ring[0].isFinalResolution(res) && ring[1].isFinalResolution(res) && ring[2].isFinalResolution(res) && ring[3].isFinalResolution(res);
+    }
+    
+    static __forceinline void init_regular(const CatmullClark1RingT<Vertex,Vertex_t>& p0, // fills dest0/dest1 as valence-4 rings (8 ring entries, no border) around p0.ring[0]
+					   const CatmullClark1RingT<Vertex,Vertex_t>& p1,
+					   CatmullClark1RingT<Vertex,Vertex_t>& dest0,
+					   CatmullClark1RingT<Vertex,Vertex_t>& dest1) 
+    {
+      assert(p1.face_valence > 2);
+      dest1.vertex_level = dest0.vertex_level = p0.edge_level;
+      dest1.face_valence = dest0.face_valence = 4;
+      dest1.edge_valence = dest0.edge_valence = 8;
+      dest1.border_index = dest0.border_index = -1;
+      dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0];
+      dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f;
+      
+      dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; // both rings hold the same 8 vertices; dest1 index = (dest0 index + 2) mod 8
+      dest1.ring[1] = dest0.ring[7] = (Vertex_t)p1.ring[0];
+      dest1.ring[0] = dest0.ring[6] = (Vertex_t)p1.vtx;
+      dest1.ring[7] = dest0.ring[5] = (Vertex_t)p1.ring[4];
+      dest1.ring[6] = dest0.ring[4] = (Vertex_t)p0.ring[p0.edge_valence-1];
+      dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.edge_valence-2];
+      dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx;
+      dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2];
+      
+      dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f;
+      dest1.crease_weight[0] = dest0.crease_weight[3] = p1.crease_weight[1];
+      dest1.crease_weight[3] = dest0.crease_weight[2] = 0.0f;
+      dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0];
+      
+      if (p0.eval_unique_identifier <= p1.eval_unique_identifier) // both dest rings take the smaller identifier; the start indices depend on which source supplied it
+      {
+        dest0.eval_start_index = 3;
+        dest1.eval_start_index = 0;
+        dest0.eval_unique_identifier = p0.eval_unique_identifier;
+        dest1.eval_unique_identifier = p0.eval_unique_identifier;
+      }
+      else
+      {
+        dest0.eval_start_index = 1;
+        dest1.eval_start_index = 2;
+        dest0.eval_unique_identifier = p1.eval_unique_identifier;
+        dest1.eval_unique_identifier = p1.eval_unique_identifier;
+      }
+    }    
+    
+    static __forceinline void init_border(const CatmullClark1RingT<Vertex,Vertex_t> &p0, // border variant: fills dest0/dest1 with 3 faces, 6 ring entries and a border index
+                                          const CatmullClark1RingT<Vertex,Vertex_t> &p1,
+                                          CatmullClark1RingT<Vertex,Vertex_t> &dest0,
+                                          CatmullClark1RingT<Vertex,Vertex_t> &dest1) 
+    {
+      dest1.vertex_level = dest0.vertex_level = p0.edge_level;
+      dest1.face_valence = dest0.face_valence = 3;
+      dest1.edge_valence = dest0.edge_valence = 6;
+      dest0.border_index = 2;
+      dest1.border_index = 4;
+      dest1.vtx  = dest0.vtx = (Vertex_t)p0.ring[0];
+      dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f;
+      
+      dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; // both rings hold the same 6 vertices; dest1 index = (dest0 index + 2) mod 6
+      dest1.ring[1] = dest0.ring[5] = (Vertex_t)p1.ring[0];
+      dest1.ring[0] = dest0.ring[4] = (Vertex_t)p1.vtx;
+      dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.border_index+1]; // dummy
+      dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx;
+      dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2];
+      
+      dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f;
+      dest1.crease_weight[0] = dest0.crease_weight[2] = p1.crease_weight[1];
+      dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0];
+      
+      if (p0.eval_unique_identifier <= p1.eval_unique_identifier) // both dest rings take the smaller identifier; the start indices depend on which source supplied it
+      {
+        dest0.eval_start_index = 1;
+        dest1.eval_start_index = 2;
+        dest0.eval_unique_identifier = p0.eval_unique_identifier;
+        dest1.eval_unique_identifier = p0.eval_unique_identifier;
+      }
+      else
+      {
+        dest0.eval_start_index = 2;
+        dest1.eval_start_index = 0;
+        dest0.eval_unique_identifier = p1.eval_unique_identifier;
+        dest1.eval_unique_identifier = p1.eval_unique_identifier;
+      }
+    }
+    
+    static __forceinline void init_regular(const Vertex_t &center, const Vertex_t center_ring[8], const unsigned int offset, CatmullClark1RingT<Vertex,Vertex_t> &dest) // valence-4 ring around center; ring entries are center_ring rotated by offset
+    {
+      dest.vertex_level = 0.0f;
+      dest.face_valence = 4;
+      dest.edge_valence = 8;
+      dest.border_index = -1;
+      dest.vtx     = (Vertex_t)center;
+      dest.vertex_crease_weight = 0.0f;
+      for (size_t i=0; i<8; i++) 
+	dest.ring[i] = (Vertex_t)center_ring[(offset+i)%8];
+      for (size_t i=0; i<4; i++) 
+        dest.crease_weight[i] = 0.0f;
+      
+      dest.eval_start_index = (8-offset)>>1;
+      if (dest.eval_start_index >= dest.face_valence) dest.eval_start_index -= dest.face_valence; // wraps the offset==0 case (index 4) back to 0
+      assert( dest.eval_start_index < dest.face_valence );
+      dest.eval_unique_identifier = 0;
+    }
+    
+    __noinline void subdivide(array_t<CatmullClarkPatchT,4>& patch) const // fills patch[0..3] with the four child patches of this patch
+    {
+      ring[0].subdivide(patch[0].ring[0]); // child k keeps the subdivided corner ring k in its own slot k
+      ring[1].subdivide(patch[1].ring[1]);
+      ring[2].subdivide(patch[2].ring[2]);
+      ring[3].subdivide(patch[3].ring[3]);
+      
+      patch[0].ring[0].edge_level = 0.5f*ring[0].edge_level; // edge levels: half of one parent level, or a quarter of the sum of two parent levels
+      patch[0].ring[1].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level);
+      patch[0].ring[2].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level);
+      patch[0].ring[3].edge_level = 0.5f*ring[3].edge_level;
+      
+      patch[1].ring[0].edge_level = 0.5f*ring[0].edge_level;
+      patch[1].ring[1].edge_level = 0.5f*ring[1].edge_level;
+      patch[1].ring[2].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level);
+      patch[1].ring[3].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level);
+      
+      patch[2].ring[0].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level);
+      patch[2].ring[1].edge_level = 0.5f*ring[1].edge_level;
+      patch[2].ring[2].edge_level = 0.5f*ring[2].edge_level;
+      patch[2].ring[3].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level);
+      
+      patch[3].ring[0].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level);
+      patch[3].ring[1].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level);
+      patch[3].ring[2].edge_level = 0.5f*ring[2].edge_level;
+      patch[3].ring[3].edge_level = 0.5f*ring[3].edge_level;
+      
+      const bool regular0 = ring[0].has_last_face() && ring[1].face_valence > 2; // for each pair of neighbouring corners, build the two rings the adjacent children share
+      if (likely(regular0))
+        init_regular(patch[0].ring[0],patch[1].ring[1],patch[0].ring[1],patch[1].ring[0]);
+      else
+        init_border(patch[0].ring[0],patch[1].ring[1],patch[0].ring[1],patch[1].ring[0]);
+      
+      const bool regular1 = ring[1].has_last_face() && ring[2].face_valence > 2;
+      if (likely(regular1))
+        init_regular(patch[1].ring[1],patch[2].ring[2],patch[1].ring[2],patch[2].ring[1]);
+      else
+        init_border(patch[1].ring[1],patch[2].ring[2],patch[1].ring[2],patch[2].ring[1]);
+      
+      const bool regular2 = ring[2].has_last_face() && ring[3].face_valence > 2;
+      if (likely(regular2))
+        init_regular(patch[2].ring[2],patch[3].ring[3],patch[2].ring[3],patch[3].ring[2]);
+      else
+        init_border(patch[2].ring[2],patch[3].ring[3],patch[2].ring[3],patch[3].ring[2]);
+      
+      const bool regular3 = ring[3].has_last_face() && ring[0].face_valence > 2;
+      if (likely(regular3))
+        init_regular(patch[3].ring[3],patch[0].ring[0],patch[3].ring[0],patch[0].ring[3]);
+      else
+        init_border(patch[3].ring[3],patch[0].ring[0],patch[3].ring[0],patch[0].ring[3]);
+      
+      Vertex_t center = (ring[0].vtx + ring[1].vtx + ring[2].vtx + ring[3].vtx) * 0.25f; // average of the four parent (pre-subdivision) corner vertices
+
+      Vertex_t center_ring[8]; // vertices around the center, taken from the already subdivided corner rings
+      center_ring[0] = (Vertex_t)patch[3].ring[3].ring[0];
+      center_ring[7] = (Vertex_t)patch[3].ring[3].vtx;
+      center_ring[6] = (Vertex_t)patch[2].ring[2].ring[0];
+      center_ring[5] = (Vertex_t)patch[2].ring[2].vtx;
+      center_ring[4] = (Vertex_t)patch[1].ring[1].ring[0];
+      center_ring[3] = (Vertex_t)patch[1].ring[1].vtx;
+      center_ring[2] = (Vertex_t)patch[0].ring[0].ring[0];
+      center_ring[1] = (Vertex_t)patch[0].ring[0].vtx;
+      
+      init_regular(center,center_ring,0,patch[0].ring[2]); // each child gets the center ring in the slot opposite its own corner, rotated by 2 per child
+      init_regular(center,center_ring,2,patch[1].ring[3]);
+      init_regular(center,center_ring,4,patch[2].ring[0]);
+      init_regular(center,center_ring,6,patch[3].ring[1]);
+      
+      assert(patch[0].verify());
+      assert(patch[1].verify());
+      assert(patch[2].verify());
+      assert(patch[3].verify());
+    }
+    
+    bool verify() const { // true when every ring reports valid positions
+      return ring[0].hasValidPositions() && ring[1].hasValidPositions() && ring[2].hasValidPositions() && ring[3].hasValidPositions();
+    }
+    
+    __forceinline void init( FinalQuad& quad ) const // copies the four corner vertices into quad.vtx[0..3]
+    {
+      quad.vtx[0] = (Vertex_t)ring[0].vtx;
+      quad.vtx[1] = (Vertex_t)ring[1].vtx;
+      quad.vtx[2] = (Vertex_t)ring[2].vtx;
+      quad.vtx[3] = (Vertex_t)ring[3].vtx;
+    };
+    
+    friend __forceinline embree_ostream operator<<(embree_ostream o, const CatmullClarkPatchT &p) // debug print of the four rings
+    {
+      o << "CatmullClarkPatch { " << embree_endl;
+      for (size_t i=0; i<4; i++)
+	o << "ring" << i << ": " << p.ring[i] << embree_endl;
+      o << "}" << embree_endl;
+      return o;
+    }
+    };
+  
+  typedef CatmullClarkPatchT<Vec3fa,Vec3fa_t> CatmullClarkPatch3fa; // instantiation with Vertex = Vec3fa and Vertex_t = Vec3fa_t
+  
+  template<typename Vertex, typename Vertex_t = Vertex>
+    class __aligned(64) GeneralCatmullClarkPatchT
+    {
+    public:
+    typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch; // patch type produced by subdivide() and init(CatmullClarkPatch&)
+    typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring;
+    typedef BezierCurveT<Vertex> BezierCurve;
+
+    static const unsigned SIZE = MAX_PATCH_VALENCE; // upper bound on the number of rings that init() fills
+    DynamicStackArray<GeneralCatmullClark1RingT<Vertex,Vertex_t>,8,SIZE> ring; // one ring per half edge visited by init()
+    unsigned N; // number of rings in use: 0 after default construction, otherwise set by init()
+    
+    __forceinline GeneralCatmullClarkPatchT () 
+    : N(0) {} // empty patch: no rings in use
+    
+    GeneralCatmullClarkPatchT (const HalfEdge* h, const char* vertices, size_t stride) { // builds the patch from the half-edge loop starting at h
+      init(h,vertices,stride); // N is assigned inside init()
+    }
+
+    __forceinline GeneralCatmullClarkPatchT (const HalfEdge* first_half_edge, const BufferView<Vec3fa>& vertices) { // same as the pointer/stride constructor, taking both from the buffer view
+      init(first_half_edge,vertices.getPtr(),vertices.getStride());
+    }
+
+    __forceinline void init (const HalfEdge* h, const char* vertices, size_t stride) // walks edge->next() from h and initializes one ring per visited half edge
+    {
+      unsigned int i = 0;
+      const HalfEdge* edge = h; 
+      do {
+        ring[i].init(edge,vertices,stride);
+        edge = edge->next();
+        i++;
+      } while ((edge != h) && (i < SIZE)); // stops when the loop closes or after SIZE rings; NOTE(review): longer loops are cut off without any error
+      N = i; // number of rings actually initialized (at least 1, at most SIZE)
+    }
+
+    __forceinline unsigned size() const { // number of rings currently in use
+      return N; 
+    }
+    
+    __forceinline bool isQuadPatch() const { // true when there are exactly four rings and each has its only_quads flag set
+      return (N == 4) && ring[0].only_quads && ring[1].only_quads && ring[2].only_quads && ring[3].only_quads; // N is tested first, so rings 0..3 are only read when N == 4
+    }
+
+    static __forceinline void init_regular(const CatmullClark1RingT<Vertex,Vertex_t>& p0, // fills dest0/dest1 as valence-4 rings (8 ring entries, no border) around p0.ring[0]
+					   const CatmullClark1RingT<Vertex,Vertex_t>& p1,
+					   CatmullClark1RingT<Vertex,Vertex_t>& dest0,
+					   CatmullClark1RingT<Vertex,Vertex_t>& dest1) 
+    {
+      assert(p1.face_valence > 2);
+      dest1.vertex_level = dest0.vertex_level = p0.edge_level;
+      dest1.face_valence = dest0.face_valence = 4;
+      dest1.edge_valence = dest0.edge_valence = 8;
+      dest1.border_index = dest0.border_index = -1;
+      dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0];
+      dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f;
+      
+      dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; // both rings hold the same 8 vertices; dest1 index = (dest0 index + 2) mod 8
+      dest1.ring[1] = dest0.ring[7] = (Vertex_t)p1.ring[0];
+      dest1.ring[0] = dest0.ring[6] = (Vertex_t)p1.vtx;
+      dest1.ring[7] = dest0.ring[5] = (Vertex_t)p1.ring[4];
+      dest1.ring[6] = dest0.ring[4] = (Vertex_t)p0.ring[p0.edge_valence-1];
+      dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.edge_valence-2];
+      dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx;
+      dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2];
+      
+      dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f;
+      dest1.crease_weight[0] = dest0.crease_weight[3] = p1.crease_weight[1];
+      dest1.crease_weight[3] = dest0.crease_weight[2] = 0.0f;
+      dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0];
+      
+      if (p0.eval_unique_identifier <= p1.eval_unique_identifier) // both dest rings take the smaller identifier; the start indices depend on which source supplied it
+      {
+        dest0.eval_start_index = 3;
+        dest1.eval_start_index = 0;
+        dest0.eval_unique_identifier = p0.eval_unique_identifier;
+        dest1.eval_unique_identifier = p0.eval_unique_identifier;
+      }
+      else
+      {
+        dest0.eval_start_index = 1;
+        dest1.eval_start_index = 2;
+        dest0.eval_unique_identifier = p1.eval_unique_identifier;
+        dest1.eval_unique_identifier = p1.eval_unique_identifier;
+      }      
+    }
+    
+    
+    static __forceinline void init_border(const CatmullClark1RingT<Vertex,Vertex_t> &p0, // border variant: fills dest0/dest1 with 3 faces, 6 ring entries and a border index
+                                          const CatmullClark1RingT<Vertex,Vertex_t> &p1,
+                                          CatmullClark1RingT<Vertex,Vertex_t> &dest0,
+                                          CatmullClark1RingT<Vertex,Vertex_t> &dest1) 
+    {
+      dest1.vertex_level = dest0.vertex_level = p0.edge_level;
+      dest1.face_valence = dest0.face_valence = 3;
+      dest1.edge_valence = dest0.edge_valence = 6;
+      dest0.border_index = 2;
+      dest1.border_index = 4;
+      dest1.vtx  = dest0.vtx = (Vertex_t)p0.ring[0];
+      dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f;
+      
+      dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; // both rings hold the same 6 vertices; dest1 index = (dest0 index + 2) mod 6
+      dest1.ring[1] = dest0.ring[5] = (Vertex_t)p1.ring[0];
+      dest1.ring[0] = dest0.ring[4] = (Vertex_t)p1.vtx;
+      dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.border_index+1]; // dummy
+      dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx;
+      dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2];
+      
+      dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f;
+      dest1.crease_weight[0] = dest0.crease_weight[2] = p1.crease_weight[1];
+      dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0];
+      
+      if (p0.eval_unique_identifier <= p1.eval_unique_identifier) // both dest rings take the smaller identifier; the start indices depend on which source supplied it
+      {
+        dest0.eval_start_index = 1;
+        dest1.eval_start_index = 2;
+        dest0.eval_unique_identifier = p0.eval_unique_identifier;
+        dest1.eval_unique_identifier = p0.eval_unique_identifier;
+      }
+      else
+      {
+        dest0.eval_start_index = 2;
+        dest1.eval_start_index = 0;
+        dest0.eval_unique_identifier = p1.eval_unique_identifier;
+        dest1.eval_unique_identifier = p1.eval_unique_identifier;
+      }
+    }
+    
+    static __forceinline void init_regular(const Vertex_t &center, const array_t<Vertex_t,2*SIZE>& center_ring, const float vertex_level, const unsigned int N, const unsigned int offset, CatmullClark1RingT<Vertex,Vertex_t> &dest) // ring of N faces / 2*N entries around center, read from center_ring rotated by offset
+    {
+      assert(N<(MAX_RING_FACE_VALENCE));
+      assert(2*N<(MAX_RING_EDGE_VALENCE));
+      dest.vertex_level = vertex_level;
+      dest.face_valence = N;
+      dest.edge_valence = 2*N;
+      dest.border_index = -1;
+      dest.vtx     = (Vertex_t)center;
+      dest.vertex_crease_weight = 0.0f;
+      for (unsigned i=0; i<2*N; i++) {
+        dest.ring[i] = (Vertex_t)center_ring[(2*N+offset+i-1)%(2*N)]; // index offset+i-1 wrapped into [0,2*N); adding 2*N first keeps the unsigned value from underflowing
+        assert(isvalid(dest.ring[i]));
+      }
+      for (unsigned i=0; i<N; i++) 
+        dest.crease_weight[i] = 0.0f;
+      
+      assert(offset <= 2*N);
+      dest.eval_start_index = (2*N-offset)>>1;
+      if (dest.eval_start_index >= dest.face_valence) dest.eval_start_index -= dest.face_valence; // wraps the offset==0 case (index N) back to 0
+      
+      assert( dest.eval_start_index < dest.face_valence );
+      dest.eval_unique_identifier = 0;
+    }
+    
+    __noinline void subdivide(array_t<CatmullClarkPatch,SIZE>& patch, unsigned& N_o) const // fills patch[0..N-1] with one child patch per corner and reports N through N_o
+    {
+      N_o = N;
+      assert( N );
+      for (unsigned i=0; i<N; i++) { // pass 1: subdivide each corner ring into slot 0 of child i and set the two halved edge levels
+        unsigned ip1 = (i+1)%N; // FIXME: %
+        ring[i].subdivide(patch[i].ring[0]);
+        patch[i]  .ring[0].edge_level = 0.5f*ring[i].edge_level;
+        patch[ip1].ring[3].edge_level = 0.5f*ring[i].edge_level;
+        
+	assert( patch[i].ring[0].hasValidPositions() );
+        
+      }
+      assert(N < 2*SIZE);
+      Vertex_t center = Vertex_t(0.0f); // accumulates the corner vertices; divided by N after the loop
+      array_t<Vertex_t,2*SIZE> center_ring; // two entries per corner, written at 2*i and 2*i+1 below
+      float center_vertex_level = 2.0f; // guarantees that irregular vertices get always isolated also for non-quads
+      
+      for (unsigned i=0; i<N; i++) // pass 2: build the rings shared by children i and ip1, and collect the center data
+      {
+        unsigned ip1 = (i+1)%N; // FIXME: %
+        unsigned im1 = (i+N-1)%N; // FIXME: %
+        bool regular = ring[i].has_last_face() && ring[ip1].face_valence > 2;
+        if (likely(regular)) init_regular(patch[i].ring[0],patch[ip1].ring[0],patch[i].ring[1],patch[ip1].ring[3]); 
+        else                 init_border (patch[i].ring[0],patch[ip1].ring[0],patch[i].ring[1],patch[ip1].ring[3]);
+        
+	assert( patch[i].ring[1].hasValidPositions() );
+	assert( patch[ip1].ring[3].hasValidPositions() );
+        
+	float level = 0.25f*(ring[im1].edge_level+ring[ip1].edge_level);
+        patch[i].ring[1].edge_level = patch[ip1].ring[2].edge_level = level;
+	center_vertex_level = max(center_vertex_level,level);
+        
+        center += ring[i].vtx; // uses the parent (pre-subdivision) corner vertex
+        center_ring[2*i+0] = (Vertex_t)patch[i].ring[0].vtx;
+        center_ring[2*i+1] = (Vertex_t)patch[i].ring[0].ring[0];
+      }
+      center /= float(N);
+      
+      for (unsigned int i=0; i<N; i++) { // pass 3: slot 2 of every child becomes the ring around the center, rotated by 2*i
+        init_regular(center,center_ring,center_vertex_level,N,2*i,patch[i].ring[2]);
+        
+	assert( patch[i].ring[2].hasValidPositions() );
+      }
+    }
+    
+    void init(CatmullClarkPatch& patch) const
+    {
+      /* only meaningful for quads: corner ring k is converted into ring k of patch */
+      assert(size() == 4);
+      for (unsigned corner=0; corner<4; corner++) {
+        ring[corner].convert(patch.ring[corner]);
+      }
+    }
+    
+    static void fix_quad_ring_order (array_t<CatmullClarkPatch,GeneralCatmullClarkPatchT::SIZE>& patches)
+    {
+      /* patch 1: new ring order is (old3,old0,old1,old2) */
+      const CatmullClark1Ring p1_last = patches[1].ring[3];
+      patches[1].ring[3] = patches[1].ring[2];
+      patches[1].ring[2] = patches[1].ring[1];
+      patches[1].ring[1] = patches[1].ring[0];
+      patches[1].ring[0] = p1_last;
+
+      /* patch 2: exchange rings 0<->2 and 1<->3 */
+      const CatmullClark1Ring p2_first = patches[2].ring[0];
+      patches[2].ring[0] = patches[2].ring[2];
+      patches[2].ring[2] = p2_first;
+      const CatmullClark1Ring p2_second = patches[2].ring[1];
+      patches[2].ring[1] = patches[2].ring[3];
+      patches[2].ring[3] = p2_second;
+      /* patch 3: new ring order is (old1,old2,old3,old0) */
+      const CatmullClark1Ring p3_first = patches[3].ring[0];
+      patches[3].ring[0] = patches[3].ring[1]; patches[3].ring[1] = patches[3].ring[2];
+      patches[3].ring[2] = patches[3].ring[3]; patches[3].ring[3] = p3_first;
+    }
+
+    __forceinline void getLimitBorder(BezierCurve curves[GeneralCatmullClarkPatchT::SIZE]) const
+    {
+      /* one cubic Bezier curve per patch edge; the end point of a curve is reused as start of the next */
+      Vertex first = ring[0].getLimitVertex();
+      for (unsigned edge=0; edge<N; edge++) {
+        const unsigned next = (edge+1 == N) ? 0 : edge+1; /* wrap around after the last corner */
+        const Vertex ctrl1 = madd(1.0f/3.0f,ring[edge].getLimitTangent(),first);
+        const Vertex last  = ring[next].getLimitVertex();
+        const Vertex ctrl2 = madd(1.0f/3.0f,ring[next].getSecondLimitTangent(),last);
+        new (&curves[edge]) BezierCurve(first,ctrl1,ctrl2,last);
+        first = last;
+      }
+    }
+
+    __forceinline void getLimitBorder(BezierCurve curves[2], const unsigned subPatch) const // builds the two cubic Bezier limit curves that share the limit vertex of corner subPatch
+    {
+      const unsigned i0 = subPatch;
+      const Vertex t0_p = ring[i0].getLimitTangent();
+      const Vertex t0_m = ring[i0].getSecondLimitTangent();
+          
+      const unsigned i1 = subPatch+1 == N ? 0 : subPatch+1; // next corner, wrapping around
+      const Vertex t1_p = ring[i1].getLimitTangent(); // NOTE(review): only needed by the commented-out b13 below
+      const Vertex t1_m = ring[i1].getSecondLimitTangent();
+      
+      const unsigned i2 = subPatch == 0 ? N-1 : subPatch-1; // previous corner, wrapping around
+      const Vertex t2_p = ring[i2].getLimitTangent();
+      const Vertex t2_m = ring[i2].getSecondLimitTangent(); // NOTE(review): only feeds b23, which is never read
+      
+      const Vertex b00 = ring[i0].getLimitVertex(); // limit position of this corner
+      const Vertex b03 = ring[i1].getLimitVertex(); // limit position of the next corner
+      const Vertex b33 = ring[i2].getLimitVertex(); // limit position of the previous corner
+      
+      const Vertex b01 = madd(1.0/3.0f,t0_p,b00); // inner control points: end point plus one third of a limit tangent
+      const Vertex b11 = madd(1.0/3.0f,t0_m,b00);
+      
+      //const Vertex b13 = madd(1.0/3.0f,t1_p,b03);
+      const Vertex b02 = madd(1.0/3.0f,t1_m,b03);
+          
+      const Vertex b22 = madd(1.0/3.0f,t2_p,b33);
+      const Vertex b23 = madd(1.0/3.0f,t2_m,b33); // NOTE(review): unused
+          
+      new (&curves[0]) BezierCurve(b00,b01,b02,b03); // curve from this corner to the next corner
+      new (&curves[1]) BezierCurve(b33,b22,b11,b00); // curve from the previous corner to this corner
+    }
+    
+    friend __forceinline embree_ostream operator<<(embree_ostream o, const GeneralCatmullClarkPatchT &p)
+    {
+      /* header line, one line per corner ring, closing brace */
+      o << "GeneralCatmullClarkPatch { " << embree_endl;
+      for (unsigned ringID=0; ringID!=p.N; ++ringID) o << "ring" << ringID << ": " << p.ring[ringID] << embree_endl;
+      o << "}" << embree_endl;
+      return o;
+    }
+    };
+  
+  typedef GeneralCatmullClarkPatchT<Vec3fa,Vec3fa_t> GeneralCatmullClarkPatch3fa; // general patch instantiated with Vertex = Vec3fa and Vertex_t = Vec3fa_t
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_ring.h b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_ring.h
new file mode 100644
index 0000000000..73b41fd4ff
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_ring.h
@@ -0,0 +1,826 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/geometry.h"
+#include "../common/buffer.h"
+#include "half_edge.h"
+#include "catmullclark_coefficients.h"
+
+namespace embree
+{
+  struct __aligned(64) FinalQuad { // plain aggregate of four Vec3fa entries, declared with 64-byte alignment
+    Vec3fa vtx[4]; // the four vertices of the quad
+  };
+
+  template<typename Vertex, typename Vertex_t = Vertex>
+    struct __aligned(64) CatmullClark1RingT
+  {
+    ALIGNED_STRUCT_(64);
+    /* Center vertex plus its 1-ring: ring[2*k] is the neighbour across edge k, ring[2*k+1] the remaining vertex of quad k (see getEdgeCenter/getQuadCenter). */
+    int border_index;                                   //!< edge index where border starts (-1 if the ring has no border)
+    unsigned int face_valence;                          //!< number of adjacent quad faces
+    unsigned int edge_valence;                          //!< number of adjacent edges (2*face_valence)
+    float vertex_crease_weight;                         //!< weight of vertex crease (0 if no vertex crease)
+    DynamicStackArray<float,16,MAX_RING_FACE_VALENCE> crease_weight; //!< edge crease weights for each adjacent edge
+    float vertex_level;                                 //!< maximum level of all adjacent edges
+    float edge_level;                                   //!< level of first edge
+    unsigned int eval_start_index;                      //!< topology dependent index to start evaluation
+    unsigned int eval_unique_identifier;                //!< topology dependent unique identifier for this ring 
+    Vertex vtx;                                         //!< center vertex
+    DynamicStackArray<Vertex,32,MAX_RING_EDGE_VALENCE> ring;  //!< ring of neighboring vertices
+   
+  public:
+    CatmullClark1RingT () 
+    : eval_start_index(0), eval_unique_identifier(0) {} // FIXME: default constructor should be empty
+
+    /*! calculates number of bytes required to serialize this structure (edge_valence is not stored, deserialize() rebuilds it as 2*face_valence) */
+    __forceinline size_t bytes() const
+    {
+      size_t ofs = 0;
+      ofs += sizeof(border_index);
+      ofs += sizeof(face_valence);
+      assert(2*face_valence == edge_valence);
+      ofs += sizeof(vertex_crease_weight);
+      ofs += face_valence*sizeof(float); // one crease weight per face
+      ofs += sizeof(vertex_level);
+      ofs += sizeof(edge_level);
+      ofs += sizeof(eval_start_index);
+      ofs += sizeof(eval_unique_identifier);
+      ofs += sizeof(vtx);
+      ofs += edge_valence*sizeof(Vertex); // the ring vertices
+      return ofs;
+    }
+    /*! writes v to ptr+ofs and advances ofs by sizeof(Ty) */
+    template<typename Ty>
+    static __forceinline void store(char* ptr, size_t& ofs, const Ty& v) {
+      *(Ty*)&ptr[ofs] = v; ofs += sizeof(Ty);
+    }
+    /*! reads v from ptr+ofs and advances ofs by sizeof(Ty) */
+    template<typename Ty>
+    static __forceinline void load(char* ptr, size_t& ofs, Ty& v) {
+      v = *(Ty*)&ptr[ofs]; ofs += sizeof(Ty);
+    }
+
+    /*! serializes the ring to some memory location (ofs is advanced past the written data) */
+    __forceinline void serialize(char* ptr, size_t& ofs) const
+    {
+      store(ptr,ofs,border_index);
+      store(ptr,ofs,face_valence);
+      store(ptr,ofs,vertex_crease_weight);
+      for (size_t i=0; i<face_valence; i++)
+        store(ptr,ofs,crease_weight[i]);
+      store(ptr,ofs,vertex_level);
+      store(ptr,ofs,edge_level);
+      store(ptr,ofs,eval_start_index);
+      store(ptr,ofs,eval_unique_identifier);
+      Vertex_t::storeu(&ptr[ofs],vtx); ofs += sizeof(Vertex);
+      for (size_t i=0; i<edge_valence; i++) {
+        Vertex_t::storeu(&ptr[ofs],ring[i]); ofs += sizeof(Vertex);
+      }
+    }
+
+    /*! deserializes the ring from some memory location (fields are read in the order serialize() writes them) */
+    __forceinline void deserialize(char* ptr, size_t& ofs)
+    {
+      load(ptr,ofs,border_index);
+      load(ptr,ofs,face_valence);
+      edge_valence = 2*face_valence; // not part of the stream
+      load(ptr,ofs,vertex_crease_weight);
+      for (size_t i=0; i<face_valence; i++)
+        load(ptr,ofs,crease_weight[i]);
+      load(ptr,ofs,vertex_level);
+      load(ptr,ofs,edge_level);
+      load(ptr,ofs,eval_start_index);
+      load(ptr,ofs,eval_unique_identifier);
+      vtx = Vertex_t::loadu(&ptr[ofs]); ofs += sizeof(Vertex);
+      for (size_t i=0; i<edge_valence; i++) {
+        ring[i] = Vertex_t::loadu(&ptr[ofs]); ofs += sizeof(Vertex);
+      }
+    }
+    /*! true if a border edge was recorded for this ring */
+    __forceinline bool hasBorder() const {
+      return border_index != -1;
+    }
+    /*! i-th ring vertex counted from the start of the ring */
+    __forceinline const Vertex& front(size_t i) const {
+      assert(edge_valence>i);
+      return ring[i];
+    }
+    /*! ring vertex i slots before the end; NOTE(review): the assert admits i == 0, which reads ring[edge_valence], one past the last ring vertex */
+    __forceinline const Vertex& back(size_t i) const {
+      assert(edge_valence>=i);
+      return ring[edge_valence-i];
+    }
+    /*! false only if the border starts at ring index edge_valence-2 */
+    __forceinline bool has_last_face() const {
+      return (size_t)border_index != (size_t)edge_valence-2;
+    }
+    /*! false only if the border starts at ring index 2*i */
+    __forceinline bool has_opposite_front(size_t i) const {
+      return (size_t)border_index != 2*i;
+    }
+    /*! false only if the border starts at ring index edge_valence-2-2*i */
+    __forceinline bool has_opposite_back(size_t i) const {
+      return (size_t)border_index != ((size_t)edge_valence-2-2*i);
+    }
+    /*! bounding box of the center vertex and all ring vertices */
+    __forceinline BBox3fa bounds() const
+    {
+      BBox3fa bounds ( vtx );
+      for (size_t i = 0; i<edge_valence ; i++)
+	bounds.extend( ring[i] );
+      return bounds;
+    }
+
+    /*! initializes the ring from the half edge structure; vertex k is loaded from vertices+k*stride */
+    __forceinline void init(const HalfEdge* const h, const char* vertices, size_t stride) 
+    {
+      border_index = -1;
+      vtx = Vertex_t::loadu(vertices+h->getStartVertexIndex()*stride);
+      vertex_crease_weight = h->vertex_crease_weight;
+      
+      HalfEdge* p = (HalfEdge*) h; // const is cast away, p is only used to walk the half edges
+
+      unsigned i=0; // next free slot in ring[]
+      unsigned min_vertex_index = (unsigned)-1; // smallest vertex index seen so far
+      unsigned min_vertex_index_face = (unsigned)-1; // value of i>>1 recorded when that smallest index was stored
+      edge_level = p->edge_level;
+      vertex_level = 0.0f;
+
+      do
+      {
+        vertex_level = max(vertex_level,p->edge_level);
+        crease_weight[i/2] = p->edge_crease_weight;
+        assert(p->hasOpposite() || p->edge_crease_weight == float(inf));
+
+        /* store first two vertices of face */
+        p = p->next();
+        const unsigned index0 = p->getStartVertexIndex();
+        ring[i++] = Vertex_t::loadu(vertices+index0*stride);
+        if (index0 < min_vertex_index) { min_vertex_index = index0; min_vertex_index_face = i>>1; }
+        p = p->next();
+
+        const unsigned index1 = p->getStartVertexIndex();
+        ring[i++] = Vertex_t::loadu(vertices+index1*stride);
+        p = p->next();
+       
+        /* continue with next face */
+        if (likely(p->hasOpposite())) 
+          p = p->opposite();
+        
+        /* if there is no opposite go the long way to the other side of the border */
+        else
+        {
+          /* find minimum start vertex */
+          const unsigned index0 = p->getStartVertexIndex();
+          if (index0 < min_vertex_index) { min_vertex_index = index0; min_vertex_index_face = i>>1; }
+
+          /*! mark first border edge and store dummy vertex for face between the two border edges */
+          border_index = i;
+          crease_weight[i/2] = inf; 
+          ring[i++] = Vertex_t::loadu(vertices+index0*stride);
+          ring[i++] = vtx; // dummy vertex
+          	  
+          /*! goto other side of border */
+          p = (HalfEdge*) h;
+          while (p->hasOpposite()) 
+            p = p->opposite()->next();
+        }
+
+      } while (p != h); 
+
+      edge_valence = i; // two ring slots were written per face (including the dummy border face)
+      face_valence = i >> 1;
+      eval_unique_identifier = min_vertex_index;
+      eval_start_index = min_vertex_index_face;
+
+      assert( hasValidPositions() );
+    }
+    /*! performs one subdivision step of this ring and writes the refined ring to dest */
+    __forceinline void subdivide(CatmullClark1RingT& dest) const
+    {
+      dest.edge_level             = 0.5f*edge_level;
+      dest.vertex_level           = 0.5f*vertex_level;
+      dest.face_valence           = face_valence;
+      dest.edge_valence           = edge_valence;
+      dest.border_index           = border_index;
+      dest.vertex_crease_weight   = max(0.0f,vertex_crease_weight-1.0f);
+      dest.eval_start_index       = eval_start_index;
+      dest.eval_unique_identifier = eval_unique_identifier;
+
+      /* calculate face points (average of the four quad vertices, stored at the odd ring slots) */
+      Vertex_t S = Vertex_t(0.0f);
+      for (size_t i=0; i<face_valence; i++) 
+      {
+        size_t face_index = i + eval_start_index; if (face_index >= face_valence) face_index -= face_valence; assert(face_index < face_valence);
+        size_t index0 = 2*face_index+0; if (index0 >= edge_valence) index0 -= edge_valence; assert(index0 < edge_valence);
+        size_t index1 = 2*face_index+1; if (index1 >= edge_valence) index1 -= edge_valence; assert(index1 < edge_valence);
+        size_t index2 = 2*face_index+2; if (index2 >= edge_valence) index2 -= edge_valence; assert(index2 < edge_valence);
+        S += dest.ring[index1] = ((vtx + ring[index1]) + (ring[index0] + ring[index2])) * 0.25f;
+      }
+      
+      /* calculate new edge points (stored at the even ring slots) */
+      size_t num_creases = 0;
+      array_t<size_t,MAX_RING_FACE_VALENCE> crease_id; // face indices of the edges that take the crease path below
+
+      for (size_t i=0; i<face_valence; i++)
+      {
+        size_t face_index = i + eval_start_index;
+        if (face_index >= face_valence) face_index -= face_valence;
+        const float edge_crease = crease_weight[face_index];
+        dest.crease_weight[face_index] = max(edge_crease-1.0f,0.0f);
+      
+        size_t index      = 2*face_index;
+        size_t prev_index = face_index == 0 ? edge_valence-1 : 2*face_index-1;
+        size_t next_index = 2*face_index+1;
+
+        const Vertex_t v = vtx + ring[index]; // sum of the two edge end points
+        const Vertex_t f = dest.ring[prev_index] + dest.ring[next_index]; // sum of the two adjacent face points
+        S += ring[index];
+                
+        /* fast path for regular edge points */
+        if (likely(edge_crease <= 0.0f)) {
+          dest.ring[index] = (v+f) * 0.25f;
+        }
+        
+        /* slower path for hard edge rule */
+        else {
+          crease_id[num_creases++] = face_index;
+          dest.ring[index] = v*0.5f;
+	  
+          /* even slower path for blended edge rule */
+          if (unlikely(edge_crease < 1.0f)) {
+            dest.ring[index] = lerp((v+f)*0.25f,v*0.5f,edge_crease);
+          }
+        }
+      }
+      
+      /* compute new vertex using smooth rule */
+      const float inv_face_valence = 1.0f / (float)face_valence;
+      const Vertex_t v_smooth = (Vertex_t) madd(inv_face_valence,S,(float(face_valence)-2.0f)*vtx)*inv_face_valence;
+      dest.vtx = v_smooth;
+      
+      /* compute new vertex using vertex_crease_weight rule */
+      if (unlikely(vertex_crease_weight > 0.0f)) 
+      {
+        if (vertex_crease_weight >= 1.0f) {
+          dest.vtx = vtx;
+        } else {
+          dest.vtx = lerp(v_smooth,vtx,vertex_crease_weight);
+        }
+        return;
+      }
+      
+      /* no edge crease rule and dart rule */
+      if (likely(num_creases <= 1))
+        return;
+      
+      /* compute new vertex using crease rule */
+      if (likely(num_creases == 2)) 
+      {
+        /* update vertex using crease rule */
+        const size_t crease0 = crease_id[0], crease1 = crease_id[1];
+        const Vertex_t v_sharp = (Vertex_t)(ring[2*crease0] + 6.0f*vtx + ring[2*crease1]) * (1.0f / 8.0f);
+        dest.vtx = v_sharp;
+
+        /* update crease_weights using chaikin rule */
+        const float crease_weight0 = crease_weight[crease0], crease_weight1 = crease_weight[crease1];
+        dest.crease_weight[crease0] = max(0.25f*(3.0f*crease_weight0 + crease_weight1)-1.0f,0.0f);
+        dest.crease_weight[crease1] = max(0.25f*(3.0f*crease_weight1 + crease_weight0)-1.0f,0.0f);
+
+        /* interpolate between sharp and smooth rule */
+        const float v_blend = 0.5f*(crease_weight0+crease_weight1);
+        if (unlikely(v_blend < 1.0f)) {
+          dest.vtx = lerp(v_smooth,v_sharp,v_blend);
+        }
+      }
+      
+      /* compute new vertex using corner rule */
+      else {
+        dest.vtx = vtx;
+      }
+    }
+    /*! true for a border-free ring with exactly 4 faces, or a border ring with fewer than 4 faces */
+    __forceinline bool isRegular1() const 
+    {
+      if (border_index == -1) {
+	if (face_valence == 4) return true;
+      } else {
+	if (face_valence < 4) return true;
+      }
+      return false;
+    }
+    /*! number of entries of crease_weight[0..face_valence) that are greater than zero */
+    __forceinline size_t numEdgeCreases() const
+    {
+      ssize_t numCreases = 0;
+      for (size_t i=0; i<face_valence; i++) {
+        numCreases += crease_weight[i] > 0.0f;
+      }
+      return numCreases;
+    }
+    /*! bit flags combined in the value returned by type() */
+    enum Type {
+      TYPE_NONE            = 0,      //!< invalid type
+      TYPE_REGULAR         = 1,      //!< regular patch when ignoring creases
+      TYPE_REGULAR_CREASES = 2,      //!< regular patch when considering creases
+      TYPE_GREGORY         = 4,      //!< gregory patch when ignoring creases
+      TYPE_GREGORY_CREASES = 8,      //!< gregory patch when considering creases
+      TYPE_CREASES         = 16      //!< patch has crease features
+    };
+    /*! classifies this ring as a combination of Type flags from its valence, border and crease weights */
+    __forceinline Type type() const
+    {
+      /* check if there is an edge crease anywhere */      
+      const size_t numCreases = numEdgeCreases();
+      const bool noInnerCreases = hasBorder() ? numCreases == 2 : numCreases == 0;
+
+      Type crease_mask = (Type) (TYPE_REGULAR | TYPE_GREGORY);
+      if (noInnerCreases ) crease_mask = (Type) (crease_mask | TYPE_REGULAR_CREASES | TYPE_GREGORY_CREASES);
+      if (numCreases != 0) crease_mask = (Type) (crease_mask | TYPE_CREASES);
+
+      /* calculate if this vertex is regular */
+      bool hasBorder = border_index != -1; // local flag, hides the member function of the same name from here on
+      if (face_valence == 2 && hasBorder) {
+        if      (vertex_crease_weight == 0.0f      ) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES));
+        else if (vertex_crease_weight == float(inf)) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES));
+        else                                         return TYPE_CREASES;
+      }
+      else if (vertex_crease_weight != 0.0f)         return TYPE_CREASES;
+      else if (face_valence == 3 &&  hasBorder)      return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES));
+      else if (face_valence == 4 && !hasBorder)      return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES));
+      else                                           return (Type) (crease_mask & (TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES));
+    }
+    /*! true if vertex_level does not exceed res */
+    __forceinline bool isFinalResolution(float res) const {
+      return vertex_level <= res;
+    }
+
+    /* computes the limit vertex */
+    __forceinline Vertex getLimitVertex() const
+    {
+      /* return hard corner */ 
+      if (unlikely(std::isinf(vertex_crease_weight)))
+        return vtx;
+
+      /* border vertex rule */
+      if (unlikely(border_index != -1))
+      {
+	const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 0 : border_index+2;
+	return (4.0f * vtx + (ring[border_index] + ring[second_border_index])) * 1.0f/6.0f;
+      }
+      
+      Vertex_t F( 0.0f ); // sum of the odd ring slots
+      Vertex_t E( 0.0f ); // sum of the even ring slots
+      
+      assert(eval_start_index < face_valence);
+
+      for (size_t i=0; i<face_valence; i++) {
+        size_t index = i+eval_start_index;
+        if (index >= face_valence) index -= face_valence;
+        F += ring[2*index+1];
+        E += ring[2*index];
+      }
+
+      const float n = (float)face_valence;
+      return (Vertex_t)(n*n*vtx+4.0f*E+F) / ((n+5.0f)*n);      
+    }
+    
+    /* gets limit tangent in the direction of edge vtx -> ring[0] */
+    __forceinline Vertex getLimitTangent() const 
+    {
+      if (unlikely(std::isinf(vertex_crease_weight)))
+        return ring[0] - vtx;
+
+      /* border vertex rule */
+      if (unlikely(border_index != -1))
+      {	
+	if (border_index != (int)edge_valence-2 ) {
+	  return ring[0] - vtx; 
+	}
+	else
+	{
+	  const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 0 : border_index+2;
+	  return (ring[second_border_index] - ring[border_index]) * 0.5f;
+	}
+      }
+      
+      Vertex_t alpha( 0.0f );
+      Vertex_t beta ( 0.0f );
+      
+      const size_t n = face_valence;
+
+      assert(eval_start_index < face_valence);
+
+      Vertex_t q( 0.0f ); // NOTE(review): never read
+      for (size_t i=0; i<face_valence; i++)
+      {
+        size_t index = i+eval_start_index;
+        if (index >= face_valence) index -= face_valence;
+        const float a = CatmullClarkPrecomputedCoefficients::table.limittangent_a(index,n);
+        const float b = CatmullClarkPrecomputedCoefficients::table.limittangent_b(index,n);
+	alpha +=  a * ring[2*index];
+	beta  +=  b * ring[2*index+1];
+      }
+
+      const float sigma = CatmullClarkPrecomputedCoefficients::table.limittangent_c(n);
+      return sigma * (alpha + beta);
+    }
+    
+    /* gets limit tangent in the direction of edge vtx -> ring[edge_valence-2]; NOTE(review): the hard-corner and border paths below return ring[2] - vtx, confirm which direction is intended */
+    __forceinline Vertex getSecondLimitTangent() const 
+    {
+      if (unlikely(std::isinf(vertex_crease_weight)))
+        return ring[2] - vtx;
+ 
+      /* border vertex rule */
+      if (unlikely(border_index != -1))
+      {
+        if (border_index != 2) {
+          return ring[2] - vtx;
+        }
+        else {
+          const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 0 : border_index+2;
+          return (ring[border_index] - ring[second_border_index]) * 0.5f;
+        }
+      }
+      
+      Vertex_t alpha( 0.0f );
+      Vertex_t beta ( 0.0f );
+
+      const size_t n = face_valence;
+
+      assert(eval_start_index < face_valence);
+
+      for (size_t i=0; i<face_valence; i++)
+      {
+        size_t index = i+eval_start_index;
+        if (index >= face_valence) index -= face_valence;
+
+        size_t prev_index = index == 0 ? face_valence-1 : index-1; // need to be bit-wise exact in cosf eval
+        const float a = CatmullClarkPrecomputedCoefficients::table.limittangent_a(prev_index,n);
+        const float b = CatmullClarkPrecomputedCoefficients::table.limittangent_b(prev_index,n);
+	alpha += a * ring[2*index];
+	beta  += b * ring[2*index+1];
+      }
+
+      const float sigma = CatmullClarkPrecomputedCoefficients::table.limittangent_c(n);
+      return sigma* (alpha + beta);      
+    }
+
+    /* gets surface normal as cross product of the first and second limit tangent */
+    const Vertex getNormal() const  {
+      return cross(getLimitTangent(),getSecondLimitTangent());
+    }
+    
+    /* returns center of the n-th quad in the 1-ring (average of vtx and the three ring vertices of that quad) */
+    __forceinline Vertex getQuadCenter(const size_t index) const
+    {
+      const Vertex_t &p0 = vtx;
+      const Vertex_t &p1 = ring[2*index+0];
+      const Vertex_t &p2 = ring[2*index+1];
+      const Vertex_t &p3 = index == face_valence-1 ? ring[0] : ring[2*index+2]; // the last quad wraps around to ring[0]
+      const Vertex p = (p0+p1+p2+p3) * 0.25f;
+      return p;
+    }
+    
+    /* returns center of the n-th edge in the 1-ring (midpoint of vtx and ring[2*index]) */
+    __forceinline Vertex getEdgeCenter(const size_t index) const {
+      return (vtx + ring[index*2]) * 0.5f;
+    }
+    /*! true if isvalid() holds for every ring vertex (the center vertex is not checked) */
+    bool hasValidPositions() const
+    {
+      for (size_t i=0; i<edge_valence; i++) {
+        if (!isvalid(ring[i]))
+          return false;
+      }	
+      return true;
+    }
+    /*! debug print; NOTE(review): the loop bound is min(edge_valence, MAX_RING_FACE_VALENCE) although it walks ring vertices */
+    friend __forceinline embree_ostream operator<<(embree_ostream o, const CatmullClark1RingT &c)
+    {
+      o << "vtx " << c.vtx << " size = " << c.edge_valence << ", " << 
+	"hard_edge = " << c.border_index << ", face_valence " << c.face_valence << 
+	", edge_level = " << c.edge_level << ", vertex_level = " << c.vertex_level << ", eval_start_index: " << c.eval_start_index << ", ring: " << embree_endl;
+      
+      for (unsigned int i=0; i<min(c.edge_valence,(unsigned int)MAX_RING_FACE_VALENCE); i++) {
+        o << i << " -> " << c.ring[i];
+        if (i % 2 == 0) o << " crease = " << c.crease_weight[i/2];
+        o << embree_endl;
+      }
+      return o;
+    } 
+  };
+
+  typedef CatmullClark1RingT<Vec3fa,Vec3fa_t> CatmullClark1Ring3fa; // ring instantiated with Vertex = Vec3fa and Vertex_t = Vec3fa_t
+  
+  template<typename Vertex, typename Vertex_t = Vertex>
+    struct __aligned(64) GeneralCatmullClark1RingT
+  {
+    ALIGNED_STRUCT_(64);
+    /* 1-ring of a vertex whose adjacent faces may have any number of vertices; subdivide()/convert() produce the quad-only CatmullClark1Ring */
+    typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring;
+    /* per-face record of the ring */
+    struct Face 
+    {
+      __forceinline Face() {}
+      __forceinline Face (int size, float crease_weight)
+        : size(size), crease_weight(crease_weight) {}
+
+      // FIXME: add member that returns total number of vertices
+
+      int size;              // number of vertices-2 of nth face in ring
+      float crease_weight;   // crease weight of the edge that starts this face (taken from the half edge in init)
+    };
+
+    Vertex vtx;                              //!< center vertex
+    DynamicStackArray<Vertex,32,MAX_RING_EDGE_VALENCE> ring; //!< ring vertices, faces[f].size consecutive entries per face
+    DynamicStackArray<Face,16,MAX_RING_FACE_VALENCE> faces;  //!< one Face record per adjacent face
+    unsigned int face_valence;               //!< number of entries used in faces
+    unsigned int edge_valence;               //!< number of entries used in ring
+    int border_face;                         //!< index of the dummy face inserted at a border, -1 if there is none
+    float vertex_crease_weight;
+    float vertex_level;                      //!< maximum level of adjacent edges
+    float edge_level;                        // level of first edge
+    bool only_quads;                         // true if all faces are quads
+    unsigned int eval_start_face_index;      //!< face at which evaluation loops start
+    unsigned int eval_start_vertex_index;    //!< ring slot that belongs to eval_start_face_index
+    unsigned int eval_unique_identifier;     //!< smallest vertex index found while building the ring
+
+  public:
+    GeneralCatmullClark1RingT() 
+      : eval_start_face_index(0), eval_start_vertex_index(0), eval_unique_identifier(0) {}
+    /*! true for a border-free ring with exactly 4 faces */
+    __forceinline bool isRegular() const 
+    {
+      if (border_face == -1 && face_valence == 4) return true;
+      return false;
+    }
+    /*! false only if the dummy border face is the last face of the ring */
+    __forceinline bool has_last_face() const {
+      return border_face != (int)face_valence-1;
+    }
+    /*! true if there is no border or the dummy border face has index 2 or higher */
+    __forceinline bool has_second_face() const {
+      return (border_face == -1) || (border_face >= 2);
+    }
+    /*! true if isvalid() holds for every ring vertex (the center vertex is not checked) */
+    bool hasValidPositions() const
+    {
+      for (size_t i=0; i<edge_valence; i++) {
+        if (!isvalid(ring[i]))
+          return false;
+      }	
+      return true;
+    }
+    /*! initializes the ring from the half edge structure; vertex k is loaded from vertices+k*stride */
+    __forceinline void init(const HalfEdge* const h, const char* vertices, size_t stride)
+    {
+      only_quads = true;
+      border_face = -1;
+      vtx = Vertex_t::loadu(vertices+h->getStartVertexIndex()*stride);
+      vertex_crease_weight = h->vertex_crease_weight;
+      HalfEdge* p = (HalfEdge*) h; // const is cast away, p is only used to walk the half edges
+      
+      unsigned int e=0, f=0; // next free slot in ring[] and faces[]
+      unsigned min_vertex_index = (unsigned)-1;
+      unsigned min_vertex_index_face = (unsigned)-1;
+      unsigned min_vertex_index_vertex = (unsigned)-1;
+      edge_level = p->edge_level;
+      vertex_level = 0.0f;
+      do 
+      {
+        HalfEdge* p_prev = p->prev();
+        HalfEdge* p_next = p->next();
+        const float crease_weight = p->edge_crease_weight;
+         assert(p->hasOpposite() || p->edge_crease_weight == float(inf));
+        vertex_level = max(vertex_level,p->edge_level);
+
+        /* find minimum start vertex */
+        unsigned vertex_index = p_next->getStartVertexIndex();
+        if (vertex_index < min_vertex_index) { min_vertex_index = vertex_index; min_vertex_index_face = f; min_vertex_index_vertex = e; }
+
+	/* store first N-2 vertices of face */
+	unsigned int vn = 0;
+        for (p = p_next; p!=p_prev; p=p->next()) {
+          ring[e++] = Vertex_t::loadu(vertices+p->getStartVertexIndex()*stride);
+          vn++;
+	}
+	faces[f++] = Face(vn,crease_weight);
+	only_quads &= (vn == 2);
+	
+        /* continue with next face */
+        if (likely(p->hasOpposite())) 
+          p = p->opposite();
+        
+        /* if there is no opposite go the long way to the other side of the border */
+        else
+        {
+          /* find minimum start vertex */
+          unsigned vertex_index = p->getStartVertexIndex();
+          if (vertex_index < min_vertex_index) { min_vertex_index = vertex_index; min_vertex_index_face = f; min_vertex_index_vertex = e; }
+
+          /*! mark first border edge and store dummy vertex for face between the two border edges */
+          border_face = f;
+	  faces[f++] = Face(2,inf); 
+          ring[e++] = Vertex_t::loadu(vertices+p->getStartVertexIndex()*stride);
+          ring[e++] = vtx; // dummy vertex
+	  
+          /*! goto other side of border */
+          p = (HalfEdge*) h;
+          while (p->hasOpposite()) 
+            p = p->opposite()->next();
+        }
+	
+      } while (p != h); 
+      
+      edge_valence = e;
+      face_valence = f;
+      eval_unique_identifier = min_vertex_index;
+      eval_start_face_index = min_vertex_index_face;
+      eval_start_vertex_index = min_vertex_index_vertex;
+
+      assert( hasValidPositions() );
+    }
+    /*! performs one subdivision step; dest always has two ring vertices per face, whatever the face sizes of this ring are */
+    __forceinline void subdivide(CatmullClark1Ring& dest) const
+    {
+      dest.edge_level = 0.5f*edge_level;
+      dest.vertex_level = 0.5f*vertex_level;
+      dest.face_valence = face_valence;
+      dest.edge_valence = 2*face_valence;
+      dest.border_index = border_face == -1 ? -1 : 2*border_face; // FIXME:
+      dest.vertex_crease_weight    = max(0.0f,vertex_crease_weight-1.0f);
+      dest.eval_start_index        = eval_start_face_index;
+      dest.eval_unique_identifier  = eval_unique_identifier;
+      assert(dest.face_valence <= MAX_RING_FACE_VALENCE);
+
+      /* calculate face points (average of all vertices of the face, stored at the odd ring slots) */
+      Vertex_t S = Vertex_t(0.0f);
+      for (size_t face=0, v=eval_start_vertex_index; face<face_valence; face++) {
+        size_t f = (face + eval_start_face_index)%face_valence;
+
+        Vertex_t F = vtx;
+        for (size_t k=v; k<=v+faces[f].size; k++) F += ring[k%edge_valence]; // FIXME: optimize
+        S += dest.ring[2*f+1] = F/float(faces[f].size+2);
+        v+=faces[f].size;
+        v%=edge_valence;
+      }
+      
+      /* calculate new edge points (stored at the even ring slots) */
+      size_t num_creases = 0;
+      array_t<size_t,MAX_RING_FACE_VALENCE> crease_id; // face indices of the edges that take the crease path below
+      Vertex_t C = Vertex_t(0.0f); // sum of the ring vertices at the far end of those edges
+      for (size_t face=0, j=eval_start_vertex_index; face<face_valence; face++)
+      {
+        size_t i = (face + eval_start_face_index)%face_valence;
+        
+        const Vertex_t v = vtx + ring[j]; // sum of the two edge end points
+        Vertex_t f = dest.ring[2*i+1]; // sum of the two face points adjacent to the edge
+        if (i == 0) f += dest.ring[dest.edge_valence-1]; 
+        else        f += dest.ring[2*i-1];
+        S += ring[j];
+        dest.crease_weight[i] = max(faces[i].crease_weight-1.0f,0.0f);
+        
+        /* fast path for regular edge points */
+        if (likely(faces[i].crease_weight <= 0.0f)) {
+          dest.ring[2*i] = (v+f) * 0.25f;
+        }
+        
+        /* slower path for hard edge rule */
+        else {
+          C += ring[j]; crease_id[num_creases++] = i;
+          dest.ring[2*i] = v*0.5f;
+	  
+          /* even slower path for blended edge rule */
+          if (unlikely(faces[i].crease_weight < 1.0f)) {
+            dest.ring[2*i] = lerp((v+f)*0.25f,v*0.5f,faces[i].crease_weight);
+          }
+        }
+        j+=faces[i].size;
+        j%=edge_valence;
+      }
+      
+      /* compute new vertex using smooth rule */
+      const float inv_face_valence = 1.0f / (float)face_valence;
+      const Vertex_t v_smooth = (Vertex_t) madd(inv_face_valence,S,(float(face_valence)-2.0f)*vtx)*inv_face_valence;
+      dest.vtx = v_smooth;
+      
+      /* compute new vertex using vertex_crease_weight rule */
+      if (unlikely(vertex_crease_weight > 0.0f)) 
+      {
+        if (vertex_crease_weight >= 1.0f) {
+          dest.vtx = vtx;
+        } else {
+          dest.vtx = lerp(v_smooth,vtx,vertex_crease_weight); // weight 0 -> smooth, weight 1 -> sharp, continuous with both neighbouring branches
+        }
+        return;
+      }
+      /* zero or one creased edge: the smooth vertex is kept */
+      if (likely(num_creases <= 1))
+        return;
+      
+      /* compute new vertex using crease rule */
+      if (likely(num_creases == 2)) {
+        const Vertex_t v_sharp = (Vertex_t)(C + 6.0f * vtx) * (1.0f / 8.0f);
+        const float crease_weight0 = faces[crease_id[0]].crease_weight;
+        const float crease_weight1 = faces[crease_id[1]].crease_weight;
+        dest.vtx = v_sharp;
+        dest.crease_weight[crease_id[0]] = max(0.25f*(3.0f*crease_weight0 + crease_weight1)-1.0f,0.0f);
+        dest.crease_weight[crease_id[1]] = max(0.25f*(3.0f*crease_weight1 + crease_weight0)-1.0f,0.0f);
+        const float v_blend = 0.5f*(crease_weight0+crease_weight1);
+        if (unlikely(v_blend < 1.0f)) {
+          dest.vtx = lerp(v_smooth,v_sharp,v_blend); // blend towards v_sharp as the mean crease weight approaches 1
+        }
+      }
+      
+      /* compute new vertex using corner rule */
+      else {
+        dest.vtx = vtx;
+      }
+    }
+    /*! copies this ring into dst without subdividing; the callers in this class use it only when only_quads is set */
+    void convert(CatmullClark1Ring& dst) const
+    {
+      dst.edge_level = edge_level;
+      dst.vertex_level = vertex_level;
+      dst.vtx = vtx;
+      dst.face_valence = face_valence;
+      dst.edge_valence = 2*face_valence;
+      dst.border_index = border_face == -1 ? -1 : 2*border_face;
+      for (size_t i=0; i<face_valence; i++) 
+	dst.crease_weight[i] = faces[i].crease_weight;
+      dst.vertex_crease_weight = vertex_crease_weight;
+      for (size_t i=0; i<edge_valence; i++) dst.ring[i] = ring[i];
+
+      dst.eval_start_index = eval_start_face_index;
+      dst.eval_unique_identifier = eval_unique_identifier;
+
+      assert( dst.hasValidPositions() );
+    }
+
+
+    /* gets limit tangent in the direction of edge vtx -> ring[0] */
+    __forceinline Vertex getLimitTangent() const 
+    {
+      CatmullClark1Ring cc_vtx;
+     
+      /* fast path for quad only rings */
+      if (only_quads)
+      {
+        convert(cc_vtx);
+        return cc_vtx.getLimitTangent();
+      }
+      
+      subdivide(cc_vtx);
+      return 2.0f * cc_vtx.getLimitTangent(); // tangent of the once-subdivided ring, scaled by 2
+    }
+
+    /* gets limit tangent in the direction of edge vtx -> ring[edge_valence-2] */
+    __forceinline Vertex getSecondLimitTangent() const 
+    {
+      CatmullClark1Ring cc_vtx;
+     
+      /* fast path for quad only rings */
+      if (only_quads)
+      {
+        convert(cc_vtx);
+        return cc_vtx.getSecondLimitTangent();
+      }
+      
+      subdivide(cc_vtx);
+      return 2.0f * cc_vtx.getSecondLimitTangent(); // tangent of the once-subdivided ring, scaled by 2
+    }
+
+
+    /* gets limit vertex */
+    __forceinline Vertex getLimitVertex() const 
+    {
+      CatmullClark1Ring cc_vtx;
+     
+      /* fast path for quad only rings */
+      if (only_quads)
+        convert(cc_vtx);
+      else 
+        subdivide(cc_vtx);
+      return cc_vtx.getLimitVertex();
+    }
+    /*! debug print: ring vertices grouped by face, the crease weight is printed next to the first vertex of each face */
+    friend __forceinline embree_ostream operator<<(embree_ostream o, const GeneralCatmullClark1RingT &c)
+    {
+      o << "vtx " << c.vtx << " size = " << c.edge_valence << ", border_face = " << c.border_face << ", " << " face_valence = " << c.face_valence << 
+	", edge_level = " << c.edge_level << ", vertex_level = " << c.vertex_level << ", ring: " << embree_endl;
+      for (size_t v=0, f=0; f<c.face_valence; v+=c.faces[f++].size) {
+        for (size_t i=v; i<v+c.faces[f].size; i++) {
+          o << i << " -> " << c.ring[i];
+          if (i == v) o << " crease = " << c.faces[f].crease_weight;
+          o << embree_endl;
+        }
+      }
+      return o;
+    } 
+  };  
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/catmullrom_curve.h b/thirdparty/embree-aarch64/kernels/subdiv/catmullrom_curve.h
new file mode 100644
index 0000000000..b244af481c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/catmullrom_curve.h
@@ -0,0 +1,296 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/scene_curves.h"
+
+/*
+
+  Implements Catmull-Rom curves with control points p0, p1, p2, p3. At
+  t=0 the curve goes through p1, with tangent (p2-p0)/2, and for t=1
+  the curve goes through p2 with tangent (p3-p1)/2.
+
+ */
+
+namespace embree
+{
+  class CatmullRomBasis // weights of the four control points for position, first and second derivative
+  {
+  public:
+
+    template<typename T>
+      static __forceinline Vec4<T> eval(const T& u)  // position weights (n0..n3) for the control points at parameter u
+    {
+      const T t  = u;
+      const T s  = T(1.0f) - u;
+      const T n0 = - t * s * s;
+      const T n1 = 2.0f + t * t * (3.0f * t - 5.0f);
+      const T n2 = 2.0f + s * s * (3.0f * s - 5.0f);
+      const T n3 = - s * t * t;
+      return T(0.5f) * Vec4<T>(n0, n1, n2, n3); // common factor 1/2; yields (0,1,0,0) at u=0 and (0,0,1,0) at u=1
+    }
+    
+    template<typename T>
+      static __forceinline Vec4<T>  derivative(const T& u) // d/du of the eval() weights (note ds/du = -1)
+    {
+      const T t  =  u;
+      const T s  =  1.0f - u;
+      const T n0 =  - s * s + 2.0f * s * t;
+      const T n1 =  2.0f * t * (3.0f * t - 5.0f) + 3.0f * t * t;
+      const T n2 =  2.0f * s * (3.0f * t + 2.0f) - 3.0f * s * s;
+      const T n3 = -2.0f * s * t + t * t;
+      return T(0.5f) * Vec4<T>(n0, n1, n2, n3); // yields (-1/2,0,1/2,0) at u=0 and (0,-1/2,0,1/2) at u=1
+    }
+
+    template<typename T>
+      static __forceinline Vec4<T>  derivative2(const T& u) // second derivative of the eval() weights
+    {
+      const T t  =  u;
+      const T n0 = -3.0f * t + 2.0f;
+      const T n1 =  9.0f * t - 5.0f;
+      const T n2 = -9.0f * t + 4.0f;
+      const T n3 =  3.0f * t - 1.0f;
+      return Vec4<T>(n0, n1, n2, n3); // no extra 0.5 here: the factor 1/2 is already folded into the coefficients above
+    }
+  };
+  
+  struct PrecomputedCatmullRomBasis
+  {
+    enum { N = 16 }; // tables are (N+1)x(N+1); CatmullRomCurveT::eval0 etc. assert size <= N
+  public:
+    PrecomputedCatmullRomBasis() {} // leaves all tables uninitialized
+    PrecomputedCatmullRomBasis(int shift); // defined outside this header; the meaning of shift is not visible here
+
+    /* precomputed weights for curve evaluation, indexed [size][ofs]; c0..c3 weight v0..v3 */
+  public:
+    float c0[N+1][N+1];
+    float c1[N+1][N+1];
+    float c2[N+1][N+1];
+    float c3[N+1][N+1];
+    
+    /* precomputed weights for curve derivative evaluation, indexed [size][ofs]; d0..d3 weight v0..v3 */
+  public:
+    float d0[N+1][N+1];
+    float d1[N+1][N+1];
+    float d2[N+1][N+1];
+    float d3[N+1][N+1];
+  };
+  extern PrecomputedCatmullRomBasis catmullrom_basis0;
+  extern PrecomputedCatmullRomBasis catmullrom_basis1;
+
+  template<typename Vertex>
+    struct CatmullRomCurveT
+    {
+      Vertex v0,v1,v2,v3;
+      
+      __forceinline CatmullRomCurveT() {} // default constructor: performs no explicit initialization of v0..v3
+      
+      __forceinline CatmullRomCurveT(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3) // builds the curve from its four control points
+        : v0(v0), v1(v1), v2(v2), v3(v3) {} // parameters shadow the members; each member is initialized from the same-named argument
+
+      __forceinline Vertex begin() const { // curve point at t=0
+        return v1; // Catmull-Rom interpolates v1 at t=0 (CatmullRomBasis::eval(0) = (0,1,0,0)); the former (v0+4*v1+v2)/6 is the B-spline start point and is not on this curve
+      }
+
+      __forceinline Vertex end() const { // curve point at t=1; accurateFlatBounds(4) relies on this being the last sample of the curve
+        return v2; // Catmull-Rom interpolates v2 at t=1 (CatmullRomBasis::eval(1) = (0,0,1,0)); the former (v1+4*v2+v3)/6 is the B-spline end point and is not on this curve
+      }
+
+      __forceinline Vertex center() const { // arithmetic mean of the four control points (not a point evaluated on the curve)
+        return 0.25f*(v0+v1+v2+v3);
+      }
+
+      __forceinline BBox<Vertex> bounds() const { // box enclosing the four control points
+        return merge(BBox<Vertex>(v0),BBox<Vertex>(v1),BBox<Vertex>(v2),BBox<Vertex>(v3)); // NOTE(review): control-point box only; whether it encloses the whole evaluated curve is not established here
+      }
+
+      __forceinline friend CatmullRomCurveT operator -( const CatmullRomCurveT& a, const Vertex& b ) { // translates the curve: subtracts b from every control point
+        return CatmullRomCurveT(a.v0-b,a.v1-b,a.v2-b,a.v3-b);
+      }
+
+      __forceinline CatmullRomCurveT<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const // moves the curve into the frame (space, origin p), keeping each point's w
+      {
+        const Vec3ff q0(xfmVector(space,v0-p), v0.w); // position: offset by -p, then xfmVector with space; w is copied unchanged
+        const Vec3ff q1(xfmVector(space,v1-p), v1.w);
+        const Vec3ff q2(xfmVector(space,v2-p), v2.w);
+        const Vec3ff q3(xfmVector(space,v3-p), v3.w);
+        return CatmullRomCurveT<Vec3ff>(q0,q1,q2,q3);
+      }
+      
+      __forceinline Vertex eval(const float t) const  // curve position at parameter t
+      {
+        const Vec4<float> b = CatmullRomBasis::eval(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); // weighted sum b.x*v0 + b.y*v1 + b.z*v2 + b.w*v3
+      }
+      
+      __forceinline Vertex eval_du(const float t) const // first derivative of the curve with respect to t
+      {
+        const Vec4<float> b = CatmullRomBasis::derivative(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); // same weighted sum as eval(), using the derivative weights
+      }
+      
+      __forceinline Vertex eval_dudu(const float t) const  // second derivative of the curve with respect to t
+      {
+        const Vec4<float> b = CatmullRomBasis::derivative2(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); // same weighted sum as eval(), using the second-derivative weights
+      }
+      
+      __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const // writes position, first and second derivative at t into p, dp, ddp
+      {
+        p = eval(t);
+        dp = eval_du(t);
+        ddp = eval_dudu(t);
+      }
+
+      template<int M>
+      __forceinline Vec4vf<M> veval(const vfloat<M>& t) const  // M-wide variant of eval(): positions for M parameter values at once
+      {
+        const Vec4vf<M> b = CatmullRomBasis::eval(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); // each control point is converted to Vec4vf<M> and weighted per lane
+      }
+
+      template<int M>
+      __forceinline Vec4vf<M> veval_du(const vfloat<M>& t) const  // M-wide variant of eval_du(): first derivatives for M parameter values
+      {
+        const Vec4vf<M> b = CatmullRomBasis::derivative(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); // same weighted sum as veval(), using the derivative weights
+      }
+
+      template<int M>
+      __forceinline Vec4vf<M> veval_dudu(const vfloat<M>& t) const  // M-wide variant of eval_dudu(): second derivatives for M parameter values
+      {
+        const Vec4vf<M> b = CatmullRomBasis::derivative2(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); // same weighted sum as veval(), using the second-derivative weights
+      }
+
+      template<int M>
+      __forceinline void veval(const vfloat<M>& t, Vec4vf<M>& p, Vec4vf<M>& dp) const // writes M-wide position and first derivative at t into p and dp
+      {
+        p = veval(t);
+        dp = veval_du(t);
+      }
+      
+      template<int M>
+      __forceinline Vec4vf<M> eval0(const int ofs, const int size) const // M curve points using precomputed weights from catmullrom_basis0
+      {
+        assert(size <= PrecomputedCatmullRomBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&catmullrom_basis0.c0[size][ofs]), Vec4vf<M>(v0), // loadu reads M consecutive weights starting at column ofs of row size
+                    madd(vfloat<M>::loadu(&catmullrom_basis0.c1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&catmullrom_basis0.c2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&catmullrom_basis0.c3[size][ofs]) * Vec4vf<M>(v3)))); // c0..c3 weight v0..v3 respectively
+      }
+      
+      template<int M>
+      __forceinline Vec4vf<M> eval1(const int ofs, const int size) const // same as eval0() but reads the second table, catmullrom_basis1
+      {
+        assert(size <= PrecomputedCatmullRomBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&catmullrom_basis1.c0[size][ofs]), Vec4vf<M>(v0), 
+                    madd(vfloat<M>::loadu(&catmullrom_basis1.c1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&catmullrom_basis1.c2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&catmullrom_basis1.c3[size][ofs]) * Vec4vf<M>(v3)))); // NOTE(review): how basis1 differs from basis0 depends on the constructor's shift argument, not visible here
+      }
+      
+      template<int M>
+      __forceinline Vec4vf<M> derivative0(const int ofs, const int size) const // M curve derivatives using precomputed d0..d3 weights from catmullrom_basis0
+      {
+        assert(size <= PrecomputedCatmullRomBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&catmullrom_basis0.d0[size][ofs]), Vec4vf<M>(v0), // loadu reads M consecutive weights starting at column ofs of row size
+                    madd(vfloat<M>::loadu(&catmullrom_basis0.d1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&catmullrom_basis0.d2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&catmullrom_basis0.d3[size][ofs]) * Vec4vf<M>(v3)))); // d0..d3 weight v0..v3 respectively
+      }
+      
+      template<int M>
+      __forceinline Vec4vf<M> derivative1(const int ofs, const int size) const // same as derivative0() but reads the second table, catmullrom_basis1
+      {
+        assert(size <= PrecomputedCatmullRomBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&catmullrom_basis1.d0[size][ofs]), Vec4vf<M>(v0),
+                    madd(vfloat<M>::loadu(&catmullrom_basis1.d1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&catmullrom_basis1.d2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&catmullrom_basis1.d3[size][ofs]) * Vec4vf<M>(v3)))); // NOTE(review): how basis1 differs from basis0 depends on the constructor's shift argument, not visible here
+      }
+      
+      /* calculates bounds of catmull-rom curve geometry; the box is enlarged by the largest |w| found */
+      __forceinline BBox3fa accurateRoundBounds() const
+      {
+        const int N = 7; // the curve is sampled at indices 0..N of table row N
+        const float scale = 1.0f/(3.0f*(N-1)); // step length used to extrapolate along the derivative at each sample
+        Vec4vfx pl(pos_inf), pu(neg_inf); // running per-lane minimum / maximum
+        for (int i=0; i<=N; i+=VSIZEX)
+        {
+          vintx vi = vintx(i)+vintx(step); // sample index handled by each lane
+          vboolx valid = vi <= vintx(N); // lanes past the last sample are masked out
+          const Vec4vfx p  = eval0<VSIZEX>(i,N);
+          const Vec4vfx dp = derivative0<VSIZEX>(i,N);
+          const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero)); // backward extrapolation, disabled at the first sample
+          const Vec4vfx pp = p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero)); // forward extrapolation, disabled at the last sample
+          pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min
+          pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked max
+        }
+        const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z));
+        const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z));
+        const float r_min = reduce_min(pl.w);
+        const float r_max = reduce_max(pu.w);
+        const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max))); // largest absolute w over all samples and extrapolated points
+        return enlarge(BBox3fa(lower,upper),upper_r);
+      }
+      
+      /* calculates bounds when tessellated into N line segments; the box is enlarged by the largest |w| of the samples */
+      __forceinline BBox3fa accurateFlatBounds(int N) const
+      {
+        if (likely(N == 4))
+        {
+          const Vec4vf4 pi = eval0<4>(0,4); // the four samples at table columns 0..3 of row 4
+          const Vec3fa lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z));
+          const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z));
+          const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w)));
+          const Vec3ff pe = end(); // fifth sample: relies on end() returning the curve point at t=1
+          return enlarge(BBox3fa(min(lower,pe),max(upper,pe)),max(upper_r,Vec3fa(abs(pe.w))));
+        } 
+        else
+        {
+          Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f); // running per-lane min / max of xyz and max of |w|
+          for (int i=0; i<=N; i+=VSIZEX)
+          {
+            vboolx valid = vintx(i)+vintx(step) <= vintx(N); // lanes whose sample index exceeds N are masked out
+            const Vec4vfx pi = eval0<VSIZEX>(i,N);
+            
+            pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min
+            pl.y = select(valid,min(pl.y,pi.y),pl.y); 
+            pl.z = select(valid,min(pl.z,pi.z),pl.z); 
+            
+            pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: use masked max
+            pu.y = select(valid,max(pu.y,pi.y),pu.y); 
+            pu.z = select(valid,max(pu.z,pi.z),pu.z); 
+            
+            ru = select(valid,max(ru,abs(pi.w)),ru); 
+          }
+          const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z));
+          const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z));
+          const Vec3fa upper_r(reduce_max(ru));
+          return enlarge(BBox3fa(lower,upper),upper_r);
+        }
+      }
+      
+      friend __forceinline embree_ostream operator<<(embree_ostream cout, const CatmullRomCurveT& curve) { // prints the four control points on one line (the parameter named cout shadows any outer cout)
+        return cout << "CatmullRomCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }";
+      }
+    };
+
+  __forceinline CatmullRomCurveT<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const CatmullRomCurveT<Vec3ff>& curve) // curve overload: forwards each control point to the per-vertex enlargeRadiusToMinWidth
+  {
+    return CatmullRomCurveT<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0), // the per-vertex overload is declared outside this header chunk
+                                    enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1),
+                                    enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2),
+                                    enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3));
+  }
+  
+  typedef CatmullRomCurveT<Vec3fa> CatmullRomCurve3fa;
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval.h b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval.h
new file mode 100644
index 0000000000..23f24c360c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval.h
@@ -0,0 +1,226 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "patch.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename Vertex, typename Vertex_t = Vertex>
+      struct FeatureAdaptiveEval
+      {
+      public:
+        
+        typedef PatchT<Vertex,Vertex_t> Patch;
+        typedef typename Patch::Ref Ref;
+        typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch;
+        typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing;
+        typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+        typedef BSplinePatchT<Vertex,Vertex_t> BSplinePatch;
+        typedef BezierPatchT<Vertex,Vertex_t> BezierPatch;
+        typedef GregoryPatchT<Vertex,Vertex_t> GregoryPatch;
+        typedef BilinearPatchT<Vertex,Vertex_t> BilinearPatch;
+        typedef BezierCurveT<Vertex> BezierCurve;
+        
+      public:
+        
+        FeatureAdaptiveEval (const HalfEdge* edge, const char* vertices, size_t stride, const float u, const float v, // evaluates the patch of edge at (u,v); all work happens in the constructor
+                             Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv) // output pointers; dPdu/dPdv are null-checked in eval_general_quad
+        : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv)
+        {
+          switch (edge->patch_type) { // known patch types are evaluated directly with derivative scale 1.0f
+          case HalfEdge::BILINEAR_PATCH: BilinearPatch(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break;
+          case HalfEdge::REGULAR_QUAD_PATCH: RegularPatchT(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break;
+#if PATCH_USE_GREGORY == 2
+          case HalfEdge::IRREGULAR_QUAD_PATCH: GregoryPatch(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break;
+#endif
+          default: { // everything else goes through the general feature-adaptive path, starting at depth 0
+            GeneralCatmullClarkPatch patch(edge,vertices,stride);
+            eval(patch,Vec2f(u,v),0);
+            break;
+          }
+          }
+        }
+
+        FeatureAdaptiveEval (CatmullClarkPatch& patch, const float u, const float v, float dscale, size_t depth, // evaluates an already built quad patch; patch is taken by non-const reference and may be overwritten by eval()
+                             Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv)
+        : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv)
+        {
+          eval(patch,Vec2f(u,v),dscale,depth); // caller supplies the starting derivative scale and subdivision depth
+        }
+        
+        void eval_general_quad(const GeneralCatmullClarkPatch& patch, array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE>& patches, const Vec2f& uv, size_t depth) // picks the sub-patch containing uv, evaluates it, and maps dPdu/dPdv back
+        {
+          float u = uv.x, v = uv.y;
+          if (v < 0.5f) {
+            if (u < 0.5f) { // sub-patch 0: local coordinates are (2u, 2v)
+#if PATCH_USE_GREGORY == 2
+              BezierCurve borders[2]; patch.getLimitBorder(borders,0);
+              BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+              BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r);
+              eval(patches[0],Vec2f(2.0f*u,2.0f*v),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r);
+#else
+              eval(patches[0],Vec2f(2.0f*u,2.0f*v),2.0f,depth+1);
+#endif
+              if (dPdu && dPdv) {
+                const Vertex dpdx = *dPdu, dpdy = *dPdv;
+                *dPdu = dpdx; *dPdv = dpdy; // same orientation as the parent: the values are written back unchanged
+              }
+            }
+            else { // sub-patch 1: local coordinates are (2v, 2-2u)
+#if PATCH_USE_GREGORY == 2
+              BezierCurve borders[2]; patch.getLimitBorder(borders,1);
+              BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+              BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r);
+              eval(patches[1],Vec2f(2.0f*v,2.0f-2.0f*u),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r);
+#else
+              eval(patches[1],Vec2f(2.0f*v,2.0f-2.0f*u),2.0f,depth+1);
+#endif
+              if (dPdu && dPdv) {
+                const Vertex dpdx = *dPdu, dpdy = *dPdv;
+                *dPdu = -dpdy; *dPdv = dpdx; // local x follows parent v, local y follows parent -u
+              }
+            }
+          } else {
+            if (u > 0.5f) { // sub-patch 2: local coordinates are (2-2u, 2-2v)
+#if PATCH_USE_GREGORY == 2
+              BezierCurve borders[2]; patch.getLimitBorder(borders,2);
+              BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+              BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r);
+              eval(patches[2],Vec2f(2.0f-2.0f*u,2.0f-2.0f*v),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r);
+#else
+              eval(patches[2],Vec2f(2.0f-2.0f*u,2.0f-2.0f*v),2.0f,depth+1);
+#endif
+              if (dPdu && dPdv) {
+                const Vertex dpdx = *dPdu, dpdy = *dPdv;
+                *dPdu = -dpdx; *dPdv = -dpdy; // both local axes run opposite to the parent's
+              }
+            }
+            else { // sub-patch 3: local coordinates are (2-2v, 2u); also taken when u == 0.5
+#if PATCH_USE_GREGORY == 2
+              BezierCurve borders[2]; patch.getLimitBorder(borders,3);
+              BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+              BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r);
+              eval(patches[3],Vec2f(2.0f-2.0f*v,2.0f*u),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r);
+#else
+              eval(patches[3],Vec2f(2.0f-2.0f*v,2.0f*u),2.0f,depth+1);
+#endif
+              if (dPdu && dPdv) {
+                const Vertex dpdx = *dPdu, dpdy = *dPdv;
+                *dPdu = dpdy; *dPdv = -dpdx; // local x follows parent -v, local y follows parent u
+              }
+            }
+          }
+        }
+
+        __forceinline bool final(const CatmullClarkPatch& patch, const typename CatmullClarkRing::Type type, size_t depth)  // true once depth reaches the maximum evaluation depth for this patch type
+        {
+          const int max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR; // crease patches use their own depth limit
+//#if PATCH_MIN_RESOLUTION
+//          return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=(size_t)max_eval_depth;
+//#else
+          return depth>=(size_t)max_eval_depth; // patch is unused while the resolution test above stays commented out
+//#endif
+        }
+        
+        void eval(CatmullClarkPatch& patch, Vec2f uv, float dscale, size_t depth, // iteratively descends into the sub-patch containing uv until a directly evaluable patch is reached
+                  BezierCurve* border0 = nullptr, BezierCurve* border1 = nullptr, BezierCurve* border2 = nullptr, BezierCurve* border3 = nullptr)
+        {
+          while (true) // every iteration either evaluates and returns, or descends one subdivision level
+          {
+            typename CatmullClarkPatch::Type ty = patch.type();
+
+            if (unlikely(final(patch,ty,depth))) // depth limit reached: evaluate with a regular or fill patch
+            {
+              if (ty & CatmullClarkRing::TYPE_REGULAR) { 
+                RegularPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); 
+                PATCH_DEBUG_SUBDIVISION(234423,c,c,-1);
+                return;
+              } else {
+                IrregularFillPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); 
+                PATCH_DEBUG_SUBDIVISION(34534,c,-1,c);
+                return;
+              }
+            }
+            else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { 
+              assert(depth > 0); 
+              RegularPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); 
+              PATCH_DEBUG_SUBDIVISION(43524,c,c,-1);
+              return;
+            }
+#if PATCH_USE_GREGORY == 2
+            else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { 
+              assert(depth > 0); 
+              GregoryPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); 
+              PATCH_DEBUG_SUBDIVISION(23498,c,-1,c);
+              return;
+            }
+#endif
+            else
+            {
+              array_t<CatmullClarkPatch,4> patches; 
+              patch.subdivide(patches); // FIXME: only have to generate one of the patches
+              
+              const float u = uv.x, v = uv.y;
+              if (v < 0.5f) { // pick the quadrant containing uv; patch (a reference parameter) is overwritten with the chosen sub-patch
+                if (u < 0.5f) { patch = patches[0]; uv = Vec2f(2.0f*u,2.0f*v); dscale *= 2.0f; }
+                else          { patch = patches[1]; uv = Vec2f(2.0f*u-1.0f,2.0f*v); dscale *= 2.0f; }
+              } else {
+                if (u > 0.5f) { patch = patches[2]; uv = Vec2f(2.0f*u-1.0f,2.0f*v-1.0f); dscale *= 2.0f; }
+                else          { patch = patches[3]; uv = Vec2f(2.0f*u,2.0f*v-1.0f); dscale *= 2.0f; }
+              }
+              depth++; // uv was rescaled to the sub-patch and dscale doubled; border pointers are carried over unchanged
+            }
+          }
+        }
+        
+        void eval(const GeneralCatmullClarkPatch& patch, const Vec2f& uv, const size_t depth)  // entry point for general (possibly non-quad) patches
+        {  
+          /* convert into standard quad patch if possible */
+          if (likely(patch.isQuadPatch())) 
+          {
+            CatmullClarkPatch qpatch; patch.init(qpatch);
+            return eval(qpatch,uv,1.0f,depth); // quad patches start with derivative scale 1 at the same depth
+          }
+          
+          /* subdivide patch */
+          unsigned N; // left uninitialized here; presumably set by patch.subdivide() to the number of sub-patches -- confirm
+          array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE> patches; 
+          patch.subdivide(patches,N); // FIXME: only have to generate one of the patches
+          
+          /* parametrization for quads */
+          if (N == 4) 
+            eval_general_quad(patch,patches,uv,depth);
+          
+          /* parametrization for arbitrary polygons */
+          else 
+          {
+            const unsigned l = (unsigned) floor(0.5f*uv.x); const float u = 2.0f*frac(0.5f*uv.x)-0.5f; // uv.x encodes a low index (integer part of uv.x/2) and a local u (from the fraction)
+            const unsigned h = (unsigned) floor(0.5f*uv.y); const float v = 2.0f*frac(0.5f*uv.y)-0.5f; // uv.y encodes a high index and a local v in the same way
+            const unsigned i = 4*h+l; assert(i<N); // sub-patch index
+            if (i >= N) return; // release builds silently skip out-of-range indices, leaving the outputs untouched
+
+#if PATCH_USE_GREGORY == 2
+            BezierCurve borders[2]; patch.getLimitBorder(borders,i);
+            BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+            BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r);
+            eval(patches[i],Vec2f(u,v),1.0f,depth+1, &border0l, nullptr, nullptr, &border2r);
+#else
+            eval(patches[i],Vec2f(u,v),1.0f,depth+1);
+#endif
+          }
+        }
+        
+      private:
+        Vertex* const P;
+        Vertex* const dPdu;
+        Vertex* const dPdv;
+        Vertex* const ddPdudu;
+        Vertex* const ddPdvdv;
+        Vertex* const ddPdudv;
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_grid.h b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_grid.h
new file mode 100644
index 0000000000..76583b2e5d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_grid.h
@@ -0,0 +1,359 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "patch.h"
+#include "catmullclark_patch.h"
+#include "bspline_patch.h"
+#include "gregory_patch.h"
+#include "tessellation.h"
+
+namespace embree
+{
+  namespace isa 
+  {
+    struct FeatureAdaptiveEvalGrid
+    {
+      typedef CatmullClark1Ring3fa CatmullClarkRing;
+      typedef CatmullClarkPatch3fa CatmullClarkPatch;
+      typedef BilinearPatch3fa BilinearPatch;
+      typedef BSplinePatch3fa BSplinePatch;
+      typedef BezierPatch3fa BezierPatch;
+      typedef GregoryPatch3fa GregoryPatch;
+
+    private:
+      const unsigned x0,x1;
+      const unsigned y0,y1;
+      const unsigned swidth,sheight;
+      const float rcp_swidth, rcp_sheight;
+      float* const Px;
+      float* const Py;
+      float* const Pz;
+      float* const U;
+      float* const V;
+      float* const Nx;
+      float* const Ny;
+      float* const Nz;
+      const unsigned dwidth;
+      //const unsigned dheight;
+      unsigned count;
+      
+
+    public:      
+      FeatureAdaptiveEvalGrid (const GeneralCatmullClarkPatch3fa& patch, unsigned subPatch, // evaluates grid points x0..x1, y0..y1 of a swidth x sheight sampling grid into the output arrays
+                               const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, 
+                               float* Px, float* Py, float* Pz, float* U, float* V, 
+                               float* Nx, float* Ny, float* Nz, // normals are only computed and stored when Nx != nullptr (see evalLocalGrid)
+                               const unsigned dwidth, const unsigned dheight) // dwidth is the row stride of the output arrays; dheight is currently unused
+      : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), // NOTE(review): assumes swidth, sheight > 1, otherwise this divides by zero -- confirm callers
+        Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), /*dheight(dheight),*/ count(0)
+      {
+        assert(swidth < (2<<20) && sheight < (2<<20));
+        const BBox2f srange(Vec2f(0.0f,0.0f),Vec2f(float(swidth-1),float(sheight-1))); // full sampling grid covered by this patch
+        const BBox2f erange(Vec2f((float)x0,(float)y0),Vec2f((float)x1,(float)y1)); // window of the grid that has to be evaluated
+        
+        /* convert into standard quad patch if possible */
+        if (likely(patch.isQuadPatch())) 
+        {
+          CatmullClarkPatch3fa qpatch; patch.init(qpatch);
+          eval(qpatch, srange, erange, 0);
+          assert(count == (x1-x0+1)*(y1-y0+1)); // number of evaluated points must equal the size of the requested window
+          return;
+        }
+        
+        /* subdivide patch */
+        unsigned N; // left uninitialized here; presumably set by patch.subdivide() to the number of sub-patches -- confirm
+        array_t<CatmullClarkPatch3fa,GeneralCatmullClarkPatch3fa::SIZE> patches; 
+        patch.subdivide(patches,N);
+        
+        if (N == 4)
+        {
+          const Vec2f c = srange.center(); // the grid is split at its center into four quadrants, one per sub-patch
+          const BBox2f srange0(srange.lower,c);
+          const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y));
+          const BBox2f srange2(c,srange.upper);
+          const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y));
+
+#if PATCH_USE_GREGORY == 2
+          BezierCurve3fa borders[GeneralCatmullClarkPatch3fa::SIZE]; patch.getLimitBorder(borders);
+          BezierCurve3fa border0l,border0r; borders[0].subdivide(border0l,border0r);
+          BezierCurve3fa border1l,border1r; borders[1].subdivide(border1l,border1r);
+          BezierCurve3fa border2l,border2r; borders[2].subdivide(border2l,border2r);
+          BezierCurve3fa border3l,border3r; borders[3].subdivide(border3l,border3r);
+          GeneralCatmullClarkPatch3fa::fix_quad_ring_order(patches);
+          eval(patches[0],srange0,intersect(srange0,erange),1,&border0l,nullptr,nullptr,&border3r);
+          eval(patches[1],srange1,intersect(srange1,erange),1,&border0r,&border1l,nullptr,nullptr);
+          eval(patches[2],srange2,intersect(srange2,erange),1,nullptr,&border1r,&border2l,nullptr);
+          eval(patches[3],srange3,intersect(srange3,erange),1,nullptr,nullptr,&border2r,&border3l);
+#else
+          GeneralCatmullClarkPatch3fa::fix_quad_ring_order(patches);
+          eval(patches[0],srange0,intersect(srange0,erange),1); // each quadrant only evaluates the part of erange it overlaps
+          eval(patches[1],srange1,intersect(srange1,erange),1);
+          eval(patches[2],srange2,intersect(srange2,erange),1);
+          eval(patches[3],srange3,intersect(srange3,erange),1);
+#endif
+        }
+        else
+        {
+          assert(subPatch < N); // for non-quad faces the caller selects a single sub-patch, which covers the whole grid
+          
+#if PATCH_USE_GREGORY == 2
+          BezierCurve3fa borders[2]; patch.getLimitBorder(borders,subPatch);
+          BezierCurve3fa border0l,border0r; borders[0].subdivide(border0l,border0r);
+          BezierCurve3fa border2l,border2r; borders[1].subdivide(border2l,border2r);
+          eval(patches[subPatch], srange, erange, 1, &border0l, nullptr, nullptr, &border2r);
+#else
+          eval(patches[subPatch], srange, erange, 1);
+#endif
+          
+        }
+        assert(count == (x1-x0+1)*(y1-y0+1)); // number of evaluated points must equal the size of the requested window
+      }
+      
+      FeatureAdaptiveEvalGrid (const CatmullClarkPatch3fa& patch, // variant for an already built quad patch with caller-supplied ranges and depth
+                               const BBox2f& srange, const BBox2f& erange, const unsigned depth,
+                               const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, 
+                               float* Px, float* Py, float* Pz, float* U, float* V, 
+                               float* Nx, float* Ny, float* Nz,
+                               const unsigned dwidth, const unsigned dheight) // dheight is currently unused
+      : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), // NOTE(review): assumes swidth, sheight > 1 -- confirm callers
+        Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), /*dheight(dheight),*/ count(0)
+      {
+        eval(patch,srange,erange,depth); // unlike the general-patch constructor, count is not checked afterwards
+      }
+
+      template<typename Patch>
+      void evalLocalGrid(const Patch& patch, const BBox2f& srange, const int lx0, const int lx1, const int ly0, const int ly1) // evaluates patch at grid points [lx0,lx1) x [ly0,ly1) and stores the results
+      {
+        const float scale_x = rcp(srange.upper.x-srange.lower.x); // maps grid coordinates inside srange to local patch coordinates
+        const float scale_y = rcp(srange.upper.y-srange.lower.y);
+        count += (lx1-lx0)*(ly1-ly0); // bookkeeping for the asserts in the constructor
+        
+#if 0
+        for (unsigned iy=ly0; iy<ly1; iy++) {
+          for (unsigned ix=lx0; ix<lx1; ix++) {
+            const float lu = select(ix == swidth -1, float(1.0f), (float(ix)-srange.lower.x)*scale_x);
+            const float lv = select(iy == sheight-1, float(1.0f), (float(iy)-srange.lower.y)*scale_y);
+            const Vec3fa p = patch.eval(lu,lv);
+            const float u = float(ix)*rcp_swidth;
+            const float v = float(iy)*rcp_sheight;
+            const int ofs = (iy-y0)*dwidth+(ix-x0);
+            Px[ofs] = p.x;
+            Py[ofs] = p.y;
+            Pz[ofs] = p.z;
+            U[ofs] = u;
+            V[ofs] = v;
+          }
+        }
+#else
+        foreach2(lx0,lx1,ly0,ly1,[&](const vboolx& valid, const vintx& ix, const vintx& iy) {
+            const vfloatx lu = select(ix == swidth -1, vfloatx(1.0f), (vfloatx(ix)-srange.lower.x)*scale_x); // last grid column is pinned to exactly 1.0
+            const vfloatx lv = select(iy == sheight-1, vfloatx(1.0f), (vfloatx(iy)-srange.lower.y)*scale_y); // last grid row is pinned to exactly 1.0
+            const Vec3vfx p = patch.eval(lu,lv);
+            Vec3vfx n = zero;
+            if (unlikely(Nx != nullptr)) n = normalize_safe(patch.normal(lu,lv)); // normals are optional
+            const vfloatx u = vfloatx(ix)*rcp_swidth; // global parametrization stored in U/V
+            const vfloatx v = vfloatx(iy)*rcp_sheight;
+            const vintx ofs = (iy-y0)*dwidth+(ix-x0); // output index relative to the (x0,y0) corner, row stride dwidth
+            if (likely(all(valid)) && all(iy==iy[0])) { // fast path: all lanes valid and on the same row, so one unmasked store per array
+              const unsigned ofs2 = ofs[0];
+              vfloatx::storeu(Px+ofs2,p.x);
+              vfloatx::storeu(Py+ofs2,p.y);
+              vfloatx::storeu(Pz+ofs2,p.z);
+              vfloatx::storeu(U+ofs2,u);
+              vfloatx::storeu(V+ofs2,v);
+              if (unlikely(Nx != nullptr)) {
+                vfloatx::storeu(Nx+ofs2,n.x);
+                vfloatx::storeu(Ny+ofs2,n.y);
+                vfloatx::storeu(Nz+ofs2,n.z);
+              }
+            } else {
+              foreach_unique_index(valid,iy,[&](const vboolx& valid, const int iy0, const int j) { // slow path: one masked store per distinct row
+                  const unsigned ofs2 = ofs[j]-j; // base address such that lane j lands on its own output slot
+                  vfloatx::storeu(valid,Px+ofs2,p.x);
+                  vfloatx::storeu(valid,Py+ofs2,p.y);
+                  vfloatx::storeu(valid,Pz+ofs2,p.z);
+                  vfloatx::storeu(valid,U+ofs2,u);
+                  vfloatx::storeu(valid,V+ofs2,v);
+                  if (unlikely(Nx != nullptr)) {
+                    vfloatx::storeu(valid,Nx+ofs2,n.x);
+                    vfloatx::storeu(valid,Ny+ofs2,n.y);
+                    vfloatx::storeu(valid,Nz+ofs2,n.z);
+                  }
+                });
+            }
+          });
+#endif
+      }
+      
+      __forceinline bool final(const CatmullClarkPatch3fa& patch, const CatmullClarkRing::Type type, unsigned depth)  // true once depth reaches the maximum evaluation depth for this patch type
+      {
+        const unsigned max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR; // crease patches use their own depth limit
+//#if PATCH_MIN_RESOLUTION
+//        return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=max_eval_depth;
+//#else
+        return depth>=max_eval_depth; // patch is unused while the resolution test above stays commented out
+//#endif
+      }
+      
+      void eval(const CatmullClarkPatch3fa& patch, const BBox2f& srange, const BBox2f& erange, const unsigned depth, // recursively evaluates the grid points of erange that fall into this patch (which covers srange)
+                const BezierCurve3fa* border0 = nullptr, const BezierCurve3fa* border1 = nullptr, const BezierCurve3fa* border2 = nullptr, const BezierCurve3fa* border3 = nullptr)
+      {
+        if (erange.empty())
+          return;
+        
+        int lx0 = (int) ceilf(erange.lower.x); // first integer grid column inside erange
+        int lx1 = (int) ceilf(erange.upper.x) + (erange.upper.x == x1 && (srange.lower.x < erange.upper.x || erange.upper.x == 0)); // exclusive end; the last column x1 is added only if this patch does not merely touch it at its lower edge (or it is column 0)
+        int ly0 = (int) ceilf(erange.lower.y); // first integer grid row inside erange
+        int ly1 = (int) ceilf(erange.upper.y) + (erange.upper.y == y1 && (srange.lower.y < erange.upper.y || erange.upper.y == 0)); // same rule for the last row y1
+        if (lx0 >= lx1 || ly0 >= ly1) return; // no grid point falls into this patch
+
+        CatmullClarkPatch::Type ty = patch.type();
+
+        if (unlikely(final(patch,ty,depth))) // depth limit reached: evaluate with a regular or fill patch
+        {
+          if (ty & CatmullClarkRing::TYPE_REGULAR) {
+            RegularPatch rpatch(patch,border0,border1,border2,border3);
+            evalLocalGrid(rpatch,srange,lx0,lx1,ly0,ly1);
+            return;
+          } else {
+            IrregularFillPatch ipatch(patch,border0,border1,border2,border3);
+            evalLocalGrid(ipatch,srange,lx0,lx1,ly0,ly1);
+            return;
+          }
+        }
+        else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { 
+          assert(depth > 0); 
+          RegularPatch rpatch(patch,border0,border1,border2,border3);
+          evalLocalGrid(rpatch,srange,lx0,lx1,ly0,ly1);
+          return;
+        }
+#if PATCH_USE_GREGORY == 2
+        else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { 
+          assert(depth > 0); 
+          GregoryPatch gpatch(patch,border0,border1,border2,border3);
+          evalLocalGrid(gpatch,srange,lx0,lx1,ly0,ly1); // no explicit return needed: this is the last statement executed on this branch
+        }
+#endif
+        else
+        {
+          array_t<CatmullClarkPatch3fa,4> patches; 
+          patch.subdivide(patches);
+          
+          const Vec2f c = srange.center(); // split the covered grid range at its center into four quadrants
+          const BBox2f srange0(srange.lower,c);
+          const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y));
+          const BBox2f srange2(c,srange.upper);
+          const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y));
+          
+          eval(patches[0],srange0,intersect(srange0,erange),depth+1); // children are evaluated without border curves (the defaults are nullptr)
+          eval(patches[1],srange1,intersect(srange1,erange),depth+1);
+          eval(patches[2],srange2,intersect(srange2,erange),depth+1);
+          eval(patches[3],srange3,intersect(srange3,erange),depth+1);
+        }
+      }
+    };
+    
+    template<typename Eval, typename Patch>
+      bool stitch_col(const Patch& patch, int subPatch, // re-evaluates one border column at resolution coarse_y and writes it into column dx0 of the outputs; returns false if nothing had to be stitched
+                      const bool right, const unsigned y0, const unsigned y1, const int fine_y, const int coarse_y, 
+                      float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dx0, const unsigned dwidth, const unsigned dheight) // dheight is not used in this function
+    {
+      assert(coarse_y <= fine_y);
+      if (likely(fine_y == coarse_y)) // resolutions already agree
+        return false;
+      
+      const unsigned y0s = stitch(y0,fine_y,coarse_y); // row range mapped through stitch() (defined elsewhere)
+      const unsigned y1s = stitch(y1,fine_y,coarse_y);
+      const unsigned M = y1s-y0s+1 + VSIZEX; // temporary length: mapped rows plus VSIZEX extra elements
+      
+      dynamic_large_stack_array(float,px,M,64*sizeof(float)); // temporaries for the coarse column
+      dynamic_large_stack_array(float,py,M,64*sizeof(float));
+      dynamic_large_stack_array(float,pz,M,64*sizeof(float));
+      dynamic_large_stack_array(float,u,M,64*sizeof(float));
+      dynamic_large_stack_array(float,v,M,64*sizeof(float));
+      dynamic_large_stack_array(float,nx,M,64*sizeof(float));
+      dynamic_large_stack_array(float,ny,M,64*sizeof(float));
+      dynamic_large_stack_array(float,nz,M,64*sizeof(float));
+      const bool has_Nxyz = Nx; assert(!Nx || (Ny && Nz)); // normals are optional, but must be given all together
+      Eval(patch,subPatch, right,right, y0s,y1s, 2,coarse_y+1, px,py,pz,u,v, // x range is the single column `right`, rows y0s..y1s
+           has_Nxyz ? (float*)nx : nullptr,has_Nxyz ? (float*)ny : nullptr ,has_Nxyz ? (float*)nz : nullptr, 1,4097);
+      
+      for (unsigned y=y0; y<=y1; y++) // copy the coarse values back, one per fine row
+      {
+        const unsigned ys = stitch(y,fine_y,coarse_y)-y0s; // index of the coarse sample used for fine row y
+        Px[(y-y0)*dwidth+dx0] = px[ys];
+        Py[(y-y0)*dwidth+dx0] = py[ys];
+        Pz[(y-y0)*dwidth+dx0] = pz[ys];
+        U [(y-y0)*dwidth+dx0] = u[ys];
+        V [(y-y0)*dwidth+dx0] = v[ys];
+        if (unlikely(has_Nxyz)) {
+          Nx[(y-y0)*dwidth+dx0] = nx[ys];
+          Ny[(y-y0)*dwidth+dx0] = ny[ys];
+          Nz[(y-y0)*dwidth+dx0] = nz[ys];
+        }
+      }
+      return true;
+    }
+    
+    template<typename Eval, typename Patch>
+      bool stitch_row(const Patch& patch, int subPatch, // re-evaluates one border row at resolution coarse_x and writes it into row dy0 of the outputs; returns false if nothing had to be stitched
+                      const bool bottom, const unsigned x0, const unsigned x1, const int fine_x, const int coarse_x, 
+                      float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dy0, const unsigned dwidth, const unsigned dheight) // dheight is not used in this function
+    {
+      assert(coarse_x <= fine_x);
+      if (likely(fine_x == coarse_x)) // resolutions already agree
+	return false;
+      
+      const unsigned x0s = stitch(x0,fine_x,coarse_x); // column range mapped through stitch() (defined elsewhere)
+      const unsigned x1s = stitch(x1,fine_x,coarse_x);
+      const unsigned M = x1s-x0s+1 + VSIZEX; // temporary length: mapped columns plus VSIZEX extra elements
+
+      dynamic_large_stack_array(float,px,M,32*sizeof(float)); // temporaries for the coarse row
+      dynamic_large_stack_array(float,py,M,32*sizeof(float));
+      dynamic_large_stack_array(float,pz,M,32*sizeof(float));
+      dynamic_large_stack_array(float,u,M,32*sizeof(float));
+      dynamic_large_stack_array(float,v,M,32*sizeof(float));
+      dynamic_large_stack_array(float,nx,M,32*sizeof(float));
+      dynamic_large_stack_array(float,ny,M,32*sizeof(float));
+      dynamic_large_stack_array(float,nz,M,32*sizeof(float));
+      const bool has_Nxyz = Nx; assert(!Nx || (Ny && Nz)); // normals are optional, but must be given all together
+      Eval(patch,subPatch, x0s,x1s, bottom,bottom, coarse_x+1,2, px,py,pz,u,v, // columns x0s..x1s, y range is the single row `bottom`
+           has_Nxyz ? (float*)nx :nullptr, has_Nxyz ? (float*)ny : nullptr , has_Nxyz ? (float*)nz : nullptr, 4097,1);
+      
+      for (unsigned x=x0; x<=x1; x++) // copy the coarse values back, one per fine column
+      {
+	const unsigned xs = stitch(x,fine_x,coarse_x)-x0s; // index of the coarse sample used for fine column x
+	Px[dy0*dwidth+x-x0] = px[xs];
+        Py[dy0*dwidth+x-x0] = py[xs];
+        Pz[dy0*dwidth+x-x0] = pz[xs];
+        U [dy0*dwidth+x-x0] = u[xs];
+        V [dy0*dwidth+x-x0] = v[xs];
+        if (unlikely(has_Nxyz)) {
+          Nx[dy0*dwidth+x-x0] = nx[xs];
+          Ny[dy0*dwidth+x-x0] = ny[xs];
+          Nz[dy0*dwidth+x-x0] = nz[xs];
+        }
+      }
+      return true;
+    }
+    
+    template<typename Eval, typename Patch>
+    void feature_adaptive_eval_grid (const Patch& patch, unsigned subPatch, const float levels[4], // evaluates grid points x0..x1 by y0..y1 into the output arrays, stitching border rows/columns to levels[] when given
+                                     const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, 
+                                     float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dwidth, const unsigned dheight)
+    {
+      bool sl = false, sr = false, st = false, sb = false; // which borders (left/right/top/bottom) were written by stitching
+      if (levels) {
+        sl = x0 == 0         && stitch_col<Eval,Patch>(patch,subPatch,0,y0,y1,sheight-1,int(levels[3]), Px,Py,Pz,U,V,Nx,Ny,Nz, 0    ,dwidth,dheight); // left column uses levels[3]
+        sr = x1 == swidth-1  && stitch_col<Eval,Patch>(patch,subPatch,1,y0,y1,sheight-1,int(levels[1]), Px,Py,Pz,U,V,Nx,Ny,Nz, x1-x0,dwidth,dheight); // right column uses levels[1]
+        st = y0 == 0         && stitch_row<Eval,Patch>(patch,subPatch,0,x0,x1,swidth-1,int(levels[0]), Px,Py,Pz,U,V,Nx,Ny,Nz, 0    ,dwidth,dheight); // top row uses levels[0]
+        sb = y1 == sheight-1 && stitch_row<Eval,Patch>(patch,subPatch,1,x0,x1,swidth-1,int(levels[2]), Px,Py,Pz,U,V,Nx,Ny,Nz, y1-y0,dwidth,dheight); // bottom row uses levels[2]
+      }
+      const unsigned ofs = st*dwidth+sl; // skip the stitched top row / left column in the destination
+      Eval(patch,subPatch,x0+sl,x1-sr,y0+st,y1-sb, swidth,sheight, Px+ofs,Py+ofs,Pz+ofs,U+ofs,V+ofs,Nx?Nx+ofs:nullptr,Ny?Ny+ofs:nullptr,Nz?Nz+ofs:nullptr, dwidth,dheight); // evaluate the remaining range, shrunk by the stitched borders
+    }
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_simd.h b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_simd.h
new file mode 100644
index 0000000000..fa3216730f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_simd.h
@@ -0,0 +1,186 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "patch.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename vbool, typename vint, typename vfloat, typename Vertex, typename Vertex_t = Vertex>
+      struct FeatureAdaptiveEvalSimd // evaluates a subdivision patch for a SIMD packet of (u,v) locations; all work happens in the constructors
+      {
+      public:
+        
+        typedef PatchT<Vertex,Vertex_t> Patch;
+        typedef typename Patch::Ref Ref;
+        typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch;
+        typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing;
+        typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+        typedef BSplinePatchT<Vertex,Vertex_t> BSplinePatch;
+        typedef BezierPatchT<Vertex,Vertex_t> BezierPatch;
+        typedef GregoryPatchT<Vertex,Vertex_t> GregoryPatch;
+        typedef BilinearPatchT<Vertex,Vertex_t> BilinearPatch;
+        typedef BezierCurveT<Vertex> BezierCurve;
+
+        FeatureAdaptiveEvalSimd (const HalfEdge* edge, const char* vertices, size_t stride, const vbool& valid, const vfloat& u, const vfloat& v, // evaluates the face starting at `edge`
+                                 float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N)
+        : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N)
+        {
+          switch (edge->patch_type) { // known patch types are evaluated directly with derivative scale 1.0f
+          case HalfEdge::BILINEAR_PATCH: BilinearPatch(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break;
+          case HalfEdge::REGULAR_QUAD_PATCH: RegularPatchT(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break;
+#if PATCH_USE_GREGORY == 2
+          case HalfEdge::IRREGULAR_QUAD_PATCH: GregoryPatchT<Vertex,Vertex_t>(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break;
+#endif
+          default: { // everything else goes through the general, recursive evaluation starting at depth 0
+            GeneralCatmullClarkPatch patch(edge,vertices,stride);
+            eval_direct(valid,patch,Vec2<vfloat>(u,v),0);
+            break;
+          }
+          }
+        }
+
+        FeatureAdaptiveEvalSimd (const CatmullClarkPatch& patch, const vbool& valid, const vfloat& u, const vfloat& v, float dscale, size_t depth, // evaluates an already built quad patch, continuing at the given depth and derivative scale
+                                 float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N)
+        : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N)
+        {
+          eval_direct(valid,patch,Vec2<vfloat>(u,v),dscale,depth);
+        }
+
+        template<size_t N> // note: this template parameter hides the data member N inside the function below
+        __forceinline void eval_quad_direct(const vbool& valid, array_t<CatmullClarkPatch,N>& patches, const Vec2<vfloat>& uv, float dscale, size_t depth) // routes each active lane to the child patch containing its (u,v)
+        {
+          const vfloat u = uv.x, v = uv.y;
+          const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f; // lanes in the lower / upper half in u
+          const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f; // lanes in the lower / upper half in v
+          const vbool u0v0_mask = valid & u0_mask & v0_mask;
+          const vbool u0v1_mask = valid & u0_mask & v1_mask;
+          const vbool u1v0_mask = valid & u1_mask & v0_mask;
+          const vbool u1v1_mask = valid & u1_mask & v1_mask;
+          if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2<vfloat>(2.0f*u,2.0f*v),2.0f*dscale,depth+1); // coordinates are remapped to the child's [0,1] range, dscale doubles per level
+          if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1);
+          if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1);
+          if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2<vfloat>(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1);
+        }
+        
+        template<size_t N> // note: hides the data member N, as above
+        __forceinline void eval_general_quad_direct(const vbool& valid, const GeneralCatmullClarkPatch& patch, array_t<CatmullClarkPatch,N>& patches, const Vec2<vfloat>& uv, float dscale, size_t depth) // like eval_quad_direct, for the children of a general patch
+        {
+#if PATCH_USE_GREGORY == 2
+          BezierCurve borders[GeneralCatmullClarkPatch::SIZE]; patch.getLimitBorder(borders); // limit border curves of the parent, split into halves for the children
+          BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+          BezierCurve border1l,border1r; borders[1].subdivide(border1l,border1r);
+          BezierCurve border2l,border2r; borders[2].subdivide(border2l,border2r);
+          BezierCurve border3l,border3r; borders[3].subdivide(border3l,border3r);
+#endif
+          GeneralCatmullClarkPatch::fix_quad_ring_order(patches); // reorders the child rings before evaluation (helper defined elsewhere)
+          const vfloat u = uv.x, v = uv.y;
+          const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f;
+          const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f;
+          const vbool u0v0_mask = valid & u0_mask & v0_mask;
+          const vbool u0v1_mask = valid & u0_mask & v1_mask;
+          const vbool u1v0_mask = valid & u1_mask & v0_mask;
+          const vbool u1v1_mask = valid & u1_mask & v1_mask;
+#if PATCH_USE_GREGORY == 2
+          if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2<vfloat>(2.0f*u,2.0f*v),2.0f*dscale,depth+1,&border0l,nullptr,nullptr,&border3r); // each child receives the two border halves on its outer edges
+          if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1,&border0r,&border1l,nullptr,nullptr);
+          if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1,nullptr,&border1r,&border2l,nullptr);
+          if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2<vfloat>(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1,nullptr,nullptr,&border2r,&border3l);
+#else
+          if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2<vfloat>(2.0f*u,2.0f*v),2.0f*dscale,depth+1);
+          if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1);
+          if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1);
+          if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2<vfloat>(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1);
+#endif
+        }
+        
+        __forceinline bool final(const CatmullClarkPatch& patch, const typename CatmullClarkRing::Type type, size_t depth) // true once `depth` reaches the depth limit selected by `type`
+        {
+          const size_t max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR;
+//#if PATCH_MIN_RESOLUTION
+//          return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=max_eval_depth;
+//#else
+          return depth>=max_eval_depth; // `patch` is only read by the disabled variant above
+//#endif
+        }
+
+        void eval_direct(const vbool& valid, const CatmullClarkPatch& patch, const Vec2<vfloat>& uv, float dscale, size_t depth, // evaluates a quad patch directly when its type allows, otherwise subdivides and recurses
+                         BezierCurve* border0 = nullptr, BezierCurve* border1 = nullptr, BezierCurve* border2 = nullptr, BezierCurve* border3 = nullptr)
+        {
+          typename CatmullClarkPatch::Type ty = patch.type();
+
+          if (unlikely(final(patch,ty,depth))) // depth limit reached: evaluate without further subdivision
+          {
+            if (ty & CatmullClarkRing::TYPE_REGULAR) { 
+              RegularPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+            } else {
+              IrregularFillPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+            }
+          }
+          else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { 
+            assert(depth > 0); RegularPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+          }
+#if PATCH_USE_GREGORY == 2
+          else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { 
+            assert(depth > 0); GregoryPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+          }
+#endif
+          else
+          {
+            array_t<CatmullClarkPatch,4> patches; 
+            patch.subdivide(patches); // FIXME: only have to generate one of the patches
+            eval_quad_direct(valid,patches,uv,dscale,depth); // border curves are not forwarded to the children here
+          }
+        }  
+
+        void eval_direct(const vbool& valid, const GeneralCatmullClarkPatch& patch, const Vec2<vfloat>& uv, const size_t depth) // entry point for general (possibly non-quad) patches
+        {
+          /* convert into standard quad patch if possible */
+          if (likely(patch.isQuadPatch())) {
+            CatmullClarkPatch qpatch; patch.init(qpatch);
+            return eval_direct(valid,qpatch,uv,1.0f,depth);
+          }
+          
+          /* subdivide patch */
+          unsigned Nc; // number of child patches, set by subdivide()
+          array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE> patches; 
+          patch.subdivide(patches,Nc); // FIXME: only have to generate one of the patches
+          
+          /* parametrization for quads */
+          if (Nc == 4) 
+            eval_general_quad_direct(valid,patch,patches,uv,1.0f,depth);
+          
+          /* parametrization for arbitrary polygons */
+          else 
+          {
+            const vint l = (vint)floor(0.5f*uv.x); const vfloat u = 2.0f*frac(0.5f*uv.x)-0.5f; // l: integer part of uv.x/2, u: remapped fractional part
+            const vint h = (vint)floor(0.5f*uv.y); const vfloat v = 2.0f*frac(0.5f*uv.y)-0.5f; // h and v likewise from uv.y
+            const vint i = (h<<2)+l; assert(all(valid,i<Nc)); // child index encoded in the integer parts of (u,v)
+            foreach_unique(valid,i,[&](const vbool& valid, const int i) { // callback receives a lane mask and one child index i at a time
+#if PATCH_USE_GREGORY == 2
+                BezierCurve borders[2]; patch.getLimitBorder(borders,i);
+                BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+                BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r);
+                eval_direct(valid,patches[i],Vec2<vfloat>(u,v),1.0f,depth+1, &border0l, nullptr, nullptr, &border2r);
+#else
+                eval_direct(valid,patches[i],Vec2<vfloat>(u,v),1.0f,depth+1);
+#endif
+              });
+          }
+        }
+
+      private:
+        float* const P; // output pointers and layout, stored by the constructors and handed to every patch eval() call
+        float* const dPdu;
+        float* const dPdv;
+        float* const ddPdudu;
+        float* const ddPdvdv;
+        float* const ddPdudv;
+        const size_t dstride;
+        const size_t N;
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch.h
new file mode 100644
index 0000000000..2a7c4b1f2c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch.h
@@ -0,0 +1,893 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_patch.h"
+#include "bezier_patch.h"
+#include "bezier_curve.h"
+#include "catmullclark_coefficients.h"
+
+namespace embree
+{  
+  template<typename Vertex, typename Vertex_t = Vertex>
+  class __aligned(64) GregoryPatchT
+  {
+    typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+    typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch;
+    typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring;
+    typedef BezierCurveT<Vertex> BezierCurve;
+
+  public:
+    Vertex v[4][4];
+    Vertex f[2][2];
+
+    __forceinline GregoryPatchT() {} // leaves all control points uninitialized
+
+    __forceinline GregoryPatchT(const CatmullClarkPatch& patch) { // builds the control points from a Catmull-Clark quad patch
+      init(patch);
+    }
+
+    __forceinline GregoryPatchT(const CatmullClarkPatch& patch, // as above, but non-null border curves override the matching boundary control points
+                                const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) 
+    {
+      init_crackfix(patch,border0,border1,border2,border3);
+    }
+
+    __forceinline GregoryPatchT (const HalfEdge* edge, const char* vertices, size_t stride) { // builds a temporary CatmullClarkPatch from the half-edge data and initializes from it
+      init(CatmullClarkPatch(edge,vertices,stride));
+    }
+      
+    __forceinline Vertex& p0() { return v[0][0]; } // corner points: the four corners of the 4x4 array v
+    __forceinline Vertex& p1() { return v[0][3]; }
+    __forceinline Vertex& p2() { return v[3][3]; }
+    __forceinline Vertex& p3() { return v[3][0]; }
+    
+    __forceinline Vertex& e0_p() { return v[0][1]; } // edge points: the eight non-corner boundary entries of v
+    __forceinline Vertex& e0_m() { return v[1][0]; }
+    __forceinline Vertex& e1_p() { return v[1][3]; }
+    __forceinline Vertex& e1_m() { return v[0][2]; }
+    __forceinline Vertex& e2_p() { return v[3][2]; }
+    __forceinline Vertex& e2_m() { return v[2][3]; }
+    __forceinline Vertex& e3_p() { return v[2][0]; }
+    __forceinline Vertex& e3_m() { return v[3][1]; }
+    
+    __forceinline Vertex& f0_p() { return v[1][1]; } // "plus" face points: the four interior entries of v
+    __forceinline Vertex& f1_p() { return v[1][2]; }
+    __forceinline Vertex& f2_p() { return v[2][2]; }
+    __forceinline Vertex& f3_p() { return v[2][1]; }
+    __forceinline Vertex& f0_m() { return f[0][0]; } // "minus" face points: stored separately in the 2x2 array f
+    __forceinline Vertex& f1_m() { return f[0][1]; }
+    __forceinline Vertex& f2_m() { return f[1][1]; }
+    __forceinline Vertex& f3_m() { return f[1][0]; }
+    
+    __forceinline const Vertex& p0() const { return v[0][0]; } // read-only corner points
+    __forceinline const Vertex& p1() const { return v[0][3]; }
+    __forceinline const Vertex& p2() const { return v[3][3]; }
+    __forceinline const Vertex& p3() const { return v[3][0]; }
+    
+    __forceinline const Vertex& e0_p() const { return v[0][1]; } // read-only edge points
+    __forceinline const Vertex& e0_m() const { return v[1][0]; }
+    __forceinline const Vertex& e1_p() const { return v[1][3]; }
+    __forceinline const Vertex& e1_m() const { return v[0][2]; }
+    __forceinline const Vertex& e2_p() const { return v[3][2]; }
+    __forceinline const Vertex& e2_m() const { return v[2][3]; }
+    __forceinline const Vertex& e3_p() const { return v[2][0]; }
+    __forceinline const Vertex& e3_m() const { return v[3][1]; }
+    
+    __forceinline const Vertex& f0_p() const { return v[1][1]; } // read-only "plus" face points (interior of v)
+    __forceinline const Vertex& f1_p() const { return v[1][2]; }
+    __forceinline const Vertex& f2_p() const { return v[2][2]; }
+    __forceinline const Vertex& f3_p() const { return v[2][1]; }
+    __forceinline const Vertex& f0_m() const { return f[0][0]; } // read-only "minus" face points (array f)
+    __forceinline const Vertex& f1_m() const { return f[0][1]; }
+    __forceinline const Vertex& f2_m() const { return f[1][1]; }
+    __forceinline const Vertex& f3_m() const { return f[1][0]; }
+    
+    __forceinline Vertex initCornerVertex(const CatmullClarkPatch& irreg_patch, const size_t index) { // corner control point = limit vertex of ring `index`
+      return irreg_patch.ring[index].getLimitVertex();
+    }
+    
+    __forceinline Vertex initPositiveEdgeVertex(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) { // edge point = p_vtx + 1/3 * limit tangent of ring `index`
+      return madd(1.0f/3.0f,irreg_patch.ring[index].getLimitTangent(),p_vtx);
+    }
+    
+    __forceinline Vertex initNegativeEdgeVertex(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) { // edge point = p_vtx + 1/3 * second limit tangent of ring `index`
+      return madd(1.0f/3.0f,irreg_patch.ring[index].getSecondLimitTangent(),p_vtx);
+    }
+
+    __forceinline Vertex initPositiveEdgeVertex2(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) // variant of initPositiveEdgeVertex that takes the tangent from the ring after three subdivisions
+    {
+      CatmullClark1Ring r0,r1,r2; // class typedef instead of CatmullClark1Ring3fa, so the ring type follows Vertex/Vertex_t
+      irreg_patch.ring[index].subdivide(r0);
+      r0.subdivide(r1);
+      r1.subdivide(r2);
+      return madd(8.0f/3.0f,r2.getLimitTangent(),p_vtx); // weight is 8/3 here, versus 1/3 in the non-subdivided variant
+    }
+    
+    __forceinline Vertex initNegativeEdgeVertex2(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) // variant of initNegativeEdgeVertex that takes the tangent from the ring after three subdivisions
+    {
+      CatmullClark1Ring r0,r1,r2; // class typedef instead of CatmullClark1Ring3fa, so the ring type follows Vertex/Vertex_t
+      irreg_patch.ring[index].subdivide(r0);
+      r0.subdivide(r1);
+      r1.subdivide(r2);
+      return madd(8.0f/3.0f,r2.getSecondLimitTangent(),p_vtx); // weight is 8/3 here, versus 1/3 in the non-subdivided variant
+    }
+    
+    void initFaceVertex(const CatmullClarkPatch& irreg_patch, // computes the two face points (f_p_vtx, f_m_vtx) belonging to corner `index`
+			const size_t index, 
+			const Vertex& p_vtx, 
+                        const Vertex& e0_p_vtx, 
+			const Vertex& e1_m_vtx, 
+			const unsigned int face_valence_p1, // valence used for the cosine weight of f_p_vtx
+ 			const Vertex& e0_m_vtx,	
+			const Vertex& e3_p_vtx,	
+			const unsigned int face_valence_p3, // valence used for the cosine weight of f_m_vtx
+			Vertex& f_p_vtx, 
+			Vertex& f_m_vtx)
+    {
+      const unsigned int face_valence = irreg_patch.ring[index].face_valence;
+      const unsigned int edge_valence = irreg_patch.ring[index].edge_valence;
+      const unsigned int border_index = irreg_patch.ring[index].border_index;
+      
+      const Vertex& vtx     = irreg_patch.ring[index].vtx;
+      const Vertex e_i      = irreg_patch.ring[index].getEdgeCenter(0);
+      const Vertex c_i_m_1  = irreg_patch.ring[index].getQuadCenter(0);
+      const Vertex e_i_m_1  = irreg_patch.ring[index].getEdgeCenter(1);
+      
+      Vertex c_i, e_i_p_1;
+      const bool hasHardEdge0 = // both the vertex crease weight and crease weight 0 are infinite
+        std::isinf(irreg_patch.ring[index].vertex_crease_weight) &&
+        std::isinf(irreg_patch.ring[index].crease_weight[0]);
+                
+      if (unlikely((border_index == edge_valence-2) || hasHardEdge0))
+      {
+        /* mirror quad center and edge mid-point */
+        c_i     = madd(2.0f, e_i - c_i_m_1, c_i_m_1);
+        e_i_p_1 = madd(2.0f, vtx - e_i_m_1, e_i_m_1);
+      }
+      else
+      {
+        c_i     = irreg_patch.ring[index].getQuadCenter( face_valence-1 ); // otherwise read them from the last face/edge of the ring
+        e_i_p_1 = irreg_patch.ring[index].getEdgeCenter( face_valence-1 );
+      }
+      
+      Vertex c_i_m_2, e_i_m_2;
+      const bool hasHardEdge1 = // same test for crease weight 1
+        std::isinf(irreg_patch.ring[index].vertex_crease_weight) &&
+        std::isinf(irreg_patch.ring[index].crease_weight[1]);
+      
+      if (unlikely(border_index == 2 || hasHardEdge1))
+      {
+        /* mirror quad center and edge mid-point */
+        c_i_m_2  = madd(2.0f, e_i_m_1 - c_i_m_1, c_i_m_1);
+        e_i_m_2  = madd(2.0f, vtx - e_i, + e_i); // the '+' before e_i is a unary plus
+      }
+      else
+      {
+        c_i_m_2  = irreg_patch.ring[index].getQuadCenter( 1 );
+        e_i_m_2  = irreg_patch.ring[index].getEdgeCenter( 2 );
+      }      
+      
+      const float d = 3.0f; // normalisation constant of the face-point formula
+      //const float c     = cosf(2.0f*M_PI/(float)face_valence);
+      //const float c_e_p = cosf(2.0f*M_PI/(float)face_valence_p1);
+      //const float c_e_m = cosf(2.0f*M_PI/(float)face_valence_p3);
+      
+      const float c     = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence); // table lookup replacing the cosf() calls kept above for reference
+      const float c_e_p = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p1);
+      const float c_e_m = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p3);
+
+      const Vertex r_e_p = 1.0f/3.0f * (e_i_m_1 - e_i_p_1) + 2.0f/3.0f * (c_i_m_1 - c_i); // correction terms built from edge-center and quad-center differences
+      const Vertex r_e_m = 1.0f/3.0f * (e_i     - e_i_m_2) + 2.0f/3.0f * (c_i_m_1 - c_i_m_2);
+
+      f_p_vtx = 1.0f / d * (c_e_p * p_vtx + (d - 2.0f*c - c_e_p) * e0_p_vtx + 2.0f*c* e1_m_vtx + r_e_p);      
+      f_m_vtx = 1.0f / d * (c_e_m * p_vtx + (d - 2.0f*c - c_e_m) * e0_m_vtx + 2.0f*c* e3_p_vtx + r_e_m);     
+    }
+
+    __noinline void init(const CatmullClarkPatch& patch) // fills the corner, edge and face control points from the four rings of `patch`
+    {
+      assert( patch.ring[0].hasValidPositions() );
+      assert( patch.ring[1].hasValidPositions() );
+      assert( patch.ring[2].hasValidPositions() );
+      assert( patch.ring[3].hasValidPositions() );
+      
+      p0() = initCornerVertex(patch,0); // corners first: the edge points below are offsets from them
+      p1() = initCornerVertex(patch,1);
+      p2() = initCornerVertex(patch,2);
+      p3() = initCornerVertex(patch,3);
+
+      e0_p() = initPositiveEdgeVertex(patch,0, p0());
+      e1_p() = initPositiveEdgeVertex(patch,1, p1());
+      e2_p() = initPositiveEdgeVertex(patch,2, p2());
+      e3_p() = initPositiveEdgeVertex(patch,3, p3());
+
+      e0_m() = initNegativeEdgeVertex(patch,0, p0());
+      e1_m() = initNegativeEdgeVertex(patch,1, p1());
+      e2_m() = initNegativeEdgeVertex(patch,2, p2());
+      e3_m() = initNegativeEdgeVertex(patch,3, p3());
+
+      const unsigned int face_valence_p0 = patch.ring[0].face_valence;
+      const unsigned int face_valence_p1 = patch.ring[1].face_valence;
+      const unsigned int face_valence_p2 = patch.ring[2].face_valence;
+      const unsigned int face_valence_p3 = patch.ring[3].face_valence;
+      
+      initFaceVertex(patch,0,p0(),e0_p(),e1_m(),face_valence_p1,e0_m(),e3_p(),face_valence_p3,f0_p(),f0_m() ); // corner i is paired with the valences of corners i+1 and i-1 (cyclic)
+      initFaceVertex(patch,1,p1(),e1_p(),e2_m(),face_valence_p2,e1_m(),e0_p(),face_valence_p0,f1_p(),f1_m() );
+      initFaceVertex(patch,2,p2(),e2_p(),e3_m(),face_valence_p3,e2_m(),e1_p(),face_valence_p1,f2_p(),f2_m() );
+      initFaceVertex(patch,3,p3(),e3_p(),e0_m(),face_valence_p0,e3_m(),e2_p(),face_valence_p2,f3_p(),f3_m() ); // fixed: e3_m()/e2_p() lie on the p3-p2 edge, so the second valence is corner 2's (was face_valence_p3)
+
+    }
+
+    __noinline void init_crackfix(const CatmullClarkPatch& patch, // like init(patch), but each non-null border curve overwrites the control points of its boundary edge
+                                  const BezierCurve* border0, 
+                                  const BezierCurve* border1,
+                                  const BezierCurve* border2, 
+                                  const BezierCurve* border3)
+    {
+      assert( patch.ring[0].hasValidPositions() );
+      assert( patch.ring[1].hasValidPositions() );
+      assert( patch.ring[2].hasValidPositions() );
+      assert( patch.ring[3].hasValidPositions() );
+      
+      p0() = initCornerVertex(patch,0); // corners first: the edge points below are offsets from them
+      p1() = initCornerVertex(patch,1);
+      p2() = initCornerVertex(patch,2);
+      p3() = initCornerVertex(patch,3);
+
+      e0_p() = initPositiveEdgeVertex(patch,0, p0());
+      e1_p() = initPositiveEdgeVertex(patch,1, p1());
+      e2_p() = initPositiveEdgeVertex(patch,2, p2());
+      e3_p() = initPositiveEdgeVertex(patch,3, p3());
+
+      e0_m() = initNegativeEdgeVertex(patch,0, p0());
+      e1_m() = initNegativeEdgeVertex(patch,1, p1());
+      e2_m() = initNegativeEdgeVertex(patch,2, p2());
+      e3_m() = initNegativeEdgeVertex(patch,3, p3());
+
+      if (unlikely(border0 != nullptr)) // edge p0 -> p1 taken from border0
+      {         
+        p0()   = border0->v0;
+        e0_p() = border0->v1; 
+        e1_m() = border0->v2; 
+        p1()   = border0->v3;
+      }
+      
+      if (unlikely(border1 != nullptr)) // edge p1 -> p2 taken from border1
+      {          
+        p1()   = border1->v0; 
+        e1_p() = border1->v1; 
+        e2_m() = border1->v2; 
+        p2()   = border1->v3; 
+      }
+
+      if (unlikely(border2 != nullptr)) // edge p2 -> p3 taken from border2
+      {          
+        p2()   = border2->v0; 
+        e2_p() = border2->v1; 
+        e3_m() = border2->v2; 
+        p3()   = border2->v3; 
+      }
+
+      if (unlikely(border3 != nullptr)) // edge p3 -> p0 taken from border3
+      {          
+        p3()   = border3->v0; 
+        e3_p() = border3->v1; 
+        e0_m() = border3->v2; 
+        p0()   = border3->v3; 
+      }
+
+      const unsigned int face_valence_p0 = patch.ring[0].face_valence;
+      const unsigned int face_valence_p1 = patch.ring[1].face_valence;
+      const unsigned int face_valence_p2 = patch.ring[2].face_valence;
+      const unsigned int face_valence_p3 = patch.ring[3].face_valence;
+      
+      initFaceVertex(patch,0,p0(),e0_p(),e1_m(),face_valence_p1,e0_m(),e3_p(),face_valence_p3,f0_p(),f0_m() ); // face points are derived after the overrides; corner i is paired with the valences of corners i+1 and i-1 (cyclic)
+      initFaceVertex(patch,1,p1(),e1_p(),e2_m(),face_valence_p2,e1_m(),e0_p(),face_valence_p0,f1_p(),f1_m() );
+      initFaceVertex(patch,2,p2(),e2_p(),e3_m(),face_valence_p3,e2_m(),e1_p(),face_valence_p1,f2_p(),f2_m() );
+      initFaceVertex(patch,3,p3(),e3_p(),e0_m(),face_valence_p0,e3_m(),e2_p(),face_valence_p2,f3_p(),f3_m() ); // fixed: e3_m()/e2_p() lie on the p3-p2 edge, so the second valence is corner 2's (was face_valence_p3)
+    }
+
+    
+    void computeGregoryPatchFacePoints(const unsigned int face_valence, // computes the two face points of one corner from precomputed correction terms r_e_p / r_e_m
+				       const Vertex& r_e_p, 
+				       const Vertex& r_e_m, 					 
+				       const Vertex& p_vtx, 
+				       const Vertex& e0_p_vtx, 
+				       const Vertex& e1_m_vtx, 
+				       const unsigned int face_valence_p1, // valence used for the cosine weight of f_p_vtx
+				       const Vertex& e0_m_vtx,	
+				       const Vertex& e3_p_vtx,	
+				       const unsigned int face_valence_p3, // valence used for the cosine weight of f_m_vtx
+				       Vertex& f_p_vtx, 
+				       Vertex& f_m_vtx,
+                                       const float d = 3.0f)
+    {
+      //const float c     = cosf(2.0*M_PI/(float)face_valence);
+      //const float c_e_p = cosf(2.0*M_PI/(float)face_valence_p1);
+      //const float c_e_m = cosf(2.0*M_PI/(float)face_valence_p3);
+
+      const float c     = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence);
+      const float c_e_p = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p1);
+      const float c_e_m = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p3);
+
+      /* Each face point blends the corner, its own edge point, the neighbouring */
+      /* edge point and the correction term, scaled by 1/d. Both are assigned    */
+      /* exactly once; an identical second pair of assignments was removed.      */
+      f_p_vtx = 1.0f / d * (c_e_p * p_vtx + (d - 2.0f*c - c_e_p) * e0_p_vtx + 2.0f*c* e1_m_vtx + r_e_p);      
+      f_m_vtx = 1.0f / d * (c_e_m * p_vtx + (d - 2.0f*c - c_e_m) * e0_m_vtx + 2.0f*c* e3_p_vtx + r_e_m);
+    }
+
+    /*! Initializes the face points of this patch from a general Catmull-Clark
+     *  patch that must consist of exactly four rings (asserted).
+     *
+     *  For every corner the ring supplies the r terms through
+     *  computeGregoryPatchEdgePoints; computeGregoryPatchFacePoints then
+     *  combines them with the corner point, the adjacent edge points and the
+     *  face valences of the two neighbouring corners. */
+    __noinline void init(const GeneralCatmullClarkPatch& patch)
+    {
+      assert(patch.size() == 4);
+#if 0
+      CatmullClarkPatch qpatch; patch.init(qpatch);
+      init(qpatch);
+#else
+      /* valences are kept as unsigned int, the type computeGregoryPatchFacePoints
+         expects, instead of taking a detour through float */
+      const unsigned int face_valence_p0 = patch.ring[0].face_valence;
+      const unsigned int face_valence_p1 = patch.ring[1].face_valence;
+      const unsigned int face_valence_p2 = patch.ring[2].face_valence;
+      const unsigned int face_valence_p3 = patch.ring[3].face_valence;
+
+      Vertex p0_r_p, p0_r_m;
+      patch.ring[0].computeGregoryPatchEdgePoints( p0(), e0_p(), e0_m(), p0_r_p, p0_r_m );
+
+      Vertex p1_r_p, p1_r_m;
+      patch.ring[1].computeGregoryPatchEdgePoints( p1(), e1_p(), e1_m(), p1_r_p, p1_r_m );
+
+      Vertex p2_r_p, p2_r_m;
+      patch.ring[2].computeGregoryPatchEdgePoints( p2(), e2_p(), e2_m(), p2_r_p, p2_r_m );
+
+      Vertex p3_r_p, p3_r_m;
+      patch.ring[3].computeGregoryPatchEdgePoints( p3(), e3_p(), e3_m(), p3_r_p, p3_r_m );
+
+      /* corner i pairs the edge point of corner i+1 with the valence of corner i+1
+         (plus side) and the edge point of corner i-1 with the valence of corner i-1
+         (minus side), indices taken modulo 4 */
+      computeGregoryPatchFacePoints(face_valence_p0, p0_r_p, p0_r_m, p0(), e0_p(), e1_m(), face_valence_p1, e0_m(), e3_p(), face_valence_p3, f0_p(), f0_m() );
+      computeGregoryPatchFacePoints(face_valence_p1, p1_r_p, p1_r_m, p1(), e1_p(), e2_m(), face_valence_p2, e1_m(), e0_p(), face_valence_p0, f1_p(), f1_m() );
+      computeGregoryPatchFacePoints(face_valence_p2, p2_r_p, p2_r_m, p2(), e2_p(), e3_m(), face_valence_p3, e2_m(), e1_p(), face_valence_p1, f2_p(), f2_m() );
+      /* minus side of corner 3 uses e2_p(), hence the valence of corner 2 (not corner 3 itself) */
+      computeGregoryPatchFacePoints(face_valence_p3, p3_r_p, p3_r_m, p3(), e3_p(), e0_m(), face_valence_p0, e3_m(), e2_p(), face_valence_p2, f3_p(), f3_m() );
+
+#endif
+    }
+   
+    
+    /*! Collapses every pair of face points into one control point: fi_p()
+     *  receives the average of fi_p() and fi_m(), afterwards fi_m() is reset
+     *  to zero. */
+    __forceinline void convert_to_bezier()
+    {
+      /* corner 0 */
+      f0_p() = (f0_p() + f0_m()) * 0.5f;
+      f0_m() = Vertex( zero );
+
+      /* corner 1 */
+      f1_p() = (f1_p() + f1_m()) * 0.5f;
+      f1_m() = Vertex( zero );
+
+      /* corner 2 */
+      f2_p() = (f2_p() + f2_m()) * 0.5f;
+      f2_m() = Vertex( zero );
+
+      /* corner 3 */
+      f3_p() = (f3_p() + f3_m()) * 0.5f;
+      f3_m() = Vertex( zero );
+    }
+    
+    /*! Produces the four inner control points for the parameter (uu,vv).
+     *
+     *  If uu or vv equals 0 or 1 the inner points stored in matrix are copied
+     *  unchanged. Otherwise each inner point is a blend of its "plus" value
+     *  (taken from matrix) and its "minus" value (taken from f_m), weighted by
+     *  uu, vv, 1-uu, 1-vv and normalized by the sum of the two weights. */
+    static __forceinline void computeInnerVertices(const Vertex matrix[4][4], const Vertex f_m[2][2], const float uu, const float vv,
+                                                   Vertex_t& matrix_11, Vertex_t& matrix_12, Vertex_t& matrix_22, Vertex_t& matrix_21)
+    {
+      const bool on_border = (uu == 0.0f || uu == 1.0f || vv == 0.0f || vv == 1.0f);
+      if (unlikely(on_border))
+      {
+        /* the blend denominators may vanish here, so fall back to the stored points */
+        matrix_11 = matrix[1][1];
+        matrix_12 = matrix[1][2];
+        matrix_22 = matrix[2][2];
+        matrix_21 = matrix[2][1];
+        return;
+      }
+
+      const Vertex_t plus0 = matrix[1][1];
+      const Vertex_t plus1 = matrix[1][2];
+      const Vertex_t plus2 = matrix[2][2];
+      const Vertex_t plus3 = matrix[2][1];
+
+      const Vertex_t minus0 = f_m[0][0];
+      const Vertex_t minus1 = f_m[0][1];
+      const Vertex_t minus2 = f_m[1][1];
+      const Vertex_t minus3 = f_m[1][0];
+
+      const float inv_uu = 1.0f-uu;
+      const float inv_vv = 1.0f-vv;
+
+      matrix_11 = (    uu * plus0  +     vv * minus0)*rcp(uu+vv);
+      matrix_12 = (inv_uu * minus1 +     vv * plus1 )*rcp(inv_uu+vv);
+      matrix_22 = (inv_uu * plus2  + inv_vv * minus2)*rcp(2.0f-uu-vv);
+      matrix_21 = (    uu * minus3 + inv_vv * plus3 )*rcp(1.0f+uu-vv);
+    }
+
+    /*! Vectorized variant working on component i of the control points.
+     *
+     *  Lanes where uu or vv equals 0 or 1 receive the stored inner points
+     *  from v; all remaining lanes receive the blend of the v and f values. */
+    template<typename vfloat>
+    static __forceinline void computeInnerVertices(const Vertex v[4][4], const Vertex f[2][2], 
+                                                   size_t i, const vfloat& uu, const vfloat& vv, vfloat& matrix_11, vfloat& matrix_12, vfloat& matrix_22, vfloat& matrix_21) 
+    {
+      /* lanes that lie on the patch border */
+      const auto on_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f);
+
+      /* "plus" points come from the control grid, "minus" points from f */
+      const vfloat plus0 = v[1][1][i];
+      const vfloat plus1 = v[1][2][i];
+      const vfloat plus2 = v[2][2][i];
+      const vfloat plus3 = v[2][1][i];
+
+      const vfloat minus0 = f[0][0][i];
+      const vfloat minus1 = f[0][1][i];
+      const vfloat minus2 = f[1][1][i];
+      const vfloat minus3 = f[1][0][i];
+
+      const vfloat inv_uu = vfloat(1.0f) - uu;
+      const vfloat inv_vv = vfloat(1.0f) - vv;
+
+      const vfloat blend0 = (    uu * plus0  +     vv * minus0) * rcp(uu+vv);
+      const vfloat blend1 = (inv_uu * minus1 +     vv * plus1 ) * rcp(inv_uu+vv);
+      const vfloat blend2 = (inv_uu * plus2  + inv_vv * minus2) * rcp(inv_uu+inv_vv);
+      const vfloat blend3 = (    uu * minus3 + inv_vv * plus3 ) * rcp(uu+inv_vv);
+
+      matrix_11 = select(on_border,plus0,blend0);
+      matrix_12 = select(on_border,plus1,blend1);
+      matrix_22 = select(on_border,plus2,blend2);
+      matrix_21 = select(on_border,plus3,blend3);
+    }
+
+    /*! Evaluates the patch at (uu,vv): the inner control points are taken
+     *  from computeInnerVertices, all others from matrix, and both
+     *  directions are weighted with BezierBasis::eval. */
+    static __forceinline Vertex eval(const Vertex matrix[4][4], const Vertex f[2][2], const float& uu, const float& vv) 
+    {
+      Vertex_t in11, in12, in22, in21;
+      computeInnerVertices(matrix,f,uu,vv,in11,in12,in22,in21);
+
+      const Vec4<float> bu = BezierBasis::eval(uu);
+      const Vec4<float> bv = BezierBasis::eval(vv);
+
+      /* weighted sum along u for each of the four rows */
+      const auto row0 = madd(bu.x,matrix[0][0],madd(bu.y,matrix[0][1],madd(bu.z,matrix[0][2],bu.w * matrix[0][3])));
+      const auto row1 = madd(bu.x,matrix[1][0],madd(bu.y,in11        ,madd(bu.z,in12        ,bu.w * matrix[1][3])));
+      const auto row2 = madd(bu.x,matrix[2][0],madd(bu.y,in21        ,madd(bu.z,in22        ,bu.w * matrix[2][3])));
+      const auto row3 = madd(bu.x,matrix[3][0],madd(bu.y,matrix[3][1],madd(bu.z,matrix[3][2],bu.w * matrix[3][3])));
+
+      /* combine the rows along v */
+      return madd(bv.x,row0,madd(bv.y,row1,madd(bv.z,row2,bv.w*row3)));
+    }
+
+    /*! Approximative derivative in u: same weighted sum as eval, but with
+     *  BezierBasis::derivative for the u direction. */
+    static __forceinline Vertex eval_du(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative
+    {
+      Vertex_t in11, in12, in22, in21;
+      computeInnerVertices(matrix,f,uu,vv,in11,in12,in22,in21);
+
+      const Vec4<float> bu = BezierBasis::derivative(uu);
+      const Vec4<float> bv = BezierBasis::eval(vv);
+
+      /* weighted sum along u for each of the four rows */
+      const auto row0 = madd(bu.x,matrix[0][0],madd(bu.y,matrix[0][1],madd(bu.z,matrix[0][2],bu.w * matrix[0][3])));
+      const auto row1 = madd(bu.x,matrix[1][0],madd(bu.y,in11        ,madd(bu.z,in12        ,bu.w * matrix[1][3])));
+      const auto row2 = madd(bu.x,matrix[2][0],madd(bu.y,in21        ,madd(bu.z,in22        ,bu.w * matrix[2][3])));
+      const auto row3 = madd(bu.x,matrix[3][0],madd(bu.y,matrix[3][1],madd(bu.z,matrix[3][2],bu.w * matrix[3][3])));
+
+      /* combine the rows along v */
+      return madd(bv.x,row0,madd(bv.y,row1,madd(bv.z,row2,bv.w*row3)));
+    }
+
+    /*! Approximative derivative in v: same weighted sum as eval, but with
+     *  BezierBasis::derivative for the v direction. */
+    static __forceinline Vertex eval_dv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative
+    {
+      Vertex_t in11, in12, in22, in21;
+      computeInnerVertices(matrix,f,uu,vv,in11,in12,in22,in21);
+
+      const Vec4<float> bu = BezierBasis::eval(uu);
+      const Vec4<float> bv = BezierBasis::derivative(vv);
+
+      /* weighted sum along u for each of the four rows */
+      const auto row0 = madd(bu.x,matrix[0][0],madd(bu.y,matrix[0][1],madd(bu.z,matrix[0][2],bu.w * matrix[0][3])));
+      const auto row1 = madd(bu.x,matrix[1][0],madd(bu.y,in11        ,madd(bu.z,in12        ,bu.w * matrix[1][3])));
+      const auto row2 = madd(bu.x,matrix[2][0],madd(bu.y,in21        ,madd(bu.z,in22        ,bu.w * matrix[2][3])));
+      const auto row3 = madd(bu.x,matrix[3][0],madd(bu.y,matrix[3][1],madd(bu.z,matrix[3][2],bu.w * matrix[3][3])));
+
+      /* combine the rows along v */
+      return madd(bv.x,row0,madd(bv.y,row1,madd(bv.z,row2,bv.w*row3)));
+    }
+
+    /*! Approximative second derivative in u: same weighted sum as eval, but
+     *  with BezierBasis::derivative2 for the u direction. */
+    static __forceinline Vertex eval_dudu(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative
+    {
+      Vertex_t in11, in12, in22, in21;
+      computeInnerVertices(matrix,f,uu,vv,in11,in12,in22,in21);
+
+      const Vec4<float> bu = BezierBasis::derivative2(uu);
+      const Vec4<float> bv = BezierBasis::eval(vv);
+
+      /* weighted sum along u for each of the four rows */
+      const auto row0 = madd(bu.x,matrix[0][0],madd(bu.y,matrix[0][1],madd(bu.z,matrix[0][2],bu.w * matrix[0][3])));
+      const auto row1 = madd(bu.x,matrix[1][0],madd(bu.y,in11        ,madd(bu.z,in12        ,bu.w * matrix[1][3])));
+      const auto row2 = madd(bu.x,matrix[2][0],madd(bu.y,in21        ,madd(bu.z,in22        ,bu.w * matrix[2][3])));
+      const auto row3 = madd(bu.x,matrix[3][0],madd(bu.y,matrix[3][1],madd(bu.z,matrix[3][2],bu.w * matrix[3][3])));
+
+      /* combine the rows along v */
+      return madd(bv.x,row0,madd(bv.y,row1,madd(bv.z,row2,bv.w*row3)));
+    }
+
+    /*! Approximative second derivative in v: same weighted sum as eval, but
+     *  with BezierBasis::derivative2 for the v direction. */
+    static __forceinline Vertex eval_dvdv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative
+    {
+      Vertex_t in11, in12, in22, in21;
+      computeInnerVertices(matrix,f,uu,vv,in11,in12,in22,in21);
+
+      const Vec4<float> bu = BezierBasis::eval(uu);
+      const Vec4<float> bv = BezierBasis::derivative2(vv);
+
+      /* weighted sum along u for each of the four rows */
+      const auto row0 = madd(bu.x,matrix[0][0],madd(bu.y,matrix[0][1],madd(bu.z,matrix[0][2],bu.w * matrix[0][3])));
+      const auto row1 = madd(bu.x,matrix[1][0],madd(bu.y,in11        ,madd(bu.z,in12        ,bu.w * matrix[1][3])));
+      const auto row2 = madd(bu.x,matrix[2][0],madd(bu.y,in21        ,madd(bu.z,in22        ,bu.w * matrix[2][3])));
+      const auto row3 = madd(bu.x,matrix[3][0],madd(bu.y,matrix[3][1],madd(bu.z,matrix[3][2],bu.w * matrix[3][3])));
+
+      /* combine the rows along v */
+      return madd(bv.x,row0,madd(bv.y,row1,madd(bv.z,row2,bv.w*row3)));
+    }
+
+    /*! Approximative mixed derivative: same weighted sum as eval, but with
+     *  BezierBasis::derivative for both directions. */
+    static __forceinline Vertex eval_dudv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative
+    {
+      Vertex_t in11, in12, in22, in21;
+      computeInnerVertices(matrix,f,uu,vv,in11,in12,in22,in21);
+
+      const Vec4<float> bu = BezierBasis::derivative(uu);
+      const Vec4<float> bv = BezierBasis::derivative(vv);
+
+      /* weighted sum along u for each of the four rows */
+      const auto row0 = madd(bu.x,matrix[0][0],madd(bu.y,matrix[0][1],madd(bu.z,matrix[0][2],bu.w * matrix[0][3])));
+      const auto row1 = madd(bu.x,matrix[1][0],madd(bu.y,in11        ,madd(bu.z,in12        ,bu.w * matrix[1][3])));
+      const auto row2 = madd(bu.x,matrix[2][0],madd(bu.y,in21        ,madd(bu.z,in22        ,bu.w * matrix[2][3])));
+      const auto row3 = madd(bu.x,matrix[3][0],madd(bu.y,matrix[3][1],madd(bu.z,matrix[3][2],bu.w * matrix[3][3])));
+
+      /* combine the rows along v */
+      return madd(bv.x,row0,madd(bv.y,row1,madd(bv.z,row2,bv.w*row3)));
+    }
+
+    /*! Evaluates this patch at (uu,vv) by forwarding its own control
+     *  points v and f to the static eval. */
+    __forceinline Vertex eval(const float uu, const float vv) const
+    {
+      return GregoryPatchT::eval(v,f,uu,vv);
+    }
+
+    /*! Forwards this patch's control points v and f to the static eval_du. */
+    __forceinline Vertex eval_du( const float uu, const float vv) const
+    {
+      return GregoryPatchT::eval_du(v,f,uu,vv);
+    }
+
+    /*! Forwards this patch's control points v and f to the static eval_dv. */
+    __forceinline Vertex eval_dv( const float uu, const float vv) const
+    {
+      return GregoryPatchT::eval_dv(v,f,uu,vv);
+    }
+
+    /*! Forwards this patch's control points v and f to the static eval_dudu. */
+    __forceinline Vertex eval_dudu( const float uu, const float vv) const
+    {
+      return GregoryPatchT::eval_dudu(v,f,uu,vv);
+    }
+
+    /*! Forwards this patch's control points v and f to the static eval_dvdv. */
+    __forceinline Vertex eval_dvdv( const float uu, const float vv) const
+    {
+      return GregoryPatchT::eval_dvdv(v,f,uu,vv);
+    }
+
+    /*! Forwards this patch's control points v and f to the static eval_dudv. */
+    __forceinline Vertex eval_dudv( const float uu, const float vv) const
+    {
+      return GregoryPatchT::eval_dudv(v,f,uu,vv);
+    }
+
+    /*! Returns the cross product of the u and v tangents at (uu,vv); the
+     *  result is returned as computed by cross(), without further processing.
+     *
+     *  The tangents are obtained with deCasteljau / deCasteljau_tangent on a
+     *  4x4 grid whose inner points come from computeInnerVertices. */
+    static __forceinline Vertex normal(const Vertex matrix[4][4], const Vertex f_m[2][2], const float uu, const float vv)  // FIXME: why not using basis functions
+    {
+      /* interpolate inner vertices */
+      Vertex_t m11, m12, m22, m21;
+      computeInnerVertices(matrix,f_m,uu,vv,m11,m12,m22,m21);
+
+      /* remaining control points converted to the computation type */
+      const Vertex_t m00 = Vertex_t(matrix[0][0]);
+      const Vertex_t m01 = Vertex_t(matrix[0][1]);
+      const Vertex_t m02 = Vertex_t(matrix[0][2]);
+      const Vertex_t m03 = Vertex_t(matrix[0][3]);
+      const Vertex_t m10 = Vertex_t(matrix[1][0]);
+      const Vertex_t m13 = Vertex_t(matrix[1][3]);
+      const Vertex_t m20 = Vertex_t(matrix[2][0]);
+      const Vertex_t m23 = Vertex_t(matrix[2][3]);
+      const Vertex_t m30 = Vertex_t(matrix[3][0]);
+      const Vertex_t m31 = Vertex_t(matrix[3][1]);
+      const Vertex_t m32 = Vertex_t(matrix[3][2]);
+      const Vertex_t m33 = Vertex_t(matrix[3][3]);
+
+      /* tangentU: reduce every column along v, then take the tangent along u */
+      const Vertex_t c0 = deCasteljau(vv, m00, m10, m20, m30);
+      const Vertex_t c1 = deCasteljau(vv, m01, m11, m21, m31);
+      const Vertex_t c2 = deCasteljau(vv, m02, m12, m22, m32);
+      const Vertex_t c3 = deCasteljau(vv, m03, m13, m23, m33);
+      const Vertex_t du = deCasteljau_tangent(uu, c0, c1, c2, c3);
+
+      /* tangentV: reduce every row along u, then take the tangent along v */
+      const Vertex_t r0 = deCasteljau(uu, m00, m01, m02, m03);
+      const Vertex_t r1 = deCasteljau(uu, m10, m11, m12, m13);
+      const Vertex_t r2 = deCasteljau(uu, m20, m21, m22, m23);
+      const Vertex_t r3 = deCasteljau(uu, m30, m31, m32, m33);
+      const Vertex_t dv = deCasteljau_tangent(vv, r0, r1, r2, r3);
+
+      /* normal = tangentU x tangentV */
+      const Vertex_t n = cross(du,dv);
+      return n;
+    }
+   
+    /*! Forwards this patch's control points v and f to the static normal. */
+    __forceinline Vertex normal( const float uu, const float vv) const
+    {
+      return GregoryPatchT::normal(v,f,uu,vv);
+    }
+    
+    /*! Evaluates position and derivatives at (u,v) into the given outputs.
+     *
+     *  P is written when non-null. dPdu acts as the switch for both first
+     *  derivatives (dPdv is then asserted to be non-null), ddPdudu as the
+     *  switch for all three second derivatives (ddPdvdv and ddPdudv are then
+     *  asserted). First derivatives are multiplied by dscale, second
+     *  derivatives by sqr(dscale). */
+    __forceinline void eval(const float u, const float v, 
+                            Vertex* P, Vertex* dPdu, Vertex* dPdv, 
+                            Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv,
+                            const float dscale = 1.0f) const
+    {
+      if (P != nullptr)
+        *P = eval(u,v);
+
+      if (dPdu != nullptr)
+      {
+        *dPdu = eval_du(u,v)*dscale;
+        assert(dPdv);
+        *dPdv = eval_dv(u,v)*dscale;
+      }
+
+      if (ddPdudu != nullptr)
+      {
+        const float dscale_sq = sqr(dscale);
+        *ddPdudu = eval_dudu(u,v)*dscale_sq;
+        assert(ddPdvdv);
+        *ddPdvdv = eval_dvdv(u,v)*dscale_sq;
+        assert(ddPdudv);
+        *ddPdudv = eval_dudv(u,v)*dscale_sq;
+      }
+    }
+
+    /*! Evaluates component i of the patch for a vector of parameters.
+     *
+     *  u_n and v_n are the weights for the u and v direction; the inner
+     *  control values are passed in as matrix_11 .. matrix_21, all other
+     *  control values are read from v. The parameters f, uu and vv are not
+     *  used by this overload. */
+    template<class vfloat>
+    static __forceinline vfloat eval(const Vertex v[4][4], const Vertex f[2][2], 
+                                     const size_t i, const vfloat& uu, const vfloat& vv, const Vec4<vfloat>& u_n, const Vec4<vfloat>& v_n,
+                                     vfloat& matrix_11, vfloat& matrix_12, vfloat& matrix_22, vfloat& matrix_21)
+    {
+      /* reduce each of the four columns along v */
+      const vfloat col0 = madd(v_n[0],vfloat(v[0][0][i]),madd(v_n[1],vfloat(v[1][0][i]),madd(v_n[2],vfloat(v[2][0][i]),v_n[3] * vfloat(v[3][0][i]))));
+      const vfloat col1 = madd(v_n[0],vfloat(v[0][1][i]),madd(v_n[1],vfloat(matrix_11 ),madd(v_n[2],vfloat(matrix_21 ),v_n[3] * vfloat(v[3][1][i]))));
+      const vfloat col2 = madd(v_n[0],vfloat(v[0][2][i]),madd(v_n[1],vfloat(matrix_12 ),madd(v_n[2],vfloat(matrix_22 ),v_n[3] * vfloat(v[3][2][i]))));
+      const vfloat col3 = madd(v_n[0],vfloat(v[0][3][i]),madd(v_n[1],vfloat(v[1][3][i]),madd(v_n[2],vfloat(v[2][3][i]),v_n[3] * vfloat(v[3][3][i]))));
+
+      /* combine the column results along u */
+      const vfloat result = madd(u_n[0],col0,madd(u_n[1],col1,madd(u_n[2],col2,u_n[3] * col3)));
+      return result;
+    }
+    
+    /*! Evaluates N components of position and derivatives for a vector of
+     *  parameters and stores them under the mask valid.
+     *
+     *  Component i of every output is stored at offset i*dstride. P is
+     *  written when non-null; dPdu switches on both first derivatives (scaled
+     *  by dscale), ddPdudu switches on all three second derivatives (scaled
+     *  by sqr(dscale)). The companion pointers are only asserted. */
+    template<typename vbool, typename vfloat>
+    static __forceinline void eval(const Vertex v[4][4], const Vertex f[2][2], 
+                                   const vbool& valid, const vfloat& uu, const vfloat& vv, 
+                                   float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv,
+                                   const float dscale, const size_t dstride, const size_t N) 
+    {
+      /* position */
+      if (P != nullptr)
+      {
+        const Vec4<vfloat> bu = BezierBasis::eval(uu);
+        const Vec4<vfloat> bv = BezierBasis::eval(vv);
+        for (size_t c=0; c<N; c++) {
+          vfloat in11, in12, in22, in21;
+          computeInnerVertices(v,f,c,uu,vv,in11,in12,in22,in21); // FIXME: calculated multiple times
+          vfloat::store(valid,P+c*dstride,eval(v,f,c,uu,vv,bu,bv,in11,in12,in22,in21));
+        }
+      }
+
+      /* first derivatives */
+      if (dPdu != nullptr)
+      {
+        const Vec4<vfloat> du_bu = BezierBasis::derivative(uu);
+        const Vec4<vfloat> du_bv = BezierBasis::eval(vv);
+        for (size_t c=0; c<N; c++) {
+          vfloat in11, in12, in22, in21;
+          computeInnerVertices(v,f,c,uu,vv,in11,in12,in22,in21);  // FIXME: calculated multiple times
+          vfloat::store(valid,dPdu+c*dstride,eval(v,f,c,uu,vv,du_bu,du_bv,in11,in12,in22,in21)*dscale);
+        }
+
+        assert(dPdv);
+        const Vec4<vfloat> dv_bu = BezierBasis::eval(uu);
+        const Vec4<vfloat> dv_bv = BezierBasis::derivative(vv);
+        for (size_t c=0; c<N; c++) {
+          vfloat in11, in12, in22, in21;
+          computeInnerVertices(v,f,c,uu,vv,in11,in12,in22,in21);  // FIXME: calculated multiple times
+          vfloat::store(valid,dPdv+c*dstride,eval(v,f,c,uu,vv,dv_bu,dv_bv,in11,in12,in22,in21)*dscale);
+        }
+      }
+
+      /* second derivatives */
+      if (ddPdudu != nullptr)
+      {
+        const float dscale_sq = sqr(dscale);
+
+        const Vec4<vfloat> uu_bu = BezierBasis::derivative2(uu);
+        const Vec4<vfloat> uu_bv = BezierBasis::eval(vv);
+        for (size_t c=0; c<N; c++) {
+          vfloat in11, in12, in22, in21;
+          computeInnerVertices(v,f,c,uu,vv,in11,in12,in22,in21);  // FIXME: calculated multiple times
+          vfloat::store(valid,ddPdudu+c*dstride,eval(v,f,c,uu,vv,uu_bu,uu_bv,in11,in12,in22,in21)*dscale_sq);
+        }
+
+        assert(ddPdvdv);
+        const Vec4<vfloat> vv_bu = BezierBasis::eval(uu);
+        const Vec4<vfloat> vv_bv = BezierBasis::derivative2(vv);
+        for (size_t c=0; c<N; c++) {
+          vfloat in11, in12, in22, in21;
+          computeInnerVertices(v,f,c,uu,vv,in11,in12,in22,in21);  // FIXME: calculated multiple times
+          vfloat::store(valid,ddPdvdv+c*dstride,eval(v,f,c,uu,vv,vv_bu,vv_bv,in11,in12,in22,in21)*dscale_sq);
+        }
+
+        assert(ddPdudv);
+        const Vec4<vfloat> uv_bu = BezierBasis::derivative(uu);
+        const Vec4<vfloat> uv_bv = BezierBasis::derivative(vv);
+        for (size_t c=0; c<N; c++) {
+          vfloat in11, in12, in22, in21;
+          computeInnerVertices(v,f,c,uu,vv,in11,in12,in22,in21);  // FIXME: calculated multiple times
+          vfloat::store(valid,ddPdudv+c*dstride,eval(v,f,c,uu,vv,uv_bu,uv_bv,in11,in12,in22,in21)*dscale_sq);
+        }
+      }
+    }
+
+    /*! Masked vector evaluation of this patch: forwards its control points
+     *  v and f together with all arguments to the static overload. */
+    template<typename vbool, typename vfloat>
+    __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, 
+                            float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv,
+                            const float dscale, const size_t dstride, const size_t N) const
+    {
+      GregoryPatchT::eval(v,f,valid,uu,vv,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+    }
+
+    /*! Evaluates the patch position for a vector type T.
+     *
+     *  Lanes where uu or vv equals 0 or 1 use the inner points stored in
+     *  matrix, all other lanes use the blend of the matrix and f values. The
+     *  resulting grid is weighted with cubic Bernstein polynomials in u and v,
+     *  separately for x, y and z. */
+    template<class T>
+      static __forceinline Vec3<T> eval_t(const Vertex matrix[4][4], const Vec3<T> f[2][2], const T& uu, const T& vv) 
+    {
+      typedef typename T::Bool M;
+      const M on_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f);
+
+      /* "plus" points from the control grid, "minus" points from f */
+      const Vec3<T> plus0 = Vec3<T>(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z);
+      const Vec3<T> plus1 = Vec3<T>(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z);
+      const Vec3<T> plus2 = Vec3<T>(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z);
+      const Vec3<T> plus3 = Vec3<T>(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z);
+
+      const Vec3<T> minus0 = f[0][0];
+      const Vec3<T> minus1 = f[0][1];
+      const Vec3<T> minus2 = f[1][1];
+      const Vec3<T> minus3 = f[1][0];
+
+      const T inv_uu = T(1.0f) - uu;
+      const T inv_vv = T(1.0f) - vv;
+
+      const Vec3<T> blend0 = (    uu * plus0  +     vv * minus0) * rcp(uu+vv);
+      const Vec3<T> blend1 = (inv_uu * minus1 +     vv * plus1 ) * rcp(inv_uu+vv);
+      const Vec3<T> blend2 = (inv_uu * plus2  + inv_vv * minus2) * rcp(inv_uu+inv_vv);
+      const Vec3<T> blend3 = (    uu * minus3 + inv_vv * plus3 ) * rcp(uu+inv_vv);
+
+      /* inner control points, selected per lane */
+      const Vec3<T> in11( select(on_border,plus0.x,blend0.x), select(on_border,plus0.y,blend0.y), select(on_border,plus0.z,blend0.z) );
+      const Vec3<T> in12( select(on_border,plus1.x,blend1.x), select(on_border,plus1.y,blend1.y), select(on_border,plus1.z,blend1.z) );
+      const Vec3<T> in22( select(on_border,plus2.x,blend2.x), select(on_border,plus2.y,blend2.y), select(on_border,plus2.z,blend2.z) );
+      const Vec3<T> in21( select(on_border,plus3.x,blend3.x), select(on_border,plus3.y,blend3.y), select(on_border,plus3.z,blend3.z) );
+
+      /* cubic Bernstein weights in u ... */
+      const T bu0 = inv_uu * inv_uu * inv_uu;
+      const T bu1 = 3.0f * (inv_uu * uu * inv_uu);
+      const T bu2 = 3.0f * (uu * inv_uu * uu);
+      const T bu3 = uu * uu * uu;
+
+      /* ... and in v */
+      const T bv0 = inv_vv * inv_vv * inv_vv;
+      const T bv1 = 3.0f * (inv_vv * vv * inv_vv);
+      const T bv2 = 3.0f * (vv * inv_vv * vv);
+      const T bv3 = vv * vv * vv;
+
+      /* x component: row sums along u, then combination along v */
+      const auto x0 = madd(bu0,matrix[0][0].x,madd(bu1,matrix[0][1].x,madd(bu2,matrix[0][2].x,bu3 * matrix[0][3].x)));
+      const auto x1 = madd(bu0,matrix[1][0].x,madd(bu1,in11.x        ,madd(bu2,in12.x        ,bu3 * matrix[1][3].x)));
+      const auto x2 = madd(bu0,matrix[2][0].x,madd(bu1,in21.x        ,madd(bu2,in22.x        ,bu3 * matrix[2][3].x)));
+      const auto x3 = madd(bu0,matrix[3][0].x,madd(bu1,matrix[3][1].x,madd(bu2,matrix[3][2].x,bu3 * matrix[3][3].x)));
+      const T x = madd(bv0,x0,madd(bv1,x1,madd(bv2,x2,bv3*x3)));
+
+      /* y component */
+      const auto y0 = madd(bu0,matrix[0][0].y,madd(bu1,matrix[0][1].y,madd(bu2,matrix[0][2].y,bu3 * matrix[0][3].y)));
+      const auto y1 = madd(bu0,matrix[1][0].y,madd(bu1,in11.y        ,madd(bu2,in12.y        ,bu3 * matrix[1][3].y)));
+      const auto y2 = madd(bu0,matrix[2][0].y,madd(bu1,in21.y        ,madd(bu2,in22.y        ,bu3 * matrix[2][3].y)));
+      const auto y3 = madd(bu0,matrix[3][0].y,madd(bu1,matrix[3][1].y,madd(bu2,matrix[3][2].y,bu3 * matrix[3][3].y)));
+      const T y = madd(bv0,y0,madd(bv1,y1,madd(bv2,y2,bv3*y3)));
+
+      /* z component */
+      const auto z0 = madd(bu0,matrix[0][0].z,madd(bu1,matrix[0][1].z,madd(bu2,matrix[0][2].z,bu3 * matrix[0][3].z)));
+      const auto z1 = madd(bu0,matrix[1][0].z,madd(bu1,in11.z        ,madd(bu2,in12.z        ,bu3 * matrix[1][3].z)));
+      const auto z2 = madd(bu0,matrix[2][0].z,madd(bu1,in21.z        ,madd(bu2,in22.z        ,bu3 * matrix[2][3].z)));
+      const auto z3 = madd(bu0,matrix[3][0].z,madd(bu1,matrix[3][1].z,madd(bu2,matrix[3][2].z,bu3 * matrix[3][3].z)));
+      const T z = madd(bv0,z0,madd(bv1,z1,madd(bv2,z2,bv3*z3)));
+
+      return Vec3<T>(x,y,z);
+    }
+
+    /*! Vector-type evaluation of this patch: converts the four f points to
+     *  Vec3<T> and forwards them with v to eval_t. */
+    template<class T>
+    __forceinline Vec3<T> eval(const T& uu, const T& vv) const 
+    {
+      const Vec3<T> ff[2][2] = {
+        { Vec3<T>(f[0][0]), Vec3<T>(f[0][1]) },
+        { Vec3<T>(f[1][0]), Vec3<T>(f[1][1]) }
+      };
+      return eval_t(v,ff,uu,vv);
+    }
+
+    /*! Computes tangentU x tangentV for a vector type T; the result is
+     *  returned as computed by cross().
+     *
+     *  With the active preprocessor branch an inner point falls back to its
+     *  stored value only in lanes that sit exactly on the matching corner;
+     *  the disabled branch would do so on the whole border instead. */
+    template<class T>
+      static __forceinline Vec3<T> normal_t(const Vertex matrix[4][4], const Vec3<T> f[2][2], const T& uu, const T& vv) 
+    {
+      typedef typename T::Bool M;
+
+      /* "plus" points from the control grid, "minus" points from f */
+      const Vec3<T> plus0 = Vec3<T>(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z);
+      const Vec3<T> plus1 = Vec3<T>(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z);
+      const Vec3<T> plus2 = Vec3<T>(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z);
+      const Vec3<T> plus3 = Vec3<T>(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z);
+
+      const Vec3<T> minus0 = f[0][0];
+      const Vec3<T> minus1 = f[0][1];
+      const Vec3<T> minus2 = f[1][1];
+      const Vec3<T> minus3 = f[1][0];
+
+      const T inv_uu = T(1.0f) - uu;
+      const T inv_vv = T(1.0f) - vv;
+
+      const Vec3<T> blend0 = (    uu * plus0  +     vv * minus0) * rcp(uu+vv);
+      const Vec3<T> blend1 = (inv_uu * minus1 +     vv * plus1 ) * rcp(inv_uu+vv);
+      const Vec3<T> blend2 = (inv_uu * plus2  + inv_vv * minus2) * rcp(inv_uu+inv_vv);
+      const Vec3<T> blend3 = (    uu * minus3 + inv_vv * plus3 ) * rcp(uu+inv_vv);
+
+#if 1
+      /* per-corner masks */
+      const M at_c0 = (uu == 0.0f) & (vv == 0.0f);
+      const M at_c1 = (uu == 1.0f) & (vv == 0.0f);
+      const M at_c2 = (uu == 1.0f) & (vv == 1.0f);
+      const M at_c3 = (uu == 0.0f) & (vv == 1.0f);
+      const Vec3<T> m11( select(at_c0,plus0.x,blend0.x), select(at_c0,plus0.y,blend0.y), select(at_c0,plus0.z,blend0.z) );
+      const Vec3<T> m12( select(at_c1,plus1.x,blend1.x), select(at_c1,plus1.y,blend1.y), select(at_c1,plus1.z,blend1.z) );
+      const Vec3<T> m22( select(at_c2,plus2.x,blend2.x), select(at_c2,plus2.y,blend2.y), select(at_c2,plus2.z,blend2.z) );
+      const Vec3<T> m21( select(at_c3,plus3.x,blend3.x), select(at_c3,plus3.y,blend3.y), select(at_c3,plus3.z,blend3.z) );
+#else
+      /* whole-border mask */
+      const M on_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f);
+      const Vec3<T> m11( select(on_border,plus0.x,blend0.x), select(on_border,plus0.y,blend0.y), select(on_border,plus0.z,blend0.z) );
+      const Vec3<T> m12( select(on_border,plus1.x,blend1.x), select(on_border,plus1.y,blend1.y), select(on_border,plus1.z,blend1.z) );
+      const Vec3<T> m22( select(on_border,plus2.x,blend2.x), select(on_border,plus2.y,blend2.y), select(on_border,plus2.z,blend2.z) );
+      const Vec3<T> m21( select(on_border,plus3.x,blend3.x), select(on_border,plus3.y,blend3.y), select(on_border,plus3.z,blend3.z) );
+#endif
+
+      /* outer control points broadcast into Vec3<T>, row by row */
+      const Vec3<T> m00 = Vec3<T>(matrix[0][0].x,matrix[0][0].y,matrix[0][0].z);
+      const Vec3<T> m01 = Vec3<T>(matrix[0][1].x,matrix[0][1].y,matrix[0][1].z);
+      const Vec3<T> m02 = Vec3<T>(matrix[0][2].x,matrix[0][2].y,matrix[0][2].z);
+      const Vec3<T> m03 = Vec3<T>(matrix[0][3].x,matrix[0][3].y,matrix[0][3].z);
+
+      const Vec3<T> m10 = Vec3<T>(matrix[1][0].x,matrix[1][0].y,matrix[1][0].z);
+      const Vec3<T> m13 = Vec3<T>(matrix[1][3].x,matrix[1][3].y,matrix[1][3].z);
+
+      const Vec3<T> m20 = Vec3<T>(matrix[2][0].x,matrix[2][0].y,matrix[2][0].z);
+      const Vec3<T> m23 = Vec3<T>(matrix[2][3].x,matrix[2][3].y,matrix[2][3].z);
+
+      const Vec3<T> m30 = Vec3<T>(matrix[3][0].x,matrix[3][0].y,matrix[3][0].z);
+      const Vec3<T> m31 = Vec3<T>(matrix[3][1].x,matrix[3][1].y,matrix[3][1].z);
+      const Vec3<T> m32 = Vec3<T>(matrix[3][2].x,matrix[3][2].y,matrix[3][2].z);
+      const Vec3<T> m33 = Vec3<T>(matrix[3][3].x,matrix[3][3].y,matrix[3][3].z);
+
+      /* tangentU: reduce every column along v, then take the tangent along u */
+      const Vec3<T> c0 = deCasteljau(vv, m00, m10, m20, m30);
+      const Vec3<T> c1 = deCasteljau(vv, m01, m11, m21, m31);
+      const Vec3<T> c2 = deCasteljau(vv, m02, m12, m22, m32);
+      const Vec3<T> c3 = deCasteljau(vv, m03, m13, m23, m33);
+      const Vec3<T> du = deCasteljau_tangent(uu, c0, c1, c2, c3);
+
+      /* tangentV: reduce every row along u, then take the tangent along v */
+      const Vec3<T> r0 = deCasteljau(uu, m00, m01, m02, m03);
+      const Vec3<T> r1 = deCasteljau(uu, m10, m11, m12, m13);
+      const Vec3<T> r2 = deCasteljau(uu, m20, m21, m22, m23);
+      const Vec3<T> r3 = deCasteljau(uu, m30, m31, m32, m33);
+      const Vec3<T> dv = deCasteljau_tangent(vv, r0, r1, r2, r3);
+
+      /* normal = tangentU x tangentV */
+      return cross(du,dv);
+    }
+
+     /*! Vector-type normal of this patch: converts the four f points to
+      *  Vec3<T> and forwards them with v to normal_t. */
+     template<class T>
+    __forceinline Vec3<T> normal(const T& uu, const T& vv) const 
+    {
+      const Vec3<T> ff[2][2] = {
+        { Vec3<T>(f[0][0]), Vec3<T>(f[0][1]) },
+        { Vec3<T>(f[1][0]), Vec3<T>(f[1][1]) }
+      };
+      return normal_t(v,ff,uu,vv);
+    }
+
+    /*! Returns a bounding box that contains all 16 control points of v and
+     *  all four additional face points of f. */
+    __forceinline BBox<Vertex> bounds() const
+    {
+      /* walk the 4x4 grid as a flat array of 16 vertices */
+      const Vertex *const cv = &v[0][0];
+      BBox<Vertex> bounds (cv[0]);
+      for (size_t i=1; i<16; i++) 
+        bounds.extend( cv[i] );
+
+      /* every entry of f has to be included exactly once */
+      bounds.extend(f[0][0]);
+      bounds.extend(f[0][1]);
+      bounds.extend(f[1][0]);
+      bounds.extend(f[1][1]);
+      return bounds;
+    }
+    
+    /*! Prints all control points of v followed by all face points of f,
+     *  one labelled entry per line. */
+    friend embree_ostream operator<<(embree_ostream o, const GregoryPatchT& p)
+    {
+      for (size_t row=0; row<4; row++) {
+        for (size_t col=0; col<4; col++) {
+          o << "v[" << row << "][" << col << "] " << p.v[row][col] << embree_endl;
+        }
+      }
+
+      for (size_t row=0; row<2; row++) {
+        for (size_t col=0; col<2; col++) {
+          o << "f[" << row << "][" << col << "] " << p.f[row][col] << embree_endl;
+        }
+      }
+      return o;
+    } 
+  };
+
+  /*! Gregory patch instantiated with Vertex = Vec3fa and Vertex_t = Vec3fa_t. */
+  typedef GregoryPatchT<Vec3fa,Vec3fa_t> GregoryPatch3fa;
+
+  template<typename Vertex, typename Vertex_t>
+    __forceinline  BezierPatchT<Vertex,Vertex_t>::BezierPatchT (const HalfEdge* edge, const char* vertices, size_t stride) 
+  {
+    CatmullClarkPatchT<Vertex,Vertex_t> patch(edge,vertices,stride);
+    GregoryPatchT<Vertex,Vertex_t> gpatch(patch); 
+    gpatch.convert_to_bezier(); 
+    for (size_t y=0; y<4; y++)
+      for (size_t x=0; x<4; x++)
+        matrix[y][x] = (Vertex_t)gpatch.v[y][x];
+  }
+  
+   /*! Builds the Bezier patch from a Catmull-Clark patch: the patch is turned
+    *  into a Gregory patch, collapsed with convert_to_bezier, and its 4x4
+    *  control points are copied. */
+   template<typename Vertex, typename Vertex_t>
+    __forceinline BezierPatchT<Vertex,Vertex_t>::BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch) 
+    {
+      GregoryPatchT<Vertex,Vertex_t> gregory(patch);
+      gregory.convert_to_bezier();
+
+      for (size_t row=0; row<4; row++) {
+        for (size_t col=0; col<4; col++) {
+          matrix[row][col] = Vertex_t(gregory.v[row][col]);
+        }
+      }
+    }
+
+   /*! Same as the constructor taking only a Catmull-Clark patch, except that
+    *  the four border curves are handed on to the Gregory patch constructor. */
+   template<typename Vertex, typename Vertex_t>
+     __forceinline BezierPatchT<Vertex,Vertex_t>::BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch, 
+                                                               const BezierCurveT<Vertex>* border0,
+                                                               const BezierCurveT<Vertex>* border1,
+                                                               const BezierCurveT<Vertex>* border2,
+                                                               const BezierCurveT<Vertex>* border3) 
+    {
+      GregoryPatchT<Vertex,Vertex_t> gregory(patch,border0,border1,border2,border3);
+      gregory.convert_to_bezier();
+
+      for (size_t row=0; row<4; row++) {
+        for (size_t col=0; col<4; col++) {
+          matrix[row][col] = Vertex_t(gregory.v[row][col]);
+        }
+      }
+    }
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch_dense.h b/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch_dense.h
new file mode 100644
index 0000000000..85effd02cf
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch_dense.h
@@ -0,0 +1,113 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "gregory_patch.h"
+
+namespace embree
+{  
+  class __aligned(64) DenseGregoryPatch3fa // Gregory patch packed into a single 4x4 array of Vec3ff
+  {
+    typedef Vec3fa Vec3fa_4x4[4][4]; // Vec3fa view of matrix used by the casts below; NOTE(review): assumes Vec3ff and Vec3fa share one layout - confirm
+  public:
+    /*! packs a GregoryPatch3fa: patch.v is copied per element, the four patch.f points go into the w components */
+    __forceinline DenseGregoryPatch3fa (const GregoryPatch3fa& patch)
+    {
+      for (size_t y=0; y<4; y++)
+	for (size_t x=0; x<4; x++)
+	  matrix[y][x] = Vec3ff(patch.v[y][x], 0.0f);
+      /* one f point per matrix row: its x,y,z are stored in the w components of columns 0,1,2; column 3 gets 0 */
+      matrix[0][0].w = patch.f[0][0].x;
+      matrix[0][1].w = patch.f[0][0].y;
+      matrix[0][2].w = patch.f[0][0].z;
+      matrix[0][3].w = 0.0f;
+      
+      matrix[1][0].w = patch.f[0][1].x;
+      matrix[1][1].w = patch.f[0][1].y;
+      matrix[1][2].w = patch.f[0][1].z;
+      matrix[1][3].w = 0.0f;
+      
+      matrix[2][0].w = patch.f[1][1].x;
+      matrix[2][1].w = patch.f[1][1].y;
+      matrix[2][2].w = patch.f[1][1].z;
+      matrix[2][3].w = 0.0f;
+      
+      matrix[3][0].w = patch.f[1][0].x;
+      matrix[3][1].w = patch.f[1][0].y;
+      matrix[3][2].w = patch.f[1][0].z;
+      matrix[3][3].w = 0.0f;
+    }
+    /*! rebuilds the four f points from the w components (row 0 -> [0][0], row 1 -> [0][1], row 2 -> [1][1], row 3 -> [1][0]) */
+    __forceinline void extract_f_m(Vec3fa f_m[2][2]) const
+    {
+      f_m[0][0] = Vec3fa( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w );
+      f_m[0][1] = Vec3fa( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w );
+      f_m[1][1] = Vec3fa( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w );
+      f_m[1][0] = Vec3fa( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w );      
+    }
+    /*! unpacks the f points and forwards to GregoryPatch3fa::eval for the parameters (uu,vv) */
+    __forceinline Vec3fa eval(const float uu, const float vv) const
+    {
+      __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m);
+      return GregoryPatch3fa::eval(*(Vec3fa_4x4*)&matrix,f_m,uu,vv);
+    }
+    /*! unpacks the f points and forwards to GregoryPatch3fa::normal for the parameters (uu,vv) */
+    __forceinline Vec3fa normal(const float uu, const float vv) const
+    {
+      __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m);
+      return GregoryPatch3fa::normal(*(Vec3fa_4x4*)&matrix,f_m,uu,vv);
+    }
+    /*! generic variant: builds the f points as Vec3<T> from the scalar w components and forwards to GregoryPatch3fa::eval_t */
+    template<class T>
+      __forceinline Vec3<T> eval(const T &uu, const T &vv) const 
+    {
+      Vec3<T> f_m[2][2];
+      f_m[0][0] = Vec3<T>( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w );
+      f_m[0][1] = Vec3<T>( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w );
+      f_m[1][1] = Vec3<T>( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w );
+      f_m[1][0] = Vec3<T>( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w );
+      return GregoryPatch3fa::eval_t(*(Vec3fa_4x4*)&matrix,f_m,uu,vv);
+    }
+    /*! generic variant: builds the f points as Vec3<T> from the scalar w components and forwards to GregoryPatch3fa::normal_t */
+    template<class T>
+      __forceinline Vec3<T> normal(const T &uu, const T &vv) const 
+    {
+      Vec3<T> f_m[2][2];
+      f_m[0][0] = Vec3<T>( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w );
+      f_m[0][1] = Vec3<T>( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w );
+      f_m[1][1] = Vec3<T>( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w );
+      f_m[1][0] = Vec3<T>( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w );
+      return GregoryPatch3fa::normal_t(*(Vec3fa_4x4*)&matrix,f_m,uu,vv);
+    }
+    /*! writes P, first derivatives (times dscale) and second derivatives (times sqr(dscale)); only P, dPdu and ddPdudu are null-checked outside of asserts, so dPdv must be valid whenever dPdu is, and ddPdvdv/ddPdudv whenever ddPdudu is */
+    __forceinline void eval(const float u, const float v, 
+                            Vec3fa* P, Vec3fa* dPdu, Vec3fa* dPdv, Vec3fa* ddPdudu, Vec3fa* ddPdvdv, Vec3fa* ddPdudv,
+                            const float dscale = 1.0f) const
+    {
+      __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m);
+      if (P) {
+        *P    = GregoryPatch3fa::eval(*(Vec3fa_4x4*)&matrix,f_m,u,v); 
+      }
+      if (dPdu) {
+        assert(dPdu); *dPdu = GregoryPatch3fa::eval_du(*(Vec3fa_4x4*)&matrix,f_m,u,v)*dscale; 
+        assert(dPdv); *dPdv = GregoryPatch3fa::eval_dv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*dscale; 
+      }
+      if (ddPdudu) {
+        assert(ddPdudu); *ddPdudu = GregoryPatch3fa::eval_dudu(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); 
+        assert(ddPdvdv); *ddPdvdv = GregoryPatch3fa::eval_dvdv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); 
+        assert(ddPdudv); *ddPdudv = GregoryPatch3fa::eval_dudv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); 
+      }
+    }
+    /*! unpacks the f points and forwards all arguments to the GregoryPatch3fa::eval overload templated on vbool/vfloat */
+    template<typename vbool, typename vfloat>
+    __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, float* P, float* dPdu, float* dPdv, const float dscale, const size_t dstride, const size_t N) const 
+    {
+      __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m);
+      GregoryPatch3fa::eval(matrix,f_m,valid,uu,vv,P,dPdu,dPdv,dscale,dstride,N);
+    }
+
+  private:
+    Vec3ff matrix[4][4]; // f_p/m points are stored in 4th component
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/gridrange.h b/thirdparty/embree-aarch64/kernels/subdiv/gridrange.h
new file mode 100644
index 0000000000..4fd741c879
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/gridrange.h
@@ -0,0 +1,96 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+
+namespace embree
+{
+  struct __aligned(16) GridRange // rectangular index range [u_start,u_end] x [v_start,v_end]
+  {
+    unsigned int u_start; //!< first u index (inclusive)
+    unsigned int u_end;   //!< last u index (inclusive, see width())
+    unsigned int v_start; //!< first v index (inclusive)
+    unsigned int v_end;   //!< last v index (inclusive, see height())
+    /*! default constructor, leaves all members uninitialized */
+    __forceinline GridRange() {}
+    /*! constructs the range from its four bounds */
+    __forceinline GridRange(unsigned int u_start, unsigned int u_end, unsigned int v_start, unsigned int v_end) 
+      : u_start(u_start), u_end(u_end), v_start(v_start), v_end(v_end) {}
+    /*! number of u indices in the range, both ends counted */
+    __forceinline unsigned int width() const {
+      return u_end-u_start+1;
+    }
+    /*! number of v indices in the range, both ends counted */
+    __forceinline unsigned int height() const {
+      return v_end-v_start+1;
+    }
+    /*! true if the range spans at most 3 indices in u and at most 3 in v */
+    __forceinline bool hasLeafSize() const
+    {
+      const unsigned int u_size = u_end-u_start+1;
+      const unsigned int v_size = v_end-v_start+1;
+      assert(u_size >= 1);
+      assert(v_size >= 1);
+      return u_size <= 3 && v_size <= 3;
+    }
+    /*! returns (start+end)/2; the asserts require it to lie strictly between start and end, i.e. end-start >= 2 */
+    static __forceinline unsigned int split(unsigned int start,unsigned int end)
+    {
+      const unsigned int center = (start+end)/2;
+      assert (center > start);
+      assert (center < end);
+      return center;
+    }
+    /*! splits a non-leaf range in two along u if u_size >= v_size, else along v; r0 and r1 both contain the split index */
+    __forceinline void split(GridRange& r0, GridRange& r1) const
+    {
+      assert( hasLeafSize() == false );
+      const unsigned int u_size = u_end-u_start+1;
+      const unsigned int v_size = v_end-v_start+1;
+      r0 = *this;
+      r1 = *this;
+
+      if (u_size >= v_size)
+      {
+        const unsigned int u_mid = split(u_start,u_end);
+        r0.u_end   = u_mid;
+        r1.u_start = u_mid;
+      }
+      else
+      {
+        const unsigned int v_mid = split(v_start,v_end);
+        r0.v_end   = v_mid;
+        r1.v_start = v_mid;
+      }
+    }
+    /*! splits a non-leaf range into 2 to 4 sub-ranges stored in r[0..]; a half that is not of leaf size is split once more; returns the number of sub-ranges written */
+    __forceinline unsigned int splitIntoSubRanges(GridRange r[4]) const
+    {
+      assert( !hasLeafSize() );
+      unsigned int children = 0;
+      GridRange first,second;
+      split(first,second);
+
+      if (first.hasLeafSize()) {
+        r[0] = first;
+        children++;
+      } 
+      else {
+        first.split(r[0],r[1]);
+        children += 2;
+      }
+
+      if (second.hasLeafSize())	{
+        r[children] = second;
+        children++;
+      }
+      else {
+        second.split(r[children+0],r[children+1]);
+        children += 2;
+      }
+      return children;      
+    }
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/half_edge.h b/thirdparty/embree-aarch64/kernels/subdiv/half_edge.h
new file mode 100644
index 0000000000..fb350ca71f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/half_edge.h
@@ -0,0 +1,371 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_coefficients.h"
+
+namespace embree
+{
+  class __aligned(32) HalfEdge
+  {
+    friend class SubdivMesh;
+    public:
+
+    enum PatchType : char { 
+      BILINEAR_PATCH        = 0, //!< a bilinear patch
+      REGULAR_QUAD_PATCH    = 1, //!< a regular quad patch can be represented as a B-Spline
+      IRREGULAR_QUAD_PATCH  = 2, //!< an irregular quad patch can be represented as a Gregory patch
+      COMPLEX_PATCH         = 3  //!< these patches need subdivision and cannot be processed by the above fast code paths
+    };
+    
+    enum VertexType : char { 
+      REGULAR_VERTEX           = 0, //!< regular vertex
+      NON_MANIFOLD_EDGE_VERTEX = 1, //!< vertex of a non-manifold edge
+    };
+    /*! returns the patch type with the larger enum value */
+    __forceinline friend PatchType max( const PatchType& ty0, const PatchType& ty1) {
+      return (PatchType) max((int)ty0,(int)ty1);
+    }
+    
+    struct Edge 
+    {
+      /*! edge constructor */
+      __forceinline Edge(const uint32_t v0, const uint32_t v1)
+	: v0(v0), v1(v1) {}
+
+      /*! creates a 64 bit identifier that is the same for both orientations of the edge (larger index in the upper 32 bits) */
+      __forceinline operator uint64_t() const       
+      {
+	uint32_t p0 = v0, p1 = v1;
+	if (p0<p1) std::swap(p0,p1);
+	return (((uint64_t)p0) << 32) | (uint64_t)p1;
+      }
+
+    public:
+      uint32_t v0,v1;    //!< start and end vertex of the edge
+    };
+    /*! creates an unconnected half edge: vtx_index is -1 converted to unsigned, all offsets and weights are 0, patch_type is COMPLEX_PATCH */
+    HalfEdge () 
+      : vtx_index(-1), next_half_edge_ofs(0), prev_half_edge_ofs(0), opposite_half_edge_ofs(0), edge_crease_weight(0), 
+      vertex_crease_weight(0), edge_level(0), patch_type(COMPLEX_PATCH), vertex_type(REGULAR_VERTEX)
+    {
+      static_assert(sizeof(HalfEdge) == 32, "invalid half edge size");
+    }
+    /*! links are stored as element offsets relative to this half edge (see &this[ofs]); an opposite offset of 0 means there is no opposite */
+    __forceinline bool hasOpposite() const { return opposite_half_edge_ofs != 0; }
+    __forceinline void setOpposite(HalfEdge* opposite) { opposite_half_edge_ofs = int(opposite-this); }
+    
+    __forceinline       HalfEdge* next()       { assert( next_half_edge_ofs != 0 ); return &this[next_half_edge_ofs]; }
+    __forceinline const HalfEdge* next() const { assert( next_half_edge_ofs != 0 ); return &this[next_half_edge_ofs]; }
+    
+    __forceinline       HalfEdge* prev()       { assert( prev_half_edge_ofs != 0 ); return &this[prev_half_edge_ofs]; }
+    __forceinline const HalfEdge* prev() const { assert( prev_half_edge_ofs != 0 ); return &this[prev_half_edge_ofs]; }
+    
+    __forceinline       HalfEdge* opposite()       { assert( opposite_half_edge_ofs != 0 ); return &this[opposite_half_edge_ofs]; }
+    __forceinline const HalfEdge* opposite() const { assert( opposite_half_edge_ofs != 0 ); return &this[opposite_half_edge_ofs]; }
+    /*! returns opposite()->next(); requires an opposite half edge (asserted inside opposite()) */
+    __forceinline       HalfEdge* rotate()       { return opposite()->next(); }
+    __forceinline const HalfEdge* rotate() const { return opposite()->next(); }
+    /*! the end vertex of an edge is the start vertex of the next half edge of the face */
+    __forceinline unsigned int getStartVertexIndex() const { return vtx_index; }
+    __forceinline unsigned int getEndVertexIndex  () const { return next()->vtx_index; }
+    __forceinline Edge         getEdge            () const { return Edge(getStartVertexIndex(),getEndVertexIndex()); }
+   
+    
+    /*! classifies the start vertex of the edge: returns the patch type required by the faces, borders and creases around it */
+    __forceinline PatchType vertexType() const
+    {
+      const HalfEdge* p = this;
+      size_t face_valence = 0;
+      bool hasBorder = false;
+      
+      do
+      {
+        /* we need subdivision to handle edge creases */
+        if (p->hasOpposite() && p->edge_crease_weight > 0.0f) 
+          return COMPLEX_PATCH;
+        
+        face_valence++;
+        
+        /* test for quad */
+        const HalfEdge* pp = p;
+        pp = pp->next(); if (pp == p) return COMPLEX_PATCH;
+        pp = pp->next(); if (pp == p) return COMPLEX_PATCH;
+        pp = pp->next(); if (pp == p) return COMPLEX_PATCH;
+        pp = pp->next(); if (pp != p) return COMPLEX_PATCH;
+        
+        /* continue with next face */
+        p = p->prev();
+        if (likely(p->hasOpposite())) 
+          p = p->opposite();
+        
+        /* if there is no opposite go the long way to the other side of the border */
+        else
+        {
+          face_valence++;
+          hasBorder = true;
+          p = this;
+          while (p->hasOpposite()) 
+            p = p->rotate();
+        }
+      } while (p != this); 
+      
+      /* calculate vertex type */
+      if (face_valence == 2 && hasBorder) {
+        if      (vertex_crease_weight == 0.0f      ) return REGULAR_QUAD_PATCH;
+        else if (vertex_crease_weight == float(inf)) return REGULAR_QUAD_PATCH;
+        else                                         return COMPLEX_PATCH;
+      }
+      else if (vertex_crease_weight != 0.0f)         return COMPLEX_PATCH;
+      else if (face_valence == 3 &&  hasBorder)      return REGULAR_QUAD_PATCH;
+      else if (face_valence == 4 && !hasBorder)      return REGULAR_QUAD_PATCH;
+      else                                           return IRREGULAR_QUAD_PATCH;
+    }
+
+    /*! tests if this edge is part of a bilinear patch */
+    __forceinline bool bilinearVertex() const {
+      return vertex_crease_weight == float(inf) && edge_crease_weight == float(inf);
+    }
+    
+    /*! calculates the type of the patch */
+    __forceinline PatchType patchType() const 
+    {
+      const HalfEdge* p = this;
+      PatchType ret = REGULAR_QUAD_PATCH;
+      bool bilinear = true;
+      /* visit exactly four half edges; a face that closes earlier or later is not a quad and yields COMPLEX_PATCH */
+      ret = max(ret,p->vertexType());
+      bilinear &= p->bilinearVertex();
+      if ((p = p->next()) == this) return COMPLEX_PATCH;
+      
+      ret = max(ret,p->vertexType());
+      bilinear &= p->bilinearVertex();
+      if ((p = p->next()) == this) return COMPLEX_PATCH;
+      
+      ret = max(ret,p->vertexType());
+      bilinear &= p->bilinearVertex();
+      if ((p = p->next()) == this) return COMPLEX_PATCH;
+      
+      ret = max(ret,p->vertexType());
+      bilinear &= p->bilinearVertex();
+      if ((p = p->next()) != this) return COMPLEX_PATCH;
+      
+      if (bilinear) return BILINEAR_PATCH;
+      return ret;
+    }
+    
+    /*! tests if the face is a regular b-spline face */
+    __forceinline bool isRegularFace() const {
+      return patch_type == REGULAR_QUAD_PATCH;
+    }
+    
+    /*! tests if the face can be diced (using bspline or gregory patch) */
+    __forceinline bool isGregoryFace() const {
+      return patch_type == IRREGULAR_QUAD_PATCH || patch_type == REGULAR_QUAD_PATCH;
+    }
+    
+    /*! tests if the base vertex of this half edge is a corner vertex */
+    __forceinline bool isCorner() const {
+      return !hasOpposite() && !prev()->hasOpposite();
+    }
+
+    /*! tests if the vertex is attached to any border */
+    __forceinline bool vertexHasBorder() const 
+    {
+      const HalfEdge* p = this;
+      do {
+        if (!p->hasOpposite()) return true;
+        p = p->rotate();
+      } while (p != this);
+      return false;
+    }
+    
+    /*! tests if the face this half edge belongs to has some border */
+    __forceinline bool faceHasBorder() const 
+    {
+      const HalfEdge* p = this;
+      do {
+        if (p->vertexHasBorder()) return true;
+        p = p->next();
+      } while (p != this);
+      return false;
+    }
+    
+    /*! calculates conservative bounds of a catmull clark subdivision face */
+    __forceinline BBox3fa bounds(const BufferView<Vec3fa>& vertices) const
+    {
+      BBox3fa bounds = this->get1RingBounds(vertices);
+      for (const HalfEdge* p=this->next(); p!=this; p=p->next())
+        bounds.extend(p->get1RingBounds(vertices));
+      return bounds;
+    }
+    
+    /*! tests if this is a valid patch */
+    __forceinline bool valid(const BufferView<Vec3fa>& vertices) const
+    {
+      size_t N = 1; // counts the half edges of the face, starting with this one
+      if (!this->validRing(vertices)) return false;
+      for (const HalfEdge* p=this->next(); p!=this; p=p->next(), N++) {
+        if (!p->validRing(vertices)) return false;
+      }
+      return N >= 3 && N <= MAX_PATCH_VALENCE;
+    }
+    
+    /*! counts number of polygon edges  */
+    __forceinline unsigned int numEdges() const
+    {
+      unsigned int N = 1;
+      for (const HalfEdge* p=this->next(); p!=this; p=p->next(), N++);
+      return N;
+    }
+
+    /*! calculates face and edge valence */
+    __forceinline void calculateFaceValenceAndEdgeValence(size_t& faceValence, size_t& edgeValence) const 
+    {
+      faceValence = 0;
+      edgeValence = 0;
+      
+      const HalfEdge* p = this;
+      do 
+      {
+        /* a face with numEdges edges adds numEdges-2 to the edge valence */
+        unsigned int numEdges = p->numEdges();
+        assert(numEdges >= 3);
+        edgeValence += numEdges-2;
+        
+        faceValence++;
+        p = p->prev();
+        
+        /* continue with next face */
+        if (likely(p->hasOpposite())) 
+          p = p->opposite();
+        
+        /* if there is no opposite go the long way to the other side of the border */
+        else {
+          faceValence++;
+          edgeValence++;
+          p = this;
+          while (p->hasOpposite()) 
+            p = p->opposite()->next();
+        }
+        
+      } while (p != this); 
+    }
+
+    /*! stream output */
+    friend __forceinline std::ostream &operator<<(std::ostream &o, const HalfEdge &h)
+    {
+      return o << "{ " << 
+        "vertex = " << h.vtx_index << ", " << //" -> " << h.next()->vtx_index << ", " << 
+        "prev = " << h.prev_half_edge_ofs << ", " << 
+        "next = " << h.next_half_edge_ofs << ", " << 
+        "opposite = " << h.opposite_half_edge_ofs << ", " << 
+        "edge_crease = " << h.edge_crease_weight << ", " << 
+        "vertex_crease = " << h.vertex_crease_weight << ", " << 
+        //"edge_level = " << h.edge_level << 
+        " }";
+    } 
+    
+  private:
+    
+    /*! calculates the bounds of the face associated with the half-edge */
+    __forceinline BBox3fa getFaceBounds(const BufferView<Vec3fa>& vertices) const 
+    {
+      BBox3fa b = vertices[getStartVertexIndex()];
+      for (const HalfEdge* p = next(); p!=this; p=p->next()) {
+        b.extend(vertices[p->getStartVertexIndex()]);
+      }
+      return b;
+    }
+    
+    /*! calculates the bounds of the 1-ring associated with the vertex of the half-edge */
+    __forceinline BBox3fa get1RingBounds(const BufferView<Vec3fa>& vertices) const 
+    {
+      BBox3fa bounds = empty;
+      const HalfEdge* p = this;
+      do 
+      {
+        /* calculate bounds of current face */
+        bounds.extend(p->getFaceBounds(vertices));
+        p = p->prev();
+        
+        /* continue with next face */
+        if (likely(p->hasOpposite())) 
+          p = p->opposite();
+        
+        /* if there is no opposite go the long way to the other side of the border */
+        else {
+          p = this;
+          while (p->hasOpposite()) 
+            p = p->opposite()->next();
+        }
+        
+      } while (p != this); 
+      
+      return bounds;
+    }
+    
+    /*! tests if this is a valid face (all vertices pass isvalid, 3 to MAX_PATCH_VALENCE edges) and adds n-2 to N for a face with n edges */
+    __forceinline bool validFace(const BufferView<Vec3fa>& vertices, size_t& N) const 
+    {
+      const Vec3fa v = vertices[getStartVertexIndex()];
+      if (!isvalid(v)) return false;
+      size_t n = 1;
+      for (const HalfEdge* p = next(); p!=this; p=p->next(), n++) {
+        const Vec3fa v = vertices[p->getStartVertexIndex()];
+        if (!isvalid(v)) return false;
+      }
+      N += n-2; // executed even if the range check below fails
+      return n >= 3 && n <= MAX_PATCH_VALENCE;
+    }
+    
+    /*! tests if this is a valid ring */
+    __forceinline bool validRing(const BufferView<Vec3fa>& vertices) const 
+    {
+      size_t faceValence = 0;
+      size_t edgeValence = 0;
+      
+      const HalfEdge* p = this;
+      do 
+      {
+        /* validate the current face; validFace also adds its share to edgeValence */
+        if (!p->validFace(vertices,edgeValence)) 
+          return false;
+        
+        faceValence++;
+        p = p->prev();
+        
+        /* continue with next face */
+        if (likely(p->hasOpposite())) 
+          p = p->opposite();
+        
+        /* if there is no opposite go the long way to the other side of the border */
+        else {
+          faceValence++;
+          edgeValence++;
+          p = this;
+          while (p->hasOpposite()) 
+            p = p->opposite()->next();
+        }
+        
+      } while (p != this); 
+      
+      return faceValence <= MAX_RING_FACE_VALENCE && edgeValence <= MAX_RING_EDGE_VALENCE;
+    }
+    
+  private:
+    unsigned int vtx_index;         //!< index of edge start vertex
+    int next_half_edge_ofs;         //!< relative offset to next half edge of face
+    int prev_half_edge_ofs;         //!< relative offset to previous half edge of face
+    int opposite_half_edge_ofs;     //!< relative offset to opposite half edge
+    
+  public:
+    float edge_crease_weight;       //!< crease weight attached to edge
+    float vertex_crease_weight;     //!< crease weight attached to start vertex
+    float edge_level;               //!< subdivision factor for edge
+    PatchType patch_type;           //!< stores type of subdiv patch
+    VertexType vertex_type;         //!< stores type of the start vertex
+    char align[2];                  //!< padding; the constructor static_asserts sizeof(HalfEdge) == 32
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/hermite_curve.h b/thirdparty/embree-aarch64/kernels/subdiv/hermite_curve.h
new file mode 100644
index 0000000000..9fab79cf0c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/hermite_curve.h
@@ -0,0 +1,38 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "bezier_curve.h"
+
+namespace embree
+{
+  template<typename Vertex>
+    struct HermiteCurveT : BezierCurveT<Vertex> // Hermite curve stored through its BezierCurveT base
+    {
+      __forceinline HermiteCurveT() {}
+      /*! wraps an existing Bezier curve by copying it into the base class */
+      __forceinline HermiteCurveT(const BezierCurveT<Vertex>& curve)
+        : BezierCurveT<Vertex>(curve) {}
+      /*! builds the curve from end points v0,v1 and tangents t0,t1; assuming madd(a,b,c)=a*b+c and nmadd(a,b,c)=c-a*b the inner control points are v0+t0/3 and v1-t1/3 - confirm */
+      __forceinline HermiteCurveT(const Vertex& v0, const Vertex& t0, const Vertex& v1, const Vertex& t1)
+        : BezierCurveT<Vertex>(v0,madd(1.0f/3.0f,t0,v0),nmadd(1.0f/3.0f,t1,v1),v1) {}
+      /*! passes (v_i - p) of every control point through xfmVector(space,.) and carries the w component over unchanged */
+      __forceinline HermiteCurveT<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const
+      {
+        const Vec3ff q0(xfmVector(space,this->v0-p), this->v0.w);
+        const Vec3ff q1(xfmVector(space,this->v1-p), this->v1.w);
+        const Vec3ff q2(xfmVector(space,this->v2-p), this->v2.w);
+        const Vec3ff q3(xfmVector(space,this->v3-p), this->v3.w);
+        return BezierCurveT<Vec3ff>(q0,q1,q2,q3); // converted to HermiteCurveT<Vec3ff> by the constructor taking a BezierCurveT
+      }
+    };
+
+  __forceinline HermiteCurveT<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const HermiteCurveT<Vec3ff>& curve) {
+    return HermiteCurveT<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,BezierCurveT<Vec3ff>(curve))); // delegates to the BezierCurveT overload and wraps the result again
+  }
+  
+  typedef HermiteCurveT<Vec3fa> HermiteCurve3fa;
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/linear_bezier_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/linear_bezier_patch.h
new file mode 100644
index 0000000000..f4a854af7f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/linear_bezier_patch.h
@@ -0,0 +1,403 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bezier_curve.h"
+
+namespace embree
+{
+  namespace isa
+  {   
+    template<typename V>
+      struct TensorLinearQuadraticBezierSurface // pair of quadratic Bezier curves; TensorLinearCubicBezierSurface::derivative_u() returns this type
+      {
+        QuadraticBezierCurve<V> L; //!< first curve of the pair
+        QuadraticBezierCurve<V> R; //!< second curve of the pair
+        
+        __forceinline TensorLinearQuadraticBezierSurface() {}
+        
+        __forceinline TensorLinearQuadraticBezierSurface(const TensorLinearQuadraticBezierSurface<V>& curve)
+          : L(curve.L), R(curve.R) {}
+        
+        __forceinline TensorLinearQuadraticBezierSurface& operator= (const TensorLinearQuadraticBezierSurface& other) {
+          L = other.L; R = other.R; return *this;
+        }
+          /*! constructs the surface from its two curves */
+          __forceinline TensorLinearQuadraticBezierSurface(const QuadraticBezierCurve<V>& L, const QuadraticBezierCurve<V>& R)
+            : L(L), R(R) {}
+        /*! merged bounds of both curves */
+        __forceinline BBox<V> bounds() const {
+          return merge(L.bounds(),R.bounds());
+        }
+      };
+    
+    template<>
+      struct TensorLinearQuadraticBezierSurface<Vec2fa> // 2D specialization that keeps both curves in one vfloat4 curve
+    {
+      QuadraticBezierCurve<vfloat4> LR; //!< both curves packed per control point; presumably lanes 0,1 hold L and lanes 2,3 hold R (see bounds()) - confirm
+      
+      __forceinline TensorLinearQuadraticBezierSurface() {}
+      
+      __forceinline TensorLinearQuadraticBezierSurface(const TensorLinearQuadraticBezierSurface<Vec2fa>& curve)
+        : LR(curve.LR) {}
+      
+      __forceinline TensorLinearQuadraticBezierSurface& operator= (const TensorLinearQuadraticBezierSurface& other) {
+        LR = other.LR; return *this;
+      }
+      /*! constructs the surface from an already packed curve */
+      __forceinline TensorLinearQuadraticBezierSurface(const QuadraticBezierCurve<vfloat4>& LR)
+        : LR(LR) {}
+      /*! merges the box built from the unshuffled bounds (bl) with the box built from lanes 2,3 (br) */
+      __forceinline BBox<Vec2fa> bounds() const
+      {
+        const BBox<vfloat4> b = LR.bounds();
+        const BBox<Vec2fa> bl(Vec2fa(b.lower),Vec2fa(b.upper));
+        const BBox<Vec2fa> br(Vec2fa(shuffle<2,3,2,3>(b.lower)),Vec2fa(shuffle<2,3,2,3>(b.upper)));
+        return merge(bl,br);
+      }
+    };
+    
+    template<typename V>
+      struct TensorLinearCubicBezierSurface
+      {
+        CubicBezierCurve<V> L;
+        CubicBezierCurve<V> R;
+        
+        __forceinline TensorLinearCubicBezierSurface() {}
+        
+        __forceinline TensorLinearCubicBezierSurface(const TensorLinearCubicBezierSurface& curve)
+          : L(curve.L), R(curve.R) {}
+        
+        __forceinline TensorLinearCubicBezierSurface& operator= (const TensorLinearCubicBezierSurface& other) {
+          L = other.L; R = other.R; return *this;
+        }
+          
+        __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve<V>& L, const CubicBezierCurve<V>& R)
+          : L(L), R(R) {}
+
+        template<template<typename T> class SourceCurve>
+        __forceinline static TensorLinearCubicBezierSurface fromCenterAndNormalCurve(const SourceCurve<Vec3ff>& center, const SourceCurve<Vec3fa>& normal)
+        {
+          SourceCurve<Vec3ff> vcurve = center;
+          SourceCurve<Vec3fa> ncurve = normal;
+          
+          /* here we construct a patch which follows the curve l(t) =
+           * p(t) +/- r(t)*normalize(cross(n(t),dp(t))) */
+          
+          const Vec3ff p0   = vcurve.eval(0.0f);
+          const Vec3ff dp0  = vcurve.eval_du(0.0f);
+          const Vec3ff ddp0 = vcurve.eval_dudu(0.0f);
+
+          const Vec3fa n0   = ncurve.eval(0.0f);
+          const Vec3fa dn0  = ncurve.eval_du(0.0f);
+
+          const Vec3ff p1   = vcurve.eval(1.0f);
+          const Vec3ff dp1  = vcurve.eval_du(1.0f);
+          const Vec3ff ddp1 = vcurve.eval_dudu(1.0f);
+
+          const Vec3fa n1   = ncurve.eval(1.0f);
+          const Vec3fa dn1  = ncurve.eval_du(1.0f);
+
+          const Vec3fa bt0  = cross(n0,dp0);
+          const Vec3fa dbt0 = cross(dn0,dp0) + cross(n0,ddp0);
+
+          const Vec3fa bt1  = cross(n1,dp1);
+          const Vec3fa dbt1 = cross(dn1,dp1) + cross(n1,ddp1);
+            
+          const Vec3fa k0  = normalize(bt0);
+          const Vec3fa dk0 = dnormalize(bt0,dbt0);
+          
+          const Vec3fa k1 = normalize(bt1);
+          const Vec3fa dk1 = dnormalize(bt1,dbt1);
+                    
+          const Vec3fa l0 = p0 - p0.w*k0;
+          const Vec3fa dl0 = dp0 - (dp0.w*k0 + p0.w*dk0);
+
+          const Vec3fa r0 = p0 + p0.w*k0;
+          const Vec3fa dr0 = dp0 + (dp0.w*k0 + p0.w*dk0);
+
+          const Vec3fa l1 = p1 - p1.w*k1;
+          const Vec3fa dl1 = dp1 - (dp1.w*k1 + p1.w*dk1);
+
+          const Vec3fa r1 = p1 + p1.w*k1;
+          const Vec3fa dr1 = dp1 + (dp1.w*k1 + p1.w*dk1);
+
+          const float scale = 1.0f/3.0f;
+          CubicBezierCurve<V> L(l0,l0+scale*dl0,l1-scale*dl1,l1);
+          CubicBezierCurve<V> R(r0,r0+scale*dr0,r1-scale*dr1,r1);
+          return TensorLinearCubicBezierSurface(L,R);
+        }
+
+        __forceinline BBox<V> bounds() const {
+          return merge(L.bounds(),R.bounds());
+        }
+
+        __forceinline BBox3fa accurateBounds() const {
+          return merge(L.accurateBounds(),R.accurateBounds());
+        }
+        
+        __forceinline CubicBezierCurve<Interval1f> reduce_v() const {
+          return merge(CubicBezierCurve<Interval<V>>(L),CubicBezierCurve<Interval<V>>(R));
+        }
+        
+        __forceinline LinearBezierCurve<Interval1f> reduce_u() const {
+          return LinearBezierCurve<Interval1f>(L.bounds(),R.bounds());
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface<float> xfm(const V& dx) const {
+          return TensorLinearCubicBezierSurface<float>(L.xfm(dx),R.xfm(dx));
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface<vfloatx> vxfm(const V& dx) const {
+          return TensorLinearCubicBezierSurface<vfloatx>(L.vxfm(dx),R.vxfm(dx));
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface<float> xfm(const V& dx, const V& p) const {
+          return TensorLinearCubicBezierSurface<float>(L.xfm(dx,p),R.xfm(dx,p));
+        }
+
+        __forceinline TensorLinearCubicBezierSurface<Vec3fa> xfm(const LinearSpace3fa& space) const {
+          return TensorLinearCubicBezierSurface(L.xfm(space),R.xfm(space));
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p) const {
+          return TensorLinearCubicBezierSurface(L.xfm(space,p),R.xfm(space,p));
+        }
+
+        __forceinline TensorLinearCubicBezierSurface<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p, const float s) const {
+          return TensorLinearCubicBezierSurface(L.xfm(space,p,s),R.xfm(space,p,s));
+        }
+
+        __forceinline TensorLinearCubicBezierSurface clip_u(const Interval1f& u) const {
+          return TensorLinearCubicBezierSurface(L.clip(u),R.clip(u));
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface clip_v(const Interval1f& v) const {
+          return TensorLinearCubicBezierSurface(clerp(L,R,V(v.lower)),clerp(L,R,V(v.upper)));
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface clip(const Interval1f& u, const Interval1f& v) const {
+          return clip_v(v).clip_u(u);
+        }
+        
+        __forceinline void split_u(TensorLinearCubicBezierSurface& left, TensorLinearCubicBezierSurface& right, const float u = 0.5f) const
+        {
+          CubicBezierCurve<V> L0,L1; L.split(L0,L1,u);
+          CubicBezierCurve<V> R0,R1; R.split(R0,R1,u);
+          new (&left ) TensorLinearCubicBezierSurface(L0,R0);
+          new (&right) TensorLinearCubicBezierSurface(L1,R1);
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface<Vec2vfx> vsplit_u(vboolx& valid, const BBox1f& u) const {
+          valid = true; clear(valid,VSIZEX-1);
+          return TensorLinearCubicBezierSurface<Vec2vfx>(L.split(u),R.split(u));
+        }
+        
+        __forceinline V eval(const float u, const float v) const {
+          return clerp(L,R,V(v)).eval(u);
+        }
+        
+        __forceinline V eval_du(const float u, const float v) const {
+          return clerp(L,R,V(v)).eval_dt(u);
+        }
+        
+        __forceinline V eval_dv(const float u, const float v) const {
+          return (R-L).eval(u);
+        }
+        
+        __forceinline void eval(const float u, const float v, V& p, V& dpdu, V& dpdv) const
+        {
+          V p0, dp0du; L.eval(u,p0,dp0du);
+          V p1, dp1du; R.eval(u,p1,dp1du);
+          p = lerp(p0,p1,v);
+          dpdu = lerp(dp0du,dp1du,v);
+          dpdv = p1-p0;
+        }
+        
+        __forceinline TensorLinearQuadraticBezierSurface<V> derivative_u() const {
+          return TensorLinearQuadraticBezierSurface<V>(L.derivative(),R.derivative());
+        }
+        
+        __forceinline CubicBezierCurve<V> derivative_v() const {
+          return R-L;
+        }
+        
+        __forceinline V axis_u() const {
+          return (L.end()-L.begin())+(R.end()-R.begin());
+        }
+        
+        __forceinline V axis_v() const {
+          return (R.begin()-L.begin())+(R.end()-L.end());
+        }
+        
+        friend embree_ostream operator<<(embree_ostream cout, const TensorLinearCubicBezierSurface& a)
+        {
+          /* prints the type name followed by both boundary curves; no newline after the closing brace */
+          cout << "TensorLinearCubicBezierSurface" << embree_endl << "{" << embree_endl;
+          cout << "  L = " << a.L << ", " << embree_endl;
+          cout << "  R = " << a.R << embree_endl;
+          return cout << "}";
+        }
+
+        friend __forceinline TensorLinearCubicBezierSurface clerp(const TensorLinearCubicBezierSurface& a, const TensorLinearCubicBezierSurface& b, const float t) {
+          return TensorLinearCubicBezierSurface(clerp(a.L,b.L,V(t)), clerp(a.R,b.R,V(t))); /* blends the L curves and the R curves of a and b separately, both with the same weight t */
+        }
+      };
+    
+    template<>
+      struct TensorLinearCubicBezierSurface<Vec2fa>
+    {
+      CubicBezierCurve<vfloat4> LR; /* every control point packs both boundary curves as (L.x, L.y, R.x, R.y) */
+      /*! default constructor */
+      __forceinline TensorLinearCubicBezierSurface() {}
+      /*! copy constructor */
+      __forceinline TensorLinearCubicBezierSurface(const TensorLinearCubicBezierSurface& curve)
+        : LR(curve.LR) {}
+      /*! copy assignment */
+      __forceinline TensorLinearCubicBezierSurface& operator= (const TensorLinearCubicBezierSurface& other) {
+        LR = other.LR; return *this;
+      }
+      /*! wraps control points that are already in the packed (L.x, L.y, R.x, R.y) layout */
+      __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve<vfloat4>& LR)
+        : LR(LR) {}
+      /*! packs lanes 0,1 of the L control points and lanes 0,1 of the R control points into one 4-wide curve */
+      __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve<Vec2fa>& L, const CubicBezierCurve<Vec2fa>& R)
+        : LR(shuffle<0,1,0,1>(vfloat4(L.v0),vfloat4(R.v0)),shuffle<0,1,0,1>(vfloat4(L.v1),vfloat4(R.v1)),shuffle<0,1,0,1>(vfloat4(L.v2),vfloat4(R.v2)),shuffle<0,1,0,1>(vfloat4(L.v3),vfloat4(R.v3))) {}
+      /*! unpacks the L boundary curve */
+      __forceinline CubicBezierCurve<Vec2fa> getL() const {
+        return CubicBezierCurve<Vec2fa>(Vec2fa(LR.v0),Vec2fa(LR.v1),Vec2fa(LR.v2),Vec2fa(LR.v3));
+      }
+      /*! unpacks the R boundary curve from lanes 2 and 3 */
+      __forceinline CubicBezierCurve<Vec2fa> getR() const {
+        return CubicBezierCurve<Vec2fa>(Vec2fa(shuffle<2,3,2,3>(LR.v0)),Vec2fa(shuffle<2,3,2,3>(LR.v1)),Vec2fa(shuffle<2,3,2,3>(LR.v2)),Vec2fa(shuffle<2,3,2,3>(LR.v3)));
+      }
+      /*! 2D bounds: merges the bounds LR.bounds() reports for the L lanes with those of the R lanes */
+      __forceinline BBox<Vec2fa> bounds() const
+      {
+        const BBox<vfloat4> b = LR.bounds();
+        const BBox<Vec2fa> bl(Vec2fa(b.lower),Vec2fa(b.upper));
+        const BBox<Vec2fa> br(Vec2fa(shuffle<2,3,2,3>(b.lower)),Vec2fa(shuffle<2,3,2,3>(b.upper)));
+        return merge(bl,br);
+      }
+      /*! range of the projection of both boundary curves onto axis */
+      __forceinline BBox1f bounds(const Vec2fa& axis) const
+      {
+        const CubicBezierCurve<vfloat4> LRx = LR;
+        const CubicBezierCurve<vfloat4> LRy(shuffle<1,0,3,2>(LR.v0),shuffle<1,0,3,2>(LR.v1),shuffle<1,0,3,2>(LR.v2),shuffle<1,0,3,2>(LR.v3));
+        const CubicBezierCurve<vfloat4> LRa = cmadd(shuffle<0>(vfloat4(axis)),LRx,shuffle<1>(vfloat4(axis))*LRy); /* lane 0 = dot(L,axis), lane 2 = dot(R,axis), assuming cmadd(a,b,c) computes a*b+c */
+        const BBox<vfloat4> Lb = LRa.bounds();
+        const BBox<vfloat4> Rb(shuffle<2>(Lb.lower),shuffle<2>(Lb.upper)); /* broadcast the R projection from lane 2 (the lane xfm() reads); lane 3 only holds the swapped product */
+        const BBox<vfloat4> b = merge(Lb,Rb);
+        return BBox1f(b.lower[0],b.upper[0]);
+      }
+      /*! projects every control point onto dx, yielding a scalar-valued surface */
+      __forceinline TensorLinearCubicBezierSurface<float> xfm(const Vec2fa& dx) const
+      {
+        const CubicBezierCurve<vfloat4> LRx = LR;
+        const CubicBezierCurve<vfloat4> LRy(shuffle<1,0,3,2>(LR.v0),shuffle<1,0,3,2>(LR.v1),shuffle<1,0,3,2>(LR.v2),shuffle<1,0,3,2>(LR.v3));
+        const CubicBezierCurve<vfloat4> LRa = cmadd(shuffle<0>(vfloat4(dx)),LRx,shuffle<1>(vfloat4(dx))*LRy);
+        return TensorLinearCubicBezierSurface<float>(CubicBezierCurve<float>(LRa.v0[0],LRa.v1[0],LRa.v2[0],LRa.v3[0]),
+                                                     CubicBezierCurve<float>(LRa.v0[2],LRa.v1[2],LRa.v2[2],LRa.v3[2]));
+      }
+      /*! projects every control point, taken relative to p, onto dx: dot(cp-p,dx) */
+      __forceinline TensorLinearCubicBezierSurface<float> xfm(const Vec2fa& dx, const Vec2fa& p) const
+      {
+        const vfloat4 pxyxy = shuffle<0,1,0,1>(vfloat4(p));
+        const CubicBezierCurve<vfloat4> LRx = LR-pxyxy;
+        const CubicBezierCurve<vfloat4> LRy(shuffle<1,0,3,2>(LRx.v0),shuffle<1,0,3,2>(LRx.v1),shuffle<1,0,3,2>(LRx.v2),shuffle<1,0,3,2>(LRx.v3)); /* swizzle the translated points so that p.y is subtracted in the y term as well */
+        const CubicBezierCurve<vfloat4> LRa = cmadd(shuffle<0>(vfloat4(dx)),LRx,shuffle<1>(vfloat4(dx))*LRy);
+        return TensorLinearCubicBezierSurface<float>(CubicBezierCurve<float>(LRa.v0[0],LRa.v1[0],LRa.v2[0],LRa.v3[0]),
+                                                     CubicBezierCurve<float>(LRa.v0[2],LRa.v1[2],LRa.v2[2],LRa.v3[2]));
+      }
+      /*! restricts both boundary curves to the u interval (one clip of the packed curve) */
+      __forceinline TensorLinearCubicBezierSurface clip_u(const Interval1f& u) const {
+        return TensorLinearCubicBezierSurface(LR.clip(u));
+      }
+      /*! restricts the surface to the v interval: new L is the blend at v.lower, new R the blend at v.upper */
+      __forceinline TensorLinearCubicBezierSurface clip_v(const Interval1f& v) const
+      {
+        const CubicBezierCurve<vfloat4> LL(shuffle<0,1,0,1>(LR.v0),shuffle<0,1,0,1>(LR.v1),shuffle<0,1,0,1>(LR.v2),shuffle<0,1,0,1>(LR.v3));
+        const CubicBezierCurve<vfloat4> RR(shuffle<2,3,2,3>(LR.v0),shuffle<2,3,2,3>(LR.v1),shuffle<2,3,2,3>(LR.v2),shuffle<2,3,2,3>(LR.v3));
+        return TensorLinearCubicBezierSurface(clerp(LL,RR,vfloat4(v.lower,v.lower,v.upper,v.upper)));
+      }
+      /*! clips in v first, then in u */
+      __forceinline TensorLinearCubicBezierSurface clip(const Interval1f& u, const Interval1f& v) const {
+        return clip_v(v).clip_u(u);
+      }
+      /*! splits the packed curve at u and constructs both halves in place */
+      __forceinline void split_u(TensorLinearCubicBezierSurface& left, TensorLinearCubicBezierSurface& right, const float u = 0.5f) const
+      {
+        CubicBezierCurve<vfloat4> LR0,LR1; LR.split(LR0,LR1,u);
+        new (&left ) TensorLinearCubicBezierSurface(LR0);
+        new (&right) TensorLinearCubicBezierSurface(LR1);
+      }
+      /*! vectorized split over the u range; enables every mask entry and then clears entry VSIZEX-1 */
+      __forceinline TensorLinearCubicBezierSurface<Vec2vfx> vsplit_u(vboolx& valid, const BBox1f& u) const {
+        valid = true; clear(valid,VSIZEX-1);
+        return TensorLinearCubicBezierSurface<Vec2vfx>(getL().split(u),getR().split(u));
+      }
+      /*! surface point: evaluates both boundary curves at u and blends them with weight v */
+      __forceinline Vec2fa eval(const float u, const float v) const
+      {
+        const vfloat4 p = LR.eval(u);
+        return Vec2fa(lerp(shuffle<0,1,0,1>(p),shuffle<2,3,2,3>(p),v));
+      }
+      /*! u-derivative: blends eval_dt of both boundary curves with weight v */
+      __forceinline Vec2fa eval_du(const float u, const float v) const
+      {
+        const vfloat4 dpdu = LR.eval_dt(u);
+        return Vec2fa(lerp(shuffle<0,1,0,1>(dpdu),shuffle<2,3,2,3>(dpdu),v));
+      }
+      /*! v-derivative: R(u)-L(u); the v argument is not used */
+      __forceinline Vec2fa eval_dv(const float u, const float v) const
+      {
+        const vfloat4 p = LR.eval(u);
+        return Vec2fa(shuffle<2,3,2,3>(p)-shuffle<0,1,0,1>(p));
+      }
+      /*! computes position and both first derivatives from a single evaluation of the packed curve */
+      __forceinline void eval(const float u, const float v, Vec2fa& p, Vec2fa& dpdu, Vec2fa& dpdv) const
+      {
+        vfloat4 p0, dp0du; LR.eval(u,p0,dp0du);
+        p = Vec2fa(lerp(shuffle<0,1,0,1>(p0),shuffle<2,3,2,3>(p0),v));
+        dpdu = Vec2fa(lerp(shuffle<0,1,0,1>(dp0du),shuffle<2,3,2,3>(dp0du),v));
+        dpdv = Vec2fa(shuffle<2,3,2,3>(p0)-shuffle<0,1,0,1>(p0));
+      }
+      /*! derivative curve of the packed control points, wrapped as a quadratic tensor-linear surface */
+      __forceinline TensorLinearQuadraticBezierSurface<Vec2fa> derivative_u() const {
+        return TensorLinearQuadraticBezierSurface<Vec2fa>(LR.derivative());
+      }
+      /*! difference of the two boundary curves */
+      __forceinline CubicBezierCurve<Vec2fa> derivative_v() const {
+        return getR()-getL();
+      }
+      /*! sum of the begin-to-end chords of L and R */
+      __forceinline Vec2fa axis_u() const
+      {
+        const CubicBezierCurve<Vec2fa> L = getL();
+        const CubicBezierCurve<Vec2fa> R = getR();
+        return (L.end()-L.begin())+(R.end()-R.begin());
+      }
+      /*! sum of the L-to-R offsets at both curve ends */
+      __forceinline Vec2fa axis_v() const
+      {
+        const CubicBezierCurve<Vec2fa> L = getL();
+        const CubicBezierCurve<Vec2fa> R = getR();
+        return (R.begin()-L.begin())+(R.end()-L.end());
+      }
+      /*! prints the unpacked L and R boundary curves */
+      friend embree_ostream operator<<(embree_ostream cout, const TensorLinearCubicBezierSurface& a)
+      {
+        return cout << "TensorLinearCubicBezierSurface" << embree_endl
+                    << "{" << embree_endl
+                    << "  L = " << a.getL() << ", " << embree_endl
+                    << "  R = " << a.getR() << embree_endl
+                    << "}";
+      }
+    };
+
+    typedef TensorLinearCubicBezierSurface<float> TensorLinearCubicBezierSurface1f;
+    typedef TensorLinearCubicBezierSurface<Vec2fa> TensorLinearCubicBezierSurface2fa;
+    typedef TensorLinearCubicBezierSurface<Vec3fa> TensorLinearCubicBezierSurface3fa;
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/patch.h b/thirdparty/embree-aarch64/kernels/subdiv/patch.h
new file mode 100644
index 0000000000..d58241b96d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/patch.h
@@ -0,0 +1,371 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_patch.h"
+#include "bilinear_patch.h"
+#include "bspline_patch.h"
+#include "bezier_patch.h"
+#include "gregory_patch.h"
+#include "tessellation_cache.h"
+
+#if 1 // debug output disabled: the macro below expands to nothing; change to 0 to select the variant in the #else branch
+#define PATCH_DEBUG_SUBDIVISION(ptr,x,y,z)
+#else // debug variant: folds ptr into a value c in [0,1] and overwrites *P; P and c are picked up by name at the expansion site
+#define PATCH_DEBUG_SUBDIVISION(ptr,x,y,z)            \
+  {                                                   \
+    size_t hex = (size_t)ptr;                          \
+    for (size_t i=0; i<4; i++) hex = hex ^ (hex >> 8);  \
+    const float c = (float)(((hex >> 0) ^ (hex >> 4) ^ (hex >> 8) ^ (hex >> 12) ^ (hex >> 16))&0xf)/15.0f; \
+    if (P) *P = Vertex(0.5f+0.5f*x,0.5f+0.5f*y,0.5f+0.5f*z,0.0f);         \
+    }               
+#endif
+/* Compile-time settings of the patch cache and of the patch types it stores. */
+#define PATCH_MAX_CACHE_DEPTH 2 // subdivision levels at or beyond this depth are not cached
+//#define PATCH_MIN_RESOLUTION 1     // FIXME: not yet completely implemented
+#define PATCH_MAX_EVAL_DEPTH_IRREGULAR 10     // maximum evaluation depth at irregular vertices (has to be greater than or equal to PATCH_MAX_CACHE_DEPTH)
+#define PATCH_MAX_EVAL_DEPTH_CREASE 10       // maximum evaluation depth at crease features (has to be greater than or equal to PATCH_MAX_CACHE_DEPTH)
+#define PATCH_USE_GREGORY 1        // 0 = no Gregory patches (bilinear fill patch), 1 = Gregory patch as fill patch, 2 = additionally use Gregory patches as early as possible
+
+#if PATCH_USE_GREGORY==2
+#define PATCH_USE_BEZIER_PATCH 1   // regular patches are stored as Bezier patches
+#else
+#define PATCH_USE_BEZIER_PATCH 0   // regular patches are stored as B-spline patches
+#endif
+
+#if PATCH_USE_BEZIER_PATCH
+#  define RegularPatch  BezierPatch
+#  define RegularPatchT BezierPatchT<Vertex,Vertex_t>
+#else
+#  define RegularPatch  BSplinePatch
+#  define RegularPatchT BSplinePatchT<Vertex,Vertex_t>
+#endif
+
+#if PATCH_USE_GREGORY
+#define IrregularFillPatch GregoryPatch
+#define IrregularFillPatchT GregoryPatchT<Vertex,Vertex_t>
+#else
+#define IrregularFillPatch BilinearPatch
+#define IrregularFillPatchT BilinearPatchT<Vertex,Vertex_t>
+#endif
+
+namespace embree
+{
+  template<typename Vertex, typename Vertex_t = Vertex>
+    struct __aligned(64) PatchT
+    {
+    public:
+    
+    typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch;
+    typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+    typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing;
+    typedef BezierCurveT<Vertex> BezierCurve;
+    
+    enum Type { /* tag that Ref keeps in the low 4 bits of its pointer, so every value has to stay below 16 */
+      INVALID_PATCH = 0, /* tag read back from a Ref that was built without a type */
+      BILINEAR_PATCH = 1, /* object is a BilinearPatch */
+      BSPLINE_PATCH = 2,  /* object is a BSplinePatch */
+      BEZIER_PATCH = 3,  /* object is a BezierPatch */
+      GREGORY_PATCH = 4, /* object is a GregoryPatch */
+      SUBDIVIDED_GENERAL_PATCH = 7, /* inner node with N children */
+      SUBDIVIDED_QUAD_PATCH = 8, /* inner node with four children */
+      EVAL_PATCH = 9, /* object is a serialized CatmullClarkPatch */
+    };
+    
+    struct Ref
+    {
+      /*! wraps an untagged pointer; the default argument yields the null reference */
+      __forceinline Ref(void* p = nullptr) : ptr(size_t(p)) {}
+
+      /*! adds the patch type to the object address, whose low 4 bits have to be zero */
+      __forceinline Ref (Type ty, void* in) : ptr(size_t(in)+ty) { assert((size_t(in) & 0xF) == 0); }
+
+      __forceinline Type  type  () const { return Type(ptr & 0xF); }      // low 4 bits: the tag
+      __forceinline void* object() const { return (void*) (ptr & ~0xF); } // remaining bits: the address
+
+      __forceinline operator bool() const { return ptr != 0; }
+      __forceinline operator size_t() const { return ptr; }
+
+      size_t ptr;
+    };
+
+    struct EvalPatch 
+    {
+      /* serializes a CatmullClarkPatch into memory obtained from alloc and tags the result as EVAL_PATCH */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch) 
+      {
+        const size_t numBytes = patch.bytes();
+        void* storage = alloc(numBytes);
+        size_t written = 0; patch.serialize(storage,written);
+        assert(written == numBytes);
+        return Ref(EVAL_PATCH, storage);
+      }
+    };
+
+    struct BilinearPatch 
+    {
+      /*! builds the bilinear patch from a CatmullClarkPatch */
+      __forceinline BilinearPatch (const CatmullClarkPatch& patch) : patch(patch) {}
+
+      /*! builds the bilinear patch of the face at the given half edge */
+      __forceinline BilinearPatch (const HalfEdge* edge, const char* vertices, size_t stride) : patch(edge,vertices,stride) {}
+
+      /*! allocates a BilinearPatch for a CatmullClarkPatch, tagged as BILINEAR_PATCH; the border curves are not used */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch,
+                                   const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) {
+        void* mem = alloc(sizeof(BilinearPatch));
+        return Ref(BILINEAR_PATCH, new (mem) BilinearPatch(patch));
+      }
+
+      /*! allocates a BilinearPatch for a half edge, tagged as BILINEAR_PATCH */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) {
+        void* mem = alloc(sizeof(BilinearPatch));
+        return Ref(BILINEAR_PATCH, new (mem) BilinearPatch(edge,vertices,stride));
+      }
+      BilinearPatchT<Vertex,Vertex_t> patch;
+    };
+    
+    struct BSplinePatch 
+    {
+      /*! builds the B-spline patch of the face at the given half edge */
+      __forceinline BSplinePatch (const HalfEdge* edge, const char* vertices, size_t stride) : patch(edge,vertices,stride) {}
+
+      /*! builds the B-spline patch from a CatmullClarkPatch and its border curves */
+      __forceinline BSplinePatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) : patch(patch,border0,border1,border2,border3) {}
+
+      /*! allocates a BSplinePatch for a half edge, tagged as BSPLINE_PATCH */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) {
+        void* mem = alloc(sizeof(BSplinePatch));
+        return Ref(BSPLINE_PATCH, new (mem) BSplinePatch(edge,vertices,stride));
+      }
+
+      /*! allocates a BSplinePatch for a CatmullClarkPatch, tagged as BSPLINE_PATCH */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch,
+                                   const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) {
+        void* mem = alloc(sizeof(BSplinePatch));
+        return Ref(BSPLINE_PATCH, new (mem) BSplinePatch(patch,border0,border1,border2,border3));
+      }
+      BSplinePatchT<Vertex,Vertex_t> patch;
+    };
+
+    struct BezierPatch
+    {
+      /*! builds the Bezier patch of the face at the given half edge */
+      __forceinline BezierPatch (const HalfEdge* edge, const char* vertices, size_t stride) : patch(edge,vertices,stride) {}
+
+      /*! builds the Bezier patch from a CatmullClarkPatch and its border curves */
+      __forceinline BezierPatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) : patch(patch,border0,border1,border2,border3) {}
+
+      /*! allocates a BezierPatch for a half edge, tagged as BEZIER_PATCH */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) {
+        void* mem = alloc(sizeof(BezierPatch));
+        return Ref(BEZIER_PATCH, new (mem) BezierPatch(edge,vertices,stride));
+      }
+
+      /*! allocates a BezierPatch for a CatmullClarkPatch, tagged as BEZIER_PATCH */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch,
+                                   const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) {
+        void* mem = alloc(sizeof(BezierPatch));
+        return Ref(BEZIER_PATCH, new (mem) BezierPatch(patch,border0,border1,border2,border3));
+      }
+      BezierPatchT<Vertex,Vertex_t> patch;
+    };
+    
+    struct GregoryPatch
+    {
+      /*! builds the Gregory patch via a temporary CatmullClarkPatch of the face at the given half edge */
+      __forceinline GregoryPatch (const HalfEdge* edge, const char* vertices, size_t stride) : patch(CatmullClarkPatch(edge,vertices,stride)) {}
+
+      /*! builds the Gregory patch from a CatmullClarkPatch and its border curves */
+      __forceinline GregoryPatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) : patch(patch,border0,border1,border2,border3) {}
+
+      /*! allocates a GregoryPatch for a half edge, tagged as GREGORY_PATCH */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) {
+        void* mem = alloc(sizeof(GregoryPatch));
+        return Ref(GREGORY_PATCH, new (mem) GregoryPatch(edge,vertices,stride));
+      }
+
+      /*! allocates a GregoryPatch for a CatmullClarkPatch, tagged as GREGORY_PATCH */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch,
+                                   const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) {
+        void* mem = alloc(sizeof(GregoryPatch));
+        return Ref(GREGORY_PATCH, new (mem) GregoryPatch(patch,border0,border1,border2,border3));
+      }
+      GregoryPatchT<Vertex,Vertex_t> patch;
+    };
+    
+    struct SubdividedQuadPatch
+    {
+      /*! copies the four child references */
+      __forceinline SubdividedQuadPatch(Ref children[4]) {
+        child[0] = children[0]; child[1] = children[1]; child[2] = children[2]; child[3] = children[3];
+      }
+      /*! allocates an inner node with four children, tagged as SUBDIVIDED_QUAD_PATCH */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, Ref children[4]) {
+        void* mem = alloc(sizeof(SubdividedQuadPatch));
+        return Ref(SUBDIVIDED_QUAD_PATCH, new (mem) SubdividedQuadPatch(children));
+      }
+      Ref child[4];
+    };
+    
+    struct SubdividedGeneralPatch
+    {
+      /*! allocates an inner node for N children, tagged as SUBDIVIDED_GENERAL_PATCH */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, Ref* children, const unsigned N) {
+        return Ref(SUBDIVIDED_GENERAL_PATCH, new (alloc(sizeof(SubdividedGeneralPatch))) SubdividedGeneralPatch(children,N));
+      }
+      __forceinline SubdividedGeneralPatch(Ref* children, const unsigned N) : N(N) { /* copies the first N child references */
+        assert(N <= MAX_PATCH_VALENCE); /* child[] has exactly MAX_PATCH_VALENCE slots; a larger N would write past the array */
+        for (unsigned i=0; i<N; i++) child[i] = children[i];
+      }
+      unsigned N;
+      Ref child[MAX_PATCH_VALENCE];
+    };
+    
+    /*! Default constructor. */
+    __forceinline PatchT () {}
+    
+    template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride)
+    {
+      /* caching is switched off at compile time: nothing is built */
+      if (PATCH_MAX_CACHE_DEPTH == 0) return nullptr;
+
+      /* faces whose patch type maps directly onto a leaf patch */
+      switch (edge->patch_type) {
+      case HalfEdge::BILINEAR_PATCH:       return BilinearPatch::create(alloc,edge,vertices,stride);
+      case HalfEdge::REGULAR_QUAD_PATCH:   return RegularPatch::create(alloc,edge,vertices,stride);
+#if PATCH_USE_GREGORY == 2
+      case HalfEdge::IRREGULAR_QUAD_PATCH: return GregoryPatch::create(alloc,edge,vertices,stride);
+#endif
+      default: break;
+      }
+
+      /* every other face goes through the general Catmull-Clark path, starting at depth 0 */
+      GeneralCatmullClarkPatch patch(edge,vertices,stride);
+      return PatchT::create(alloc,patch,edge,vertices,stride,0);
+    }
+
+    template<typename Allocator>
+    __noinline static Ref create(const Allocator& alloc, GeneralCatmullClarkPatch& patch, const HalfEdge* edge, const char* vertices, size_t stride, size_t depth)
+    {  
+      /* convert into standard quad patch if possible */
+      if (likely(patch.isQuadPatch())) 
+      {
+        CatmullClarkPatch qpatch; patch.init(qpatch);
+        return PatchT::create(alloc,qpatch,edge,vertices,stride,depth);
+      }
+   /* from here on the patch is not a quad patch */
+      /* only cache up to PATCH_MAX_CACHE_DEPTH levels; beyond that a null reference is returned */
+      if (depth >= PATCH_MAX_CACHE_DEPTH)
+        return nullptr;
+         /* subdivision yields N sub-patches, stored in 'patches' */
+      /* subdivide patch */
+      unsigned N;
+      array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE> patches; 
+      patch.subdivide(patches,N);
+      /* four sub-patches are stored as a SubdividedQuadPatch, any other count as a SubdividedGeneralPatch */
+      if (N == 4) 
+      {
+        Ref child[4];
+#if PATCH_USE_GREGORY == 2
+        BezierCurve borders[GeneralCatmullClarkPatch::SIZE]; patch.getLimitBorder(borders);
+        BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+        BezierCurve border1l,border1r; borders[1].subdivide(border1l,border1r);
+        BezierCurve border2l,border2r; borders[2].subdivide(border2l,border2r);
+        BezierCurve border3l,border3r; borders[3].subdivide(border3l,border3r);
+        GeneralCatmullClarkPatch::fix_quad_ring_order(patches);
+        child[0] = PatchT::create(alloc,patches[0],edge,vertices,stride,depth+1,&border0l,nullptr,nullptr,&border3r);
+        child[1] = PatchT::create(alloc,patches[1],edge,vertices,stride,depth+1,&border0r,&border1l,nullptr,nullptr);
+        child[2] = PatchT::create(alloc,patches[2],edge,vertices,stride,depth+1,nullptr,&border1r,&border2l,nullptr);
+        child[3] = PatchT::create(alloc,patches[3],edge,vertices,stride,depth+1,nullptr,nullptr,&border2r,&border3l);
+#else
+        GeneralCatmullClarkPatch::fix_quad_ring_order(patches);
+        for (size_t i=0; i<4; i++)
+          child[i] = PatchT::create(alloc,patches[i],edge,vertices,stride,depth+1);
+#endif
+        return SubdividedQuadPatch::create(alloc,child);
+      }
+      else 
+      {
+        assert(N<MAX_PATCH_VALENCE); /* NOTE(review): child[] holds MAX_PATCH_VALENCE entries, so N == MAX_PATCH_VALENCE would also fit - confirm that the strict '<' is intended */
+        Ref child[MAX_PATCH_VALENCE];
+        /* one child per sub-patch */
+#if PATCH_USE_GREGORY == 2
+        BezierCurve borders[GeneralCatmullClarkPatch::SIZE]; 
+        patch.getLimitBorder(borders);
+/* child i0 gets the first half of border i0 and the second half of the preceding border i2 (wrapping around at 0) */
+        for (size_t i0=0; i0<N; i0++) {
+          const size_t i2 = i0==0 ? N-1 : i0-1; 
+          BezierCurve border0l,border0r; borders[i0].subdivide(border0l,border0r);
+          BezierCurve border2l,border2r; borders[i2].subdivide(border2l,border2r);
+          child[i0] = PatchT::create(alloc,patches[i0],edge,vertices,stride,depth+1, &border0l, nullptr, nullptr, &border2r);
+        }
+#else
+        for (size_t i=0; i<N; i++)
+          child[i] = PatchT::create(alloc,patches[i],edge,vertices,stride,depth+1);
+#endif
+        return SubdividedGeneralPatch::create(alloc,child,N);
+      }
+      /* not reached: both branches above return */
+      return nullptr;
+    }
+
+    static __forceinline bool final(const CatmullClarkPatch& patch, const typename CatmullClarkRing::Type type, size_t depth) 
+    { /* returns true once depth has reached the evaluation depth limit that applies to this ring type */
+      const size_t max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR;
+//#if PATCH_MIN_RESOLUTION
+//      return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=max_eval_depth;
+//#else
+      return depth>=max_eval_depth; /* 'patch' is only referenced by the disabled PATCH_MIN_RESOLUTION variant above */
+//#endif
+    }
+
+    template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, CatmullClarkPatch& patch, const HalfEdge* edge, const char* vertices, size_t stride, size_t depth,
+                                   const BezierCurve* border0 = nullptr, const BezierCurve* border1 = nullptr, const BezierCurve* border2 = nullptr, const BezierCurve* border3 = nullptr)
+    {
+      const typename CatmullClarkPatch::Type ty = patch.type();
+
+      /* evaluation depth limit reached: store a leaf, regular if possible, otherwise the fill patch */
+      if (unlikely(final(patch,ty,depth))) {
+        if (ty & CatmullClarkRing::TYPE_REGULAR) return RegularPatch::create(alloc,patch,border0,border1,border2,border3);
+        return IrregularFillPatch::create(alloc,patch,border0,border1,border2,border3);
+      }
+      /* patches flagged TYPE_REGULAR_CREASES become a regular leaf right away */
+      if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) {
+        assert(depth > 0); return RegularPatch::create(alloc,patch,border0,border1,border2,border3);
+      }
+#if PATCH_USE_GREGORY == 2
+      if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) {
+        assert(depth > 0); return GregoryPatch::create(alloc,patch,border0,border1,border2,border3);
+      }
+#endif
+      /* cache depth exhausted: keep the serialized patch for evaluation on demand */
+      if (depth >= PATCH_MAX_CACHE_DEPTH)
+        return EvalPatch::create(alloc,patch);
+
+      /* otherwise subdivide once and build the four children without border curves */
+      array_t<CatmullClarkPatch,4> patches;
+      patch.subdivide(patches);
+      Ref child[4];
+      for (size_t i=0; i<4; i++)
+        child[i] = PatchT::create(alloc,patches[i],edge,vertices,stride,depth+1);
+      return SubdividedQuadPatch::create(alloc,child);
+    }
+  };
+
+  typedef PatchT<Vec3fa,Vec3fa_t> Patch3fa;
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/patch_eval.h b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval.h
new file mode 100644
index 0000000000..482d015fa3
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval.h
@@ -0,0 +1,129 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "patch.h"
+#include "feature_adaptive_eval.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename Vertex, typename Vertex_t = Vertex>
+      struct PatchEval
+      {
+      public:
+        
+        typedef PatchT<Vertex,Vertex_t> Patch;
+        typedef typename Patch::Ref Ref;
+        typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+        
+        PatchEval (SharedLazyTessellationCache::CacheEntry& entry, size_t commitCounter, 
+                   const HalfEdge* edge, const char* vertices, size_t stride, const float u, const float v, 
+                   Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv)
+        : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv)
+        { /* evaluates the face at (u,v) through the cached patch tree and falls back to FeatureAdaptiveEval when that is not possible */
+          /* conservative time for the very first allocation */
+          auto time = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter);
+/* look up the patch tree of this cache entry; the lambda builds it with memory from the cache (presumably only on a miss - confirm against lookup) */
+          Ref patch = SharedLazyTessellationCache::lookup(entry,commitCounter,[&] () {
+              auto alloc = [&](size_t bytes) { return SharedLazyTessellationCache::malloc(bytes); };
+              return Patch::create(alloc,edge,vertices,stride);
+            },true);
+/* compare the time taken before the lookup with the current one to validate the allocations */
+          auto curTime = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter);
+          const bool allAllocationsValid = SharedLazyTessellationCache::validTime(time,curTime);
+/* the cache is unlocked on both paths, and only after the cached tree has been evaluated */
+          if (patch && allAllocationsValid &&  eval(patch,u,v,1.0f,0)) {
+            SharedLazyTessellationCache::unlock();
+            return;
+          }
+          SharedLazyTessellationCache::unlock();
+          FeatureAdaptiveEval<Vertex,Vertex_t>(edge,vertices,stride,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv); /* fallback without the cache */
+          PATCH_DEBUG_SUBDIVISION(edge,c,-1,-1);
+        }
+        
+        __forceinline bool eval_quad(const typename Patch::SubdividedQuadPatch* This, const float u, const float v, const float dscale, const size_t depth)
+        {
+          /* children: 0=(low u,low v), 1=(high u,low v), 2=(high u,high v), 3=(low u,high v); coordinates are rescaled to the chosen child */
+          const bool top   = !(v < 0.5f);
+          const bool right = top ? (u > 0.5f) : !(u < 0.5f);
+          const size_t quadrant = top ? (right ? 2 : 3) : (right ? 1 : 0);
+          const float cu = right ? 2.0f*u-1.0f : 2.0f*u;
+          const float cv = top   ? 2.0f*v-1.0f : 2.0f*v;
+          return eval(This->child[quadrant],cu,cv,2.0f*dscale,depth+1);
+        }
+        
+        bool eval_general(const typename Patch::SubdividedGeneralPatch* This, const float U, const float V, const size_t depth)
+        { /* picks the child from the integer parts of U/2 and V/2 and derives the child-local coordinates from the fractional parts */
+          const unsigned l = (unsigned) floor(0.5f*U); const float u = 2.0f*frac(0.5f*U)-0.5f; 
+          const unsigned h = (unsigned) floor(0.5f*V); const float v = 2.0f*frac(0.5f*V)-0.5f; 
+          const unsigned i = 4*h+l; assert(i<This->N); /* child index, checked against the stored child count */
+          return eval(This->child[i],u,v,1.0f,depth+1); /* the derivative scale is passed as 1.0f for the child */
+        }
+        
+        bool eval(Ref This, const float& u, const float& v, const float dscale, const size_t depth) 
+        {
+          /* dispatches on the type tag of the reference; false means the caller has to fall back to uncached evaluation */
+          if (!This) return false;
+          void* const obj = This.object();
+          
+          switch (This.type()) 
+          {
+          case Patch::BILINEAR_PATCH: {
+            typename Patch::BilinearPatch* const leaf = (typename Patch::BilinearPatch*)obj;
+            leaf->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); 
+            PATCH_DEBUG_SUBDIVISION(This,-1,c,c);
+            return true;
+          }
+          case Patch::BSPLINE_PATCH: {
+            typename Patch::BSplinePatch* const leaf = (typename Patch::BSplinePatch*)obj;
+            leaf->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale);
+            PATCH_DEBUG_SUBDIVISION(This,-1,c,-1);
+            return true;
+          }
+          case Patch::BEZIER_PATCH: {
+            typename Patch::BezierPatch* const leaf = (typename Patch::BezierPatch*)obj;
+            leaf->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale);
+            PATCH_DEBUG_SUBDIVISION(This,-1,c,-1);
+            return true;
+          }
+          case Patch::GREGORY_PATCH: {
+            typename Patch::GregoryPatch* const leaf = (typename Patch::GregoryPatch*)obj;
+            leaf->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); 
+            PATCH_DEBUG_SUBDIVISION(This,-1,-1,c);
+            return true;
+          }
+          case Patch::SUBDIVIDED_QUAD_PATCH: {
+            /* inner node with four children: descend into the quadrant containing (u,v) */
+            return eval_quad((typename Patch::SubdividedQuadPatch*)obj,u,v,dscale,depth);
+          }
+          case Patch::SUBDIVIDED_GENERAL_PATCH: { 
+            /* inner node with N children: expected with an unscaled derivative only */
+            assert(dscale == 1.0f); 
+            return eval_general((typename Patch::SubdividedGeneralPatch*)obj,u,v,depth); 
+          }
+          case Patch::EVAL_PATCH: { 
+            /* serialized Catmull-Clark patch: restore it and evaluate it feature adaptively */
+            CatmullClarkPatch patch; patch.deserialize(obj);
+            FeatureAdaptiveEval<Vertex,Vertex_t>(patch,u,v,dscale,depth,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv);
+            return true;
+          }
+          default: 
+            assert(false); 
+            return false;
+          }
+        }
+        
+      private:
+        Vertex* const P;
+        Vertex* const dPdu;
+        Vertex* const dPdv;
+        Vertex* const ddPdudu;
+        Vertex* const ddPdvdv;
+        Vertex* const ddPdudv;
+      };
+  }
+}
+  
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_grid.h b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_grid.h
new file mode 100644
index 0000000000..c05db55f4c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_grid.h
@@ -0,0 +1,245 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "patch.h"
+#include "feature_adaptive_eval_grid.h"
+
+namespace embree
+{
+  namespace isa 
+  {
+    struct PatchEvalGrid // evaluates a Patch3fa reference tree on the window [x0,x1]x[y0,y1] of a swidth x sheight sample grid
+    {
+      typedef Patch3fa Patch;
+      typedef Patch::Ref Ref;
+      typedef GeneralCatmullClarkPatch3fa GeneralCatmullClarkPatch;
+      typedef CatmullClarkPatch3fa CatmullClarkPatch;
+      typedef BSplinePatch3fa BSplinePatch;
+      typedef BezierPatch3fa BezierPatch;
+      typedef GregoryPatch3fa GregoryPatch;
+      typedef BilinearPatch3fa BilinearPatch;
+
+    private:
+      const unsigned x0,x1; // first and last grid column to evaluate (inclusive, see the count assertion in the constructor)
+      const unsigned y0,y1; // first and last grid row to evaluate (inclusive)
+      const unsigned swidth,sheight; // resolution of the complete sample grid
+      const float rcp_swidth, rcp_sheight; // 1/(swidth-1) and 1/(sheight-1): scale a grid index to the stored u,v value
+      float* const Px; // position outputs; grid point (ix,iy) is stored at offset (iy-y0)*dwidth+(ix-x0)
+      float* const Py;
+      float* const Pz;
+      float* const U; // u,v outputs, same layout as the positions
+      float* const V;
+      float* const Nx; // normal outputs; normals are skipped entirely when Nx is null
+      float* const Ny;
+      float* const Nz;
+      const unsigned dwidth,dheight; // dwidth is the row pitch used for all output arrays
+      unsigned count; // grid points accounted for so far; compared with the window size in the constructor
+
+    public:      
+
+      PatchEvalGrid (Ref patch, unsigned subPatch, // the constructor performs the whole evaluation and fills the output arrays
+                     const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, 
+                     float* Px, float* Py, float* Pz, float* U, float* V, 
+                     float* Nx, float* Ny, float* Nz,
+                     const unsigned dwidth, const unsigned dheight)
+      : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), 
+        Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), dheight(dheight), count(0)
+      {
+        assert(swidth < (2<<20) && sheight < (2<<20));
+        const BBox2f srange(Vec2f(0.0f,0.0f),Vec2f(float(swidth-1),float(sheight-1))); // extent of the whole patch in grid units
+        const BBox2f erange(Vec2f(float(x0),float(y0)),Vec2f((float)x1,(float)y1)); // window that has to be evaluated, in grid units
+        bool done MAYBE_UNUSED = eval(patch,subPatch,srange,erange); // the result is only consumed by the assertion below
+        assert(done);
+        assert(count == (x1-x0+1)*(y1-y0+1)); // every point of the inclusive window must have been covered
+      }
+
+      template<typename Patch>
+      __forceinline void evalLocalGrid(const Patch* patch, const BBox2f& srange, const int lx0, const int lx1, const int ly0, const int ly1) // evaluates one leaf patch on columns [lx0,lx1) and rows [ly0,ly1)
+      {
+        const float scale_x = rcp(srange.upper.x-srange.lower.x); // converts grid units inside srange into the leaf's local parameter
+        const float scale_y = rcp(srange.upper.y-srange.lower.y);
+        count += (lx1-lx0)*(ly1-ly0);
+        
+#if 0 // disabled scalar loop (positions and u,v only, no normals); kept next to the SIMD path below
+        for (unsigned iy=ly0; iy<ly1; iy++) {
+          for (unsigned ix=lx0; ix<lx1; ix++) {
+            const float lu = select(ix == swidth -1, float(1.0f), (float(ix)-srange.lower.x)*scale_x);
+            const float lv = select(iy == sheight-1, float(1.0f), (float(iy)-srange.lower.y)*scale_y);
+            const Vec3fa p = patch->patch.eval(lu,lv);
+            const float u = float(ix)*rcp_swidth;
+            const float v = float(iy)*rcp_sheight;
+            const int ofs = (iy-y0)*dwidth+(ix-x0);
+            Px[ofs] = p.x;
+            Py[ofs] = p.y;
+            Pz[ofs] = p.z;
+            U[ofs] = u;
+            V[ofs] = v;
+          }
+        }
+#else
+        foreach2(lx0,lx1,ly0,ly1,[&](const vboolx& valid, const vintx& ix, const vintx& iy) {
+            const vfloatx lu = select(ix == swidth -1, vfloatx(1.0f), (vfloatx(ix)-srange.lower.x)*scale_x); // the last grid column is pinned to exactly 1
+            const vfloatx lv = select(iy == sheight-1, vfloatx(1.0f), (vfloatx(iy)-srange.lower.y)*scale_y); // the last grid row is pinned to exactly 1
+            const Vec3vfx p = patch->patch.eval(lu,lv);
+            Vec3vfx n = zero;
+            if (unlikely(Nx != nullptr)) n = normalize_safe(patch->patch.normal(lu,lv)); // normals are only computed when an output array was supplied
+            const vfloatx u = vfloatx(ix)*rcp_swidth; // stored u,v are relative to the whole grid, not to this leaf
+            const vfloatx v = vfloatx(iy)*rcp_sheight;
+            const vintx ofs = (iy-y0)*dwidth+(ix-x0); // per-lane offset into the output arrays
+            if (likely(all(valid)) && all(iy==iy[0])) { // all lanes active and in one row: unmasked stores starting at lane 0's offset
+              const unsigned ofs2 = ofs[0]; // presumably lanes hold consecutive ix here, so one contiguous store is correct - TODO confirm against foreach2
+              vfloatx::storeu(Px+ofs2,p.x);
+              vfloatx::storeu(Py+ofs2,p.y);
+              vfloatx::storeu(Pz+ofs2,p.z);
+              vfloatx::storeu(U+ofs2,u);
+              vfloatx::storeu(V+ofs2,v);
+              if (unlikely(Nx != nullptr)) {
+                vfloatx::storeu(Nx+ofs2,n.x);
+                vfloatx::storeu(Ny+ofs2,n.y);
+                vfloatx::storeu(Nz+ofs2,n.z);
+              }
+            } else {
+              foreach_unique_index(valid,iy,[&](const vboolx& valid, const int iy0, const int j) { // presumably called once per distinct row with that row's lane mask - TODO confirm
+                  const unsigned ofs2 = ofs[j]-j; // base offset chosen so that lane j lands on ofs[j] in the masked stores
+                  vfloatx::storeu(valid,Px+ofs2,p.x);
+                  vfloatx::storeu(valid,Py+ofs2,p.y);
+                  vfloatx::storeu(valid,Pz+ofs2,p.z);
+                  vfloatx::storeu(valid,U+ofs2,u);
+                  vfloatx::storeu(valid,V+ofs2,v);
+                  if (unlikely(Nx != nullptr)) {
+                    vfloatx::storeu(valid,Nx+ofs2,n.x);
+                    vfloatx::storeu(valid,Ny+ofs2,n.y);
+                    vfloatx::storeu(valid,Nz+ofs2,n.z);
+                  }
+                });
+            }
+          });
+#endif
+      }
+
+      bool eval(Ref This, const BBox2f& srange, const BBox2f& erange, const unsigned depth) // evaluates the part of erange covered by This (which spans srange); false for a null reference with work left or an unhandled type
+      {
+        if (erange.empty())
+          return true;
+        
+        const int lx0 = (int) ceilf(erange.lower.x); // first integer column inside erange
+        const int lx1 = (int) ceilf(erange.upper.x) + (erange.upper.x == x1 && (srange.lower.x < erange.upper.x || erange.upper.x == 0)); // exclusive end; the +1 pulls in column x1 when erange ends exactly on it
+        const int ly0 = (int) ceilf(erange.lower.y);
+        const int ly1 = (int) ceilf(erange.upper.y) + (erange.upper.y == y1 && (srange.lower.y < erange.upper.y || erange.upper.y == 0)); // same rule for the last row y1
+        if (lx0 >= lx1 || ly0 >= ly1) 
+          return true;
+
+        if (!This) 
+          return false;
+        
+        switch (This.type()) 
+        {
+        case Patch::BILINEAR_PATCH: {
+          evalLocalGrid((Patch::BilinearPatch*)This.object(),srange,lx0,lx1,ly0,ly1);
+          return true;
+        }
+        case Patch::BSPLINE_PATCH: {
+          evalLocalGrid((Patch::BSplinePatch*)This.object(),srange,lx0,lx1,ly0,ly1);
+          return true;
+        }
+        case Patch::BEZIER_PATCH: {
+          evalLocalGrid((Patch::BezierPatch*)This.object(),srange,lx0,lx1,ly0,ly1);
+          return true;
+        }
+        case Patch::GREGORY_PATCH: {
+          evalLocalGrid((Patch::GregoryPatch*)This.object(),srange,lx0,lx1,ly0,ly1);
+          return true;
+        }
+        case Patch::SUBDIVIDED_QUAD_PATCH: 
+        {
+          const Vec2f c = srange.center(); // split srange into four quadrants around its center
+          const BBox2f srange0(srange.lower,c);
+          const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y));
+          const BBox2f srange2(c,srange.upper);
+          const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y));
+          
+          Patch::SubdividedQuadPatch* patch = (Patch::SubdividedQuadPatch*)This.object();
+          eval(patch->child[0],srange0,intersect(srange0,erange),depth+1); // NOTE(review): the four child results are discarded, so a null child is not reported - confirm this is intended
+          eval(patch->child[1],srange1,intersect(srange1,erange),depth+1);
+          eval(patch->child[2],srange2,intersect(srange2,erange),depth+1);
+          eval(patch->child[3],srange3,intersect(srange3,erange),depth+1);
+          return true;
+        }
+        case Patch::EVAL_PATCH: { 
+          CatmullClarkPatch patch; patch.deserialize(This.object());
+          FeatureAdaptiveEvalGrid(patch,srange,erange,depth,x0,x1,y0,y1,swidth,sheight,Px,Py,Pz,U,V,Nx,Ny,Nz,dwidth,dheight); // temporary object; presumably fills the same output arrays itself - TODO confirm
+          count += (lx1-lx0)*(ly1-ly0); // accounted for here because evalLocalGrid is not used in this case
+          return true;
+        }
+        default: 
+          assert(false); 
+          return false;
+        }
+      }
+
+      bool eval(Ref This, unsigned subPatch, const BBox2f& srange, const BBox2f& erange) // entry point: selects child subPatch of a general patch, otherwise evaluates This directly
+      {
+        if (!This) 
+          return false;
+
+        switch (This.type()) 
+        {
+        case Patch::SUBDIVIDED_GENERAL_PATCH: { 
+          Patch::SubdividedGeneralPatch* patch = (Patch::SubdividedGeneralPatch*)This.object();
+          assert(subPatch < patch->N);
+          return eval(patch->child[subPatch],srange,erange,1); // children of a general patch start at depth 1
+        }
+        default: 
+          assert(subPatch == 0); // every other patch type has exactly one sub-patch
+          return eval(This,srange,erange,0);
+        }
+      }
+    };
+
+    __forceinline unsigned patch_eval_subdivision_count (const HalfEdge* h)
+    {
+      /* a four-edge face counts as one patch, any other face as one patch per edge */
+      const unsigned numFaceEdges = h->numEdges();
+      return (numFaceEdges == 4) ? 1u : numFaceEdges;
+    }
+    
+    template<typename Tessellator>
+      inline void patch_eval_subdivision (const HalfEdge* h, Tessellator tessellator)
+    {
+      /* walk once around the face starting at h and record the per-edge data */
+      const unsigned numFaceEdges = h->numEdges();
+      int neighborSubdiv[GeneralCatmullClarkPatch3fa::SIZE]; // FIXME: use array_t
+      float levels[GeneralCatmullClarkPatch3fa::SIZE];
+      const HalfEdge* edge = h;
+      for (unsigned e=0; e<numFaceEdges; e++, edge=edge->next()) {
+        assert(e<GeneralCatmullClarkPatch3fa::SIZE);
+        neighborSubdiv[e] = edge->hasOpposite() ? edge->opposite()->numEdges() != 4 : 0; // 1 when the face across this edge is not a four-edge face
+        levels[e] = edge->edge_level;
+      }
+      if (numFaceEdges == 4) { // four-edge face: hand over a single patch spanning the unit square
+        const Vec2f uv[4] = { Vec2f(0.0f,0.0f), Vec2f(1.0f,0.0f), Vec2f(1.0f,1.0f), Vec2f(0.0f,1.0f) };
+        tessellator(uv,neighborSubdiv,levels,0);
+        return;
+      }
+      /* any other face: one sub-patch per edge; sub-patch i is encoded in uv cell (i&3, (i>>2)&3) */
+      for (unsigned sub=0; sub<numFaceEdges; sub++)
+      {
+        assert(sub<MAX_PATCH_VALENCE);
+        static_assert(MAX_PATCH_VALENCE <= 16, "MAX_PATCH_VALENCE > 16");
+        const int row = (sub >> 2) & 3, col = sub & 3;
+        const Vec2f subPatchID((float)col,(float)row);
+        const Vec2f uv[4] = { 2.0f*subPatchID + (0.5f+Vec2f(0.0f,0.0f)),
+                              2.0f*subPatchID + (0.5f+Vec2f(1.0f,0.0f)),
+                              2.0f*subPatchID + (0.5f+Vec2f(1.0f,1.0f)),
+                              2.0f*subPatchID + (0.5f+Vec2f(0.0f,1.0f)) };
+        const int neighborSubdiv1[4] = { 0,0,0,0 };
+        const float cur = 0.5f*levels[(sub+0)%numFaceEdges], prev = 0.5f*levels[(sub+numFaceEdges-1)%numFaceEdges]; // half the level of this edge and of the previous one
+        const float levels1[4] = { cur, cur, prev, prev };
+        tessellator(uv,neighborSubdiv1,levels1,sub);
+      }
+    }
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_simd.h b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_simd.h
new file mode 100644
index 0000000000..28016d9e20
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_simd.h
@@ -0,0 +1,127 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "patch.h"
+#include "feature_adaptive_eval_simd.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename vbool, typename vint, typename vfloat, typename Vertex, typename Vertex_t = Vertex>
+      struct PatchEvalSimd // evaluates position and derivatives for a vector of u,v lanes, using a cached patch tree when one is available
+      {
+      public:
+        
+        typedef PatchT<Vertex,Vertex_t> Patch;
+        typedef typename Patch::Ref Ref;
+        typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+
+        PatchEvalSimd (SharedLazyTessellationCache::CacheEntry& entry, size_t commitCounter, // the constructor performs the whole evaluation
+                       const HalfEdge* edge, const char* vertices, size_t stride, const vbool& valid0, const vfloat& u, const vfloat& v, 
+                       float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N)
+        : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N)
+        {
+          /* conservative time for the very first allocation */
+          auto time = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter);
+
+          Ref patch = SharedLazyTessellationCache::lookup(entry,commitCounter,[&] () { // the lambda builds the patch tree; presumably only invoked on a cache miss - TODO confirm
+              auto alloc = [](size_t bytes) { return SharedLazyTessellationCache::malloc(bytes); };
+              return Patch::create(alloc,edge,vertices,stride);
+            }, true);
+
+          auto curTime = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter);
+          const bool allAllocationsValid = SharedLazyTessellationCache::validTime(time,curTime);
+          
+          patch = allAllocationsValid ? patch : nullptr; // drop the cached tree when validTime() rejects the span between the two time stamps
+
+          /* use cached data structure for calculations */
+          const vbool valid1 = patch ? eval(valid0,patch,u,v,1.0f,0) : vbool(false); // lanes that the cached tree could evaluate
+          SharedLazyTessellationCache::unlock(); // released only after the cached data has been read
+          const vbool valid2 = valid0 & !valid1; // requested lanes that are still unevaluated
+          if (any(valid2)) {
+            FeatureAdaptiveEvalSimd<vbool,vint,vfloat,Vertex,Vertex_t>(edge,vertices,stride,valid2,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dstride,N); // fallback that works from the half-edge data instead of the cache
+          }
+        }
+        
+        vbool eval_quad(const vbool& valid, const typename Patch::SubdividedQuadPatch* This, const vfloat& u, const vfloat& v, const float dscale, const size_t depth) // routes each lane to the child quadrant containing its u,v
+        {
+          vbool ret = false;
+          const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f;
+          const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f;
+          const vbool u0v0_mask = valid & u0_mask & v0_mask;
+          const vbool u0v1_mask = valid & u0_mask & v1_mask;
+          const vbool u1v0_mask = valid & u1_mask & v0_mask;
+          const vbool u1v1_mask = valid & u1_mask & v1_mask;
+          if (any(u0v0_mask)) ret |= eval(u0v0_mask,This->child[0],2.0f*u,2.0f*v,2.0f*dscale,depth+1); // u,v are remapped to the child's 0..1 range and dscale doubles per level
+          if (any(u1v0_mask)) ret |= eval(u1v0_mask,This->child[1],2.0f*u-1.0f,2.0f*v,2.0f*dscale,depth+1);
+          if (any(u1v1_mask)) ret |= eval(u1v1_mask,This->child[2],2.0f*u-1.0f,2.0f*v-1.0f,2.0f*dscale,depth+1);
+          if (any(u0v1_mask)) ret |= eval(u0v1_mask,This->child[3],2.0f*u,2.0f*v-1.0f,2.0f*dscale,depth+1);
+          return ret; // union of the lanes the children reported as evaluated
+        }
+        
+        vbool eval_general(const vbool& valid, const typename Patch::SubdividedGeneralPatch* patch, const vfloat& U, const vfloat& V, const size_t depth) // decodes the sub-patch index packed into U,V and evaluates the matching child
+        {
+          vbool ret = false;
+          const vint l = (vint)floor(0.5f*U); const vfloat u = 2.0f*frac(0.5f*U)-0.5f; // l: low part of the child index, u: coordinate inside that child
+          const vint h = (vint)floor(0.5f*V); const vfloat v = 2.0f*frac(0.5f*V)-0.5f; // h: high part of the child index, v: coordinate inside that child
+          const vint i = (h<<2)+l; assert(all(valid,i<patch->N));
+          foreach_unique(valid,i,[&](const vbool& valid, const int i) { // presumably runs once per distinct child index with the lanes using it - TODO confirm
+              ret |= eval(valid,patch->child[i],u,v,1.0f,depth+1);
+            });
+          return ret;
+        }
+        
+        vbool eval(const vbool& valid, Ref This, const vfloat& u, const vfloat& v, const float dscale, const size_t depth) // dispatches on the patch type; returns the mask of lanes that were evaluated
+        {
+          if (!This) return false;
+          switch (This.type()) 
+          {
+          case Patch::BILINEAR_PATCH: {
+            ((typename Patch::BilinearPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); 
+            return valid;
+          }
+          case Patch::BSPLINE_PATCH: {
+            ((typename Patch::BSplinePatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+            return valid;
+          }
+          case Patch::BEZIER_PATCH: {
+            ((typename Patch::BezierPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+            return valid;
+          }
+          case Patch::GREGORY_PATCH: {
+            ((typename Patch::GregoryPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); 
+            return valid;
+          }
+          case Patch::SUBDIVIDED_QUAD_PATCH: {
+            return eval_quad(valid,((typename Patch::SubdividedQuadPatch*)This.object()),u,v,dscale,depth);
+          }
+          case Patch::SUBDIVIDED_GENERAL_PATCH: { 
+            assert(dscale == 1.0f); // a general patch is only expected at the root, before any quadrant scaling
+            return eval_general(valid,((typename Patch::SubdividedGeneralPatch*)This.object()),u,v,depth); 
+          }
+          case Patch::EVAL_PATCH: { 
+            CatmullClarkPatch patch; patch.deserialize(This.object()); // rebuild the Catmull-Clark patch from its stored form and evaluate it directly
+            FeatureAdaptiveEvalSimd<vbool,vint,vfloat,Vertex,Vertex_t>(patch,valid,u,v,dscale,depth,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dstride,N);
+            return valid;
+          }
+          default: 
+            assert(false); 
+            return false;
+          }
+        }
+
+      private:
+        float* const P; // output pointers captured from the constructor and forwarded unchanged to the patch evaluators
+        float* const dPdu;
+        float* const dPdv;
+        float* const ddPdudu;
+        float* const ddPdvdv;
+        float* const ddPdudv;
+        const size_t dstride; // forwarded together with N to every evaluator; presumably the output stride and component count - TODO confirm
+        const size_t N;
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/subdivpatch1base.h b/thirdparty/embree-aarch64/kernels/subdiv/subdivpatch1base.h
new file mode 100644
index 0000000000..d5bc403cca
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/subdivpatch1base.h
@@ -0,0 +1,156 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../geometry/primitive.h"
+#include "bspline_patch.h"
+#include "bezier_patch.h"
+#include "gregory_patch.h"
+#include "gregory_patch_dense.h"
+#include "tessellation.h"
+#include "tessellation_cache.h"
+#include "gridrange.h"
+#include "patch_eval_grid.h"
+#include "feature_adaptive_eval_grid.h"
+#include "../common/scene_subdiv_mesh.h"
+
+namespace embree
+{
+  struct __aligned(64) SubdivPatch1Base // 64-byte aligned record describing one subdivision (sub-)patch and its tessellation grid
+  {
+  public:
+
+    enum Type { // patch kinds; note that the value 4 is not assigned
+      INVALID_PATCH          = 0,
+      BSPLINE_PATCH          = 1,  
+      BEZIER_PATCH           = 2,  
+      GREGORY_PATCH          = 3,
+      EVAL_PATCH             = 5,
+      BILINEAR_PATCH         = 6,
+    };
+
+    enum Flags { // bits stored in the flags member
+      TRANSITION_PATCH       = 16, 
+    };
+
+    /*! Default constructor. */
+    __forceinline SubdivPatch1Base () {} // leaves every member uninitialized
+
+    SubdivPatch1Base (const unsigned int gID, // defined out of line; not visible in this header
+                      const unsigned int pID,
+                      const unsigned int subPatch,
+                      const SubdivMesh *const mesh,
+                      const size_t time,
+                      const Vec2f uv[4],
+                      const float edge_level[4],
+                      const int subdiv[4],
+                      const int simd_width);
+
+    __forceinline bool needsStitching() const { // true when the TRANSITION_PATCH bit is set in flags
+      return flags & TRANSITION_PATCH;      
+    }
+
+    __forceinline Vec2f getUV(const size_t i) const { // converts the 16-bit u[i],v[i] back to float; the factor 8/0x10000 maps the 16-bit range onto [0,8)
+      return Vec2f((float)u[i],(float)v[i]) * (8.0f/0x10000);
+    }
+
+    static void computeEdgeLevels(const float edge_level[4], const int subdiv[4], float level[4]);
+    static Vec2i computeGridSize(const float level[4]);
+    bool updateEdgeLevels(const float edge_level[4], const int subdiv[4], const SubdivMesh *const mesh, const int simd_width);
+
+  public:
+
+    __forceinline size_t getGridBytes() const { // byte size derived from grid_size_simd_blocks
+      const size_t grid_size_xyzuv = (grid_size_simd_blocks * VSIZEX) * 4;
+      return 64*((grid_size_xyzuv+15) / 16); // rounds the element count up to a multiple of 16 and charges 4 bytes per element
+    }
+
+    __forceinline void write_lock()     { mtx.lock();   }
+    __forceinline void write_unlock()   { mtx.unlock(); }
+    __forceinline bool try_write_lock() { return mtx.try_lock(); }
+    //__forceinline bool try_read_lock()  { return mtx.try_read_lock(); }
+
+    __forceinline void resetRootRef() { // replaces root_ref with a default-constructed tag
+      //assert( mtx.hasInitialState() );
+      root_ref = SharedLazyTessellationCache::Tag();
+    }
+
+    __forceinline SharedLazyTessellationCache::CacheEntry& entry() { // reinterprets root_ref (a Tag) as a CacheEntry; relies on the two types being layout compatible - TODO confirm
+      return (SharedLazyTessellationCache::CacheEntry&) root_ref;
+    }
+
+  public:    
+    __forceinline unsigned int geomID() const  {
+      return geom;
+    } 
+
+    __forceinline unsigned int primID() const  {
+      return prim;
+    } 
+
+  public:
+    SharedLazyTessellationCache::Tag root_ref; // cache tag for this patch, see entry() and resetRootRef()
+    SpinLock mtx; // taken by write_lock()/try_write_lock()
+
+    unsigned short u[4];                        //!< 16bit discretized u,v coordinates
+    unsigned short v[4];
+    float level[4];
+
+    unsigned char flags; // combination of Flags bits
+    unsigned char type; // presumably one of the Type values - TODO confirm in the constructor
+    unsigned short grid_u_res;
+    unsigned int geom;                          //!< geometry ID of the subdivision mesh this patch belongs to
+    unsigned int prim;                          //!< primitive ID of this subdivision patch
+    unsigned short grid_v_res;
+
+    unsigned short grid_size_simd_blocks; // input to getGridBytes()
+    unsigned int time_;
+
+    struct PatchHalfEdge { // view that the accessors below lay over the start of patch_v
+      const HalfEdge* edge;
+      unsigned subPatch;
+    };
+
+    Vec3fa patch_v[4][4]; // 4x4 vertex storage; its first bytes are also read and written as a PatchHalfEdge
+
+    const HalfEdge *edge() const { return ((PatchHalfEdge*)patch_v)->edge; }
+    unsigned time() const { return time_; }
+    unsigned subPatch() const { return ((PatchHalfEdge*)patch_v)->subPatch; }
+
+    void set_edge(const HalfEdge *h) const { ((PatchHalfEdge*)patch_v)->edge = h; } // declared const but writes into patch_v through the cast
+    void set_subPatch(const unsigned s) const { ((PatchHalfEdge*)patch_v)->subPatch = s; } // declared const but writes into patch_v through the cast
+  };
+
+  namespace isa
+  {
+    Vec3fa patchEval(const SubdivPatch1Base& patch, const float uu, const float vv); // scalar evaluation at (uu,vv); declaration only, defined elsewhere
+    Vec3fa patchNormal(const SubdivPatch1Base& patch, const float uu, const float vv); // scalar normal at (uu,vv); declaration only
+    
+    template<typename simdf>
+      Vec3<simdf> patchEval(const SubdivPatch1Base& patch, const simdf& uu, const simdf& vv); // vector-typed overload of patchEval
+
+    template<typename simdf>
+      Vec3<simdf> patchNormal(const SubdivPatch1Base& patch, const simdf& uu, const simdf& vv); // vector-typed overload of patchNormal
+   
+
+    /* eval grid over patch and stitch edges when required */      
+    void evalGrid(const SubdivPatch1Base& patch,
+                  const unsigned x0, const unsigned x1,
+                  const unsigned y0, const unsigned y1,
+                  const unsigned swidth, const unsigned sheight,
+                  float *__restrict__ const grid_x,
+                  float *__restrict__ const grid_y,
+                  float *__restrict__ const grid_z,
+                  float *__restrict__ const grid_u,
+                  float *__restrict__ const grid_v,
+                  const SubdivMesh* const geom);
+
+    /* returns a bounding box for the same grid window; presumably stitches edges like evalGrid (definition not visible here) */      
+    BBox3fa evalGridBounds(const SubdivPatch1Base& patch,
+                           const unsigned x0, const unsigned x1,
+                           const unsigned y0, const unsigned y1,
+                           const unsigned swidth, const unsigned sheight,
+                           const SubdivMesh* const geom);
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/tessellation.h b/thirdparty/embree-aarch64/kernels/subdiv/tessellation.h
new file mode 100644
index 0000000000..bda1e2d559
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/tessellation.h
@@ -0,0 +1,161 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* round tessellation level l up to a multiple of 2^sublevel (used for feature-adaptive pre-subdivision) */
+  __forceinline float adjustTessellationLevel(float l, const size_t sublevel)
+  {
+    for (size_t n=sublevel; n>0; n--) l *= 0.5f;
+    float rounded = ceilf(l);
+    for (size_t n=sublevel; n>0; n--) rounded *= 2.0f;
+    return rounded;
+  }
+  
+  __forceinline int stitch(const int x, const int fine, const int coarse) { // maps index x on a grid with `fine` segments to an index on a grid with `coarse` segments
+    return (2*x+1)*coarse/(2*fine); // integer form of floor((x+0.5)*coarse/fine); the multiplication happens before the division on purpose
+  }
+
+  __forceinline void stitchGridEdges(const unsigned int low_rate, // number of sample points on the coarser edge
+                                     const unsigned int high_rate, // number of sample points of the grid along the same edge
+                                     const unsigned int x0, // first grid index to rewrite (inclusive)
+                                     const unsigned int x1, // last grid index to rewrite (inclusive)
+				    float * __restrict__ const uv_array, // entry for index x0; later entries are uv_array_step apart
+				    const unsigned int uv_array_step)
+  {
+#if 1 // active implementation; the #else branch below is never compiled
+    const float inv_low_rate = rcp((float)(low_rate-1)); // NOTE(review): low_rate == 1 would take the reciprocal of 0 - callers presumably guarantee low_rate >= 2
+    for (unsigned x=x0; x<=x1; x++) {
+      uv_array[(x-x0)*uv_array_step] = float(stitch(x,high_rate-1,low_rate-1))*inv_low_rate; // snap every fine sample to the coordinate of a coarse sample
+    }
+    if (unlikely(x1 == high_rate-1))
+      uv_array[(x1-x0)*uv_array_step] = 1.0f; // the final sample of the edge is forced to exactly 1
+#else
+    assert(low_rate < high_rate);
+    assert(high_rate >= 2);
+    
+    const float inv_low_rate = rcp((float)(low_rate-1));
+    const unsigned int dy = low_rate  - 1; 
+    const unsigned int dx = high_rate - 1;
+    
+    int p = 2*dy-dx;  
+    
+    unsigned int offset = 0;
+    unsigned int y = 0;
+    float value = 0.0f;
+    for(unsigned int x=0;x<high_rate-1; x++) // '<=' would be correct but we will leave the 1.0f at the end
+    {
+      uv_array[offset] = value;
+      
+      offset += uv_array_step;      
+      if (unlikely(p > 0))
+      {
+	y++;
+	value = (float)y * inv_low_rate;
+	p -= 2*dx;
+      }
+      p += 2*dy;
+    }
+#endif
+  }
+  
+  __forceinline void stitchUVGrid(const float edge_levels[4], // per-edge levels; edge i is sampled with (unsigned)edge_levels[i]+1 points
+                                  const unsigned int swidth,
+                                  const unsigned int sheight,
+                                  const unsigned int x0,
+                                  const unsigned int y0,
+				  const unsigned int grid_u_res,
+				  const unsigned int grid_v_res,
+				  float * __restrict__ const u_array, // grid_u_res x grid_v_res values, row pitch grid_u_res
+				  float * __restrict__ const v_array)
+  {
+    const unsigned int x1 = x0+grid_u_res-1; // last column and row covered by this grid window (inclusive)
+    const unsigned int y1 = y0+grid_v_res-1;
+    const unsigned int int_edge_points0 = (unsigned int)edge_levels[0] + 1;
+    const unsigned int int_edge_points1 = (unsigned int)edge_levels[1] + 1;
+    const unsigned int int_edge_points2 = (unsigned int)edge_levels[2] + 1;
+    const unsigned int int_edge_points3 = (unsigned int)edge_levels[3] + 1;
+    
+    if (unlikely(y0 == 0 && int_edge_points0 < swidth)) // edge 0: first row, only when it has fewer points than the grid
+      stitchGridEdges(int_edge_points0,swidth,x0,x1,u_array,1);
+    
+    if (unlikely(y1 == sheight-1 && int_edge_points2 < swidth)) // edge 2: last row
+      stitchGridEdges(int_edge_points2,swidth,x0,x1,&u_array[(grid_v_res-1)*grid_u_res],1);
+    
+    if (unlikely(x0 == 0 && int_edge_points1 < sheight)) // NOTE(review): gated on the first column (x0 == 0) but writes the window's last column - gate looks swapped with the next one; confirm for sub-range windows
+      stitchGridEdges(int_edge_points1,sheight,y0,y1,&v_array[grid_u_res-1],grid_u_res);
+    
+    if (unlikely(x1 == swidth-1 && int_edge_points3 < sheight)) // NOTE(review): gated on the last column (x1 == swidth-1) but writes the window's first column - see the note above
+      stitchGridEdges(int_edge_points3,sheight,y0,y1,v_array,grid_u_res);  
+  }
+  
+  __forceinline void gridUVTessellator(const float edge_levels[4],  // only checked by the assertions below
+                                       const unsigned int swidth,
+                                       const unsigned int sheight,
+                                       const unsigned int x0,
+                                       const unsigned int y0,
+				       const unsigned int grid_u_res,
+				       const unsigned int grid_v_res,
+				       float * __restrict__ const u_array, // receives grid_v_res rows of u values, row pitch grid_u_res
+				       float * __restrict__ const v_array) // receives grid_v_res rows of v values, row pitch grid_u_res
+  {
+    assert( grid_u_res >= 1);
+    assert( grid_v_res >= 1);
+    assert( edge_levels[0] >= 1.0f );
+    assert( edge_levels[1] >= 1.0f );
+    assert( edge_levels[2] >= 1.0f );
+    assert( edge_levels[3] >= 1.0f );
+    
+#if defined(__AVX__) // 8-wide variant; the #else branch is the same algorithm with 4-wide vectors
+    const vint8 grid_u_segments = vint8(swidth)-1;
+    const vint8 grid_v_segments = vint8(sheight)-1;
+    
+    const vfloat8 inv_grid_u_segments = rcp(vfloat8(grid_u_segments));
+    const vfloat8 inv_grid_v_segments = rcp(vfloat8(grid_v_segments));
+    
+    unsigned int index = 0; // offset of the current row in both output arrays
+    vint8 v_i( zero );
+    for (unsigned int y=0;y<grid_v_res;y++,index+=grid_u_res,v_i += 1)
+    {
+      vint8 u_i ( step ); // presumably the lane indices 0,1,2,... - TODO confirm
+      
+      const vbool8 m_v = v_i < grid_v_segments; // lanes that fail this test receive exactly 1.0f
+      
+      for (unsigned int x=0;x<grid_u_res;x+=8, u_i += 8)
+      {
+        const vbool8 m_u = u_i < grid_u_segments;
+	const vfloat8 u = select(m_u, vfloat8(x0+u_i) * inv_grid_u_segments, 1.0f); // global index (x0 + local index) divided by the segment count
+	const vfloat8 v = select(m_v, vfloat8(y0+v_i) * inv_grid_v_segments, 1.0f);
+	vfloat8::storeu(&u_array[index + x],u); // always stores 8 floats, so the last store of a row can run past grid_u_res; the arrays need room for that
+	vfloat8::storeu(&v_array[index + x],v);	   
+      }
+    }       
+ #else   
+    const vint4 grid_u_segments = vint4(swidth)-1;
+    const vint4 grid_v_segments = vint4(sheight)-1;
+    
+    const vfloat4 inv_grid_u_segments = rcp(vfloat4(grid_u_segments));
+    const vfloat4 inv_grid_v_segments = rcp(vfloat4(grid_v_segments));
+    
+    unsigned int index = 0; // offset of the current row in both output arrays
+    vint4 v_i( zero );
+    for (unsigned int y=0;y<grid_v_res;y++,index+=grid_u_res,v_i += 1)
+    {
+      vint4 u_i ( step ); // presumably the lane indices 0,1,2,3 - TODO confirm
+      
+      const vbool4 m_v = v_i < grid_v_segments; // lanes that fail this test receive exactly 1.0f
+      
+      for (unsigned int x=0;x<grid_u_res;x+=4, u_i += 4)
+      {
+        const vbool4 m_u = u_i < grid_u_segments;
+	const vfloat4 u = select(m_u, vfloat4(x0+u_i) * inv_grid_u_segments, 1.0f); // global index (x0 + local index) divided by the segment count
+	const vfloat4 v = select(m_v, vfloat4(y0+v_i) * inv_grid_v_segments, 1.0f);
+        vfloat4::storeu(&u_array[index + x],u); // always stores 4 floats, so the last store of a row can run past grid_u_res; the arrays need room for that
+	vfloat4::storeu(&v_array[index + x],v);	   
+      }
+    }       
+#endif
+  } 
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/tessellation_cache.h b/thirdparty/embree-aarch64/kernels/subdiv/tessellation_cache.h
new file mode 100644
index 0000000000..5c215288b6
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/tessellation_cache.h
@@ -0,0 +1,325 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+
+/* force a complete cache invalidation when running out of allocation space; when 1, validCacheIndex() accepts only an exact time match */
+#define FORCE_SIMPLE_FLUSH 0
+/* lockThreadLoop() treats a previous per-thread counter value >= this as a failed lock and waits */
+#define THREAD_BLOCK_ATOMIC_ADD 4
+/* statistics hook: both branches below expand to nothing, so CACHE_STATS(...) statements are compiled out */
+#if defined(DEBUG)
+#define CACHE_STATS(x) 
+#else
+#define CACHE_STATS(x) 
+#endif
+
+namespace embree
+{
+  class SharedTessellationCacheStats // static counters describing tessellation cache usage
+  {
+  public:
+    /* stats: accesses/hits/misses are the counters named in the CACHE_STATS(...) statements of SharedLazyTessellationCache::lookup() */
+    static std::atomic<size_t> cache_accesses;
+    static std::atomic<size_t> cache_hits;
+    static std::atomic<size_t> cache_misses;
+    static std::atomic<size_t> cache_flushes;                
+    static size_t        cache_num_patches; // NOTE(review): not atomic, unlike the counters above; presumably guarded by mtx -- confirm
+    __aligned(64) static SpinLock mtx;
+    
+    /* print stats for debugging (both functions are only declared here) */                 
+    static void printStats();
+    static void clearStats();
+  };
+  
+  void resizeTessellationCache(size_t new_size);
+  void resetTessellationCache();
+  
+ ////////////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////////////
+
+ struct __aligned(64) ThreadWorkState 
+ {
+   ALIGNED_STRUCT_(64);
+   /* per-thread state operated on by SharedLazyTessellationCache::lockThread()/unlockThread()/isLocked() */
+   std::atomic<size_t> counter; // lock counter: lockThread() adds to it, unlockThread() subtracts; non-zero means locked
+   ThreadWorkState* next; // presumably links the states into a list guarded by linkedlist_mtx -- TODO confirm
+   bool allocated; // NOTE(review): meaning not visible in this header; presumably marks individually allocated states -- confirm
+   /* starts unlocked and unlinked; the assert checks the 64-byte alignment requested above */
+   __forceinline ThreadWorkState(bool allocated = false) 
+     : counter(0), next(nullptr), allocated(allocated) 
+   {
+     assert( ((size_t)this % 64) == 0 ); 
+   }   
+ };
+
+ class __aligned(64) SharedLazyTessellationCache 
+ {
+ public:
+   
+   static const size_t NUM_CACHE_SEGMENTS              = 8; // multiplier for globalTime in getTime(); width of the validity window in validCacheIndex()
+   static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;
+   static const size_t COMMIT_INDEX_SHIFT              = 32+8; // bit position at which Tag::init() stores the time stamp
+#if defined(__X86_64__) || defined(__aarch64__)
+   static const size_t REF_TAG_MASK                    = 0xffffffffff; // low 40 bits: offset of the referenced data from getDataPtr()
+#else
+   static const size_t REF_TAG_MASK                    = 0x7FFFFFFF; // low 31 bits on all other targets
+#endif
+   static const size_t MAX_TESSELLATION_CACHE_SIZE     = REF_TAG_MASK+1;
+   static const size_t BLOCK_SIZE                      = 64; // bytes per allocation block; malloc() rounds requests up to this
+   
+
+    /*! Per thread tessellation ref cache */
+   static __thread ThreadWorkState* init_t_state; // thread-local: each thread sees its own pointer
+   static ThreadWorkState* current_t_state;
+   /*! returns the calling thread's work state, requesting one from the shared cache on first use */
+   static __forceinline ThreadWorkState *threadState() 
+   {
+     if (unlikely(!init_t_state))
+       /* the call sets init_t_state as a side effect; the pointer is not returned directly due to a macosx icc bug */
+       SharedLazyTessellationCache::sharedLazyTessellationCache.getNextRenderThreadWorkState();
+     return init_t_state;
+   }
+
+   struct Tag
+   {
+     __forceinline Tag() : data(0) {} // empty tag (0 is treated as "no entry" by lookup() and validTag())
+     /* tag referencing ptr, stamped with combinedTime; see init() */
+     __forceinline Tag(void* ptr, size_t combinedTime) { 
+       init(ptr,combinedTime);
+     }
+     /* same as above for an address passed as an integer */
+     __forceinline Tag(size_t ptr, size_t combinedTime) {
+       init((void*)ptr,combinedTime); 
+     }
+     /* encodes ptr as its byte offset from the shared cache's data pointer, OR-ed with combinedTime shifted up by COMMIT_INDEX_SHIFT */
+     __forceinline void init(void* ptr, size_t combinedTime)
+     {
+       if (ptr == nullptr) {
+         data = 0;
+         return;
+       }
+       int64_t new_root_ref = (int64_t) ptr;
+       new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr();                                
+       assert( new_root_ref <= (int64_t)REF_TAG_MASK );
+       new_root_ref |= (int64_t)combinedTime << COMMIT_INDEX_SHIFT; 
+       data = new_root_ref; // NOTE(review): offset 0 with combinedTime 0 also encodes to 0, i.e. the empty tag
+     }
+     /* raw access to the encoded 64-bit value */
+     __forceinline int64_t get() const { return data.load(); }
+     __forceinline void set( int64_t v ) { data.store(v); }
+     __forceinline void reset() { data.store(0); }
+
+   private:
+     atomic<int64_t> data;
+   };
+
+   static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; } // time stamp part of an encoded Tag value (the bits Tag::init() placed at COMMIT_INDEX_SHIFT and above)
+
+   struct CacheEntry // one cache slot as used by the lookup() functions
+   {
+     Tag tag; // encoded reference plus time stamp; read by lookup(), written after construction in lookup<Constructor>()
+     SpinLock mutex; // taken with try_lock() by the thread that constructs the entry in lookup<Constructor>()
+   };
+
+ private:
+
+   float *data; // base address of the cache storage; getBlockPtr() indexes it in steps of 16 floats per block
+   bool hugepages;
+   size_t size; // NOTE(review): unit not visible in this header; getBlockPtr() asserts block_index*16 <= size
+   size_t maxBlocks; // upper bound asserted on block indices in getBlockPtr()
+   ThreadWorkState *threadWorkState;
+      
+   __aligned(64) std::atomic<size_t> localTime; // time component read by getTime(), advanced by addCurrentIndex()
+   __aligned(64) std::atomic<size_t> next_block; // next free block index, advanced by alloc()
+   __aligned(64) SpinLock   reset_state;
+   __aligned(64) SpinLock   linkedlist_mtx;
+   __aligned(64) std::atomic<size_t> switch_block_threshold; // alloc() fails once index+blocks reaches this value
+   __aligned(64) std::atomic<size_t> numRenderThreads;
+
+
+ public:
+
+      
+   SharedLazyTessellationCache();
+   ~SharedLazyTessellationCache();
+   /* called by threadState() when the calling thread has no state yet; per the comment there it sets init_t_state */
+   void getNextRenderThreadWorkState();
+
+   __forceinline size_t maxAllocSize() const { // returns switch_block_threshold, the block-count limit that alloc() enforces
+     return switch_block_threshold;
+   }
+
+   __forceinline size_t getCurrentIndex() { return localTime.load(); } // current value of localTime
+   __forceinline void   addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); } // atomically advances localTime by i
+
+   __forceinline size_t getTime(const size_t globalTime) { // combined time stamp: localTime + NUM_CACHE_SEGMENTS*globalTime
+     return localTime.load()+NUM_CACHE_SEGMENTS*globalTime;
+   }
+
+
+   __forceinline size_t lockThread  (ThreadWorkState *const t_state, const ssize_t plus=1) { return t_state->counter.fetch_add(plus);  }
+   __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); }
+   /* lockThread()/unlockThread() add 'plus' to the thread's counter and return its previous value; a thread counts as locked while the counter is non-zero */
+   __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; }
+
+   static __forceinline void lock  () { sharedLazyTessellationCache.lockThread(threadState()); } // static helpers: apply the member functions to the calling thread's state
+   static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); }
+   static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); }
+   static __forceinline size_t getState() { return threadState()->counter.load(); } // raw lock counter of the calling thread
+   static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); }
+
+   static __forceinline size_t getTCacheTime(const size_t globalTime) { // static shortcut for getTime() on the shared instance
+     return sharedLazyTessellationCache.getTime(globalTime);
+   }
+
+   /* per thread lock: increments the thread's counter and retries until the previous value was below THREAD_BLOCK_ATOMIC_ADD */
+   __forceinline void lockThreadLoop (ThreadWorkState *const t_state) 
+   { 
+     while(1)
+     {
+       size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1);
+       if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD))
+       {
+         /* lock failed: undo the increment and wait until the sync phase is over */
+         sharedLazyTessellationCache.unlockThread(t_state,-1);	       
+         sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0);
+       }
+       else
+         break;
+     }
+   }
+
+   static __forceinline void* lookup(CacheEntry& entry, size_t globalTime)
+   {   // returns the pointer decoded from entry.tag, or nullptr if the tag is empty or its time stamp is stale; takes no lock itself
+     const int64_t subdiv_patch_root_ref = entry.tag.get(); 
+     CACHE_STATS(SharedTessellationCacheStats::cache_accesses++);
+     /* a zero tag means the entry is empty */
+     if (likely(subdiv_patch_root_ref != 0)) 
+     {
+       const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr();
+       const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
+       /* only hand out the pointer while the stored time stamp is still valid for globalTime */
+       if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) ))
+       {
+         CACHE_STATS(SharedTessellationCacheStats::cache_hits++);
+         return (void*) subdiv_patch_root;
+       }
+     }
+     CACHE_STATS(SharedTessellationCacheStats::cache_misses++);
+     return nullptr;
+   }
+
+   template<typename Constructor>
+     static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor())
+   { // returns the cached object for entry, calling constructor() to build it on a miss
+     ThreadWorkState *t_state = SharedLazyTessellationCache::threadState();
+     /* retry until the entry is found valid or this thread constructs it; both return paths leave the thread lock held */
+     while (true)
+     {
+       sharedLazyTessellationCache.lockThreadLoop(t_state);
+       void* patch = SharedLazyTessellationCache::lookup(entry,globalTime);
+       if (patch) return (decltype(constructor())) patch;
+       /* miss: try to become the thread that builds the entry */
+       if (entry.mutex.try_lock())
+       {
+         if (!validTag(entry.tag,globalTime)) // re-check while holding the entry mutex
+         {
+           auto timeBefore = sharedLazyTessellationCache.getTime(globalTime);
+           auto ret = constructor(); // thread is locked here!
+           assert(ret);
+           /* constructor() should never return nullptr */
+           auto timeAfter = sharedLazyTessellationCache.getTime(globalTime);
+           auto time = before ? timeBefore : timeAfter; // 'before' selects the time stamp taken before construction instead of after
+           __memory_barrier();
+           entry.tag = SharedLazyTessellationCache::Tag(ret,time);
+           __memory_barrier();
+           entry.mutex.unlock();
+           return ret;
+         }
+         entry.mutex.unlock();
+       }
+       SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state); // release the thread lock before retrying
+     }
+   }
+   
+   __forceinline bool validCacheIndex(const size_t i, const size_t globalTime)
+   { // tells whether time stamp i is still valid relative to getTime(globalTime)
+#if FORCE_SIMPLE_FLUSH == 1
+     return i == getTime(globalTime); // simple flush: only an exact match is valid
+#else
+     return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime); // valid while the current time is at most NUM_CACHE_SEGMENTS-1 ahead of i
+#endif
+   }
+
+   static __forceinline bool validTime(const size_t oldtime, const size_t newTime)
+   { // same window test as the non-simple-flush branch of validCacheIndex(), on two explicit time stamps
+     return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime;
+   }
+
+
+    static __forceinline bool validTag(const Tag& tag, size_t globalTime)
+    { // true if tag is non-empty and its time stamp passes validCacheIndex() for globalTime
+      const int64_t subdiv_patch_root_ref = tag.get(); 
+      if (subdiv_patch_root_ref == 0) return false; // empty tag
+      const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
+      return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime);
+    }
+
+   void waitForUsersLessEqual(ThreadWorkState *const t_state,
+			      const unsigned int users); // only declared here; lockThreadLoop() calls it with users == 0 after a failed lock
+    
+   __forceinline size_t alloc(const size_t blocks)
+   { // reserves 'blocks' consecutive blocks; returns the first block index, or (size_t)-1 when the threshold would be reached
+     if (unlikely(blocks >= switch_block_threshold))
+       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment");
+     /* bump allocation: next_block is advanced before the check, so it keeps growing even when the request fails */
+     assert(blocks < switch_block_threshold);
+     size_t index = next_block.fetch_add(blocks);
+     if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1;
+     return index;
+   }
+
+   static __forceinline void* malloc(const size_t bytes)
+   { // allocates 'bytes' from the shared cache; the calling thread must hold its thread lock (unlockThread() asserts it)
+     size_t block_index = -1;
+     ThreadWorkState *const t_state = threadState();
+     while (true)
+     {
+       block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE); // round bytes up to whole blocks
+       if (block_index == (size_t)-1)
+       {
+         sharedLazyTessellationCache.unlockThread(t_state);		  
+         sharedLazyTessellationCache.allocNextSegment(); // runs with this thread's lock released
+         sharedLazyTessellationCache.lockThread(t_state);
+         continue; 
+       }
+       break;
+     }
+     return sharedLazyTessellationCache.getBlockPtr(block_index);
+   }
+
+   __forceinline void *getBlockPtr(const size_t block_index)
+   { // address of block block_index inside the cache storage
+     assert(block_index < maxBlocks);
+     assert(data);
+     assert(block_index*16 <= size);
+     return (void*)&data[block_index*16]; // data is float*, so a block spans 16 floats (BLOCK_SIZE bytes if float is 4 bytes)
+   }
+
+   __forceinline void*  getDataPtr()      { return data; } // base address used by Tag::init() and lookup() to encode/decode offsets
+   __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; } // blocks handed out so far, in bytes
+   __forceinline size_t getMaxBlocks()    { return maxBlocks; }
+   __forceinline size_t getSize()         { return size; }
+
+   void allocNextSegment(); // only declared here; malloc() calls it after alloc() returns (size_t)-1
+   void realloc(const size_t newSize);
+
+   void reset();
+
+   static SharedLazyTessellationCache sharedLazyTessellationCache; // the shared instance that all static helpers of this class operate on
+ };
+}
diff --git a/thirdparty/embree-aarch64/patches/godot-changes.patch b/thirdparty/embree-aarch64/patches/godot-changes.patch
new file mode 100644
index 0000000000..86fbf226d2
--- /dev/null
+++ b/thirdparty/embree-aarch64/patches/godot-changes.patch
@@ -0,0 +1,630 @@
+diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for.h
+index 76c6b740aa..51d296fb16 100644
+--- a/thirdparty/embree-aarch64/common/algorithms/parallel_for.h
++++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for.h
+@@ -27,7 +27,10 @@ namespace embree
+           func(r.begin());
+         });
+       if (!TaskScheduler::wait())
+-        throw std::runtime_error("task cancelled");
++        // -- GODOT start --
++        // throw std::runtime_error("task cancelled");
++        abort(); 
++        // -- GODOT end --
+     }
+ #elif defined(TASKING_GCD) && defined(BUILD_IOS)
+       
+@@ -55,13 +58,19 @@ namespace embree
+         func(i);
+       },context);
+     if (context.is_group_execution_cancelled())
+-      throw std::runtime_error("task cancelled");
++      // -- GODOT start --
++      // throw std::runtime_error("task cancelled");
++      abort(); 
++      // -- GODOT end --
+   #else
+     tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+         func(i);
+       });
+     if (tbb::task::self().is_cancelled())
+-      throw std::runtime_error("task cancelled");
++      // -- GODOT start --
++      // throw std::runtime_error("task cancelled");
++      abort(); 
++      // -- GODOT end --
+   #endif
+ 
+ #elif defined(TASKING_PPL)
+@@ -81,7 +90,10 @@ namespace embree
+ #if defined(TASKING_INTERNAL)
+     TaskScheduler::spawn(first,last,minStepSize,func);
+     if (!TaskScheduler::wait())
+-      throw std::runtime_error("task cancelled");
++      // -- GODOT start --
++      // throw std::runtime_error("task cancelled");
++      abort(); 
++      // -- GODOT end --
+ 
+ #elif defined(TASKING_GCD) && defined(BUILD_IOS)
+       
+@@ -109,13 +121,19 @@ namespace embree
+         func(range<Index>(r.begin(),r.end()));
+       },context);
+     if (context.is_group_execution_cancelled())
+-      throw std::runtime_error("task cancelled");
++      // -- GODOT start --
++      // throw std::runtime_error("task cancelled");
++      abort(); 
++      // -- GODOT end --
+   #else
+     tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
+         func(range<Index>(r.begin(),r.end()));
+       });
+     if (tbb::task::self().is_cancelled())
+-      throw std::runtime_error("task cancelled");
++      // -- GODOT start --
++      // throw std::runtime_error("task cancelled");
++      abort(); 
++      // -- GODOT end --
+   #endif
+ 
+ #elif defined(TASKING_PPL)
+@@ -147,13 +165,19 @@ namespace embree
+           func(i);
+         },tbb::simple_partitioner(),context);
+       if (context.is_group_execution_cancelled())
+-        throw std::runtime_error("task cancelled");
++        // -- GODOT start --
++        // throw std::runtime_error("task cancelled");
++        abort(); 
++        // -- GODOT end --
+     #else
+       tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+           func(i);
+         },tbb::simple_partitioner());
+       if (tbb::task::self().is_cancelled())
+-        throw std::runtime_error("task cancelled");
++        // -- GODOT start --
++        // throw std::runtime_error("task cancelled");
++        abort(); 
++        // -- GODOT end --
+     #endif
+   }
+ 
+@@ -168,13 +192,19 @@ namespace embree
+           func(i);
+         },ap,context);
+       if (context.is_group_execution_cancelled())
+-        throw std::runtime_error("task cancelled");
++       // -- GODOT start --
++       // throw std::runtime_error("task cancelled");
++       abort(); 
++       // -- GODOT end --
+     #else
+       tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+           func(i);
+         },ap);
+       if (tbb::task::self().is_cancelled())
+-        throw std::runtime_error("task cancelled");
++        // -- GODOT start --
++        // throw std::runtime_error("task cancelled");
++        abort(); 
++        // -- GODOT end --
+     #endif
+   }
+ 
+diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h
+index d444b6a2e4..0daf94e50e 100644
+--- a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h
++++ b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h
+@@ -58,15 +58,19 @@ namespace embree
+     const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
+       [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
+       reduction,context);
+-    if (context.is_group_execution_cancelled())
+-      throw std::runtime_error("task cancelled");
++    // -- GODOT start --
++    // if (context.is_group_execution_cancelled())
++    //   throw std::runtime_error("task cancelled");
++    // -- GODOT end --
+     return v;
+   #else
+     const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
+       [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
+       reduction);
+-    if (tbb::task::self().is_cancelled())
+-      throw std::runtime_error("task cancelled");
++    // -- GODOT start --
++    // if (tbb::task::self().is_cancelled())
++    //   throw std::runtime_error("task cancelled");
++    // -- GODOT end --
+     return v;
+   #endif
+ #else // TASKING_PPL
+diff --git a/thirdparty/embree-aarch64/common/lexers/stringstream.cpp b/thirdparty/embree-aarch64/common/lexers/stringstream.cpp
+index 7e7b9faef8..98dc80ad59 100644
+--- a/thirdparty/embree-aarch64/common/lexers/stringstream.cpp
++++ b/thirdparty/embree-aarch64/common/lexers/stringstream.cpp
+@@ -39,7 +39,10 @@ namespace embree
+     std::vector<char> str; str.reserve(64);
+     while (cin->peek() != EOF && !isSeparator(cin->peek())) {
+       int c = cin->get();
+-      if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
++      // -- GODOT start --
++      // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
++      if (!isValidChar(c)) abort();
++      // -- GODOT end --
+       str.push_back((char)c);
+     }
+     str.push_back(0);
+diff --git a/thirdparty/embree-aarch64/common/sys/alloc.cpp b/thirdparty/embree-aarch64/common/sys/alloc.cpp
+index 4e8928242e..12f143f131 100644
+--- a/thirdparty/embree-aarch64/common/sys/alloc.cpp
++++ b/thirdparty/embree-aarch64/common/sys/alloc.cpp
+@@ -21,7 +21,10 @@ namespace embree
+     void* ptr = _mm_malloc(size,align);
+ 
+     if (size != 0 && ptr == nullptr)
+-      throw std::bad_alloc();
++      // -- GODOT start --
++      // throw std::bad_alloc();
++      abort(); 
++      // -- GODOT end --
+     
+     return ptr;
+   }
+@@ -128,7 +131,10 @@ namespace embree
+     /* fall back to 4k pages */
+     int flags = MEM_COMMIT | MEM_RESERVE;
+     char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE);
+-    if (ptr == nullptr) throw std::bad_alloc();
++    // -- GODOT start --
++    // if (ptr == nullptr) throw std::bad_alloc();
++    if (ptr == nullptr) abort();
++    // -- GODOT end --
+     hugepages = false;
+     return ptr;
+   }
+@@ -145,7 +151,10 @@ namespace embree
+       return bytesOld;
+ 
+     if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT))
+-      throw std::bad_alloc();
++      // -- GODOT start --
++      // throw std::bad_alloc();
++      abort();
++      // -- GODOT end --
+ 
+     return bytesNew;
+   }
+@@ -156,7 +165,10 @@ namespace embree
+       return;
+ 
+     if (!VirtualFree(ptr,0,MEM_RELEASE))
+-      throw std::bad_alloc();
++      // -- GODOT start --
++      // throw std::bad_alloc();
++      abort();
++      // -- GODOT end --
+   }
+ 
+   void os_advise(void *ptr, size_t bytes)
+@@ -260,7 +272,10 @@ namespace embree
+ 
+     /* fallback to 4k pages */
+     void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+-    if (ptr == MAP_FAILED) throw std::bad_alloc();
++    // -- GODOT start --
++    // if (ptr == MAP_FAILED) throw std::bad_alloc();
++    if (ptr == MAP_FAILED) abort();
++    // -- GODOT end --
+     hugepages = false;
+ 
+     /* advise huge page hint for THP */
+@@ -277,7 +292,10 @@ namespace embree
+       return bytesOld;
+ 
+     if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1)
+-      throw std::bad_alloc();
++      // -- GODOT start --
++      // throw std::bad_alloc();
++      abort();
++      // -- GODOT end --
+ 
+     return bytesNew;
+   }
+@@ -291,7 +309,10 @@ namespace embree
+     const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K;
+     bytes = (bytes+pageSize-1) & ~(pageSize-1);
+     if (munmap(ptr,bytes) == -1)
+-      throw std::bad_alloc();
++      // -- GODOT start --
++      // throw std::bad_alloc();
++      abort();
++      // -- GODOT end --
+   }
+ 
+   /* hint for transparent huge pages (THP) */
+diff --git a/thirdparty/embree-aarch64/common/sys/platform.h b/thirdparty/embree-aarch64/common/sys/platform.h
+index 7914eb7a52..737f14aa6e 100644
+--- a/thirdparty/embree-aarch64/common/sys/platform.h
++++ b/thirdparty/embree-aarch64/common/sys/platform.h
+@@ -174,11 +174,19 @@
+ #define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl
+ 
+ #if defined(DEBUG) // only report file and line in debug mode
++  // -- GODOT start --
++  // #define THROW_RUNTIME_ERROR(str)
++  //   throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+   #define THROW_RUNTIME_ERROR(str) \
+-    throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
++    printf("%s (%d): %s", __FILE__, __LINE__, std::string(str).c_str()), abort();
++  // -- GODOT end --
+ #else
++  // -- GODOT start --
++  // #define THROW_RUNTIME_ERROR(str)
++  //   throw std::runtime_error(str);
+   #define THROW_RUNTIME_ERROR(str) \
+-    throw std::runtime_error(str);
++    abort();
++  // -- GODOT end --
+ #endif
+ 
+ #define FATAL(x)   THROW_RUNTIME_ERROR(x)
+diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp
+index 98d7fb9249..ebf656d1a0 100644
+--- a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp
++++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp
+@@ -48,13 +48,15 @@ namespace embree
+     {
+       Task* prevTask = thread.task;
+       thread.task = this;
+-      try {
+-        if (thread.scheduler->cancellingException == nullptr)
++      // -- GODOT start --
++      // try {
++      // if (thread.scheduler->cancellingException == nullptr)
+           closure->execute();
+-      } catch (...) {
+-        if (thread.scheduler->cancellingException == nullptr)
+-          thread.scheduler->cancellingException = std::current_exception();
+-      }
++      // } catch (...) {
++      //   if (thread.scheduler->cancellingException == nullptr)
++      //     thread.scheduler->cancellingException = std::current_exception();
++      // }
++      // -- GODOT end --
+       thread.task = prevTask;
+       add_dependencies(-1);
+     }
+@@ -297,8 +299,11 @@ namespace embree
+     size_t threadIndex = allocThreadIndex();
+     condition.wait(mutex, [&] () { return hasRootTask.load(); });
+     mutex.unlock();
+-    std::exception_ptr except = thread_loop(threadIndex);
+-    if (except != nullptr) std::rethrow_exception(except);
++    // -- GODOT start --
++    // std::exception_ptr except = thread_loop(threadIndex);
++    // if (except != nullptr) std::rethrow_exception(except);
++    thread_loop(threadIndex);
++    // -- GODOT end --
+   }
+ 
+   void TaskScheduler::reset() {
+@@ -330,7 +335,10 @@ namespace embree
+     return thread->scheduler->cancellingException == nullptr;
+   }
+ 
+-  std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
++// -- GODOT start --
++//   std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
++  void TaskScheduler::thread_loop(size_t threadIndex)
++// -- GODOT end --
+   {
+     /* allocate thread structure */
+     std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
+@@ -353,9 +361,10 @@ namespace embree
+     swapThread(oldThread);
+ 
+     /* remember exception to throw */
+-    std::exception_ptr except = nullptr;
+-    if (cancellingException != nullptr) except = cancellingException;
+-
++    // -- GODOT start --
++    // std::exception_ptr except = nullptr;
++    // if (cancellingException != nullptr) except = cancellingException;
++    // -- GODOT end --
+     /* wait for all threads to terminate */
+     threadCounter--;
+ #if defined(__WIN32__)
+@@ -373,7 +382,10 @@ namespace embree
+           yield();
+ #endif
+ 	}
+-    return except;
++    // -- GODOT start --
++    // return except;
++    return;
++    // -- GODOT end --
+   }
+ 
+   bool TaskScheduler::steal_from_other_threads(Thread& thread)
+diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h
+index c2a9391aea..8bd70b2b8c 100644
+--- a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h
++++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h
+@@ -123,7 +123,10 @@ namespace embree
+       {
+         size_t ofs = bytes + ((align - stackPtr) & (align-1));
+         if (stackPtr + ofs > CLOSURE_STACK_SIZE)
+-          throw std::runtime_error("closure stack overflow");
++          // -- GODOT start --
++          // throw std::runtime_error("closure stack overflow");
++          abort();
++          // -- GODOT end --
+         stackPtr += ofs;
+         return &stack[stackPtr-bytes];
+       }
+@@ -132,7 +135,10 @@ namespace embree
+       __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure)
+       {
+         if (right >= TASK_STACK_SIZE)
+-          throw std::runtime_error("task stack overflow");
++          // -- GODOT start --
++          // throw std::runtime_error("task stack overflow");
++          abort();
++          // -- GODOT end --
+ 
+ 	/* allocate new task on right side of stack */
+         size_t oldStackPtr = stackPtr;
+@@ -239,7 +245,10 @@ namespace embree
+     void wait_for_threads(size_t threadCount);
+ 
+     /*! thread loop for all worker threads */
+-    std::exception_ptr thread_loop(size_t threadIndex);
++    // -- GODOT start --
++    // std::exception_ptr thread_loop(size_t threadIndex);
++    void thread_loop(size_t threadIndex);
++    // -- GODOT end --
+ 
+     /*! steals a task from a different thread */
+     bool steal_from_other_threads(Thread& thread);
+diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp
+index 20cdd2d320..aa56035026 100644
+--- a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp
++++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp
+@@ -150,7 +150,10 @@ namespace embree
+       }
+     }
+     else {
+-      throw std::runtime_error("not supported node type in bvh_statistics");
++      // -- GODOT start --
++      // throw std::runtime_error("not supported node type in bvh_statistics");
++      abort();
++      // -- GODOT end --
+     }
+     return s;
+   } 
+diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.cpp b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp
+index ee5c37b238..625fbf6d4f 100644
+--- a/thirdparty/embree-aarch64/kernels/common/rtcore.cpp
++++ b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp
+@@ -230,7 +230,10 @@ RTC_NAMESPACE_BEGIN;
+     if (quality != RTC_BUILD_QUALITY_LOW &&
+         quality != RTC_BUILD_QUALITY_MEDIUM &&
+         quality != RTC_BUILD_QUALITY_HIGH)
+-      throw std::runtime_error("invalid build quality");
++      // -- GODOT start --
++      // throw std::runtime_error("invalid build quality");
++      abort();
++      // -- GODOT end --
+     scene->setBuildQuality(quality);
+     RTC_CATCH_END2(scene);
+   }
+@@ -1383,7 +1386,10 @@ RTC_NAMESPACE_BEGIN;
+         quality != RTC_BUILD_QUALITY_MEDIUM &&
+         quality != RTC_BUILD_QUALITY_HIGH &&
+         quality != RTC_BUILD_QUALITY_REFIT)
+-      throw std::runtime_error("invalid build quality");
++      // -- GODOT start --
++      // throw std::runtime_error("invalid build quality");
++      abort();
++      // -- GODOT end --
+     geometry->setBuildQuality(quality);
+     RTC_CATCH_END2(geometry);
+   }
+diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.h b/thirdparty/embree-aarch64/kernels/common/rtcore.h
+index 6583d12d57..4b070e122b 100644
+--- a/thirdparty/embree-aarch64/kernels/common/rtcore.h
++++ b/thirdparty/embree-aarch64/kernels/common/rtcore.h
+@@ -25,52 +25,58 @@ namespace embree
+ #endif
+ 
+ /*! Macros used in the rtcore API implementation */
+-#define RTC_CATCH_BEGIN try {
++// -- GODOT start --
++// #define RTC_CATCH_BEGIN try {
++#define RTC_CATCH_BEGIN
+   
+-#define RTC_CATCH_END(device)                                                \
+-  } catch (std::bad_alloc&) {                                                   \
+-    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+-  } catch (rtcore_error& e) {                                                   \
+-    Device::process_error(device,e.error,e.what());                             \
+-  } catch (std::exception& e) {                                                 \
+-    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+-  } catch (...) {                                                               \
+-    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+-  }
++// #define RTC_CATCH_END(device)                                                \
++//   } catch (std::bad_alloc&) {                                                   \
++//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
++//   } catch (rtcore_error& e) {                                                   \
++//     Device::process_error(device,e.error,e.what());                             \
++//   } catch (std::exception& e) {                                                 \
++//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
++//   } catch (...) {                                                               \
++//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
++//   }
++#define RTC_CATCH_END(device)
+   
+-#define RTC_CATCH_END2(scene)                                                \
+-  } catch (std::bad_alloc&) {                                                   \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+-  } catch (rtcore_error& e) {                                                   \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,e.error,e.what());                             \
+-  } catch (std::exception& e) {                                                 \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+-  } catch (...) {                                                               \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+-  }
++// #define RTC_CATCH_END2(scene)                                                \
++//   } catch (std::bad_alloc&) {                                                   \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
++//   } catch (rtcore_error& e) {                                                   \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,e.error,e.what());                             \
++//   } catch (std::exception& e) {                                                 \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
++//   } catch (...) {                                                               \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
++//   }
++#define RTC_CATCH_END2(scene)
+ 
+-#define RTC_CATCH_END2_FALSE(scene)                                             \
+-  } catch (std::bad_alloc&) {                                                   \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+-    return false;                                                               \
+-  } catch (rtcore_error& e) {                                                   \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,e.error,e.what());                             \
+-    return false;                                                               \
+-  } catch (std::exception& e) {                                                 \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+-    return false;                                                               \
+-  } catch (...) {                                                               \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+-    return false;                                                               \
+-  }
++// #define RTC_CATCH_END2_FALSE(scene)                                             \
++//   } catch (std::bad_alloc&) {                                                   \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
++//     return false;                                                               \
++//   } catch (rtcore_error& e) {                                                   \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,e.error,e.what());                             \
++//     return false;                                                               \
++//   } catch (std::exception& e) {                                                 \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
++//     return false;                                                               \
++//   } catch (...) {                                                               \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
++//     return false;                                                               \
++//   }
++#define RTC_CATCH_END2_FALSE(scene) return false;
++// -- GODOT end --
+ 
+ #define RTC_VERIFY_HANDLE(handle)                               \
+   if (handle == nullptr) {                                         \
+@@ -97,28 +103,38 @@ namespace embree
+ #define RTC_TRACE(x) 
+ #endif
+ 
+-  /*! used to throw embree API errors */
+-  struct rtcore_error : public std::exception
+-  {
+-    __forceinline rtcore_error(RTCError error, const std::string& str)
+-      : error(error), str(str) {}
+-    
+-    ~rtcore_error() throw() {}
+-    
+-    const char* what () const throw () {
+-      return str.c_str();
+-    }
+-    
+-    RTCError error;
+-    std::string str;
+-  };
++// -- GODOT begin --
++//   /*! used to throw embree API errors */
++//   struct rtcore_error : public std::exception
++//   {
++//     __forceinline rtcore_error(RTCError error, const std::string& str)
++//       : error(error), str(str) {}
++//     
++//     ~rtcore_error() throw() {}
++//     
++//     const char* what () const throw () {
++//       return str.c_str();
++//     }
++//     
++//     RTCError error;
++//     std::string str;
++//   };
++// -- GODOT end --
+ 
+ #if defined(DEBUG) // only report file and line in debug mode
++  // -- GODOT begin --
++  // #define throw_RTCError(error,str) \
++  //   throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+   #define throw_RTCError(error,str) \
+-    throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
++    printf("%s (%d): %s", __FILE__, __LINE__, std::string(str).c_str()), abort();
++  // -- GODOT end --
+ #else
++  // -- GODOT begin --
++  // #define throw_RTCError(error,str) \
++  //   throw rtcore_error(error,str);
+   #define throw_RTCError(error,str) \
+-    throw rtcore_error(error,str);
++    abort();
++  // -- GODOT end --
+ #endif
+ 
+ #define RTC_BUILD_ARGUMENTS_HAS(settings,member) \
+diff --git a/thirdparty/embree-aarch64/kernels/common/scene.cpp b/thirdparty/embree-aarch64/kernels/common/scene.cpp
+index e75aa968f9..1e23aeb415 100644
+--- a/thirdparty/embree-aarch64/kernels/common/scene.cpp
++++ b/thirdparty/embree-aarch64/kernels/common/scene.cpp
+@@ -800,16 +800,18 @@ namespace embree
+     }
+ 
+     /* initiate build */
+-    try {
++    // -- GODOT start --
++    // try {
+       scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join);
+-    }
+-    catch (...) {
+-      accels_clear();
+-      updateInterface();
+-      Lock<MutexSys> lock(schedulerMutex);
+-      this->scheduler = nullptr;
+-      throw;
+-    }
++    // }
++    // catch (...) {
++    //   accels_clear();
++    //   updateInterface();
++    //   Lock<MutexSys> lock(schedulerMutex);
++    //   this->scheduler = nullptr;
++    //   throw;
++    // }
++    // -- GODOT end --
+   }
+ 
+ #endif
diff --git a/thirdparty/enet/godot.cpp b/thirdparty/enet/godot.cpp
index 6971ece8d3..189de6cc1f 100644
--- a/thirdparty/enet/godot.cpp
+++ b/thirdparty/enet/godot.cpp
@@ -228,7 +228,7 @@ public:
 		ERR_FAIL_COND_V(err != OK, err);
 		ERR_FAIL_COND_V(p_len < r_read, ERR_OUT_OF_MEMORY);
 
-		copymem(p_buffer, buffer, r_read);
+		memcpy(p_buffer, buffer, r_read);
 		r_ip = udp->get_packet_address();
 		r_port = udp->get_packet_port();
 		return err;
@@ -345,7 +345,7 @@ public:
 				Vector<String> s = E->key().rsplit(":", false, 1);
 				ERR_CONTINUE(s.size() != 2); // BUG!
 
-				copymem(p_buffer, buffer, r_read);
+				memcpy(p_buffer, buffer, r_read);
 				r_ip = s[0];
 				r_port = s[1].to_int();
 				break; // err = OK