245 files changed, 11328 insertions, 8057 deletions
diff --git a/core/math/color.h b/core/math/color.h
index a23a4953ce..5630539aa7 100644
--- a/core/math/color.h
+++ b/core/math/color.h
@@ -105,12 +105,10 @@ struct _NO_DISCARD_ Color {
 
 	_FORCE_INLINE_ Color lerp(const Color &p_to, float p_weight) const {
 		Color res = *this;
-
-		res.r += (p_weight * (p_to.r - r));
-		res.g += (p_weight * (p_to.g - g));
-		res.b += (p_weight * (p_to.b - b));
-		res.a += (p_weight * (p_to.a - a));
-
+		res.r = Math::lerp(res.r, p_to.r, p_weight);
+		res.g = Math::lerp(res.g, p_to.g, p_weight);
+		res.b = Math::lerp(res.b, p_to.b, p_weight);
+		res.a = Math::lerp(res.a, p_to.a, p_weight);
 		return res;
 	}
 
diff --git a/core/math/vector2.h b/core/math/vector2.h
index 1266561a81..835c3d1ba6 100644
--- a/core/math/vector2.h
+++ b/core/math/vector2.h
@@ -243,10 +243,8 @@ _FORCE_INLINE_ bool Vector2::operator!=(const Vector2 &p_vec2) const {
 
 Vector2 Vector2::lerp(const Vector2 &p_to, const real_t p_weight) const {
 	Vector2 res = *this;
-
-	res.x += (p_weight * (p_to.x - x));
-	res.y += (p_weight * (p_to.y - y));
-
+	res.x = Math::lerp(res.x, p_to.x, p_weight);
+	res.y = Math::lerp(res.y, p_to.y, p_weight);
 	return res;
 }
 
@@ -279,27 +277,16 @@ Vector2 Vector2::cubic_interpolate_in_time(const Vector2 &p_b, const Vector2 &p_
 
 Vector2 Vector2::bezier_interpolate(const Vector2 &p_control_1, const Vector2 &p_control_2, const Vector2 &p_end, const real_t p_t) const {
 	Vector2 res = *this;
-
-	/* Formula from Wikipedia article on Bezier curves. */
-	real_t omt = (1.0 - p_t);
-	real_t omt2 = omt * omt;
-	real_t omt3 = omt2 * omt;
-	real_t t2 = p_t * p_t;
-	real_t t3 = t2 * p_t;
-
-	return res * omt3 + p_control_1 * omt2 * p_t * 3.0 + p_control_2 * omt * t2 * 3.0 + p_end * t3;
+	res.x = Math::bezier_interpolate(res.x, p_control_1.x, p_control_2.x, p_end.x, p_t);
+	res.y = Math::bezier_interpolate(res.y, p_control_1.y, p_control_2.y, p_end.y, p_t);
+	return res;
 }
 
 Vector2 Vector2::bezier_derivative(const Vector2 &p_control_1, const Vector2 &p_control_2, const Vector2 &p_end, const real_t p_t) const {
 	Vector2 res = *this;
-
-	/* Formula from Wikipedia article on Bezier curves. */
-	real_t omt = (1.0 - p_t);
-	real_t omt2 = omt * omt;
-	real_t t2 = p_t * p_t;
-
-	Vector2 d = (p_control_1 - res) * 3.0 * omt2 + (p_control_2 - p_control_1) * 6.0 * omt * p_t + (p_end - p_control_2) * 3.0 * t2;
-	return d;
+	res.x = Math::bezier_derivative(res.x, p_control_1.x, p_control_2.x, p_end.x, p_t);
+	res.y = Math::bezier_derivative(res.y, p_control_1.y, p_control_2.y, p_end.y, p_t);
+	return res;
 }
 
 Vector2 Vector2::direction_to(const Vector2 &p_to) const {
diff --git a/core/math/vector2i.cpp b/core/math/vector2i.cpp
index dfed42e4d6..ff8693ee5b 100644
--- a/core/math/vector2i.cpp
+++ b/core/math/vector2i.cpp
@@ -39,6 +39,12 @@ Vector2i Vector2i::clamp(const Vector2i &p_min, const Vector2i &p_max) const {
 			CLAMP(y, p_min.y, p_max.y));
 }
 
+Vector2i Vector2i::snapped(const Vector2i &p_step) const {
+	return Vector2i(
+			Math::snapped(x, p_step.x),
+			Math::snapped(y, p_step.y));
+}
+
 int64_t Vector2i::length_squared() const {
 	return x * (int64_t)x + y * (int64_t)y;
 }
diff --git a/core/math/vector2i.h b/core/math/vector2i.h
index e131bdea94..927be11030 100644
--- a/core/math/vector2i.h
+++ b/core/math/vector2i.h
@@ -119,6 +119,7 @@ struct _NO_DISCARD_ Vector2i {
 	Vector2i sign() const { return Vector2i(SIGN(x), SIGN(y)); }
 	Vector2i abs() const { return Vector2i(Math::abs(x), Math::abs(y)); }
 	Vector2i clamp(const Vector2i &p_min, const Vector2i &p_max) const;
+	Vector2i snapped(const Vector2i &p_step) const;
 
 	operator String() const;
 	operator Vector2() const;
diff --git a/core/math/vector3.h b/core/math/vector3.h
index f5fe76a92c..dc74096690 100644
--- a/core/math/vector3.h
+++ b/core/math/vector3.h
@@ -209,10 +209,11 @@ Vector3 Vector3::round() const {
 }
 
 Vector3 Vector3::lerp(const Vector3 &p_to, const real_t p_weight) const {
-	return Vector3(
-			x + (p_weight * (p_to.x - x)),
-			y + (p_weight * (p_to.y - y)),
-			z + (p_weight * (p_to.z - z)));
+	Vector3 res = *this;
+	res.x = Math::lerp(res.x, p_to.x, p_weight);
+	res.y = Math::lerp(res.y, p_to.y, p_weight);
+	res.z = Math::lerp(res.z, p_to.z, p_weight);
+	return res;
 }
 
 Vector3 Vector3::slerp(const Vector3 &p_to, const real_t p_weight) const {
@@ -255,27 +256,18 @@ Vector3 Vector3::cubic_interpolate_in_time(const Vector3 &p_b, const Vector3 &p_
 
 Vector3 Vector3::bezier_interpolate(const Vector3 &p_control_1, const Vector3 &p_control_2, const Vector3 &p_end, const real_t p_t) const {
 	Vector3 res = *this;
-
-	/* Formula from Wikipedia article on Bezier curves. */
-	real_t omt = (1.0 - p_t);
-	real_t omt2 = omt * omt;
-	real_t omt3 = omt2 * omt;
-	real_t t2 = p_t * p_t;
-	real_t t3 = t2 * p_t;
-
-	return res * omt3 + p_control_1 * omt2 * p_t * 3.0 + p_control_2 * omt * t2 * 3.0 + p_end * t3;
+	res.x = Math::bezier_interpolate(res.x, p_control_1.x, p_control_2.x, p_end.x, p_t);
+	res.y = Math::bezier_interpolate(res.y, p_control_1.y, p_control_2.y, p_end.y, p_t);
+	res.z = Math::bezier_interpolate(res.z, p_control_1.z, p_control_2.z, p_end.z, p_t);
+	return res;
 }
 
 Vector3 Vector3::bezier_derivative(const Vector3 &p_control_1, const Vector3 &p_control_2, const Vector3 &p_end, const real_t p_t) const {
 	Vector3 res = *this;
-
-	/* Formula from Wikipedia article on Bezier curves. */
-	real_t omt = (1.0 - p_t);
-	real_t omt2 = omt * omt;
-	real_t t2 = p_t * p_t;
-
-	Vector3 d = (p_control_1 - res) * 3.0 * omt2 + (p_control_2 - p_control_1) * 6.0 * omt * p_t + (p_end - p_control_2) * 3.0 * t2;
-	return d;
+	res.x = Math::bezier_derivative(res.x, p_control_1.x, p_control_2.x, p_end.x, p_t);
+	res.y = Math::bezier_derivative(res.y, p_control_1.y, p_control_2.y, p_end.y, p_t);
+	res.z = Math::bezier_derivative(res.z, p_control_1.z, p_control_2.z, p_end.z, p_t);
+	return res;
 }
 
 real_t Vector3::distance_to(const Vector3 &p_to) const {
diff --git a/core/math/vector3i.cpp b/core/math/vector3i.cpp
index b248f35035..901f2b5a64 100644
--- a/core/math/vector3i.cpp
+++ b/core/math/vector3i.cpp
@@ -48,6 +48,13 @@ Vector3i Vector3i::clamp(const Vector3i &p_min, const Vector3i &p_max) const {
 			CLAMP(z, p_min.z, p_max.z));
 }
 
+Vector3i Vector3i::snapped(const Vector3i &p_step) const {
+	return Vector3i(
+			Math::snapped(x, p_step.x),
+			Math::snapped(y, p_step.y),
+			Math::snapped(z, p_step.z));
+}
+
 Vector3i::operator String() const {
 	return "(" + itos(x) + ", " + itos(y) + ", " + itos(z) + ")";
 }
diff --git a/core/math/vector3i.h b/core/math/vector3i.h
index 710fd96376..36bac3d8ae 100644
--- a/core/math/vector3i.h
+++ b/core/math/vector3i.h
@@ -77,6 +77,7 @@ struct _NO_DISCARD_ Vector3i {
 	_FORCE_INLINE_ Vector3i abs() const;
 	_FORCE_INLINE_ Vector3i sign() const;
 	Vector3i clamp(const Vector3i &p_min, const Vector3i &p_max) const;
+	Vector3i snapped(const Vector3i &p_step) const;
 
 	/* Operators */
 
diff --git a/core/math/vector4.cpp b/core/math/vector4.cpp
index 3b189f7ed4..5ddf2bb6f6 100644
--- a/core/math/vector4.cpp
+++ b/core/math/vector4.cpp
@@ -130,11 +130,12 @@ Vector4 Vector4::round() const {
 }
 
 Vector4 Vector4::lerp(const Vector4 &p_to, const real_t p_weight) const {
-	return Vector4(
-			x + (p_weight * (p_to.x - x)),
-			y + (p_weight * (p_to.y - y)),
-			z + (p_weight * (p_to.z - z)),
-			w + (p_weight * (p_to.w - w)));
+	Vector4 res = *this;
+	res.x = Math::lerp(res.x, p_to.x, p_weight);
+	res.y = Math::lerp(res.y, p_to.y, p_weight);
+	res.z = Math::lerp(res.z, p_to.z, p_weight);
+	res.w = Math::lerp(res.w, p_to.w, p_weight);
+	return res;
 }
 
 Vector4 Vector4::cubic_interpolate(const Vector4 &p_b, const Vector4 &p_pre_a, const Vector4 &p_post_b, const real_t p_weight) const {
diff --git a/core/math/vector4i.cpp b/core/math/vector4i.cpp
index 77f6fbd5b7..e906ab45ad 100644
--- a/core/math/vector4i.cpp
+++ b/core/math/vector4i.cpp
@@ -65,6 +65,14 @@ Vector4i Vector4i::clamp(const Vector4i &p_min, const Vector4i &p_max) const {
 			CLAMP(w, p_min.w, p_max.w));
 }
 
+Vector4i Vector4i::snapped(const Vector4i &p_step) const {
+	return Vector4i(
+			Math::snapped(x, p_step.x),
+			Math::snapped(y, p_step.y),
+			Math::snapped(z, p_step.z),
+			Math::snapped(w, p_step.w));
+}
+
 Vector4i::operator String() const {
 	return "(" + itos(x) + ", " + itos(y) + ", " + itos(z) + ", " + itos(w) + ")";
 }
diff --git a/core/math/vector4i.h b/core/math/vector4i.h
index a32414bb18..cb5a48daf9 100644
--- a/core/math/vector4i.h
+++ b/core/math/vector4i.h
@@ -79,6 +79,7 @@ struct _NO_DISCARD_ Vector4i {
 	_FORCE_INLINE_ Vector4i abs() const;
 	_FORCE_INLINE_ Vector4i sign() const;
 	Vector4i clamp(const Vector4i &p_min, const Vector4i &p_max) const;
+	Vector4i snapped(const Vector4i &p_step) const;
 
 	/* Operators */
 
diff --git a/core/variant/variant_call.cpp b/core/variant/variant_call.cpp
index 8f0e47a520..1e80e2681d 100644
--- a/core/variant/variant_call.cpp
+++ b/core/variant/variant_call.cpp
@@ -1647,6 +1647,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(Vector2i, sign, sarray(), varray());
 	bind_method(Vector2i, abs, sarray(), varray());
 	bind_method(Vector2i, clamp, sarray("min", "max"), varray());
+	bind_method(Vector2i, snapped, sarray("step"), varray());
 
 	/* Rect2 */
 
@@ -1736,6 +1737,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(Vector3i, sign, sarray(), varray());
 	bind_method(Vector3i, abs, sarray(), varray());
 	bind_method(Vector3i, clamp, sarray("min", "max"), varray());
+	bind_method(Vector3i, snapped, sarray("step"), varray());
 
 	/* Vector4 */
 
@@ -1775,6 +1777,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(Vector4i, sign, sarray(), varray());
 	bind_method(Vector4i, abs, sarray(), varray());
 	bind_method(Vector4i, clamp, sarray("min", "max"), varray());
+	bind_method(Vector4i, snapped, sarray("step"), varray());
 
 	/* Plane */
 
diff --git a/core/variant/variant_utility.cpp b/core/variant/variant_utility.cpp
index bf4f761f2b..4b82981fa9 100644
--- a/core/variant/variant_utility.cpp
+++ b/core/variant/variant_utility.cpp
@@ -322,8 +322,52 @@ struct VariantUtilityFunctions {
 		return Math::step_decimals(step);
 	}
 
-	static inline double snapped(double value, double step) {
-		return Math::snapped(value, step);
+	static inline Variant snapped(const Variant &x, const Variant &step, Callable::CallError &r_error) {
+		r_error.error = Callable::CallError::CALL_OK;
+		if (x.get_type() != step.get_type() && !((x.get_type() == Variant::INT && step.get_type() == Variant::FLOAT) || (x.get_type() == Variant::FLOAT && step.get_type() == Variant::INT))) {
+			r_error.error = Callable::CallError::CALL_ERROR_INVALID_ARGUMENT;
+			r_error.argument = 1;
+			return Variant();
+		}
+
+		switch (step.get_type()) {
+			case Variant::INT: {
+				return snappedi(x, VariantInternalAccessor<int64_t>::get(&step));
+			} break;
+			case Variant::FLOAT: {
+				return snappedf(x, VariantInternalAccessor<double>::get(&step));
+			} break;
+			case Variant::VECTOR2: {
+				return VariantInternalAccessor<Vector2>::get(&x).snapped(VariantInternalAccessor<Vector2>::get(&step));
+			} break;
+			case Variant::VECTOR2I: {
+				return VariantInternalAccessor<Vector2i>::get(&x).snapped(VariantInternalAccessor<Vector2i>::get(&step));
+			} break;
+			case Variant::VECTOR3: {
+				return VariantInternalAccessor<Vector3>::get(&x).snapped(VariantInternalAccessor<Vector3>::get(&step));
+			} break;
+			case Variant::VECTOR3I: {
+				return VariantInternalAccessor<Vector3i>::get(&x).snapped(VariantInternalAccessor<Vector3i>::get(&step));
+			} break;
+			case Variant::VECTOR4: {
+				return VariantInternalAccessor<Vector4>::get(&x).snapped(VariantInternalAccessor<Vector4>::get(&step));
+			} break;
+			case Variant::VECTOR4I: {
+				return VariantInternalAccessor<Vector4i>::get(&x).snapped(VariantInternalAccessor<Vector4i>::get(&step));
+			} break;
+			default: {
+				r_error.error = Callable::CallError::CALL_ERROR_INVALID_METHOD;
+				return Variant();
+			}
+		}
+	}
+
+	static inline double snappedf(double x, double step) {
+		return Math::snapped(x, step);
+	}
+
+	static inline int64_t snappedi(double x, int64_t step) {
+		return Math::snapped(x, step);
 	}
 
 	static inline Variant lerp(const Variant &from, const Variant &to, double weight, Callable::CallError &r_error) {
@@ -1139,6 +1183,40 @@ static _FORCE_INLINE_ Variant::Type get_ret_type_helper(void (*p_func)(P...)) {
 	};                                                                                                                  \
 	register_utility_function<Func_##m_func>(#m_func, m_args)
 
+#define FUNCBINDVR2(m_func, m_args, m_category)                                                                                    \
+	class Func_##m_func {                                                                                                          \
+	public:                                                                                                                        \
+		static void call(Variant *r_ret, const Variant **p_args, int p_argcount, Callable::CallError &r_error) {                   \
+			r_error.error = Callable::CallError::CALL_OK;                                                                          \
+			*r_ret = VariantUtilityFunctions::m_func(*p_args[0], *p_args[1], r_error);                                             \
+		}                                                                                                                          \
+		static void validated_call(Variant *r_ret, const Variant **p_args, int p_argcount) {                                       \
+			Callable::CallError ce;                                                                                                \
+			*r_ret = VariantUtilityFunctions::m_func(*p_args[0], *p_args[1], ce);                                                  \
+		}                                                                                                                          \
+		static void ptrcall(void *ret, const void **p_args, int p_argcount) {                                                      \
+			Callable::CallError ce;                                                                                                \
+			Variant r;                                                                                                             \
+			r = VariantUtilityFunctions::m_func(PtrToArg<Variant>::convert(p_args[0]), PtrToArg<Variant>::convert(p_args[1]), ce); \
+			PtrToArg<Variant>::encode(r, ret);                                                                                     \
+		}                                                                                                                          \
+		static int get_argument_count() {                                                                                          \
+			return 2;                                                                                                              \
+		}                                                                                                                          \
+		static Variant::Type get_argument_type(int p_arg) {                                                                        \
+			return Variant::NIL;                                                                                                   \
+		}                                                                                                                          \
+		static Variant::Type get_return_type() {                                                                                   \
+			return Variant::NIL;                                                                                                   \
+		}                                                                                                                          \
+		static bool has_return_type() {                                                                                            \
+			return true;                                                                                                           \
+		}                                                                                                                          \
+		static bool is_vararg() { return false; }                                                                                  \
+		static Variant::UtilityFunctionType get_type() { return m_category; }                                                      \
+	};                                                                                                                             \
+	register_utility_function<Func_##m_func>(#m_func, m_args)
+
 #define FUNCBINDVR3(m_func, m_args, m_category)                                                                                                                           \
 	class Func_##m_func {                                                                                                                                                 \
 	public:                                                                                                                                                               \
@@ -1422,6 +1500,10 @@ void Variant::_register_variant_utility_functions() {
 	FUNCBINDR(signf, sarray("x"), Variant::UTILITY_FUNC_TYPE_MATH);
 	FUNCBINDR(signi, sarray("x"), Variant::UTILITY_FUNC_TYPE_MATH);
 
+	FUNCBINDVR2(snapped, sarray("x", "step"), Variant::UTILITY_FUNC_TYPE_MATH);
+	FUNCBINDR(snappedf, sarray("x", "step"), Variant::UTILITY_FUNC_TYPE_MATH);
+	FUNCBINDR(snappedi, sarray("x", "step"), Variant::UTILITY_FUNC_TYPE_MATH);
+
 	FUNCBINDR(pow, sarray("base", "exp"), Variant::UTILITY_FUNC_TYPE_MATH);
 	FUNCBINDR(log, sarray("x"), Variant::UTILITY_FUNC_TYPE_MATH);
 	FUNCBINDR(exp, sarray("x"), Variant::UTILITY_FUNC_TYPE_MATH);
@@ -1435,7 +1517,6 @@ void Variant::_register_variant_utility_functions() {
 
 	FUNCBINDR(ease, sarray("x", "curve"), Variant::UTILITY_FUNC_TYPE_MATH);
 	FUNCBINDR(step_decimals, sarray("x"), Variant::UTILITY_FUNC_TYPE_MATH);
-	FUNCBINDR(snapped, sarray("x", "step"), Variant::UTILITY_FUNC_TYPE_MATH);
 
 	FUNCBINDVR3(lerp, sarray("from", "to", "weight"), Variant::UTILITY_FUNC_TYPE_MATH);
 	FUNCBINDR(lerpf, sarray("from", "to", "weight"), Variant::UTILITY_FUNC_TYPE_MATH);
diff --git a/doc/classes/@GlobalScope.xml b/doc/classes/@GlobalScope.xml
index a85532dba6..33afe38446 100644
--- a/doc/classes/@GlobalScope.xml
+++ b/doc/classes/@GlobalScope.xml
@@ -16,7 +16,7 @@
 			<return type="Variant" />
 			<param index="0" name="x" type="Variant" />
 			<description>
-				Returns the absolute value of a [Variant] parameter [param x] (i.e. non-negative value). Variant types [int], [float], [Vector2], [Vector2i], [Vector3] and [Vector3i] are supported.
+				Returns the absolute value of a [Variant] parameter [param x] (i.e. non-negative value). Supported types: [int], [float], [Vector2], [Vector2i], [Vector3], [Vector3i], [Vector4], [Vector4i].
 				[codeblock]
 				var a = abs(-1)
 				# a is 1
@@ -36,6 +36,7 @@
 				var f = abs(Vector3i(-7, -8, -9))
 				# f is (7, 8, 9)
 				[/codeblock]
+				[b]Note:[/b] For better type safety, use [method absf], [method absi], [method Vector2.abs], [method Vector2i.abs], [method Vector3.abs], [method Vector3i.abs], [method Vector4.abs], or [method Vector4i.abs].
 			</description>
 		</method>
 		<method name="absf">
@@ -154,7 +155,7 @@
 				i = ceil(1.001)    # i is 2.0
 				[/codeblock]
 				See also [method floor], [method round], and [method snapped].
-				[b]Note:[/b] For better type safety, see [method ceilf], [method ceili], [method Vector2.ceil], [method Vector3.ceil] and [method Vector4.ceil].
+				[b]Note:[/b] For better type safety, use [method ceilf], [method ceili], [method Vector2.ceil], [method Vector3.ceil], or [method Vector4.ceil].
 			</description>
 		</method>
 		<method name="ceilf">
@@ -179,7 +180,7 @@
 			<param index="1" name="min" type="Variant" />
 			<param index="2" name="max" type="Variant" />
 			<description>
-				Clamps the [param value], returning a [Variant] not less than [param min] and not more than [param max]. Variant types [int], [float], [Vector2], [Vector2i], [Vector3] and [Vector3i] are supported.
+				Clamps the [param value], returning a [Variant] not less than [param min] and not more than [param max]. Supported types: [int], [float], [Vector2], [Vector2i], [Vector3], [Vector3i], [Vector4], [Vector4i].
 				[codeblock]
 				var a = clamp(-10, -1, 5)
 				# a is -1
@@ -199,6 +200,7 @@
 				var f = clamp(Vector3i(-7, -8, -9), Vector3i(-1, 2, 3), Vector3i(-4, -5, -6))
 				# f is (-4, -5, -6)
 				[/codeblock]
+				[b]Note:[/b] For better type safety, use [method clampf], [method clampi], [method Vector2.clamp], [method Vector2i.clamp], [method Vector3.clamp], [method Vector3i.clamp], [method Vector4.clamp], or [method Vector4i.clamp].
 			</description>
 		</method>
 		<method name="clampf">
@@ -378,7 +380,7 @@
 				a = floor(-2.99)    # a is -3.0
 				[/codeblock]
 				See also [method ceil], [method round], and [method snapped].
-				[b]Note:[/b] For better type safety, see [method floorf], [method floori], [method Vector2.floor], [method Vector3.floor] and [method Vector4.floor].
+				[b]Note:[/b] For better type safety, use [method floorf], [method floori], [method Vector2.floor], [method Vector3.floor], or [method Vector4.floor].
 			</description>
 		</method>
 		<method name="floorf">
@@ -794,14 +796,13 @@
 		</method>
 		<method name="printraw" qualifiers="vararg">
 			<description>
-				Prints one or more arguments to strings in the best way possible to console. Unlike [method print], no newline is automatically added at the end.
+				Prints one or more arguments to strings in the best way possible to the OS terminal. Unlike [method print], no newline is automatically added at the end.
 				[codeblock]
 				printraw("A")
 				printraw("B")
 				printraw("C")
-				# Prints ABC
+				# Prints ABC to terminal
 				[/codeblock]
-				[b]Note:[/b] Due to limitations with Godot's built-in console, this only prints to the terminal. If you need to print in the editor, use another method, such as [method print].
 			</description>
 		</method>
 		<method name="prints" qualifiers="vararg">
@@ -961,7 +962,7 @@
 				round(2.6) # Returns 3
 				[/codeblock]
 				See also [method floor], [method ceil], and [method snapped].
-				[b]Note:[/b] For better type safety, use [method roundf], [method roundi], [method Vector2.round], [method Vector3.round] or [method Vector4.round], instead.
+				[b]Note:[/b] For better type safety, use [method roundf], [method roundi], [method Vector2.round], [method Vector3.round], or [method Vector4.round].
 			</description>
 		</method>
 		<method name="roundf">
@@ -998,21 +999,22 @@
 			<return type="Variant" />
 			<param index="0" name="x" type="Variant" />
 			<description>
-				Returns the sign of [param x] as same type of [Variant] as [param x] with each component being -1, 0 and 1 for each negative, zero and positive values respectively. Variant types [int], [float], [Vector2], [Vector2i], [Vector3] and [Vector3i] are supported.
+				Returns the same type of [Variant] as [param x], with [code]-1[/code] for negative values, [code]1[/code] for positive values, and [code]0[/code] for zeros. Supported types: [int], [float], [Vector2], [Vector2i], [Vector3], [Vector3i], [Vector4], [Vector4i].
 				[codeblock]
 				sign(-6.0) # Returns -1
 				sign(0.0)  # Returns 0
 				sign(6.0)  # Returns 1
 
-				sign(Vector3(-6.0, 0.0, 6.0) # Returns (-1, 0, 1)
+				sign(Vector3(-6.0, 0.0, 6.0)) # Returns (-1, 0, 1)
 				[/codeblock]
+				[b]Note:[/b] For better type safety, use [method signf], [method signi], [method Vector2.sign], [method Vector2i.sign], [method Vector3.sign], [method Vector3i.sign], [method Vector4.sign], or [method Vector4i.sign].
 			</description>
 		</method>
 		<method name="signf">
 			<return type="float" />
 			<param index="0" name="x" type="float" />
 			<description>
-				Returns the sign of [param x] as a [float]: -1.0 or 1.0. Returns 0.0 if [param x] is 0.0.
+				Returns [code]-1.0[/code] if [param x] is negative, [code]1.0[/code] if [param x] is positive, and [code]0.0[/code] if if [param x] is zero.
 				[codeblock]
 				sign(-6.5) # Returns -1.0
 				sign(0.0)  # Returns 0.0
@@ -1024,7 +1026,7 @@
 			<return type="int" />
 			<param index="0" name="x" type="int" />
 			<description>
-				Returns the sign of [param x] as an [int]: -1 or 1. Returns 0 if [param x] is 0.
+				Returns [code]-1[/code] if [param x] is negative, [code]1[/code] if [param x] is positive, and [code]0[/code] if if [param x] is zero.
 				[codeblock]
 				sign(-6) # Returns -1
 				sign(0)  # Returns 0
@@ -1074,16 +1076,46 @@
 			</description>
 		</method>
 		<method name="snapped">
+			<return type="Variant" />
+			<param index="0" name="x" type="Variant" />
+			<param index="1" name="step" type="Variant" />
+			<description>
+				Returns the multiple of [param step] that is the closest to [param x]. This can also be used to round a floating point number to an arbitrary number of decimals.
+				The returned value is the same type of [Variant] as [param step]. Supported types: [int], [float], [Vector2], [Vector2i], [Vector3], [Vector3i], [Vector4], [Vector4i].
+				[codeblock]
+				snapped(100, 32)  # Returns 96
+				snapped(3.14159, 0.01)  # Returns 3.14
+
+				snapped(Vector2(34, 70), Vector2(8, 8))  # Returns (32, 72)
+				[/codeblock]
+				See also [method ceil], [method floor], and [method round].
+				[b]Note:[/b] For better type safety, use [method snappedf], [method snappedi], [method Vector2.snapped], [method Vector2i.snapped], [method Vector3.snapped], [method Vector3i.snapped], [method Vector4.snapped], or [method Vector4i.snapped].
+			</description>
+		</method>
+		<method name="snappedf">
 			<return type="float" />
 			<param index="0" name="x" type="float" />
 			<param index="1" name="step" type="float" />
 			<description>
-				Snaps the float value [param x] to a given [param step]. This can also be used to round a floating point number to an arbitrary number of decimals.
+				Returns the multiple of [param step] that is the closest to [param x]. This can also be used to round a floating point number to an arbitrary number of decimals.
+				A type-safe version of [method snapped], returning a [float].
 				[codeblock]
-				snapped(100, 32) # Returns 96
-				snapped(3.14159, 0.01) # Returns 3.14
+				snapped(32.0, 2.5)  # Returns 32.5
+				snapped(3.14159, 0.01)  # Returns 3.14
+				[/codeblock]
+			</description>
+		</method>
+		<method name="snappedi">
+			<return type="int" />
+			<param index="0" name="x" type="float" />
+			<param index="1" name="step" type="int" />
+			<description>
+				Returns the multiple of [param step] that is the closest to [param x].
+				A type-safe version of [method snapped], returning an [int].
+				[codeblock]
+				snapped(53, 16)  # Returns 48
+				snapped(4096, 100)  # Returns 4100
 				[/codeblock]
-				See also [method ceil], [method floor], and [method round].
 			</description>
 		</method>
 		<method name="sqrt">
diff --git a/doc/classes/EditorInspector.xml b/doc/classes/EditorInspector.xml
index 0cda49f1bf..ab35a62794 100644
--- a/doc/classes/EditorInspector.xml
+++ b/doc/classes/EditorInspector.xml
@@ -7,7 +7,7 @@
 		This is the control that implements property editing in the editor's Settings dialogs, the Inspector dock, etc. To get the [EditorInspector] used in the editor's Inspector dock, use [method EditorInterface.get_inspector].
 		[EditorInspector] will show properties in the same order as the array returned by [method Object.get_property_list].
 		If a property's name is path-like (i.e. if it contains forward slashes), [EditorInspector] will create nested sections for "directories" along the path. For example, if a property is named [code]highlighting/gdscript/node_path_color[/code], it will be shown as "Node Path Color" inside the "GDScript" section nested inside the "Highlighting" section.
-		If a property has [constant PROPERTY_USAGE_GROUP] usage, it will group subsequent properties whose name starts with the property's hint string. The group ends when a property does not start with that hint string or when a new group starts. An empty group name effectively ends the current group. [EditorInspector] will create a top-level section for each group. For example, if a property with group usage is named [code]Collide With[/code] and its hint string is [code]collide_with_[/code], a subsequent [code]collide_with_area[/code] property will be shown as "Area" inside the "Collide With" section.
+		If a property has [constant PROPERTY_USAGE_GROUP] usage, it will group subsequent properties whose name starts with the property's hint string. The group ends when a property does not start with that hint string or when a new group starts. An empty group name effectively ends the current group. [EditorInspector] will create a top-level section for each group. For example, if a property with group usage is named [code]Collide With[/code] and its hint string is [code]collide_with_[/code], a subsequent [code]collide_with_area[/code] property will be shown as "Area" inside the "Collide With" section. There is also a special case: when the hint string contains the name of a property, that property is grouped too. This is mainly to help grouping properties like [code]font[/code], [code]font_color[/code] and [code]font_size[/code] (using the hint string [code]font_[/code]).
 		If a property has [constant PROPERTY_USAGE_SUBGROUP] usage, a subgroup will be created in the same way as a group, and a second-level section will be created for each subgroup.
 		[b]Note:[/b] Unlike sections created from path-like property names, [EditorInspector] won't capitalize the name for sections created from groups. So properties with group usage usually use capitalized names instead of snake_cased names.
 	</description>
diff --git a/doc/classes/Node.xml b/doc/classes/Node.xml
index 21fb9d139c..0368f29a13 100644
--- a/doc/classes/Node.xml
+++ b/doc/classes/Node.xml
@@ -535,7 +535,7 @@
 				[b]Note:[/b] Internal children can only be moved within their expected "internal range" (see [code]internal[/code] parameter in [method add_child]).
 			</description>
 		</method>
-		<method name="print_orphan_nodes">
+		<method name="print_orphan_nodes" qualifiers="static">
 			<return type="void" />
 			<description>
 				Prints all orphan nodes (nodes outside the [SceneTree]). Used for debugging.
diff --git a/doc/classes/Vector2.xml b/doc/classes/Vector2.xml
index fac672c764..4156030d77 100644
--- a/doc/classes/Vector2.xml
+++ b/doc/classes/Vector2.xml
@@ -110,7 +110,7 @@
 			<return type="Vector2" />
 			<param index="0" name="n" type="Vector2" />
 			<description>
-				Returns the vector "bounced off" from a plane defined by the given normal.
+				Returns a new vector "bounced off" from a plane defined by the given normal.
 			</description>
 		</method>
 		<method name="ceil" qualifiers="const">
@@ -287,7 +287,7 @@
 		<method name="normalized" qualifiers="const">
 			<return type="Vector2" />
 			<description>
-				Returns the vector scaled to unit length. Equivalent to [code]v / v.length()[/code].
+				Returns a new vector scaled to unit length. Equivalent to [code]v / v.length()[/code].
 			</description>
 		</method>
 		<method name="orthogonal" qualifiers="const">
@@ -314,21 +314,21 @@
 			<return type="Vector2" />
 			<param index="0" name="b" type="Vector2" />
 			<description>
-				Returns this vector projected onto the vector [code]b[/code].
+				Returns the result of projecting the vector onto the given vector [param b].
 			</description>
 		</method>
 		<method name="reflect" qualifiers="const">
 			<return type="Vector2" />
 			<param index="0" name="n" type="Vector2" />
 			<description>
-				Returns the vector reflected (i.e. mirrored, or symmetric) over a line defined by the given direction vector [param n].
+				Returns the result of reflecting the vector from a line defined by the given direction vector [param n].
 			</description>
 		</method>
 		<method name="rotated" qualifiers="const">
 			<return type="Vector2" />
 			<param index="0" name="angle" type="float" />
 			<description>
-				Returns the vector rotated by [param angle] (in radians). See also [method @GlobalScope.deg_to_rad].
+				Returns the result of rotating this vector by [param angle] (in radians). See also [method @GlobalScope.deg_to_rad].
 			</description>
 		</method>
 		<method name="round" qualifiers="const">
@@ -340,7 +340,7 @@
 		<method name="sign" qualifiers="const">
 			<return type="Vector2" />
 			<description>
-				Returns a new vector with each component set to one or negative one, depending on the signs of the components, or zero if the component is zero, by calling [method @GlobalScope.sign] on each component.
+				Returns a new vector with each component set to [code]1.0[/code] if it's positive, [code]-1.0[/code] if it's negative, and [code]0.0[/code] if it's zero. The result is identical to calling [method @GlobalScope.sign] on each component.
 			</description>
 		</method>
 		<method name="slerp" qualifiers="const">
@@ -356,14 +356,14 @@
 			<return type="Vector2" />
 			<param index="0" name="n" type="Vector2" />
 			<description>
-				Returns this vector slid along a plane defined by the given normal.
+				Returns the result of sliding the vector along a plane defined by the given normal.
 			</description>
 		</method>
 		<method name="snapped" qualifiers="const">
 			<return type="Vector2" />
 			<param index="0" name="step" type="Vector2" />
 			<description>
-				Returns this vector with each component snapped to the nearest multiple of [param step]. This can also be used to round to an arbitrary number of decimals.
+				Returns a new vector with each component snapped to the nearest multiple of the corresponding component in [param step]. This can also be used to round the components to an arbitrary number of decimals.
 			</description>
 		</method>
 	</methods>
diff --git a/doc/classes/Vector2i.xml b/doc/classes/Vector2i.xml
index fd02e3c530..db6bc8f237 100644
--- a/doc/classes/Vector2i.xml
+++ b/doc/classes/Vector2i.xml
@@ -92,7 +92,14 @@
 		<method name="sign" qualifiers="const">
 			<return type="Vector2i" />
 			<description>
-				Returns a new vector with each component set to one or negative one, depending on the signs of the components, or zero if the component is zero, by calling [method @GlobalScope.sign] on each component.
+				Returns a new vector with each component set to [code]1[/code] if it's positive, [code]-1[/code] if it's negative, and [code]0[/code] if it's zero. The result is identical to calling [method @GlobalScope.sign] on each component.
+			</description>
+		</method>
+		<method name="snapped" qualifiers="const">
+			<return type="Vector2i" />
+			<param index="0" name="step" type="Vector2i" />
+			<description>
+				Returns a new vector with each component snapped to the closest multiple of the corresponding component in [param step].
 			</description>
 		</method>
 	</methods>
diff --git a/doc/classes/Vector3.xml b/doc/classes/Vector3.xml
index f075915a9c..2896408505 100644
--- a/doc/classes/Vector3.xml
+++ b/doc/classes/Vector3.xml
@@ -298,14 +298,14 @@
 			<return type="Vector3" />
 			<param index="0" name="b" type="Vector3" />
 			<description>
-				Returns this vector projected onto the vector [param b].
+				Returns the result of projecting the vector onto the given vector [param b].
 			</description>
 		</method>
 		<method name="reflect" qualifiers="const">
 			<return type="Vector3" />
 			<param index="0" name="n" type="Vector3" />
 			<description>
-				Returns this vector reflected from a plane defined by the given normal.
+				Returns the result of reflecting the vector from a plane defined by the given normal [param n].
 			</description>
 		</method>
 		<method name="rotated" qualifiers="const">
@@ -313,7 +313,7 @@
 			<param index="0" name="axis" type="Vector3" />
 			<param index="1" name="angle" type="float" />
 			<description>
-				Rotates this vector around a given axis by [param angle] (in radians). The axis must be a normalized vector.
+				Returns the result of rotating this vector around a given axis by [param angle] (in radians). The axis must be a normalized vector. See also [method @GlobalScope.deg_to_rad].
 			</description>
 		</method>
 		<method name="round" qualifiers="const">
@@ -325,7 +325,7 @@
 		<method name="sign" qualifiers="const">
 			<return type="Vector3" />
 			<description>
-				Returns a new vector with each component set to one or negative one, depending on the signs of the components, or zero if the component is zero, by calling [method @GlobalScope.sign] on each component.
+				Returns a new vector with each component set to [code]1.0[/code] if it's positive, [code]-1.0[/code] if it's negative, and [code]0.0[/code] if it's zero. The result is identical to calling [method @GlobalScope.sign] on each component.
 			</description>
 		</method>
 		<method name="signed_angle_to" qualifiers="const">
@@ -349,14 +349,14 @@
 			<return type="Vector3" />
 			<param index="0" name="n" type="Vector3" />
 			<description>
-				Returns this vector slid along a plane defined by the given normal.
+				Returns a new vector slid along a plane defined by the given normal.
 			</description>
 		</method>
 		<method name="snapped" qualifiers="const">
 			<return type="Vector3" />
 			<param index="0" name="step" type="Vector3" />
 			<description>
-				Returns this vector with each component snapped to the nearest multiple of [param step]. This can also be used to round to an arbitrary number of decimals.
+				Returns a new vector with each component snapped to the nearest multiple of the corresponding component in [param step]. This can also be used to round the components to an arbitrary number of decimals.
 			</description>
 		</method>
 	</methods>
diff --git a/doc/classes/Vector3i.xml b/doc/classes/Vector3i.xml
index eb64e098ce..5c6dc3c1c5 100644
--- a/doc/classes/Vector3i.xml
+++ b/doc/classes/Vector3i.xml
@@ -87,7 +87,14 @@
 		<method name="sign" qualifiers="const">
 			<return type="Vector3i" />
 			<description>
-				Returns the vector with each component set to one or negative one, depending on the signs of the components.
+				Returns a new vector with each component set to [code]1[/code] if it's positive, [code]-1[/code] if it's negative, and [code]0[/code] if it's zero. The result is identical to calling [method @GlobalScope.sign] on each component.
+			</description>
+		</method>
+		<method name="snapped" qualifiers="const">
+			<return type="Vector3i" />
+			<param index="0" name="step" type="Vector3i" />
+			<description>
+				Returns a new vector with each component snapped to the closest multiple of the corresponding component in [param step].
 			</description>
 		</method>
 	</methods>
diff --git a/doc/classes/Vector4.xml b/doc/classes/Vector4.xml
index a2759937c1..d15ae35b59 100644
--- a/doc/classes/Vector4.xml
+++ b/doc/classes/Vector4.xml
@@ -190,21 +190,21 @@
 		<method name="normalized" qualifiers="const">
 			<return type="Vector4" />
 			<description>
-				Returns the vector scaled to unit length. Equivalent to [code]v / v.length()[/code].
+				Returns the result of scaling the vector to unit length. Equivalent to [code]v / v.length()[/code].
 			</description>
 		</method>
 		<method name="posmod" qualifiers="const">
 			<return type="Vector4" />
 			<param index="0" name="mod" type="float" />
 			<description>
-				Returns a vector composed of the [method @GlobalScope.fposmod] of this vector's components and [param mod].
+				Returns a new vector composed of the [method @GlobalScope.fposmod] of this vector's components and [param mod].
 			</description>
 		</method>
 		<method name="posmodv" qualifiers="const">
 			<return type="Vector4" />
 			<param index="0" name="modv" type="Vector4" />
 			<description>
-				Returns a vector composed of the [method @GlobalScope.fposmod] of this vector's components and [param modv]'s components.
+				Returns a new vector composed of the [method @GlobalScope.fposmod] of this vector's components and [param modv]'s components.
 			</description>
 		</method>
 		<method name="round" qualifiers="const">
@@ -216,14 +216,14 @@
 		<method name="sign" qualifiers="const">
 			<return type="Vector4" />
 			<description>
-				Returns a new vector with each component set to one or negative one, depending on the signs of the components, or zero if the component is zero, by calling [method @GlobalScope.sign] on each component.
+				Returns a new vector with each component set to [code]1.0[/code] if it's positive, [code]-1.0[/code] if it's negative, and [code]0.0[/code] if it's zero. The result is identical to calling [method @GlobalScope.sign] on each component.
 			</description>
 		</method>
 		<method name="snapped" qualifiers="const">
 			<return type="Vector4" />
 			<param index="0" name="step" type="Vector4" />
 			<description>
-				Returns this vector with each component snapped to the nearest multiple of [param step]. This can also be used to round to an arbitrary number of decimals.
+				Returns a new vector with each component snapped to the nearest multiple of the corresponding component in [param step]. This can also be used to round the components to an arbitrary number of decimals.
 			</description>
 		</method>
 	</methods>
diff --git a/doc/classes/Vector4i.xml b/doc/classes/Vector4i.xml
index 2dc10234ee..95797df90a 100644
--- a/doc/classes/Vector4i.xml
+++ b/doc/classes/Vector4i.xml
@@ -83,7 +83,14 @@
 		<method name="sign" qualifiers="const">
 			<return type="Vector4i" />
 			<description>
-				Returns a new vector with each component set to one or negative one, depending on the signs of the components, or zero if the component is zero, by calling [method @GlobalScope.sign] on each component.
+				Returns a new vector with each component set to [code]1[/code] if it's positive, [code]-1[/code] if it's negative, and [code]0[/code] if it's zero. The result is identical to calling [method @GlobalScope.sign] on each component.
+			</description>
+		</method>
+		<method name="snapped" qualifiers="const">
+			<return type="Vector4i" />
+			<param index="0" name="step" type="Vector4i" />
+			<description>
+				Returns a new vector with each component snapped to the closest multiple of the corresponding component in [param step].
 			</description>
 		</method>
 	</methods>
diff --git a/drivers/gles3/rasterizer_canvas_gles3.cpp b/drivers/gles3/rasterizer_canvas_gles3.cpp
index 0c102bfc1d..07d56b156c 100644
--- a/drivers/gles3/rasterizer_canvas_gles3.cpp
+++ b/drivers/gles3/rasterizer_canvas_gles3.cpp
@@ -1306,10 +1306,6 @@ void RasterizerCanvasGLES3::_render_batch(Light *p_lights, uint32_t p_index) {
 				instance_uses_custom_data = true;
 			}
 
-			if (instance_buffer == 0) {
-				break;
-			}
-
 			ERR_FAIL_COND(mesh.is_null());
 
 			uint32_t surf_count = mesh_storage->mesh_get_surface_count(mesh);
@@ -1339,6 +1335,9 @@ void RasterizerCanvasGLES3::_render_batch(Light *p_lights, uint32_t p_index) {
 				}
 
 				if (instance_count > 1) {
+					if (instance_buffer == 0) {
+						break;
+					}
 					// Bind instance buffers.
 					glBindBuffer(GL_ARRAY_BUFFER, instance_buffer);
 					glEnableVertexAttribArray(1);
diff --git a/editor/animation_bezier_editor.cpp b/editor/animation_bezier_editor.cpp
index da75bf1f3b..a7a07a41ac 100644
--- a/editor/animation_bezier_editor.cpp
+++ b/editor/animation_bezier_editor.cpp
@@ -655,10 +655,6 @@ Size2 AnimationBezierTrackEdit::get_minimum_size() const {
 	return Vector2(1, 1);
 }
 
-void AnimationBezierTrackEdit::set_undo_redo(Ref<EditorUndoRedoManager> p_undo_redo) {
-	undo_redo = p_undo_redo;
-}
-
 void AnimationBezierTrackEdit::set_timeline(AnimationTimelineEdit *p_timeline) {
 	timeline = p_timeline;
 	timeline->connect("zoom_changed", callable_mp(this, &AnimationBezierTrackEdit::_zoom_changed));
@@ -791,6 +787,7 @@ void AnimationBezierTrackEdit::_clear_selection() {
 }
 
 void AnimationBezierTrackEdit::_change_selected_keys_handle_mode(Animation::HandleMode p_mode, bool p_auto) {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Update Selected Key Handles"));
 	for (SelectionSet::Element *E = selection.back(); E; E = E->prev()) {
 		const IntPair track_key_pair = E->get();
@@ -987,6 +984,7 @@ void AnimationBezierTrackEdit::gui_input(const Ref<InputEvent> &p_event) {
 				if (I.value.has_point(mb->get_position())) {
 					if (I.key == REMOVE_ICON) {
 						if (!read_only) {
+							Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 							undo_redo->create_action("Remove Bezier Track");
 
 							undo_redo->add_do_method(this, "_update_locked_tracks_after", track);
@@ -1173,6 +1171,7 @@ void AnimationBezierTrackEdit::gui_input(const Ref<InputEvent> &p_event) {
 				time += 0.001;
 			}
 
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Add Bezier Point"));
 			undo_redo->add_do_method(animation.ptr(), "bezier_track_insert_key", selected_track, time, new_point);
 			undo_redo->add_undo_method(animation.ptr(), "track_remove_key_at_time", selected_track, time);
@@ -1270,6 +1269,7 @@ void AnimationBezierTrackEdit::gui_input(const Ref<InputEvent> &p_event) {
 			if (moving_selection) {
 				//combit it
 
+				Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 				undo_redo->create_action(TTR("Move Bezier Points"));
 
 				List<AnimMoveRestore> to_restore;
@@ -1470,6 +1470,7 @@ void AnimationBezierTrackEdit::gui_input(const Ref<InputEvent> &p_event) {
 
 	if ((moving_handle == -1 || moving_handle == 1) && mb.is_valid() && !mb->is_pressed() && mb->get_button_index() == MouseButton::LEFT) {
 		if (!read_only) {
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Move Bezier Points"));
 			if (moving_handle == -1) {
 				real_t ratio = timeline->get_zoom_scale() * v_zoom;
@@ -1541,6 +1542,7 @@ void AnimationBezierTrackEdit::_menu_selected(int p_index) {
 					time += 0.001;
 				}
 
+				Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 				undo_redo->create_action(TTR("Add Bezier Point"));
 				undo_redo->add_do_method(animation.ptr(), "track_insert_key", selected_track, time, new_point);
 				undo_redo->add_undo_method(animation.ptr(), "track_remove_key_at_time", selected_track, time);
@@ -1588,6 +1590,7 @@ void AnimationBezierTrackEdit::duplicate_selection() {
 		}
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Anim Duplicate Keys"));
 
 	List<Pair<int, real_t>> new_selection_values;
@@ -1633,6 +1636,7 @@ void AnimationBezierTrackEdit::duplicate_selection() {
 
 void AnimationBezierTrackEdit::delete_selection() {
 	if (selection.size()) {
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		undo_redo->create_action(TTR("Anim Delete Keys"));
 
 		for (SelectionSet::Element *E = selection.back(); E; E = E->prev()) {
diff --git a/editor/animation_bezier_editor.h b/editor/animation_bezier_editor.h
index beb7a5e9c6..8f03787429 100644
--- a/editor/animation_bezier_editor.h
+++ b/editor/animation_bezier_editor.h
@@ -34,7 +34,6 @@
 #include "animation_track_editor.h"
 #include "core/templates/hashfuncs.h"
 
-class EditorUndoRedoManager;
 class ViewPanner;
 
 class AnimationBezierTrackEdit : public Control {
@@ -53,7 +52,6 @@ class AnimationBezierTrackEdit : public Control {
 	};
 
 	AnimationTimelineEdit *timeline = nullptr;
-	Ref<EditorUndoRedoManager> undo_redo;
 	Node *root = nullptr;
 	Control *play_position = nullptr; //separate control used to draw so updates for only position changed are much faster
 	real_t play_position_pos = 0;
@@ -197,7 +195,6 @@ public:
 	void set_animation_and_track(const Ref<Animation> &p_animation, int p_track, bool p_read_only);
 	virtual Size2 get_minimum_size() const override;
 
-	void set_undo_redo(Ref<EditorUndoRedoManager> p_undo_redo);
 	void set_timeline(AnimationTimelineEdit *p_timeline);
 	void set_editor(AnimationTrackEditor *p_editor);
 	void set_root(Node *p_root);
diff --git a/editor/animation_track_editor.cpp b/editor/animation_track_editor.cpp
index 8305baf0a1..7aa06e6398 100644
--- a/editor/animation_track_editor.cpp
+++ b/editor/animation_track_editor.cpp
@@ -133,6 +133,7 @@ public:
 			int existing = animation->track_find_key(track, new_time, true);
 
 			setting = true;
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Anim Change Keyframe Time"), UndoRedo::MERGE_ENDS);
 
 			Variant val = animation->track_get_key_value(track, key);
@@ -160,6 +161,7 @@ public:
 			float val = p_value;
 			float prev_val = animation->track_get_key_transition(track, key);
 			setting = true;
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Anim Change Transition"), UndoRedo::MERGE_ENDS);
 			undo_redo->add_do_method(animation.ptr(), "track_set_key_transition", track, key, val);
 			undo_redo->add_undo_method(animation.ptr(), "track_set_key_transition", track, key, prev_val);
@@ -171,6 +173,7 @@ public:
 			return true;
 		}
 
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		switch (animation->track_get_type(track)) {
 			case Animation::TYPE_POSITION_3D:
 			case Animation::TYPE_ROTATION_3D:
@@ -685,7 +688,6 @@ public:
 		}
 	}
 
-	Ref<EditorUndoRedoManager> undo_redo;
 	Ref<Animation> animation;
 	int track = -1;
 	float key_ofs = 0;
@@ -810,6 +812,7 @@ public:
 
 					int existing = animation->track_find_key(track, new_time, true);
 
+					Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 					if (!setting) {
 						setting = true;
 						undo_redo->create_action(TTR("Anim Multi Change Keyframe Time"), UndoRedo::MERGE_ENDS);
@@ -834,6 +837,7 @@ public:
 					float val = p_value;
 					float prev_val = animation->track_get_key_transition(track, key);
 
+					Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 					if (!setting) {
 						setting = true;
 						undo_redo->create_action(TTR("Anim Multi Change Transition"), UndoRedo::MERGE_ENDS);
@@ -843,6 +847,7 @@ public:
 					update_obj = true;
 				}
 
+				Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 				switch (animation->track_get_type(track)) {
 					case Animation::TYPE_POSITION_3D:
 					case Animation::TYPE_ROTATION_3D:
@@ -1053,6 +1058,7 @@ public:
 			}
 		}
 
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		if (setting) {
 			if (update_obj) {
 				undo_redo->add_do_method(this, "_update_obj", animation);
@@ -1376,8 +1382,6 @@ public:
 
 	bool use_fps = false;
 
-	Ref<EditorUndoRedoManager> undo_redo;
-
 	void notify_change() {
 		notify_property_list_changed();
 	}
@@ -1419,6 +1423,7 @@ void AnimationTimelineEdit::_anim_length_changed(double p_new_len) {
 	}
 
 	editing = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Change Animation Length"));
 	undo_redo->add_do_method(animation.ptr(), "set_length", p_new_len);
 	undo_redo->add_undo_method(animation.ptr(), "set_length", animation->get_length());
@@ -1431,6 +1436,7 @@ void AnimationTimelineEdit::_anim_length_changed(double p_new_len) {
 
 void AnimationTimelineEdit::_anim_loop_pressed() {
 	if (!read_only) {
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		undo_redo->create_action(TTR("Change Animation Loop"));
 		switch (animation->get_loop_mode()) {
 			case Animation::LOOP_NONE: {
@@ -1712,10 +1718,6 @@ Size2 AnimationTimelineEdit::get_minimum_size() const {
 	return ms;
 }
 
-void AnimationTimelineEdit::set_undo_redo(Ref<EditorUndoRedoManager> p_undo_redo) {
-	undo_redo = p_undo_redo;
-}
-
 void AnimationTimelineEdit::set_zoom(Range *p_zoom) {
 	zoom = p_zoom;
 	zoom->connect("value_changed", callable_mp(this, &AnimationTimelineEdit::_zoom_changed));
@@ -2514,14 +2516,6 @@ Size2 AnimationTrackEdit::get_minimum_size() const {
 	return Vector2(1, max_h + separation);
 }
 
-void AnimationTrackEdit::set_undo_redo(Ref<EditorUndoRedoManager> p_undo_redo) {
-	undo_redo = p_undo_redo;
-}
-
-Ref<EditorUndoRedoManager> AnimationTrackEdit::get_undo_redo() const {
-	return undo_redo;
-}
-
 void AnimationTrackEdit::set_timeline(AnimationTimelineEdit *p_timeline) {
 	timeline = p_timeline;
 	timeline->set_track_edit(this);
@@ -2568,6 +2562,7 @@ void AnimationTrackEdit::_zoom_changed() {
 }
 
 void AnimationTrackEdit::_path_submitted(const String &p_text) {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Change Track Path"));
 	undo_redo->add_do_method(animation.ptr(), "track_set_path", track, p_text);
 	undo_redo->add_undo_method(animation.ptr(), "track_set_path", track, animation->track_get_path(track));
@@ -2805,6 +2800,7 @@ void AnimationTrackEdit::gui_input(const Ref<InputEvent> &p_event) {
 
 		if (!read_only) {
 			if (check_rect.has_point(pos)) {
+				Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 				undo_redo->create_action(TTR("Toggle Track Enabled"));
 				undo_redo->add_do_method(animation.ptr(), "track_set_enabled", track, !animation->track_is_enabled(track));
 				undo_redo->add_undo_method(animation.ptr(), "track_set_enabled", track, animation->track_is_enabled(track));
@@ -3196,6 +3192,7 @@ void AnimationTrackEdit::_menu_selected(int p_index) {
 		case MENU_CALL_MODE_TRIGGER:
 		case MENU_CALL_MODE_CAPTURE: {
 			Animation::UpdateMode update_mode = Animation::UpdateMode(p_index);
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Change Animation Update Mode"));
 			undo_redo->add_do_method(animation.ptr(), "value_track_set_update_mode", track, update_mode);
 			undo_redo->add_undo_method(animation.ptr(), "value_track_set_update_mode", track, animation->value_track_get_update_mode(track));
@@ -3209,6 +3206,7 @@ void AnimationTrackEdit::_menu_selected(int p_index) {
 		case MENU_INTERPOLATION_LINEAR_ANGLE:
 		case MENU_INTERPOLATION_CUBIC_ANGLE: {
 			Animation::InterpolationType interp_mode = Animation::InterpolationType(p_index - MENU_INTERPOLATION_NEAREST);
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Change Animation Interpolation Mode"));
 			undo_redo->add_do_method(animation.ptr(), "track_set_interpolation_type", track, interp_mode);
 			undo_redo->add_undo_method(animation.ptr(), "track_set_interpolation_type", track, animation->track_get_interpolation_type(track));
@@ -3218,6 +3216,7 @@ void AnimationTrackEdit::_menu_selected(int p_index) {
 		case MENU_LOOP_WRAP:
 		case MENU_LOOP_CLAMP: {
 			bool loop_wrap = p_index == MENU_LOOP_WRAP;
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Change Animation Loop Mode"));
 			undo_redo->add_do_method(animation.ptr(), "track_set_interpolation_loop_wrap", track, loop_wrap);
 			undo_redo->add_undo_method(animation.ptr(), "track_set_interpolation_loop_wrap", track, animation->track_get_interpolation_loop_wrap(track));
@@ -3457,7 +3456,7 @@ void AnimationTrackEditor::set_animation(const Ref<Animation> &p_anim, bool p_re
 	_update_tracks();
 
 	if (animation.is_valid()) {
-		animation->connect("changed", callable_mp(this, &AnimationTrackEditor::_animation_changed), CONNECT_DEFERRED);
+		animation->connect("changed", callable_mp(this, &AnimationTrackEditor::_animation_changed));
 
 		hscroll->show();
 		edit->set_disabled(read_only);
@@ -3611,6 +3610,7 @@ void AnimationTrackEditor::_animation_track_remove_request(int p_track, Ref<Anim
 	}
 	int idx = p_track;
 	if (idx >= 0 && idx < p_from_animation->get_track_count()) {
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		undo_redo->create_action(TTR("Remove Anim Track"), UndoRedo::MERGE_DISABLE, p_from_animation.ptr());
 
 		// Remove corresponding reset tracks if they are no longer needed.
@@ -3811,6 +3811,7 @@ void AnimationTrackEditor::_query_insert(const InsertData &p_id) {
 }
 
 void AnimationTrackEditor::_insert_track(bool p_reset_wanted, bool p_create_beziers) {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Anim Insert"));
 
 	Ref<Animation> reset_anim;
@@ -4141,6 +4142,7 @@ Ref<Animation> AnimationTrackEditor::_create_and_get_reset_animation() {
 		Ref<Animation> reset_anim;
 		reset_anim.instantiate();
 		reset_anim->set_length(ANIM_MIN_LENGTH);
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		undo_redo->add_do_method(al.ptr(), "add_animation", SceneStringNames::get_singleton()->RESET, reset_anim);
 		undo_redo->add_do_method(AnimationPlayerEditor::get_singleton(), "_animation_player_changed", player);
 		undo_redo->add_undo_method(al.ptr(), "remove_animation", SceneStringNames::get_singleton()->RESET);
@@ -4150,6 +4152,7 @@ Ref<Animation> AnimationTrackEditor::_create_and_get_reset_animation() {
 }
 
 void AnimationTrackEditor::_confirm_insert_list() {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Anim Create & Insert"));
 
 	bool create_reset = insert_confirm_reset->is_visible() && insert_confirm_reset->is_pressed();
@@ -4323,6 +4326,7 @@ AnimationTrackEditor::TrackIndices AnimationTrackEditor::_confirm_insert(InsertD
 		}
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	if (create_normal_track) {
 		if (p_create_beziers) {
 			bool valid;
@@ -4607,7 +4611,6 @@ void AnimationTrackEditor::_update_tracks() {
 			track_vbox->add_child(track_edit);
 		}
 
-		track_edit->set_undo_redo(undo_redo);
 		track_edit->set_timeline(timeline);
 		track_edit->set_root(root);
 		track_edit->set_animation_and_track(animation, i, file_read_only);
@@ -4654,19 +4657,19 @@ void AnimationTrackEditor::_animation_changed() {
 	}
 
 	if (key_edit) {
-		_update_key_edit();
-	}
-
-	if (key_edit && key_edit->setting) {
-		// If editing a key, just redraw the edited track, makes refresh less costly.
-		if (key_edit->track < track_edits.size()) {
-			if (animation->track_get_type(key_edit->track) == Animation::TYPE_BEZIER) {
-				bezier_edit->queue_redraw();
-			} else {
-				track_edits[key_edit->track]->queue_redraw();
+		if (key_edit->setting) {
+			// If editing a key, just redraw the edited track, makes refresh less costly.
+			if (key_edit->track < track_edits.size()) {
+				if (animation->track_get_type(key_edit->track) == Animation::TYPE_BEZIER) {
+					bezier_edit->queue_redraw();
+				} else {
+					track_edits[key_edit->track]->queue_redraw();
+				}
 			}
+			return;
+		} else {
+			_update_key_edit();
 		}
-		return;
 	}
 
 	animation_changing_awaiting_update = true;
@@ -4781,6 +4784,7 @@ void AnimationTrackEditor::_update_scroll(double) {
 }
 
 void AnimationTrackEditor::_update_step(double p_new_step) {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Change Animation Step"));
 	float step_value = p_new_step;
 	if (timeline->is_using_fps()) {
@@ -4807,6 +4811,7 @@ void AnimationTrackEditor::_dropped_track(int p_from_track, int p_to_track) {
 	}
 
 	_clear_selection(true);
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Rearrange Tracks"));
 	undo_redo->add_do_method(animation.ptr(), "track_move_to", p_from_track, p_to_track);
 	// Take into account that the position of the tracks that come after the one removed will change.
@@ -4850,6 +4855,7 @@ void AnimationTrackEditor::_new_track_node_selected(NodePath p_path) {
 		case Animation::TYPE_ROTATION_3D:
 		case Animation::TYPE_SCALE_3D:
 		case Animation::TYPE_METHOD: {
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Add Track"));
 			undo_redo->add_do_method(animation.ptr(), "add_track", adding_track_type);
 			undo_redo->add_do_method(animation.ptr(), "track_set_path", animation->get_track_count(), path_to);
@@ -4878,6 +4884,7 @@ void AnimationTrackEditor::_new_track_node_selected(NodePath p_path) {
 				return;
 			}
 
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Add Track"));
 			undo_redo->add_do_method(animation.ptr(), "add_track", adding_track_type);
 			undo_redo->add_do_method(animation.ptr(), "track_set_path", animation->get_track_count(), path_to);
@@ -4896,6 +4903,7 @@ void AnimationTrackEditor::_new_track_node_selected(NodePath p_path) {
 				return;
 			}
 
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Add Track"));
 			undo_redo->add_do_method(animation.ptr(), "add_track", adding_track_type);
 			undo_redo->add_do_method(animation.ptr(), "track_set_path", animation->get_track_count(), path_to);
@@ -4920,6 +4928,7 @@ void AnimationTrackEditor::_add_track(int p_type) {
 void AnimationTrackEditor::_new_track_property_selected(String p_name) {
 	String full_path = String(adding_track_path) + ":" + p_name;
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	if (adding_track_type == Animation::TYPE_VALUE) {
 		Animation::UpdateMode update_mode = Animation::UPDATE_DISCRETE;
 		{
@@ -5014,6 +5023,7 @@ void AnimationTrackEditor::_insert_key_from_track(float p_ofs, int p_track) {
 		p_ofs += 0.001;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	switch (animation->track_get_type(p_track)) {
 		case Animation::TYPE_POSITION_3D: {
 			if (!root->has_node(animation->track_get_path(p_track))) {
@@ -5169,6 +5179,7 @@ void AnimationTrackEditor::_add_method_key(const String &p_method) {
 			}
 			d["args"] = params;
 
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Add Method Track Key"));
 			undo_redo->add_do_method(animation.ptr(), "track_insert_key", insert_key_from_track_call_track, insert_key_from_track_call_ofs, d);
 			undo_redo->add_undo_method(animation.ptr(), "track_remove_key_at_time", insert_key_from_track_call_track, insert_key_from_track_call_ofs);
@@ -5278,13 +5289,17 @@ void AnimationTrackEditor::_update_key_edit() {
 		key_edit->track = selection.front()->key().track;
 		key_edit->use_fps = timeline->is_using_fps();
 
-		float ofs = animation->track_get_key_time(key_edit->track, selection.front()->key().key);
+		int key_id = selection.front()->key().key;
+		if (key_id >= animation->track_get_key_count(key_edit->track)) {
+			_clear_key_edit();
+			return; // Probably in the process of rearranging the keys.
+		}
+		float ofs = animation->track_get_key_time(key_edit->track, key_id);
 		key_edit->key_ofs = ofs;
 		key_edit->root_path = root;
 
 		NodePath np;
 		key_edit->hint = _find_hint_for_track(key_edit->track, np);
-		key_edit->undo_redo = undo_redo;
 		key_edit->base = np;
 
 		EditorNode::get_singleton()->push_item(key_edit);
@@ -5307,18 +5322,19 @@ void AnimationTrackEditor::_update_key_edit() {
 				base_map[track] = NodePath();
 			}
 
+			int key_id = E.key.key;
+			if (key_id >= animation->track_get_key_count(track)) {
+				_clear_key_edit();
+				return; // Probably in the process of rearranging the keys.
+			}
 			key_ofs_map[track].push_back(animation->track_get_key_time(track, E.key.key));
 		}
 		multi_key_edit->key_ofs_map = key_ofs_map;
 		multi_key_edit->base_map = base_map;
 		multi_key_edit->hint = _find_hint_for_track(first_track, base_map[first_track]);
-
 		multi_key_edit->use_fps = timeline->is_using_fps();
-
 		multi_key_edit->root_path = root;
 
-		multi_key_edit->undo_redo = undo_redo;
-
 		EditorNode::get_singleton()->push_item(multi_key_edit);
 	}
 }
@@ -5346,9 +5362,11 @@ void AnimationTrackEditor::_select_at_anim(const Ref<Animation> &p_anim, int p_t
 	ki.pos = p_pos;
 
 	selection.insert(sk, ki);
+	_update_key_edit();
 }
 
 void AnimationTrackEditor::_move_selection_commit() {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Anim Move Keys"));
 
 	List<_AnimMoveRestore> to_restore;
@@ -5421,7 +5439,6 @@ void AnimationTrackEditor::_move_selection_commit() {
 	undo_redo->add_do_method(this, "_redraw_tracks");
 	undo_redo->add_undo_method(this, "_redraw_tracks");
 	undo_redo->commit_action();
-	_update_key_edit();
 }
 
 void AnimationTrackEditor::_move_selection_cancel() {
@@ -5601,6 +5618,7 @@ void AnimationTrackEditor::_anim_duplicate_keys(bool transpose) {
 
 		int start_track = transpose ? _get_track_selected() : top_track;
 
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		undo_redo->create_action(TTR("Anim Duplicate Keys"));
 
 		List<Pair<int, float>> new_selection_values;
@@ -5651,7 +5669,6 @@ void AnimationTrackEditor::_anim_duplicate_keys(bool transpose) {
 		undo_redo->add_do_method(this, "_redraw_tracks");
 		undo_redo->add_undo_method(this, "_redraw_tracks");
 		undo_redo->commit_action();
-		_update_key_edit();
 	}
 }
 
@@ -5831,6 +5848,7 @@ void AnimationTrackEditor::_edit_menu_pressed(int p_option) {
 			}
 
 			int base_track = animation->get_track_count();
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Paste Tracks"));
 			for (int i = 0; i < track_clipboard.size(); i++) {
 				undo_redo->add_do_method(animation.ptr(), "add_track", track_clipboard[i].track_type);
@@ -5898,10 +5916,9 @@ void AnimationTrackEditor::_edit_menu_pressed(int p_option) {
 			}
 
 			float s = scale->get_value();
-			if (s == 0) {
-				ERR_PRINT("Can't scale to 0");
-			}
+			ERR_FAIL_COND_MSG(s == 0, "Can't scale to 0.");
 
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Anim Scale Keys"));
 
 			List<_AnimMoveRestore> to_restore;
@@ -5975,14 +5992,13 @@ void AnimationTrackEditor::_edit_menu_pressed(int p_option) {
 			undo_redo->add_do_method(this, "_redraw_tracks");
 			undo_redo->add_undo_method(this, "_redraw_tracks");
 			undo_redo->commit_action();
-			_update_key_edit();
-
 		} break;
 
 		case EDIT_EASE_SELECTION: {
 			ease_dialog->popup_centered(Size2(200, 100) * EDSCALE);
 		} break;
 		case EDIT_EASE_CONFIRM: {
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Make Easing Keys"));
 
 			Tween::TransitionType transition_type = static_cast<Tween::TransitionType>(transition_selection->get_selected_id());
@@ -6089,6 +6105,7 @@ void AnimationTrackEditor::_edit_menu_pressed(int p_option) {
 			_anim_duplicate_keys(true);
 		} break;
 		case EDIT_ADD_RESET_KEY: {
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Anim Add RESET Keys"));
 
 			Ref<Animation> reset = _create_and_get_reset_animation();
@@ -6149,6 +6166,7 @@ void AnimationTrackEditor::_edit_menu_pressed(int p_option) {
 			}
 
 			if (selection.size()) {
+				Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 				undo_redo->create_action(TTR("Anim Delete Keys"));
 
 				for (RBMap<SelectedKey, KeyInfo>::Element *E = selection.back(); E; E = E->prev()) {
@@ -6179,6 +6197,7 @@ void AnimationTrackEditor::_edit_menu_pressed(int p_option) {
 			bake_dialog->popup_centered(Size2(200, 100) * EDSCALE);
 		} break;
 		case EDIT_BAKE_ANIMATION_CONFIRM: {
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Bake Animation as Linear keys."));
 
 			int track_len = animation->get_track_count();
@@ -6299,6 +6318,7 @@ void AnimationTrackEditor::_edit_menu_pressed(int p_option) {
 			animation->optimize(optimize_velocity_error->get_value(), optimize_angular_error->get_value(), optimize_precision_error->get_value());
 			_redraw_tracks();
 			_update_key_edit();
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->clear_history(true, undo_redo->get_history_id_for_object(animation.ptr()));
 			undo_redo->clear_history(true, undo_redo->get_history_id_for_object(this));
 
@@ -6368,6 +6388,7 @@ void AnimationTrackEditor::_cleanup_animation(Ref<Animation> p_animation) {
 		}
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->clear_history(true, undo_redo->get_history_id_for_object(animation.ptr()));
 	undo_redo->clear_history(true, undo_redo->get_history_id_for_object(this));
 	_update_tracks();
@@ -6533,8 +6554,6 @@ void AnimationTrackEditor::_pick_track_filter_input(const Ref<InputEvent> &p_ie)
 }
 
 AnimationTrackEditor::AnimationTrackEditor() {
-	undo_redo = EditorNode::get_undo_redo();
-
 	main_panel = memnew(PanelContainer);
 	main_panel->set_focus_mode(FOCUS_ALL); // Allow panel to have focus so that shortcuts work as expected.
 	add_child(main_panel);
@@ -6559,7 +6578,6 @@ AnimationTrackEditor::AnimationTrackEditor() {
 	main_panel->add_child(info_message);
 
 	timeline = memnew(AnimationTimelineEdit);
-	timeline->set_undo_redo(undo_redo);
 	timeline_vbox->add_child(timeline);
 	timeline->connect("timeline_changed", callable_mp(this, &AnimationTrackEditor::_timeline_changed));
 	timeline->connect("name_limit_changed", callable_mp(this, &AnimationTrackEditor::_name_limit_changed));
@@ -6582,7 +6600,6 @@ AnimationTrackEditor::AnimationTrackEditor() {
 
 	bezier_edit = memnew(AnimationBezierTrackEdit);
 	timeline_vbox->add_child(bezier_edit);
-	bezier_edit->set_undo_redo(undo_redo);
 	bezier_edit->set_editor(this);
 	bezier_edit->set_timeline(timeline);
 	bezier_edit->hide();
diff --git a/editor/animation_track_editor.h b/editor/animation_track_editor.h
index db2f8b32dc..dc0c4abe5f 100644
--- a/editor/animation_track_editor.h
+++ b/editor/animation_track_editor.h
@@ -79,7 +79,6 @@ class AnimationTimelineEdit : public Range {
 	void _anim_loop_pressed();
 
 	void _play_position_draw();
-	Ref<EditorUndoRedoManager> undo_redo;
 	Rect2 hsize_rect;
 
 	bool editing = false;
@@ -113,7 +112,6 @@ public:
 	void set_track_edit(AnimationTrackEdit *p_track_edit);
 	void set_zoom(Range *p_zoom);
 	Range *get_zoom() const { return zoom; }
-	void set_undo_redo(Ref<EditorUndoRedoManager> p_undo_redo);
 
 	void set_play_position(float p_pos);
 	float get_play_position() const;
@@ -156,7 +154,6 @@ class AnimationTrackEdit : public Control {
 	};
 
 	AnimationTimelineEdit *timeline = nullptr;
-	Ref<EditorUndoRedoManager> undo_redo;
 	Popup *path_popup = nullptr;
 	LineEdit *path = nullptr;
 	Node *root = nullptr;
@@ -237,12 +234,10 @@ public:
 	Ref<Animation> get_animation() const;
 	AnimationTimelineEdit *get_timeline() const { return timeline; }
 	AnimationTrackEditor *get_editor() const { return editor; }
-	Ref<EditorUndoRedoManager> get_undo_redo() const;
 	NodePath get_path() const;
 	void set_animation_and_track(const Ref<Animation> &p_animation, int p_track, bool p_read_only);
 	virtual Size2 get_minimum_size() const override;
 
-	void set_undo_redo(Ref<EditorUndoRedoManager> p_undo_redo);
 	void set_timeline(AnimationTimelineEdit *p_timeline);
 	void set_editor(AnimationTrackEditor *p_editor);
 	void set_root(Node *p_root);
@@ -339,8 +334,6 @@ class AnimationTrackEditor : public VBoxContainer {
 	void _animation_track_remove_request(int p_track, Ref<Animation> p_from_animation);
 	void _track_grab_focus(int p_track);
 
-	Ref<EditorUndoRedoManager> undo_redo;
-
 	void _update_scroll(double);
 	void _update_step(double p_new_step);
 	void _update_length(double p_new_len);
@@ -434,7 +427,6 @@ class AnimationTrackEditor : public VBoxContainer {
 	AnimationTrackKeyEdit *key_edit = nullptr;
 	AnimationMultiTrackKeyEdit *multi_key_edit = nullptr;
 	void _update_key_edit();
-
 	void _clear_key_edit();
 
 	Control *box_selection = nullptr;
diff --git a/editor/animation_track_editor_plugins.cpp b/editor/animation_track_editor_plugins.cpp
index 971b671a0c..a5f6f449a6 100644
--- a/editor/animation_track_editor_plugins.cpp
+++ b/editor/animation_track_editor_plugins.cpp
@@ -31,6 +31,7 @@
 #include "animation_track_editor_plugins.h"
 
 #include "editor/audio_stream_preview.h"
+#include "editor/editor_node.h"
 #include "editor/editor_resource_preview.h"
 #include "editor/editor_scale.h"
 #include "editor/editor_undo_redo_manager.h"
@@ -1021,10 +1022,11 @@ void AnimationTrackEditTypeAudio::drop_data(const Point2 &p_point, const Variant
 				ofs += 0.001;
 			}
 
-			get_undo_redo()->create_action(TTR("Add Audio Track Clip"));
-			get_undo_redo()->add_do_method(get_animation().ptr(), "audio_track_insert_key", get_track(), ofs, stream);
-			get_undo_redo()->add_undo_method(get_animation().ptr(), "track_remove_key_at_time", get_track(), ofs);
-			get_undo_redo()->commit_action();
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
+			undo_redo->create_action(TTR("Add Audio Track Clip"));
+			undo_redo->add_do_method(get_animation().ptr(), "audio_track_insert_key", get_track(), ofs, stream);
+			undo_redo->add_undo_method(get_animation().ptr(), "track_remove_key_at_time", get_track(), ofs);
+			undo_redo->commit_action();
 
 			queue_redraw();
 			return;
@@ -1102,21 +1104,22 @@ void AnimationTrackEditTypeAudio::gui_input(const Ref<InputEvent> &p_event) {
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	if (len_resizing && mb.is_valid() && !mb->is_pressed() && mb->get_button_index() == MouseButton::LEFT) {
 		float ofs_local = -len_resizing_rel / get_timeline()->get_zoom_scale();
 		if (len_resizing_start) {
 			float prev_ofs = get_animation()->audio_track_get_key_start_offset(get_track(), len_resizing_index);
-			get_undo_redo()->create_action(TTR("Change Audio Track Clip Start Offset"));
-			get_undo_redo()->add_do_method(get_animation().ptr(), "audio_track_set_key_start_offset", get_track(), len_resizing_index, prev_ofs + ofs_local);
-			get_undo_redo()->add_undo_method(get_animation().ptr(), "audio_track_set_key_start_offset", get_track(), len_resizing_index, prev_ofs);
-			get_undo_redo()->commit_action();
+			undo_redo->create_action(TTR("Change Audio Track Clip Start Offset"));
+			undo_redo->add_do_method(get_animation().ptr(), "audio_track_set_key_start_offset", get_track(), len_resizing_index, prev_ofs + ofs_local);
+			undo_redo->add_undo_method(get_animation().ptr(), "audio_track_set_key_start_offset", get_track(), len_resizing_index, prev_ofs);
+			undo_redo->commit_action();
 
 		} else {
 			float prev_ofs = get_animation()->audio_track_get_key_end_offset(get_track(), len_resizing_index);
-			get_undo_redo()->create_action(TTR("Change Audio Track Clip End Offset"));
-			get_undo_redo()->add_do_method(get_animation().ptr(), "audio_track_set_key_end_offset", get_track(), len_resizing_index, prev_ofs + ofs_local);
-			get_undo_redo()->add_undo_method(get_animation().ptr(), "audio_track_set_key_end_offset", get_track(), len_resizing_index, prev_ofs);
-			get_undo_redo()->commit_action();
+			undo_redo->create_action(TTR("Change Audio Track Clip End Offset"));
+			undo_redo->add_do_method(get_animation().ptr(), "audio_track_set_key_end_offset", get_track(), len_resizing_index, prev_ofs + ofs_local);
+			undo_redo->add_undo_method(get_animation().ptr(), "audio_track_set_key_end_offset", get_track(), len_resizing_index, prev_ofs);
+			undo_redo->commit_action();
 		}
 
 		len_resizing_index = -1;
diff --git a/editor/debugger/editor_debugger_node.cpp b/editor/debugger/editor_debugger_node.cpp
index 150257a95c..1ec94aeae4 100644
--- a/editor/debugger/editor_debugger_node.cpp
+++ b/editor/debugger/editor_debugger_node.cpp
@@ -207,9 +207,32 @@ String EditorDebuggerNode::get_server_uri() const {
 	return server->get_uri();
 }
 
+void EditorDebuggerNode::set_keep_open(bool p_keep_open) {
+	keep_open = p_keep_open;
+	if (keep_open) {
+		if (server.is_null() || !server->is_active()) {
+			start();
+		}
+	} else {
+		bool found = false;
+		_for_all(tabs, [&](ScriptEditorDebugger *p_debugger) {
+			if (p_debugger->is_session_active()) {
+				found = true;
+			}
+		});
+		if (!found) {
+			stop();
+		}
+	}
+}
+
 Error EditorDebuggerNode::start(const String &p_uri) {
-	stop();
 	ERR_FAIL_COND_V(p_uri.find("://") < 0, ERR_INVALID_PARAMETER);
+	if (keep_open && current_uri == p_uri && server.is_valid()) {
+		return OK;
+	}
+	stop(true);
+	current_uri = p_uri;
 	if (EDITOR_GET("run/output/always_open_output_on_play")) {
 		EditorNode::get_singleton()->make_bottom_panel_item_visible(EditorNode::get_log());
 	} else {
@@ -225,7 +248,11 @@ Error EditorDebuggerNode::start(const String &p_uri) {
 	return OK;
 }
 
-void EditorDebuggerNode::stop() {
+void EditorDebuggerNode::stop(bool p_force) {
+	if (keep_open && !p_force) {
+		return;
+	}
+	current_uri.clear();
 	if (server.is_valid()) {
 		server->stop();
 		EditorNode::get_log()->add_message("--- Debugging process stopped ---", EditorLog::MSG_TYPE_EDITOR);
@@ -244,11 +271,6 @@ void EditorDebuggerNode::stop() {
 		}
 	});
 	_break_state_changed();
-	if (hide_on_stop) {
-		if (is_visible_in_tree()) {
-			EditorNode::get_singleton()->hide_bottom_panel();
-		}
-	}
 	breakpoints.clear();
 	set_process(false);
 }
@@ -428,7 +450,6 @@ void EditorDebuggerNode::set_script_debug_button(MenuButton *p_button) {
 	p->add_shortcut(ED_GET_SHORTCUT("debugger/break"), DEBUG_BREAK);
 	p->add_shortcut(ED_GET_SHORTCUT("debugger/continue"), DEBUG_CONTINUE);
 	p->add_separator();
-	p->add_check_shortcut(ED_GET_SHORTCUT("debugger/keep_debugger_open"), DEBUG_KEEP_DEBUGGER_OPEN);
 	p->add_check_shortcut(ED_GET_SHORTCUT("debugger/debug_with_external_editor"), DEBUG_WITH_EXTERNAL_EDITOR);
 	p->connect("id_pressed", callable_mp(this, &EditorDebuggerNode::_menu_option));
 
@@ -468,12 +489,6 @@ void EditorDebuggerNode::_menu_option(int p_id) {
 		case DEBUG_CONTINUE: {
 			debug_continue();
 		} break;
-		case DEBUG_KEEP_DEBUGGER_OPEN: {
-			bool ischecked = script_menu->get_popup()->is_item_checked(script_menu->get_popup()->get_item_index(DEBUG_KEEP_DEBUGGER_OPEN));
-			hide_on_stop = ischecked;
-			script_menu->get_popup()->set_item_checked(script_menu->get_popup()->get_item_index(DEBUG_KEEP_DEBUGGER_OPEN), !ischecked);
-			EditorSettings::get_singleton()->set_project_metadata("debug_options", "keep_debugger_open", !ischecked);
-		} break;
 		case DEBUG_WITH_EXTERNAL_EDITOR: {
 			bool ischecked = script_menu->get_popup()->is_item_checked(script_menu->get_popup()->get_item_index(DEBUG_WITH_EXTERNAL_EDITOR));
 			debug_with_external_editor = !ischecked;
@@ -484,9 +499,6 @@ void EditorDebuggerNode::_menu_option(int p_id) {
 }
 
 void EditorDebuggerNode::_update_debug_options() {
-	if (EditorSettings::get_singleton()->get_project_metadata("debug_options", "keep_debugger_open", false).operator bool()) {
-		_menu_option(DEBUG_KEEP_DEBUGGER_OPEN);
-	}
 	if (EditorSettings::get_singleton()->get_project_metadata("debug_options", "debug_with_external_editor", false).operator bool()) {
 		_menu_option(DEBUG_WITH_EXTERNAL_EDITOR);
 	}
diff --git a/editor/debugger/editor_debugger_node.h b/editor/debugger/editor_debugger_node.h
index 7f7279ae74..030f4c7d12 100644
--- a/editor/debugger/editor_debugger_node.h
+++ b/editor/debugger/editor_debugger_node.h
@@ -63,7 +63,6 @@ private:
 		DEBUG_STEP,
 		DEBUG_BREAK,
 		DEBUG_CONTINUE,
-		DEBUG_KEEP_DEBUGGER_OPEN,
 		DEBUG_WITH_EXTERNAL_EDITOR,
 	};
 
@@ -110,7 +109,9 @@ private:
 	float remote_scene_tree_timeout = 0.0;
 	bool auto_switch_remote_scene_tree = false;
 	bool debug_with_external_editor = false;
-	bool hide_on_stop = true;
+	bool keep_open = false;
+	String current_uri;
+
 	CameraOverride camera_override = OVERRIDE_NONE;
 	HashMap<Breakpoint, bool, Breakpoint> breakpoints;
 
@@ -203,8 +204,9 @@ public:
 
 	String get_server_uri() const;
 
+	void set_keep_open(bool p_keep_open);
 	Error start(const String &p_uri = "tcp://");
-	void stop();
+	void stop(bool p_force = false);
 
 	bool plugins_capture(ScriptEditorDebugger *p_debugger, const String &p_message, const Array &p_data);
 	void add_debugger_plugin(const Ref<EditorDebuggerPlugin> &p_plugin);
diff --git a/editor/debugger/script_editor_debugger.cpp b/editor/debugger/script_editor_debugger.cpp
index 178010d852..d419e16b12 100644
--- a/editor/debugger/script_editor_debugger.cpp
+++ b/editor/debugger/script_editor_debugger.cpp
@@ -1656,6 +1656,8 @@ ScriptEditorDebugger::ScriptEditorDebugger() {
 
 	add_child(tabs);
 
+	InspectorDock::get_inspector_singleton()->connect("object_id_selected", callable_mp(this, &ScriptEditorDebugger::_remote_object_selected));
+
 	{ //debugger
 		VBoxContainer *vbc = memnew(VBoxContainer);
 		vbc->set_name(TTR("Debugger"));
diff --git a/editor/editor_inspector.cpp b/editor/editor_inspector.cpp
index 6e18bde303..4822aab7d4 100644
--- a/editor/editor_inspector.cpp
+++ b/editor/editor_inspector.cpp
@@ -3114,11 +3114,24 @@ void EditorInspector::update_tree() {
 			// Build the doc hint, to use as tooltip.
 
 			// Get the class name.
-			StringName classname = doc_name == "" ? object->get_class_name() : doc_name;
+			StringName classname = doc_name;
 			if (!object_class.is_empty()) {
 				classname = object_class;
 			} else if (Object::cast_to<MultiNodeEdit>(object)) {
 				classname = Object::cast_to<MultiNodeEdit>(object)->get_edited_class_name();
+			} else if (classname == "") {
+				classname = object->get_class_name();
+				Resource *res = Object::cast_to<Resource>(object);
+				if (res && !res->get_script().is_null()) {
+					// Grab the script of this resource to get the evaluated script class.
+					Ref<Script> scr = res->get_script();
+					if (scr.is_valid()) {
+						Vector<DocData::ClassDoc> docs = scr->get_documentation();
+						if (!docs.is_empty()) {
+							classname = docs[0].name;
+						}
+					}
+				}
 			}
 
 			StringName propname = property_prefix + p.name;
diff --git a/editor/editor_node.cpp b/editor/editor_node.cpp
index 995cfb138c..2f6481704a 100644
--- a/editor/editor_node.cpp
+++ b/editor/editor_node.cpp
@@ -4883,7 +4883,7 @@ void EditorNode::_load_docks_from_config(Ref<ConfigFile> p_layout, const String
 
 		Vector<String> names = String(p_layout->get_value(p_section, "dock_" + itos(i + 1))).split(",");
 
-		for (int j = 0; j < names.size(); j++) {
+		for (int j = names.size() - 1; j >= 0; j--) {
 			String name = names[j];
 			// FIXME: Find it, in a horribly inefficient way.
 			int atidx = -1;
@@ -4904,7 +4904,7 @@ void EditorNode::_load_docks_from_config(Ref<ConfigFile> p_layout, const String
 			}
 
 			if (atidx == i) {
-				node->move_to_front();
+				dock_slot[i]->move_child(node, 0);
 				continue;
 			}
 
@@ -4914,6 +4914,7 @@ void EditorNode::_load_docks_from_config(Ref<ConfigFile> p_layout, const String
 				dock_slot[atidx]->hide();
 			}
 			dock_slot[i]->add_child(node);
+			dock_slot[i]->move_child(node, 0);
 			dock_slot[i]->show();
 		}
 	}
@@ -7036,7 +7037,7 @@ EditorNode::EditorNode() {
 	// Dock numbers are based on DockSlot enum value + 1.
 	default_layout->set_value(docks_section, "dock_3", "Scene,Import");
 	default_layout->set_value(docks_section, "dock_4", "FileSystem");
-	default_layout->set_value(docks_section, "dock_5", "Inspector,Node");
+	default_layout->set_value(docks_section, "dock_5", "Inspector,Node,History");
 
 	for (int i = 0; i < vsplits.size(); i++) {
 		default_layout->set_value(docks_section, "dock_split_" + itos(i + 1), 0);
diff --git a/editor/editor_property_name_processor.cpp b/editor/editor_property_name_processor.cpp
index a2dfa4f80e..9587e65cad 100644
--- a/editor/editor_property_name_processor.cpp
+++ b/editor/editor_property_name_processor.cpp
@@ -64,6 +64,10 @@ String EditorPropertyNameProcessor::_capitalize_name(const String &p_name) const
 
 	Vector<String> parts = p_name.split("_", false);
 	for (int i = 0; i < parts.size(); i++) {
+		// Articles/conjunctions/prepositions which should only be capitalized when not at beginning and end.
+		if (i > 0 && i + 1 < parts.size() && stop_words.find(parts[i]) != -1) {
+			continue;
+		}
 		HashMap<String, String>::ConstIterator remap = capitalize_string_remaps.find(parts[i]);
 		if (remap) {
 			parts.write[i] = remap->value;
@@ -143,6 +147,7 @@ EditorPropertyNameProcessor::EditorPropertyNameProcessor() {
 	capitalize_string_remaps["gdscript"] = "GDScript";
 	capitalize_string_remaps["ggx"] = "GGX";
 	capitalize_string_remaps["gi"] = "GI";
+	capitalize_string_remaps["gl"] = "GL";
 	capitalize_string_remaps["glb"] = "GLB";
 	capitalize_string_remaps["gles2"] = "GLES2";
 	capitalize_string_remaps["gles3"] = "GLES3";
@@ -157,6 +162,7 @@ EditorPropertyNameProcessor::EditorPropertyNameProcessor() {
 	capitalize_string_remaps["html"] = "HTML";
 	capitalize_string_remaps["http"] = "HTTP";
 	capitalize_string_remaps["id"] = "ID";
+	capitalize_string_remaps["ids"] = "IDs";
 	capitalize_string_remaps["igd"] = "IGD";
 	capitalize_string_remaps["ik"] = "IK";
 	capitalize_string_remaps["image@2x"] = "Image @2x";
@@ -223,6 +229,7 @@ EditorPropertyNameProcessor::EditorPropertyNameProcessor() {
 	capitalize_string_remaps["svg"] = "SVG";
 	capitalize_string_remaps["taa"] = "TAA";
 	capitalize_string_remaps["tcp"] = "TCP";
+	capitalize_string_remaps["tls"] = "TLS";
 	capitalize_string_remaps["ui"] = "UI";
 	capitalize_string_remaps["url"] = "URL";
 	capitalize_string_remaps["urls"] = "URLs";
@@ -237,6 +244,7 @@ EditorPropertyNameProcessor::EditorPropertyNameProcessor() {
 	capitalize_string_remaps["vector2"] = "Vector2";
 	capitalize_string_remaps["vpn"] = "VPN";
 	capitalize_string_remaps["vram"] = "VRAM";
+	capitalize_string_remaps["vrs"] = "VRS";
 	capitalize_string_remaps["vsync"] = "V-Sync";
 	capitalize_string_remaps["wap"] = "WAP";
 	capitalize_string_remaps["webp"] = "WebP";
@@ -246,9 +254,33 @@ EditorPropertyNameProcessor::EditorPropertyNameProcessor() {
 	capitalize_string_remaps["wifi"] = "Wi-Fi";
 	capitalize_string_remaps["x86"] = "x86";
 	capitalize_string_remaps["xr"] = "XR";
+	capitalize_string_remaps["xray"] = "X-Ray";
 	capitalize_string_remaps["xy"] = "XY";
 	capitalize_string_remaps["xz"] = "XZ";
 	capitalize_string_remaps["yz"] = "YZ";
+
+	// Articles, conjunctions, prepositions.
+	// The following initialization is parsed in `editor/translations/extract.py` with a regex.
+	// The word definition format should be kept synced with the regex.
+	stop_words = LocalVector<String>({
+			"a",
+			"an",
+			"and",
+			"as",
+			"at",
+			"by",
+			"for",
+			"in",
+			"not",
+			"of",
+			"on",
+			"or",
+			"over",
+			"per",
+			"the",
+			"then",
+			"to",
+	});
 }
 
 EditorPropertyNameProcessor::~EditorPropertyNameProcessor() {
diff --git a/editor/editor_property_name_processor.h b/editor/editor_property_name_processor.h
index 37d905c806..fcabbfd9d3 100644
--- a/editor/editor_property_name_processor.h
+++ b/editor/editor_property_name_processor.h
@@ -40,6 +40,7 @@ class EditorPropertyNameProcessor : public Node {
 
 	mutable HashMap<String, String> capitalize_string_cache;
 	HashMap<String, String> capitalize_string_remaps;
+	LocalVector<String> stop_words; // Exceptions that shouldn't be capitalized.
 
 	// Capitalizes property path segments.
 	String _capitalize_name(const String &p_name) const;
diff --git a/editor/find_in_files.cpp b/editor/find_in_files.cpp
index b7605d8e2b..666444eaf9 100644
--- a/editor/find_in_files.cpp
+++ b/editor/find_in_files.cpp
@@ -606,6 +606,7 @@ FindInFilesPanel::FindInFilesPanel() {
 	_results_display->set_hide_root(true);
 	_results_display->set_select_mode(Tree::SELECT_ROW);
 	_results_display->set_allow_rmb_select(true);
+	_results_display->set_allow_reselect(true);
 	_results_display->create_item(); // Root
 	vbc->add_child(_results_display);
 
@@ -717,6 +718,10 @@ void FindInFilesPanel::_on_result_found(String fpath, int line_number, int begin
 		file_item = E->value;
 	}
 
+	Color file_item_color = _results_display->get_theme_color(SNAME("font_color")) * Color(1, 1, 1, 0.67);
+	file_item->set_custom_color(0, file_item_color);
+	file_item->set_selectable(0, false);
+
 	int text_index = _with_replace ? 1 : 0;
 
 	TreeItem *item = _results_display->create_item(file_item);
@@ -763,13 +768,13 @@ void FindInFilesPanel::draw_result_text(Object *item_obj, Rect2 rect) {
 	int font_size = _results_display->get_theme_font_size(SNAME("font_size"));
 
 	Rect2 match_rect = rect;
-	match_rect.position.x += font->get_string_size(item_text.left(r.begin_trimmed), HORIZONTAL_ALIGNMENT_LEFT, -1, font_size).x;
-	match_rect.size.x = font->get_string_size(_search_text_label->get_text(), HORIZONTAL_ALIGNMENT_LEFT, -1, font_size).x;
+	match_rect.position.x += font->get_string_size(item_text.left(r.begin_trimmed), HORIZONTAL_ALIGNMENT_LEFT, -1, font_size).x - 1;
+	match_rect.size.x = font->get_string_size(_search_text_label->get_text(), HORIZONTAL_ALIGNMENT_LEFT, -1, font_size).x + 2;
 	match_rect.position.y += 1 * EDSCALE;
 	match_rect.size.y -= 2 * EDSCALE;
 
-	// Use the inverted accent color to help match rectangles stand out even on the currently selected line.
-	_results_display->draw_rect(match_rect, get_theme_color(SNAME("accent_color"), SNAME("Editor")).inverted() * Color(1, 1, 1, 0.5));
+	_results_display->draw_rect(match_rect, get_theme_color(SNAME("accent_color"), SNAME("Editor")) * Color(1, 1, 1, 0.33), false, 2.0);
+	_results_display->draw_rect(match_rect, get_theme_color(SNAME("accent_color"), SNAME("Editor")) * Color(1, 1, 1, 0.17), true);
 
 	// Text is drawn by Tree already.
 }
@@ -777,14 +782,12 @@ void FindInFilesPanel::draw_result_text(Object *item_obj, Rect2 rect) {
 void FindInFilesPanel::_on_item_edited() {
 	TreeItem *item = _results_display->get_selected();
 
-	if (item->is_checked(0)) {
-		item->set_custom_color(1, _results_display->get_theme_color(SNAME("font_color")));
-	} else {
-		// Grey out.
-		Color color = _results_display->get_theme_color(SNAME("font_color"));
-		color.a /= 2.0;
-		item->set_custom_color(1, color);
+	// Change opacity to half if checkbox is checked, otherwise full.
+	Color use_color = _results_display->get_theme_color(SNAME("font_color"));
+	if (!item->is_checked(0)) {
+		use_color.a *= 0.5;
 	}
+	item->set_custom_color(1, use_color);
 }
 
 void FindInFilesPanel::_on_finished() {
@@ -793,11 +796,11 @@ void FindInFilesPanel::_on_finished() {
 	int file_count = _file_items.size();
 
 	if (result_count == 1 && file_count == 1) {
-		results_text = vformat(TTR("%d match in %d file."), result_count, file_count);
+		results_text = vformat(TTR("%d match in %d file"), result_count, file_count);
 	} else if (result_count != 1 && file_count == 1) {
-		results_text = vformat(TTR("%d matches in %d file."), result_count, file_count);
+		results_text = vformat(TTR("%d matches in %d file"), result_count, file_count);
 	} else {
-		results_text = vformat(TTR("%d matches in %d files."), result_count, file_count);
+		results_text = vformat(TTR("%d matches in %d files"), result_count, file_count);
 	}
 
 	_status_label->set_text(results_text);
diff --git a/editor/history_dock.cpp b/editor/history_dock.cpp
index 47b7e9f5d7..93599eff56 100644
--- a/editor/history_dock.cpp
+++ b/editor/history_dock.cpp
@@ -206,7 +206,7 @@ void HistoryDock::seek_history(int p_index) {
 
 void HistoryDock::_notification(int p_notification) {
 	switch (p_notification) {
-		case NOTIFICATION_ENTER_TREE: {
+		case NOTIFICATION_READY: {
 			EditorNode::get_singleton()->connect("scene_changed", callable_mp(this, &HistoryDock::on_history_changed));
 		} break;
 
diff --git a/editor/plugins/animation_blend_space_1d_editor.cpp b/editor/plugins/animation_blend_space_1d_editor.cpp
index d11217cc5e..03c801c42b 100644
--- a/editor/plugins/animation_blend_space_1d_editor.cpp
+++ b/editor/plugins/animation_blend_space_1d_editor.cpp
@@ -153,6 +153,7 @@ void AnimationNodeBlendSpace1DEditor::_blend_space_gui_input(const Ref<InputEven
 				}
 
 				updating = true;
+				Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 				undo_redo->create_action(TTR("Move Node Point"));
 				undo_redo->add_do_method(blend_space.ptr(), "set_blend_point_position", selected_point, point);
 				undo_redo->add_undo_method(blend_space.ptr(), "set_blend_point_position", selected_point, blend_space->get_blend_point_position(selected_point));
@@ -341,6 +342,7 @@ void AnimationNodeBlendSpace1DEditor::_config_changed(double) {
 	}
 
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Change BlendSpace1D Config"));
 	undo_redo->add_do_method(blend_space.ptr(), "set_max_space", max_value->get_value());
 	undo_redo->add_undo_method(blend_space.ptr(), "set_max_space", blend_space->get_max_space());
@@ -364,6 +366,7 @@ void AnimationNodeBlendSpace1DEditor::_labels_changed(String) {
 	}
 
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Change BlendSpace1D Labels"), UndoRedo::MERGE_ENDS);
 	undo_redo->add_do_method(blend_space.ptr(), "set_value_label", label_value->get_text());
 	undo_redo->add_undo_method(blend_space.ptr(), "set_value_label", blend_space->get_value_label());
@@ -419,6 +422,7 @@ void AnimationNodeBlendSpace1DEditor::_add_menu_type(int p_index) {
 	}
 
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Add Node Point"));
 	undo_redo->add_do_method(blend_space.ptr(), "add_blend_point", node, add_point_pos);
 	undo_redo->add_undo_method(blend_space.ptr(), "remove_blend_point", blend_space->get_blend_point_count());
@@ -437,6 +441,7 @@ void AnimationNodeBlendSpace1DEditor::_add_animation_type(int p_index) {
 	anim->set_animation(animations_to_add[p_index]);
 
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Add Animation Point"));
 	undo_redo->add_do_method(blend_space.ptr(), "add_blend_point", anim, add_point_pos);
 	undo_redo->add_undo_method(blend_space.ptr(), "remove_blend_point", blend_space->get_blend_point_count());
@@ -510,6 +515,7 @@ void AnimationNodeBlendSpace1DEditor::_erase_selected() {
 	if (selected_point != -1) {
 		updating = true;
 
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		undo_redo->create_action(TTR("Remove BlendSpace1D Point"));
 		undo_redo->add_do_method(blend_space.ptr(), "remove_blend_point", selected_point);
 		undo_redo->add_undo_method(blend_space.ptr(), "add_blend_point", blend_space->get_blend_point_node(selected_point), blend_space->get_blend_point_position(selected_point), selected_point);
@@ -529,6 +535,7 @@ void AnimationNodeBlendSpace1DEditor::_edit_point_pos(double) {
 	}
 
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Move BlendSpace1D Node Point"));
 	undo_redo->add_do_method(blend_space.ptr(), "set_blend_point_position", selected_point, edit_value->get_value());
 	undo_redo->add_undo_method(blend_space.ptr(), "set_blend_point_position", selected_point, blend_space->get_blend_point_position(selected_point));
@@ -762,8 +769,6 @@ AnimationNodeBlendSpace1DEditor::AnimationNodeBlendSpace1DEditor() {
 	error_panel->add_child(error_label);
 	error_label->set_text("hmmm");
 
-	undo_redo = EditorNode::get_undo_redo();
-
 	menu = memnew(PopupMenu);
 	add_child(menu);
 	menu->connect("id_pressed", callable_mp(this, &AnimationNodeBlendSpace1DEditor::_add_menu_type));
@@ -778,7 +783,6 @@ AnimationNodeBlendSpace1DEditor::AnimationNodeBlendSpace1DEditor() {
 	open_file->set_title(TTR("Open Animation Node"));
 	open_file->set_file_mode(EditorFileDialog::FILE_MODE_OPEN_FILE);
 	open_file->connect("file_selected", callable_mp(this, &AnimationNodeBlendSpace1DEditor::_file_opened));
-	undo_redo = EditorNode::get_undo_redo();
 
 	set_custom_minimum_size(Size2(0, 150 * EDSCALE));
 }
diff --git a/editor/plugins/animation_blend_space_1d_editor.h b/editor/plugins/animation_blend_space_1d_editor.h
index 54aa227c96..ec07678b27 100644
--- a/editor/plugins/animation_blend_space_1d_editor.h
+++ b/editor/plugins/animation_blend_space_1d_editor.h
@@ -80,8 +80,6 @@ class AnimationNodeBlendSpace1DEditor : public AnimationTreeNodeEditorPlugin {
 
 	bool updating = false;
 
-	Ref<EditorUndoRedoManager> undo_redo;
-
 	static AnimationNodeBlendSpace1DEditor *singleton;
 
 	void _blend_space_gui_input(const Ref<InputEvent> &p_event);
diff --git a/editor/plugins/animation_blend_space_2d_editor.cpp b/editor/plugins/animation_blend_space_2d_editor.cpp
index 0adac598a6..f27fd0cded 100644
--- a/editor/plugins/animation_blend_space_2d_editor.cpp
+++ b/editor/plugins/animation_blend_space_2d_editor.cpp
@@ -222,6 +222,7 @@ void AnimationNodeBlendSpace2DEditor::_blend_space_gui_input(const Ref<InputEven
 					}
 
 					updating = true;
+					Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 					undo_redo->create_action(TTR("Add Triangle"));
 					undo_redo->add_do_method(blend_space.ptr(), "add_triangle", making_triangle[0], making_triangle[1], making_triangle[2]);
 					undo_redo->add_undo_method(blend_space.ptr(), "remove_triangle", blend_space->get_triangle_count());
@@ -247,6 +248,7 @@ void AnimationNodeBlendSpace2DEditor::_blend_space_gui_input(const Ref<InputEven
 
 			if (!read_only) {
 				updating = true;
+				Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 				undo_redo->create_action(TTR("Move Node Point"));
 				undo_redo->add_do_method(blend_space.ptr(), "set_blend_point_position", selected_point, point);
 				undo_redo->add_undo_method(blend_space.ptr(), "set_blend_point_position", selected_point, blend_space->get_blend_point_position(selected_point));
@@ -354,6 +356,7 @@ void AnimationNodeBlendSpace2DEditor::_add_menu_type(int p_index) {
 	}
 
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Add Node Point"));
 	undo_redo->add_do_method(blend_space.ptr(), "add_blend_point", node, add_point_pos);
 	undo_redo->add_undo_method(blend_space.ptr(), "remove_blend_point", blend_space->get_blend_point_count());
@@ -372,6 +375,7 @@ void AnimationNodeBlendSpace2DEditor::_add_animation_type(int p_index) {
 	anim->set_animation(animations_to_add[p_index]);
 
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Add Animation Point"));
 	undo_redo->add_do_method(blend_space.ptr(), "add_blend_point", anim, add_point_pos);
 	undo_redo->add_undo_method(blend_space.ptr(), "remove_blend_point", blend_space->get_blend_point_count());
@@ -661,6 +665,7 @@ void AnimationNodeBlendSpace2DEditor::_config_changed(double) {
 	}
 
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Change BlendSpace2D Config"));
 	undo_redo->add_do_method(blend_space.ptr(), "set_max_space", Vector2(max_x_value->get_value(), max_y_value->get_value()));
 	undo_redo->add_undo_method(blend_space.ptr(), "set_max_space", blend_space->get_max_space());
@@ -686,6 +691,7 @@ void AnimationNodeBlendSpace2DEditor::_labels_changed(String) {
 	}
 
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Change BlendSpace2D Labels"), UndoRedo::MERGE_ENDS);
 	undo_redo->add_do_method(blend_space.ptr(), "set_x_label", label_x->get_text());
 	undo_redo->add_undo_method(blend_space.ptr(), "set_x_label", blend_space->get_x_label());
@@ -698,6 +704,7 @@ void AnimationNodeBlendSpace2DEditor::_labels_changed(String) {
 }
 
 void AnimationNodeBlendSpace2DEditor::_erase_selected() {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	if (selected_point != -1) {
 		updating = true;
 		undo_redo->create_action(TTR("Remove BlendSpace2D Point"));
@@ -760,6 +767,7 @@ void AnimationNodeBlendSpace2DEditor::_edit_point_pos(double) {
 		return;
 	}
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Move Node Point"));
 	undo_redo->add_do_method(blend_space.ptr(), "set_blend_point_position", selected_point, Vector2(edit_x->get_value(), edit_y->get_value()));
 	undo_redo->add_undo_method(blend_space.ptr(), "set_blend_point_position", selected_point, blend_space->get_blend_point_position(selected_point));
@@ -836,6 +844,7 @@ void AnimationNodeBlendSpace2DEditor::_removed_from_graph() {
 }
 
 void AnimationNodeBlendSpace2DEditor::_auto_triangles_toggled() {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Toggle Auto Triangles"));
 	undo_redo->add_do_method(blend_space.ptr(), "set_auto_triangles", auto_triangles->is_pressed());
 	undo_redo->add_undo_method(blend_space.ptr(), "set_auto_triangles", blend_space->get_auto_triangles());
@@ -1059,8 +1068,6 @@ AnimationNodeBlendSpace2DEditor::AnimationNodeBlendSpace2DEditor() {
 	error_panel->add_child(error_label);
 	error_label->set_text("eh");
 
-	undo_redo = EditorNode::get_undo_redo();
-
 	set_custom_minimum_size(Size2(0, 300 * EDSCALE));
 
 	menu = memnew(PopupMenu);
@@ -1077,7 +1084,6 @@ AnimationNodeBlendSpace2DEditor::AnimationNodeBlendSpace2DEditor() {
 	open_file->set_title(TTR("Open Animation Node"));
 	open_file->set_file_mode(EditorFileDialog::FILE_MODE_OPEN_FILE);
 	open_file->connect("file_selected", callable_mp(this, &AnimationNodeBlendSpace2DEditor::_file_opened));
-	undo_redo = EditorNode::get_undo_redo();
 
 	selected_point = -1;
 	selected_triangle = -1;
diff --git a/editor/plugins/animation_blend_space_2d_editor.h b/editor/plugins/animation_blend_space_2d_editor.h
index e4512b78a3..60873e5473 100644
--- a/editor/plugins/animation_blend_space_2d_editor.h
+++ b/editor/plugins/animation_blend_space_2d_editor.h
@@ -44,8 +44,6 @@ class CheckBox;
 class OptionButton;
 class PanelContainer;
 
-class EditorUndoRedoManager;
-
 class AnimationNodeBlendSpace2DEditor : public AnimationTreeNodeEditorPlugin {
 	GDCLASS(AnimationNodeBlendSpace2DEditor, AnimationTreeNodeEditorPlugin);
 
@@ -89,8 +87,6 @@ class AnimationNodeBlendSpace2DEditor : public AnimationTreeNodeEditorPlugin {
 
 	bool updating;
 
-	Ref<EditorUndoRedoManager> undo_redo;
-
 	static AnimationNodeBlendSpace2DEditor *singleton;
 
 	void _blend_space_gui_input(const Ref<InputEvent> &p_event);
diff --git a/editor/plugins/animation_blend_tree_editor_plugin.cpp b/editor/plugins/animation_blend_tree_editor_plugin.cpp
index 1a4774b98b..9da6e15e75 100644
--- a/editor/plugins/animation_blend_tree_editor_plugin.cpp
+++ b/editor/plugins/animation_blend_tree_editor_plugin.cpp
@@ -98,6 +98,7 @@ Size2 AnimationNodeBlendTreeEditor::get_minimum_size() const {
 void AnimationNodeBlendTreeEditor::_property_changed(const StringName &p_property, const Variant &p_value, const String &p_field, bool p_changing) {
 	AnimationTree *tree = AnimationTreeEditor::get_singleton()->get_animation_tree();
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Parameter Changed:") + " " + String(p_property), UndoRedo::MERGE_ENDS);
 	undo_redo->add_do_property(tree, p_property, p_value);
 	undo_redo->add_undo_property(tree, p_property, tree->get(p_property));
@@ -353,6 +354,7 @@ void AnimationNodeBlendTreeEditor::_add_node(int p_idx) {
 		name = base_name + " " + itos(base);
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Add Node to BlendTree"));
 	undo_redo->add_do_method(blend_tree.ptr(), "add_node", name, anode, instance_pos / EDSCALE);
 	undo_redo->add_undo_method(blend_tree.ptr(), "remove_node", name);
@@ -416,6 +418,7 @@ void AnimationNodeBlendTreeEditor::_connection_from_empty(const String &p_to, in
 
 void AnimationNodeBlendTreeEditor::_node_dragged(const Vector2 &p_from, const Vector2 &p_to, const StringName &p_which) {
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Node Moved"));
 	undo_redo->add_do_method(blend_tree.ptr(), "set_node_position", p_which, p_to / EDSCALE);
 	undo_redo->add_undo_method(blend_tree.ptr(), "set_node_position", p_which, p_from / EDSCALE);
@@ -437,6 +440,7 @@ void AnimationNodeBlendTreeEditor::_connection_request(const String &p_from, int
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Nodes Connected"));
 	undo_redo->add_do_method(blend_tree.ptr(), "connect_node", p_to, p_to_index, p_from);
 	undo_redo->add_undo_method(blend_tree.ptr(), "disconnect_node", p_to, p_to_index);
@@ -453,6 +457,7 @@ void AnimationNodeBlendTreeEditor::_disconnection_request(const String &p_from,
 	graph->disconnect_node(p_from, p_from_index, p_to, p_to_index);
 
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Nodes Disconnected"));
 	undo_redo->add_do_method(blend_tree.ptr(), "disconnect_node", p_to, p_to_index);
 	undo_redo->add_undo_method(blend_tree.ptr(), "connect_node", p_to, p_to_index, p_from);
@@ -468,6 +473,7 @@ void AnimationNodeBlendTreeEditor::_anim_selected(int p_index, Array p_options,
 	Ref<AnimationNodeAnimation> anim = blend_tree->get_node(p_node);
 	ERR_FAIL_COND(!anim.is_valid());
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Set Animation"));
 	undo_redo->add_do_method(anim.ptr(), "set_animation", option);
 	undo_redo->add_undo_method(anim.ptr(), "set_animation", anim->get_animation());
@@ -481,6 +487,7 @@ void AnimationNodeBlendTreeEditor::_delete_request(const String &p_which) {
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Delete Node"));
 	undo_redo->add_do_method(blend_tree.ptr(), "remove_node", p_which);
 	undo_redo->add_undo_method(blend_tree.ptr(), "add_node", p_which, blend_tree->get_node(p_which), blend_tree.ptr()->get_node_position(p_which));
@@ -525,6 +532,7 @@ void AnimationNodeBlendTreeEditor::_delete_nodes_request(const TypedArray<String
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Delete Node(s)"));
 
 	for (const StringName &F : to_erase) {
@@ -558,6 +566,7 @@ void AnimationNodeBlendTreeEditor::_open_in_editor(const String &p_which) {
 
 void AnimationNodeBlendTreeEditor::_filter_toggled() {
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Toggle Filter On/Off"));
 	undo_redo->add_do_method(_filter_edit.ptr(), "set_filter_enabled", filter_enabled->is_pressed());
 	undo_redo->add_undo_method(_filter_edit.ptr(), "set_filter_enabled", _filter_edit->is_filter_enabled());
@@ -575,6 +584,7 @@ void AnimationNodeBlendTreeEditor::_filter_edited() {
 	bool filtered = edited->is_checked(0);
 
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Change Filter"));
 	undo_redo->add_do_method(_filter_edit.ptr(), "set_filter_path", edited_path, filtered);
 	undo_redo->add_undo_method(_filter_edit.ptr(), "set_filter_path", edited_path, _filter_edit->is_path_filtered(edited_path));
@@ -949,6 +959,7 @@ void AnimationNodeBlendTreeEditor::_node_renamed(const String &p_text, Ref<Anima
 	String base_path = AnimationTreeEditor::get_singleton()->get_base_path();
 
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Node Renamed"));
 	undo_redo->add_do_method(blend_tree.ptr(), "rename_node", prev_name, name);
 	undo_redo->add_undo_method(blend_tree.ptr(), "rename_node", name, prev_name);
@@ -1117,5 +1128,4 @@ AnimationNodeBlendTreeEditor::AnimationNodeBlendTreeEditor() {
 	open_file->set_title(TTR("Open Animation Node"));
 	open_file->set_file_mode(EditorFileDialog::FILE_MODE_OPEN_FILE);
 	open_file->connect("file_selected", callable_mp(this, &AnimationNodeBlendTreeEditor::_file_opened));
-	undo_redo = EditorNode::get_undo_redo();
 }
diff --git a/editor/plugins/animation_blend_tree_editor_plugin.h b/editor/plugins/animation_blend_tree_editor_plugin.h
index 112c824d8e..fb19cce147 100644
--- a/editor/plugins/animation_blend_tree_editor_plugin.h
+++ b/editor/plugins/animation_blend_tree_editor_plugin.h
@@ -44,7 +44,6 @@ class CheckBox;
 class ProgressBar;
 class EditorFileDialog;
 class EditorProperty;
-class EditorUndoRedoManager;
 class MenuButton;
 class PanelContainer;
 
@@ -63,8 +62,6 @@ class AnimationNodeBlendTreeEditor : public AnimationTreeNodeEditorPlugin {
 	PanelContainer *error_panel = nullptr;
 	Label *error_label = nullptr;
 
-	Ref<EditorUndoRedoManager> undo_redo;
-
 	AcceptDialog *filter_dialog = nullptr;
 	Tree *filters = nullptr;
 	CheckBox *filter_enabled = nullptr;
diff --git a/editor/plugins/animation_library_editor.cpp b/editor/plugins/animation_library_editor.cpp
index 2d20c0cca7..e377366c0d 100644
--- a/editor/plugins/animation_library_editor.cpp
+++ b/editor/plugins/animation_library_editor.cpp
@@ -93,7 +93,7 @@ void AnimationLibraryEditor::_add_library_validate(const String &p_name) {
 void AnimationLibraryEditor::_add_library_confirm() {
 	if (adding_animation) {
 		String anim_name = add_library_name->get_text();
-		Ref<EditorUndoRedoManager> undo_redo = EditorNode::get_singleton()->get_undo_redo();
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_singleton()->get_undo_redo();
 
 		Ref<AnimationLibrary> al = player->call("get_animation_library", adding_animation_to_library);
 		ERR_FAIL_COND(!al.is_valid());
@@ -110,7 +110,7 @@ void AnimationLibraryEditor::_add_library_confirm() {
 
 	} else {
 		String lib_name = add_library_name->get_text();
-		Ref<EditorUndoRedoManager> undo_redo = EditorNode::get_singleton()->get_undo_redo();
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_singleton()->get_undo_redo();
 
 		Ref<AnimationLibrary> al;
 		al.instantiate();
@@ -210,7 +210,7 @@ void AnimationLibraryEditor::_file_popup_selected(int p_id) {
 				ald->add_animation(animation_name, animation);
 			}
 
-			Ref<EditorUndoRedoManager> undo_redo = EditorNode::get_singleton()->get_undo_redo();
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_singleton()->get_undo_redo();
 			undo_redo->create_action(vformat(TTR("Make Animation Library Unique: %s"), lib_name));
 			undo_redo->add_do_method(player, "remove_animation_library", lib_name);
 			undo_redo->add_do_method(player, "add_animation_library", lib_name, ald);
@@ -279,7 +279,7 @@ void AnimationLibraryEditor::_file_popup_selected(int p_id) {
 
 			Ref<Animation> animd = anim->duplicate();
 
-			Ref<EditorUndoRedoManager> undo_redo = EditorNode::get_singleton()->get_undo_redo();
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_singleton()->get_undo_redo();
 			undo_redo->create_action(vformat(TTR("Make Animation Unique: %s"), anim_name));
 			undo_redo->add_do_method(al.ptr(), "remove_animation", anim_name);
 			undo_redo->add_do_method(al.ptr(), "add_animation", anim_name, animd);
@@ -327,7 +327,7 @@ void AnimationLibraryEditor::_load_file(String p_path) {
 				name = p_path.get_file().get_basename() + " " + itos(attempt);
 			}
 
-			Ref<EditorUndoRedoManager> undo_redo = EditorNode::get_singleton()->get_undo_redo();
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_singleton()->get_undo_redo();
 
 			undo_redo->create_action(vformat(TTR("Add Animation Library: %s"), name));
 			undo_redo->add_do_method(player, "add_animation_library", name, al);
@@ -365,7 +365,7 @@ void AnimationLibraryEditor::_load_file(String p_path) {
 				name = p_path.get_file().get_basename() + " " + itos(attempt);
 			}
 
-			Ref<EditorUndoRedoManager> undo_redo = EditorNode::get_singleton()->get_undo_redo();
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_singleton()->get_undo_redo();
 
 			undo_redo->create_action(vformat(TTR("Load Animation into Library: %s"), name));
 			undo_redo->add_do_method(al.ptr(), "add_animation", name, anim);
@@ -381,7 +381,7 @@ void AnimationLibraryEditor::_load_file(String p_path) {
 			EditorNode::get_singleton()->save_resource_in_path(al, p_path);
 
 			if (al->get_path() != prev_path) { // Save successful.
-				Ref<EditorUndoRedoManager> undo_redo = EditorNode::get_singleton()->get_undo_redo();
+				Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_singleton()->get_undo_redo();
 
 				undo_redo->create_action(vformat(TTR("Save Animation library to File: %s"), file_dialog_library));
 				undo_redo->add_do_method(al.ptr(), "set_path", al->get_path());
@@ -402,7 +402,7 @@ void AnimationLibraryEditor::_load_file(String p_path) {
 			String prev_path = anim->get_path();
 			EditorNode::get_singleton()->save_resource_in_path(anim, p_path);
 			if (anim->get_path() != prev_path) { // Save successful.
-				Ref<EditorUndoRedoManager> undo_redo = EditorNode::get_singleton()->get_undo_redo();
+				Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_singleton()->get_undo_redo();
 
 				undo_redo->create_action(vformat(TTR("Save Animation to File: %s"), file_dialog_animation));
 				undo_redo->add_do_method(anim.ptr(), "set_path", anim->get_path());
@@ -420,7 +420,7 @@ void AnimationLibraryEditor::_item_renamed() {
 	String text = ti->get_text(0);
 	String old_text = ti->get_metadata(0);
 	bool restore_text = false;
-	Ref<EditorUndoRedoManager> undo_redo = EditorNode::get_singleton()->get_undo_redo();
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_singleton()->get_undo_redo();
 
 	if (String(text).contains("/") || String(text).contains(":") || String(text).contains(",") || String(text).contains("[")) {
 		restore_text = true;
@@ -534,7 +534,7 @@ void AnimationLibraryEditor::_button_pressed(TreeItem *p_item, int p_column, int
 					name = base_name + " (" + itos(attempt) + ")";
 				}
 
-				Ref<EditorUndoRedoManager> undo_redo = EditorNode::get_singleton()->get_undo_redo();
+				Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_singleton()->get_undo_redo();
 
 				undo_redo->create_action(vformat(TTR("Add Animation to Library: %s"), name));
 				undo_redo->add_do_method(al.ptr(), "add_animation", name, anim);
@@ -560,7 +560,7 @@ void AnimationLibraryEditor::_button_pressed(TreeItem *p_item, int p_column, int
 				file_dialog_library = lib_name;
 			} break;
 			case LIB_BUTTON_DELETE: {
-				Ref<EditorUndoRedoManager> undo_redo = EditorNode::get_singleton()->get_undo_redo();
+				Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_singleton()->get_undo_redo();
 				undo_redo->create_action(vformat(TTR("Remove Animation Library: %s"), lib_name));
 				undo_redo->add_do_method(player, "remove_animation_library", lib_name);
 				undo_redo->add_undo_method(player, "add_animation_library", lib_name, al);
@@ -601,7 +601,7 @@ void AnimationLibraryEditor::_button_pressed(TreeItem *p_item, int p_column, int
 
 			} break;
 			case ANIM_BUTTON_DELETE: {
-				Ref<EditorUndoRedoManager> undo_redo = EditorNode::get_singleton()->get_undo_redo();
+				Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_singleton()->get_undo_redo();
 				undo_redo->create_action(vformat(TTR("Remove Animation from Library: %s"), anim_name));
 				undo_redo->add_do_method(al.ptr(), "remove_animation", anim_name);
 				undo_redo->add_undo_method(al.ptr(), "add_animation", anim_name, anim);
diff --git a/editor/plugins/animation_state_machine_editor.cpp b/editor/plugins/animation_state_machine_editor.cpp
index 060e9d0d10..bd34d3808d 100644
--- a/editor/plugins/animation_state_machine_editor.cpp
+++ b/editor/plugins/animation_state_machine_editor.cpp
@@ -238,6 +238,7 @@ void AnimationNodeStateMachineEditor::_state_machine_gui_input(const Ref<InputEv
 			Ref<AnimationNode> an = state_machine->get_node(selected_node);
 			updating = true;
 
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Move Node"));
 
 			for (int i = 0; i < node_rects.size(); i++) {
@@ -534,6 +535,7 @@ void AnimationNodeStateMachineEditor::_group_selected_nodes() {
 		}
 
 		updating = true;
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		undo_redo->create_action("Group");
 
 		// Move selected nodes to the new state machine
@@ -648,6 +650,7 @@ void AnimationNodeStateMachineEditor::_ungroup_selected_nodes() {
 			Vector<TransitionUR> transitions_ur;
 
 			updating = true;
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action("Ungroup");
 
 			// Move all child nodes to current state machine
@@ -921,6 +924,7 @@ void AnimationNodeStateMachineEditor::_stop_connecting() {
 
 void AnimationNodeStateMachineEditor::_delete_selected() {
 	TreeItem *item = delete_tree->get_next_selected(nullptr);
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	while (item) {
 		if (!updating) {
 			updating = true;
@@ -948,6 +952,7 @@ void AnimationNodeStateMachineEditor::_delete_all() {
 	selected_multi_transition = TransitionLine();
 
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action("Transition(s) Removed");
 	_erase_selected(true);
 	for (int i = 0; i < multi_transitions.size(); i++) {
@@ -1027,6 +1032,7 @@ void AnimationNodeStateMachineEditor::_add_menu_type(int p_index) {
 	}
 
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Add Node and Transition"));
 	undo_redo->add_do_method(state_machine.ptr(), "add_node", name, node, add_node_pos);
 	undo_redo->add_undo_method(state_machine.ptr(), "remove_node", name);
@@ -1053,6 +1059,7 @@ void AnimationNodeStateMachineEditor::_add_animation_type(int p_index) {
 	}
 
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Add Node and Transition"));
 	undo_redo->add_do_method(state_machine.ptr(), "add_node", name, anim, add_node_pos);
 	undo_redo->add_undo_method(state_machine.ptr(), "remove_node", name);
@@ -1081,6 +1088,7 @@ void AnimationNodeStateMachineEditor::_add_transition(const bool p_nested_action
 		tr.instantiate();
 		tr->set_switch_mode(AnimationNodeStateMachineTransition::SwitchMode(transition_mode->get_selected()));
 
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		if (!p_nested_action) {
 			updating = true;
 			undo_redo->create_action(TTR("Add Transition"));
@@ -1745,6 +1753,7 @@ void AnimationNodeStateMachineEditor::_name_edited(const String &p_text) {
 	}
 
 	updating = true;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Node Renamed"));
 	undo_redo->add_do_method(state_machine.ptr(), "rename_node", prev_name, name);
 	undo_redo->add_undo_method(state_machine.ptr(), "rename_node", name, prev_name);
@@ -1779,6 +1788,7 @@ void AnimationNodeStateMachineEditor::_erase_selected(const bool p_nested_action
 		if (!p_nested_action) {
 			updating = true;
 		}
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		undo_redo->create_action(TTR("Node Removed"));
 
 		for (int i = 0; i < node_rects.size(); i++) {
@@ -1847,6 +1857,7 @@ void AnimationNodeStateMachineEditor::_erase_selected(const bool p_nested_action
 		if (!p_nested_action) {
 			updating = true;
 		}
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		undo_redo->create_action(TTR("Transition Removed"));
 		undo_redo->add_do_method(state_machine.ptr(), "remove_transition", selected_transition_from, selected_transition_to);
 		undo_redo->add_undo_method(state_machine.ptr(), "add_transition", selected_transition_from, selected_transition_to, tr);
@@ -2013,8 +2024,6 @@ AnimationNodeStateMachineEditor::AnimationNodeStateMachineEditor() {
 	error_panel->add_child(error_label);
 	error_panel->hide();
 
-	undo_redo = EditorNode::get_undo_redo();
-
 	set_custom_minimum_size(Size2(0, 300 * EDSCALE));
 
 	menu = memnew(PopupMenu);
@@ -2055,7 +2064,6 @@ AnimationNodeStateMachineEditor::AnimationNodeStateMachineEditor() {
 	open_file->set_title(TTR("Open Animation Node"));
 	open_file->set_file_mode(EditorFileDialog::FILE_MODE_OPEN_FILE);
 	open_file->connect("file_selected", callable_mp(this, &AnimationNodeStateMachineEditor::_file_opened));
-	undo_redo = EditorNode::get_undo_redo();
 
 	delete_window = memnew(ConfirmationDialog);
 	delete_window->set_flag(Window::FLAG_RESIZE_DISABLED, true);
diff --git a/editor/plugins/animation_state_machine_editor.h b/editor/plugins/animation_state_machine_editor.h
index 180f238834..5edf803c41 100644
--- a/editor/plugins/animation_state_machine_editor.h
+++ b/editor/plugins/animation_state_machine_editor.h
@@ -39,7 +39,6 @@
 
 class ConfirmationDialog;
 class EditorFileDialog;
-class EditorUndoRedoManager;
 class OptionButton;
 class PanelContainer;
 
@@ -80,8 +79,6 @@ class AnimationNodeStateMachineEditor : public AnimationTreeNodeEditorPlugin {
 
 	bool updating = false;
 
-	Ref<EditorUndoRedoManager> undo_redo;
-
 	static AnimationNodeStateMachineEditor *singleton;
 
 	void _state_machine_gui_input(const Ref<InputEvent> &p_event);
diff --git a/editor/plugins/cast_2d_editor_plugin.cpp b/editor/plugins/cast_2d_editor_plugin.cpp
index a8d255f997..d991cdf27f 100644
--- a/editor/plugins/cast_2d_editor_plugin.cpp
+++ b/editor/plugins/cast_2d_editor_plugin.cpp
@@ -77,6 +77,7 @@ bool Cast2DEditor::forward_canvas_gui_input(const Ref<InputEvent> &p_event) {
 				return false;
 			}
 		} else if (pressed) {
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Set target_position"));
 			undo_redo->add_do_property(node, "target_position", target_position);
 			undo_redo->add_do_method(canvas_item_editor, "update_viewport");
@@ -130,10 +131,6 @@ void Cast2DEditor::edit(Node2D *p_node) {
 	canvas_item_editor->update_viewport();
 }
 
-Cast2DEditor::Cast2DEditor() {
-	undo_redo = EditorNode::get_singleton()->get_undo_redo();
-}
-
 ///////////////////////
 
 void Cast2DEditorPlugin::edit(Object *p_object) {
diff --git a/editor/plugins/cast_2d_editor_plugin.h b/editor/plugins/cast_2d_editor_plugin.h
index ceed9b9111..1165a301f6 100644
--- a/editor/plugins/cast_2d_editor_plugin.h
+++ b/editor/plugins/cast_2d_editor_plugin.h
@@ -35,12 +35,10 @@
 #include "scene/2d/node_2d.h"
 
 class CanvasItemEditor;
-class EditorUndoRedoManager;
 
 class Cast2DEditor : public Control {
 	GDCLASS(Cast2DEditor, Control);
 
-	Ref<EditorUndoRedoManager> undo_redo;
 	CanvasItemEditor *canvas_item_editor = nullptr;
 	Node2D *node = nullptr;
 
@@ -55,8 +53,6 @@ public:
 	bool forward_canvas_gui_input(const Ref<InputEvent> &p_event);
 	void forward_canvas_draw_over_viewport(Control *p_overlay);
 	void edit(Node2D *p_node);
-
-	Cast2DEditor();
 };
 
 class Cast2DEditorPlugin : public EditorPlugin {
diff --git a/editor/plugins/collision_shape_2d_editor_plugin.cpp b/editor/plugins/collision_shape_2d_editor_plugin.cpp
index 11992ad10e..a7f842aa66 100644
--- a/editor/plugins/collision_shape_2d_editor_plugin.cpp
+++ b/editor/plugins/collision_shape_2d_editor_plugin.cpp
@@ -219,6 +219,7 @@ void CollisionShape2DEditor::set_handle(int idx, Point2 &p_point) {
 }
 
 void CollisionShape2DEditor::commit_handle(int idx, Variant &p_org) {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Set Handle"));
 
 	switch (shape_type) {
@@ -588,8 +589,6 @@ CollisionShape2DEditor::CollisionShape2DEditor() {
 	node = nullptr;
 	canvas_item_editor = nullptr;
 
-	undo_redo = EditorNode::get_singleton()->get_undo_redo();
-
 	edit_handle = -1;
 	pressed = false;
 
diff --git a/editor/plugins/collision_shape_2d_editor_plugin.h b/editor/plugins/collision_shape_2d_editor_plugin.h
index 49e0820ae9..51cdab7396 100644
--- a/editor/plugins/collision_shape_2d_editor_plugin.h
+++ b/editor/plugins/collision_shape_2d_editor_plugin.h
@@ -35,7 +35,6 @@
 #include "scene/2d/collision_shape_2d.h"
 
 class CanvasItemEditor;
-class EditorUndoRedoManager;
 
 class CollisionShape2DEditor : public Control {
 	GDCLASS(CollisionShape2DEditor, Control);
@@ -62,7 +61,6 @@ class CollisionShape2DEditor : public Control {
 		Point2(1, -1),
 	};
 
-	Ref<EditorUndoRedoManager> undo_redo;
 	CanvasItemEditor *canvas_item_editor = nullptr;
 	CollisionShape2D *node = nullptr;
 
diff --git a/editor/plugins/control_editor_plugin.cpp b/editor/plugins/control_editor_plugin.cpp
index 00b7845cee..c18876b9ef 100644
--- a/editor/plugins/control_editor_plugin.cpp
+++ b/editor/plugins/control_editor_plugin.cpp
@@ -721,6 +721,7 @@ void ControlEditorToolbar::_anchors_preset_selected(int p_preset) {
 	LayoutPreset preset = (LayoutPreset)p_preset;
 	List<Node *> selection = editor_selection->get_selected_node_list();
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Change Anchors, Offsets, Grow Direction"));
 
 	for (Node *E : selection) {
@@ -740,6 +741,7 @@ void ControlEditorToolbar::_anchors_preset_selected(int p_preset) {
 void ControlEditorToolbar::_anchors_to_current_ratio() {
 	List<Node *> selection = editor_selection->get_selected_node_list();
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Change Anchors, Offsets (Keep Ratio)"));
 
 	for (Node *E : selection) {
@@ -790,6 +792,7 @@ void ControlEditorToolbar::_anchor_mode_toggled(bool p_status) {
 void ControlEditorToolbar::_container_flags_selected(int p_flags, bool p_vertical) {
 	List<Node *> selection = editor_selection->get_selected_node_list();
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	if (p_vertical) {
 		undo_redo->create_action(TTR("Change Vertical Size Flags"));
 	} else {
@@ -1025,7 +1028,6 @@ ControlEditorToolbar::ControlEditorToolbar() {
 	container_v_picker->connect("size_flags_selected", callable_mp(this, &ControlEditorToolbar::_container_flags_selected).bind(true));
 
 	// Editor connections.
-	undo_redo = EditorNode::get_singleton()->get_undo_redo();
 	editor_selection = EditorNode::get_singleton()->get_editor_selection();
 	editor_selection->add_editor_plugin(this);
 	editor_selection->connect("selection_changed", callable_mp(this, &ControlEditorToolbar::_selection_changed));
diff --git a/editor/plugins/control_editor_plugin.h b/editor/plugins/control_editor_plugin.h
index 14886e77a4..cf2c6f4e20 100644
--- a/editor/plugins/control_editor_plugin.h
+++ b/editor/plugins/control_editor_plugin.h
@@ -45,7 +45,6 @@
 #include "scene/gui/separator.h"
 #include "scene/gui/texture_rect.h"
 
-class EditorUndoRedoManager;
 class GridContainer;
 
 // Inspector controls.
@@ -207,7 +206,6 @@ public:
 class ControlEditorToolbar : public HBoxContainer {
 	GDCLASS(ControlEditorToolbar, HBoxContainer);
 
-	Ref<EditorUndoRedoManager> undo_redo;
 	EditorSelection *editor_selection = nullptr;
 
 	ControlEditorPopupButton *anchors_button = nullptr;
diff --git a/editor/plugins/cpu_particles_2d_editor_plugin.cpp b/editor/plugins/cpu_particles_2d_editor_plugin.cpp
index a0e6771768..891a22cc71 100644
--- a/editor/plugins/cpu_particles_2d_editor_plugin.cpp
+++ b/editor/plugins/cpu_particles_2d_editor_plugin.cpp
@@ -34,7 +34,6 @@
 #include "core/io/image_loader.h"
 #include "editor/editor_file_dialog.h"
 #include "editor/editor_node.h"
-#include "editor/editor_undo_redo_manager.h"
 #include "scene/2d/cpu_particles_2d.h"
 #include "scene/gui/separator.h"
 #include "scene/resources/particle_process_material.h"
@@ -239,7 +238,6 @@ void CPUParticles2DEditorPlugin::_bind_methods() {
 
 CPUParticles2DEditorPlugin::CPUParticles2DEditorPlugin() {
 	particles = nullptr;
-	undo_redo = EditorNode::get_singleton()->get_undo_redo();
 
 	toolbar = memnew(HBoxContainer);
 	add_control_to_container(CONTAINER_CANVAS_EDITOR_MENU, toolbar);
diff --git a/editor/plugins/cpu_particles_2d_editor_plugin.h b/editor/plugins/cpu_particles_2d_editor_plugin.h
index 694162588b..1fc9ed763c 100644
--- a/editor/plugins/cpu_particles_2d_editor_plugin.h
+++ b/editor/plugins/cpu_particles_2d_editor_plugin.h
@@ -40,7 +40,6 @@ class CheckBox;
 class ConfirmationDialog;
 class SpinBox;
 class EditorFileDialog;
-class EditorUndoRedoManager;
 class MenuButton;
 class OptionButton;
 
@@ -74,7 +73,6 @@ class CPUParticles2DEditorPlugin : public EditorPlugin {
 
 	String source_emission_file;
 
-	Ref<EditorUndoRedoManager> undo_redo;
 	void _file_selected(const String &p_file);
 	void _menu_callback(int p_idx);
 	void _generate_emission_mask();
diff --git a/editor/plugins/debugger_editor_plugin.cpp b/editor/plugins/debugger_editor_plugin.cpp
index dd6187c264..7d04880fb7 100644
--- a/editor/plugins/debugger_editor_plugin.cpp
+++ b/editor/plugins/debugger_editor_plugin.cpp
@@ -47,7 +47,6 @@ DebuggerEditorPlugin::DebuggerEditorPlugin(PopupMenu *p_debug_menu) {
 	ED_SHORTCUT("debugger/step_over", TTR("Step Over"), Key::F10);
 	ED_SHORTCUT("debugger/break", TTR("Break"));
 	ED_SHORTCUT("debugger/continue", TTR("Continue"), Key::F12);
-	ED_SHORTCUT("debugger/keep_debugger_open", TTR("Keep Debugger Open"));
 	ED_SHORTCUT("debugger/debug_with_external_editor", TTR("Debug with External Editor"));
 
 	// File Server for deploy with remote filesystem.
@@ -85,6 +84,9 @@ DebuggerEditorPlugin::DebuggerEditorPlugin(PopupMenu *p_debug_menu) {
 	debug_menu->add_check_shortcut(ED_SHORTCUT("editor/sync_script_changes", TTR("Synchronize Script Changes")), RUN_RELOAD_SCRIPTS);
 	debug_menu->set_item_tooltip(-1,
 			TTR("When this option is enabled, any script that is saved will be reloaded in the running project.\nWhen used remotely on a device, this is more efficient when the network filesystem option is enabled."));
+	debug_menu->add_check_shortcut(ED_SHORTCUT("editor/keep_server_open", TTR("Keep Debug Server Open")), SERVER_KEEP_OPEN);
+	debug_menu->set_item_tooltip(-1,
+			TTR("When this option is enabled, the editor debug server will stay open and listen for new sessions started outside of the editor itself."));
 
 	// Multi-instance, start/stop
 	instances_menu = memnew(PopupMenu);
@@ -176,6 +178,14 @@ void DebuggerEditorPlugin::_menu_option(int p_option) {
 			EditorSettings::get_singleton()->set_project_metadata("debug_options", "run_reload_scripts", !ischecked);
 
 		} break;
+		case SERVER_KEEP_OPEN: {
+			bool ischecked = debug_menu->is_item_checked(debug_menu->get_item_index(SERVER_KEEP_OPEN));
+			debug_menu->set_item_checked(debug_menu->get_item_index(SERVER_KEEP_OPEN), !ischecked);
+
+			EditorDebuggerNode::get_singleton()->set_keep_open(!ischecked);
+			EditorSettings::get_singleton()->set_project_metadata("debug_options", "server_keep_open", !ischecked);
+
+		} break;
 	}
 }
 
@@ -195,6 +205,7 @@ void DebuggerEditorPlugin::_update_debug_options() {
 	bool check_debug_navigation = EditorSettings::get_singleton()->get_project_metadata("debug_options", "run_debug_navigation", false);
 	bool check_live_debug = EditorSettings::get_singleton()->get_project_metadata("debug_options", "run_live_debug", true);
 	bool check_reload_scripts = EditorSettings::get_singleton()->get_project_metadata("debug_options", "run_reload_scripts", true);
+	bool check_server_keep_open = EditorSettings::get_singleton()->get_project_metadata("debug_options", "server_keep_open", false);
 	int instances = EditorSettings::get_singleton()->get_project_metadata("debug_options", "run_debug_instances", 1);
 
 	if (check_deploy_remote) {
@@ -218,6 +229,9 @@ void DebuggerEditorPlugin::_update_debug_options() {
 	if (check_reload_scripts) {
 		_menu_option(RUN_RELOAD_SCRIPTS);
 	}
+	if (check_server_keep_open) {
+		_menu_option(SERVER_KEEP_OPEN);
+	}
 
 	int len = instances_menu->get_item_count();
 	for (int idx = 0; idx < len; idx++) {
diff --git a/editor/plugins/debugger_editor_plugin.h b/editor/plugins/debugger_editor_plugin.h
index c706acdb5c..5f682ed5e0 100644
--- a/editor/plugins/debugger_editor_plugin.h
+++ b/editor/plugins/debugger_editor_plugin.h
@@ -53,6 +53,7 @@ private:
 		RUN_DEBUG_NAVIGATION,
 		RUN_DEPLOY_REMOTE_DEBUG,
 		RUN_RELOAD_SCRIPTS,
+		SERVER_KEEP_OPEN,
 	};
 
 	void _update_debug_options();
diff --git a/editor/plugins/gpu_particles_2d_editor_plugin.cpp b/editor/plugins/gpu_particles_2d_editor_plugin.cpp
index 891b9efa4f..87381b3221 100644
--- a/editor/plugins/gpu_particles_2d_editor_plugin.cpp
+++ b/editor/plugins/gpu_particles_2d_editor_plugin.cpp
@@ -160,6 +160,7 @@ void GPUParticles2DEditorPlugin::_generate_visibility_rect() {
 		particles->set_emitting(false);
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Generate Visibility Rect"));
 	undo_redo->add_do_method(particles, "set_visibility_rect", rect);
 	undo_redo->add_undo_method(particles, "set_visibility_rect", particles->get_visibility_rect());
@@ -359,7 +360,6 @@ void GPUParticles2DEditorPlugin::_bind_methods() {
 
 GPUParticles2DEditorPlugin::GPUParticles2DEditorPlugin() {
 	particles = nullptr;
-	undo_redo = EditorNode::get_singleton()->get_undo_redo();
 
 	toolbar = memnew(HBoxContainer);
 	add_control_to_container(CONTAINER_CANVAS_EDITOR_MENU, toolbar);
diff --git a/editor/plugins/gpu_particles_2d_editor_plugin.h b/editor/plugins/gpu_particles_2d_editor_plugin.h
index 0a0ea21c1f..34cadaf7de 100644
--- a/editor/plugins/gpu_particles_2d_editor_plugin.h
+++ b/editor/plugins/gpu_particles_2d_editor_plugin.h
@@ -40,7 +40,6 @@
 class CheckBox;
 class ConfirmationDialog;
 class EditorFileDialog;
-class EditorUndoRedoManager;
 class MenuButton;
 class OptionButton;
 
@@ -80,7 +79,6 @@ class GPUParticles2DEditorPlugin : public EditorPlugin {
 
 	String source_emission_file;
 
-	Ref<EditorUndoRedoManager> undo_redo;
 	void _file_selected(const String &p_file);
 	void _menu_callback(int p_idx);
 	void _generate_visibility_rect();
diff --git a/editor/plugins/gradient_texture_2d_editor_plugin.cpp b/editor/plugins/gradient_texture_2d_editor_plugin.cpp
index 561dca4fc6..0591288c6e 100644
--- a/editor/plugins/gradient_texture_2d_editor_plugin.cpp
+++ b/editor/plugins/gradient_texture_2d_editor_plugin.cpp
@@ -55,6 +55,7 @@ void GradientTexture2DEditorRect::_update_fill_position() {
 
 	String property_name = handle == HANDLE_FILL_FROM ? "fill_from" : "fill_to";
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(vformat(TTR("Set %s"), property_name), UndoRedo::MERGE_ENDS);
 	undo_redo->add_do_property(texture.ptr(), property_name, percent);
 	undo_redo->add_undo_property(texture.ptr(), property_name, handle == HANDLE_FILL_FROM ? texture->get_fill_from() : texture->get_fill_to());
@@ -175,8 +176,6 @@ void GradientTexture2DEditorRect::_notification(int p_what) {
 }
 
 GradientTexture2DEditorRect::GradientTexture2DEditorRect() {
-	undo_redo = EditorNode::get_undo_redo();
-
 	checkerboard = memnew(TextureRect);
 	checkerboard->set_stretch_mode(TextureRect::STRETCH_TILE);
 	checkerboard->set_ignore_texture_size(true);
@@ -189,6 +188,7 @@ GradientTexture2DEditorRect::GradientTexture2DEditorRect() {
 ///////////////////////
 
 void GradientTexture2DEditor::_reverse_button_pressed() {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Swap GradientTexture2D Fill Points"));
 	undo_redo->add_do_property(texture.ptr(), "fill_from", texture->get_fill_to());
 	undo_redo->add_do_property(texture.ptr(), "fill_to", texture->get_fill_from());
@@ -223,8 +223,6 @@ void GradientTexture2DEditor::_notification(int p_what) {
 }
 
 GradientTexture2DEditor::GradientTexture2DEditor() {
-	undo_redo = EditorNode::get_undo_redo();
-
 	HFlowContainer *toolbar = memnew(HFlowContainer);
 	add_child(toolbar);
 
diff --git a/editor/plugins/gradient_texture_2d_editor_plugin.h b/editor/plugins/gradient_texture_2d_editor_plugin.h
index 0737300498..3172944ea8 100644
--- a/editor/plugins/gradient_texture_2d_editor_plugin.h
+++ b/editor/plugins/gradient_texture_2d_editor_plugin.h
@@ -35,8 +35,6 @@
 #include "editor/editor_plugin.h"
 #include "editor/editor_spin_slider.h"
 
-class EditorUndoRedoManager;
-
 class GradientTexture2DEditorRect : public Control {
 	GDCLASS(GradientTexture2DEditorRect, Control);
 
@@ -47,7 +45,6 @@ class GradientTexture2DEditorRect : public Control {
 	};
 
 	Ref<GradientTexture2D> texture;
-	Ref<EditorUndoRedoManager> undo_redo;
 	bool snap_enabled = false;
 	float snap_size = 0;
 
@@ -77,7 +74,6 @@ class GradientTexture2DEditor : public VBoxContainer {
 	GDCLASS(GradientTexture2DEditor, VBoxContainer);
 
 	Ref<GradientTexture2D> texture;
-	Ref<EditorUndoRedoManager> undo_redo;
 
 	Button *reverse_button = nullptr;
 	Button *snap_button = nullptr;
diff --git a/editor/plugins/mesh_instance_3d_editor_plugin.cpp b/editor/plugins/mesh_instance_3d_editor_plugin.cpp
index d5cdb70ccf..68fbce771a 100644
--- a/editor/plugins/mesh_instance_3d_editor_plugin.cpp
+++ b/editor/plugins/mesh_instance_3d_editor_plugin.cpp
@@ -271,7 +271,7 @@ void MeshInstance3DEditor::_menu_option(int p_option) {
 			outline_dialog->popup_centered(Vector2(200, 90));
 		} break;
 		case MENU_OPTION_CREATE_DEBUG_TANGENTS: {
-			Ref<EditorUndoRedoManager> ur = EditorNode::get_singleton()->get_undo_redo();
+			Ref<EditorUndoRedoManager> &ur = EditorNode::get_singleton()->get_undo_redo();
 			ur->create_action(TTR("Create Debug Tangents"));
 
 			MeshInstance3D *tangents = node->create_debug_tangents_node();
diff --git a/editor/plugins/navigation_link_2d_editor_plugin.cpp b/editor/plugins/navigation_link_2d_editor_plugin.cpp
index b72f639fbf..560454e5d3 100644
--- a/editor/plugins/navigation_link_2d_editor_plugin.cpp
+++ b/editor/plugins/navigation_link_2d_editor_plugin.cpp
@@ -85,6 +85,7 @@ bool NavigationLink2DEditor::forward_canvas_gui_input(const Ref<InputEvent> &p_e
 				end_grabbed = false;
 			}
 		} else {
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			if (start_grabbed) {
 				undo_redo->create_action(TTR("Set start_location"));
 				undo_redo->add_do_method(node, "set_start_location", node->get_start_location());
@@ -165,10 +166,6 @@ void NavigationLink2DEditor::edit(NavigationLink2D *p_node) {
 	canvas_item_editor->update_viewport();
 }
 
-NavigationLink2DEditor::NavigationLink2DEditor() {
-	undo_redo = EditorNode::get_undo_redo();
-}
-
 ///////////////////////
 
 void NavigationLink2DEditorPlugin::edit(Object *p_object) {
diff --git a/editor/plugins/navigation_link_2d_editor_plugin.h b/editor/plugins/navigation_link_2d_editor_plugin.h
index 0a3d9b8810..fea9f58a40 100644
--- a/editor/plugins/navigation_link_2d_editor_plugin.h
+++ b/editor/plugins/navigation_link_2d_editor_plugin.h
@@ -35,12 +35,10 @@
 #include "scene/2d/navigation_link_2d.h"
 
 class CanvasItemEditor;
-class EditorUndoRedoManager;
 
 class NavigationLink2DEditor : public Control {
 	GDCLASS(NavigationLink2DEditor, Control);
 
-	Ref<EditorUndoRedoManager> undo_redo;
 	CanvasItemEditor *canvas_item_editor = nullptr;
 	NavigationLink2D *node = nullptr;
 
@@ -58,8 +56,6 @@ public:
 	bool forward_canvas_gui_input(const Ref<InputEvent> &p_event);
 	void forward_canvas_draw_over_viewport(Control *p_overlay);
 	void edit(NavigationLink2D *p_node);
-
-	NavigationLink2DEditor();
 };
 
 class NavigationLink2DEditorPlugin : public EditorPlugin {
diff --git a/editor/plugins/path_2d_editor_plugin.cpp b/editor/plugins/path_2d_editor_plugin.cpp
index c8bd4c1d05..133a7e5327 100644
--- a/editor/plugins/path_2d_editor_plugin.cpp
+++ b/editor/plugins/path_2d_editor_plugin.cpp
@@ -119,6 +119,7 @@ bool Path2DEditor::forward_gui_input(const Ref<InputEvent> &p_event) {
 				}
 
 				// Check for point deletion.
+				Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 				if ((mb->get_button_index() == MouseButton::RIGHT && mode == MODE_EDIT) || (mb->get_button_index() == MouseButton::LEFT && mode == MODE_DELETE)) {
 					if (dist_to_p < grab_threshold) {
 						undo_redo->create_action(TTR("Remove Point from Curve"));
@@ -153,6 +154,7 @@ bool Path2DEditor::forward_gui_input(const Ref<InputEvent> &p_event) {
 		if (mb->is_pressed() && mb->get_button_index() == MouseButton::LEFT && ((mb->is_command_or_control_pressed() && mode == MODE_EDIT) || mode == MODE_CREATE)) {
 			Ref<Curve2D> curve = node->get_curve();
 
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Add Point to Curve"));
 			undo_redo->add_do_method(curve.ptr(), "add_point", cpoint);
 			undo_redo->add_undo_method(curve.ptr(), "remove_point", curve->get_point_count());
@@ -188,6 +190,7 @@ bool Path2DEditor::forward_gui_input(const Ref<InputEvent> &p_event) {
 				insertion_point = curve->get_point_count() - 2;
 			}
 
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Split Curve"));
 			undo_redo->add_do_method(curve.ptr(), "add_point", xform.affine_inverse().xform(gpoint2), Vector2(0, 0), Vector2(0, 0), insertion_point + 1);
 			undo_redo->add_undo_method(curve.ptr(), "remove_point", insertion_point + 1);
@@ -211,6 +214,7 @@ bool Path2DEditor::forward_gui_input(const Ref<InputEvent> &p_event) {
 		if (!mb->is_pressed() && mb->get_button_index() == MouseButton::LEFT && action != ACTION_NONE) {
 			Ref<Curve2D> curve = node->get_curve();
 
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			Vector2 new_pos = moving_from + xform.affine_inverse().basis_xform(gpoint - moving_screen_from);
 			switch (action) {
 				case ACTION_NONE:
@@ -486,6 +490,7 @@ void Path2DEditor::_mode_selected(int p_mode) {
 			return;
 		}
 
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		undo_redo->create_action(TTR("Remove Point from Curve"));
 		undo_redo->add_do_method(node->get_curve().ptr(), "add_point", begin);
 		undo_redo->add_undo_method(node->get_curve().ptr(), "remove_point", node->get_curve()->get_point_count());
@@ -519,7 +524,6 @@ void Path2DEditor::_handle_option_pressed(int p_option) {
 
 Path2DEditor::Path2DEditor() {
 	canvas_item_editor = nullptr;
-	undo_redo = EditorNode::get_singleton()->get_undo_redo();
 	mirror_handle_angle = true;
 	mirror_handle_length = true;
 	on_edge = false;
diff --git a/editor/plugins/path_2d_editor_plugin.h b/editor/plugins/path_2d_editor_plugin.h
index d2015b2bb8..c6ed257540 100644
--- a/editor/plugins/path_2d_editor_plugin.h
+++ b/editor/plugins/path_2d_editor_plugin.h
@@ -37,14 +37,11 @@
 #include "scene/gui/separator.h"
 
 class CanvasItemEditor;
-class EditorUndoRedoManager;
 class MenuButton;
 
 class Path2DEditor : public HBoxContainer {
 	GDCLASS(Path2DEditor, HBoxContainer);
 
-	Ref<EditorUndoRedoManager> undo_redo;
-
 	CanvasItemEditor *canvas_item_editor = nullptr;
 	Panel *panel = nullptr;
 	Path2D *node = nullptr;
diff --git a/editor/plugins/polygon_3d_editor_plugin.cpp b/editor/plugins/polygon_3d_editor_plugin.cpp
index dc6ae6be96..1cd4804b79 100644
--- a/editor/plugins/polygon_3d_editor_plugin.cpp
+++ b/editor/plugins/polygon_3d_editor_plugin.cpp
@@ -95,6 +95,7 @@ void Polygon3DEditor::_menu_option(int p_option) {
 void Polygon3DEditor::_wip_close() {
 	Object *obj = node_resource.is_valid() ? (Object *)node_resource.ptr() : node;
 	ERR_FAIL_COND_MSG(!obj, "Edited object is not valid.");
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Create Polygon3D"));
 	undo_redo->add_undo_method(obj, "set_polygon", obj->call("get_polygon"));
 	undo_redo->add_do_method(obj, "set_polygon", wip);
@@ -184,6 +185,7 @@ EditorPlugin::AfterGUIInput Polygon3DEditor::forward_3d_gui_input(Camera3D *p_ca
 					if (mb->is_pressed()) {
 						if (mb->is_ctrl_pressed()) {
 							if (poly.size() < 3) {
+								Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 								undo_redo->create_action(TTR("Edit Poly"));
 								undo_redo->add_undo_method(obj, "set_polygon", poly);
 								poly.push_back(cpoint);
@@ -262,6 +264,7 @@ EditorPlugin::AfterGUIInput Polygon3DEditor::forward_3d_gui_input(Camera3D *p_ca
 
 							ERR_FAIL_INDEX_V(edited_point, poly.size(), EditorPlugin::AFTER_GUI_INPUT_PASS);
 							poly.write[edited_point] = edited_point_pos;
+							Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 							undo_redo->create_action(TTR("Edit Poly"));
 							undo_redo->add_do_method(obj, "set_polygon", poly);
 							undo_redo->add_undo_method(obj, "set_polygon", pre_move_edit);
@@ -290,6 +293,7 @@ EditorPlugin::AfterGUIInput Polygon3DEditor::forward_3d_gui_input(Camera3D *p_ca
 					}
 
 					if (closest_idx >= 0) {
+						Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 						undo_redo->create_action(TTR("Edit Poly (Remove Point)"));
 						undo_redo->add_undo_method(obj, "set_polygon", poly);
 						poly.remove_at(closest_idx);
@@ -527,7 +531,6 @@ void Polygon3DEditor::_bind_methods() {
 
 Polygon3DEditor::Polygon3DEditor() {
 	node = nullptr;
-	undo_redo = EditorNode::get_undo_redo();
 
 	add_child(memnew(VSeparator));
 	button_create = memnew(Button);
diff --git a/editor/plugins/polygon_3d_editor_plugin.h b/editor/plugins/polygon_3d_editor_plugin.h
index fe8e2ce36d..2fa9820aa6 100644
--- a/editor/plugins/polygon_3d_editor_plugin.h
+++ b/editor/plugins/polygon_3d_editor_plugin.h
@@ -38,13 +38,11 @@
 #include "scene/resources/immediate_mesh.h"
 
 class CanvasItemEditor;
-class EditorUndoRedoManager;
 class MenuButton;
 
 class Polygon3DEditor : public HBoxContainer {
 	GDCLASS(Polygon3DEditor, HBoxContainer);
 
-	Ref<EditorUndoRedoManager> undo_redo;
 	enum Mode {
 		MODE_CREATE,
 		MODE_EDIT,
diff --git a/editor/plugins/resource_preloader_editor_plugin.cpp b/editor/plugins/resource_preloader_editor_plugin.cpp
index e35e794b24..2794b02ba6 100644
--- a/editor/plugins/resource_preloader_editor_plugin.cpp
+++ b/editor/plugins/resource_preloader_editor_plugin.cpp
@@ -71,6 +71,7 @@ void ResourcePreloaderEditor::_files_load_request(const Vector<String> &p_paths)
 			name = basename + " " + itos(counter);
 		}
 
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		undo_redo->create_action(TTR("Add Resource"));
 		undo_redo->add_do_method(preloader, "add_resource", name, resource);
 		undo_redo->add_undo_method(preloader, "remove_resource", name);
@@ -115,6 +116,7 @@ void ResourcePreloaderEditor::_item_edited() {
 		}
 
 		Ref<Resource> samp = preloader->get_resource(old_name);
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		undo_redo->create_action(TTR("Rename Resource"));
 		undo_redo->add_do_method(preloader, "remove_resource", old_name);
 		undo_redo->add_do_method(preloader, "add_resource", new_name, samp);
@@ -127,6 +129,7 @@ void ResourcePreloaderEditor::_item_edited() {
 }
 
 void ResourcePreloaderEditor::_remove_resource(const String &p_to_remove) {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Delete Resource"));
 	undo_redo->add_do_method(preloader, "remove_resource", p_to_remove);
 	undo_redo->add_undo_method(preloader, "add_resource", p_to_remove, preloader->get_resource(p_to_remove));
@@ -160,6 +163,7 @@ void ResourcePreloaderEditor::_paste_pressed() {
 		name = basename + " " + itos(counter);
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Paste Resource"));
 	undo_redo->add_do_method(preloader, "add_resource", name, r);
 	undo_redo->add_undo_method(preloader, "remove_resource", name);
@@ -235,10 +239,6 @@ void ResourcePreloaderEditor::_cell_button_pressed(Object *p_item, int p_column,
 	}
 }
 
-void ResourcePreloaderEditor::set_undo_redo(Ref<EditorUndoRedoManager> p_undo_redo) {
-	undo_redo = p_undo_redo;
-}
-
 void ResourcePreloaderEditor::edit(ResourcePreloader *p_preloader) {
 	preloader = p_preloader;
 
@@ -322,6 +322,7 @@ void ResourcePreloaderEditor::drop_data_fw(const Point2 &p_point, const Variant
 				name = basename + "_" + itos(counter);
 			}
 
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Add Resource"));
 			undo_redo->add_do_method(preloader, "add_resource", name, r);
 			undo_redo->add_undo_method(preloader, "remove_resource", name);
@@ -392,7 +393,6 @@ ResourcePreloaderEditor::ResourcePreloaderEditor() {
 }
 
 void ResourcePreloaderEditorPlugin::edit(Object *p_object) {
-	preloader_editor->set_undo_redo(EditorNode::get_undo_redo());
 	ResourcePreloader *s = Object::cast_to<ResourcePreloader>(p_object);
 	if (!s) {
 		return;
diff --git a/editor/plugins/resource_preloader_editor_plugin.h b/editor/plugins/resource_preloader_editor_plugin.h
index 59641e2561..7c1be9114d 100644
--- a/editor/plugins/resource_preloader_editor_plugin.h
+++ b/editor/plugins/resource_preloader_editor_plugin.h
@@ -38,7 +38,6 @@
 #include "scene/main/resource_preloader.h"
 
 class EditorFileDialog;
-class EditorUndoRedoManager;
 
 class ResourcePreloaderEditor : public PanelContainer {
 	GDCLASS(ResourcePreloaderEditor, PanelContainer);
@@ -68,8 +67,6 @@ class ResourcePreloaderEditor : public PanelContainer {
 	void _cell_button_pressed(Object *p_item, int p_column, int p_id, MouseButton p_button);
 	void _item_edited();
 
-	Ref<EditorUndoRedoManager> undo_redo;
-
 	Variant get_drag_data_fw(const Point2 &p_point, Control *p_from);
 	bool can_drop_data_fw(const Point2 &p_point, const Variant &p_data, Control *p_from) const;
 	void drop_data_fw(const Point2 &p_point, const Variant &p_data, Control *p_from);
@@ -80,8 +77,6 @@ protected:
 	static void _bind_methods();
 
 public:
-	void set_undo_redo(Ref<EditorUndoRedoManager> p_undo_redo);
-
 	void edit(ResourcePreloader *p_preloader);
 	ResourcePreloaderEditor();
 };
diff --git a/editor/plugins/skeleton_3d_editor_plugin.cpp b/editor/plugins/skeleton_3d_editor_plugin.cpp
index 2a05f95321..8e80d0d5d8 100644
--- a/editor/plugins/skeleton_3d_editor_plugin.cpp
+++ b/editor/plugins/skeleton_3d_editor_plugin.cpp
@@ -113,6 +113,7 @@ void BoneTransformEditor::_value_changed(const String &p_property, Variant p_val
 		return;
 	}
 	if (skeleton) {
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		undo_redo->create_action(TTR("Set Bone Transform"), UndoRedo::MERGE_ENDS);
 		undo_redo->add_undo_property(skeleton, p_property, skeleton->get(p_property));
 		undo_redo->add_do_property(skeleton, p_property, p_value);
@@ -122,7 +123,6 @@ void BoneTransformEditor::_value_changed(const String &p_property, Variant p_val
 
 BoneTransformEditor::BoneTransformEditor(Skeleton3D *p_skeleton) :
 		skeleton(p_skeleton) {
-	undo_redo = EditorNode::get_undo_redo();
 }
 
 void BoneTransformEditor::set_keyable(const bool p_keyable) {
diff --git a/editor/plugins/skeleton_3d_editor_plugin.h b/editor/plugins/skeleton_3d_editor_plugin.h
index 9f02d144ed..273bdeaac8 100644
--- a/editor/plugins/skeleton_3d_editor_plugin.h
+++ b/editor/plugins/skeleton_3d_editor_plugin.h
@@ -41,7 +41,6 @@
 #include "scene/resources/immediate_mesh.h"
 
 class EditorInspectorPluginSkeleton;
-class EditorUndoRedoManager;
 class Joint;
 class PhysicalBone3D;
 class Skeleton3DEditorPlugin;
@@ -65,8 +64,6 @@ class BoneTransformEditor : public VBoxContainer {
 	Skeleton3D *skeleton = nullptr;
 	// String property;
 
-	Ref<EditorUndoRedoManager> undo_redo;
-
 	bool toggle_enabled = false;
 	bool updating = false;
 
diff --git a/editor/plugins/sprite_frames_editor_plugin.cpp b/editor/plugins/sprite_frames_editor_plugin.cpp
index cf8cc71db7..6924cb58bf 100644
--- a/editor/plugins/sprite_frames_editor_plugin.cpp
+++ b/editor/plugins/sprite_frames_editor_plugin.cpp
@@ -249,6 +249,7 @@ void SpriteFramesEditor::_sheet_add_frames() {
 	const Size2i offset = _get_offset();
 	const Size2i separation = _get_separation();
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Add Frame"));
 
 	int fc = frames->get_frame_count(edited_anim);
@@ -467,6 +468,7 @@ void SpriteFramesEditor::_file_load_request(const Vector<String> &p_path, int p_
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Add Frame"));
 	int fc = frames->get_frame_count(edited_anim);
 
@@ -527,6 +529,7 @@ void SpriteFramesEditor::_paste_pressed() {
 		return; ///beh should show an error i guess
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Paste Frame"));
 	undo_redo->add_do_method(frames, "add_frame", edited_anim, r);
 	undo_redo->add_undo_method(frames, "remove_frame", edited_anim, frames->get_frame_count(edited_anim));
@@ -564,6 +567,7 @@ void SpriteFramesEditor::_empty_pressed() {
 
 	Ref<Texture2D> r;
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Add Empty"));
 	undo_redo->add_do_method(frames, "add_frame", edited_anim, r, from);
 	undo_redo->add_undo_method(frames, "remove_frame", edited_anim, from);
@@ -587,6 +591,7 @@ void SpriteFramesEditor::_empty2_pressed() {
 
 	Ref<Texture2D> r;
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Add Empty"));
 	undo_redo->add_do_method(frames, "add_frame", edited_anim, r, from + 1);
 	undo_redo->add_undo_method(frames, "remove_frame", edited_anim, from + 1);
@@ -610,6 +615,7 @@ void SpriteFramesEditor::_up_pressed() {
 	sel = to_move;
 	sel -= 1;
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Delete Resource"));
 	undo_redo->add_do_method(frames, "set_frame", edited_anim, to_move, frames->get_frame(edited_anim, to_move - 1));
 	undo_redo->add_do_method(frames, "set_frame", edited_anim, to_move - 1, frames->get_frame(edited_anim, to_move));
@@ -635,6 +641,7 @@ void SpriteFramesEditor::_down_pressed() {
 	sel = to_move;
 	sel += 1;
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Delete Resource"));
 	undo_redo->add_do_method(frames, "set_frame", edited_anim, to_move, frames->get_frame(edited_anim, to_move + 1));
 	undo_redo->add_do_method(frames, "set_frame", edited_anim, to_move + 1, frames->get_frame(edited_anim, to_move));
@@ -657,6 +664,7 @@ void SpriteFramesEditor::_delete_pressed() {
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Delete Resource"));
 	undo_redo->add_do_method(frames, "remove_frame", edited_anim, to_delete);
 	undo_redo->add_undo_method(frames, "add_frame", edited_anim, frames->get_frame(edited_anim, to_delete), to_delete);
@@ -743,6 +751,7 @@ void SpriteFramesEditor::_animation_name_edited() {
 	List<Node *> nodes;
 	_find_anim_sprites(EditorNode::get_singleton()->get_edited_scene(), &nodes, Ref<SpriteFrames>(frames));
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Rename Animation"));
 	undo_redo->add_do_method(frames, "rename_animation", edited_anim, name);
 	undo_redo->add_undo_method(frames, "rename_animation", name, edited_anim);
@@ -772,6 +781,7 @@ void SpriteFramesEditor::_animation_add() {
 	List<Node *> nodes;
 	_find_anim_sprites(EditorNode::get_singleton()->get_edited_scene(), &nodes, Ref<SpriteFrames>(frames));
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Add Animation"));
 	undo_redo->add_do_method(frames, "add_animation", name);
 	undo_redo->add_undo_method(frames, "remove_animation", name);
@@ -804,6 +814,7 @@ void SpriteFramesEditor::_animation_remove() {
 }
 
 void SpriteFramesEditor::_animation_remove_confirmed() {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Remove Animation"));
 	undo_redo->add_do_method(frames, "remove_animation", edited_anim);
 	undo_redo->add_undo_method(frames, "add_animation", edited_anim);
@@ -831,6 +842,7 @@ void SpriteFramesEditor::_animation_loop_changed() {
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Change Animation Loop"));
 	undo_redo->add_do_method(frames, "set_animation_loop", edited_anim, anim_loop->is_pressed());
 	undo_redo->add_undo_method(frames, "set_animation_loop", edited_anim, frames->get_animation_loop(edited_anim));
@@ -844,6 +856,7 @@ void SpriteFramesEditor::_animation_fps_changed(double p_value) {
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Change Animation FPS"), UndoRedo::MERGE_ENDS);
 	undo_redo->add_do_method(frames, "set_animation_speed", edited_anim, p_value);
 	undo_redo->add_undo_method(frames, "set_animation_speed", edited_anim, frames->get_animation_speed(edited_anim));
@@ -1035,10 +1048,6 @@ void SpriteFramesEditor::edit(SpriteFrames *p_frames) {
 	delete_frame->set_disabled(read_only);
 }
 
-void SpriteFramesEditor::set_undo_redo(Ref<EditorUndoRedoManager> p_undo_redo) {
-	undo_redo = p_undo_redo;
-}
-
 Variant SpriteFramesEditor::get_drag_data_fw(const Point2 &p_point, Control *p_from) {
 	if (read_only) {
 		return false;
@@ -1136,6 +1145,7 @@ void SpriteFramesEditor::drop_data_fw(const Point2 &p_point, const Variant &p_da
 				reorder = true;
 			}
 
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			if (reorder) { //drop is from reordering frames
 				int from_frame = -1;
 				if (d.has("frame")) {
@@ -1530,8 +1540,6 @@ SpriteFramesEditor::SpriteFramesEditor() {
 }
 
 void SpriteFramesEditorPlugin::edit(Object *p_object) {
-	frames_editor->set_undo_redo(get_undo_redo());
-
 	SpriteFrames *s;
 	AnimatedSprite2D *animated_sprite = Object::cast_to<AnimatedSprite2D>(p_object);
 	if (animated_sprite) {
diff --git a/editor/plugins/sprite_frames_editor_plugin.h b/editor/plugins/sprite_frames_editor_plugin.h
index 5fc3fe4481..64245ee1e0 100644
--- a/editor/plugins/sprite_frames_editor_plugin.h
+++ b/editor/plugins/sprite_frames_editor_plugin.h
@@ -45,7 +45,6 @@
 #include "scene/gui/tree.h"
 
 class EditorFileDialog;
-class EditorUndoRedoManager;
 
 class SpriteFramesEditor : public HSplitContainer {
 	GDCLASS(SpriteFramesEditor, HSplitContainer);
@@ -154,8 +153,6 @@ class SpriteFramesEditor : public HSplitContainer {
 	bool updating;
 	bool updating_split_settings = false; // Skip SpinBox/Range callback when setting value by code.
 
-	Ref<EditorUndoRedoManager> undo_redo;
-
 	Variant get_drag_data_fw(const Point2 &p_point, Control *p_from);
 	bool can_drop_data_fw(const Point2 &p_point, const Variant &p_data, Control *p_from) const;
 	void drop_data_fw(const Point2 &p_point, const Variant &p_data, Control *p_from);
@@ -179,8 +176,6 @@ protected:
 	static void _bind_methods();
 
 public:
-	void set_undo_redo(Ref<EditorUndoRedoManager> p_undo_redo);
-
 	void edit(SpriteFrames *p_frames);
 	SpriteFramesEditor();
 };
diff --git a/editor/plugins/texture_region_editor_plugin.cpp b/editor/plugins/texture_region_editor_plugin.cpp
index b0c8597adf..2e84a4fac9 100644
--- a/editor/plugins/texture_region_editor_plugin.cpp
+++ b/editor/plugins/texture_region_editor_plugin.cpp
@@ -298,6 +298,7 @@ void TextureRegionEditor::_region_input(const Ref<InputEvent> &p_input) {
 		mtx.xform(rect.position + Vector2(0, rect.size.y / 2)) + Vector2(-handle_offset, 0)
 	};
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	Ref<InputEventMouseButton> mb = p_input;
 	if (mb.is_valid()) {
 		if (mb->get_button_index() == MouseButton::LEFT) {
@@ -1065,7 +1066,6 @@ TextureRegionEditor::TextureRegionEditor() {
 	node_ninepatch = nullptr;
 	obj_styleBox = Ref<StyleBoxTexture>(nullptr);
 	atlas_tex = Ref<AtlasTexture>(nullptr);
-	undo_redo = EditorNode::get_singleton()->get_undo_redo();
 
 	preview_tex = Ref<CanvasTexture>(memnew(CanvasTexture));
 
diff --git a/editor/plugins/texture_region_editor_plugin.h b/editor/plugins/texture_region_editor_plugin.h
index 48cbb6b70e..310de19cc5 100644
--- a/editor/plugins/texture_region_editor_plugin.h
+++ b/editor/plugins/texture_region_editor_plugin.h
@@ -41,7 +41,6 @@
 #include "scene/resources/texture.h"
 
 class ViewPanner;
-class EditorUndoRedoManager;
 class OptionButton;
 
 class TextureRegionEditor : public AcceptDialog {
@@ -71,8 +70,6 @@ class TextureRegionEditor : public AcceptDialog {
 	VScrollBar *vscroll = nullptr;
 	HScrollBar *hscroll = nullptr;
 
-	Ref<EditorUndoRedoManager> undo_redo;
-
 	Vector2 draw_ofs;
 	float draw_zoom = 0.0;
 	bool updating_scroll = false;
diff --git a/editor/plugins/tiles/atlas_merging_dialog.cpp b/editor/plugins/tiles/atlas_merging_dialog.cpp
index 167c6d169b..e266d26b73 100644
--- a/editor/plugins/tiles/atlas_merging_dialog.cpp
+++ b/editor/plugins/tiles/atlas_merging_dialog.cpp
@@ -154,6 +154,7 @@ void AtlasMergingDialog::_merge_confirmed(String p_path) {
 	Ref<Texture2D> new_texture_resource = ResourceLoader::load(p_path, "Texture2D");
 	merged->set_texture(new_texture_resource);
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Merge TileSetAtlasSource"));
 	int next_id = tile_set->get_next_source_id();
 	undo_redo->add_do_method(*tile_set, "add_source", merged, next_id);
@@ -193,6 +194,7 @@ void AtlasMergingDialog::ok_pressed() {
 }
 
 void AtlasMergingDialog::cancel_pressed() {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	for (int i = 0; i < commited_actions_count; i++) {
 		undo_redo->undo();
 	}
@@ -248,8 +250,6 @@ void AtlasMergingDialog::update_tile_set(Ref<TileSet> p_tile_set) {
 }
 
 AtlasMergingDialog::AtlasMergingDialog() {
-	undo_redo = EditorNode::get_singleton()->get_undo_redo();
-
 	// Atlas merging window.
 	set_title(TTR("Atlas Merging"));
 	set_hide_on_ok(false);
diff --git a/editor/plugins/tiles/atlas_merging_dialog.h b/editor/plugins/tiles/atlas_merging_dialog.h
index c7e4635d16..228c188817 100644
--- a/editor/plugins/tiles/atlas_merging_dialog.h
+++ b/editor/plugins/tiles/atlas_merging_dialog.h
@@ -38,7 +38,6 @@
 #include "scene/resources/tile_set.h"
 
 class EditorFileDialog;
-class EditorUndoRedoManager;
 
 class AtlasMergingDialog : public ConfirmationDialog {
 	GDCLASS(AtlasMergingDialog, ConfirmationDialog);
@@ -50,8 +49,6 @@ private:
 	LocalVector<HashMap<Vector2i, Vector2i>> merged_mapping;
 	Ref<TileSet> tile_set;
 
-	Ref<EditorUndoRedoManager> undo_redo;
-
 	// Settings.
 	int next_line_after_column = 30;
 
diff --git a/editor/plugins/tiles/tile_data_editors.cpp b/editor/plugins/tiles/tile_data_editors.cpp
index b3fef6d4b4..44b8ff05d1 100644
--- a/editor/plugins/tiles/tile_data_editors.cpp
+++ b/editor/plugins/tiles/tile_data_editors.cpp
@@ -262,7 +262,7 @@ void GenericTilePolygonEditor::_zoom_changed() {
 void GenericTilePolygonEditor::_advanced_menu_item_pressed(int p_item_pressed) {
 	Ref<EditorUndoRedoManager> undo_redo;
 	if (use_undo_redo) {
-		undo_redo = editor_undo_redo;
+		undo_redo = EditorNode::get_undo_redo();
 	} else {
 		// This nice hack allows for discarding undo actions without making code too complex.
 		undo_redo.instantiate();
@@ -427,7 +427,7 @@ void GenericTilePolygonEditor::_snap_to_half_pixel(Point2 &r_point) {
 void GenericTilePolygonEditor::_base_control_gui_input(Ref<InputEvent> p_event) {
 	Ref<EditorUndoRedoManager> undo_redo;
 	if (use_undo_redo) {
-		undo_redo = editor_undo_redo;
+		undo_redo = EditorNode::get_undo_redo();
 	} else {
 		// This nice hack allows for discarding undo actions without making code too complex.
 		undo_redo.instantiate();
@@ -763,8 +763,6 @@ void GenericTilePolygonEditor::_bind_methods() {
 }
 
 GenericTilePolygonEditor::GenericTilePolygonEditor() {
-	editor_undo_redo = EditorNode::get_undo_redo();
-
 	toolbar = memnew(HBoxContainer);
 	add_child(toolbar);
 
@@ -883,6 +881,7 @@ Variant TileDataDefaultEditor::_get_value(TileSetAtlasSource *p_tile_set_atlas_s
 }
 
 void TileDataDefaultEditor::_setup_undo_redo_action(TileSetAtlasSource *p_tile_set_atlas_source, HashMap<TileMapCell, Variant, TileMapCell> p_previous_values, Variant p_new_value) {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	for (const KeyValue<TileMapCell, Variant> &E : p_previous_values) {
 		Vector2i coords = E.key.get_atlas_coords();
 		undo_redo->add_undo_property(p_tile_set_atlas_source, vformat("%d:%d/%d/%s", coords.x, coords.y, E.key.alternative_tile, property), E.value);
@@ -951,6 +950,7 @@ void TileDataDefaultEditor::forward_painting_atlas_gui_input(TileAtlasView *p_ti
 		}
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	Ref<InputEventMouseButton> mb = p_event;
 	if (mb.is_valid()) {
 		if (mb->get_button_index() == MouseButton::LEFT) {
@@ -1074,6 +1074,7 @@ void TileDataDefaultEditor::forward_painting_alternatives_gui_input(TileAtlasVie
 					drag_last_pos = mb->get_position();
 				}
 			} else {
+				Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 				undo_redo->create_action(TTR("Painting Tiles Property"));
 				_setup_undo_redo_action(p_tile_set_atlas_source, drag_modified, drag_painted_value);
 				undo_redo->commit_action(false);
@@ -1195,8 +1196,6 @@ Variant::Type TileDataDefaultEditor::get_property_type() {
 }
 
 TileDataDefaultEditor::TileDataDefaultEditor() {
-	undo_redo = EditorNode::get_undo_redo();
-
 	label = memnew(Label);
 	label->set_text(TTR("Painting:"));
 	label->set_theme_type_variation("HeaderSmall");
@@ -1327,6 +1326,7 @@ Variant TileDataOcclusionShapeEditor::_get_value(TileSetAtlasSource *p_tile_set_
 }
 
 void TileDataOcclusionShapeEditor::_setup_undo_redo_action(TileSetAtlasSource *p_tile_set_atlas_source, HashMap<TileMapCell, Variant, TileMapCell> p_previous_values, Variant p_new_value) {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	for (const KeyValue<TileMapCell, Variant> &E : p_previous_values) {
 		Vector2i coords = E.key.get_atlas_coords();
 		undo_redo->add_undo_property(p_tile_set_atlas_source, vformat("%d:%d/%d/occlusion_layer_%d/polygon", coords.x, coords.y, E.key.alternative_tile, occlusion_layer), E.value);
@@ -1347,8 +1347,6 @@ void TileDataOcclusionShapeEditor::_notification(int p_what) {
 }
 
 TileDataOcclusionShapeEditor::TileDataOcclusionShapeEditor() {
-	undo_redo = EditorNode::get_undo_redo();
-
 	polygon_editor = memnew(GenericTilePolygonEditor);
 	add_child(polygon_editor);
 }
@@ -1508,6 +1506,7 @@ Variant TileDataCollisionEditor::_get_value(TileSetAtlasSource *p_tile_set_atlas
 
 void TileDataCollisionEditor::_setup_undo_redo_action(TileSetAtlasSource *p_tile_set_atlas_source, HashMap<TileMapCell, Variant, TileMapCell> p_previous_values, Variant p_new_value) {
 	Array new_array = p_new_value;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	for (KeyValue<TileMapCell, Variant> &E : p_previous_values) {
 		Array old_array = E.value;
 
@@ -1544,8 +1543,6 @@ void TileDataCollisionEditor::_notification(int p_what) {
 }
 
 TileDataCollisionEditor::TileDataCollisionEditor() {
-	undo_redo = EditorNode::get_undo_redo();
-
 	polygon_editor = memnew(GenericTilePolygonEditor);
 	polygon_editor->set_multiple_polygon_mode(true);
 	polygon_editor->connect("polygons_changed", callable_mp(this, &TileDataCollisionEditor::_polygons_changed));
@@ -2192,6 +2189,7 @@ void TileDataTerrainsEditor::forward_painting_atlas_gui_input(TileAtlasView *p_t
 					}
 				}
 			} else {
+				Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 				if (drag_type == DRAG_TYPE_PAINT_TERRAIN_SET_RECT) {
 					Rect2i rect;
 					rect.set_position(p_tile_atlas_view->get_atlas_tile_coords_at_pos(drag_start_pos));
@@ -2564,6 +2562,7 @@ void TileDataTerrainsEditor::forward_painting_alternatives_gui_input(TileAtlasVi
 					}
 				}
 			} else {
+				Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 				if (drag_type == DRAG_TYPE_PAINT_TERRAIN_SET) {
 					undo_redo->create_action(TTR("Painting Tiles Property"));
 					for (KeyValue<TileMapCell, Variant> &E : drag_modified) {
@@ -2629,8 +2628,6 @@ void TileDataTerrainsEditor::_notification(int p_what) {
 }
 
 TileDataTerrainsEditor::TileDataTerrainsEditor() {
-	undo_redo = EditorNode::get_undo_redo();
-
 	label = memnew(Label);
 	label->set_text(TTR("Painting:"));
 	label->set_theme_type_variation("HeaderSmall");
@@ -2714,6 +2711,7 @@ Variant TileDataNavigationEditor::_get_value(TileSetAtlasSource *p_tile_set_atla
 }
 
 void TileDataNavigationEditor::_setup_undo_redo_action(TileSetAtlasSource *p_tile_set_atlas_source, HashMap<TileMapCell, Variant, TileMapCell> p_previous_values, Variant p_new_value) {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	for (const KeyValue<TileMapCell, Variant> &E : p_previous_values) {
 		Vector2i coords = E.key.get_atlas_coords();
 		undo_redo->add_undo_property(p_tile_set_atlas_source, vformat("%d:%d/%d/navigation_layer_%d/polygon", coords.x, coords.y, E.key.alternative_tile, navigation_layer), E.value);
@@ -2736,8 +2734,6 @@ void TileDataNavigationEditor::_notification(int p_what) {
 }
 
 TileDataNavigationEditor::TileDataNavigationEditor() {
-	undo_redo = EditorNode::get_undo_redo();
-
 	polygon_editor = memnew(GenericTilePolygonEditor);
 	polygon_editor->set_multiple_polygon_mode(true);
 	add_child(polygon_editor);
diff --git a/editor/plugins/tiles/tile_data_editors.h b/editor/plugins/tiles/tile_data_editors.h
index 0a947fce8b..98d337b4cf 100644
--- a/editor/plugins/tiles/tile_data_editors.h
+++ b/editor/plugins/tiles/tile_data_editors.h
@@ -39,8 +39,6 @@
 #include "scene/gui/control.h"
 #include "scene/gui/label.h"
 
-class EditorUndoRedoManager;
-
 class TileDataEditor : public VBoxContainer {
 	GDCLASS(TileDataEditor, VBoxContainer);
 
@@ -95,7 +93,6 @@ private:
 	bool multiple_polygon_mode = false;
 
 	bool use_undo_redo = true;
-	Ref<EditorUndoRedoManager> editor_undo_redo;
 
 	// UI
 	int hovered_polygon_index = -1;
@@ -216,8 +213,6 @@ private:
 protected:
 	DummyObject *dummy_object = memnew(DummyObject);
 
-	Ref<EditorUndoRedoManager> undo_redo;
-
 	StringName type;
 	String property;
 	Variant::Type property_type;
@@ -283,8 +278,6 @@ private:
 	virtual void _setup_undo_redo_action(TileSetAtlasSource *p_tile_set_atlas_source, HashMap<TileMapCell, Variant, TileMapCell> p_previous_values, Variant p_new_value) override;
 
 protected:
-	Ref<EditorUndoRedoManager> undo_redo;
-
 	virtual void _tile_set_changed() override;
 
 	void _notification(int p_what);
@@ -318,8 +311,6 @@ class TileDataCollisionEditor : public TileDataDefaultEditor {
 	virtual void _setup_undo_redo_action(TileSetAtlasSource *p_tile_set_atlas_source, HashMap<TileMapCell, Variant, TileMapCell> p_previous_values, Variant p_new_value) override;
 
 protected:
-	Ref<EditorUndoRedoManager> undo_redo;
-
 	virtual void _tile_set_changed() override;
 
 	void _notification(int p_what);
@@ -370,8 +361,6 @@ protected:
 
 	void _notification(int p_what);
 
-	Ref<EditorUndoRedoManager> undo_redo;
-
 public:
 	virtual Control *get_toolbar() override { return toolbar; };
 	virtual void forward_draw_over_atlas(TileAtlasView *p_tile_atlas_view, TileSetAtlasSource *p_tile_atlas_source, CanvasItem *p_canvas_item, Transform2D p_transform) override;
@@ -403,8 +392,6 @@ private:
 	virtual void _setup_undo_redo_action(TileSetAtlasSource *p_tile_set_atlas_source, HashMap<TileMapCell, Variant, TileMapCell> p_previous_values, Variant p_new_value) override;
 
 protected:
-	Ref<EditorUndoRedoManager> undo_redo;
-
 	virtual void _tile_set_changed() override;
 
 	void _notification(int p_what);
diff --git a/editor/plugins/tiles/tile_map_editor.cpp b/editor/plugins/tiles/tile_map_editor.cpp
index 93f9df4d6e..eb7c93872e 100644
--- a/editor/plugins/tiles/tile_map_editor.cpp
+++ b/editor/plugins/tiles/tile_map_editor.cpp
@@ -272,6 +272,7 @@ void TileMapEditorTilesPlugin::_patterns_item_list_gui_input(const Ref<InputEven
 	if (ED_IS_SHORTCUT("tiles_editor/paste", p_event) && p_event->is_pressed() && !p_event->is_echo()) {
 		select_last_pattern = true;
 		int new_pattern_index = tile_set->get_patterns_count();
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		undo_redo->create_action(TTR("Add TileSet pattern"));
 		undo_redo->add_do_method(*tile_set, "add_pattern", tile_map_clipboard, new_pattern_index);
 		undo_redo->add_undo_method(*tile_set, "remove_pattern", new_pattern_index);
@@ -281,6 +282,7 @@ void TileMapEditorTilesPlugin::_patterns_item_list_gui_input(const Ref<InputEven
 
 	if (ED_IS_SHORTCUT("tiles_editor/delete", p_event) && p_event->is_pressed() && !p_event->is_echo()) {
 		Vector<int> selected = patterns_item_list->get_selected_items();
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		undo_redo->create_action(TTR("Remove TileSet patterns"));
 		for (int i = 0; i < selected.size(); i++) {
 			int pattern_index = selected[i];
@@ -511,6 +513,7 @@ bool TileMapEditorTilesPlugin::forward_canvas_gui_input(const Ref<InputEvent> &p
 		if (ED_IS_SHORTCUT("tiles_editor/cut", p_event)) {
 			// Delete selected tiles.
 			if (!tile_map_selection.is_empty()) {
+				Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 				undo_redo->create_action(TTR("Delete tiles"));
 				for (const Vector2i &E : tile_map_selection) {
 					undo_redo->add_do_method(tile_map, "set_cell", tile_map_layer, E, TileSet::INVALID_SOURCE, TileSetSource::INVALID_ATLAS_COORDS, TileSetSource::INVALID_TILE_ALTERNATIVE);
@@ -542,6 +545,7 @@ bool TileMapEditorTilesPlugin::forward_canvas_gui_input(const Ref<InputEvent> &p
 	if (ED_IS_SHORTCUT("tiles_editor/delete", p_event)) {
 		// Delete selected tiles.
 		if (!tile_map_selection.is_empty()) {
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Delete tiles"));
 			for (const Vector2i &E : tile_map_selection) {
 				undo_redo->add_do_method(tile_map, "set_cell", tile_map_layer, E, TileSet::INVALID_SOURCE, TileSetSource::INVALID_ATLAS_COORDS, TileSetSource::INVALID_TILE_ALTERNATIVE);
@@ -1233,6 +1237,7 @@ void TileMapEditorTilesPlugin::_stop_dragging() {
 	Transform2D xform = CanvasItemEditor::get_singleton()->get_canvas_transform() * tile_map->get_global_transform();
 	Vector2 mpos = xform.affine_inverse().xform(CanvasItemEditor::get_singleton()->get_viewport_control()->get_local_mouse_position());
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	switch (drag_type) {
 		case DRAG_TYPE_SELECT: {
 			undo_redo->create_action(TTR("Change selection"));
@@ -2012,8 +2017,6 @@ void TileMapEditorTilesPlugin::_bind_methods() {
 }
 
 TileMapEditorTilesPlugin::TileMapEditorTilesPlugin() {
-	undo_redo = EditorNode::get_undo_redo();
-
 	CanvasItemEditor::get_singleton()
 			->get_viewport_control()
 			->connect("mouse_exited", callable_mp(this, &TileMapEditorTilesPlugin::_mouse_exited_viewport));
@@ -2634,6 +2637,7 @@ void TileMapEditorTerrainsPlugin::_stop_dragging() {
 	Transform2D xform = CanvasItemEditor::get_singleton()->get_canvas_transform() * tile_map->get_global_transform();
 	Vector2 mpos = xform.affine_inverse().xform(CanvasItemEditor::get_singleton()->get_viewport_control()->get_local_mouse_position());
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	switch (drag_type) {
 		case DRAG_TYPE_PICK: {
 			Vector2i coords = tile_map->local_to_map(mpos);
@@ -3304,8 +3308,6 @@ void TileMapEditorTerrainsPlugin::edit(ObjectID p_tile_map_id, int p_tile_map_la
 }
 
 TileMapEditorTerrainsPlugin::TileMapEditorTerrainsPlugin() {
-	undo_redo = EditorNode::get_undo_redo();
-
 	main_vbox_container = memnew(VBoxContainer);
 	main_vbox_container->connect("tree_entered", callable_mp(this, &TileMapEditorTerrainsPlugin::_update_theme));
 	main_vbox_container->connect("theme_changed", callable_mp(this, &TileMapEditorTerrainsPlugin::_update_theme));
@@ -3479,6 +3481,7 @@ void TileMapEditor::_advanced_menu_button_id_pressed(int p_id) {
 	}
 
 	if (p_id == 0) { // Replace Tile Proxies
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		undo_redo->create_action(TTR("Replace Tiles with Proxies"));
 		for (int layer_index = 0; layer_index < tile_map->get_layers_count(); layer_index++) {
 			TypedArray<Vector2i> used_cells = tile_map->get_used_cells(layer_index);
@@ -3952,8 +3955,6 @@ void TileMapEditor::edit(TileMap *p_tile_map) {
 }
 
 TileMapEditor::TileMapEditor() {
-	undo_redo = EditorNode::get_undo_redo();
-
 	set_process_internal(true);
 
 	// Shortcuts.
diff --git a/editor/plugins/tiles/tile_map_editor.h b/editor/plugins/tiles/tile_map_editor.h
index 9a47d8bbc4..ad27795437 100644
--- a/editor/plugins/tiles/tile_map_editor.h
+++ b/editor/plugins/tiles/tile_map_editor.h
@@ -47,8 +47,6 @@
 #include "scene/gui/tab_bar.h"
 #include "scene/gui/tree.h"
 
-class EditorUndoRedoManager;
-
 class TileMapEditorPlugin : public Object {
 public:
 	struct TabData {
@@ -70,7 +68,6 @@ class TileMapEditorTilesPlugin : public TileMapEditorPlugin {
 	GDCLASS(TileMapEditorTilesPlugin, TileMapEditorPlugin);
 
 private:
-	Ref<EditorUndoRedoManager> undo_redo;
 	ObjectID tile_map_id;
 	int tile_map_layer = -1;
 	virtual void edit(ObjectID p_tile_map_id, int p_tile_map_layer) override;
@@ -223,7 +220,6 @@ class TileMapEditorTerrainsPlugin : public TileMapEditorPlugin {
 	GDCLASS(TileMapEditorTerrainsPlugin, TileMapEditorPlugin);
 
 private:
-	Ref<EditorUndoRedoManager> undo_redo;
 	ObjectID tile_map_id;
 	int tile_map_layer = -1;
 	virtual void edit(ObjectID p_tile_map_id, int p_tile_map_layer) override;
@@ -317,7 +313,6 @@ class TileMapEditor : public VBoxContainer {
 	GDCLASS(TileMapEditor, VBoxContainer);
 
 private:
-	Ref<EditorUndoRedoManager> undo_redo;
 	bool tileset_changed_needs_update = false;
 	ObjectID tile_map_id;
 	int tile_map_layer = -1;
diff --git a/editor/plugins/tiles/tile_proxies_manager_dialog.cpp b/editor/plugins/tiles/tile_proxies_manager_dialog.cpp
index 9e4c29fa79..50350de97e 100644
--- a/editor/plugins/tiles/tile_proxies_manager_dialog.cpp
+++ b/editor/plugins/tiles/tile_proxies_manager_dialog.cpp
@@ -53,6 +53,7 @@ void TileProxiesManagerDialog::_menu_id_pressed(int p_id) {
 }
 
 void TileProxiesManagerDialog::_delete_selected_bindings() {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Remove Tile Proxies"));
 
 	Vector<int> source_level_selected = source_level_list->get_selected_items();
@@ -152,6 +153,7 @@ void TileProxiesManagerDialog::_property_changed(const String &p_path, const Var
 }
 
 void TileProxiesManagerDialog::_add_button_pressed() {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	if (from.source_id != TileSet::INVALID_SOURCE && to.source_id != TileSet::INVALID_SOURCE) {
 		Vector2i from_coords = from.get_atlas_coords();
 		Vector2i to_coords = to.get_atlas_coords();
@@ -192,6 +194,7 @@ void TileProxiesManagerDialog::_add_button_pressed() {
 }
 
 void TileProxiesManagerDialog::_clear_invalid_button_pressed() {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Delete All Invalid Tile Proxies"));
 
 	undo_redo->add_do_method(*tile_set, "cleanup_invalid_tile_proxies");
@@ -219,6 +222,7 @@ void TileProxiesManagerDialog::_clear_invalid_button_pressed() {
 }
 
 void TileProxiesManagerDialog::_clear_all_button_pressed() {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Delete All Tile Proxies"));
 
 	undo_redo->add_do_method(*tile_set, "clear_tile_proxies");
@@ -299,6 +303,7 @@ void TileProxiesManagerDialog::_unhandled_key_input(Ref<InputEvent> p_event) {
 }
 
 void TileProxiesManagerDialog::cancel_pressed() {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	for (int i = 0; i < commited_actions_count; i++) {
 		undo_redo->undo();
 	}
@@ -318,8 +323,6 @@ void TileProxiesManagerDialog::update_tile_set(Ref<TileSet> p_tile_set) {
 }
 
 TileProxiesManagerDialog::TileProxiesManagerDialog() {
-	undo_redo = EditorNode::get_singleton()->get_undo_redo();
-
 	// Tile proxy management window.
 	set_title(TTR("Tile Proxies Management"));
 	set_process_unhandled_key_input(true);
diff --git a/editor/plugins/tiles/tile_proxies_manager_dialog.h b/editor/plugins/tiles/tile_proxies_manager_dialog.h
index 511e442a10..d0f1933882 100644
--- a/editor/plugins/tiles/tile_proxies_manager_dialog.h
+++ b/editor/plugins/tiles/tile_proxies_manager_dialog.h
@@ -43,8 +43,6 @@ private:
 	int commited_actions_count = 0;
 	Ref<TileSet> tile_set;
 
-	Ref<EditorUndoRedoManager> undo_redo;
-
 	TileMapCell from;
 	TileMapCell to;
 
diff --git a/editor/plugins/tiles/tile_set_editor.cpp b/editor/plugins/tiles/tile_set_editor.cpp
index eaae9555dc..dccff80512 100644
--- a/editor/plugins/tiles/tile_set_editor.cpp
+++ b/editor/plugins/tiles/tile_set_editor.cpp
@@ -66,6 +66,7 @@ void TileSetEditor::_drop_data_fw(const Point2 &p_point, const Variant &p_data,
 				// Actually create the new source.
 				Ref<TileSetAtlasSource> atlas_source = memnew(TileSetAtlasSource);
 				atlas_source->set_texture(resource);
+				Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 				undo_redo->create_action(TTR("Add a new atlas source"));
 				undo_redo->add_do_method(*tile_set, "add_source", atlas_source, source_id);
 				undo_redo->add_do_method(*atlas_source, "set_texture_region_size", tile_set->get_tile_size());
@@ -256,6 +257,7 @@ void TileSetEditor::_source_delete_pressed() {
 	Ref<TileSetSource> source = tile_set->get_source(to_delete);
 
 	// Remove the source.
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Remove source"));
 	undo_redo->add_do_method(*tile_set, "remove_source", to_delete);
 	undo_redo->add_undo_method(*tile_set, "add_source", source, to_delete);
@@ -274,6 +276,7 @@ void TileSetEditor::_source_add_id_pressed(int p_id_pressed) {
 			Ref<TileSetAtlasSource> atlas_source = memnew(TileSetAtlasSource);
 
 			// Add a new source.
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Add atlas source"));
 			undo_redo->add_do_method(*tile_set, "add_source", atlas_source, source_id);
 			undo_redo->add_do_method(*atlas_source, "set_texture_region_size", tile_set->get_tile_size());
@@ -288,6 +291,7 @@ void TileSetEditor::_source_add_id_pressed(int p_id_pressed) {
 			Ref<TileSetScenesCollectionSource> scene_collection_source = memnew(TileSetScenesCollectionSource);
 
 			// Add a new source.
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Add atlas source"));
 			undo_redo->add_do_method(*tile_set, "add_source", scene_collection_source, source_id);
 			undo_redo->add_undo_method(*tile_set, "remove_source", source_id);
@@ -361,6 +365,7 @@ void TileSetEditor::_patterns_item_list_gui_input(const Ref<InputEvent> &p_event
 
 	if (ED_IS_SHORTCUT("tiles_editor/delete", p_event) && p_event->is_pressed() && !p_event->is_echo()) {
 		Vector<int> selected = patterns_item_list->get_selected_items();
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		undo_redo->create_action(TTR("Remove TileSet patterns"));
 		for (int i = 0; i < selected.size(); i++) {
 			int pattern_index = selected[i];
@@ -666,8 +671,6 @@ void TileSetEditor::edit(Ref<TileSet> p_tile_set) {
 TileSetEditor::TileSetEditor() {
 	singleton = this;
 
-	undo_redo = EditorNode::get_undo_redo();
-
 	set_process_internal(true);
 
 	// TabBar.
diff --git a/editor/plugins/tiles/tile_set_editor.h b/editor/plugins/tiles/tile_set_editor.h
index 76a471db74..95697f7ecc 100644
--- a/editor/plugins/tiles/tile_set_editor.h
+++ b/editor/plugins/tiles/tile_set_editor.h
@@ -39,8 +39,6 @@
 #include "tile_set_atlas_source_editor.h"
 #include "tile_set_scenes_collection_source_editor.h"
 
-class EditorUndoRedoManager;
-
 class TileSetEditor : public VBoxContainer {
 	GDCLASS(TileSetEditor, VBoxContainer);
 
@@ -60,8 +58,6 @@ private:
 	TileSetAtlasSourceEditor *tile_set_atlas_source_editor = nullptr;
 	TileSetScenesCollectionSourceEditor *tile_set_scenes_collection_source_editor = nullptr;
 
-	Ref<EditorUndoRedoManager> undo_redo;
-
 	void _drop_data_fw(const Point2 &p_point, const Variant &p_data, Control *p_from);
 	bool _can_drop_data_fw(const Point2 &p_point, const Variant &p_data, Control *p_from) const;
 
diff --git a/editor/plugins/visual_shader_editor_plugin.cpp b/editor/plugins/visual_shader_editor_plugin.cpp
index adc93c0752..a51396d712 100644
--- a/editor/plugins/visual_shader_editor_plugin.cpp
+++ b/editor/plugins/visual_shader_editor_plugin.cpp
@@ -1113,6 +1113,8 @@ void VisualShaderEditor::edit(VisualShader *p_visual_shader) {
 		}
 		visual_shader->set_graph_offset(graph->get_scroll_ofs() / EDSCALE);
 		_set_mode(visual_shader->get_mode());
+
+		_update_nodes();
 	} else {
 		if (visual_shader.is_valid()) {
 			Callable ce = callable_mp(this, &VisualShaderEditor::_update_preview);
@@ -1659,6 +1661,7 @@ void VisualShaderEditor::_update_parameters(bool p_update_refs) {
 }
 
 void VisualShaderEditor::_update_parameter_refs(HashSet<String> &p_deleted_names) {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	for (int i = 0; i < VisualShader::TYPE_MAX; i++) {
 		VisualShader::Type type = VisualShader::Type(i);
 
@@ -1758,6 +1761,7 @@ void VisualShaderEditor::_add_input_port(int p_node, int p_port, int p_port_type
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Add Input Port"));
 	undo_redo->add_do_method(node.ptr(), "add_input_port", p_port, p_port_type, p_name);
 	undo_redo->add_undo_method(node.ptr(), "remove_input_port", p_port);
@@ -1773,6 +1777,7 @@ void VisualShaderEditor::_add_output_port(int p_node, int p_port, int p_port_typ
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Add Output Port"));
 	undo_redo->add_do_method(node.ptr(), "add_output_port", p_port, p_port_type, p_name);
 	undo_redo->add_undo_method(node.ptr(), "remove_output_port", p_port);
@@ -1788,6 +1793,7 @@ void VisualShaderEditor::_change_input_port_type(int p_type, int p_node, int p_p
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Change Input Port Type"));
 	undo_redo->add_do_method(node.ptr(), "set_input_port_type", p_port, p_type);
 	undo_redo->add_undo_method(node.ptr(), "set_input_port_type", p_port, node->get_input_port_type(p_port));
@@ -1803,6 +1809,7 @@ void VisualShaderEditor::_change_output_port_type(int p_type, int p_node, int p_
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Change Output Port Type"));
 	undo_redo->add_do_method(node.ptr(), "set_output_port_type", p_port, p_type);
 	undo_redo->add_undo_method(node.ptr(), "set_output_port_type", p_port, node->get_output_port_type(p_port));
@@ -1831,6 +1838,7 @@ void VisualShaderEditor::_change_input_port_name(const String &p_text, Object *p
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Change Input Port Name"));
 	undo_redo->add_do_method(node.ptr(), "set_input_port_name", p_port_id, validated_name);
 	undo_redo->add_undo_method(node.ptr(), "set_input_port_name", p_port_id, node->get_input_port_name(p_port_id));
@@ -1857,6 +1865,7 @@ void VisualShaderEditor::_change_output_port_name(const String &p_text, Object *
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Change Output Port Name"));
 	undo_redo->add_do_method(node.ptr(), "set_output_port_name", p_port_id, validated_name);
 	undo_redo->add_undo_method(node.ptr(), "set_output_port_name", p_port_id, prev_name);
@@ -1869,6 +1878,7 @@ void VisualShaderEditor::_expand_output_port(int p_node, int p_port, bool p_expa
 	Ref<VisualShaderNode> node = visual_shader->get_node(type, p_node);
 	ERR_FAIL_COND(!node.is_valid());
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	if (p_expand) {
 		undo_redo->create_action(TTR("Expand Output Port"));
 	} else {
@@ -1966,6 +1976,7 @@ void VisualShaderEditor::_remove_input_port(int p_node, int p_port) {
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Remove Input Port"));
 
 	List<VisualShader::Connection> conns;
@@ -2015,6 +2026,7 @@ void VisualShaderEditor::_remove_output_port(int p_node, int p_port) {
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Remove Output Port"));
 
 	List<VisualShader::Connection> conns;
@@ -2081,6 +2093,7 @@ void VisualShaderEditor::_expression_focus_out(Object *code_edit, int p_node) {
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Set VisualShader Expression"));
 	undo_redo->add_do_method(node.ptr(), "set_expression", expression_box->get_text());
 	undo_redo->add_undo_method(node.ptr(), "set_expression", node->get_expression());
@@ -2144,6 +2157,7 @@ void VisualShaderEditor::_node_resized(const Vector2 &p_new_size, int p_type, in
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Resize VisualShader Node"), UndoRedo::MERGE_ENDS);
 	undo_redo->add_do_method(this, "_set_node_size", p_type, p_node, p_new_size);
 	undo_redo->add_undo_method(this, "_set_node_size", p_type, p_node, node->get_size());
@@ -2160,6 +2174,7 @@ void VisualShaderEditor::_preview_select_port(int p_node, int p_port) {
 	if (node->get_output_port_for_preview() == p_port) {
 		p_port = -1; //toggle it
 	}
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(p_port == -1 ? TTR("Hide Port Preview") : TTR("Show Port Preview"));
 	undo_redo->add_do_method(node.ptr(), "set_output_port_for_preview", p_port);
 	undo_redo->add_undo_method(node.ptr(), "set_output_port_for_preview", prev_port);
@@ -2205,6 +2220,7 @@ void VisualShaderEditor::_comment_title_popup_hide() {
 	if (node->get_title() == comment_title_change_edit->get_text()) {
 		return; // nothing changed - ignored
 	}
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Set Comment Node Title"));
 	undo_redo->add_do_method(node.ptr(), "set_title", comment_title_change_edit->get_text());
 	undo_redo->add_undo_method(node.ptr(), "set_title", node->get_title());
@@ -2247,6 +2263,7 @@ void VisualShaderEditor::_comment_desc_popup_hide() {
 	if (node->get_description() == comment_desc_change_edit->get_text()) {
 		return; // nothing changed - ignored
 	}
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Set Comment Node Description"));
 	undo_redo->add_do_method(node.ptr(), "set_description", comment_desc_change_edit->get_text());
 	undo_redo->add_undo_method(node.ptr(), "set_description", node->get_title());
@@ -2267,6 +2284,7 @@ void VisualShaderEditor::_parameter_line_edit_changed(const String &p_text, int
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Set Parameter Name"));
 	undo_redo->add_do_method(node.ptr(), "set_parameter_name", validated_name);
 	undo_redo->add_undo_method(node.ptr(), "set_parameter_name", node->get_parameter_name());
@@ -2302,6 +2320,7 @@ void VisualShaderEditor::_port_edited(const StringName &p_property, const Varian
 	Ref<VisualShaderNode> vsn = visual_shader->get_node(type, editing_node);
 	ERR_FAIL_COND(!vsn.is_valid());
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Set Input Default Port"));
 
 	Ref<VisualShaderNodeCustom> custom = Object::cast_to<VisualShaderNodeCustom>(vsn.ptr());
@@ -2725,6 +2744,7 @@ void VisualShaderEditor::_add_node(int p_idx, const Vector<Variant> &p_ops, Stri
 
 	int id_to_use = visual_shader->get_valid_node_id(type);
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	if (p_resource_path.is_empty()) {
 		undo_redo->create_action(TTR("Add Node to Visual Shader"));
 	} else {
@@ -2894,6 +2914,7 @@ void VisualShaderEditor::_add_node(int p_idx, const Vector<Variant> &p_ops, Stri
 }
 
 void VisualShaderEditor::_add_varying(const String &p_name, VisualShader::VaryingMode p_mode, VisualShader::VaryingType p_type) {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(vformat(TTR("Add Varying to Visual Shader: %s"), p_name));
 
 	undo_redo->add_do_method(visual_shader.ptr(), "add_varying", p_name, p_mode, p_type);
@@ -2928,6 +2949,7 @@ void VisualShaderEditor::_add_varying(const String &p_name, VisualShader::Varyin
 }
 
 void VisualShaderEditor::_remove_varying(const String &p_name) {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(vformat(TTR("Remove Varying from Visual Shader: %s"), p_name));
 
 	VisualShader::VaryingMode var_mode = visual_shader->get_varying_mode(p_name);
@@ -3015,6 +3037,7 @@ void VisualShaderEditor::_node_dragged(const Vector2 &p_from, const Vector2 &p_t
 void VisualShaderEditor::_nodes_dragged() {
 	drag_dirty = false;
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Node(s) Moved"));
 
 	for (const DragOp &E : drag_buffer) {
@@ -3038,6 +3061,7 @@ void VisualShaderEditor::_connection_request(const String &p_from, int p_from_in
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Nodes Connected"));
 
 	List<VisualShader::Connection> conns;
@@ -3069,6 +3093,7 @@ void VisualShaderEditor::_disconnection_request(const String &p_from, int p_from
 	int from = p_from.to_int();
 	int to = p_to.to_int();
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Nodes Disconnected"));
 	undo_redo->add_do_method(visual_shader.ptr(), "disconnect_nodes", type, from, p_from_index, to, p_to_index);
 	undo_redo->add_undo_method(visual_shader.ptr(), "connect_nodes", type, from, p_from_index, to, p_to_index);
@@ -3108,6 +3133,7 @@ void VisualShaderEditor::_delete_nodes(int p_type, const List<int> &p_nodes) {
 	List<VisualShader::Connection> conns;
 	visual_shader->get_node_connections(type, &conns);
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	for (const int &F : p_nodes) {
 		for (const VisualShader::Connection &E : conns) {
 			if (E.from_node == F || E.to_node == F) {
@@ -3180,6 +3206,7 @@ void VisualShaderEditor::_delete_nodes(int p_type, const List<int> &p_nodes) {
 }
 
 void VisualShaderEditor::_replace_node(VisualShader::Type p_type_id, int p_node_id, const StringName &p_from, const StringName &p_to) {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->add_do_method(visual_shader.ptr(), "replace_node", p_type_id, p_node_id, p_to);
 	undo_redo->add_undo_method(visual_shader.ptr(), "replace_node", p_type_id, p_node_id, p_from);
 }
@@ -3214,6 +3241,7 @@ void VisualShaderEditor::_update_parameter(VisualShader::Type p_type_id, int p_n
 void VisualShaderEditor::_convert_constants_to_parameters(bool p_vice_versa) {
 	VisualShader::Type type_id = get_current_shader_type();
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	if (!p_vice_versa) {
 		undo_redo->create_action(TTR("Convert Constant Node(s) To Parameter(s)"));
 	} else {
@@ -3412,6 +3440,7 @@ void VisualShaderEditor::_delete_node_request(int p_type, int p_node) {
 	List<int> to_erase;
 	to_erase.push_back(p_node);
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Delete VisualShader Node"));
 	_delete_nodes(p_type, to_erase);
 	undo_redo->commit_action();
@@ -3440,6 +3469,7 @@ void VisualShaderEditor::_delete_nodes_request(const TypedArray<StringName> &p_n
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("Delete VisualShader Node(s)"));
 	_delete_nodes(get_current_shader_type(), to_erase);
 	undo_redo->commit_action();
@@ -3852,6 +3882,7 @@ void VisualShaderEditor::_dup_copy_nodes(int p_type, List<CopyItem> &r_items, Li
 }
 
 void VisualShaderEditor::_dup_paste_nodes(int p_type, List<CopyItem> &r_items, const List<VisualShader::Connection> &p_connections, const Vector2 &p_offset, bool p_duplicate) {
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	if (p_duplicate) {
 		undo_redo->create_action(TTR("Duplicate VisualShader Node(s)"));
 	} else {
@@ -3970,6 +4001,7 @@ void VisualShaderEditor::_copy_nodes(bool p_cut) {
 	_dup_copy_nodes(get_current_shader_type(), copy_items_buffer, copy_connections_buffer);
 
 	if (p_cut) {
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		undo_redo->create_action(TTR("Cut VisualShader Node(s)"));
 
 		List<int> ids;
@@ -4240,6 +4272,7 @@ void VisualShaderEditor::_float_constant_selected(int p_which) {
 		return; // same
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(vformat(TTR("Set Constant: %s"), float_constant_defs[p_which].name));
 	undo_redo->add_do_method(node.ptr(), "set_constant", float_constant_defs[p_which].value);
 	undo_redo->add_undo_method(node.ptr(), "set_constant", node->get_constant());
@@ -4500,6 +4533,7 @@ void VisualShaderEditor::drop_data_fw(const Point2 &p_point, const Variant &p_da
 			saved_node_pos_dirty = true;
 			_add_node(idx, add_options[idx].ops);
 		} else if (d.has("files")) {
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			undo_redo->create_action(TTR("Add Node(s) to Visual Shader"));
 
 			if (d["files"].get_type() == Variant::PACKED_STRING_ARRAY) {
@@ -5675,8 +5709,6 @@ VisualShaderEditor::VisualShaderEditor() {
 
 	_update_options_menu();
 
-	undo_redo = EditorNode::get_undo_redo();
-
 	Ref<VisualShaderNodePluginDefault> default_plugin;
 	default_plugin.instantiate();
 	default_plugin->set_editor(this);
diff --git a/editor/plugins/visual_shader_editor_plugin.h b/editor/plugins/visual_shader_editor_plugin.h
index e673051eb3..e9d6257f81 100644
--- a/editor/plugins/visual_shader_editor_plugin.h
+++ b/editor/plugins/visual_shader_editor_plugin.h
@@ -41,7 +41,6 @@ class GraphEdit;
 class GraphNode;
 
 class VisualShaderEditor;
-class EditorUndoRedoManager;
 
 class VisualShaderNodePlugin : public RefCounted {
 	GDCLASS(VisualShaderNodePlugin, RefCounted);
@@ -185,7 +184,6 @@ class VisualShaderEditor : public VBoxContainer {
 	PanelContainer *error_panel = nullptr;
 	Label *error_label = nullptr;
 
-	Ref<EditorUndoRedoManager> undo_redo;
 	Point2 saved_node_pos;
 	bool saved_node_pos_dirty = false;
 
diff --git a/editor/rename_dialog.cpp b/editor/rename_dialog.cpp
index f918570c66..74c123d942 100644
--- a/editor/rename_dialog.cpp
+++ b/editor/rename_dialog.cpp
@@ -46,9 +46,8 @@
 #include "scene/gui/separator.h"
 #include "scene/gui/tab_container.h"
 
-RenameDialog::RenameDialog(SceneTreeEditor *p_scene_tree_editor, Ref<EditorUndoRedoManager> p_undo_redo) {
+RenameDialog::RenameDialog(SceneTreeEditor *p_scene_tree_editor) {
 	scene_tree_editor = p_scene_tree_editor;
-	undo_redo = p_undo_redo;
 	preview_node = nullptr;
 
 	set_title(TTR("Batch Rename"));
@@ -582,7 +581,8 @@ void RenameDialog::rename() {
 	// Forward recursive as opposed to the actual renaming.
 	_iterate_scene(root_node, selected_node_list, &global_count);
 
-	if (undo_redo.is_valid() && !to_rename.is_empty()) {
+	if (!to_rename.is_empty()) {
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		undo_redo->create_action(TTR("Batch Rename"));
 
 		// Make sure to iterate reversed so that child nodes will find parents.
diff --git a/editor/rename_dialog.h b/editor/rename_dialog.h
index dac73b13b8..2a3fda59a4 100644
--- a/editor/rename_dialog.h
+++ b/editor/rename_dialog.h
@@ -42,8 +42,6 @@
 #include "scene/gui/spin_box.h"
 #include "scene/gui/tab_container.h"
 
-class EditorUndoRedoManager;
-
 class RenameDialog : public ConfirmationDialog {
 	GDCLASS(RenameDialog, ConfirmationDialog);
 
@@ -64,7 +62,6 @@ class RenameDialog : public ConfirmationDialog {
 	static void _error_handler(void *p_self, const char *p_func, const char *p_file, int p_line, const char *p_error, const char *p_errorexp, bool p_editor_notify, ErrorHandlerType p_type);
 
 	SceneTreeEditor *scene_tree_editor = nullptr;
-	Ref<EditorUndoRedoManager> undo_redo;
 	int global_count = 0;
 
 	LineEdit *lne_search = nullptr;
@@ -110,8 +107,7 @@ public:
 	void reset();
 	void rename();
 
-	RenameDialog(SceneTreeEditor *p_scene_tree_editor, Ref<EditorUndoRedoManager> p_undo_redo = Ref<EditorUndoRedoManager>());
-	~RenameDialog() {}
+	RenameDialog(SceneTreeEditor *p_scene_tree_editor);
 };
 
 #endif // MODULE_REGEX_ENABLED
diff --git a/editor/scene_tree_dock.cpp b/editor/scene_tree_dock.cpp
index 8c0a30836f..9c3ef4cecc 100644
--- a/editor/scene_tree_dock.cpp
+++ b/editor/scene_tree_dock.cpp
@@ -226,28 +226,29 @@ void SceneTreeDock::_perform_instantiate_scenes(const Vector<String> &p_files, N
 		return;
 	}
 
-	editor_data->get_undo_redo()->create_action(TTR("Instantiate Scene(s)"));
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
+	undo_redo->create_action(TTR("Instantiate Scene(s)"));
 
 	for (int i = 0; i < instances.size(); i++) {
 		Node *instantiated_scene = instances[i];
 
-		editor_data->get_undo_redo()->add_do_method(parent, "add_child", instantiated_scene, true);
+		undo_redo->add_do_method(parent, "add_child", instantiated_scene, true);
 		if (p_pos >= 0) {
-			editor_data->get_undo_redo()->add_do_method(parent, "move_child", instantiated_scene, p_pos + i);
+			undo_redo->add_do_method(parent, "move_child", instantiated_scene, p_pos + i);
 		}
-		editor_data->get_undo_redo()->add_do_method(instantiated_scene, "set_owner", edited_scene);
-		editor_data->get_undo_redo()->add_do_method(editor_selection, "clear");
-		editor_data->get_undo_redo()->add_do_method(editor_selection, "add_node", instantiated_scene);
-		editor_data->get_undo_redo()->add_do_reference(instantiated_scene);
-		editor_data->get_undo_redo()->add_undo_method(parent, "remove_child", instantiated_scene);
+		undo_redo->add_do_method(instantiated_scene, "set_owner", edited_scene);
+		undo_redo->add_do_method(editor_selection, "clear");
+		undo_redo->add_do_method(editor_selection, "add_node", instantiated_scene);
+		undo_redo->add_do_reference(instantiated_scene);
+		undo_redo->add_undo_method(parent, "remove_child", instantiated_scene);
 
 		String new_name = parent->validate_child_name(instantiated_scene);
 		EditorDebuggerNode *ed = EditorDebuggerNode::get_singleton();
-		editor_data->get_undo_redo()->add_do_method(ed, "live_debug_instantiate_node", edited_scene->get_path_to(parent), p_files[i], new_name);
-		editor_data->get_undo_redo()->add_undo_method(ed, "live_debug_remove_node", NodePath(String(edited_scene->get_path_to(parent)).path_join(new_name)));
+		undo_redo->add_do_method(ed, "live_debug_instantiate_node", edited_scene->get_path_to(parent), p_files[i], new_name);
+		undo_redo->add_undo_method(ed, "live_debug_remove_node", NodePath(String(edited_scene->get_path_to(parent)).path_join(new_name)));
 	}
 
-	editor_data->get_undo_redo()->commit_action();
+	undo_redo->commit_action();
 	_push_item(instances[instances.size() - 1]);
 	for (int i = 0; i < instances.size(); i++) {
 		emit_signal(SNAME("node_created"), instances[i]);
@@ -534,23 +535,24 @@ void SceneTreeDock::_tool_selected(int p_tool, bool p_confirm_override) {
 				return;
 			}
 
-			editor_data->get_undo_redo()->create_action(TTR("Detach Script"), UndoRedo::MERGE_DISABLE, EditorNode::get_singleton()->get_edited_scene());
-			editor_data->get_undo_redo()->add_do_method(EditorNode::get_singleton(), "push_item", (Script *)nullptr);
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
+			undo_redo->create_action(TTR("Detach Script"), UndoRedo::MERGE_DISABLE, EditorNode::get_singleton()->get_edited_scene());
+			undo_redo->add_do_method(EditorNode::get_singleton(), "push_item", (Script *)nullptr);
 
 			for (int i = 0; i < selection.size(); i++) {
 				Node *n = Object::cast_to<Node>(selection[i]);
 				Ref<Script> existing = n->get_script();
 				Ref<Script> empty = EditorNode::get_singleton()->get_object_custom_type_base(n);
 				if (existing != empty) {
-					editor_data->get_undo_redo()->add_do_method(n, "set_script", empty);
-					editor_data->get_undo_redo()->add_undo_method(n, "set_script", existing);
+					undo_redo->add_do_method(n, "set_script", empty);
+					undo_redo->add_undo_method(n, "set_script", existing);
 				}
 			}
 
-			editor_data->get_undo_redo()->add_do_method(this, "_update_script_button");
-			editor_data->get_undo_redo()->add_undo_method(this, "_update_script_button");
+			undo_redo->add_do_method(this, "_update_script_button");
+			undo_redo->add_undo_method(this, "_update_script_button");
 
-			editor_data->get_undo_redo()->commit_action();
+			undo_redo->commit_action();
 		} break;
 		case TOOL_MOVE_UP:
 		case TOOL_MOVE_DOWN: {
@@ -604,11 +606,12 @@ void SceneTreeDock::_tool_selected(int p_tool, bool p_confirm_override) {
 				break; // one or more nodes can not be moved
 			}
 
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 			if (selection.size() == 1) {
-				editor_data->get_undo_redo()->create_action(TTR("Move Node In Parent"));
+				undo_redo->create_action(TTR("Move Node In Parent"));
 			}
 			if (selection.size() > 1) {
-				editor_data->get_undo_redo()->create_action(TTR("Move Nodes In Parent"));
+				undo_redo->create_action(TTR("Move Nodes In Parent"));
 			}
 
 			for (int i = 0; i < selection.size(); i++) {
@@ -621,11 +624,11 @@ void SceneTreeDock::_tool_selected(int p_tool, bool p_confirm_override) {
 				int bottom_node_pos = bottom_node->get_index();
 				int top_node_pos_next = top_node->get_index() + (MOVING_DOWN ? 1 : -1);
 
-				editor_data->get_undo_redo()->add_do_method(top_node->get_parent(), "move_child", top_node, top_node_pos_next);
-				editor_data->get_undo_redo()->add_undo_method(bottom_node->get_parent(), "move_child", bottom_node, bottom_node_pos);
+				undo_redo->add_do_method(top_node->get_parent(), "move_child", top_node, top_node_pos_next);
+				undo_redo->add_undo_method(bottom_node->get_parent(), "move_child", bottom_node, bottom_node_pos);
 			}
 
-			editor_data->get_undo_redo()->commit_action();
+			undo_redo->commit_action();
 
 		} break;
 		case TOOL_DUPLICATE: {
@@ -653,8 +656,9 @@ void SceneTreeDock::_tool_selected(int p_tool, bool p_confirm_override) {
 				break;
 			}
 
-			editor_data->get_undo_redo()->create_action(TTR("Duplicate Node(s)"), UndoRedo::MERGE_DISABLE, selection.front()->get());
-			editor_data->get_undo_redo()->add_do_method(editor_selection, "clear");
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
+			undo_redo->create_action(TTR("Duplicate Node(s)"), UndoRedo::MERGE_DISABLE, selection.front()->get());
+			undo_redo->add_do_method(editor_selection, "clear");
 
 			Node *dupsingle = nullptr;
 
@@ -679,28 +683,28 @@ void SceneTreeDock::_tool_selected(int p_tool, bool p_confirm_override) {
 
 				dup->set_name(parent->validate_child_name(dup));
 
-				editor_data->get_undo_redo()->add_do_method(add_below_node, "add_sibling", dup, true);
+				undo_redo->add_do_method(add_below_node, "add_sibling", dup, true);
 
 				for (Node *F : owned) {
 					if (!duplimap.has(F)) {
 						continue;
 					}
 					Node *d = duplimap[F];
-					editor_data->get_undo_redo()->add_do_method(d, "set_owner", node->get_owner());
+					undo_redo->add_do_method(d, "set_owner", node->get_owner());
 				}
-				editor_data->get_undo_redo()->add_do_method(editor_selection, "add_node", dup);
-				editor_data->get_undo_redo()->add_undo_method(parent, "remove_child", dup);
-				editor_data->get_undo_redo()->add_do_reference(dup);
+				undo_redo->add_do_method(editor_selection, "add_node", dup);
+				undo_redo->add_undo_method(parent, "remove_child", dup);
+				undo_redo->add_do_reference(dup);
 
 				EditorDebuggerNode *ed = EditorDebuggerNode::get_singleton();
 
-				editor_data->get_undo_redo()->add_do_method(ed, "live_debug_duplicate_node", edited_scene->get_path_to(node), dup->get_name());
-				editor_data->get_undo_redo()->add_undo_method(ed, "live_debug_remove_node", NodePath(String(edited_scene->get_path_to(parent)).path_join(dup->get_name())));
+				undo_redo->add_do_method(ed, "live_debug_duplicate_node", edited_scene->get_path_to(node), dup->get_name());
+				undo_redo->add_undo_method(ed, "live_debug_remove_node", NodePath(String(edited_scene->get_path_to(parent)).path_join(dup->get_name())));
 
 				add_below_node = dup;
 			}
 
-			editor_data->get_undo_redo()->commit_action();
+			undo_redo->commit_action();
 
 			if (dupsingle) {
 				_push_item(dupsingle);
@@ -769,29 +773,30 @@ void SceneTreeDock::_tool_selected(int p_tool, bool p_confirm_override) {
 				return;
 			}
 
-			editor_data->get_undo_redo()->create_action(TTR("Make node as Root"));
-			editor_data->get_undo_redo()->add_do_method(node->get_parent(), "remove_child", node);
-			editor_data->get_undo_redo()->add_do_method(EditorNode::get_singleton(), "set_edited_scene", node);
-			editor_data->get_undo_redo()->add_do_method(node, "add_child", root, true);
-			editor_data->get_undo_redo()->add_do_method(node, "set_scene_file_path", root->get_scene_file_path());
-			editor_data->get_undo_redo()->add_do_method(root, "set_scene_file_path", String());
-			editor_data->get_undo_redo()->add_do_method(node, "set_owner", (Object *)nullptr);
-			editor_data->get_undo_redo()->add_do_method(root, "set_owner", node);
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
+			undo_redo->create_action(TTR("Make node as Root"));
+			undo_redo->add_do_method(node->get_parent(), "remove_child", node);
+			undo_redo->add_do_method(EditorNode::get_singleton(), "set_edited_scene", node);
+			undo_redo->add_do_method(node, "add_child", root, true);
+			undo_redo->add_do_method(node, "set_scene_file_path", root->get_scene_file_path());
+			undo_redo->add_do_method(root, "set_scene_file_path", String());
+			undo_redo->add_do_method(node, "set_owner", (Object *)nullptr);
+			undo_redo->add_do_method(root, "set_owner", node);
 			_node_replace_owner(root, root, node, MODE_DO);
 
-			editor_data->get_undo_redo()->add_undo_method(root, "set_scene_file_path", root->get_scene_file_path());
-			editor_data->get_undo_redo()->add_undo_method(node, "set_scene_file_path", String());
-			editor_data->get_undo_redo()->add_undo_method(node, "remove_child", root);
-			editor_data->get_undo_redo()->add_undo_method(EditorNode::get_singleton(), "set_edited_scene", root);
-			editor_data->get_undo_redo()->add_undo_method(node->get_parent(), "add_child", node, true);
-			editor_data->get_undo_redo()->add_undo_method(node->get_parent(), "move_child", node, node->get_index());
-			editor_data->get_undo_redo()->add_undo_method(root, "set_owner", (Object *)nullptr);
-			editor_data->get_undo_redo()->add_undo_method(node, "set_owner", root);
+			undo_redo->add_undo_method(root, "set_scene_file_path", root->get_scene_file_path());
+			undo_redo->add_undo_method(node, "set_scene_file_path", String());
+			undo_redo->add_undo_method(node, "remove_child", root);
+			undo_redo->add_undo_method(EditorNode::get_singleton(), "set_edited_scene", root);
+			undo_redo->add_undo_method(node->get_parent(), "add_child", node, true);
+			undo_redo->add_undo_method(node->get_parent(), "move_child", node, node->get_index());
+			undo_redo->add_undo_method(root, "set_owner", (Object *)nullptr);
+			undo_redo->add_undo_method(node, "set_owner", root);
 			_node_replace_owner(root, root, root, MODE_UNDO);
 
-			editor_data->get_undo_redo()->add_do_method(scene_tree, "update_tree");
-			editor_data->get_undo_redo()->add_undo_method(scene_tree, "update_tree");
-			editor_data->get_undo_redo()->commit_action();
+			undo_redo->add_do_method(scene_tree, "update_tree");
+			undo_redo->add_undo_method(scene_tree, "update_tree");
+			undo_redo->commit_action();
 		} break;
 		case TOOL_MULTI_EDIT: {
 			if (!profile_allow_editing) {
@@ -1013,7 +1018,7 @@ void SceneTreeDock::_tool_selected(int p_tool, bool p_confirm_override) {
 				Node *node = e->get();
 				if (node) {
 					Node *root = EditorNode::get_singleton()->get_edited_scene();
-					Ref<EditorUndoRedoManager> undo_redo = editor_data->get_undo_redo();
+					Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 					if (!root) {
 						break;
 					}
@@ -1089,7 +1094,7 @@ void SceneTreeDock::_tool_selected(int p_tool, bool p_confirm_override) {
 			bool enabling = !first_selected->get()->is_unique_name_in_owner();
 
 			List<Node *> full_selection = editor_selection->get_full_selected_node_list();
-			Ref<EditorUndoRedoManager> undo_redo = editor_data->get_undo_redo();
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 
 			if (enabling) {
 				Vector<Node *> new_unique_nodes;
@@ -1212,19 +1217,21 @@ void SceneTreeDock::_property_selected(int p_idx) {
 }
 
 void SceneTreeDock::_perform_property_drop(Node *p_node, String p_property, Ref<Resource> p_res) {
-	editor_data->get_undo_redo()->create_action(vformat(TTR("Set %s"), p_property));
-	editor_data->get_undo_redo()->add_do_property(p_node, p_property, p_res);
-	editor_data->get_undo_redo()->add_undo_property(p_node, p_property, p_node->get(p_property));
-	editor_data->get_undo_redo()->commit_action();
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
+	undo_redo->create_action(vformat(TTR("Set %s"), p_property));
+	undo_redo->add_do_property(p_node, p_property, p_res);
+	undo_redo->add_undo_property(p_node, p_property, p_node->get(p_property));
+	undo_redo->commit_action();
 }
 
 void SceneTreeDock::add_root_node(Node *p_node) {
-	editor_data->get_undo_redo()->create_action_for_history(TTR("New Scene Root"), editor_data->get_current_edited_scene_history_id());
-	editor_data->get_undo_redo()->add_do_method(EditorNode::get_singleton(), "set_edited_scene", p_node);
-	editor_data->get_undo_redo()->add_do_method(scene_tree, "update_tree");
-	editor_data->get_undo_redo()->add_do_reference(p_node);
-	editor_data->get_undo_redo()->add_undo_method(EditorNode::get_singleton(), "set_edited_scene", (Object *)nullptr);
-	editor_data->get_undo_redo()->commit_action();
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
+	undo_redo->create_action_for_history(TTR("New Scene Root"), editor_data->get_current_edited_scene_history_id());
+	undo_redo->add_do_method(EditorNode::get_singleton(), "set_edited_scene", p_node);
+	undo_redo->add_do_method(scene_tree, "update_tree");
+	undo_redo->add_do_reference(p_node);
+	undo_redo->add_undo_method(EditorNode::get_singleton(), "set_edited_scene", (Object *)nullptr);
+	undo_redo->commit_action();
 }
 
 void SceneTreeDock::_notification(int p_what) {
@@ -1369,7 +1376,7 @@ void SceneTreeDock::_notification(int p_what) {
 
 void SceneTreeDock::_node_replace_owner(Node *p_base, Node *p_node, Node *p_root, ReplaceOwnerMode p_mode) {
 	if (p_node->get_owner() == p_base && p_node != p_root) {
-		Ref<EditorUndoRedoManager> undo_redo = editor_data->get_undo_redo();
+		Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 		switch (p_mode) {
 			case MODE_BIDI: {
 				bool disable_unique = p_node->is_unique_name_in_owner() && p_root->get_node_or_null(UNIQUE_NODE_PREFIX + String(p_node->get_name())) != nullptr;
@@ -1603,8 +1610,9 @@ void SceneTreeDock::perform_node_renames(Node *p_base, HashMap<Node *, NodePath>
 		Variant old_variant = p_base->get(propertyname);
 		Variant updated_variant = old_variant;
 		if (_check_node_path_recursive(p_base, updated_variant, p_renames)) {
-			editor_data->get_undo_redo()->add_do_property(p_base, propertyname, updated_variant);
-			editor_data->get_undo_redo()->add_undo_property(p_base, propertyname, old_variant);
+			Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
+			undo_redo->add_do_property(p_base, propertyname, updated_variant);
+			undo_redo->add_undo_property(p_base, propertyname, old_variant);
 			p_base->set(propertyname, updated_variant);
 		}
 	}
@@ -1649,6 +1657,7 @@ void SceneTreeDock::perform_node_renames(Node *p_base, HashMap<Node *, NodePath>
 						}
 
 						HashMap<Node *, NodePath>::Iterator found_path = p_renames->find(n);
+						Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 						if (found_path) {
 							if (found_path->value == NodePath()) {
 								//will be erased
@@ -1662,12 +1671,12 @@ void SceneTreeDock::perform_node_renames(Node *p_base, HashMap<Node *, NodePath>
 									ERR_FAIL_COND(!EI); //another bug
 								}
 
-								editor_data->get_undo_redo()->add_do_method(anim.ptr(), "remove_track", idx);
-								editor_data->get_undo_redo()->add_undo_method(anim.ptr(), "add_track", anim->track_get_type(i), idx);
-								editor_data->get_undo_redo()->add_undo_method(anim.ptr(), "track_set_path", idx, track_np);
-								editor_data->get_undo_redo()->add_undo_method(anim.ptr(), "track_set_interpolation_type", idx, anim->track_get_interpolation_type(i));
+								undo_redo->add_do_method(anim.ptr(), "remove_track", idx);
+								undo_redo->add_undo_method(anim.ptr(), "add_track", anim->track_get_type(i), idx);
+								undo_redo->add_undo_method(anim.ptr(), "track_set_path", idx, track_np);
+								undo_redo->add_undo_method(anim.ptr(), "track_set_interpolation_type", idx, anim->track_get_interpolation_type(i));
 								for (int j = 0; j < anim->track_get_key_count(i); j++) {
-									editor_data->get_undo_redo()->add_undo_method(anim.ptr(), "track_insert_key", idx, anim->track_get_key_time(i, j), anim->track_get_key_value(i, j), anim->track_get_key_transition(i, j));
+									undo_redo->add_undo_method(anim.ptr(), "track_insert_key", idx, anim->track_get_key_time(i, j), anim->track_get_key_value(i, j), anim->track_get_key_transition(i, j));
 								}
 
 								ran.erase(i); //byebye channel
@@ -1680,8 +1689,8 @@ void SceneTreeDock::perform_node_renames(Node *p_base, HashMap<Node *, NodePath>
 								if (new_path == track_np) {
 									continue; //bleh
 								}
-								editor_data->get_undo_redo()->add_do_method(anim.ptr(), "track_set_path", i, new_path);
-								editor_data->get_undo_redo()->add_undo_method(anim.ptr(), "track_set_path", i, track_np);
+								undo_redo->add_do_method(anim.ptr(), "track_set_path", i, new_path);
+								undo_redo->add_undo_method(anim.ptr(), "track_set_path", i, track_np);
 							}
 						}
 					}
@@ -1815,7 +1824,8 @@ void SceneTreeDock::_do_reparent(Node *p_new_parent, int p_position_in_parent, V
 	// Sort by tree order, so re-adding is easy.
 	p_nodes.sort_custom<Node::Comparator>();
 
-	editor_data->get_undo_redo()->create_action(TTR("Reparent Node"), UndoRedo::MERGE_DISABLE, p_nodes[0]);
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
+	undo_redo->create_action(TTR("Reparent Node"), UndoRedo::MERGE_DISABLE, p_nodes[0]);
 
 	HashMap<Node *, NodePath> path_renames;
 	Vector<StringName> former_names;
@@ -1842,12 +1852,12 @@ void SceneTreeDock::_do_reparent(Node *p_new_parent, int p_position_in_parent, V
 		}
 
 		if (!same_parent) {
-			editor_data->get_undo_redo()->add_do_method(node->get_parent(), "remove_child", node);
-			editor_data->get_undo_redo()->add_do_method(new_parent, "add_child", node, true);
+			undo_redo->add_do_method(node->get_parent(), "remove_child", node);
+			undo_redo->add_do_method(new_parent, "add_child", node, true);
 		}
 
 		if (p_position_in_parent >= 0 || same_parent) {
-			editor_data->get_undo_redo()->add_do_method(new_parent, "move_child", node, p_position_in_parent + inc);
+			undo_redo->add_do_method(new_parent, "move_child", node, p_position_in_parent + inc);
 		}
 
 		EditorDebuggerNode *ed = EditorDebuggerNode::get_singleton();
@@ -1877,29 +1887,29 @@ void SceneTreeDock::_do_reparent(Node *p_new_parent, int p_position_in_parent, V
 			}
 		}
 
-		editor_data->get_undo_redo()->add_do_method(ed, "live_debug_reparent_node", edited_scene->get_path_to(node), edited_scene->get_path_to(new_parent), new_name, p_position_in_parent + inc);
-		editor_data->get_undo_redo()->add_undo_method(ed, "live_debug_reparent_node", NodePath(String(edited_scene->get_path_to(new_parent)).path_join(new_name)), edited_scene->get_path_to(node->get_parent()), node->get_name(), node->get_index());
+		undo_redo->add_do_method(ed, "live_debug_reparent_node", edited_scene->get_path_to(node), edited_scene->get_path_to(new_parent), new_name, p_position_in_parent + inc);
+		undo_redo->add_undo_method(ed, "live_debug_reparent_node", NodePath(String(edited_scene->get_path_to(new_parent)).path_join(new_name)), edited_scene->get_path_to(node->get_parent()), node->get_name(), node->get_index());
 
 		if (p_keep_global_xform) {
 			if (Object::cast_to<Node2D>(node)) {
-				editor_data->get_undo_redo()->add_do_method(node, "set_global_transform", Object::cast_to<Node2D>(node)->get_global_transform());
+				undo_redo->add_do_method(node, "set_global_transform", Object::cast_to<Node2D>(node)->get_global_transform());
 			}
 			if (Object::cast_to<Node3D>(node)) {
-				editor_data->get_undo_redo()->add_do_method(node, "set_global_transform", Object::cast_to<Node3D>(node)->get_global_transform());
+				undo_redo->add_do_method(node, "set_global_transform", Object::cast_to<Node3D>(node)->get_global_transform());
 			}
 			if (Object::cast_to<Control>(node)) {
-				editor_data->get_undo_redo()->add_do_method(node, "set_global_position", Object::cast_to<Control>(node)->get_global_position());
+				undo_redo->add_do_method(node, "set_global_position", Object::cast_to<Control>(node)->get_global_position());
 			}
 		}
 
-		editor_data->get_undo_redo()->add_do_method(this, "_set_owners", edited_scene, owners);
+		undo_redo->add_do_method(this, "_set_owners", edited_scene, owners);
 
 		if (AnimationPlayerEditor::get_singleton()->get_track_editor()->get_root() == node) {
-			editor_data->get_undo_redo()->add_do_method(AnimationPlayerEditor::get_singleton()->get_track_editor(), "set_root", node);
+			undo_redo->add_do_method(AnimationPlayerEditor::get_singleton()->get_track_editor(), "set_root", node);
 		}
 
-		editor_data->get_undo_redo()->add_undo_method(new_parent, "remove_child", node);
-		editor_data->get_undo_redo()->add_undo_method(node, "set_name", former_names[ni]);
+		undo_redo->add_undo_method(new_parent, "remove_child", node);
+		undo_redo->add_undo_method(node, "set_name", former_names[ni]);
 
 		inc++;
 	}
@@ -1917,29 +1927,29 @@ void SceneTreeDock::_do_reparent(Node *p_new_parent, int p_position_in_parent, V
 
 		int child_pos = node->get_index();
 
-		editor_data->get_undo_redo()->add_undo_method(node->get_parent(), "add_child", node, true);
-		editor_data->get_undo_redo()->add_undo_method(node->get_parent(), "move_child", node, child_pos);
-		editor_data->get_undo_redo()->add_undo_method(this, "_set_owners", edited_scene, owners);
+		undo_redo->add_undo_method(node->get_parent(), "add_child", node, true);
+		undo_redo->add_undo_method(node->get_parent(), "move_child", node, child_pos);
+		undo_redo->add_undo_method(this, "_set_owners", edited_scene, owners);
 		if (AnimationPlayerEditor::get_singleton()->get_track_editor()->get_root() == node) {
-			editor_data->get_undo_redo()->add_undo_method(AnimationPlayerEditor::get_singleton()->get_track_editor(), "set_root", node);
+			undo_redo->add_undo_method(AnimationPlayerEditor::get_singleton()->get_track_editor(), "set_root", node);
 		}
 
 		if (p_keep_global_xform) {
 			if (Object::cast_to<Node2D>(node)) {
-				editor_data->get_undo_redo()->add_undo_method(node, "set_transform", Object::cast_to<Node2D>(node)->get_transform());
+				undo_redo->add_undo_method(node, "set_transform", Object::cast_to<Node2D>(node)->get_transform());
 			}
 			if (Object::cast_to<Node3D>(node)) {
-				editor_data->get_undo_redo()->add_undo_method(node, "set_transform", Object::cast_to<Node3D>(node)->get_transform());
+				undo_redo->add_undo_method(node, "set_transform", Object::cast_to<Node3D>(node)->get_transform());
 			}
 			if (Object::cast_to<Control>(node)) {
-				editor_data->get_undo_redo()->add_undo_method(node, "set_position", Object::cast_to<Control>(node)->get_position());
+				undo_redo->add_undo_method(node, "set_position", Object::cast_to<Control>(node)->get_position());
 			}
 		}
 	}
 
 	perform_node_renames(nullptr, &path_renames);
 
-	editor_data->get_undo_redo()->commit_action();
+	undo_redo->commit_action();
 }
 
 void SceneTreeDock::_script_created(Ref<Script> p_script) {
@@ -1949,34 +1959,35 @@ void SceneTreeDock::_script_created(Ref<Script> p_script) {
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	if (selected.size() == 1) {
 		Node *node = selected.front()->get();
 		Ref<Script> existing = node->get_script();
 
-		editor_data->get_undo_redo()->create_action(TTR("Attach Script"), UndoRedo::MERGE_DISABLE, node);
-		editor_data->get_undo_redo()->add_do_method(InspectorDock::get_singleton(), "store_script_properties", node);
-		editor_data->get_undo_redo()->add_undo_method(InspectorDock::get_singleton(), "store_script_properties", node);
-		editor_data->get_undo_redo()->add_do_method(node, "set_script", p_script);
-		editor_data->get_undo_redo()->add_undo_method(node, "set_script", existing);
-		editor_data->get_undo_redo()->add_do_method(InspectorDock::get_singleton(), "apply_script_properties", node);
-		editor_data->get_undo_redo()->add_undo_method(InspectorDock::get_singleton(), "apply_script_properties", node);
-		editor_data->get_undo_redo()->add_do_method(this, "_update_script_button");
-		editor_data->get_undo_redo()->add_undo_method(this, "_update_script_button");
-		editor_data->get_undo_redo()->commit_action();
+		undo_redo->create_action(TTR("Attach Script"), UndoRedo::MERGE_DISABLE, node);
+		undo_redo->add_do_method(InspectorDock::get_singleton(), "store_script_properties", node);
+		undo_redo->add_undo_method(InspectorDock::get_singleton(), "store_script_properties", node);
+		undo_redo->add_do_method(node, "set_script", p_script);
+		undo_redo->add_undo_method(node, "set_script", existing);
+		undo_redo->add_do_method(InspectorDock::get_singleton(), "apply_script_properties", node);
+		undo_redo->add_undo_method(InspectorDock::get_singleton(), "apply_script_properties", node);
+		undo_redo->add_do_method(this, "_update_script_button");
+		undo_redo->add_undo_method(this, "_update_script_button");
+		undo_redo->commit_action();
 	} else {
-		editor_data->get_undo_redo()->create_action(TTR("Attach Script"), UndoRedo::MERGE_DISABLE, selected.front()->get());
+		undo_redo->create_action(TTR("Attach Script"), UndoRedo::MERGE_DISABLE, selected.front()->get());
 		for (Node *E : selected) {
 			Ref<Script> existing = E->get_script();
-			editor_data->get_undo_redo()->add_do_method(InspectorDock::get_singleton(), "store_script_properties", E);
-			editor_data->get_undo_redo()->add_undo_method(InspectorDock::get_singleton(), "store_script_properties", E);
-			editor_data->get_undo_redo()->add_do_method(E, "set_script", p_script);
-			editor_data->get_undo_redo()->add_undo_method(E, "set_script", existing);
-			editor_data->get_undo_redo()->add_do_method(InspectorDock::get_singleton(), "apply_script_properties", E);
-			editor_data->get_undo_redo()->add_undo_method(InspectorDock::get_singleton(), "apply_script_properties", E);
-			editor_data->get_undo_redo()->add_do_method(this, "_update_script_button");
-			editor_data->get_undo_redo()->add_undo_method(this, "_update_script_button");
+			undo_redo->add_do_method(InspectorDock::get_singleton(), "store_script_properties", E);
+			undo_redo->add_undo_method(InspectorDock::get_singleton(), "store_script_properties", E);
+			undo_redo->add_do_method(E, "set_script", p_script);
+			undo_redo->add_undo_method(E, "set_script", existing);
+			undo_redo->add_do_method(InspectorDock::get_singleton(), "apply_script_properties", E);
+			undo_redo->add_undo_method(InspectorDock::get_singleton(), "apply_script_properties", E);
+			undo_redo->add_do_method(this, "_update_script_button");
+			undo_redo->add_undo_method(this, "_update_script_button");
 		}
-		editor_data->get_undo_redo()->commit_action();
+		undo_redo->commit_action();
 	}
 
 	_push_item(p_script.operator->());
@@ -1990,10 +2001,11 @@ void SceneTreeDock::_shader_created(Ref<Shader> p_shader) {
 
 	Ref<Shader> existing = selected_shader_material->get_shader();
 
-	editor_data->get_undo_redo()->create_action(TTR("Set Shader"));
-	editor_data->get_undo_redo()->add_do_method(selected_shader_material.ptr(), "set_shader", p_shader);
-	editor_data->get_undo_redo()->add_undo_method(selected_shader_material.ptr(), "set_shader", existing);
-	editor_data->get_undo_redo()->commit_action();
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
+	undo_redo->create_action(TTR("Set Shader"));
+	undo_redo->add_do_method(selected_shader_material.ptr(), "set_shader", p_shader);
+	undo_redo->add_undo_method(selected_shader_material.ptr(), "set_shader", existing);
+	undo_redo->commit_action();
 }
 
 void SceneTreeDock::_script_creation_closed() {
@@ -2058,11 +2070,8 @@ void SceneTreeDock::_delete_confirm(bool p_cut) {
 
 	EditorNode::get_singleton()->get_editor_plugins_over()->make_visible(false);
 
-	if (p_cut) {
-		editor_data->get_undo_redo()->create_action(TTR("Cut Node(s)"), UndoRedo::MERGE_DISABLE, remove_list.front()->get());
-	} else {
-		editor_data->get_undo_redo()->create_action(TTR("Remove Node(s)"), UndoRedo::MERGE_DISABLE, remove_list.front()->get());
-	}
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
+	undo_redo->create_action(p_cut ? TTR("Cut Node(s)") : TTR("Remove Node(s)"), UndoRedo::MERGE_DISABLE, remove_list.front()->get());
 
 	bool entire_scene = false;
 
@@ -2074,11 +2083,11 @@ void SceneTreeDock::_delete_confirm(bool p_cut) {
 	}
 
 	if (entire_scene) {
-		editor_data->get_undo_redo()->add_do_method(EditorNode::get_singleton(), "set_edited_scene", (Object *)nullptr);
-		editor_data->get_undo_redo()->add_undo_method(EditorNode::get_singleton(), "set_edited_scene", edited_scene);
-		editor_data->get_undo_redo()->add_undo_method(edited_scene, "set_owner", edited_scene->get_owner());
-		editor_data->get_undo_redo()->add_undo_method(scene_tree, "update_tree");
-		editor_data->get_undo_redo()->add_undo_reference(edited_scene);
+		undo_redo->add_do_method(EditorNode::get_singleton(), "set_edited_scene", (Object *)nullptr);
+		undo_redo->add_undo_method(EditorNode::get_singleton(), "set_edited_scene", edited_scene);
+		undo_redo->add_undo_method(edited_scene, "set_owner", edited_scene->get_owner());
+		undo_redo->add_undo_method(scene_tree, "update_tree");
+		undo_redo->add_undo_reference(edited_scene);
 
 	} else {
 		remove_list.sort_custom<Node::Comparator>(); //sort nodes to keep positions
@@ -2107,21 +2116,21 @@ void SceneTreeDock::_delete_confirm(bool p_cut) {
 				owners.push_back(F);
 			}
 
-			editor_data->get_undo_redo()->add_do_method(n->get_parent(), "remove_child", n);
-			editor_data->get_undo_redo()->add_undo_method(n->get_parent(), "add_child", n, true);
-			editor_data->get_undo_redo()->add_undo_method(n->get_parent(), "move_child", n, n->get_index());
+			undo_redo->add_do_method(n->get_parent(), "remove_child", n);
+			undo_redo->add_undo_method(n->get_parent(), "add_child", n, true);
+			undo_redo->add_undo_method(n->get_parent(), "move_child", n, n->get_index());
 			if (AnimationPlayerEditor::get_singleton()->get_track_editor()->get_root() == n) {
-				editor_data->get_undo_redo()->add_undo_method(AnimationPlayerEditor::get_singleton()->get_track_editor(), "set_root", n);
+				undo_redo->add_undo_method(AnimationPlayerEditor::get_singleton()->get_track_editor(), "set_root", n);
 			}
-			editor_data->get_undo_redo()->add_undo_method(this, "_set_owners", edited_scene, owners);
-			editor_data->get_undo_redo()->add_undo_reference(n);
+			undo_redo->add_undo_method(this, "_set_owners", edited_scene, owners);
+			undo_redo->add_undo_reference(n);
 
 			EditorDebuggerNode *ed = EditorDebuggerNode::get_singleton();
-			editor_data->get_undo_redo()->add_do_method(ed, "live_debug_remove_and_keep_node", edited_scene->get_path_to(n), n->get_instance_id());
-			editor_data->get_undo_redo()->add_undo_method(ed, "live_debug_restore_node", n->get_instance_id(), edited_scene->get_path_to(n->get_parent()), n->get_index());
+			undo_redo->add_do_method(ed, "live_debug_remove_and_keep_node", edited_scene->get_path_to(n), n->get_instance_id());
+			undo_redo->add_undo_method(ed, "live_debug_restore_node", n->get_instance_id(), edited_scene->get_path_to(n->get_parent()), n->get_index());
 		}
 	}
-	editor_data->get_undo_redo()->commit_action();
+	undo_redo->commit_action();
 
 	// hack, force 2d editor viewport to refresh after deletion
 	if (CanvasItemEditor *editor = CanvasItemEditor::get_singleton()) {
@@ -2191,28 +2200,29 @@ void SceneTreeDock::_do_create(Node *p_parent) {
 	}
 	child->set_name(new_name);
 
-	editor_data->get_undo_redo()->create_action_for_history(TTR("Create Node"), editor_data->get_current_edited_scene_history_id());
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
+	undo_redo->create_action_for_history(TTR("Create Node"), editor_data->get_current_edited_scene_history_id());
 
 	if (edited_scene) {
-		editor_data->get_undo_redo()->add_do_method(p_parent, "add_child", child, true);
-		editor_data->get_undo_redo()->add_do_method(child, "set_owner", edited_scene);
-		editor_data->get_undo_redo()->add_do_method(editor_selection, "clear");
-		editor_data->get_undo_redo()->add_do_method(editor_selection, "add_node", child);
-		editor_data->get_undo_redo()->add_do_reference(child);
-		editor_data->get_undo_redo()->add_undo_method(p_parent, "remove_child", child);
+		undo_redo->add_do_method(p_parent, "add_child", child, true);
+		undo_redo->add_do_method(child, "set_owner", edited_scene);
+		undo_redo->add_do_method(editor_selection, "clear");
+		undo_redo->add_do_method(editor_selection, "add_node", child);
+		undo_redo->add_do_reference(child);
+		undo_redo->add_undo_method(p_parent, "remove_child", child);
 
 		EditorDebuggerNode *ed = EditorDebuggerNode::get_singleton();
-		editor_data->get_undo_redo()->add_do_method(ed, "live_debug_create_node", edited_scene->get_path_to(p_parent), child->get_class(), new_name);
-		editor_data->get_undo_redo()->add_undo_method(ed, "live_debug_remove_node", NodePath(String(edited_scene->get_path_to(p_parent)).path_join(new_name)));
+		undo_redo->add_do_method(ed, "live_debug_create_node", edited_scene->get_path_to(p_parent), child->get_class(), new_name);
+		undo_redo->add_undo_method(ed, "live_debug_remove_node", NodePath(String(edited_scene->get_path_to(p_parent)).path_join(new_name)));
 
 	} else {
-		editor_data->get_undo_redo()->add_do_method(EditorNode::get_singleton(), "set_edited_scene", child);
-		editor_data->get_undo_redo()->add_do_method(scene_tree, "update_tree");
-		editor_data->get_undo_redo()->add_do_reference(child);
-		editor_data->get_undo_redo()->add_undo_method(EditorNode::get_singleton(), "set_edited_scene", (Object *)nullptr);
+		undo_redo->add_do_method(EditorNode::get_singleton(), "set_edited_scene", child);
+		undo_redo->add_do_method(scene_tree, "update_tree");
+		undo_redo->add_do_reference(child);
+		undo_redo->add_undo_method(EditorNode::get_singleton(), "set_edited_scene", (Object *)nullptr);
 	}
 
-	editor_data->get_undo_redo()->commit_action();
+	undo_redo->commit_action();
 	_push_item(c);
 	editor_selection->clear();
 	editor_selection->add_node(child);
@@ -2393,7 +2403,7 @@ void SceneTreeDock::replace_node(Node *p_node, Node *p_by_node, bool p_keep_prop
 	}
 	//p_remove_old was added to support undo
 	if (p_remove_old) {
-		editor_data->get_undo_redo()->clear_history();
+		EditorNode::get_undo_redo()->clear_history();
 	}
 	newnode->set_name(newname);
 
@@ -2612,6 +2622,7 @@ void SceneTreeDock::_script_dropped(String p_file, NodePath p_to) {
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	if (Input::get_singleton()->is_key_pressed(Key::CTRL)) {
 		Object *obj = ClassDB::instantiate(scr->get_instance_base_type());
 		ERR_FAIL_NULL(obj);
@@ -2626,29 +2637,29 @@ void SceneTreeDock::_script_dropped(String p_file, NodePath p_to) {
 		new_node->set_name(Node::adjust_name_casing(p_file.get_file().get_basename()));
 		new_node->set_script(scr);
 
-		editor_data->get_undo_redo()->create_action(TTR("Instantiate Script"));
-		editor_data->get_undo_redo()->add_do_method(n, "add_child", new_node, true);
-		editor_data->get_undo_redo()->add_do_method(new_node, "set_owner", edited_scene);
-		editor_data->get_undo_redo()->add_do_method(editor_selection, "clear");
-		editor_data->get_undo_redo()->add_do_method(editor_selection, "add_node", new_node);
-		editor_data->get_undo_redo()->add_do_reference(new_node);
-		editor_data->get_undo_redo()->add_undo_method(n, "remove_child", new_node);
+		undo_redo->create_action(TTR("Instantiate Script"));
+		undo_redo->add_do_method(n, "add_child", new_node, true);
+		undo_redo->add_do_method(new_node, "set_owner", edited_scene);
+		undo_redo->add_do_method(editor_selection, "clear");
+		undo_redo->add_do_method(editor_selection, "add_node", new_node);
+		undo_redo->add_do_reference(new_node);
+		undo_redo->add_undo_method(n, "remove_child", new_node);
 
 		EditorDebuggerNode *ed = EditorDebuggerNode::get_singleton();
-		editor_data->get_undo_redo()->add_do_method(ed, "live_debug_create_node", edited_scene->get_path_to(n), new_node->get_class(), new_node->get_name());
-		editor_data->get_undo_redo()->add_undo_method(ed, "live_debug_remove_node", NodePath(String(edited_scene->get_path_to(n)).path_join(new_node->get_name())));
-		editor_data->get_undo_redo()->commit_action();
+		undo_redo->add_do_method(ed, "live_debug_create_node", edited_scene->get_path_to(n), new_node->get_class(), new_node->get_name());
+		undo_redo->add_undo_method(ed, "live_debug_remove_node", NodePath(String(edited_scene->get_path_to(n)).path_join(new_node->get_name())));
+		undo_redo->commit_action();
 	} else {
-		editor_data->get_undo_redo()->create_action(TTR("Attach Script"), UndoRedo::MERGE_DISABLE, n);
-		editor_data->get_undo_redo()->add_do_method(InspectorDock::get_singleton(), "store_script_properties", n);
-		editor_data->get_undo_redo()->add_undo_method(InspectorDock::get_singleton(), "store_script_properties", n);
-		editor_data->get_undo_redo()->add_do_method(n, "set_script", scr);
-		editor_data->get_undo_redo()->add_undo_method(n, "set_script", n->get_script());
-		editor_data->get_undo_redo()->add_do_method(InspectorDock::get_singleton(), "apply_script_properties", n);
-		editor_data->get_undo_redo()->add_undo_method(InspectorDock::get_singleton(), "apply_script_properties", n);
-		editor_data->get_undo_redo()->add_do_method(this, "_update_script_button");
-		editor_data->get_undo_redo()->add_undo_method(this, "_update_script_button");
-		editor_data->get_undo_redo()->commit_action();
+		undo_redo->create_action(TTR("Attach Script"), UndoRedo::MERGE_DISABLE, n);
+		undo_redo->add_do_method(InspectorDock::get_singleton(), "store_script_properties", n);
+		undo_redo->add_undo_method(InspectorDock::get_singleton(), "store_script_properties", n);
+		undo_redo->add_do_method(n, "set_script", scr);
+		undo_redo->add_undo_method(n, "set_script", n->get_script());
+		undo_redo->add_do_method(InspectorDock::get_singleton(), "apply_script_properties", n);
+		undo_redo->add_undo_method(InspectorDock::get_singleton(), "apply_script_properties", n);
+		undo_redo->add_do_method(this, "_update_script_button");
+		undo_redo->add_undo_method(this, "_update_script_button");
+		undo_redo->commit_action();
 	}
 }
 
@@ -3183,7 +3194,7 @@ List<Node *> SceneTreeDock::paste_nodes() {
 		owner = paste_parent;
 	}
 
-	Ref<EditorUndoRedoManager> &ur = editor_data->get_undo_redo();
+	Ref<EditorUndoRedoManager> &ur = EditorNode::get_undo_redo();
 	ur->create_action(TTR("Paste Node(s)"), UndoRedo::MERGE_DISABLE, EditorNode::get_singleton()->get_edited_scene());
 	ur->add_do_method(editor_selection, "clear");
 
@@ -3616,7 +3627,7 @@ SceneTreeDock::SceneTreeDock(Node *p_scene_root, EditorSelection *p_editor_selec
 	create_dialog->connect("favorites_updated", callable_mp(this, &SceneTreeDock::_update_create_root_dialog));
 
 #ifdef MODULE_REGEX_ENABLED
-	rename_dialog = memnew(RenameDialog(scene_tree, editor_data->get_undo_redo()));
+	rename_dialog = memnew(RenameDialog(scene_tree));
 	add_child(rename_dialog);
 #endif // MODULE_REGEX_ENABLED
 
diff --git a/editor/shader_globals_editor.cpp b/editor/shader_globals_editor.cpp
index 9b82e80072..22a1d49422 100644
--- a/editor/shader_globals_editor.cpp
+++ b/editor/shader_globals_editor.cpp
@@ -394,7 +394,7 @@ void ShaderGlobalsEditor::_variable_added() {
 		return;
 	}
 
-	Ref<EditorUndoRedoManager> undo_redo = EditorNode::get_singleton()->get_undo_redo();
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_singleton()->get_undo_redo();
 
 	Variant value = create_var(RS::GlobalShaderParameterType(variable_type->get_selected()));
 
@@ -413,7 +413,7 @@ void ShaderGlobalsEditor::_variable_added() {
 }
 
 void ShaderGlobalsEditor::_variable_deleted(const String &p_variable) {
-	Ref<EditorUndoRedoManager> undo_redo = EditorNode::get_singleton()->get_undo_redo();
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_singleton()->get_undo_redo();
 
 	undo_redo->create_action(TTR("Add Shader Global Parameter"));
 	undo_redo->add_do_method(RS::get_singleton(), "global_shader_parameter_remove", p_variable);
diff --git a/editor/translations/extract.py b/editor/translations/extract.py
index cecdb3939d..04661abcbd 100755
--- a/editor/translations/extract.py
+++ b/editor/translations/extract.py
@@ -71,11 +71,24 @@ matches.sort()
 
 remaps = {}
 remap_re = re.compile(r'^\t*capitalize_string_remaps\["(?P<from>.+)"\] = (String::utf8\()?"(?P<to>.+)"')
+stop_words = set()
+stop_words_re = re.compile(r'^\t*"(?P<word>.+)",')
+is_inside_stop_words = False
 with open("editor/editor_property_name_processor.cpp") as f:
     for line in f:
-        m = remap_re.search(line)
-        if m:
-            remaps[m.group("from")] = m.group("to")
+        if is_inside_stop_words:
+            m = stop_words_re.search(line)
+            if m:
+                stop_words.add(m.group("word"))
+            else:
+                is_inside_stop_words = False
+        else:
+            m = remap_re.search(line)
+            if m:
+                remaps[m.group("from")] = m.group("to")
+
+        if not is_inside_stop_words and not stop_words:
+            is_inside_stop_words = "stop_words = " in line
 
 
 main_po = """
@@ -147,9 +160,12 @@ capitalize_re = re.compile(r"(?<=\D)(?=\d)|(?<=\d)(?=\D([a-z]|\d))")
 def _process_editor_string(name):
     # See EditorPropertyNameProcessor::process_string().
     capitalized_parts = []
-    for segment in name.split("_"):
-        if not segment:
+    parts = list(filter(bool, name.split("_")))  # Non-empty only.
+    for i, segment in enumerate(parts):
+        if i > 0 and i + 1 < len(parts) and segment in stop_words:
+            capitalized_parts.append(segment)
             continue
+
         remapped = remaps.get(segment)
         if remapped:
             capitalized_parts.append(remapped)
diff --git a/main/main.cpp b/main/main.cpp
index 55f7177258..6ab7ca6a50 100644
--- a/main/main.cpp
+++ b/main/main.cpp
@@ -2952,6 +2952,7 @@ bool Main::start() {
 				DisplayServer::get_singleton()->set_context(DisplayServer::CONTEXT_EDITOR);
 				if (!debug_server_uri.is_empty()) {
 					EditorDebuggerNode::get_singleton()->start(debug_server_uri);
+					EditorDebuggerNode::get_singleton()->set_keep_open(true);
 				}
 			}
 #endif
diff --git a/modules/gdscript/gdscript_analyzer.cpp b/modules/gdscript/gdscript_analyzer.cpp
index 9b0dc9577b..429f53e59f 100644
--- a/modules/gdscript/gdscript_analyzer.cpp
+++ b/modules/gdscript/gdscript_analyzer.cpp
@@ -2563,8 +2563,14 @@ void GDScriptAnalyzer::reduce_call(GDScriptParser::CallNode *p_call, bool p_is_a
 			parser->push_warning(p_call, GDScriptWarning::RETURN_VALUE_DISCARDED, p_call->function_name);
 		}
 
-		if (is_static && !base_type.is_meta_type && !(callee_type != GDScriptParser::Node::SUBSCRIPT && parser->current_function != nullptr && parser->current_function->is_static)) {
-			parser->push_warning(p_call, GDScriptWarning::STATIC_CALLED_ON_INSTANCE, p_call->function_name, base_type.to_string());
+		if (is_static && !base_type.is_meta_type && !(is_self && parser->current_function != nullptr && parser->current_function->is_static)) {
+			String caller_type = String(base_type.native_type);
+
+			if (caller_type.is_empty()) {
+				caller_type = base_type.to_string();
+			}
+
+			parser->push_warning(p_call, GDScriptWarning::STATIC_CALLED_ON_INSTANCE, p_call->function_name, caller_type);
 		}
 #endif // DEBUG_ENABLED
 
diff --git a/modules/gdscript/gdscript_parser.cpp b/modules/gdscript/gdscript_parser.cpp
index 7f2f49f336..7074520a34 100644
--- a/modules/gdscript/gdscript_parser.cpp
+++ b/modules/gdscript/gdscript_parser.cpp
@@ -3740,6 +3740,12 @@ bool GDScriptParser::export_annotations(const AnnotationNode *p_annotation, Node
 	// This is called after the analyzer is done finding the type, so this should be set here.
 	DataType export_type = variable->get_datatype();
 
+	if (p_annotation->name == SNAME("@export_range")) {
+		if (export_type.builtin_type == Variant::INT) {
+			variable->export_info.type = Variant::INT;
+		}
+	}
+
 	if (p_annotation->name == SNAME("@export")) {
 		if (variable->datatype_specifier == nullptr && variable->initializer == nullptr) {
 			push_error(R"(Cannot use simple "@export" annotation with variable without type or initializer, since type can't be inferred.)", p_annotation);
diff --git a/modules/gltf/doc_classes/GLTFState.xml b/modules/gltf/doc_classes/GLTFState.xml
index d0740cf7ca..9a554a0d49 100644
--- a/modules/gltf/doc_classes/GLTFState.xml
+++ b/modules/gltf/doc_classes/GLTFState.xml
@@ -66,7 +66,7 @@
 			</description>
 		</method>
 		<method name="get_materials">
-			<return type="BaseMaterial3D[]" />
+			<return type="Material[]" />
 			<description>
 			</description>
 		</method>
@@ -169,7 +169,7 @@
 		</method>
 		<method name="set_materials">
 			<return type="void" />
-			<param index="0" name="materials" type="BaseMaterial3D[]" />
+			<param index="0" name="materials" type="Material[]" />
 			<description>
 			</description>
 		</method>
diff --git a/modules/gltf/gltf_document.cpp b/modules/gltf/gltf_document.cpp
index f27e2385c6..eb8f7e5ebc 100644
--- a/modules/gltf/gltf_document.cpp
+++ b/modules/gltf/gltf_document.cpp
@@ -2484,12 +2484,12 @@ Error GLTFDocument::_serialize_meshes(Ref<GLTFState> state) {
 			if (surface_i < instance_materials.size()) {
 				v = instance_materials.get(surface_i);
 			}
-			Ref<BaseMaterial3D> mat = v;
+			Ref<Material> mat = v;
 			if (!mat.is_valid()) {
 				mat = import_mesh->get_surface_material(surface_i);
 			}
 			if (mat.is_valid()) {
-				HashMap<Ref<BaseMaterial3D>, GLTFMaterialIndex>::Iterator material_cache_i = state->material_cache.find(mat);
+				HashMap<Ref<Material>, GLTFMaterialIndex>::Iterator material_cache_i = state->material_cache.find(mat);
 				if (material_cache_i && material_cache_i->value != -1) {
 					primitive["material"] = material_cache_i->value;
 				} else {
@@ -2937,16 +2937,18 @@ Error GLTFDocument::_parse_meshes(Ref<GLTFState> state) {
 				}
 			}
 
-			Ref<BaseMaterial3D> mat;
+			Ref<Material> mat;
 			String mat_name;
 			if (!state->discard_meshes_and_materials) {
 				if (p.has("material")) {
 					const int material = p["material"];
 					ERR_FAIL_INDEX_V(material, state->materials.size(), ERR_FILE_CORRUPT);
-					Ref<BaseMaterial3D> mat3d = state->materials[material];
+					Ref<Material> mat3d = state->materials[material];
 					ERR_FAIL_NULL_V(mat3d, ERR_FILE_CORRUPT);
-					if (has_vertex_color) {
-						mat3d->set_flag(BaseMaterial3D::FLAG_ALBEDO_FROM_VERTEX_COLOR, true);
+
+					Ref<BaseMaterial3D> base_material = mat3d;
+					if (has_vertex_color && base_material.is_valid()) {
+						base_material->set_flag(BaseMaterial3D::FLAG_ALBEDO_FROM_VERTEX_COLOR, true);
 					}
 					mat = mat3d;
 
@@ -2954,7 +2956,7 @@ Error GLTFDocument::_parse_meshes(Ref<GLTFState> state) {
 					Ref<StandardMaterial3D> mat3d;
 					mat3d.instantiate();
 					if (has_vertex_color) {
-						mat3d->set_flag(BaseMaterial3D::FLAG_ALBEDO_FROM_VERTEX_COLOR, true);
+						mat3d->set_flag(StandardMaterial3D::FLAG_ALBEDO_FROM_VERTEX_COLOR, true);
 					}
 					mat = mat3d;
 				}
@@ -3382,8 +3384,7 @@ Error GLTFDocument::_serialize_materials(Ref<GLTFState> state) {
 	Array materials;
 	for (int32_t i = 0; i < state->materials.size(); i++) {
 		Dictionary d;
-
-		Ref<BaseMaterial3D> material = state->materials[i];
+		Ref<Material> material = state->materials[i];
 		if (material.is_null()) {
 			materials.push_back(d);
 			continue;
@@ -3391,11 +3392,12 @@ Error GLTFDocument::_serialize_materials(Ref<GLTFState> state) {
 		if (!material->get_name().is_empty()) {
 			d["name"] = _gen_unique_name(state, material->get_name());
 		}
-		{
+		Ref<BaseMaterial3D> base_material = material;
+		if (base_material.is_valid()) {
 			Dictionary mr;
 			{
 				Array arr;
-				const Color c = material->get_albedo().srgb_to_linear();
+				const Color c = base_material->get_albedo().srgb_to_linear();
 				arr.push_back(c.r);
 				arr.push_back(c.g);
 				arr.push_back(c.b);
@@ -3404,167 +3406,169 @@ Error GLTFDocument::_serialize_materials(Ref<GLTFState> state) {
 			}
 			{
 				Dictionary bct;
-				Ref<Texture2D> albedo_texture = material->get_texture(BaseMaterial3D::TEXTURE_ALBEDO);
-				GLTFTextureIndex gltf_texture_index = -1;
+				if (base_material.is_valid()) {
+					Ref<Texture2D> albedo_texture = base_material->get_texture(BaseMaterial3D::TEXTURE_ALBEDO);
+					GLTFTextureIndex gltf_texture_index = -1;
 
-				if (albedo_texture.is_valid() && albedo_texture->get_image().is_valid()) {
-					albedo_texture->set_name(material->get_name() + "_albedo");
-					gltf_texture_index = _set_texture(state, albedo_texture, material->get_texture_filter(), material->get_flag(BaseMaterial3D::FLAG_USE_TEXTURE_REPEAT));
-				}
-				if (gltf_texture_index != -1) {
-					bct["index"] = gltf_texture_index;
-					Dictionary extensions = _serialize_texture_transform_uv1(material);
-					if (!extensions.is_empty()) {
-						bct["extensions"] = extensions;
-						state->use_khr_texture_transform = true;
+					if (albedo_texture.is_valid() && albedo_texture->get_image().is_valid()) {
+						albedo_texture->set_name(material->get_name() + "_albedo");
+						gltf_texture_index = _set_texture(state, albedo_texture, base_material->get_texture_filter(), base_material->get_flag(BaseMaterial3D::FLAG_USE_TEXTURE_REPEAT));
+					}
+					if (gltf_texture_index != -1) {
+						bct["index"] = gltf_texture_index;
+						Dictionary extensions = _serialize_texture_transform_uv1(material);
+						if (!extensions.is_empty()) {
+							bct["extensions"] = extensions;
+							state->use_khr_texture_transform = true;
+						}
+						mr["baseColorTexture"] = bct;
 					}
-					mr["baseColorTexture"] = bct;
 				}
 			}
-
-			mr["metallicFactor"] = material->get_metallic();
-			mr["roughnessFactor"] = material->get_roughness();
-			bool has_roughness = material->get_texture(BaseMaterial3D::TEXTURE_ROUGHNESS).is_valid() && material->get_texture(BaseMaterial3D::TEXTURE_ROUGHNESS)->get_image().is_valid();
-			bool has_ao = material->get_feature(BaseMaterial3D::FEATURE_AMBIENT_OCCLUSION) && material->get_texture(BaseMaterial3D::TEXTURE_AMBIENT_OCCLUSION).is_valid();
-			bool has_metalness = material->get_texture(BaseMaterial3D::TEXTURE_METALLIC).is_valid() && material->get_texture(BaseMaterial3D::TEXTURE_METALLIC)->get_image().is_valid();
-			if (has_ao || has_roughness || has_metalness) {
-				Dictionary mrt;
-				Ref<Texture2D> roughness_texture = material->get_texture(BaseMaterial3D::TEXTURE_ROUGHNESS);
-				BaseMaterial3D::TextureChannel roughness_channel = material->get_roughness_texture_channel();
-				Ref<Texture2D> metallic_texture = material->get_texture(BaseMaterial3D::TEXTURE_METALLIC);
-				BaseMaterial3D::TextureChannel metalness_channel = material->get_metallic_texture_channel();
-				Ref<Texture2D> ao_texture = material->get_texture(BaseMaterial3D::TEXTURE_AMBIENT_OCCLUSION);
-				BaseMaterial3D::TextureChannel ao_channel = material->get_ao_texture_channel();
-				Ref<ImageTexture> orm_texture;
-				orm_texture.instantiate();
-				Ref<Image> orm_image;
-				orm_image.instantiate();
-				int32_t height = 0;
-				int32_t width = 0;
-				Ref<Image> ao_image;
-				if (has_ao) {
-					height = ao_texture->get_height();
-					width = ao_texture->get_width();
-					ao_image = ao_texture->get_image();
-					Ref<ImageTexture> img_tex = ao_image;
-					if (img_tex.is_valid()) {
-						ao_image = img_tex->get_image();
+			if (base_material.is_valid()) {
+				mr["metallicFactor"] = base_material->get_metallic();
+				mr["roughnessFactor"] = base_material->get_roughness();
+				bool has_roughness = base_material->get_texture(BaseMaterial3D::TEXTURE_ROUGHNESS).is_valid() && base_material->get_texture(BaseMaterial3D::TEXTURE_ROUGHNESS)->get_image().is_valid();
+				bool has_ao = base_material->get_feature(BaseMaterial3D::FEATURE_AMBIENT_OCCLUSION) && base_material->get_texture(BaseMaterial3D::TEXTURE_AMBIENT_OCCLUSION).is_valid();
+				bool has_metalness = base_material->get_texture(BaseMaterial3D::TEXTURE_METALLIC).is_valid() && base_material->get_texture(BaseMaterial3D::TEXTURE_METALLIC)->get_image().is_valid();
+				if (has_ao || has_roughness || has_metalness) {
+					Dictionary mrt;
+					Ref<Texture2D> roughness_texture = base_material->get_texture(BaseMaterial3D::TEXTURE_ROUGHNESS);
+					BaseMaterial3D::TextureChannel roughness_channel = base_material->get_roughness_texture_channel();
+					Ref<Texture2D> metallic_texture = base_material->get_texture(BaseMaterial3D::TEXTURE_METALLIC);
+					BaseMaterial3D::TextureChannel metalness_channel = base_material->get_metallic_texture_channel();
+					Ref<Texture2D> ao_texture = base_material->get_texture(BaseMaterial3D::TEXTURE_AMBIENT_OCCLUSION);
+					BaseMaterial3D::TextureChannel ao_channel = base_material->get_ao_texture_channel();
+					Ref<ImageTexture> orm_texture;
+					orm_texture.instantiate();
+					Ref<Image> orm_image;
+					orm_image.instantiate();
+					int32_t height = 0;
+					int32_t width = 0;
+					Ref<Image> ao_image;
+					if (has_ao) {
+						height = ao_texture->get_height();
+						width = ao_texture->get_width();
+						ao_image = ao_texture->get_image();
+						Ref<ImageTexture> img_tex = ao_image;
+						if (img_tex.is_valid()) {
+							ao_image = img_tex->get_image();
+						}
+						if (ao_image->is_compressed()) {
+							ao_image->decompress();
+						}
 					}
-					if (ao_image->is_compressed()) {
-						ao_image->decompress();
+					Ref<Image> roughness_image;
+					if (has_roughness) {
+						height = roughness_texture->get_height();
+						width = roughness_texture->get_width();
+						roughness_image = roughness_texture->get_image();
+						Ref<ImageTexture> img_tex = roughness_image;
+						if (img_tex.is_valid()) {
+							roughness_image = img_tex->get_image();
+						}
+						if (roughness_image->is_compressed()) {
+							roughness_image->decompress();
+						}
 					}
-				}
-				Ref<Image> roughness_image;
-				if (has_roughness) {
-					height = roughness_texture->get_height();
-					width = roughness_texture->get_width();
-					roughness_image = roughness_texture->get_image();
-					Ref<ImageTexture> img_tex = roughness_image;
-					if (img_tex.is_valid()) {
-						roughness_image = img_tex->get_image();
+					Ref<Image> metallness_image;
+					if (has_metalness) {
+						height = metallic_texture->get_height();
+						width = metallic_texture->get_width();
+						metallness_image = metallic_texture->get_image();
+						Ref<ImageTexture> img_tex = metallness_image;
+						if (img_tex.is_valid()) {
+							metallness_image = img_tex->get_image();
+						}
+						if (metallness_image->is_compressed()) {
+							metallness_image->decompress();
+						}
 					}
-					if (roughness_image->is_compressed()) {
-						roughness_image->decompress();
+					Ref<Texture2D> albedo_texture = base_material->get_texture(BaseMaterial3D::TEXTURE_ALBEDO);
+					if (albedo_texture.is_valid() && albedo_texture->get_image().is_valid()) {
+						height = albedo_texture->get_height();
+						width = albedo_texture->get_width();
 					}
-				}
-				Ref<Image> metallness_image;
-				if (has_metalness) {
-					height = metallic_texture->get_height();
-					width = metallic_texture->get_width();
-					metallness_image = metallic_texture->get_image();
-					Ref<ImageTexture> img_tex = metallness_image;
-					if (img_tex.is_valid()) {
-						metallness_image = img_tex->get_image();
+					orm_image->initialize_data(width, height, false, Image::FORMAT_RGBA8);
+					if (ao_image.is_valid() && ao_image->get_size() != Vector2(width, height)) {
+						ao_image->resize(width, height, Image::INTERPOLATE_LANCZOS);
 					}
-					if (metallness_image->is_compressed()) {
-						metallness_image->decompress();
+					if (roughness_image.is_valid() && roughness_image->get_size() != Vector2(width, height)) {
+						roughness_image->resize(width, height, Image::INTERPOLATE_LANCZOS);
 					}
-				}
-				Ref<Texture2D> albedo_texture = material->get_texture(BaseMaterial3D::TEXTURE_ALBEDO);
-				if (albedo_texture.is_valid() && albedo_texture->get_image().is_valid()) {
-					height = albedo_texture->get_height();
-					width = albedo_texture->get_width();
-				}
-				orm_image->initialize_data(width, height, false, Image::FORMAT_RGBA8);
-				if (ao_image.is_valid() && ao_image->get_size() != Vector2(width, height)) {
-					ao_image->resize(width, height, Image::INTERPOLATE_LANCZOS);
-				}
-				if (roughness_image.is_valid() && roughness_image->get_size() != Vector2(width, height)) {
-					roughness_image->resize(width, height, Image::INTERPOLATE_LANCZOS);
-				}
-				if (metallness_image.is_valid() && metallness_image->get_size() != Vector2(width, height)) {
-					metallness_image->resize(width, height, Image::INTERPOLATE_LANCZOS);
-				}
-				for (int32_t h = 0; h < height; h++) {
-					for (int32_t w = 0; w < width; w++) {
-						Color c = Color(1.0f, 1.0f, 1.0f);
-						if (has_ao) {
-							if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_RED == ao_channel) {
-								c.r = ao_image->get_pixel(w, h).r;
-							} else if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_GREEN == ao_channel) {
-								c.r = ao_image->get_pixel(w, h).g;
-							} else if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_BLUE == ao_channel) {
-								c.r = ao_image->get_pixel(w, h).b;
-							} else if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_ALPHA == ao_channel) {
-								c.r = ao_image->get_pixel(w, h).a;
+					if (metallness_image.is_valid() && metallness_image->get_size() != Vector2(width, height)) {
+						metallness_image->resize(width, height, Image::INTERPOLATE_LANCZOS);
+					}
+					for (int32_t h = 0; h < height; h++) {
+						for (int32_t w = 0; w < width; w++) {
+							Color c = Color(1.0f, 1.0f, 1.0f);
+							if (has_ao) {
+								if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_RED == ao_channel) {
+									c.r = ao_image->get_pixel(w, h).r;
+								} else if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_GREEN == ao_channel) {
+									c.r = ao_image->get_pixel(w, h).g;
+								} else if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_BLUE == ao_channel) {
+									c.r = ao_image->get_pixel(w, h).b;
+								} else if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_ALPHA == ao_channel) {
+									c.r = ao_image->get_pixel(w, h).a;
+								}
 							}
-						}
-						if (has_roughness) {
-							if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_RED == roughness_channel) {
-								c.g = roughness_image->get_pixel(w, h).r;
-							} else if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_GREEN == roughness_channel) {
-								c.g = roughness_image->get_pixel(w, h).g;
-							} else if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_BLUE == roughness_channel) {
-								c.g = roughness_image->get_pixel(w, h).b;
-							} else if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_ALPHA == roughness_channel) {
-								c.g = roughness_image->get_pixel(w, h).a;
+							if (has_roughness) {
+								if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_RED == roughness_channel) {
+									c.g = roughness_image->get_pixel(w, h).r;
+								} else if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_GREEN == roughness_channel) {
+									c.g = roughness_image->get_pixel(w, h).g;
+								} else if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_BLUE == roughness_channel) {
+									c.g = roughness_image->get_pixel(w, h).b;
+								} else if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_ALPHA == roughness_channel) {
+									c.g = roughness_image->get_pixel(w, h).a;
+								}
 							}
-						}
-						if (has_metalness) {
-							if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_RED == metalness_channel) {
-								c.b = metallness_image->get_pixel(w, h).r;
-							} else if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_GREEN == metalness_channel) {
-								c.b = metallness_image->get_pixel(w, h).g;
-							} else if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_BLUE == metalness_channel) {
-								c.b = metallness_image->get_pixel(w, h).b;
-							} else if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_ALPHA == metalness_channel) {
-								c.b = metallness_image->get_pixel(w, h).a;
+							if (has_metalness) {
+								if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_RED == metalness_channel) {
+									c.b = metallness_image->get_pixel(w, h).r;
+								} else if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_GREEN == metalness_channel) {
+									c.b = metallness_image->get_pixel(w, h).g;
+								} else if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_BLUE == metalness_channel) {
+									c.b = metallness_image->get_pixel(w, h).b;
+								} else if (BaseMaterial3D::TextureChannel::TEXTURE_CHANNEL_ALPHA == metalness_channel) {
+									c.b = metallness_image->get_pixel(w, h).a;
+								}
 							}
+							orm_image->set_pixel(w, h, c);
 						}
-						orm_image->set_pixel(w, h, c);
 					}
-				}
-				orm_image->generate_mipmaps();
-				orm_texture->set_image(orm_image);
-				GLTFTextureIndex orm_texture_index = -1;
-				if (has_ao || has_roughness || has_metalness) {
-					orm_texture->set_name(material->get_name() + "_orm");
-					orm_texture_index = _set_texture(state, orm_texture, material->get_texture_filter(), material->get_flag(BaseMaterial3D::FLAG_USE_TEXTURE_REPEAT));
-				}
-				if (has_ao) {
-					Dictionary occt;
-					occt["index"] = orm_texture_index;
-					d["occlusionTexture"] = occt;
-				}
-				if (has_roughness || has_metalness) {
-					mrt["index"] = orm_texture_index;
-					Dictionary extensions = _serialize_texture_transform_uv1(material);
-					if (!extensions.is_empty()) {
-						mrt["extensions"] = extensions;
-						state->use_khr_texture_transform = true;
+					orm_image->generate_mipmaps();
+					orm_texture->set_image(orm_image);
+					GLTFTextureIndex orm_texture_index = -1;
+					if (has_ao || has_roughness || has_metalness) {
+						orm_texture->set_name(material->get_name() + "_orm");
+						orm_texture_index = _set_texture(state, orm_texture, base_material->get_texture_filter(), base_material->get_flag(BaseMaterial3D::FLAG_USE_TEXTURE_REPEAT));
+					}
+					if (has_ao) {
+						Dictionary occt;
+						occt["index"] = orm_texture_index;
+						d["occlusionTexture"] = occt;
+					}
+					if (has_roughness || has_metalness) {
+						mrt["index"] = orm_texture_index;
+						Dictionary extensions = _serialize_texture_transform_uv1(material);
+						if (!extensions.is_empty()) {
+							mrt["extensions"] = extensions;
+							state->use_khr_texture_transform = true;
+						}
+						mr["metallicRoughnessTexture"] = mrt;
 					}
-					mr["metallicRoughnessTexture"] = mrt;
 				}
 			}
 			d["pbrMetallicRoughness"] = mr;
 		}
-
-		if (material->get_feature(BaseMaterial3D::FEATURE_NORMAL_MAPPING)) {
+		if (base_material->get_feature(BaseMaterial3D::FEATURE_NORMAL_MAPPING)) {
 			Dictionary nt;
 			Ref<ImageTexture> tex;
 			tex.instantiate();
 			{
-				Ref<Texture2D> normal_texture = material->get_texture(BaseMaterial3D::TEXTURE_NORMAL);
+				Ref<Texture2D> normal_texture = base_material->get_texture(BaseMaterial3D::TEXTURE_NORMAL);
 				if (normal_texture.is_valid()) {
 					// Code for uncompressing RG normal maps
 					Ref<Image> img = normal_texture->get_image();
@@ -3594,30 +3598,30 @@ Error GLTFDocument::_serialize_materials(Ref<GLTFState> state) {
 			GLTFTextureIndex gltf_texture_index = -1;
 			if (tex.is_valid() && tex->get_image().is_valid()) {
 				tex->set_name(material->get_name() + "_normal");
-				gltf_texture_index = _set_texture(state, tex, material->get_texture_filter(), material->get_flag(BaseMaterial3D::FLAG_USE_TEXTURE_REPEAT));
+				gltf_texture_index = _set_texture(state, tex, base_material->get_texture_filter(), base_material->get_flag(BaseMaterial3D::FLAG_USE_TEXTURE_REPEAT));
 			}
-			nt["scale"] = material->get_normal_scale();
+			nt["scale"] = base_material->get_normal_scale();
 			if (gltf_texture_index != -1) {
 				nt["index"] = gltf_texture_index;
 				d["normalTexture"] = nt;
 			}
 		}
 
-		if (material->get_feature(BaseMaterial3D::FEATURE_EMISSION)) {
-			const Color c = material->get_emission().linear_to_srgb();
+		if (base_material->get_feature(BaseMaterial3D::FEATURE_EMISSION)) {
+			const Color c = base_material->get_emission().linear_to_srgb();
 			Array arr;
 			arr.push_back(c.r);
 			arr.push_back(c.g);
 			arr.push_back(c.b);
 			d["emissiveFactor"] = arr;
 		}
-		if (material->get_feature(BaseMaterial3D::FEATURE_EMISSION)) {
+		if (base_material->get_feature(BaseMaterial3D::FEATURE_EMISSION)) {
 			Dictionary et;
-			Ref<Texture2D> emission_texture = material->get_texture(BaseMaterial3D::TEXTURE_EMISSION);
+			Ref<Texture2D> emission_texture = base_material->get_texture(BaseMaterial3D::TEXTURE_EMISSION);
 			GLTFTextureIndex gltf_texture_index = -1;
 			if (emission_texture.is_valid() && emission_texture->get_image().is_valid()) {
 				emission_texture->set_name(material->get_name() + "_emission");
-				gltf_texture_index = _set_texture(state, emission_texture, material->get_texture_filter(), material->get_flag(BaseMaterial3D::FLAG_USE_TEXTURE_REPEAT));
+				gltf_texture_index = _set_texture(state, emission_texture, base_material->get_texture_filter(), base_material->get_flag(BaseMaterial3D::FLAG_USE_TEXTURE_REPEAT));
 			}
 
 			if (gltf_texture_index != -1) {
@@ -3625,14 +3629,14 @@ Error GLTFDocument::_serialize_materials(Ref<GLTFState> state) {
 				d["emissiveTexture"] = et;
 			}
 		}
-		const bool ds = material->get_cull_mode() == BaseMaterial3D::CULL_DISABLED;
+		const bool ds = base_material->get_cull_mode() == BaseMaterial3D::CULL_DISABLED;
 		if (ds) {
 			d["doubleSided"] = ds;
 		}
-		if (material->get_transparency() == BaseMaterial3D::TRANSPARENCY_ALPHA_SCISSOR) {
+		if (base_material->get_transparency() == BaseMaterial3D::TRANSPARENCY_ALPHA_SCISSOR) {
 			d["alphaMode"] = "MASK";
-			d["alphaCutoff"] = material->get_alpha_scissor_threshold();
-		} else if (material->get_transparency() != BaseMaterial3D::TRANSPARENCY_DISABLED) {
+			d["alphaCutoff"] = base_material->get_alpha_scissor_threshold();
+		} else if (base_material->get_transparency() != BaseMaterial3D::TRANSPARENCY_DISABLED) {
 			d["alphaMode"] = "BLEND";
 		}
 		materials.push_back(d);
@@ -3838,29 +3842,37 @@ void GLTFDocument::_set_texture_transform_uv1(const Dictionary &d, Ref<BaseMater
 	if (d.has("extensions")) {
 		const Dictionary &extensions = d["extensions"];
 		if (extensions.has("KHR_texture_transform")) {
-			const Dictionary &texture_transform = extensions["KHR_texture_transform"];
-			const Array &offset_arr = texture_transform["offset"];
-			if (offset_arr.size() == 2) {
-				const Vector3 offset_vector3 = Vector3(offset_arr[0], offset_arr[1], 0.0f);
-				material->set_uv1_offset(offset_vector3);
-			}
+			if (material.is_valid()) {
+				const Dictionary &texture_transform = extensions["KHR_texture_transform"];
+				const Array &offset_arr = texture_transform["offset"];
+				if (offset_arr.size() == 2) {
+					const Vector3 offset_vector3 = Vector3(offset_arr[0], offset_arr[1], 0.0f);
+					material->set_uv1_offset(offset_vector3);
+				}
 
-			const Array &scale_arr = texture_transform["scale"];
-			if (scale_arr.size() == 2) {
-				const Vector3 scale_vector3 = Vector3(scale_arr[0], scale_arr[1], 1.0f);
-				material->set_uv1_scale(scale_vector3);
+				const Array &scale_arr = texture_transform["scale"];
+				if (scale_arr.size() == 2) {
+					const Vector3 scale_vector3 = Vector3(scale_arr[0], scale_arr[1], 1.0f);
+					material->set_uv1_scale(scale_vector3);
+				}
 			}
 		}
 	}
 }
 
 void GLTFDocument::spec_gloss_to_rough_metal(Ref<GLTFSpecGloss> r_spec_gloss, Ref<BaseMaterial3D> p_material) {
+	if (r_spec_gloss.is_null()) {
+		return;
+	}
 	if (r_spec_gloss->spec_gloss_img.is_null()) {
 		return;
 	}
 	if (r_spec_gloss->diffuse_img.is_null()) {
 		return;
 	}
+	if (p_material.is_null()) {
+		return;
+	}
 	bool has_roughness = false;
 	bool has_metal = false;
 	p_material->set_roughness(1.0f);
@@ -6657,21 +6669,17 @@ Dictionary _serialize_texture_transform_uv(Vector2 p_offset, Vector2 p_scale) {
 }
 
 Dictionary GLTFDocument::_serialize_texture_transform_uv1(Ref<BaseMaterial3D> p_material) {
-	if (p_material.is_valid()) {
-		Vector3 offset = p_material->get_uv1_offset();
-		Vector3 scale = p_material->get_uv1_scale();
-		return _serialize_texture_transform_uv(Vector2(offset.x, offset.y), Vector2(scale.x, scale.y));
-	}
-	return Dictionary();
+	ERR_FAIL_NULL_V(p_material, Dictionary());
+	Vector3 offset = p_material->get_uv1_offset();
+	Vector3 scale = p_material->get_uv1_scale();
+	return _serialize_texture_transform_uv(Vector2(offset.x, offset.y), Vector2(scale.x, scale.y));
 }
 
 Dictionary GLTFDocument::_serialize_texture_transform_uv2(Ref<BaseMaterial3D> p_material) {
-	if (p_material.is_valid()) {
-		Vector3 offset = p_material->get_uv2_offset();
-		Vector3 scale = p_material->get_uv2_scale();
-		return _serialize_texture_transform_uv(Vector2(offset.x, offset.y), Vector2(scale.x, scale.y));
-	}
-	return Dictionary();
+	ERR_FAIL_NULL_V(p_material, Dictionary());
+	Vector3 offset = p_material->get_uv2_offset();
+	Vector3 scale = p_material->get_uv2_scale();
+	return _serialize_texture_transform_uv(Vector2(offset.x, offset.y), Vector2(scale.x, scale.y));
 }
 
 Error GLTFDocument::_serialize_version(Ref<GLTFState> state) {
diff --git a/modules/gltf/gltf_state.cpp b/modules/gltf/gltf_state.cpp
index ac5665e396..6654c9e5d2 100644
--- a/modules/gltf/gltf_state.cpp
+++ b/modules/gltf/gltf_state.cpp
@@ -209,11 +209,11 @@ void GLTFState::set_meshes(TypedArray<GLTFMesh> p_meshes) {
 	GLTFTemplateConvert::set_from_array(meshes, p_meshes);
 }
 
-TypedArray<BaseMaterial3D> GLTFState::get_materials() {
+TypedArray<Material> GLTFState::get_materials() {
 	return GLTFTemplateConvert::to_array(materials);
 }
 
-void GLTFState::set_materials(TypedArray<BaseMaterial3D> p_materials) {
+void GLTFState::set_materials(TypedArray<Material> p_materials) {
 	GLTFTemplateConvert::set_from_array(materials, p_materials);
 }
 
diff --git a/modules/gltf/gltf_state.h b/modules/gltf/gltf_state.h
index e24017b0fd..1c20520b22 100644
--- a/modules/gltf/gltf_state.h
+++ b/modules/gltf/gltf_state.h
@@ -72,8 +72,8 @@ class GLTFState : public Resource {
 	Vector<Ref<GLTFMesh>> meshes; // meshes are loaded directly, no reason not to.
 
 	Vector<AnimationPlayer *> animation_players;
-	HashMap<Ref<BaseMaterial3D>, GLTFMaterialIndex> material_cache;
-	Vector<Ref<BaseMaterial3D>> materials;
+	HashMap<Ref<Material>, GLTFMaterialIndex> material_cache;
+	Vector<Ref<Material>> materials;
 
 	String scene_name;
 	Vector<int> root_nodes;
@@ -138,8 +138,8 @@ public:
 	TypedArray<GLTFMesh> get_meshes();
 	void set_meshes(TypedArray<GLTFMesh> p_meshes);
 
-	TypedArray<BaseMaterial3D> get_materials();
-	void set_materials(TypedArray<BaseMaterial3D> p_materials);
+	TypedArray<Material> get_materials();
+	void set_materials(TypedArray<Material> p_materials);
 
 	String get_scene_name();
 	void set_scene_name(String p_scene_name);
diff --git a/modules/gridmap/editor/grid_map_editor_plugin.cpp b/modules/gridmap/editor/grid_map_editor_plugin.cpp
index 9c6cbebf0e..c8aedc8b92 100644
--- a/modules/gridmap/editor/grid_map_editor_plugin.cpp
+++ b/modules/gridmap/editor/grid_map_editor_plugin.cpp
@@ -459,6 +459,7 @@ void GridMapEditor::_delete_selection() {
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("GridMap Delete Selection"));
 	for (int i = selection.begin.x; i <= selection.end.x; i++) {
 		for (int j = selection.begin.y; j <= selection.end.y; j++) {
@@ -479,6 +480,7 @@ void GridMapEditor::_fill_selection() {
 		return;
 	}
 
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("GridMap Fill Selection"));
 	for (int i = selection.begin.x; i <= selection.end.x; i++) {
 		for (int j = selection.begin.y; j <= selection.end.y; j++) {
@@ -572,6 +574,7 @@ void GridMapEditor::_do_paste() {
 	rot = node->get_basis_with_orthogonal_index(paste_indicator.orientation);
 
 	Vector3 ofs = paste_indicator.current - paste_indicator.click;
+	Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 	undo_redo->create_action(TTR("GridMap Paste Selection"));
 
 	for (const ClipboardItem &item : clipboard_items) {
@@ -659,6 +662,7 @@ EditorPlugin::AfterGUIInput GridMapEditor::forward_spatial_input_event(Camera3D
 		} else {
 			if ((mb->get_button_index() == MouseButton::RIGHT && input_action == INPUT_ERASE) || (mb->get_button_index() == MouseButton::LEFT && input_action == INPUT_PAINT)) {
 				if (set_items.size()) {
+					Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 					undo_redo->create_action(TTR("GridMap Paint"));
 					for (const SetItem &si : set_items) {
 						undo_redo->add_do_method(node, "set_cell_item", si.position, si.new_value, si.new_orientation);
@@ -680,6 +684,7 @@ EditorPlugin::AfterGUIInput GridMapEditor::forward_spatial_input_event(Camera3D
 			}
 
 			if (mb->get_button_index() == MouseButton::LEFT && input_action == INPUT_SELECT) {
+				Ref<EditorUndoRedoManager> &undo_redo = EditorNode::get_undo_redo();
 				undo_redo->create_action(TTR("GridMap Selection"));
 				undo_redo->add_do_method(this, "_set_selection", selection.active, selection.begin, selection.end);
 				undo_redo->add_undo_method(this, "_set_selection", last_selection.active, last_selection.begin, last_selection.end);
@@ -1142,8 +1147,6 @@ void GridMapEditor::_bind_methods() {
 }
 
 GridMapEditor::GridMapEditor() {
-	undo_redo = EditorNode::get_singleton()->get_undo_redo();
-
 	int mw = EDITOR_DEF("editors/grid_map/palette_min_width", 230);
 	Control *ec = memnew(Control);
 	ec->set_custom_minimum_size(Size2(mw, 0) * EDSCALE);
diff --git a/modules/gridmap/editor/grid_map_editor_plugin.h b/modules/gridmap/editor/grid_map_editor_plugin.h
index 91f14690ca..1cf2e4cb89 100644
--- a/modules/gridmap/editor/grid_map_editor_plugin.h
+++ b/modules/gridmap/editor/grid_map_editor_plugin.h
@@ -41,7 +41,6 @@
 #include "scene/gui/spin_box.h"
 
 class ConfirmationDialog;
-class EditorUndoRedoManager;
 class MenuButton;
 class Node3DEditorPlugin;
 
@@ -66,7 +65,6 @@ class GridMapEditor : public VBoxContainer {
 		DISPLAY_LIST
 	};
 
-	Ref<EditorUndoRedoManager> undo_redo;
 	InputAction input_action = INPUT_NONE;
 	Panel *panel = nullptr;
 	MenuButton *options = nullptr;
diff --git a/modules/mono/csharp_script.cpp b/modules/mono/csharp_script.cpp
index 345d2e4694..a4bffc1e3c 100644
--- a/modules/mono/csharp_script.cpp
+++ b/modules/mono/csharp_script.cpp
@@ -710,6 +710,12 @@ void CSharpLanguage::reload_assemblies(bool p_soft_reload) {
 		return;
 	}
 
+	if (!Engine::get_singleton()->is_editor_hint()) {
+		// We disable collectible assemblies in the game player, because the limitations cause
+		// issues with mocking libraries. As such, we can only reload assemblies in the editor.
+		return;
+	}
+
 	// TODO:
 	//  Currently, this reloads all scripts, including those whose class is not part of the
 	//  assembly load context being unloaded. As such, we unnecessarily reload GodotTools.
diff --git a/modules/mono/editor/bindings_generator.cpp b/modules/mono/editor/bindings_generator.cpp
index b90321b586..82b5f478e1 100644
--- a/modules/mono/editor/bindings_generator.cpp
+++ b/modules/mono/editor/bindings_generator.cpp
@@ -2274,7 +2274,7 @@ Error BindingsGenerator::_generate_cs_signal(const BindingsGenerator::TypeInterf
 			p_output.append(");\n");
 
 			// Generate Callable trampoline for the delegate
-			p_output << MEMBER_BEGIN "private static unsafe void " << p_isignal.proxy_name << "Trampoline"
+			p_output << MEMBER_BEGIN "private static void " << p_isignal.proxy_name << "Trampoline"
 					 << "(object delegateObj, NativeVariantPtrArgs args, out godot_variant ret)\n"
 					 << INDENT1 "{\n"
 					 << INDENT2 "Callable.ThrowIfArgCountMismatch(args, " << itos(p_isignal.arguments.size()) << ");\n"
@@ -2289,9 +2289,8 @@ Error BindingsGenerator::_generate_cs_signal(const BindingsGenerator::TypeInterf
 					p_output << ",";
 				}
 
-				// TODO: We don't need to use VariantConversionCallbacks. We have the type information so we can use [cs_variant_to_managed] and [cs_managed_to_variant].
-				p_output << "\n" INDENT3 "VariantConversionCallbacks.GetToManagedCallback<"
-						 << arg_type->cs_type << ">()(args[" << itos(idx) << "])";
+				p_output << sformat(arg_type->cs_variant_to_managed,
+						"args[" + itos(idx) + "]", arg_type->cs_type, arg_type->name);
 
 				idx++;
 			}
diff --git a/modules/mono/glue/GodotSharp/GodotPlugins/Main.cs b/modules/mono/glue/GodotSharp/GodotPlugins/Main.cs
index 8308bada24..4ce02d221e 100644
--- a/modules/mono/glue/GodotSharp/GodotPlugins/Main.cs
+++ b/modules/mono/glue/GodotSharp/GodotPlugins/Main.cs
@@ -28,17 +28,24 @@ namespace GodotPlugins
                 get => _pluginLoadContext?.AssemblyLoadedPath;
             }
 
+            public bool IsCollectible
+            {
+                [MethodImpl(MethodImplOptions.NoInlining)]
+                get => _pluginLoadContext?.IsCollectible ?? false;
+            }
+
             [MethodImpl(MethodImplOptions.NoInlining)]
             public static (Assembly, PluginLoadContextWrapper) CreateAndLoadFromAssemblyName(
                 AssemblyName assemblyName,
                 string pluginPath,
                 ICollection<string> sharedAssemblies,
-                AssemblyLoadContext mainLoadContext
+                AssemblyLoadContext mainLoadContext,
+                bool isCollectible
             )
             {
                 var wrapper = new PluginLoadContextWrapper();
                 wrapper._pluginLoadContext = new PluginLoadContext(
-                    pluginPath, sharedAssemblies, mainLoadContext);
+                    pluginPath, sharedAssemblies, mainLoadContext, isCollectible);
                 var assembly = wrapper._pluginLoadContext.LoadFromAssemblyName(assemblyName);
                 return (assembly, wrapper);
             }
@@ -61,6 +68,7 @@ namespace GodotPlugins
         private static readonly Assembly CoreApiAssembly = typeof(Godot.Object).Assembly;
         private static Assembly? _editorApiAssembly;
         private static PluginLoadContextWrapper? _projectLoadContext;
+        private static bool _editorHint = false;
 
         private static readonly AssemblyLoadContext MainLoadContext =
             AssemblyLoadContext.GetLoadContext(Assembly.GetExecutingAssembly()) ??
@@ -77,15 +85,17 @@ namespace GodotPlugins
         {
             try
             {
+                _editorHint = editorHint.ToBool();
+
                 _dllImportResolver = new GodotDllImportResolver(godotDllHandle).OnResolveDllImport;
 
                 SharedAssemblies.Add(CoreApiAssembly.GetName());
                 NativeLibrary.SetDllImportResolver(CoreApiAssembly, _dllImportResolver);
 
-                AlcReloadCfg.Configure(alcReloadEnabled: editorHint.ToBool());
+                AlcReloadCfg.Configure(alcReloadEnabled: _editorHint);
                 NativeFuncs.Initialize(unmanagedCallbacks, unmanagedCallbacksSize);
 
-                if (editorHint.ToBool())
+                if (_editorHint)
                 {
                     _editorApiAssembly = Assembly.Load("GodotSharpEditor");
                     SharedAssemblies.Add(_editorApiAssembly.GetName());
@@ -128,7 +138,7 @@ namespace GodotPlugins
 
                 string assemblyPath = new(nAssemblyPath);
 
-                (var projectAssembly, _projectLoadContext) = LoadPlugin(assemblyPath);
+                (var projectAssembly, _projectLoadContext) = LoadPlugin(assemblyPath, isCollectible: _editorHint);
 
                 string loadedAssemblyPath = _projectLoadContext.AssemblyLoadedPath ?? assemblyPath;
                 *outLoadedAssemblyPath = Marshaling.ConvertStringToNative(loadedAssemblyPath);
@@ -155,7 +165,7 @@ namespace GodotPlugins
                 if (_editorApiAssembly == null)
                     throw new InvalidOperationException("The Godot editor API assembly is not loaded.");
 
-                var (assembly, _) = LoadPlugin(assemblyPath);
+                var (assembly, _) = LoadPlugin(assemblyPath, isCollectible: _editorHint);
 
                 NativeLibrary.SetDllImportResolver(assembly, _dllImportResolver!);
 
@@ -180,7 +190,7 @@ namespace GodotPlugins
             }
         }
 
-        private static (Assembly, PluginLoadContextWrapper) LoadPlugin(string assemblyPath)
+        private static (Assembly, PluginLoadContextWrapper) LoadPlugin(string assemblyPath, bool isCollectible)
         {
             string assemblyName = Path.GetFileNameWithoutExtension(assemblyPath);
 
@@ -194,7 +204,7 @@ namespace GodotPlugins
             }
 
             return PluginLoadContextWrapper.CreateAndLoadFromAssemblyName(
-                new AssemblyName(assemblyName), assemblyPath, sharedAssemblies, MainLoadContext);
+                new AssemblyName(assemblyName), assemblyPath, sharedAssemblies, MainLoadContext, isCollectible);
         }
 
         [UnmanagedCallersOnly]
@@ -218,6 +228,12 @@ namespace GodotPlugins
                 if (pluginLoadContext == null)
                     return true;
 
+                if (!pluginLoadContext.IsCollectible)
+                {
+                    Console.Error.WriteLine("Cannot unload a non-collectible assembly load context.");
+                    return false;
+                }
+
                 Console.WriteLine("Unloading assembly load context...");
 
                 var alcWeakReference = pluginLoadContext.CreateWeakReference();
diff --git a/modules/mono/glue/GodotSharp/GodotPlugins/PluginLoadContext.cs b/modules/mono/glue/GodotSharp/GodotPlugins/PluginLoadContext.cs
index dcd572c65e..344b76a202 100644
--- a/modules/mono/glue/GodotSharp/GodotPlugins/PluginLoadContext.cs
+++ b/modules/mono/glue/GodotSharp/GodotPlugins/PluginLoadContext.cs
@@ -15,8 +15,8 @@ namespace GodotPlugins
         public string? AssemblyLoadedPath { get; private set; }
 
         public PluginLoadContext(string pluginPath, ICollection<string> sharedAssemblies,
-            AssemblyLoadContext mainLoadContext)
-            : base(isCollectible: true)
+            AssemblyLoadContext mainLoadContext, bool isCollectible)
+            : base(isCollectible)
         {
             _resolver = new AssemblyDependencyResolver(pluginPath);
             _sharedAssemblies = sharedAssemblies;
diff --git a/modules/mono/glue/GodotSharp/GodotSharp/Core/Array.cs b/modules/mono/glue/GodotSharp/GodotSharp/Core/Array.cs
index f1b46e293b..e3b7ac297d 100644
--- a/modules/mono/glue/GodotSharp/GodotSharp/Core/Array.cs
+++ b/modules/mono/glue/GodotSharp/GodotSharp/Core/Array.cs
@@ -495,35 +495,10 @@ namespace Godot.Collections
         private static Array<T> FromVariantFunc(in godot_variant variant) =>
             VariantUtils.ConvertToArrayObject<T>(variant);
 
-        // ReSharper disable StaticMemberInGenericType
-        // Warning is about unique static fields being created for each generic type combination:
-        // https://www.jetbrains.com/help/resharper/StaticMemberInGenericType.html
-        // In our case this is exactly what we want.
-
-        private static readonly unsafe delegate* managed<in T, godot_variant> ConvertToVariantCallback;
-        private static readonly unsafe delegate* managed<in godot_variant, T> ConvertToManagedCallback;
-
-        // ReSharper restore StaticMemberInGenericType
-
         static unsafe Array()
         {
-            VariantConversionCallbacks.GenericConversionCallbacks[typeof(Array<T>)] =
-            (
-                (IntPtr)(delegate* managed<in Array<T>, godot_variant>)&ToVariantFunc,
-                (IntPtr)(delegate* managed<in godot_variant, Array<T>>)&FromVariantFunc
-            );
-
-            ConvertToVariantCallback = VariantConversionCallbacks.GetToVariantCallback<T>();
-            ConvertToManagedCallback = VariantConversionCallbacks.GetToManagedCallback<T>();
-        }
-
-        private static unsafe void ValidateVariantConversionCallbacks()
-        {
-            if (ConvertToVariantCallback == null || ConvertToManagedCallback == null)
-            {
-                throw new InvalidOperationException(
-                    $"The array element type is not supported for conversion to Variant: '{typeof(T).FullName}'.");
-            }
+            VariantUtils.GenericConversion<Array<T>>.ToVariantCb = &ToVariantFunc;
+            VariantUtils.GenericConversion<Array<T>>.FromVariantCb = &FromVariantFunc;
         }
 
         private readonly Array _underlyingArray;
@@ -539,8 +514,6 @@ namespace Godot.Collections
         /// </summary>
         public Array()
         {
-            ValidateVariantConversionCallbacks();
-
             _underlyingArray = new Array();
         }
 
@@ -551,8 +524,6 @@ namespace Godot.Collections
         /// <returns>A new Godot Array.</returns>
         public Array(IEnumerable<T> collection)
         {
-            ValidateVariantConversionCallbacks();
-
             if (collection == null)
                 throw new ArgumentNullException(nameof(collection));
 
@@ -569,8 +540,6 @@ namespace Godot.Collections
         /// <returns>A new Godot Array.</returns>
         public Array(T[] array) : this()
         {
-            ValidateVariantConversionCallbacks();
-
             if (array == null)
                 throw new ArgumentNullException(nameof(array));
 
@@ -586,8 +555,6 @@ namespace Godot.Collections
         /// <param name="array">The untyped array to construct from.</param>
         public Array(Array array)
         {
-            ValidateVariantConversionCallbacks();
-
             _underlyingArray = array;
         }
 
@@ -665,7 +632,7 @@ namespace Godot.Collections
             get
             {
                 _underlyingArray.GetVariantBorrowElementAt(index, out godot_variant borrowElem);
-                return ConvertToManagedCallback(borrowElem);
+                return VariantUtils.ConvertTo<T>(borrowElem);
             }
             set
             {
@@ -675,7 +642,7 @@ namespace Godot.Collections
                 godot_variant* ptrw = NativeFuncs.godotsharp_array_ptrw(ref self);
                 godot_variant* itemPtr = &ptrw[index];
                 (*itemPtr).Dispose();
-                *itemPtr = ConvertToVariantCallback(value);
+                *itemPtr = VariantUtils.CreateFrom(value);
             }
         }
 
@@ -685,9 +652,9 @@ namespace Godot.Collections
         /// </summary>
         /// <param name="item">The item to search for.</param>
         /// <returns>The index of the item, or -1 if not found.</returns>
-        public unsafe int IndexOf(T item)
+        public int IndexOf(T item)
         {
-            using var variantValue = ConvertToVariantCallback(item);
+            using var variantValue = VariantUtils.CreateFrom(item);
             var self = (godot_array)_underlyingArray.NativeValue;
             return NativeFuncs.godotsharp_array_index_of(ref self, variantValue);
         }
@@ -700,12 +667,12 @@ namespace Godot.Collections
         /// </summary>
         /// <param name="index">The index to insert at.</param>
         /// <param name="item">The item to insert.</param>
-        public unsafe void Insert(int index, T item)
+        public void Insert(int index, T item)
         {
             if (index < 0 || index > Count)
                 throw new ArgumentOutOfRangeException(nameof(index));
 
-            using var variantValue = ConvertToVariantCallback(item);
+            using var variantValue = VariantUtils.CreateFrom(item);
             var self = (godot_array)_underlyingArray.NativeValue;
             NativeFuncs.godotsharp_array_insert(ref self, index, variantValue);
         }
@@ -736,9 +703,9 @@ namespace Godot.Collections
         /// </summary>
         /// <param name="item">The item to add.</param>
         /// <returns>The new size after adding the item.</returns>
-        public unsafe void Add(T item)
+        public void Add(T item)
         {
-            using var variantValue = ConvertToVariantCallback(item);
+            using var variantValue = VariantUtils.CreateFrom(item);
             var self = (godot_array)_underlyingArray.NativeValue;
             _ = NativeFuncs.godotsharp_array_add(ref self, variantValue);
         }
diff --git a/modules/mono/glue/GodotSharp/GodotSharp/Core/Callable.generics.cs b/modules/mono/glue/GodotSharp/GodotSharp/Core/Callable.generics.cs
index 6c6a104019..ff385da1c9 100644
--- a/modules/mono/glue/GodotSharp/GodotSharp/Core/Callable.generics.cs
+++ b/modules/mono/glue/GodotSharp/GodotSharp/Core/Callable.generics.cs
@@ -54,7 +54,7 @@ public readonly partial struct Callable
             ThrowIfArgCountMismatch(args, 1);
 
             ((Action<T0>)delegateObj)(
-                VariantConversionCallbacks.GetToManagedCallback<T0>()(args[0])
+                VariantUtils.ConvertTo<T0>(args[0])
             );
 
             ret = default;
@@ -73,8 +73,8 @@ public readonly partial struct Callable
             ThrowIfArgCountMismatch(args, 2);
 
             ((Action<T0, T1>)delegateObj)(
-                VariantConversionCallbacks.GetToManagedCallback<T0>()(args[0]),
-                VariantConversionCallbacks.GetToManagedCallback<T1>()(args[1])
+                VariantUtils.ConvertTo<T0>(args[0]),
+                VariantUtils.ConvertTo<T1>(args[1])
             );
 
             ret = default;
@@ -93,9 +93,9 @@ public readonly partial struct Callable
             ThrowIfArgCountMismatch(args, 3);
 
             ((Action<T0, T1, T2>)delegateObj)(
-                VariantConversionCallbacks.GetToManagedCallback<T0>()(args[0]),
-                VariantConversionCallbacks.GetToManagedCallback<T1>()(args[1]),
-                VariantConversionCallbacks.GetToManagedCallback<T2>()(args[2])
+                VariantUtils.ConvertTo<T0>(args[0]),
+                VariantUtils.ConvertTo<T1>(args[1]),
+                VariantUtils.ConvertTo<T2>(args[2])
             );
 
             ret = default;
@@ -114,10 +114,10 @@ public readonly partial struct Callable
             ThrowIfArgCountMismatch(args, 4);
 
             ((Action<T0, T1, T2, T3>)delegateObj)(
-                VariantConversionCallbacks.GetToManagedCallback<T0>()(args[0]),
-                VariantConversionCallbacks.GetToManagedCallback<T1>()(args[1]),
-                VariantConversionCallbacks.GetToManagedCallback<T2>()(args[2]),
-                VariantConversionCallbacks.GetToManagedCallback<T3>()(args[3])
+                VariantUtils.ConvertTo<T0>(args[0]),
+                VariantUtils.ConvertTo<T1>(args[1]),
+                VariantUtils.ConvertTo<T2>(args[2]),
+                VariantUtils.ConvertTo<T3>(args[3])
             );
 
             ret = default;
@@ -136,11 +136,11 @@ public readonly partial struct Callable
             ThrowIfArgCountMismatch(args, 5);
 
             ((Action<T0, T1, T2, T3, T4>)delegateObj)(
-                VariantConversionCallbacks.GetToManagedCallback<T0>()(args[0]),
-                VariantConversionCallbacks.GetToManagedCallback<T1>()(args[1]),
-                VariantConversionCallbacks.GetToManagedCallback<T2>()(args[2]),
-                VariantConversionCallbacks.GetToManagedCallback<T3>()(args[3]),
-                VariantConversionCallbacks.GetToManagedCallback<T4>()(args[4])
+                VariantUtils.ConvertTo<T0>(args[0]),
+                VariantUtils.ConvertTo<T1>(args[1]),
+                VariantUtils.ConvertTo<T2>(args[2]),
+                VariantUtils.ConvertTo<T3>(args[3]),
+                VariantUtils.ConvertTo<T4>(args[4])
             );
 
             ret = default;
@@ -159,12 +159,12 @@ public readonly partial struct Callable
             ThrowIfArgCountMismatch(args, 6);
 
             ((Action<T0, T1, T2, T3, T4, T5>)delegateObj)(
-                VariantConversionCallbacks.GetToManagedCallback<T0>()(args[0]),
-                VariantConversionCallbacks.GetToManagedCallback<T1>()(args[1]),
-                VariantConversionCallbacks.GetToManagedCallback<T2>()(args[2]),
-                VariantConversionCallbacks.GetToManagedCallback<T3>()(args[3]),
-                VariantConversionCallbacks.GetToManagedCallback<T4>()(args[4]),
-                VariantConversionCallbacks.GetToManagedCallback<T5>()(args[5])
+                VariantUtils.ConvertTo<T0>(args[0]),
+                VariantUtils.ConvertTo<T1>(args[1]),
+                VariantUtils.ConvertTo<T2>(args[2]),
+                VariantUtils.ConvertTo<T3>(args[3]),
+                VariantUtils.ConvertTo<T4>(args[4]),
+                VariantUtils.ConvertTo<T5>(args[5])
             );
 
             ret = default;
@@ -183,13 +183,13 @@ public readonly partial struct Callable
             ThrowIfArgCountMismatch(args, 7);
 
             ((Action<T0, T1, T2, T3, T4, T5, T6>)delegateObj)(
-                VariantConversionCallbacks.GetToManagedCallback<T0>()(args[0]),
-                VariantConversionCallbacks.GetToManagedCallback<T1>()(args[1]),
-                VariantConversionCallbacks.GetToManagedCallback<T2>()(args[2]),
-                VariantConversionCallbacks.GetToManagedCallback<T3>()(args[3]),
-                VariantConversionCallbacks.GetToManagedCallback<T4>()(args[4]),
-                VariantConversionCallbacks.GetToManagedCallback<T5>()(args[5]),
-                VariantConversionCallbacks.GetToManagedCallback<T6>()(args[6])
+                VariantUtils.ConvertTo<T0>(args[0]),
+                VariantUtils.ConvertTo<T1>(args[1]),
+                VariantUtils.ConvertTo<T2>(args[2]),
+                VariantUtils.ConvertTo<T3>(args[3]),
+                VariantUtils.ConvertTo<T4>(args[4]),
+                VariantUtils.ConvertTo<T5>(args[5]),
+                VariantUtils.ConvertTo<T6>(args[6])
             );
 
             ret = default;
@@ -208,14 +208,14 @@ public readonly partial struct Callable
             ThrowIfArgCountMismatch(args, 8);
 
             ((Action<T0, T1, T2, T3, T4, T5, T6, T7>)delegateObj)(
-                VariantConversionCallbacks.GetToManagedCallback<T0>()(args[0]),
-                VariantConversionCallbacks.GetToManagedCallback<T1>()(args[1]),
-                VariantConversionCallbacks.GetToManagedCallback<T2>()(args[2]),
-                VariantConversionCallbacks.GetToManagedCallback<T3>()(args[3]),
-                VariantConversionCallbacks.GetToManagedCallback<T4>()(args[4]),
-                VariantConversionCallbacks.GetToManagedCallback<T5>()(args[5]),
-                VariantConversionCallbacks.GetToManagedCallback<T6>()(args[6]),
-                VariantConversionCallbacks.GetToManagedCallback<T7>()(args[7])
+                VariantUtils.ConvertTo<T0>(args[0]),
+                VariantUtils.ConvertTo<T1>(args[1]),
+                VariantUtils.ConvertTo<T2>(args[2]),
+                VariantUtils.ConvertTo<T3>(args[3]),
+                VariantUtils.ConvertTo<T4>(args[4]),
+                VariantUtils.ConvertTo<T5>(args[5]),
+                VariantUtils.ConvertTo<T6>(args[6]),
+                VariantUtils.ConvertTo<T7>(args[7])
             );
 
             ret = default;
@@ -234,15 +234,15 @@ public readonly partial struct Callable
             ThrowIfArgCountMismatch(args, 9);
 
             ((Action<T0, T1, T2, T3, T4, T5, T6, T7, T8>)delegateObj)(
-                VariantConversionCallbacks.GetToManagedCallback<T0>()(args[0]),
-                VariantConversionCallbacks.GetToManagedCallback<T1>()(args[1]),
-                VariantConversionCallbacks.GetToManagedCallback<T2>()(args[2]),
-                VariantConversionCallbacks.GetToManagedCallback<T3>()(args[3]),
-                VariantConversionCallbacks.GetToManagedCallback<T4>()(args[4]),
-                VariantConversionCallbacks.GetToManagedCallback<T5>()(args[5]),
-                VariantConversionCallbacks.GetToManagedCallback<T6>()(args[6]),
-                VariantConversionCallbacks.GetToManagedCallback<T7>()(args[7]),
-                VariantConversionCallbacks.GetToManagedCallback<T8>()(args[8])
+                VariantUtils.ConvertTo<T0>(args[0]),
+                VariantUtils.ConvertTo<T1>(args[1]),
+                VariantUtils.ConvertTo<T2>(args[2]),
+                VariantUtils.ConvertTo<T3>(args[3]),
+                VariantUtils.ConvertTo<T4>(args[4]),
+                VariantUtils.ConvertTo<T5>(args[5]),
+                VariantUtils.ConvertTo<T6>(args[6]),
+                VariantUtils.ConvertTo<T7>(args[7]),
+                VariantUtils.ConvertTo<T8>(args[8])
             );
 
             ret = default;
@@ -265,7 +265,7 @@ public readonly partial struct Callable
 
             TResult res = ((Func<TResult>)delegateObj)();
 
-            ret = VariantConversionCallbacks.GetToVariantCallback<TResult>()(res);
+            ret = VariantUtils.CreateFrom(res);
         }
 
         return CreateWithUnsafeTrampoline(func, &Trampoline);
@@ -281,10 +281,10 @@ public readonly partial struct Callable
             ThrowIfArgCountMismatch(args, 1);
 
             TResult res = ((Func<T0, TResult>)delegateObj)(
-                VariantConversionCallbacks.GetToManagedCallback<T0>()(args[0])
+                VariantUtils.ConvertTo<T0>(args[0])
             );
 
-            ret = VariantConversionCallbacks.GetToVariantCallback<TResult>()(res);
+            ret = VariantUtils.CreateFrom(res);
         }
 
         return CreateWithUnsafeTrampoline(func, &Trampoline);
@@ -300,11 +300,11 @@ public readonly partial struct Callable
             ThrowIfArgCountMismatch(args, 2);
 
             TResult res = ((Func<T0, T1, TResult>)delegateObj)(
-                VariantConversionCallbacks.GetToManagedCallback<T0>()(args[0]),
-                VariantConversionCallbacks.GetToManagedCallback<T1>()(args[1])
+                VariantUtils.ConvertTo<T0>(args[0]),
+                VariantUtils.ConvertTo<T1>(args[1])
             );
 
-            ret = VariantConversionCallbacks.GetToVariantCallback<TResult>()(res);
+            ret = VariantUtils.CreateFrom(res);
         }
 
         return CreateWithUnsafeTrampoline(func, &Trampoline);
@@ -320,12 +320,12 @@ public readonly partial struct Callable
             ThrowIfArgCountMismatch(args, 3);
 
             TResult res = ((Func<T0, T1, T2, TResult>)delegateObj)(
-                VariantConversionCallbacks.GetToManagedCallback<T0>()(args[0]),
-                VariantConversionCallbacks.GetToManagedCallback<T1>()(args[1]),
-                VariantConversionCallbacks.GetToManagedCallback<T2>()(args[2])
+                VariantUtils.ConvertTo<T0>(args[0]),
+                VariantUtils.ConvertTo<T1>(args[1]),
+                VariantUtils.ConvertTo<T2>(args[2])
             );
 
-            ret = VariantConversionCallbacks.GetToVariantCallback<TResult>()(res);
+            ret = VariantUtils.CreateFrom(res);
         }
 
         return CreateWithUnsafeTrampoline(func, &Trampoline);
@@ -341,13 +341,13 @@ public readonly partial struct Callable
             ThrowIfArgCountMismatch(args, 4);
 
             TResult res = ((Func<T0, T1, T2, T3, TResult>)delegateObj)(
-                VariantConversionCallbacks.GetToManagedCallback<T0>()(args[0]),
-                VariantConversionCallbacks.GetToManagedCallback<T1>()(args[1]),
-                VariantConversionCallbacks.GetToManagedCallback<T2>()(args[2]),
-                VariantConversionCallbacks.GetToManagedCallback<T3>()(args[3])
+                VariantUtils.ConvertTo<T0>(args[0]),
+                VariantUtils.ConvertTo<T1>(args[1]),
+                VariantUtils.ConvertTo<T2>(args[2]),
+                VariantUtils.ConvertTo<T3>(args[3])
             );
 
-            ret = VariantConversionCallbacks.GetToVariantCallback<TResult>()(res);
+            ret = VariantUtils.CreateFrom(res);
         }
 
         return CreateWithUnsafeTrampoline(func, &Trampoline);
@@ -363,14 +363,14 @@ public readonly partial struct Callable
             ThrowIfArgCountMismatch(args, 5);
 
             TResult res = ((Func<T0, T1, T2, T3, T4, TResult>)delegateObj)(
-                VariantConversionCallbacks.GetToManagedCallback<T0>()(args[0]),
-                VariantConversionCallbacks.GetToManagedCallback<T1>()(args[1]),
-                VariantConversionCallbacks.GetToManagedCallback<T2>()(args[2]),
-                VariantConversionCallbacks.GetToManagedCallback<T3>()(args[3]),
-                VariantConversionCallbacks.GetToManagedCallback<T4>()(args[4])
+                VariantUtils.ConvertTo<T0>(args[0]),
+                VariantUtils.ConvertTo<T1>(args[1]),
+                VariantUtils.ConvertTo<T2>(args[2]),
+                VariantUtils.ConvertTo<T3>(args[3]),
+                VariantUtils.ConvertTo<T4>(args[4])
             );
 
-            ret = VariantConversionCallbacks.GetToVariantCallback<TResult>()(res);
+            ret = VariantUtils.CreateFrom(res);
         }
 
         return CreateWithUnsafeTrampoline(func, &Trampoline);
@@ -386,15 +386,15 @@ public readonly partial struct Callable
             ThrowIfArgCountMismatch(args, 6);
 
             TResult res = ((Func<T0, T1, T2, T3, T4, T5, TResult>)delegateObj)(
-                VariantConversionCallbacks.GetToManagedCallback<T0>()(args[0]),
-                VariantConversionCallbacks.GetToManagedCallback<T1>()(args[1]),
-                VariantConversionCallbacks.GetToManagedCallback<T2>()(args[2]),
-                VariantConversionCallbacks.GetToManagedCallback<T3>()(args[3]),
-                VariantConversionCallbacks.GetToManagedCallback<T4>()(args[4]),
-                VariantConversionCallbacks.GetToManagedCallback<T5>()(args[5])
+                VariantUtils.ConvertTo<T0>(args[0]),
+                VariantUtils.ConvertTo<T1>(args[1]),
+                VariantUtils.ConvertTo<T2>(args[2]),
+                VariantUtils.ConvertTo<T3>(args[3]),
+                VariantUtils.ConvertTo<T4>(args[4]),
+                VariantUtils.ConvertTo<T5>(args[5])
             );
 
-            ret = VariantConversionCallbacks.GetToVariantCallback<TResult>()(res);
+            ret = VariantUtils.CreateFrom(res);
         }
 
         return CreateWithUnsafeTrampoline(func, &Trampoline);
@@ -410,16 +410,16 @@ public readonly partial struct Callable
             ThrowIfArgCountMismatch(args, 7);
 
             TResult res = ((Func<T0, T1, T2, T3, T4, T5, T6, TResult>)delegateObj)(
-                VariantConversionCallbacks.GetToManagedCallback<T0>()(args[0]),
-                VariantConversionCallbacks.GetToManagedCallback<T1>()(args[1]),
-                VariantConversionCallbacks.GetToManagedCallback<T2>()(args[2]),
-                VariantConversionCallbacks.GetToManagedCallback<T3>()(args[3]),
-                VariantConversionCallbacks.GetToManagedCallback<T4>()(args[4]),
-                VariantConversionCallbacks.GetToManagedCallback<T5>()(args[5]),
-                VariantConversionCallbacks.GetToManagedCallback<T6>()(args[6])
+                VariantUtils.ConvertTo<T0>(args[0]),
+                VariantUtils.ConvertTo<T1>(args[1]),
+                VariantUtils.ConvertTo<T2>(args[2]),
+                VariantUtils.ConvertTo<T3>(args[3]),
+                VariantUtils.ConvertTo<T4>(args[4]),
+                VariantUtils.ConvertTo<T5>(args[5]),
+                VariantUtils.ConvertTo<T6>(args[6])
             );
 
-            ret = VariantConversionCallbacks.GetToVariantCallback<TResult>()(res);
+            ret = VariantUtils.CreateFrom(res);
         }
 
         return CreateWithUnsafeTrampoline(func, &Trampoline);
@@ -435,17 +435,17 @@ public readonly partial struct Callable
             ThrowIfArgCountMismatch(args, 8);
 
             TResult res = ((Func<T0, T1, T2, T3, T4, T5, T6, T7, TResult>)delegateObj)(
-                VariantConversionCallbacks.GetToManagedCallback<T0>()(args[0]),
-                VariantConversionCallbacks.GetToManagedCallback<T1>()(args[1]),
-                VariantConversionCallbacks.GetToManagedCallback<T2>()(args[2]),
-                VariantConversionCallbacks.GetToManagedCallback<T3>()(args[3]),
-                VariantConversionCallbacks.GetToManagedCallback<T4>()(args[4]),
-                VariantConversionCallbacks.GetToManagedCallback<T5>()(args[5]),
-                VariantConversionCallbacks.GetToManagedCallback<T6>()(args[6]),
-                VariantConversionCallbacks.GetToManagedCallback<T7>()(args[7])
+                VariantUtils.ConvertTo<T0>(args[0]),
+                VariantUtils.ConvertTo<T1>(args[1]),
+                VariantUtils.ConvertTo<T2>(args[2]),
+                VariantUtils.ConvertTo<T3>(args[3]),
+                VariantUtils.ConvertTo<T4>(args[4]),
+                VariantUtils.ConvertTo<T5>(args[5]),
+                VariantUtils.ConvertTo<T6>(args[6]),
+                VariantUtils.ConvertTo<T7>(args[7])
             );
 
-            ret = VariantConversionCallbacks.GetToVariantCallback<TResult>()(res);
+            ret = VariantUtils.CreateFrom(res);
         }
 
         return CreateWithUnsafeTrampoline(func, &Trampoline);
@@ -461,18 +461,18 @@ public readonly partial struct Callable
             ThrowIfArgCountMismatch(args, 9);
 
             TResult res = ((Func<T0, T1, T2, T3, T4, T5, T6, T7, T8, TResult>)delegateObj)(
-                VariantConversionCallbacks.GetToManagedCallback<T0>()(args[0]),
-                VariantConversionCallbacks.GetToManagedCallback<T1>()(args[1]),
-                VariantConversionCallbacks.GetToManagedCallback<T2>()(args[2]),
-                VariantConversionCallbacks.GetToManagedCallback<T3>()(args[3]),
-                VariantConversionCallbacks.GetToManagedCallback<T4>()(args[4]),
-                VariantConversionCallbacks.GetToManagedCallback<T5>()(args[5]),
-                VariantConversionCallbacks.GetToManagedCallback<T6>()(args[6]),
-                VariantConversionCallbacks.GetToManagedCallback<T7>()(args[7]),
-                VariantConversionCallbacks.GetToManagedCallback<T8>()(args[8])
+                VariantUtils.ConvertTo<T0>(args[0]),
+                VariantUtils.ConvertTo<T1>(args[1]),
+                VariantUtils.ConvertTo<T2>(args[2]),
+                VariantUtils.ConvertTo<T3>(args[3]),
+                VariantUtils.ConvertTo<T4>(args[4]),
+                VariantUtils.ConvertTo<T5>(args[5]),
+                VariantUtils.ConvertTo<T6>(args[6]),
+                VariantUtils.ConvertTo<T7>(args[7]),
+                VariantUtils.ConvertTo<T8>(args[8])
             );
 
-            ret = VariantConversionCallbacks.GetToVariantCallback<TResult>()(res);
+            ret = VariantUtils.CreateFrom(res);
         }
 
         return CreateWithUnsafeTrampoline(func, &Trampoline);
diff --git a/modules/mono/glue/GodotSharp/GodotSharp/Core/Dictionary.cs b/modules/mono/glue/GodotSharp/GodotSharp/Core/Dictionary.cs
index f8793332a0..f14790a218 100644
--- a/modules/mono/glue/GodotSharp/GodotSharp/Core/Dictionary.cs
+++ b/modules/mono/glue/GodotSharp/GodotSharp/Core/Dictionary.cs
@@ -362,45 +362,10 @@ namespace Godot.Collections
         private static Dictionary<TKey, TValue> FromVariantFunc(in godot_variant variant) =>
             VariantUtils.ConvertToDictionaryObject<TKey, TValue>(variant);
 
-        // ReSharper disable StaticMemberInGenericType
-        // Warning is about unique static fields being created for each generic type combination:
-        // https://www.jetbrains.com/help/resharper/StaticMemberInGenericType.html
-        // In our case this is exactly what we want.
-
-        private static readonly unsafe delegate* managed<in TKey, godot_variant> ConvertKeyToVariantCallback;
-        private static readonly unsafe delegate* managed<in godot_variant, TKey> ConvertKeyToManagedCallback;
-        private static readonly unsafe delegate* managed<in TValue, godot_variant> ConvertValueToVariantCallback;
-        private static readonly unsafe delegate* managed<in godot_variant, TValue> ConvertValueToManagedCallback;
-
-        // ReSharper restore StaticMemberInGenericType
-
         static unsafe Dictionary()
         {
-            VariantConversionCallbacks.GenericConversionCallbacks[typeof(Dictionary<TKey, TValue>)] =
-            (
-                (IntPtr)(delegate* managed<in Dictionary<TKey, TValue>, godot_variant>)&ToVariantFunc,
-                (IntPtr)(delegate* managed<in godot_variant, Dictionary<TKey, TValue>>)&FromVariantFunc
-            );
-
-            ConvertKeyToVariantCallback = VariantConversionCallbacks.GetToVariantCallback<TKey>();
-            ConvertKeyToManagedCallback = VariantConversionCallbacks.GetToManagedCallback<TKey>();
-            ConvertValueToVariantCallback = VariantConversionCallbacks.GetToVariantCallback<TValue>();
-            ConvertValueToManagedCallback = VariantConversionCallbacks.GetToManagedCallback<TValue>();
-        }
-
-        private static unsafe void ValidateVariantConversionCallbacks()
-        {
-            if (ConvertKeyToVariantCallback == null || ConvertKeyToManagedCallback == null)
-            {
-                throw new InvalidOperationException(
-                    $"The dictionary key type is not supported for conversion to Variant: '{typeof(TKey).FullName}'.");
-            }
-
-            if (ConvertValueToVariantCallback == null || ConvertValueToManagedCallback == null)
-            {
-                throw new InvalidOperationException(
-                    $"The dictionary value type is not supported for conversion to Variant: '{typeof(TValue).FullName}'.");
-            }
+            VariantUtils.GenericConversion<Dictionary<TKey, TValue>>.ToVariantCb = &ToVariantFunc;
+            VariantUtils.GenericConversion<Dictionary<TKey, TValue>>.FromVariantCb = &FromVariantFunc;
         }
 
         private readonly Dictionary _underlyingDict;
@@ -416,8 +381,6 @@ namespace Godot.Collections
         /// </summary>
         public Dictionary()
         {
-            ValidateVariantConversionCallbacks();
-
             _underlyingDict = new Dictionary();
         }
 
@@ -428,8 +391,6 @@ namespace Godot.Collections
         /// <returns>A new Godot Dictionary.</returns>
         public Dictionary(IDictionary<TKey, TValue> dictionary)
         {
-            ValidateVariantConversionCallbacks();
-
             if (dictionary == null)
                 throw new ArgumentNullException(nameof(dictionary));
 
@@ -446,8 +407,6 @@ namespace Godot.Collections
         /// <returns>A new Godot Dictionary.</returns>
         public Dictionary(Dictionary dictionary)
         {
-            ValidateVariantConversionCallbacks();
-
             _underlyingDict = dictionary;
         }
 
@@ -481,18 +440,18 @@ namespace Godot.Collections
         /// Returns the value at the given <paramref name="key"/>.
         /// </summary>
         /// <value>The value at the given <paramref name="key"/>.</value>
-        public unsafe TValue this[TKey key]
+        public TValue this[TKey key]
         {
             get
             {
-                using var variantKey = ConvertKeyToVariantCallback(key);
+                using var variantKey = VariantUtils.CreateFrom(key);
                 var self = (godot_dictionary)_underlyingDict.NativeValue;
 
                 if (NativeFuncs.godotsharp_dictionary_try_get_value(ref self,
                         variantKey, out godot_variant value).ToBool())
                 {
                     using (value)
-                        return ConvertValueToManagedCallback(value);
+                        return VariantUtils.ConvertTo<TValue>(value);
                 }
                 else
                 {
@@ -501,8 +460,8 @@ namespace Godot.Collections
             }
             set
             {
-                using var variantKey = ConvertKeyToVariantCallback(key);
-                using var variantValue = ConvertValueToVariantCallback(value);
+                using var variantKey = VariantUtils.CreateFrom(key);
+                using var variantValue = VariantUtils.CreateFrom(value);
                 var self = (godot_dictionary)_underlyingDict.NativeValue;
                 NativeFuncs.godotsharp_dictionary_set_value(ref self,
                     variantKey, variantValue);
@@ -541,7 +500,7 @@ namespace Godot.Collections
 
         IEnumerable<TValue> IReadOnlyDictionary<TKey, TValue>.Values => Values;
 
-        private unsafe KeyValuePair<TKey, TValue> GetKeyValuePair(int index)
+        private KeyValuePair<TKey, TValue> GetKeyValuePair(int index)
         {
             var self = (godot_dictionary)_underlyingDict.NativeValue;
             NativeFuncs.godotsharp_dictionary_key_value_pair_at(ref self, index,
@@ -551,8 +510,8 @@ namespace Godot.Collections
             using (value)
             {
                 return new KeyValuePair<TKey, TValue>(
-                    ConvertKeyToManagedCallback(key),
-                    ConvertValueToManagedCallback(value));
+                    VariantUtils.ConvertTo<TKey>(key),
+                    VariantUtils.ConvertTo<TValue>(value));
             }
         }
 
@@ -562,15 +521,15 @@ namespace Godot.Collections
         /// </summary>
         /// <param name="key">The key at which to add the object.</param>
         /// <param name="value">The object to add.</param>
-        public unsafe void Add(TKey key, TValue value)
+        public void Add(TKey key, TValue value)
         {
-            using var variantKey = ConvertKeyToVariantCallback(key);
+            using var variantKey = VariantUtils.CreateFrom(key);
             var self = (godot_dictionary)_underlyingDict.NativeValue;
 
             if (NativeFuncs.godotsharp_dictionary_contains_key(ref self, variantKey).ToBool())
                 throw new ArgumentException("An element with the same key already exists.", nameof(key));
 
-            using var variantValue = ConvertValueToVariantCallback(value);
+            using var variantValue = VariantUtils.CreateFrom(value);
             NativeFuncs.godotsharp_dictionary_add(ref self, variantKey, variantValue);
         }
 
@@ -579,9 +538,9 @@ namespace Godot.Collections
         /// </summary>
         /// <param name="key">The key to look for.</param>
         /// <returns>Whether or not this dictionary contains the given key.</returns>
-        public unsafe bool ContainsKey(TKey key)
+        public bool ContainsKey(TKey key)
         {
-            using var variantKey = ConvertKeyToVariantCallback(key);
+            using var variantKey = VariantUtils.CreateFrom(key);
             var self = (godot_dictionary)_underlyingDict.NativeValue;
             return NativeFuncs.godotsharp_dictionary_contains_key(ref self, variantKey).ToBool();
         }
@@ -590,9 +549,9 @@ namespace Godot.Collections
         /// Removes an element from this <see cref="Dictionary{TKey, TValue}"/> by key.
         /// </summary>
         /// <param name="key">The key of the element to remove.</param>
-        public unsafe bool Remove(TKey key)
+        public bool Remove(TKey key)
         {
-            using var variantKey = ConvertKeyToVariantCallback(key);
+            using var variantKey = VariantUtils.CreateFrom(key);
             var self = (godot_dictionary)_underlyingDict.NativeValue;
             return NativeFuncs.godotsharp_dictionary_remove_key(ref self, variantKey).ToBool();
         }
@@ -603,15 +562,15 @@ namespace Godot.Collections
         /// <param name="key">The key of the element to get.</param>
         /// <param name="value">The value at the given <paramref name="key"/>.</param>
         /// <returns>If an object was found for the given <paramref name="key"/>.</returns>
-        public unsafe bool TryGetValue(TKey key, [MaybeNullWhen(false)] out TValue value)
+        public bool TryGetValue(TKey key, [MaybeNullWhen(false)] out TValue value)
         {
-            using var variantKey = ConvertKeyToVariantCallback(key);
+            using var variantKey = VariantUtils.CreateFrom(key);
             var self = (godot_dictionary)_underlyingDict.NativeValue;
             bool found = NativeFuncs.godotsharp_dictionary_try_get_value(ref self,
                 variantKey, out godot_variant retValue).ToBool();
 
             using (retValue)
-                value = found ? ConvertValueToManagedCallback(retValue) : default;
+                value = found ? VariantUtils.ConvertTo<TValue>(retValue) : default;
 
             return found;
         }
@@ -635,9 +594,9 @@ namespace Godot.Collections
         /// </summary>
         public void Clear() => _underlyingDict.Clear();
 
-        unsafe bool ICollection<KeyValuePair<TKey, TValue>>.Contains(KeyValuePair<TKey, TValue> item)
+        bool ICollection<KeyValuePair<TKey, TValue>>.Contains(KeyValuePair<TKey, TValue> item)
         {
-            using var variantKey = ConvertKeyToVariantCallback(item.Key);
+            using var variantKey = VariantUtils.CreateFrom(item.Key);
             var self = (godot_dictionary)_underlyingDict.NativeValue;
             bool found = NativeFuncs.godotsharp_dictionary_try_get_value(ref self,
                 variantKey, out godot_variant retValue).ToBool();
@@ -647,7 +606,7 @@ namespace Godot.Collections
                 if (!found)
                     return false;
 
-                using var variantValue = ConvertValueToVariantCallback(item.Value);
+                using var variantValue = VariantUtils.CreateFrom(item.Value);
                 return NativeFuncs.godotsharp_variant_equals(variantValue, retValue).ToBool();
             }
         }
@@ -680,9 +639,9 @@ namespace Godot.Collections
             }
         }
 
-        unsafe bool ICollection<KeyValuePair<TKey, TValue>>.Remove(KeyValuePair<TKey, TValue> item)
+        bool ICollection<KeyValuePair<TKey, TValue>>.Remove(KeyValuePair<TKey, TValue> item)
         {
-            using var variantKey = ConvertKeyToVariantCallback(item.Key);
+            using var variantKey = VariantUtils.CreateFrom(item.Key);
             var self = (godot_dictionary)_underlyingDict.NativeValue;
             bool found = NativeFuncs.godotsharp_dictionary_try_get_value(ref self,
                 variantKey, out godot_variant retValue).ToBool();
@@ -692,7 +651,7 @@ namespace Godot.Collections
                 if (!found)
                     return false;
 
-                using var variantValue = ConvertValueToVariantCallback(item.Value);
+                using var variantValue = VariantUtils.CreateFrom(item.Value);
                 if (NativeFuncs.godotsharp_variant_equals(variantValue, retValue).ToBool())
                 {
                     return NativeFuncs.godotsharp_dictionary_remove_key(
diff --git a/modules/mono/glue/GodotSharp/GodotSharp/Core/Mathf.cs b/modules/mono/glue/GodotSharp/GodotSharp/Core/Mathf.cs
index f2667c6807..3f9e986f62 100644
--- a/modules/mono/glue/GodotSharp/GodotSharp/Core/Mathf.cs
+++ b/modules/mono/glue/GodotSharp/GodotSharp/Core/Mathf.cs
@@ -282,7 +282,7 @@ namespace Godot
 
         /// <summary>
         /// Returns the point at the given <paramref name="t"/> on a one-dimensional Bezier curve defined by
-        /// the given <paramref name="control1"/>, <paramref name="control2"/> and <paramref name="end"/> points.
+        /// the given <paramref name="control1"/>, <paramref name="control2"/>, and <paramref name="end"/> points.
         /// </summary>
         /// <param name="start">The start value for the interpolation.</param>
         /// <param name="control1">Control point that defines the bezier curve.</param>
@@ -303,6 +303,27 @@ namespace Godot
         }
 
         /// <summary>
+        /// Returns the derivative at the given <paramref name="t"/> on a one dimensional Bezier curve defined by
+        /// the given <paramref name="control1"/>, <paramref name="control2"/>, and <paramref name="end"/> points.
+        /// </summary>
+        /// <param name="start">The start value for the interpolation.</param>
+        /// <param name="control1">Control point that defines the bezier curve.</param>
+        /// <param name="control2">Control point that defines the bezier curve.</param>
+        /// <param name="end">The destination value for the interpolation.</param>
+        /// <param name="t">A value on the range of 0.0 to 1.0, representing the amount of interpolation.</param>
+        /// <returns>The resulting value of the interpolation.</returns>
+        public static real_t BezierDerivative(real_t start, real_t control1, real_t control2, real_t end, real_t t)
+        {
+            // Formula from Wikipedia article on Bezier curves
+            real_t omt = 1 - t;
+            real_t omt2 = omt * omt;
+            real_t t2 = t * t;
+
+            real_t d = (control1 - start) * 3 * omt2 + (control2 - control1) * 6 * omt * t + (end - control2) * 3 * t2;
+            return d;
+        }
+
+        /// <summary>
         /// Converts an angle expressed in degrees to radians.
         /// </summary>
         /// <param name="deg">An angle expressed in degrees.</param>
diff --git a/modules/mono/glue/GodotSharp/GodotSharp/Core/NativeInterop/VariantConversionCallbacks.cs b/modules/mono/glue/GodotSharp/GodotSharp/Core/NativeInterop/VariantConversionCallbacks.cs
deleted file mode 100644
index 4b3db0c01a..0000000000
--- a/modules/mono/glue/GodotSharp/GodotSharp/Core/NativeInterop/VariantConversionCallbacks.cs
+++ /dev/null
@@ -1,1057 +0,0 @@
-using System;
-using System.Diagnostics.CodeAnalysis;
-using System.Runtime.CompilerServices;
-
-namespace Godot.NativeInterop;
-
-// TODO: Change VariantConversionCallbacks<T>. Store the callback in a static field for quick repeated access, instead of checking every time.
-internal static unsafe class VariantConversionCallbacks
-{
-    internal static System.Collections.Generic.Dictionary<Type, (IntPtr ToVariant, IntPtr FromVariant)>
-        GenericConversionCallbacks = new();
-
-    [SuppressMessage("ReSharper", "RedundantNameQualifier")]
-    internal static delegate*<in T, godot_variant> GetToVariantCallback<T>()
-    {
-        static godot_variant FromBool(in bool @bool) =>
-            VariantUtils.CreateFromBool(@bool);
-
-        static godot_variant FromChar(in char @char) =>
-            VariantUtils.CreateFromInt(@char);
-
-        static godot_variant FromInt8(in sbyte @int8) =>
-            VariantUtils.CreateFromInt(@int8);
-
-        static godot_variant FromInt16(in short @int16) =>
-            VariantUtils.CreateFromInt(@int16);
-
-        static godot_variant FromInt32(in int @int32) =>
-            VariantUtils.CreateFromInt(@int32);
-
-        static godot_variant FromInt64(in long @int64) =>
-            VariantUtils.CreateFromInt(@int64);
-
-        static godot_variant FromUInt8(in byte @uint8) =>
-            VariantUtils.CreateFromInt(@uint8);
-
-        static godot_variant FromUInt16(in ushort @uint16) =>
-            VariantUtils.CreateFromInt(@uint16);
-
-        static godot_variant FromUInt32(in uint @uint32) =>
-            VariantUtils.CreateFromInt(@uint32);
-
-        static godot_variant FromUInt64(in ulong @uint64) =>
-            VariantUtils.CreateFromInt(@uint64);
-
-        static godot_variant FromFloat(in float @float) =>
-            VariantUtils.CreateFromFloat(@float);
-
-        static godot_variant FromDouble(in double @double) =>
-            VariantUtils.CreateFromFloat(@double);
-
-        static godot_variant FromVector2(in Vector2 @vector2) =>
-            VariantUtils.CreateFromVector2(@vector2);
-
-        static godot_variant FromVector2I(in Vector2i vector2I) =>
-            VariantUtils.CreateFromVector2i(vector2I);
-
-        static godot_variant FromRect2(in Rect2 @rect2) =>
-            VariantUtils.CreateFromRect2(@rect2);
-
-        static godot_variant FromRect2I(in Rect2i rect2I) =>
-            VariantUtils.CreateFromRect2i(rect2I);
-
-        static godot_variant FromTransform2D(in Transform2D @transform2D) =>
-            VariantUtils.CreateFromTransform2D(@transform2D);
-
-        static godot_variant FromVector3(in Vector3 @vector3) =>
-            VariantUtils.CreateFromVector3(@vector3);
-
-        static godot_variant FromVector3I(in Vector3i vector3I) =>
-            VariantUtils.CreateFromVector3i(vector3I);
-
-        static godot_variant FromBasis(in Basis @basis) =>
-            VariantUtils.CreateFromBasis(@basis);
-
-        static godot_variant FromQuaternion(in Quaternion @quaternion) =>
-            VariantUtils.CreateFromQuaternion(@quaternion);
-
-        static godot_variant FromTransform3D(in Transform3D @transform3d) =>
-            VariantUtils.CreateFromTransform3D(@transform3d);
-
-        static godot_variant FromVector4(in Vector4 @vector4) =>
-            VariantUtils.CreateFromVector4(@vector4);
-
-        static godot_variant FromVector4I(in Vector4i vector4I) =>
-            VariantUtils.CreateFromVector4i(vector4I);
-
-        static godot_variant FromAabb(in AABB @aabb) =>
-            VariantUtils.CreateFromAABB(@aabb);
-
-        static godot_variant FromColor(in Color @color) =>
-            VariantUtils.CreateFromColor(@color);
-
-        static godot_variant FromPlane(in Plane @plane) =>
-            VariantUtils.CreateFromPlane(@plane);
-
-        static godot_variant FromCallable(in Callable @callable) =>
-            VariantUtils.CreateFromCallable(@callable);
-
-        static godot_variant FromSignalInfo(in SignalInfo @signalInfo) =>
-            VariantUtils.CreateFromSignalInfo(@signalInfo);
-
-        static godot_variant FromString(in string @string) =>
-            VariantUtils.CreateFromString(@string);
-
-        static godot_variant FromByteArray(in byte[] byteArray) =>
-            VariantUtils.CreateFromPackedByteArray(byteArray);
-
-        static godot_variant FromInt32Array(in int[] int32Array) =>
-            VariantUtils.CreateFromPackedInt32Array(int32Array);
-
-        static godot_variant FromInt64Array(in long[] int64Array) =>
-            VariantUtils.CreateFromPackedInt64Array(int64Array);
-
-        static godot_variant FromFloatArray(in float[] floatArray) =>
-            VariantUtils.CreateFromPackedFloat32Array(floatArray);
-
-        static godot_variant FromDoubleArray(in double[] doubleArray) =>
-            VariantUtils.CreateFromPackedFloat64Array(doubleArray);
-
-        static godot_variant FromStringArray(in string[] stringArray) =>
-            VariantUtils.CreateFromPackedStringArray(stringArray);
-
-        static godot_variant FromVector2Array(in Vector2[] vector2Array) =>
-            VariantUtils.CreateFromPackedVector2Array(vector2Array);
-
-        static godot_variant FromVector3Array(in Vector3[] vector3Array) =>
-            VariantUtils.CreateFromPackedVector3Array(vector3Array);
-
-        static godot_variant FromColorArray(in Color[] colorArray) =>
-            VariantUtils.CreateFromPackedColorArray(colorArray);
-
-        static godot_variant FromStringNameArray(in StringName[] stringNameArray) =>
-            VariantUtils.CreateFromSystemArrayOfStringName(stringNameArray);
-
-        static godot_variant FromNodePathArray(in NodePath[] nodePathArray) =>
-            VariantUtils.CreateFromSystemArrayOfNodePath(nodePathArray);
-
-        static godot_variant FromRidArray(in RID[] ridArray) =>
-            VariantUtils.CreateFromSystemArrayOfRID(ridArray);
-
-        static godot_variant FromGodotObject(in Godot.Object godotObject) =>
-            VariantUtils.CreateFromGodotObject(godotObject);
-
-        static godot_variant FromStringName(in StringName stringName) =>
-            VariantUtils.CreateFromStringName(stringName);
-
-        static godot_variant FromNodePath(in NodePath nodePath) =>
-            VariantUtils.CreateFromNodePath(nodePath);
-
-        static godot_variant FromRid(in RID rid) =>
-            VariantUtils.CreateFromRID(rid);
-
-        static godot_variant FromGodotDictionary(in Collections.Dictionary godotDictionary) =>
-            VariantUtils.CreateFromDictionary(godotDictionary);
-
-        static godot_variant FromGodotArray(in Collections.Array godotArray) =>
-            VariantUtils.CreateFromArray(godotArray);
-
-        static godot_variant FromVariant(in Variant variant) =>
-            NativeFuncs.godotsharp_variant_new_copy((godot_variant)variant.NativeVar);
-
-        var typeOfT = typeof(T);
-
-        if (typeOfT == typeof(bool))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in bool, godot_variant>)
-                &FromBool;
-        }
-
-        if (typeOfT == typeof(char))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in char, godot_variant>)
-                &FromChar;
-        }
-
-        if (typeOfT == typeof(sbyte))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in sbyte, godot_variant>)
-                &FromInt8;
-        }
-
-        if (typeOfT == typeof(short))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in short, godot_variant>)
-                &FromInt16;
-        }
-
-        if (typeOfT == typeof(int))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in int, godot_variant>)
-                &FromInt32;
-        }
-
-        if (typeOfT == typeof(long))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in long, godot_variant>)
-                &FromInt64;
-        }
-
-        if (typeOfT == typeof(byte))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in byte, godot_variant>)
-                &FromUInt8;
-        }
-
-        if (typeOfT == typeof(ushort))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in ushort, godot_variant>)
-                &FromUInt16;
-        }
-
-        if (typeOfT == typeof(uint))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in uint, godot_variant>)
-                &FromUInt32;
-        }
-
-        if (typeOfT == typeof(ulong))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in ulong, godot_variant>)
-                &FromUInt64;
-        }
-
-        if (typeOfT == typeof(float))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in float, godot_variant>)
-                &FromFloat;
-        }
-
-        if (typeOfT == typeof(double))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in double, godot_variant>)
-                &FromDouble;
-        }
-
-        if (typeOfT == typeof(Vector2))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Vector2, godot_variant>)
-                &FromVector2;
-        }
-
-        if (typeOfT == typeof(Vector2i))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Vector2i, godot_variant>)
-                &FromVector2I;
-        }
-
-        if (typeOfT == typeof(Rect2))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Rect2, godot_variant>)
-                &FromRect2;
-        }
-
-        if (typeOfT == typeof(Rect2i))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Rect2i, godot_variant>)
-                &FromRect2I;
-        }
-
-        if (typeOfT == typeof(Transform2D))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Transform2D, godot_variant>)
-                &FromTransform2D;
-        }
-
-        if (typeOfT == typeof(Vector3))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Vector3, godot_variant>)
-                &FromVector3;
-        }
-
-        if (typeOfT == typeof(Vector3i))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Vector3i, godot_variant>)
-                &FromVector3I;
-        }
-
-        if (typeOfT == typeof(Basis))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Basis, godot_variant>)
-                &FromBasis;
-        }
-
-        if (typeOfT == typeof(Quaternion))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Quaternion, godot_variant>)
-                &FromQuaternion;
-        }
-
-        if (typeOfT == typeof(Transform3D))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Transform3D, godot_variant>)
-                &FromTransform3D;
-        }
-
-        if (typeOfT == typeof(Vector4))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Vector4, godot_variant>)
-                &FromVector4;
-        }
-
-        if (typeOfT == typeof(Vector4i))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Vector4i, godot_variant>)
-                &FromVector4I;
-        }
-
-        if (typeOfT == typeof(AABB))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in AABB, godot_variant>)
-                &FromAabb;
-        }
-
-        if (typeOfT == typeof(Color))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Color, godot_variant>)
-                &FromColor;
-        }
-
-        if (typeOfT == typeof(Plane))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Plane, godot_variant>)
-                &FromPlane;
-        }
-
-        if (typeOfT == typeof(Callable))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Callable, godot_variant>)
-                &FromCallable;
-        }
-
-        if (typeOfT == typeof(SignalInfo))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in SignalInfo, godot_variant>)
-                &FromSignalInfo;
-        }
-
-        if (typeOfT.IsEnum)
-        {
-            var enumUnderlyingType = typeOfT.GetEnumUnderlyingType();
-
-            switch (Type.GetTypeCode(enumUnderlyingType))
-            {
-                case TypeCode.SByte:
-                {
-                    return (delegate*<in T, godot_variant>)(delegate*<in sbyte, godot_variant>)
-                        &FromInt8;
-                }
-                case TypeCode.Int16:
-                {
-                    return (delegate*<in T, godot_variant>)(delegate*<in short, godot_variant>)
-                        &FromInt16;
-                }
-                case TypeCode.Int32:
-                {
-                    return (delegate*<in T, godot_variant>)(delegate*<in int, godot_variant>)
-                        &FromInt32;
-                }
-                case TypeCode.Int64:
-                {
-                    return (delegate*<in T, godot_variant>)(delegate*<in long, godot_variant>)
-                        &FromInt64;
-                }
-                case TypeCode.Byte:
-                {
-                    return (delegate*<in T, godot_variant>)(delegate*<in byte, godot_variant>)
-                        &FromUInt8;
-                }
-                case TypeCode.UInt16:
-                {
-                    return (delegate*<in T, godot_variant>)(delegate*<in ushort, godot_variant>)
-                        &FromUInt16;
-                }
-                case TypeCode.UInt32:
-                {
-                    return (delegate*<in T, godot_variant>)(delegate*<in uint, godot_variant>)
-                        &FromUInt32;
-                }
-                case TypeCode.UInt64:
-                {
-                    return (delegate*<in T, godot_variant>)(delegate*<in ulong, godot_variant>)
-                        &FromUInt64;
-                }
-                default:
-                    return null;
-            }
-        }
-
-        if (typeOfT == typeof(string))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in string, godot_variant>)
-                &FromString;
-        }
-
-        if (typeOfT == typeof(byte[]))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in byte[], godot_variant>)
-                &FromByteArray;
-        }
-
-        if (typeOfT == typeof(int[]))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in int[], godot_variant>)
-                &FromInt32Array;
-        }
-
-        if (typeOfT == typeof(long[]))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in long[], godot_variant>)
-                &FromInt64Array;
-        }
-
-        if (typeOfT == typeof(float[]))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in float[], godot_variant>)
-                &FromFloatArray;
-        }
-
-        if (typeOfT == typeof(double[]))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in double[], godot_variant>)
-                &FromDoubleArray;
-        }
-
-        if (typeOfT == typeof(string[]))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in string[], godot_variant>)
-                &FromStringArray;
-        }
-
-        if (typeOfT == typeof(Vector2[]))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Vector2[], godot_variant>)
-                &FromVector2Array;
-        }
-
-        if (typeOfT == typeof(Vector3[]))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Vector3[], godot_variant>)
-                &FromVector3Array;
-        }
-
-        if (typeOfT == typeof(Color[]))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Color[], godot_variant>)
-                &FromColorArray;
-        }
-
-        if (typeOfT == typeof(StringName[]))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in StringName[], godot_variant>)
-                &FromStringNameArray;
-        }
-
-        if (typeOfT == typeof(NodePath[]))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in NodePath[], godot_variant>)
-                &FromNodePathArray;
-        }
-
-        if (typeOfT == typeof(RID[]))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in RID[], godot_variant>)
-                &FromRidArray;
-        }
-
-        if (typeof(Godot.Object).IsAssignableFrom(typeOfT))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Godot.Object, godot_variant>)
-                &FromGodotObject;
-        }
-
-        if (typeOfT == typeof(StringName))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in StringName, godot_variant>)
-                &FromStringName;
-        }
-
-        if (typeOfT == typeof(NodePath))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in NodePath, godot_variant>)
-                &FromNodePath;
-        }
-
-        if (typeOfT == typeof(RID))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in RID, godot_variant>)
-                &FromRid;
-        }
-
-        if (typeOfT == typeof(Godot.Collections.Dictionary))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Godot.Collections.Dictionary, godot_variant>)
-                &FromGodotDictionary;
-        }
-
-        if (typeOfT == typeof(Godot.Collections.Array))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Godot.Collections.Array, godot_variant>)
-                &FromGodotArray;
-        }
-
-        if (typeOfT == typeof(Variant))
-        {
-            return (delegate*<in T, godot_variant>)(delegate*<in Variant, godot_variant>)
-                &FromVariant;
-        }
-
-        // TODO:
-        //   IsGenericType and GetGenericTypeDefinition don't work in NativeAOT's reflection-free mode.
-        //   We could make the Godot collections implement an interface and use IsAssignableFrom instead.
-        //   Or we could just skip the check and always look for a conversion callback for the type.
-        if (typeOfT.IsGenericType)
-        {
-            var genericTypeDef = typeOfT.GetGenericTypeDefinition();
-
-            if (genericTypeDef == typeof(Godot.Collections.Dictionary<,>) ||
-                genericTypeDef == typeof(Godot.Collections.Array<>))
-            {
-                RuntimeHelpers.RunClassConstructor(typeOfT.TypeHandle);
-
-                if (GenericConversionCallbacks.TryGetValue(typeOfT, out var genericConversion))
-                {
-                    return (delegate*<in T, godot_variant>)genericConversion.ToVariant;
-                }
-            }
-        }
-
-        return null;
-    }
-
-    [SuppressMessage("ReSharper", "RedundantNameQualifier")]
-    internal static delegate*<in godot_variant, T> GetToManagedCallback<T>()
-    {
-        static bool ToBool(in godot_variant variant) =>
-            VariantUtils.ConvertToBool(variant);
-
-        static char ToChar(in godot_variant variant) =>
-            VariantUtils.ConvertToChar(variant);
-
-        static sbyte ToInt8(in godot_variant variant) =>
-            VariantUtils.ConvertToInt8(variant);
-
-        static short ToInt16(in godot_variant variant) =>
-            VariantUtils.ConvertToInt16(variant);
-
-        static int ToInt32(in godot_variant variant) =>
-            VariantUtils.ConvertToInt32(variant);
-
-        static long ToInt64(in godot_variant variant) =>
-            VariantUtils.ConvertToInt64(variant);
-
-        static byte ToUInt8(in godot_variant variant) =>
-            VariantUtils.ConvertToUInt8(variant);
-
-        static ushort ToUInt16(in godot_variant variant) =>
-            VariantUtils.ConvertToUInt16(variant);
-
-        static uint ToUInt32(in godot_variant variant) =>
-            VariantUtils.ConvertToUInt32(variant);
-
-        static ulong ToUInt64(in godot_variant variant) =>
-            VariantUtils.ConvertToUInt64(variant);
-
-        static float ToFloat(in godot_variant variant) =>
-            VariantUtils.ConvertToFloat32(variant);
-
-        static double ToDouble(in godot_variant variant) =>
-            VariantUtils.ConvertToFloat64(variant);
-
-        static Vector2 ToVector2(in godot_variant variant) =>
-            VariantUtils.ConvertToVector2(variant);
-
-        static Vector2i ToVector2I(in godot_variant variant) =>
-            VariantUtils.ConvertToVector2i(variant);
-
-        static Rect2 ToRect2(in godot_variant variant) =>
-            VariantUtils.ConvertToRect2(variant);
-
-        static Rect2i ToRect2I(in godot_variant variant) =>
-            VariantUtils.ConvertToRect2i(variant);
-
-        static Transform2D ToTransform2D(in godot_variant variant) =>
-            VariantUtils.ConvertToTransform2D(variant);
-
-        static Vector3 ToVector3(in godot_variant variant) =>
-            VariantUtils.ConvertToVector3(variant);
-
-        static Vector3i ToVector3I(in godot_variant variant) =>
-            VariantUtils.ConvertToVector3i(variant);
-
-        static Basis ToBasis(in godot_variant variant) =>
-            VariantUtils.ConvertToBasis(variant);
-
-        static Quaternion ToQuaternion(in godot_variant variant) =>
-            VariantUtils.ConvertToQuaternion(variant);
-
-        static Transform3D ToTransform3D(in godot_variant variant) =>
-            VariantUtils.ConvertToTransform3D(variant);
-
-        static Vector4 ToVector4(in godot_variant variant) =>
-            VariantUtils.ConvertToVector4(variant);
-
-        static Vector4i ToVector4I(in godot_variant variant) =>
-            VariantUtils.ConvertToVector4i(variant);
-
-        static AABB ToAabb(in godot_variant variant) =>
-            VariantUtils.ConvertToAABB(variant);
-
-        static Color ToColor(in godot_variant variant) =>
-            VariantUtils.ConvertToColor(variant);
-
-        static Plane ToPlane(in godot_variant variant) =>
-            VariantUtils.ConvertToPlane(variant);
-
-        static Callable ToCallable(in godot_variant variant) =>
-            VariantUtils.ConvertToCallableManaged(variant);
-
-        static SignalInfo ToSignalInfo(in godot_variant variant) =>
-            VariantUtils.ConvertToSignalInfo(variant);
-
-        static string ToString(in godot_variant variant) =>
-            VariantUtils.ConvertToStringObject(variant);
-
-        static byte[] ToByteArray(in godot_variant variant) =>
-            VariantUtils.ConvertAsPackedByteArrayToSystemArray(variant);
-
-        static int[] ToInt32Array(in godot_variant variant) =>
-            VariantUtils.ConvertAsPackedInt32ArrayToSystemArray(variant);
-
-        static long[] ToInt64Array(in godot_variant variant) =>
-            VariantUtils.ConvertAsPackedInt64ArrayToSystemArray(variant);
-
-        static float[] ToFloatArray(in godot_variant variant) =>
-            VariantUtils.ConvertAsPackedFloat32ArrayToSystemArray(variant);
-
-        static double[] ToDoubleArray(in godot_variant variant) =>
-            VariantUtils.ConvertAsPackedFloat64ArrayToSystemArray(variant);
-
-        static string[] ToStringArray(in godot_variant variant) =>
-            VariantUtils.ConvertAsPackedStringArrayToSystemArray(variant);
-
-        static Vector2[] ToVector2Array(in godot_variant variant) =>
-            VariantUtils.ConvertAsPackedVector2ArrayToSystemArray(variant);
-
-        static Vector3[] ToVector3Array(in godot_variant variant) =>
-            VariantUtils.ConvertAsPackedVector3ArrayToSystemArray(variant);
-
-        static Color[] ToColorArray(in godot_variant variant) =>
-            VariantUtils.ConvertAsPackedColorArrayToSystemArray(variant);
-
-        static StringName[] ToStringNameArray(in godot_variant variant) =>
-            VariantUtils.ConvertToSystemArrayOfStringName(variant);
-
-        static NodePath[] ToNodePathArray(in godot_variant variant) =>
-            VariantUtils.ConvertToSystemArrayOfNodePath(variant);
-
-        static RID[] ToRidArray(in godot_variant variant) =>
-            VariantUtils.ConvertToSystemArrayOfRID(variant);
-
-        static Godot.Object ToGodotObject(in godot_variant variant) =>
-            VariantUtils.ConvertToGodotObject(variant);
-
-        static StringName ToStringName(in godot_variant variant) =>
-            VariantUtils.ConvertToStringNameObject(variant);
-
-        static NodePath ToNodePath(in godot_variant variant) =>
-            VariantUtils.ConvertToNodePathObject(variant);
-
-        static RID ToRid(in godot_variant variant) =>
-            VariantUtils.ConvertToRID(variant);
-
-        static Collections.Dictionary ToGodotDictionary(in godot_variant variant) =>
-            VariantUtils.ConvertToDictionaryObject(variant);
-
-        static Collections.Array ToGodotArray(in godot_variant variant) =>
-            VariantUtils.ConvertToArrayObject(variant);
-
-        static Variant ToVariant(in godot_variant variant) =>
-            Variant.CreateCopyingBorrowed(variant);
-
-        var typeOfT = typeof(T);
-
-        // ReSharper disable RedundantCast
-        // Rider is being stupid here. These casts are definitely needed. We get build errors without them.
-
-        if (typeOfT == typeof(bool))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, bool>)
-                &ToBool;
-        }
-
-        if (typeOfT == typeof(char))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, char>)
-                &ToChar;
-        }
-
-        if (typeOfT == typeof(sbyte))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, sbyte>)
-                &ToInt8;
-        }
-
-        if (typeOfT == typeof(short))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, short>)
-                &ToInt16;
-        }
-
-        if (typeOfT == typeof(int))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, int>)
-                &ToInt32;
-        }
-
-        if (typeOfT == typeof(long))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, long>)
-                &ToInt64;
-        }
-
-        if (typeOfT == typeof(byte))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, byte>)
-                &ToUInt8;
-        }
-
-        if (typeOfT == typeof(ushort))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, ushort>)
-                &ToUInt16;
-        }
-
-        if (typeOfT == typeof(uint))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, uint>)
-                &ToUInt32;
-        }
-
-        if (typeOfT == typeof(ulong))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, ulong>)
-                &ToUInt64;
-        }
-
-        if (typeOfT == typeof(float))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, float>)
-                &ToFloat;
-        }
-
-        if (typeOfT == typeof(double))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, double>)
-                &ToDouble;
-        }
-
-        if (typeOfT == typeof(Vector2))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Vector2>)
-                &ToVector2;
-        }
-
-        if (typeOfT == typeof(Vector2i))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Vector2i>)
-                &ToVector2I;
-        }
-
-        if (typeOfT == typeof(Rect2))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Rect2>)
-                &ToRect2;
-        }
-
-        if (typeOfT == typeof(Rect2i))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Rect2i>)
-                &ToRect2I;
-        }
-
-        if (typeOfT == typeof(Transform2D))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Transform2D>)
-                &ToTransform2D;
-        }
-
-        if (typeOfT == typeof(Vector3))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Vector3>)
-                &ToVector3;
-        }
-
-        if (typeOfT == typeof(Vector3i))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Vector3i>)
-                &ToVector3I;
-        }
-
-        if (typeOfT == typeof(Basis))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Basis>)
-                &ToBasis;
-        }
-
-        if (typeOfT == typeof(Quaternion))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Quaternion>)
-                &ToQuaternion;
-        }
-
-        if (typeOfT == typeof(Transform3D))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Transform3D>)
-                &ToTransform3D;
-        }
-
-        if (typeOfT == typeof(Vector4))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Vector4>)
-                &ToVector4;
-        }
-
-        if (typeOfT == typeof(Vector4i))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Vector4i>)
-                &ToVector4I;
-        }
-
-        if (typeOfT == typeof(AABB))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, AABB>)
-                &ToAabb;
-        }
-
-        if (typeOfT == typeof(Color))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Color>)
-                &ToColor;
-        }
-
-        if (typeOfT == typeof(Plane))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Plane>)
-                &ToPlane;
-        }
-
-        if (typeOfT == typeof(Callable))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Callable>)
-                &ToCallable;
-        }
-
-        if (typeOfT == typeof(SignalInfo))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, SignalInfo>)
-                &ToSignalInfo;
-        }
-
-        if (typeOfT.IsEnum)
-        {
-            var enumUnderlyingType = typeOfT.GetEnumUnderlyingType();
-
-            switch (Type.GetTypeCode(enumUnderlyingType))
-            {
-                case TypeCode.SByte:
-                {
-                    return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, sbyte>)
-                        &ToInt8;
-                }
-                case TypeCode.Int16:
-                {
-                    return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, short>)
-                        &ToInt16;
-                }
-                case TypeCode.Int32:
-                {
-                    return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, int>)
-                        &ToInt32;
-                }
-                case TypeCode.Int64:
-                {
-                    return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, long>)
-                        &ToInt64;
-                }
-                case TypeCode.Byte:
-                {
-                    return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, byte>)
-                        &ToUInt8;
-                }
-                case TypeCode.UInt16:
-                {
-                    return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, ushort>)
-                        &ToUInt16;
-                }
-                case TypeCode.UInt32:
-                {
-                    return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, uint>)
-                        &ToUInt32;
-                }
-                case TypeCode.UInt64:
-                {
-                    return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, ulong>)
-                        &ToUInt64;
-                }
-                default:
-                    return null;
-            }
-        }
-
-        if (typeOfT == typeof(string))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, string>)
-                &ToString;
-        }
-
-        if (typeOfT == typeof(byte[]))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, byte[]>)
-                &ToByteArray;
-        }
-
-        if (typeOfT == typeof(int[]))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, int[]>)
-                &ToInt32Array;
-        }
-
-        if (typeOfT == typeof(long[]))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, long[]>)
-                &ToInt64Array;
-        }
-
-        if (typeOfT == typeof(float[]))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, float[]>)
-                &ToFloatArray;
-        }
-
-        if (typeOfT == typeof(double[]))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, double[]>)
-                &ToDoubleArray;
-        }
-
-        if (typeOfT == typeof(string[]))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, string[]>)
-                &ToStringArray;
-        }
-
-        if (typeOfT == typeof(Vector2[]))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Vector2[]>)
-                &ToVector2Array;
-        }
-
-        if (typeOfT == typeof(Vector3[]))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Vector3[]>)
-                &ToVector3Array;
-        }
-
-        if (typeOfT == typeof(Color[]))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Color[]>)
-                &ToColorArray;
-        }
-
-        if (typeOfT == typeof(StringName[]))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, StringName[]>)
-                &ToStringNameArray;
-        }
-
-        if (typeOfT == typeof(NodePath[]))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, NodePath[]>)
-                &ToNodePathArray;
-        }
-
-        if (typeOfT == typeof(RID[]))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, RID[]>)
-                &ToRidArray;
-        }
-
-        if (typeof(Godot.Object).IsAssignableFrom(typeOfT))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Godot.Object>)
-                &ToGodotObject;
-        }
-
-        if (typeOfT == typeof(StringName))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, StringName>)
-                &ToStringName;
-        }
-
-        if (typeOfT == typeof(NodePath))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, NodePath>)
-                &ToNodePath;
-        }
-
-        if (typeOfT == typeof(RID))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, RID>)
-                &ToRid;
-        }
-
-        if (typeOfT == typeof(Godot.Collections.Dictionary))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Godot.Collections.Dictionary>)
-                &ToGodotDictionary;
-        }
-
-        if (typeOfT == typeof(Godot.Collections.Array))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Godot.Collections.Array>)
-                &ToGodotArray;
-        }
-
-        if (typeOfT == typeof(Variant))
-        {
-            return (delegate*<in godot_variant, T>)(delegate*<in godot_variant, Variant>)
-                &ToVariant;
-        }
-
-        // TODO:
-        //   IsGenericType and GetGenericTypeDefinition don't work in NativeAOT's reflection-free mode.
-        //   We could make the Godot collections implement an interface and use IsAssignableFrom instead.
-        //   Or we could just skip the check and always look for a conversion callback for the type.
-        if (typeOfT.IsGenericType)
-        {
-            var genericTypeDef = typeOfT.GetGenericTypeDefinition();
-
-            if (genericTypeDef == typeof(Godot.Collections.Dictionary<,>) ||
-                genericTypeDef == typeof(Godot.Collections.Array<>))
-            {
-                RuntimeHelpers.RunClassConstructor(typeOfT.TypeHandle);
-
-                if (GenericConversionCallbacks.TryGetValue(typeOfT, out var genericConversion))
-                {
-                    return (delegate*<in godot_variant, T>)genericConversion.FromVariant;
-                }
-            }
-        }
-
-        // ReSharper restore RedundantCast
-
-        return null;
-    }
-}
diff --git a/modules/mono/glue/GodotSharp/GodotSharp/Core/NativeInterop/VariantUtils.cs b/modules/mono/glue/GodotSharp/GodotSharp/Core/NativeInterop/VariantUtils.cs
index 57f9ec7d95..ba8e7a6c65 100644
--- a/modules/mono/glue/GodotSharp/GodotSharp/Core/NativeInterop/VariantUtils.cs
+++ b/modules/mono/glue/GodotSharp/GodotSharp/Core/NativeInterop/VariantUtils.cs
@@ -8,7 +8,7 @@ using Godot.Collections;
 
 namespace Godot.NativeInterop
 {
-    public static class VariantUtils
+    public static partial class VariantUtils
     {
         public static godot_variant CreateFromRID(RID from)
             => new() { Type = Variant.Type.Rid, RID = from };
diff --git a/modules/mono/glue/GodotSharp/GodotSharp/Core/NativeInterop/VariantUtils.generic.cs b/modules/mono/glue/GodotSharp/GodotSharp/Core/NativeInterop/VariantUtils.generic.cs
new file mode 100644
index 0000000000..db84175e17
--- /dev/null
+++ b/modules/mono/glue/GodotSharp/GodotSharp/Core/NativeInterop/VariantUtils.generic.cs
@@ -0,0 +1,406 @@
+using System;
+using System.Diagnostics.CodeAnalysis;
+using System.Runtime.CompilerServices;
+
+namespace Godot.NativeInterop;
+
+public partial class VariantUtils
+{
+    private static Exception UnsupportedType<T>() => throw new InvalidOperationException(
+        $"The type is not supported for conversion to/from Variant: '{typeof(T).FullName}'");
+
+    internal static class GenericConversion<T>
+    {
+        public static unsafe godot_variant ToVariant(in T from) =>
+            ToVariantCb != null ? ToVariantCb(from) : throw UnsupportedType<T>();
+
+        public static unsafe T FromVariant(in godot_variant variant) =>
+            FromVariantCb != null ? FromVariantCb(variant) : throw UnsupportedType<T>();
+
+        // ReSharper disable once StaticMemberInGenericType
+        internal static unsafe delegate*<in T, godot_variant> ToVariantCb;
+
+        // ReSharper disable once StaticMemberInGenericType
+        internal static unsafe delegate*<in godot_variant, T> FromVariantCb;
+
+        [SuppressMessage("ReSharper", "RedundantNameQualifier")]
+        static GenericConversion()
+        {
+            RuntimeHelpers.RunClassConstructor(typeof(T).TypeHandle);
+        }
+    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
+    [SuppressMessage("ReSharper", "RedundantNameQualifier")]
+    public static godot_variant CreateFrom<[MustBeVariant] T>(in T from)
+    {
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        static TTo UnsafeAs<TTo>(in T f) => Unsafe.As<T, TTo>(ref Unsafe.AsRef(f));
+
+        // `typeof(T) == typeof(X)` is optimized away. We cannot cache `typeof(T)` in a local variable, as it's not optimized when done like that.
+
+        if (typeof(T) == typeof(bool))
+            return CreateFromBool(UnsafeAs<bool>(from));
+
+        if (typeof(T) == typeof(char))
+            return CreateFromInt(UnsafeAs<char>(from));
+
+        if (typeof(T) == typeof(sbyte))
+            return CreateFromInt(UnsafeAs<sbyte>(from));
+
+        if (typeof(T) == typeof(short))
+            return CreateFromInt(UnsafeAs<short>(from));
+
+        if (typeof(T) == typeof(int))
+            return CreateFromInt(UnsafeAs<int>(from));
+
+        if (typeof(T) == typeof(long))
+            return CreateFromInt(UnsafeAs<long>(from));
+
+        if (typeof(T) == typeof(byte))
+            return CreateFromInt(UnsafeAs<byte>(from));
+
+        if (typeof(T) == typeof(ushort))
+            return CreateFromInt(UnsafeAs<ushort>(from));
+
+        if (typeof(T) == typeof(uint))
+            return CreateFromInt(UnsafeAs<uint>(from));
+
+        if (typeof(T) == typeof(ulong))
+            return CreateFromInt(UnsafeAs<ulong>(from));
+
+        if (typeof(T) == typeof(float))
+            return CreateFromFloat(UnsafeAs<float>(from));
+
+        if (typeof(T) == typeof(double))
+            return CreateFromFloat(UnsafeAs<double>(from));
+
+        if (typeof(T) == typeof(Vector2))
+            return CreateFromVector2(UnsafeAs<Vector2>(from));
+
+        if (typeof(T) == typeof(Vector2i))
+            return CreateFromVector2i(UnsafeAs<Vector2i>(from));
+
+        if (typeof(T) == typeof(Rect2))
+            return CreateFromRect2(UnsafeAs<Rect2>(from));
+
+        if (typeof(T) == typeof(Rect2i))
+            return CreateFromRect2i(UnsafeAs<Rect2i>(from));
+
+        if (typeof(T) == typeof(Transform2D))
+            return CreateFromTransform2D(UnsafeAs<Transform2D>(from));
+
+        if (typeof(T) == typeof(Vector3))
+            return CreateFromVector3(UnsafeAs<Vector3>(from));
+
+        if (typeof(T) == typeof(Vector3i))
+            return CreateFromVector3i(UnsafeAs<Vector3i>(from));
+
+        if (typeof(T) == typeof(Basis))
+            return CreateFromBasis(UnsafeAs<Basis>(from));
+
+        if (typeof(T) == typeof(Quaternion))
+            return CreateFromQuaternion(UnsafeAs<Quaternion>(from));
+
+        if (typeof(T) == typeof(Transform3D))
+            return CreateFromTransform3D(UnsafeAs<Transform3D>(from));
+
+        if (typeof(T) == typeof(Vector4))
+            return CreateFromVector4(UnsafeAs<Vector4>(from));
+
+        if (typeof(T) == typeof(Vector4i))
+            return CreateFromVector4i(UnsafeAs<Vector4i>(from));
+
+        if (typeof(T) == typeof(AABB))
+            return CreateFromAABB(UnsafeAs<AABB>(from));
+
+        if (typeof(T) == typeof(Color))
+            return CreateFromColor(UnsafeAs<Color>(from));
+
+        if (typeof(T) == typeof(Plane))
+            return CreateFromPlane(UnsafeAs<Plane>(from));
+
+        if (typeof(T) == typeof(Callable))
+            return CreateFromCallable(UnsafeAs<Callable>(from));
+
+        if (typeof(T) == typeof(SignalInfo))
+            return CreateFromSignalInfo(UnsafeAs<SignalInfo>(from));
+
+        if (typeof(T) == typeof(string))
+            return CreateFromString(UnsafeAs<string>(from));
+
+        if (typeof(T) == typeof(byte[]))
+            return CreateFromPackedByteArray(UnsafeAs<byte[]>(from));
+
+        if (typeof(T) == typeof(int[]))
+            return CreateFromPackedInt32Array(UnsafeAs<int[]>(from));
+
+        if (typeof(T) == typeof(long[]))
+            return CreateFromPackedInt64Array(UnsafeAs<long[]>(from));
+
+        if (typeof(T) == typeof(float[]))
+            return CreateFromPackedFloat32Array(UnsafeAs<float[]>(from));
+
+        if (typeof(T) == typeof(double[]))
+            return CreateFromPackedFloat64Array(UnsafeAs<double[]>(from));
+
+        if (typeof(T) == typeof(string[]))
+            return CreateFromPackedStringArray(UnsafeAs<string[]>(from));
+
+        if (typeof(T) == typeof(Vector2[]))
+            return CreateFromPackedVector2Array(UnsafeAs<Vector2[]>(from));
+
+        if (typeof(T) == typeof(Vector3[]))
+            return CreateFromPackedVector3Array(UnsafeAs<Vector3[]>(from));
+
+        if (typeof(T) == typeof(Color[]))
+            return CreateFromPackedColorArray(UnsafeAs<Color[]>(from));
+
+        if (typeof(T) == typeof(StringName[]))
+            return CreateFromSystemArrayOfStringName(UnsafeAs<StringName[]>(from));
+
+        if (typeof(T) == typeof(NodePath[]))
+            return CreateFromSystemArrayOfNodePath(UnsafeAs<NodePath[]>(from));
+
+        if (typeof(T) == typeof(RID[]))
+            return CreateFromSystemArrayOfRID(UnsafeAs<RID[]>(from));
+
+        if (typeof(T) == typeof(StringName))
+            return CreateFromStringName(UnsafeAs<StringName>(from));
+
+        if (typeof(T) == typeof(NodePath))
+            return CreateFromNodePath(UnsafeAs<NodePath>(from));
+
+        if (typeof(T) == typeof(RID))
+            return CreateFromRID(UnsafeAs<RID>(from));
+
+        if (typeof(T) == typeof(Godot.Collections.Dictionary))
+            return CreateFromDictionary(UnsafeAs<Godot.Collections.Dictionary>(from));
+
+        if (typeof(T) == typeof(Godot.Collections.Array))
+            return CreateFromArray(UnsafeAs<Godot.Collections.Array>(from));
+
+        if (typeof(T) == typeof(Variant))
+            return NativeFuncs.godotsharp_variant_new_copy((godot_variant)UnsafeAs<Variant>(from).NativeVar);
+
+        // More complex checks here at the end, to avoid screwing the simple ones in case they're not optimized away.
+
+        // `typeof(X).IsAssignableFrom(typeof(T))` is optimized away
+
+        if (typeof(Godot.Object).IsAssignableFrom(typeof(T)))
+            return CreateFromGodotObject(UnsafeAs<Godot.Object>(from));
+
+        // `typeof(T).IsValueType` is optimized away
+        // `typeof(T).IsEnum` is NOT optimized away: https://github.com/dotnet/runtime/issues/67113
+        // Fortunately, `typeof(System.Enum).IsAssignableFrom(typeof(T))` does the job!
+
+        if (typeof(T).IsValueType && typeof(System.Enum).IsAssignableFrom(typeof(T)))
+        {
+            // `Type.GetTypeCode(typeof(T).GetEnumUnderlyingType())` is not optimized away.
+            // Fortunately, `Unsafe.SizeOf<T>()` works and is optimized away.
+            // We don't need to know whether it's signed or unsigned.
+
+            if (Unsafe.SizeOf<T>() == 1)
+                return CreateFromInt(UnsafeAs<sbyte>(from));
+
+            if (Unsafe.SizeOf<T>() == 2)
+                return CreateFromInt(UnsafeAs<short>(from));
+
+            if (Unsafe.SizeOf<T>() == 4)
+                return CreateFromInt(UnsafeAs<int>(from));
+
+            if (Unsafe.SizeOf<T>() == 8)
+                return CreateFromInt(UnsafeAs<long>(from));
+
+            throw UnsupportedType<T>();
+        }
+
+        return GenericConversion<T>.ToVariant(from);
+    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
+    [SuppressMessage("ReSharper", "RedundantNameQualifier")]
+    public static T ConvertTo<[MustBeVariant] T>(in godot_variant variant)
+    {
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        static T UnsafeAsT<TFrom>(TFrom f) => Unsafe.As<TFrom, T>(ref Unsafe.AsRef(f));
+
+        if (typeof(T) == typeof(bool))
+            return UnsafeAsT(ConvertToBool(variant));
+
+        if (typeof(T) == typeof(char))
+            return UnsafeAsT(ConvertToChar(variant));
+
+        if (typeof(T) == typeof(sbyte))
+            return UnsafeAsT(ConvertToInt8(variant));
+
+        if (typeof(T) == typeof(short))
+            return UnsafeAsT(ConvertToInt16(variant));
+
+        if (typeof(T) == typeof(int))
+            return UnsafeAsT(ConvertToInt32(variant));
+
+        if (typeof(T) == typeof(long))
+            return UnsafeAsT(ConvertToInt64(variant));
+
+        if (typeof(T) == typeof(byte))
+            return UnsafeAsT(ConvertToUInt8(variant));
+
+        if (typeof(T) == typeof(ushort))
+            return UnsafeAsT(ConvertToUInt16(variant));
+
+        if (typeof(T) == typeof(uint))
+            return UnsafeAsT(ConvertToUInt32(variant));
+
+        if (typeof(T) == typeof(ulong))
+            return UnsafeAsT(ConvertToUInt64(variant));
+
+        if (typeof(T) == typeof(float))
+            return UnsafeAsT(ConvertToFloat32(variant));
+
+        if (typeof(T) == typeof(double))
+            return UnsafeAsT(ConvertToFloat64(variant));
+
+        if (typeof(T) == typeof(Vector2))
+            return UnsafeAsT(ConvertToVector2(variant));
+
+        if (typeof(T) == typeof(Vector2i))
+            return UnsafeAsT(ConvertToVector2i(variant));
+
+        if (typeof(T) == typeof(Rect2))
+            return UnsafeAsT(ConvertToRect2(variant));
+
+        if (typeof(T) == typeof(Rect2i))
+            return UnsafeAsT(ConvertToRect2i(variant));
+
+        if (typeof(T) == typeof(Transform2D))
+            return UnsafeAsT(ConvertToTransform2D(variant));
+
+        if (typeof(T) == typeof(Vector3))
+            return UnsafeAsT(ConvertToVector3(variant));
+
+        if (typeof(T) == typeof(Vector3i))
+            return UnsafeAsT(ConvertToVector3i(variant));
+
+        if (typeof(T) == typeof(Basis))
+            return UnsafeAsT(ConvertToBasis(variant));
+
+        if (typeof(T) == typeof(Quaternion))
+            return UnsafeAsT(ConvertToQuaternion(variant));
+
+        if (typeof(T) == typeof(Transform3D))
+            return UnsafeAsT(ConvertToTransform3D(variant));
+
+        if (typeof(T) == typeof(Vector4))
+            return UnsafeAsT(ConvertToVector4(variant));
+
+        if (typeof(T) == typeof(Vector4i))
+            return UnsafeAsT(ConvertToVector4i(variant));
+
+        if (typeof(T) == typeof(AABB))
+            return UnsafeAsT(ConvertToAABB(variant));
+
+        if (typeof(T) == typeof(Color))
+            return UnsafeAsT(ConvertToColor(variant));
+
+        if (typeof(T) == typeof(Plane))
+            return UnsafeAsT(ConvertToPlane(variant));
+
+        if (typeof(T) == typeof(Callable))
+            return UnsafeAsT(ConvertToCallableManaged(variant));
+
+        if (typeof(T) == typeof(SignalInfo))
+            return UnsafeAsT(ConvertToSignalInfo(variant));
+
+        if (typeof(T) == typeof(string))
+            return UnsafeAsT(ConvertToStringObject(variant));
+
+        if (typeof(T) == typeof(byte[]))
+            return UnsafeAsT(ConvertAsPackedByteArrayToSystemArray(variant));
+
+        if (typeof(T) == typeof(int[]))
+            return UnsafeAsT(ConvertAsPackedInt32ArrayToSystemArray(variant));
+
+        if (typeof(T) == typeof(long[]))
+            return UnsafeAsT(ConvertAsPackedInt64ArrayToSystemArray(variant));
+
+        if (typeof(T) == typeof(float[]))
+            return UnsafeAsT(ConvertAsPackedFloat32ArrayToSystemArray(variant));
+
+        if (typeof(T) == typeof(double[]))
+            return UnsafeAsT(ConvertAsPackedFloat64ArrayToSystemArray(variant));
+
+        if (typeof(T) == typeof(string[]))
+            return UnsafeAsT(ConvertAsPackedStringArrayToSystemArray(variant));
+
+        if (typeof(T) == typeof(Vector2[]))
+            return UnsafeAsT(ConvertAsPackedVector2ArrayToSystemArray(variant));
+
+        if (typeof(T) == typeof(Vector3[]))
+            return UnsafeAsT(ConvertAsPackedVector3ArrayToSystemArray(variant));
+
+        if (typeof(T) == typeof(Color[]))
+            return UnsafeAsT(ConvertAsPackedColorArrayToSystemArray(variant));
+
+        if (typeof(T) == typeof(StringName[]))
+            return UnsafeAsT(ConvertToSystemArrayOfStringName(variant));
+
+        if (typeof(T) == typeof(NodePath[]))
+            return UnsafeAsT(ConvertToSystemArrayOfNodePath(variant));
+
+        if (typeof(T) == typeof(RID[]))
+            return UnsafeAsT(ConvertToSystemArrayOfRID(variant));
+
+        if (typeof(T) == typeof(StringName))
+            return UnsafeAsT(ConvertToStringNameObject(variant));
+
+        if (typeof(T) == typeof(NodePath))
+            return UnsafeAsT(ConvertToNodePathObject(variant));
+
+        if (typeof(T) == typeof(RID))
+            return UnsafeAsT(ConvertToRID(variant));
+
+        if (typeof(T) == typeof(Godot.Collections.Dictionary))
+            return UnsafeAsT(ConvertToDictionaryObject(variant));
+
+        if (typeof(T) == typeof(Godot.Collections.Array))
+            return UnsafeAsT(ConvertToArrayObject(variant));
+
+        if (typeof(T) == typeof(Variant))
+            return UnsafeAsT(Variant.CreateCopyingBorrowed(variant));
+
+        // More complex checks here at the end, to avoid screwing the simple ones in case they're not optimized away.
+
+        // `typeof(X).IsAssignableFrom(typeof(T))` is optimized away
+
+        if (typeof(Godot.Object).IsAssignableFrom(typeof(T)))
+            return (T)(object)ConvertToGodotObject(variant);
+
+        // `typeof(T).IsValueType` is optimized away
+        // `typeof(T).IsEnum` is NOT optimized away: https://github.com/dotnet/runtime/issues/67113
+        // Fortunately, `typeof(System.Enum).IsAssignableFrom(typeof(T))` does the job!
+
+        if (typeof(T).IsValueType && typeof(System.Enum).IsAssignableFrom(typeof(T)))
+        {
+            // `Type.GetTypeCode(typeof(T).GetEnumUnderlyingType())` is not optimized away.
+            // Fortunately, `Unsafe.SizeOf<T>()` works and is optimized away.
+            // We don't need to know whether it's signed or unsigned.
+
+            if (Unsafe.SizeOf<T>() == 1)
+                return UnsafeAsT(ConvertToInt8(variant));
+
+            if (Unsafe.SizeOf<T>() == 2)
+                return UnsafeAsT(ConvertToInt16(variant));
+
+            if (Unsafe.SizeOf<T>() == 4)
+                return UnsafeAsT(ConvertToInt32(variant));
+
+            if (Unsafe.SizeOf<T>() == 8)
+                return UnsafeAsT(ConvertToInt64(variant));
+
+            throw UnsupportedType<T>();
+        }
+
+        return GenericConversion<T>.FromVariant(variant);
+    }
+}
diff --git a/modules/mono/glue/GodotSharp/GodotSharp/Core/Projection.cs b/modules/mono/glue/GodotSharp/GodotSharp/Core/Projection.cs
index 371729ebec..8b1b73fcc3 100644
--- a/modules/mono/glue/GodotSharp/GodotSharp/Core/Projection.cs
+++ b/modules/mono/glue/GodotSharp/GodotSharp/Core/Projection.cs
@@ -3,6 +3,14 @@ using System.Runtime.InteropServices;
 
 namespace Godot
 {
+    /// <summary>
+    /// A 4x4 matrix used for 3D projective transformations. It can represent transformations such as
+    /// translation, rotation, scaling, shearing, and perspective division. It consists of four
+    /// <see cref="Vector4"/> columns.
+    /// For purely linear transformations (translation, rotation, and scale), it is recommended to use
+    /// <see cref="Transform3D"/>, as it is more performant and has a lower memory footprint.
+    /// Used internally as <see cref="Camera3D"/>'s projection matrix.
+    /// </summary>
     [Serializable]
     [StructLayout(LayoutKind.Sequential)]
     public struct Projection : IEquatable<Projection>
@@ -59,48 +67,107 @@ namespace Godot
         public Vector4 w;
 
         /// <summary>
-        /// Constructs a projection from 4 vectors (matrix columns).
+        /// Access whole columns in the form of <see cref="Vector4"/>.
         /// </summary>
-        /// <param name="x">The X column, or column index 0.</param>
-        /// <param name="y">The Y column, or column index 1.</param>
-        /// <param name="z">The Z column, or column index 2.</param>
-        /// <param name="w">The W column, or column index 3.</param>
-        public Projection(Vector4 x, Vector4 y, Vector4 z, Vector4 w)
+        /// <param name="column">Which column vector.</param>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// <paramref name="column"/> is not 0, 1, 2 or 3.
+        /// </exception>
+        public Vector4 this[int column]
         {
-            this.x = x;
-            this.y = y;
-            this.z = z;
-            this.w = w;
+            readonly get
+            {
+                switch (column)
+                {
+                    case 0:
+                        return x;
+                    case 1:
+                        return y;
+                    case 2:
+                        return z;
+                    case 3:
+                        return w;
+                    default:
+                        throw new ArgumentOutOfRangeException(nameof(column));
+                }
+            }
+            set
+            {
+                switch (column)
+                {
+                    case 0:
+                        x = value;
+                        return;
+                    case 1:
+                        y = value;
+                        return;
+                    case 2:
+                        z = value;
+                        return;
+                    case 3:
+                        w = value;
+                        return;
+                    default:
+                        throw new ArgumentOutOfRangeException(nameof(column));
+                }
+            }
         }
 
         /// <summary>
-        /// Constructs a new <see cref="Projection"/> from a <see cref="Transform3D"/>.
+        /// Access single values.
         /// </summary>
-        /// <param name="transform">The <see cref="Transform3D"/>.</param>
-        public Projection(Transform3D transform)
+        /// <param name="column">Which column vector.</param>
+        /// <param name="row">Which row of the column.</param>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// <paramref name="column"/> or <paramref name="row"/> are not 0, 1, 2 or 3.
+        /// </exception>
+        public real_t this[int column, int row]
         {
-            x = new Vector4(transform.basis.Row0.x, transform.basis.Row1.x, transform.basis.Row2.x, 0);
-            y = new Vector4(transform.basis.Row0.y, transform.basis.Row1.y, transform.basis.Row2.y, 0);
-            z = new Vector4(transform.basis.Row0.z, transform.basis.Row1.z, transform.basis.Row2.z, 0);
-            w = new Vector4(transform.origin.x, transform.origin.y, transform.origin.z, 1);
+            readonly get
+            {
+                switch (column)
+                {
+                    case 0:
+                        return x[row];
+                    case 1:
+                        return y[row];
+                    case 2:
+                        return z[row];
+                    case 3:
+                        return w[row];
+                    default:
+                        throw new ArgumentOutOfRangeException(nameof(column));
+                }
+            }
+            set
+            {
+                switch (column)
+                {
+                    case 0:
+                        x[row] = value;
+                        return;
+                    case 1:
+                        y[row] = value;
+                        return;
+                    case 2:
+                        z[row] = value;
+                        return;
+                    case 3:
+                        w[row] = value;
+                        return;
+                    default:
+                        throw new ArgumentOutOfRangeException(nameof(column));
+                }
+            }
         }
 
         /// <summary>
-        /// Constructs a new <see cref="Transform3D"/> from the <see cref="Projection"/>.
+        /// Creates a new <see cref="Projection"/> that projects positions from a depth range of
+        /// <c>-1</c> to <c>1</c> to one that ranges from <c>0</c> to <c>1</c>, and flips the projected
+        /// positions vertically, according to <paramref name="flipY"/>.
         /// </summary>
-        /// <param name="proj">The <see cref="Projection"/>.</param>
-        public static explicit operator Transform3D(Projection proj)
-        {
-            return new Transform3D(
-                new Basis(
-                    new Vector3(proj.x.x, proj.x.y, proj.x.z),
-                    new Vector3(proj.y.x, proj.y.y, proj.y.z),
-                    new Vector3(proj.z.x, proj.z.y, proj.z.z)
-                ),
-                new Vector3(proj.w.x, proj.w.y, proj.w.z)
-            );
-        }
-
+        /// <param name="flipY">If the projection should be flipped vertically.</param>
+        /// <returns>The created projection.</returns>
         public static Projection CreateDepthCorrection(bool flipY)
         {
             return new Projection(
@@ -111,6 +178,12 @@ namespace Godot
             );
         }
 
+        /// <summary>
+        /// Creates a new <see cref="Projection"/> that scales a given projection to fit around
+        /// a given <see cref="AABB"/> in projection space.
+        /// </summary>
+        /// <param name="aabb">The AABB to fit the projection around.</param>
+        /// <returns>The created projection.</returns>
         public static Projection CreateFitAabb(AABB aabb)
         {
             Vector3 min = aabb.Position;
@@ -124,6 +197,25 @@ namespace Godot
             );
         }
 
+        /// <summary>
+        /// Creates a new <see cref="Projection"/> for projecting positions onto a head-mounted display with
+        /// the given X:Y aspect ratio, distance between eyes, display width, distance to lens, oversampling factor,
+        /// and depth clipping planes.
+        /// <paramref name="eye"/> creates the projection for the left eye when set to 1,
+        /// or the right eye when set to 2.
+        /// </summary>
+        /// <param name="eye">
+        /// The eye to create the projection for.
+        /// The left eye when set to 1, the right eye when set to 2.
+        /// </param>
+        /// <param name="aspect">The aspect ratio.</param>
+        /// <param name="intraocularDist">The distance between the eyes.</param>
+        /// <param name="displayWidth">The display width.</param>
+        /// <param name="displayToLens">The distance to the lens.</param>
+        /// <param name="oversample">The oversampling factor.</param>
+        /// <param name="zNear">The near clipping distance.</param>
+        /// <param name="zFar">The far clipping distance.</param>
+        /// <returns>The created projection.</returns>
         public static Projection CreateForHmd(int eye, real_t aspect, real_t intraocularDist, real_t displayWidth, real_t displayToLens, real_t oversample, real_t zNear, real_t zFar)
         {
             real_t f1 = (intraocularDist * (real_t)0.5) / displayToLens;
@@ -148,6 +240,17 @@ namespace Godot
             }
         }
 
+        /// <summary>
+        /// Creates a new <see cref="Projection"/> that projects positions in a frustum with
+        /// the given clipping planes.
+        /// </summary>
+        /// <param name="left">The left clipping distance.</param>
+        /// <param name="right">The right clipping distance.</param>
+        /// <param name="bottom">The bottom clipping distance.</param>
+        /// <param name="top">The top clipping distance.</param>
+        /// <param name="near">The near clipping distance.</param>
+        /// <param name="far">The far clipping distance.</param>
+        /// <returns>The created projection.</returns>
         public static Projection CreateFrustum(real_t left, real_t right, real_t bottom, real_t top, real_t near, real_t far)
         {
             if (right <= left)
@@ -179,6 +282,18 @@ namespace Godot
             );
         }
 
+        /// <summary>
+        /// Creates a new <see cref="Projection"/> that projects positions in a frustum with
+        /// the given size, X:Y aspect ratio, offset, and clipping planes.
+        /// <paramref name="flipFov"/> determines whether the projection's field of view is flipped over its diagonal.
+        /// </summary>
+        /// <param name="size">The frustum size.</param>
+        /// <param name="aspect">The aspect ratio.</param>
+        /// <param name="offset">The offset to apply.</param>
+        /// <param name="near">The near clipping distance.</param>
+        /// <param name="far">The far clipping distance.</param>
+        /// <param name="flipFov">If the field of view is flipped over the projection's diagonal.</param>
+        /// <returns>The created projection.</returns>
         public static Projection CreateFrustumAspect(real_t size, real_t aspect, Vector2 offset, real_t near, real_t far, bool flipFov)
         {
             if (!flipFov)
@@ -188,6 +303,11 @@ namespace Godot
             return CreateFrustum(-size / 2 + offset.x, +size / 2 + offset.x, -size / aspect / 2 + offset.y, +size / aspect / 2 + offset.y, near, far);
         }
 
+        /// <summary>
+        /// Creates a new <see cref="Projection"/> that projects positions into the given <see cref="Rect2"/>.
+        /// </summary>
+        /// <param name="rect">The Rect2 to project positions into.</param>
+        /// <returns>The created projection.</returns>
         public static Projection CreateLightAtlasRect(Rect2 rect)
         {
             return new Projection(
@@ -198,6 +318,17 @@ namespace Godot
             );
         }
 
+        /// <summary>
+        /// Creates a new <see cref="Projection"/> that projects positions using an orthogonal projection with
+        /// the given clipping planes.
+        /// </summary>
+        /// <param name="left">The left clipping distance.</param>
+        /// <param name="right">The right clipping distance.</param>
+        /// <param name="bottom">The bottom clipping distance.</param>
+        /// <param name="top">The top clipping distance.</param>
+        /// <param name="zNear">The near clipping distance.</param>
+        /// <param name="zFar">The far clipping distance.</param>
+        /// <returns>The created projection.</returns>
         public static Projection CreateOrthogonal(real_t left, real_t right, real_t bottom, real_t top, real_t zNear, real_t zFar)
         {
             Projection proj = Projection.Identity;
@@ -211,6 +342,17 @@ namespace Godot
             return proj;
         }
 
+        /// <summary>
+        /// Creates a new <see cref="Projection"/> that projects positions using an orthogonal projection with
+        /// the given size, X:Y aspect ratio, and clipping planes.
+        /// <paramref name="flipFov"/> determines whether the projection's field of view is flipped over its diagonal.
+        /// </summary>
+        /// <param name="size">The frustum size.</param>
+        /// <param name="aspect">The aspect ratio.</param>
+        /// <param name="zNear">The near clipping distance.</param>
+        /// <param name="zFar">The far clipping distance.</param>
+        /// <param name="flipFov">If the field of view is flipped over the projection's diagonal.</param>
+        /// <returns>The created projection.</returns>
         public static Projection CreateOrthogonalAspect(real_t size, real_t aspect, real_t zNear, real_t zFar, bool flipFov)
         {
             if (!flipFov)
@@ -220,6 +362,17 @@ namespace Godot
             return CreateOrthogonal(-size / 2, +size / 2, -size / aspect / 2, +size / aspect / 2, zNear, zFar);
         }
 
+        /// <summary>
+        /// Creates a new <see cref="Projection"/> that projects positions using a perspective projection with
+        /// the given Y-axis field of view (in degrees), X:Y aspect ratio, and clipping planes.
+        /// <paramref name="flipFov"/> determines whether the projection's field of view is flipped over its diagonal.
+        /// </summary>
+        /// <param name="fovyDegrees">The vertical field of view (in degrees).</param>
+        /// <param name="aspect">The aspect ratio.</param>
+        /// <param name="zNear">The near clipping distance.</param>
+        /// <param name="zFar">The far clipping distance.</param>
+        /// <param name="flipFov">If the field of view is flipped over the projection's diagonal.</param>
+        /// <returns>The created projection.</returns>
         public static Projection CreatePerspective(real_t fovyDegrees, real_t aspect, real_t zNear, real_t zFar, bool flipFov)
         {
             if (flipFov)
@@ -249,6 +402,27 @@ namespace Godot
             return proj;
         }
 
+        /// <summary>
+        /// Creates a new <see cref="Projection"/> that projects positions using a perspective projection with
+        /// the given Y-axis field of view (in degrees), X:Y aspect ratio, and clipping distances.
+        /// The projection is adjusted for a head-mounted display with the given distance between eyes and distance
+        /// to a point that can be focused on.
+        /// <paramref name="eye"/> creates the projection for the left eye when set to 1,
+        /// or the right eye when set to 2.
+        /// <paramref name="flipFov"/> determines whether the projection's field of view is flipped over its diagonal.
+        /// </summary>
+        /// <param name="fovyDegrees">The vertical field of view (in degrees).</param>
+        /// <param name="aspect">The aspect ratio.</param>
+        /// <param name="zNear">The near clipping distance.</param>
+        /// <param name="zFar">The far clipping distance.</param>
+        /// <param name="flipFov">If the field of view is flipped over the projection's diagonal.</param>
+        /// <param name="eye">
+        /// The eye to create the projection for.
+        /// The left eye when set to 1, the right eye when set to 2.
+        /// </param>
+        /// <param name="intraocularDist">The distance between the eyes.</param>
+        /// <param name="convergenceDist">The distance to a point of convergence that can be focused on.</param>
+        /// <returns>The created projection.</returns>
         public static Projection CreatePerspectiveHmd(real_t fovyDegrees, real_t aspect, real_t zNear, real_t zFar, bool flipFov, int eye, real_t intraocularDist, real_t convergenceDist)
         {
             if (flipFov)
@@ -286,6 +460,13 @@ namespace Godot
             return proj * cm;
         }
 
+        /// <summary>
+        /// Returns a scalar value that is the signed factor by which areas are scaled by this matrix.
+        /// If the sign is negative, the matrix flips the orientation of the area.
+        /// The determinant can be used to calculate the invertibility of a matrix or solve linear systems
+        /// of equations involving the matrix, among other applications.
+        /// </summary>
+        /// <returns>The determinant calculated from this projection.</returns>
         public readonly real_t Determinant()
         {
             return x.w * y.z * z.y * w.x - x.z * y.w * z.y * w.x -
@@ -302,12 +483,20 @@ namespace Godot
                    x.y * y.x * z.z * w.w + x.x * y.y * z.z * w.w;
         }
 
+        /// <summary>
+        /// Returns the X:Y aspect ratio of this <see cref="Projection"/>'s viewport.
+        /// </summary>
+        /// <returns>The aspect ratio from this projection's viewport.</returns>
         public readonly real_t GetAspect()
         {
             Vector2 vpHe = GetViewportHalfExtents();
             return vpHe.x / vpHe.y;
         }
 
+        /// <summary>
+        /// Returns the horizontal field of view of the projection (in degrees).
+        /// </summary>
+        /// <returns>The horizontal field of view of this projection.</returns>
         public readonly real_t GetFov()
         {
             Plane rightPlane = new Plane(x.w - x.x, y.w - y.x, z.w - z.x, -w.w + w.x).Normalized();
@@ -322,11 +511,22 @@ namespace Godot
             }
         }
 
+        /// <summary>
+        /// Returns the vertical field of view of the projection (in degrees) associated with
+        /// the given horizontal field of view (in degrees) and aspect ratio.
+        /// </summary>
+        /// <param name="fovx">The horizontal field of view (in degrees).</param>
+        /// <param name="aspect">The aspect ratio.</param>
+        /// <returns>The vertical field of view of this projection.</returns>
         public static real_t GetFovy(real_t fovx, real_t aspect)
         {
             return Mathf.RadToDeg(Mathf.Atan(aspect * Mathf.Tan(Mathf.DegToRad(fovx) * (real_t)0.5)) * (real_t)2.0);
         }
 
+        /// <summary>
+        /// Returns the factor by which the visible level of detail is scaled by this <see cref="Projection"/>.
+        /// </summary>
+        /// <returns>The level of detail factor for this projection.</returns>
         public readonly real_t GetLodMultiplier()
         {
             if (IsOrthogonal())
@@ -341,6 +541,12 @@ namespace Godot
             }
         }
 
+        /// <summary>
+        /// Returns the number of pixels with the given pixel width displayed per meter, after
+        /// this <see cref="Projection"/> is applied.
+        /// </summary>
+        /// <param name="forPixelWidth">The width for each pixel (in meters).</param>
+        /// <returns>The number of pixels per meter.</returns>
         public readonly int GetPixelsPerMeter(int forPixelWidth)
         {
             Vector3 result = this * new Vector3(1, 0, -1);
@@ -348,6 +554,15 @@ namespace Godot
             return (int)((result.x * (real_t)0.5 + (real_t)0.5) * forPixelWidth);
         }
 
+        /// <summary>
+        /// Returns the clipping plane of this <see cref="Projection"/> whose index is given
+        /// by <paramref name="plane"/>.
+        /// <paramref name="plane"/> should be equal to one of <see cref="Planes.Near"/>,
+        /// <see cref="Planes.Far"/>, <see cref="Planes.Left"/>, <see cref="Planes.Top"/>,
+        /// <see cref="Planes.Right"/>, or <see cref="Planes.Bottom"/>.
+        /// </summary>
+        /// <param name="plane">The kind of clipping plane to get from the projection.</param>
+        /// <returns>The clipping plane of this projection.</returns>
         public readonly Plane GetProjectionPlane(Planes plane)
         {
             Plane newPlane = plane switch
@@ -364,28 +579,49 @@ namespace Godot
             return newPlane.Normalized();
         }
 
+        /// <summary>
+        /// Returns the dimensions of the far clipping plane of the projection, divided by two.
+        /// </summary>
+        /// <returns>The half extents for this projection's far plane.</returns>
         public readonly Vector2 GetFarPlaneHalfExtents()
         {
             var res = GetProjectionPlane(Planes.Far).Intersect3(GetProjectionPlane(Planes.Right), GetProjectionPlane(Planes.Top));
             return new Vector2(res.Value.x, res.Value.y);
         }
 
+        /// <summary>
+        /// Returns the dimensions of the viewport plane that this <see cref="Projection"/>
+        /// projects positions onto, divided by two.
+        /// </summary>
+        /// <returns>The half extents for this projection's viewport plane.</returns>
         public readonly Vector2 GetViewportHalfExtents()
         {
             var res = GetProjectionPlane(Planes.Near).Intersect3(GetProjectionPlane(Planes.Right), GetProjectionPlane(Planes.Top));
             return new Vector2(res.Value.x, res.Value.y);
         }
 
+        /// <summary>
+        /// Returns the distance for this <see cref="Projection"/> beyond which positions are clipped.
+        /// </summary>
+        /// <returns>The distance beyond which positions are clipped.</returns>
         public readonly real_t GetZFar()
         {
             return GetProjectionPlane(Planes.Far).D;
         }
 
+        /// <summary>
+        /// Returns the distance for this <see cref="Projection"/> before which positions are clipped.
+        /// </summary>
+        /// <returns>The distance before which positions are clipped.</returns>
         public readonly real_t GetZNear()
         {
             return -GetProjectionPlane(Planes.Near).D;
         }
 
+        /// <summary>
+        /// Returns a copy of this <see cref="Projection"/> with the signs of the values of the Y column flipped.
+        /// </summary>
+        /// <returns>The flipped projection.</returns>
         public readonly Projection FlippedY()
         {
             Projection proj = this;
@@ -393,6 +629,13 @@ namespace Godot
             return proj;
         }
 
+        /// <summary>
+        /// Returns a <see cref="Projection"/> with the near clipping distance adjusted to be
+        /// <paramref name="newZNear"/>.
+        /// Note: The original <see cref="Projection"/> must be a perspective projection.
+        /// </summary>
+        /// <param name="newZNear">The near clipping distance to adjust the projection to.</param>
+        /// <returns>The adjusted projection.</returns>
         public readonly Projection PerspectiveZNearAdjusted(real_t newZNear)
         {
             Projection proj = this;
@@ -404,6 +647,12 @@ namespace Godot
             return proj;
         }
 
+        /// <summary>
+        /// Returns a <see cref="Projection"/> with the X and Y values from the given <see cref="Vector2"/>
+        /// added to the first and second values of the final column respectively.
+        /// </summary>
+        /// <param name="offset">The offset to apply to the projection.</param>
+        /// <returns>The offseted projection.</returns>
         public readonly Projection JitterOffseted(Vector2 offset)
         {
             Projection proj = this;
@@ -412,6 +661,11 @@ namespace Godot
             return proj;
         }
 
+        /// <summary>
+        /// Returns a <see cref="Projection"/> that performs the inverse of this <see cref="Projection"/>'s
+        /// projective transformation.
+        /// </summary>
+        /// <returns>The inverted projection.</returns>
         public readonly Projection Inverse()
         {
             Projection proj = this;
@@ -535,11 +789,70 @@ namespace Godot
             return proj;
         }
 
+        /// <summary>
+        /// Returns <see langword="true"/> if this <see cref="Projection"/> performs an orthogonal projection.
+        /// </summary>
+        /// <returns>If the projection performs an orthogonal projection.</returns>
         public readonly bool IsOrthogonal()
         {
             return w.w == (real_t)1.0;
         }
 
+        // Constants
+        private static readonly Projection _zero = new Projection(
+            new Vector4(0, 0, 0, 0),
+            new Vector4(0, 0, 0, 0),
+            new Vector4(0, 0, 0, 0),
+            new Vector4(0, 0, 0, 0)
+        );
+        private static readonly Projection _identity = new Projection(
+            new Vector4(1, 0, 0, 0),
+            new Vector4(0, 1, 0, 0),
+            new Vector4(0, 0, 1, 0),
+            new Vector4(0, 0, 0, 1)
+        );
+
+        /// <summary>
+        /// Zero projection, a projection with all components set to <c>0</c>.
+        /// </summary>
+        /// <value>Equivalent to <c>new Projection(Vector4.Zero, Vector4.Zero, Vector4.Zero, Vector4.Zero)</c>.</value>
+        public static Projection Zero { get { return _zero; } }
+
+        /// <summary>
+        /// The identity projection, with no distortion applied.
+        /// This is used as a replacement for <c>Projection()</c> in GDScript.
+        /// Do not use <c>new Projection()</c> with no arguments in C#, because it sets all values to zero.
+        /// </summary>
+        /// <value>Equivalent to <c>new Projection(new Vector4(1, 0, 0, 0), new Vector4(0, 1, 0, 0), new Vector4(0, 0, 1, 0), new Vector4(0, 0, 0, 1))</c>.</value>
+        public static Projection Identity { get { return _identity; } }
+
+        /// <summary>
+        /// Constructs a projection from 4 vectors (matrix columns).
+        /// </summary>
+        /// <param name="x">The X column, or column index 0.</param>
+        /// <param name="y">The Y column, or column index 1.</param>
+        /// <param name="z">The Z column, or column index 2.</param>
+        /// <param name="w">The W column, or column index 3.</param>
+        public Projection(Vector4 x, Vector4 y, Vector4 z, Vector4 w)
+        {
+            this.x = x;
+            this.y = y;
+            this.z = z;
+            this.w = w;
+        }
+
+        /// <summary>
+        /// Constructs a new <see cref="Projection"/> from a <see cref="Transform3D"/>.
+        /// </summary>
+        /// <param name="transform">The <see cref="Transform3D"/>.</param>
+        public Projection(Transform3D transform)
+        {
+            x = new Vector4(transform.basis.Row0.x, transform.basis.Row1.x, transform.basis.Row2.x, 0);
+            y = new Vector4(transform.basis.Row0.y, transform.basis.Row1.y, transform.basis.Row2.y, 0);
+            z = new Vector4(transform.basis.Row0.z, transform.basis.Row1.z, transform.basis.Row2.z, 0);
+            w = new Vector4(transform.origin.x, transform.origin.y, transform.origin.z, 1);
+        }
+
         /// <summary>
         /// Composes these two projections by multiplying them
         /// together. This has the effect of applying the right
@@ -646,127 +959,41 @@ namespace Godot
         }
 
         /// <summary>
-        /// Access whole columns in the form of <see cref="Vector4"/>.
+        /// Constructs a new <see cref="Transform3D"/> from the <see cref="Projection"/>.
         /// </summary>
-        /// <param name="column">Which column vector.</param>
-        /// <exception cref="ArgumentOutOfRangeException">
-        /// <paramref name="column"/> is not 0, 1, 2 or 3.
-        /// </exception>
-        public Vector4 this[int column]
+        /// <param name="proj">The <see cref="Projection"/>.</param>
+        public static explicit operator Transform3D(Projection proj)
         {
-            readonly get
-            {
-                switch (column)
-                {
-                    case 0:
-                        return x;
-                    case 1:
-                        return y;
-                    case 2:
-                        return z;
-                    case 3:
-                        return w;
-                    default:
-                        throw new ArgumentOutOfRangeException(nameof(column));
-                }
-            }
-            set
-            {
-                switch (column)
-                {
-                    case 0:
-                        x = value;
-                        return;
-                    case 1:
-                        y = value;
-                        return;
-                    case 2:
-                        z = value;
-                        return;
-                    case 3:
-                        w = value;
-                        return;
-                    default:
-                        throw new ArgumentOutOfRangeException(nameof(column));
-                }
-            }
+            return new Transform3D(
+                new Basis(
+                    new Vector3(proj.x.x, proj.x.y, proj.x.z),
+                    new Vector3(proj.y.x, proj.y.y, proj.y.z),
+                    new Vector3(proj.z.x, proj.z.y, proj.z.z)
+                ),
+                new Vector3(proj.w.x, proj.w.y, proj.w.z)
+            );
         }
 
         /// <summary>
-        /// Access single values.
+        /// Returns <see langword="true"/> if the projection is exactly equal
+        /// to the given object (<see paramref="obj"/>).
         /// </summary>
-        /// <param name="column">Which column vector.</param>
-        /// <param name="row">Which row of the column.</param>
-        /// <exception cref="ArgumentOutOfRangeException">
-        /// <paramref name="column"/> or <paramref name="row"/> are not 0, 1, 2 or 3.
-        /// </exception>
-        public real_t this[int column, int row]
+        /// <param name="obj">The object to compare with.</param>
+        /// <returns>Whether or not the vector and the object are equal.</returns>
+        public override readonly bool Equals(object obj)
         {
-            readonly get
-            {
-                switch (column)
-                {
-                    case 0:
-                        return x[row];
-                    case 1:
-                        return y[row];
-                    case 2:
-                        return z[row];
-                    case 3:
-                        return w[row];
-                    default:
-                        throw new ArgumentOutOfRangeException(nameof(column));
-                }
-            }
-            set
-            {
-                switch (column)
-                {
-                    case 0:
-                        x[row] = value;
-                        return;
-                    case 1:
-                        y[row] = value;
-                        return;
-                    case 2:
-                        z[row] = value;
-                        return;
-                    case 3:
-                        w[row] = value;
-                        return;
-                    default:
-                        throw new ArgumentOutOfRangeException(nameof(column));
-                }
-            }
+            return obj is Projection other && Equals(other);
         }
 
-        // Constants
-        private static readonly Projection _zero = new Projection(
-            new Vector4(0, 0, 0, 0),
-            new Vector4(0, 0, 0, 0),
-            new Vector4(0, 0, 0, 0),
-            new Vector4(0, 0, 0, 0)
-        );
-        private static readonly Projection _identity = new Projection(
-            new Vector4(1, 0, 0, 0),
-            new Vector4(0, 1, 0, 0),
-            new Vector4(0, 0, 1, 0),
-            new Vector4(0, 0, 0, 1)
-        );
-
-        /// <summary>
-        /// Zero projection, a projection with all components set to <c>0</c>.
-        /// </summary>
-        /// <value>Equivalent to <c>new Projection(Vector4.Zero, Vector4.Zero, Vector4.Zero, Vector4.Zero)</c>.</value>
-        public static Projection Zero { get { return _zero; } }
-
         /// <summary>
-        /// The identity projection, with no distortion applied.
-        /// This is used as a replacement for <c>Projection()</c> in GDScript.
-        /// Do not use <c>new Projection()</c> with no arguments in C#, because it sets all values to zero.
+        /// Returns <see langword="true"/> if the projections are exactly equal.
         /// </summary>
-        /// <value>Equivalent to <c>new Projection(new Vector4(1, 0, 0, 0), new Vector4(0, 1, 0, 0), new Vector4(0, 0, 1, 0), new Vector4(0, 0, 0, 1))</c>.</value>
-        public static Projection Identity { get { return _identity; } }
+        /// <param name="other">The other projection.</param>
+        /// <returns>Whether or not the projections are exactly equal.</returns>
+        public readonly bool Equals(Projection other)
+        {
+            return x == other.x && y == other.y && z == other.z && w == other.w;
+        }
 
         /// <summary>
         /// Serves as the hash function for <see cref="Projection"/>.
@@ -797,26 +1024,5 @@ namespace Godot
                 $"{z.x.ToString(format)}, {z.y.ToString(format)}, {z.z.ToString(format)}, {z.w.ToString(format)}\n" +
                 $"{w.x.ToString(format)}, {w.y.ToString(format)}, {w.z.ToString(format)}, {w.w.ToString(format)}\n";
         }
-
-        /// <summary>
-        /// Returns <see langword="true"/> if the projection is exactly equal
-        /// to the given object (<see paramref="obj"/>).
-        /// </summary>
-        /// <param name="obj">The object to compare with.</param>
-        /// <returns>Whether or not the vector and the object are equal.</returns>
-        public override readonly bool Equals(object obj)
-        {
-            return obj is Projection other && Equals(other);
-        }
-
-        /// <summary>
-        /// Returns <see langword="true"/> if the projections are exactly equal.
-        /// </summary>
-        /// <param name="other">The other projection.</param>
-        /// <returns>Whether or not the projections are exactly equal.</returns>
-        public readonly bool Equals(Projection other)
-        {
-            return x == other.x && y == other.y && z == other.z && w == other.w;
-        }
     }
 }
diff --git a/modules/mono/glue/GodotSharp/GodotSharp/Variant.cs b/modules/mono/glue/GodotSharp/GodotSharp/Core/Variant.cs
index d354509dbf..237a4da364 100644
--- a/modules/mono/glue/GodotSharp/GodotSharp/Variant.cs
+++ b/modules/mono/glue/GodotSharp/GodotSharp/Core/Variant.cs
@@ -121,6 +121,14 @@ public partial struct Variant : IDisposable
     }
 
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Variant From<[MustBeVariant] T>(in T from) =>
+        CreateTakingOwnershipOfDisposableValue(VariantUtils.CreateFrom(from));
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public T As<[MustBeVariant] T>() =>
+        VariantUtils.ConvertTo<T>(NativeVar.DangerousSelfRef);
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
     public bool AsBool() =>
         VariantUtils.ConvertToBool((godot_variant)NativeVar);
 
diff --git a/modules/mono/glue/GodotSharp/GodotSharp/Core/Vector2.cs b/modules/mono/glue/GodotSharp/GodotSharp/Core/Vector2.cs
index 535391f447..c471eceded 100644
--- a/modules/mono/glue/GodotSharp/GodotSharp/Core/Vector2.cs
+++ b/modules/mono/glue/GodotSharp/GodotSharp/Core/Vector2.cs
@@ -240,7 +240,7 @@ namespace Godot
 
         /// <summary>
         /// Returns the point at the given <paramref name="t"/> on a one-dimensional Bezier curve defined by this vector
-        /// and the given <paramref name="control1"/>, <paramref name="control2"/> and <paramref name="end"/> points.
+        /// and the given <paramref name="control1"/>, <paramref name="control2"/>, and <paramref name="end"/> points.
         /// </summary>
         /// <param name="control1">Control point that defines the bezier curve.</param>
         /// <param name="control2">Control point that defines the bezier curve.</param>
@@ -249,14 +249,28 @@ namespace Godot
         /// <returns>The interpolated vector.</returns>
         public readonly Vector2 BezierInterpolate(Vector2 control1, Vector2 control2, Vector2 end, real_t t)
         {
-            // Formula from Wikipedia article on Bezier curves
-            real_t omt = 1 - t;
-            real_t omt2 = omt * omt;
-            real_t omt3 = omt2 * omt;
-            real_t t2 = t * t;
-            real_t t3 = t2 * t;
+            return new Vector2
+            (
+                Mathf.BezierInterpolate(x, control1.x, control2.x, end.x, t),
+                Mathf.BezierInterpolate(y, control1.y, control2.y, end.y, t)
+            );
+        }
 
-            return this * omt3 + control1 * omt2 * t * 3 + control2 * omt * t2 * 3 + end * t3;
+        /// <summary>
+        /// Returns the derivative at the given <paramref name="t"/> on the Bezier curve defined by this vector
+        /// and the given <paramref name="control1"/>, <paramref name="control2"/>, and <paramref name="end"/> points.
+        /// </summary>
+        /// <param name="control1">Control point that defines the bezier curve.</param>
+        /// <param name="control2">Control point that defines the bezier curve.</param>
+        /// <param name="end">The destination value for the interpolation.</param>
+        /// <param name="t">A value on the range of 0.0 to 1.0, representing the amount of interpolation.</param>
+        /// <returns>The resulting value of the interpolation.</returns>
+        public readonly Vector2 BezierDerivative(Vector2 control1, Vector2 control2, Vector2 end, real_t t)
+        {
+            return new Vector2(
+                Mathf.BezierDerivative(x, control1.x, control2.x, end.x, t),
+                Mathf.BezierDerivative(y, control1.y, control2.y, end.y, t)
+            );
         }
 
         /// <summary>
diff --git a/modules/mono/glue/GodotSharp/GodotSharp/Core/Vector3.cs b/modules/mono/glue/GodotSharp/GodotSharp/Core/Vector3.cs
index 53bd0b0908..fefdee33a5 100644
--- a/modules/mono/glue/GodotSharp/GodotSharp/Core/Vector3.cs
+++ b/modules/mono/glue/GodotSharp/GodotSharp/Core/Vector3.cs
@@ -234,7 +234,7 @@ namespace Godot
 
         /// <summary>
         /// Returns the point at the given <paramref name="t"/> on a one-dimensional Bezier curve defined by this vector
-        /// and the given <paramref name="control1"/>, <paramref name="control2"/> and <paramref name="end"/> points.
+        /// and the given <paramref name="control1"/>, <paramref name="control2"/>, and <paramref name="end"/> points.
         /// </summary>
         /// <param name="control1">Control point that defines the bezier curve.</param>
         /// <param name="control2">Control point that defines the bezier curve.</param>
@@ -243,14 +243,30 @@ namespace Godot
         /// <returns>The interpolated vector.</returns>
         public readonly Vector3 BezierInterpolate(Vector3 control1, Vector3 control2, Vector3 end, real_t t)
         {
-            // Formula from Wikipedia article on Bezier curves
-            real_t omt = 1 - t;
-            real_t omt2 = omt * omt;
-            real_t omt3 = omt2 * omt;
-            real_t t2 = t * t;
-            real_t t3 = t2 * t;
+            return new Vector3
+            (
+                Mathf.BezierInterpolate(x, control1.x, control2.x, end.x, t),
+                Mathf.BezierInterpolate(y, control1.y, control2.y, end.y, t),
+                Mathf.BezierInterpolate(z, control1.z, control2.z, end.z, t)
+            );
+        }
 
-            return this * omt3 + control1 * omt2 * t * 3 + control2 * omt * t2 * 3 + end * t3;
+        /// <summary>
+        /// Returns the derivative at the given <paramref name="t"/> on the Bezier curve defined by this vector
+        /// and the given <paramref name="control1"/>, <paramref name="control2"/>, and <paramref name="end"/> points.
+        /// </summary>
+        /// <param name="control1">Control point that defines the bezier curve.</param>
+        /// <param name="control2">Control point that defines the bezier curve.</param>
+        /// <param name="end">The destination value for the interpolation.</param>
+        /// <param name="t">A value on the range of 0.0 to 1.0, representing the amount of interpolation.</param>
+        /// <returns>The resulting value of the interpolation.</returns>
+        public readonly Vector3 BezierDerivative(Vector3 control1, Vector3 control2, Vector3 end, real_t t)
+        {
+            return new Vector3(
+                Mathf.BezierDerivative(x, control1.x, control2.x, end.x, t),
+                Mathf.BezierDerivative(y, control1.y, control2.y, end.y, t),
+                Mathf.BezierDerivative(z, control1.z, control2.z, end.y, t)
+            );
         }
 
         /// <summary>
diff --git a/modules/mono/glue/GodotSharp/GodotSharp/GodotSharp.csproj b/modules/mono/glue/GodotSharp/GodotSharp/GodotSharp.csproj
index e3fb254f49..85f7e36639 100644
--- a/modules/mono/glue/GodotSharp/GodotSharp/GodotSharp.csproj
+++ b/modules/mono/glue/GodotSharp/GodotSharp/GodotSharp.csproj
@@ -101,9 +101,9 @@
     <Compile Include="Core\NativeInterop\InteropUtils.cs" />
     <Compile Include="Core\NativeInterop\NativeFuncs.extended.cs" />
     <Compile Include="Core\NativeInterop\NativeVariantPtrArgs.cs" />
-    <Compile Include="Core\NativeInterop\VariantConversionCallbacks.cs" />
     <Compile Include="Core\NativeInterop\VariantSpanHelpers.cs" />
     <Compile Include="Core\NativeInterop\VariantUtils.cs" />
+    <Compile Include="Core\NativeInterop\VariantUtils.generic.cs" />
     <Compile Include="Core\NodePath.cs" />
     <Compile Include="Core\Object.base.cs" />
     <Compile Include="Core\Object.exceptions.cs" />
@@ -123,6 +123,7 @@
     <Compile Include="Core\StringName.cs" />
     <Compile Include="Core\Transform2D.cs" />
     <Compile Include="Core\Transform3D.cs" />
+    <Compile Include="Core\Variant.cs" />
     <Compile Include="Core\Vector2.cs" />
     <Compile Include="Core\Vector2i.cs" />
     <Compile Include="Core\Vector3.cs" />
@@ -131,7 +132,6 @@
     <Compile Include="Core\Vector4i.cs" />
     <Compile Include="GlobalUsings.cs" />
     <Compile Include="Properties\AssemblyInfo.cs" />
-    <Compile Include="Variant.cs" />
   </ItemGroup>
   <!--
   We import a props file with auto-generated includes. This works well with Rider.
diff --git a/modules/openxr/extensions/openxr_opengl_extension.cpp b/modules/openxr/extensions/openxr_opengl_extension.cpp
index ee69144123..569030cc11 100644
--- a/modules/openxr/extensions/openxr_opengl_extension.cpp
+++ b/modules/openxr/extensions/openxr_opengl_extension.cpp
@@ -160,16 +160,8 @@ void *OpenXROpenGLExtension::set_session_create_and_get_next_pointer(void *p_nex
 }
 
 void OpenXROpenGLExtension::get_usable_swapchain_formats(Vector<int64_t> &p_usable_swap_chains) {
-#ifdef WIN32
-	p_usable_swap_chains.push_back(GL_SRGB8_ALPHA8);
-	p_usable_swap_chains.push_back(GL_RGBA8);
-#elif ANDROID_ENABLED
 	p_usable_swap_chains.push_back(GL_SRGB8_ALPHA8);
 	p_usable_swap_chains.push_back(GL_RGBA8);
-#else
-	p_usable_swap_chains.push_back(GL_SRGB8_ALPHA8_EXT);
-	p_usable_swap_chains.push_back(GL_RGBA8_EXT);
-#endif
 }
 
 void OpenXROpenGLExtension::get_usable_depth_formats(Vector<int64_t> &p_usable_depth_formats) {
@@ -294,59 +286,7 @@ void OpenXROpenGLExtension::cleanup_swapchain_graphics_data(void **p_swapchain_g
 String OpenXROpenGLExtension::get_swapchain_format_name(int64_t p_swapchain_format) const {
 	// These are somewhat different per platform, will need to weed some stuff out...
 	switch (p_swapchain_format) {
-#ifdef WIN32
-		// using definitions from GLAD
-		ENUM_TO_STRING_CASE(GL_R8_SNORM)
-		ENUM_TO_STRING_CASE(GL_RG8_SNORM)
-		ENUM_TO_STRING_CASE(GL_RGB8_SNORM)
-		ENUM_TO_STRING_CASE(GL_RGBA8_SNORM)
-		ENUM_TO_STRING_CASE(GL_R16_SNORM)
-		ENUM_TO_STRING_CASE(GL_RG16_SNORM)
-		ENUM_TO_STRING_CASE(GL_RGB16_SNORM)
-		ENUM_TO_STRING_CASE(GL_RGBA16_SNORM)
-		ENUM_TO_STRING_CASE(GL_RGB4)
-		ENUM_TO_STRING_CASE(GL_RGB5)
-		ENUM_TO_STRING_CASE(GL_RGB8)
-		ENUM_TO_STRING_CASE(GL_RGB10)
-		ENUM_TO_STRING_CASE(GL_RGB12)
-		ENUM_TO_STRING_CASE(GL_RGB16)
-		ENUM_TO_STRING_CASE(GL_RGBA2)
-		ENUM_TO_STRING_CASE(GL_RGBA4)
-		ENUM_TO_STRING_CASE(GL_RGB5_A1)
-		ENUM_TO_STRING_CASE(GL_RGBA8)
-		ENUM_TO_STRING_CASE(GL_RGB10_A2)
-		ENUM_TO_STRING_CASE(GL_RGBA12)
-		ENUM_TO_STRING_CASE(GL_RGBA16)
-		ENUM_TO_STRING_CASE(GL_RGBA32F)
-		ENUM_TO_STRING_CASE(GL_RGB32F)
-		ENUM_TO_STRING_CASE(GL_RGBA16F)
-		ENUM_TO_STRING_CASE(GL_RGB16F)
-		ENUM_TO_STRING_CASE(GL_RGBA32UI)
-		ENUM_TO_STRING_CASE(GL_RGB32UI)
-		ENUM_TO_STRING_CASE(GL_RGBA16UI)
-		ENUM_TO_STRING_CASE(GL_RGB16UI)
-		ENUM_TO_STRING_CASE(GL_RGBA8UI)
-		ENUM_TO_STRING_CASE(GL_RGB8UI)
-		ENUM_TO_STRING_CASE(GL_RGBA32I)
-		ENUM_TO_STRING_CASE(GL_RGB32I)
-		ENUM_TO_STRING_CASE(GL_RGBA16I)
-		ENUM_TO_STRING_CASE(GL_RGB16I)
-		ENUM_TO_STRING_CASE(GL_RGBA8I)
-		ENUM_TO_STRING_CASE(GL_RGB8I)
-		ENUM_TO_STRING_CASE(GL_RGB10_A2UI)
-		ENUM_TO_STRING_CASE(GL_SRGB)
-		ENUM_TO_STRING_CASE(GL_SRGB8)
-		ENUM_TO_STRING_CASE(GL_SRGB_ALPHA)
-		ENUM_TO_STRING_CASE(GL_SRGB8_ALPHA8)
-		ENUM_TO_STRING_CASE(GL_DEPTH_COMPONENT16)
-		ENUM_TO_STRING_CASE(GL_DEPTH_COMPONENT24)
-		ENUM_TO_STRING_CASE(GL_DEPTH_COMPONENT32)
-		ENUM_TO_STRING_CASE(GL_DEPTH24_STENCIL8)
-		ENUM_TO_STRING_CASE(GL_R11F_G11F_B10F)
-		ENUM_TO_STRING_CASE(GL_DEPTH_COMPONENT32F)
-		ENUM_TO_STRING_CASE(GL_DEPTH32F_STENCIL8)
-
-#elif ANDROID_ENABLED
+#ifdef ANDROID_ENABLED
 		// using definitions from GLES3/gl3.h
 
 		ENUM_TO_STRING_CASE(GL_RGBA4)
@@ -418,44 +358,56 @@ String OpenXROpenGLExtension::get_swapchain_format_name(int64_t p_swapchain_form
 		ENUM_TO_STRING_CASE(GL_DEPTH24_STENCIL8)
 
 #else
-		// using definitions from GL/gl.h
-		ENUM_TO_STRING_CASE(GL_ALPHA4_EXT)
-		ENUM_TO_STRING_CASE(GL_ALPHA8_EXT)
-		ENUM_TO_STRING_CASE(GL_ALPHA12_EXT)
-		ENUM_TO_STRING_CASE(GL_ALPHA16_EXT)
-		ENUM_TO_STRING_CASE(GL_LUMINANCE4_EXT)
-		ENUM_TO_STRING_CASE(GL_LUMINANCE8_EXT)
-		ENUM_TO_STRING_CASE(GL_LUMINANCE12_EXT)
-		ENUM_TO_STRING_CASE(GL_LUMINANCE16_EXT)
-		ENUM_TO_STRING_CASE(GL_LUMINANCE4_ALPHA4_EXT)
-		ENUM_TO_STRING_CASE(GL_LUMINANCE6_ALPHA2_EXT)
-		ENUM_TO_STRING_CASE(GL_LUMINANCE8_ALPHA8_EXT)
-		ENUM_TO_STRING_CASE(GL_LUMINANCE12_ALPHA4_EXT)
-		ENUM_TO_STRING_CASE(GL_LUMINANCE12_ALPHA12_EXT)
-		ENUM_TO_STRING_CASE(GL_LUMINANCE16_ALPHA16_EXT)
-		ENUM_TO_STRING_CASE(GL_INTENSITY_EXT)
-		ENUM_TO_STRING_CASE(GL_INTENSITY4_EXT)
-		ENUM_TO_STRING_CASE(GL_INTENSITY8_EXT)
-		ENUM_TO_STRING_CASE(GL_INTENSITY12_EXT)
-		ENUM_TO_STRING_CASE(GL_INTENSITY16_EXT)
-		ENUM_TO_STRING_CASE(GL_RGB2_EXT)
-		ENUM_TO_STRING_CASE(GL_RGB4_EXT)
-		ENUM_TO_STRING_CASE(GL_RGB5_EXT)
-		ENUM_TO_STRING_CASE(GL_RGB8_EXT)
-		ENUM_TO_STRING_CASE(GL_RGB10_EXT)
-		ENUM_TO_STRING_CASE(GL_RGB12_EXT)
-		ENUM_TO_STRING_CASE(GL_RGB16_EXT)
-		ENUM_TO_STRING_CASE(GL_RGBA2_EXT)
-		ENUM_TO_STRING_CASE(GL_RGBA4_EXT)
-		ENUM_TO_STRING_CASE(GL_RGB5_A1_EXT)
-		ENUM_TO_STRING_CASE(GL_RGBA8_EXT)
-		ENUM_TO_STRING_CASE(GL_RGB10_A2_EXT)
-		ENUM_TO_STRING_CASE(GL_RGBA12_EXT)
-		ENUM_TO_STRING_CASE(GL_RGBA16_EXT)
-		ENUM_TO_STRING_CASE(GL_SRGB_EXT)
-		ENUM_TO_STRING_CASE(GL_SRGB8_EXT)
-		ENUM_TO_STRING_CASE(GL_SRGB_ALPHA_EXT)
-		ENUM_TO_STRING_CASE(GL_SRGB8_ALPHA8_EXT)
+		// using definitions from GLAD
+		ENUM_TO_STRING_CASE(GL_R8_SNORM)
+		ENUM_TO_STRING_CASE(GL_RG8_SNORM)
+		ENUM_TO_STRING_CASE(GL_RGB8_SNORM)
+		ENUM_TO_STRING_CASE(GL_RGBA8_SNORM)
+		ENUM_TO_STRING_CASE(GL_R16_SNORM)
+		ENUM_TO_STRING_CASE(GL_RG16_SNORM)
+		ENUM_TO_STRING_CASE(GL_RGB16_SNORM)
+		ENUM_TO_STRING_CASE(GL_RGBA16_SNORM)
+		ENUM_TO_STRING_CASE(GL_RGB4)
+		ENUM_TO_STRING_CASE(GL_RGB5)
+		ENUM_TO_STRING_CASE(GL_RGB8)
+		ENUM_TO_STRING_CASE(GL_RGB10)
+		ENUM_TO_STRING_CASE(GL_RGB12)
+		ENUM_TO_STRING_CASE(GL_RGB16)
+		ENUM_TO_STRING_CASE(GL_RGBA2)
+		ENUM_TO_STRING_CASE(GL_RGBA4)
+		ENUM_TO_STRING_CASE(GL_RGB5_A1)
+		ENUM_TO_STRING_CASE(GL_RGBA8)
+		ENUM_TO_STRING_CASE(GL_RGB10_A2)
+		ENUM_TO_STRING_CASE(GL_RGBA12)
+		ENUM_TO_STRING_CASE(GL_RGBA16)
+		ENUM_TO_STRING_CASE(GL_RGBA32F)
+		ENUM_TO_STRING_CASE(GL_RGB32F)
+		ENUM_TO_STRING_CASE(GL_RGBA16F)
+		ENUM_TO_STRING_CASE(GL_RGB16F)
+		ENUM_TO_STRING_CASE(GL_RGBA32UI)
+		ENUM_TO_STRING_CASE(GL_RGB32UI)
+		ENUM_TO_STRING_CASE(GL_RGBA16UI)
+		ENUM_TO_STRING_CASE(GL_RGB16UI)
+		ENUM_TO_STRING_CASE(GL_RGBA8UI)
+		ENUM_TO_STRING_CASE(GL_RGB8UI)
+		ENUM_TO_STRING_CASE(GL_RGBA32I)
+		ENUM_TO_STRING_CASE(GL_RGB32I)
+		ENUM_TO_STRING_CASE(GL_RGBA16I)
+		ENUM_TO_STRING_CASE(GL_RGB16I)
+		ENUM_TO_STRING_CASE(GL_RGBA8I)
+		ENUM_TO_STRING_CASE(GL_RGB8I)
+		ENUM_TO_STRING_CASE(GL_RGB10_A2UI)
+		ENUM_TO_STRING_CASE(GL_SRGB)
+		ENUM_TO_STRING_CASE(GL_SRGB8)
+		ENUM_TO_STRING_CASE(GL_SRGB_ALPHA)
+		ENUM_TO_STRING_CASE(GL_SRGB8_ALPHA8)
+		ENUM_TO_STRING_CASE(GL_DEPTH_COMPONENT16)
+		ENUM_TO_STRING_CASE(GL_DEPTH_COMPONENT24)
+		ENUM_TO_STRING_CASE(GL_DEPTH_COMPONENT32)
+		ENUM_TO_STRING_CASE(GL_DEPTH24_STENCIL8)
+		ENUM_TO_STRING_CASE(GL_R11F_G11F_B10F)
+		ENUM_TO_STRING_CASE(GL_DEPTH_COMPONENT32F)
+		ENUM_TO_STRING_CASE(GL_DEPTH32F_STENCIL8)
 #endif
 		default: {
 			return String("Swapchain format 0x") + String::num_int64(p_swapchain_format, 16);
diff --git a/modules/openxr/extensions/openxr_opengl_extension.h b/modules/openxr/extensions/openxr_opengl_extension.h
index b666653c8e..473c5157c0 100644
--- a/modules/openxr/extensions/openxr_opengl_extension.h
+++ b/modules/openxr/extensions/openxr_opengl_extension.h
@@ -59,9 +59,8 @@
 #include OPENGL_INCLUDE_H
 #define GL_GLEXT_PROTOTYPES 1
 #define GL3_PROTOTYPES 1
-#include <GL/gl.h>
-#include <GL/glext.h>
-#include <GL/glx.h>
+#include "thirdparty/glad/glad/gl.h"
+#include "thirdparty/glad/glad/glx.h"
 #include <X11/Xlib.h>
 #endif
 
diff --git a/modules/openxr/openxr_api.cpp b/modules/openxr/openxr_api.cpp
index b7c95415d0..d6580ebfa6 100644
--- a/modules/openxr/openxr_api.cpp
+++ b/modules/openxr/openxr_api.cpp
@@ -64,9 +64,8 @@
 #include OPENGL_INCLUDE_H
 #define GL_GLEXT_PROTOTYPES 1
 #define GL3_PROTOTYPES 1
-#include <GL/gl.h>
-#include <GL/glext.h>
-#include <GL/glx.h>
+#include "thirdparty/glad/glad/gl.h"
+#include "thirdparty/glad/glad/glx.h"
 #include <X11/Xlib.h>
 #endif // X11_ENABLED
 #endif // GLES_ENABLED
diff --git a/modules/raycast/godot_update_embree.py b/modules/raycast/godot_update_embree.py
index e31d88b741..527e02f855 100644
--- a/modules/raycast/godot_update_embree.py
+++ b/modules/raycast/godot_update_embree.py
@@ -1,5 +1,7 @@
 import glob, os, shutil, subprocess, re
 
+git_tag = "v3.13.5"
+
 include_dirs = [
     "common/tasking",
     "kernels/bvh",
@@ -12,6 +14,7 @@ include_dirs = [
     "common/lexers",
     "common/simd",
     "common/simd/arm",
+    "common/simd/wasm",
     "include/embree3",
     "kernels/subdiv",
     "kernels/geometry",
@@ -76,6 +79,7 @@ if os.path.exists(dir_name):
 
 subprocess.run(["git", "clone", "https://github.com/embree/embree.git", "embree-tmp"])
 os.chdir("embree-tmp")
+subprocess.run(["git", "checkout", git_tag])
 
 commit_hash = str(subprocess.check_output(["git", "rev-parse", "HEAD"], universal_newlines=True)).strip()
 
@@ -94,8 +98,7 @@ for f in all_files:
 
 with open(os.path.join(dest_dir, "kernels/hash.h"), "w") as hash_file:
     hash_file.write(
-        f"""
-// Copyright 2009-2020 Intel Corporation
+        f"""// Copyright 2009-2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
 #define RTC_HASH "{commit_hash}"
@@ -104,8 +107,7 @@ with open(os.path.join(dest_dir, "kernels/hash.h"), "w") as hash_file:
 
 with open(os.path.join(dest_dir, "kernels/config.h"), "w") as config_file:
     config_file.write(
-        """
-// Copyright 2009-2020 Intel Corporation
+        """// Copyright 2009-2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
 /* #undef EMBREE_RAY_MASK */
@@ -126,6 +128,7 @@ with open(os.path.join(dest_dir, "kernels/config.h"), "w") as config_file:
 /* #undef EMBREE_COMPACT_POLYS */
 
 #define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0
+#define EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE
 
 #if defined(EMBREE_GEOMETRY_TRIANGLE)
   #define IF_ENABLED_TRIS(x) x
@@ -192,8 +195,7 @@ with open("CMakeLists.txt", "r") as cmake_file:
 
 with open(os.path.join(dest_dir, "include/embree3/rtcore_config.h"), "w") as config_file:
     config_file.write(
-        f"""
-// Copyright 2009-2021 Intel Corporation
+        f"""// Copyright 2009-2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
@@ -209,14 +211,16 @@ with open(os.path.join(dest_dir, "include/embree3/rtcore_config.h"), "w") as con
 #define EMBREE_MIN_WIDTH 0
 #define RTC_MIN_WIDTH EMBREE_MIN_WIDTH
 
-#define EMBREE_STATIC_LIB
-/* #undef EMBREE_API_NAMESPACE */
+#if !defined(EMBREE_STATIC_LIB)
+#   define EMBREE_STATIC_LIB
+#endif
+/* #undef EMBREE_API_NAMESPACE*/
 
 #if defined(EMBREE_API_NAMESPACE)
 #  define RTC_NAMESPACE
-#  define RTC_NAMESPACE_BEGIN namespace  {{
+#  define RTC_NAMESPACE_BEGIN namespace {{
 #  define RTC_NAMESPACE_END }}
-#  define RTC_NAMESPACE_USE using namespace ;
+#  define RTC_NAMESPACE_USE using namespace;
 #  define RTC_API_EXTERN_C
 #  undef EMBREE_API_NAMESPACE
 #else
diff --git a/scene/2d/physics_body_2d.cpp b/scene/2d/physics_body_2d.cpp
index 7f46ec4c1e..7f5b5d1ea4 100644
--- a/scene/2d/physics_body_2d.cpp
+++ b/scene/2d/physics_body_2d.cpp
@@ -1761,7 +1761,7 @@ void CharacterBody2D::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "floor_max_angle", PROPERTY_HINT_RANGE, "0,180,0.1,radians"), "set_floor_max_angle", "get_floor_max_angle");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "floor_snap_length", PROPERTY_HINT_RANGE, "0,32,0.1,or_greater,suffix:px"), "set_floor_snap_length", "get_floor_snap_length");
 
-	ADD_GROUP("Moving Platform", "platform");
+	ADD_GROUP("Moving Platform", "platform_");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "platform_on_leave", PROPERTY_HINT_ENUM, "Add Velocity,Add Upward Velocity,Do Nothing", PROPERTY_USAGE_DEFAULT), "set_platform_on_leave", "get_platform_on_leave");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "platform_floor_layers", PROPERTY_HINT_LAYERS_2D_PHYSICS), "set_platform_floor_layers", "get_platform_floor_layers");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "platform_wall_layers", PROPERTY_HINT_LAYERS_2D_PHYSICS), "set_platform_wall_layers", "get_platform_wall_layers");
diff --git a/scene/2d/polygon_2d.cpp b/scene/2d/polygon_2d.cpp
index 2c825e8f7b..e41664b006 100644
--- a/scene/2d/polygon_2d.cpp
+++ b/scene/2d/polygon_2d.cpp
@@ -663,5 +663,7 @@ Polygon2D::Polygon2D() {
 }
 
 Polygon2D::~Polygon2D() {
+	// This will free the internally-allocated mesh instance, if allocated.
+	RS::get_singleton()->canvas_item_attach_skeleton(get_canvas_item(), RID());
 	RS::get_singleton()->free(mesh);
 }
diff --git a/scene/2d/tile_map.cpp b/scene/2d/tile_map.cpp
index 40e8818262..0159e9f313 100644
--- a/scene/2d/tile_map.cpp
+++ b/scene/2d/tile_map.cpp
@@ -1859,11 +1859,13 @@ void TileMap::_scenes_update_dirty_quadrants(SelfList<TileMapQuadrant>::List &r_
 	while (q_list_element) {
 		TileMapQuadrant &q = *q_list_element->self();
 
-		// Clear the scenes.
-		for (const KeyValue<Vector2i, String> &E : q.scenes) {
-			Node *node = get_node_or_null(E.value);
-			if (node) {
-				node->queue_free();
+		// Clear the scenes if instance cache was cleared.
+		if (instantiated_scenes.is_empty()) {
+			for (const KeyValue<Vector2i, String> &E : q.scenes) {
+				Node *node = get_node_or_null(E.value);
+				if (node) {
+					node->queue_free();
+				}
 			}
 		}
 
@@ -1871,6 +1873,15 @@ void TileMap::_scenes_update_dirty_quadrants(SelfList<TileMapQuadrant>::List &r_
 
 		// Recreate the scenes.
 		for (const Vector2i &E_cell : q.cells) {
+			Vector3i cell_coords = Vector3i(q.layer, E_cell.x, E_cell.y);
+			if (instantiated_scenes.has(cell_coords)) {
+				// Skip scene if the instance was cached (to avoid recreating scenes unnecessarily).
+				continue;
+			}
+			if (!Engine::get_singleton()->is_editor_hint()) {
+				instantiated_scenes.insert(cell_coords);
+			}
+
 			const TileMapCell &c = get_cell(q.layer, E_cell, true);
 
 			TileSetSource *source;
@@ -1907,15 +1918,16 @@ void TileMap::_scenes_update_dirty_quadrants(SelfList<TileMapQuadrant>::List &r_
 }
 
 void TileMap::_scenes_cleanup_quadrant(TileMapQuadrant *p_quadrant) {
-	// Clear the scenes.
-	for (const KeyValue<Vector2i, String> &E : p_quadrant->scenes) {
-		Node *node = get_node_or_null(E.value);
-		if (node) {
-			node->queue_free();
+	// Clear the scenes if instance cache was cleared.
+	if (instantiated_scenes.is_empty()) {
+		for (const KeyValue<Vector2i, String> &E : p_quadrant->scenes) {
+			Node *node = get_node_or_null(E.value);
+			if (node) {
+				node->queue_free();
+			}
 		}
+		p_quadrant->scenes.clear();
 	}
-
-	p_quadrant->scenes.clear();
 }
 
 void TileMap::_scenes_draw_quadrant_debug(TileMapQuadrant *p_quadrant) {
@@ -4037,6 +4049,7 @@ void TileMap::_bind_methods() {
 void TileMap::_tile_set_changed() {
 	emit_signal(SNAME("changed"));
 	_tile_set_changed_deferred_update_needed = true;
+	instantiated_scenes.clear();
 	call_deferred(SNAME("_tile_set_changed_deferred_update"));
 }
 
diff --git a/scene/2d/tile_map.h b/scene/2d/tile_map.h
index eaf100631e..68a5d3c80b 100644
--- a/scene/2d/tile_map.h
+++ b/scene/2d/tile_map.h
@@ -236,6 +236,8 @@ private:
 	void _clear_layer_internals(int p_layer);
 	void _clear_internals();
 
+	HashSet<Vector3i> instantiated_scenes;
+
 	// Rect caching.
 	void _recompute_rect_cache();
 
diff --git a/scene/gui/dialogs.cpp b/scene/gui/dialogs.cpp
index bf4dd3d245..0d265719ec 100644
--- a/scene/gui/dialogs.cpp
+++ b/scene/gui/dialogs.cpp
@@ -377,7 +377,7 @@ void AcceptDialog::_bind_methods() {
 
 	ADD_PROPERTY(PropertyInfo(Variant::STRING, "ok_button_text"), "set_ok_button_text", "get_ok_button_text");
 
-	ADD_GROUP("Dialog", "dialog");
+	ADD_GROUP("Dialog", "dialog_");
 	ADD_PROPERTY(PropertyInfo(Variant::STRING, "dialog_text", PROPERTY_HINT_MULTILINE_TEXT, "", PROPERTY_USAGE_DEFAULT_INTL), "set_text", "get_text");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "dialog_hide_on_ok"), "set_hide_on_ok", "get_hide_on_ok");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "dialog_close_on_escape"), "set_close_on_escape", "get_close_on_escape");
diff --git a/scene/gui/graph_edit.cpp b/scene/gui/graph_edit.cpp
index 46b712379d..40792dd43f 100644
--- a/scene/gui/graph_edit.cpp
+++ b/scene/gui/graph_edit.cpp
@@ -2406,7 +2406,7 @@ void GraphEdit::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "zoom_step"), "set_zoom_step", "get_zoom_step");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "show_zoom_label"), "set_show_zoom_label", "is_showing_zoom_label");
 
-	ADD_GROUP("Minimap", "minimap");
+	ADD_GROUP("Minimap", "minimap_");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "minimap_enabled"), "set_minimap_enabled", "is_minimap_enabled");
 	ADD_PROPERTY(PropertyInfo(Variant::VECTOR2, "minimap_size", PROPERTY_HINT_NONE, "suffix:px"), "set_minimap_size", "get_minimap_size");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "minimap_opacity"), "set_minimap_opacity", "get_minimap_opacity");
diff --git a/scene/main/node.cpp b/scene/main/node.cpp
index 07dce9fbec..90b996e06b 100644
--- a/scene/main/node.cpp
+++ b/scene/main/node.cpp
@@ -2570,10 +2570,6 @@ static void _Node_debug_sn(Object *p_obj) {
 }
 #endif // DEBUG_ENABLED
 
-void Node::_print_orphan_nodes() {
-	print_orphan_nodes();
-}
-
 void Node::print_orphan_nodes() {
 #ifdef DEBUG_ENABLED
 	ObjectDB::debug_objects(_Node_debug_sn);
@@ -2745,6 +2741,7 @@ void Node::_bind_methods() {
 	GLOBAL_DEF("editor/node_naming/name_casing", NAME_CASING_PASCAL_CASE);
 	ProjectSettings::get_singleton()->set_custom_property_info("editor/node_naming/name_casing", PropertyInfo(Variant::INT, "editor/node_naming/name_casing", PROPERTY_HINT_ENUM, "PascalCase,camelCase,snake_case"));
 
+	ClassDB::bind_static_method("Node", D_METHOD("print_orphan_nodes"), &Node::print_orphan_nodes);
 	ClassDB::bind_method(D_METHOD("add_sibling", "sibling", "force_readable_name"), &Node::add_sibling, DEFVAL(false));
 
 	ClassDB::bind_method(D_METHOD("set_name", "name"), &Node::set_name);
@@ -2802,7 +2799,6 @@ void Node::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_process_mode", "mode"), &Node::set_process_mode);
 	ClassDB::bind_method(D_METHOD("get_process_mode"), &Node::get_process_mode);
 	ClassDB::bind_method(D_METHOD("can_process"), &Node::can_process);
-	ClassDB::bind_method(D_METHOD("print_orphan_nodes"), &Node::_print_orphan_nodes);
 
 	ClassDB::bind_method(D_METHOD("set_display_folded", "fold"), &Node::set_display_folded);
 	ClassDB::bind_method(D_METHOD("is_displayed_folded"), &Node::is_displayed_folded);
diff --git a/scene/main/node.h b/scene/main/node.h
index 5e51ec6b0e..e07fb003d4 100644
--- a/scene/main/node.h
+++ b/scene/main/node.h
@@ -173,7 +173,6 @@ private:
 	void _propagate_ready();
 	void _propagate_exit_tree();
 	void _propagate_after_exit_tree();
-	void _print_orphan_nodes();
 	void _propagate_process_owner(Node *p_owner, int p_pause_notification, int p_enabled_notification);
 	void _propagate_groups_dirty();
 	Array _get_node_and_resource(const NodePath &p_path);
diff --git a/scene/main/viewport.cpp b/scene/main/viewport.cpp
index 345d5de937..f58bf868ad 100644
--- a/scene/main/viewport.cpp
+++ b/scene/main/viewport.cpp
@@ -492,6 +492,7 @@ void Viewport::_notification(int p_what) {
 		} break;
 
 		case NOTIFICATION_WM_WINDOW_FOCUS_OUT: {
+			_gui_cancel_tooltip();
 			_drop_physics_mouseover();
 			if (gui.mouse_focus && !gui.forced_mouse_focus) {
 				_drop_mouse_focus();
diff --git a/scene/resources/camera_attributes.cpp b/scene/resources/camera_attributes.cpp
index 3c322f32b6..8e4876e01f 100644
--- a/scene/resources/camera_attributes.cpp
+++ b/scene/resources/camera_attributes.cpp
@@ -120,7 +120,7 @@ void CameraAttributes::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_auto_exposure_scale", "exposure_grey"), &CameraAttributes::set_auto_exposure_scale);
 	ClassDB::bind_method(D_METHOD("get_auto_exposure_scale"), &CameraAttributes::get_auto_exposure_scale);
 
-	ADD_GROUP("Exposure", "exposure");
+	ADD_GROUP("Exposure", "exposure_");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "exposure_sensitivity", PROPERTY_HINT_RANGE, "0.1,32000.0,0.1,suffix:ISO"), "set_exposure_sensitivity", "get_exposure_sensitivity");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "exposure_multiplier", PROPERTY_HINT_RANGE, "0.0,2048.0,0.001"), "set_exposure_multiplier", "get_exposure_multiplier");
 
@@ -472,7 +472,7 @@ void CameraAttributesPhysical::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "frustum_near", PROPERTY_HINT_RANGE, "0.001,10,0.001,or_greater,exp,suffix:m"), "set_near", "get_near");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "frustum_far", PROPERTY_HINT_RANGE, "0.01,4000,0.01,or_greater,exp,suffix:m"), "set_far", "get_far");
 
-	ADD_GROUP("Exposure", "exposure");
+	ADD_GROUP("Exposure", "exposure_");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "exposure_aperture", PROPERTY_HINT_RANGE, "0.5,64.0,0.01,exp,suffix:f-stop"), "set_aperture", "get_aperture");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "exposure_shutter_speed", PROPERTY_HINT_RANGE, "0.1,8000.0,0.001,suffix:1/s"), "set_shutter_speed", "get_shutter_speed");
 
diff --git a/scene/resources/font.cpp b/scene/resources/font.cpp
index 84814d939b..13080d6948 100644
--- a/scene/resources/font.cpp
+++ b/scene/resources/font.cpp
@@ -2605,16 +2605,16 @@ void FontVariation::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "base_font", PROPERTY_HINT_RESOURCE_TYPE, "Font"), "set_base_font", "get_base_font");
 	ADD_PROPERTY(PropertyInfo(Variant::ARRAY, "fallbacks", PROPERTY_HINT_ARRAY_TYPE, vformat("%s/%s:%s", Variant::OBJECT, PROPERTY_HINT_RESOURCE_TYPE, "Font")), "set_fallbacks", "get_fallbacks");
 
-	ADD_GROUP("Variation", "variation");
+	ADD_GROUP("Variation", "variation_");
 	ADD_PROPERTY(PropertyInfo(Variant::DICTIONARY, "variation_opentype"), "set_variation_opentype", "get_variation_opentype");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "variation_face_index"), "set_variation_face_index", "get_variation_face_index");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "variation_embolden", PROPERTY_HINT_RANGE, "-2,2,0.01"), "set_variation_embolden", "get_variation_embolden");
 	ADD_PROPERTY(PropertyInfo(Variant::TRANSFORM2D, "variation_transform", PROPERTY_HINT_NONE, "suffix:px"), "set_variation_transform", "get_variation_transform");
 
-	ADD_GROUP("OpenType Features", "opentype");
+	ADD_GROUP("OpenType Features", "opentype_");
 	ADD_PROPERTY(PropertyInfo(Variant::DICTIONARY, "opentype_features"), "set_opentype_features", "get_opentype_features");
 
-	ADD_GROUP("Extra Spacing", "spacing");
+	ADD_GROUP("Extra Spacing", "spacing_");
 	ADD_PROPERTYI(PropertyInfo(Variant::INT, "spacing_glyph", PROPERTY_HINT_NONE, "suffix:px"), "set_spacing", "get_spacing", TextServer::SPACING_GLYPH);
 	ADD_PROPERTYI(PropertyInfo(Variant::INT, "spacing_space", PROPERTY_HINT_NONE, "suffix:px"), "set_spacing", "get_spacing", TextServer::SPACING_SPACE);
 	ADD_PROPERTYI(PropertyInfo(Variant::INT, "spacing_top", PROPERTY_HINT_NONE, "suffix:px"), "set_spacing", "get_spacing", TextServer::SPACING_TOP);
diff --git a/scene/resources/label_settings.cpp b/scene/resources/label_settings.cpp
index ef380a68f9..c49620ce27 100644
--- a/scene/resources/label_settings.cpp
+++ b/scene/resources/label_settings.cpp
@@ -66,16 +66,16 @@ void LabelSettings::_bind_methods() {
 
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "line_spacing", PROPERTY_HINT_NONE, "suffix:px"), "set_line_spacing", "get_line_spacing");
 
-	ADD_GROUP("Font", "font");
+	ADD_GROUP("Font", "font_");
 	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "font", PROPERTY_HINT_RESOURCE_TYPE, "Font"), "set_font", "get_font");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "font_size", PROPERTY_HINT_RANGE, "1,1024,1,or_greater,suffix:px"), "set_font_size", "get_font_size");
 	ADD_PROPERTY(PropertyInfo(Variant::COLOR, "font_color"), "set_font_color", "get_font_color");
 
-	ADD_GROUP("Outline", "outline");
+	ADD_GROUP("Outline", "outline_");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "outline_size", PROPERTY_HINT_RANGE, "0,127,1,or_greater,suffix:px"), "set_outline_size", "get_outline_size");
 	ADD_PROPERTY(PropertyInfo(Variant::COLOR, "outline_color"), "set_outline_color", "get_outline_color");
 
-	ADD_GROUP("Shadow", "shadow");
+	ADD_GROUP("Shadow", "shadow_");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "shadow_size", PROPERTY_HINT_RANGE, "0,127,1,or_greater,suffix:px"), "set_shadow_size", "get_shadow_size");
 	ADD_PROPERTY(PropertyInfo(Variant::COLOR, "shadow_color"), "set_shadow_color", "get_shadow_color");
 	ADD_PROPERTY(PropertyInfo(Variant::VECTOR2, "shadow_offset", PROPERTY_HINT_NONE, "suffix:px"), "set_shadow_offset", "get_shadow_offset");
diff --git a/scene/resources/primitive_meshes.cpp b/scene/resources/primitive_meshes.cpp
index 4c6d533c72..2e8fcb3717 100644
--- a/scene/resources/primitive_meshes.cpp
+++ b/scene/resources/primitive_meshes.cpp
@@ -2779,13 +2779,7 @@ void TextMesh::_generate_glyph_mesh_data(const GlyphMeshKey &p_key, const Glyph
 				real_t step = CLAMP(curve_step / (p0 - p3).length(), 0.01, 0.5);
 				real_t t = step;
 				while (t < 1.0) {
-					real_t omt = (1.0 - t);
-					real_t omt2 = omt * omt;
-					real_t omt3 = omt2 * omt;
-					real_t t2 = t * t;
-					real_t t3 = t2 * t;
-
-					Vector2 point = p0 * omt3 + p1 * omt2 * t * 3.0 + p2 * omt * t2 * 3.0 + p3 * t3;
+					Vector2 point = p0.bezier_interpolate(p1, p2, p3, t);
 					Vector2 p = point * pixel_size + origin;
 					polygon.push_back(ContourPoint(p, false));
 					t += step;
diff --git a/tests/core/math/test_vector2i.h b/tests/core/math/test_vector2i.h
index c7a0dccdcc..9ee844ffa8 100644
--- a/tests/core/math/test_vector2i.h
+++ b/tests/core/math/test_vector2i.h
@@ -131,12 +131,16 @@ TEST_CASE("[Vector2i] Other methods") {
 			"Vector2i aspect should work as expected.");
 
 	CHECK_MESSAGE(
-			Vector2i(1, 2) == vector.min(Vector2i(3, 2)),
+			vector.min(Vector2i(3, 2)) == Vector2i(1, 2),
 			"Vector2i min should return expected value.");
 
 	CHECK_MESSAGE(
-			Vector2i(5, 3) == vector.max(Vector2i(5, 2)),
+			vector.max(Vector2i(5, 2)) == Vector2i(5, 3),
 			"Vector2i max should return expected value.");
+
+	CHECK_MESSAGE(
+			vector.snapped(Vector2i(4, 2)) == Vector2i(0, 4),
+			"Vector2i snapped should work as expected.");
 }
 
 TEST_CASE("[Vector2i] Abs and sign methods") {
diff --git a/tests/core/math/test_vector3i.h b/tests/core/math/test_vector3i.h
index 56578f99eb..45240bd2ff 100644
--- a/tests/core/math/test_vector3i.h
+++ b/tests/core/math/test_vector3i.h
@@ -127,6 +127,14 @@ TEST_CASE("[Vector3i] Operators") {
 			"Vector3i constructed from Vector3 should work as expected.");
 }
 
+TEST_CASE("[Vector3i] Other methods") {
+	const Vector3i vector = Vector3i(1, 3, -7);
+
+	CHECK_MESSAGE(
+			vector.snapped(Vector3i(4, 2, 5)) == Vector3i(0, 4, -5),
+			"Vector3i snapped should work as expected.");
+}
+
 TEST_CASE("[Vector3i] Abs and sign methods") {
 	const Vector3i vector1 = Vector3i(1, 3, 5);
 	const Vector3i vector2 = Vector3i(1, -3, -5);
diff --git a/tests/core/math/test_vector4i.h b/tests/core/math/test_vector4i.h
index 30d38607dd..8a9522f9cc 100644
--- a/tests/core/math/test_vector4i.h
+++ b/tests/core/math/test_vector4i.h
@@ -130,6 +130,14 @@ TEST_CASE("[Vector4i] Operators") {
 			"Vector4i constructed from Vector4 should work as expected.");
 }
 
+TEST_CASE("[Vector3i] Other methods") {
+	const Vector4i vector = Vector4i(1, 3, -7, 13);
+
+	CHECK_MESSAGE(
+			vector.snapped(Vector4i(4, 2, 5, 8)) == Vector4i(0, 4, -5, 16),
+			"Vector4i snapped should work as expected.");
+}
+
 TEST_CASE("[Vector4i] Abs and sign methods") {
 	const Vector4i vector1 = Vector4i(1, 3, 5, 7);
 	const Vector4i vector2 = Vector4i(1, -3, -5, 7);
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 9a58c928cc..ad290880e6 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -74,7 +74,7 @@ Files extracted from upstream source:
 ## embree
 
 - Upstream: https://github.com/embree/embree
-- Version: 3.13.1 (12b99393438a4cc9e478e33459eed78bec6233fd, 2021)
+- Version: 3.13.5 (698442324ccddd11725fb8875275dc1384f7fb40, 2022)
 - License: Apache 2.0
 
 Files extracted from upstream:
diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h
index 645681ac63..6d411e4852 100644
--- a/thirdparty/embree/common/algorithms/parallel_for.h
+++ b/thirdparty/embree/common/algorithms/parallel_for.h
@@ -26,7 +26,6 @@ namespace embree
         abort();
         // -- GODOT end --
     }
-    
 #elif defined(TASKING_TBB)
   #if TBB_INTERFACE_VERSION >= 12002
     tbb::task_group_context context;
diff --git a/thirdparty/embree/common/algorithms/parallel_for_for.h b/thirdparty/embree/common/algorithms/parallel_for_for.h
index 92c37a4a38..7838ef11b3 100644
--- a/thirdparty/embree/common/algorithms/parallel_for_for.h
+++ b/thirdparty/embree/common/algorithms/parallel_for_for.h
@@ -30,15 +30,20 @@ namespace embree
     template<typename ArrayArray>
       __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) {
       init(array2,minStepSize);
+    }
+
+    template<typename SizeFunc>
+    __forceinline ParallelForForState (const size_t numArrays, const SizeFunc& getSize, const size_t minStepSize) {
+      init(numArrays,getSize,minStepSize);
     } 
 
-    template<typename ArrayArray>
-      __forceinline void init ( ArrayArray& array2, const size_t minStepSize )
+    template<typename SizeFunc>
+    __forceinline void init ( const size_t numArrays, const SizeFunc& getSize, const size_t minStepSize )
     {
       /* first calculate total number of elements */
       size_t N = 0;
-      for (size_t i=0; i<array2.size(); i++) {
-	N += array2[i] ? array2[i]->size() : 0;
+      for (size_t i=0; i<numArrays; i++) {
+	N += getSize(i);
       }
       this->N = N;
 
@@ -54,8 +59,8 @@ namespace embree
       size_t k0 = (++taskIndex)*N/taskCount;
       for (size_t i=0, k=0; taskIndex < taskCount; i++) 
       {
-	assert(i<array2.size());
-	size_t j=0, M = array2[i] ? array2[i]->size() : 0;
+	assert(i<numArrays);
+	size_t j=0, M = getSize(i);
 	while (j<M && k+M-j >= k0 && taskIndex < taskCount) {
 	  assert(taskIndex<taskCount);
 	  i0[taskIndex] = i;
@@ -67,6 +72,12 @@ namespace embree
       }
     }
 
+    template<typename ArrayArray>
+      __forceinline void init ( ArrayArray& array2, const size_t minStepSize )
+    {
+      init(array2.size(),[&](size_t i) { return array2[i] ? array2[i]->size() : 0; },minStepSize);
+    }
+    
     __forceinline size_t size() const {
       return N;
     }
diff --git a/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h b/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h
index b15b44a991..8c3f4aace7 100644
--- a/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h
+++ b/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h
@@ -17,15 +17,20 @@ namespace embree
       __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize)
       : ParallelForForState(array2,minStepSize) {}
 
+    template<typename SizeFunc>
+    __forceinline ParallelForForPrefixSumState (size_t numArrays, const SizeFunc& getSize, const size_t minStepSize)
+      : ParallelForForState(numArrays,getSize,minStepSize) {}
+
     ParallelPrefixSumState<Value> prefix_state;
   };
   
-  template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
-    __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, 
-                                                      const Value& identity, const Func& func, const Reduction& reduction)
+  template<typename SizeFunc, typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_prefix_sum0_( ParallelForForPrefixSumState<Value>& state, Index minStepSize, 
+                                                       const SizeFunc& getSize, const Value& identity, const Func& func, const Reduction& reduction)
   {
     /* calculate number of tasks to use */
     const size_t taskCount = state.taskCount;
+    
     /* perform parallel prefix sum */
     parallel_for(taskCount, [&](const size_t taskIndex)
     {
@@ -38,9 +43,9 @@ namespace embree
       size_t k=k0;
       Value N=identity;
       for (size_t i=i0; k<k1; i++) {
-	const size_t size = array2[i] ? array2[i]->size() : 0;
+	const size_t size = getSize(i);
         const size_t r0 = j0, r1 = min(size,r0+k1-k);
-        if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i));
+        if (r1 > r0) N = reduction(N, func((Index)i,range<Index>((Index)r0,(Index)r1),(Index)k));
         k+=r1-r0; j0 = 0;
       }
       state.prefix_state.counts[taskIndex] = N;
@@ -58,9 +63,10 @@ namespace embree
     return sum;
   }
 
-  template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
-    __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, 
-                                                      const Value& identity, const Func& func, const Reduction& reduction)
+  template<typename SizeFunc, typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_prefix_sum1_( ParallelForForPrefixSumState<Value>& state, Index minStepSize, 
+                                                       const SizeFunc& getSize, 
+                                                       const Value& identity, const Func& func, const Reduction& reduction)
   {
     /* calculate number of tasks to use */
     const size_t taskCount = state.taskCount;
@@ -76,9 +82,9 @@ namespace embree
       size_t k=k0;
       Value N=identity;
       for (size_t i=i0; k<k1; i++) {
-	const size_t size = array2[i] ? array2[i]->size() : 0;
+	const size_t size = getSize(i);
         const size_t r0 = j0, r1 = min(size,r0+k1-k);
-        if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N)));
+        if (r1 > r0) N = reduction(N, func((Index)i,range<Index>((Index)r0,(Index)r1),(Index)k,reduction(state.prefix_state.sums[taskIndex],N)));
         k+=r1-r0; j0 = 0;
       }
       state.prefix_state.counts[taskIndex] = N;
@@ -96,6 +102,30 @@ namespace embree
     return sum;
   }
 
+  template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
+  __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state,
+                                                    ArrayArray& array2, Index minStepSize, 
+                                                    const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    return parallel_for_for_prefix_sum0_(state,minStepSize,
+                                        [&](Index i) { return array2[i] ? array2[i]->size() : 0; },
+                                        identity,
+                                        [&](Index i, const range<Index>& r, Index k) { return func(array2[i], r, k, i); },
+                                        reduction);
+  }
+
+  template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
+  __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state,
+                                                    ArrayArray& array2, Index minStepSize, 
+                                                    const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    return parallel_for_for_prefix_sum1_(state,minStepSize,
+                                        [&](Index i) { return array2[i] ? array2[i]->size() : 0; },
+                                        identity,
+                                        [&](Index i, const range<Index>& r, Index k, const Value& base) { return func(array2[i], r, k, i, base); },
+                                        reduction);
+  }                                       
+
   template<typename ArrayArray, typename Value, typename Func, typename Reduction>
     __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, 
 						     const Value& identity, const Func& func, const Reduction& reduction)
diff --git a/thirdparty/embree/common/algorithms/parallel_reduce.h b/thirdparty/embree/common/algorithms/parallel_reduce.h
index 8271372ea4..cd0078f2e6 100644
--- a/thirdparty/embree/common/algorithms/parallel_reduce.h
+++ b/thirdparty/embree/common/algorithms/parallel_reduce.h
@@ -26,7 +26,7 @@ namespace embree
     const Index threadCount = (Index) TaskScheduler::threadCount();
     taskCount = min(taskCount,threadCount,maxTasks);
 
-    /* parallel invokation of all tasks */
+    /* parallel invocation of all tasks */
     dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack
     parallel_for(taskCount, [&](const Index taskIndex) {
         const Index k0 = first+(taskIndex+0)*(last-first)/taskCount;
diff --git a/thirdparty/embree/common/math/bbox.h b/thirdparty/embree/common/math/bbox.h
index bc43155358..e4eb3df9a4 100644
--- a/thirdparty/embree/common/math/bbox.h
+++ b/thirdparty/embree/common/math/bbox.h
@@ -77,7 +77,7 @@ namespace embree
     return lower > upper;
   }
 
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
   template<> __forceinline bool BBox<Vec3fa>::empty() const {
     return !all(le_mask(lower,upper));
   }
@@ -196,11 +196,11 @@ namespace embree
   }
 
   template<> __inline bool subset( const BBox<Vec3fa>& a, const BBox<Vec3fa>& b ) {
-    return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper));
+    return all(ge_mask(a.lower,b.lower)) && all(le_mask(a.upper,b.upper));
   }
 
   template<> __inline bool subset( const BBox<Vec3fx>& a, const BBox<Vec3fx>& b ) {
-    return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper));
+    return all(ge_mask(a.lower,b.lower)) && all(le_mask(a.upper,b.upper));
   }
   
   /*! blending */
@@ -228,11 +228,11 @@ namespace embree
 /// SSE / AVX / MIC specializations
 ////////////////////////////////////////////////////////////////////////////////
 
-#if defined __SSE__
+#if defined (__SSE__) || defined(__ARM_NEON)
 #include "../simd/sse.h"
 #endif
 
-#if defined __AVX__
+#if defined (__AVX__)
 #include "../simd/avx.h"
 #endif
 
diff --git a/thirdparty/embree/common/math/color.h b/thirdparty/embree/common/math/color.h
index 529584ea16..e62e4ad2a4 100644
--- a/thirdparty/embree/common/math/color.h
+++ b/thirdparty/embree/common/math/color.h
@@ -152,21 +152,38 @@ namespace embree
   }
   __forceinline const Color rcp  ( const Color& a )
   {
+#if defined(__aarch64__)
+    __m128 reciprocal = _mm_rcp_ps(a.m128);
+    reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
+    reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
+    return (const Color)reciprocal;
+#else
 #if defined(__AVX512VL__)
     const Color r = _mm_rcp14_ps(a.m128);
 #else
     const Color r = _mm_rcp_ps(a.m128);
 #endif
-    return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+    return _mm_add_ps(r,_mm_mul_ps(r, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(a, r))));   // computes r + r * (1 - a * r)
+
+#endif  //defined(__aarch64__)
   }
   __forceinline const Color rsqrt( const Color& a )
   {
+#if defined(__aarch64__)
+    __m128 r = _mm_rsqrt_ps(a.m128);
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+    return r;
+#else
+
 #if defined(__AVX512VL__)
     __m128 r = _mm_rsqrt14_ps(a.m128);
 #else
     __m128 r = _mm_rsqrt_ps(a.m128);
 #endif
     return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+
+#endif  //defined(__aarch64__)
   }
   __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); }
 
diff --git a/thirdparty/embree/common/math/constants.cpp b/thirdparty/embree/common/math/constants.cpp
index 03919ae20c..f51c642bfc 100644
--- a/thirdparty/embree/common/math/constants.cpp
+++ b/thirdparty/embree/common/math/constants.cpp
@@ -5,23 +5,4 @@
 
 namespace embree
 {
-  TrueTy True;
-  FalseTy False;
-  ZeroTy zero;
-  OneTy one;
-  NegInfTy neg_inf;
-  PosInfTy inf;
-  PosInfTy pos_inf;
-  NaNTy nan;
-  UlpTy ulp;
-  PiTy pi;
-  OneOverPiTy one_over_pi;
-  TwoPiTy two_pi;
-  OneOverTwoPiTy one_over_two_pi;
-  FourPiTy four_pi;
-  OneOverFourPiTy one_over_four_pi;
-  StepTy step;
-  ReverseStepTy reverse_step;
-  EmptyTy empty;
-  UndefinedTy undefined;
 }
diff --git a/thirdparty/embree/common/math/constants.h b/thirdparty/embree/common/math/constants.h
index 578473a8ab..07a1a868ba 100644
--- a/thirdparty/embree/common/math/constants.h
+++ b/thirdparty/embree/common/math/constants.h
@@ -24,13 +24,13 @@ namespace embree
     __forceinline operator bool( ) const { return true; }
   };
 
-  extern MAYBE_UNUSED TrueTy True;
+  const constexpr TrueTy True = TrueTy();
 
   struct FalseTy {
     __forceinline operator bool( ) const { return false; }
   };
 
-  extern MAYBE_UNUSED FalseTy False;
+  const constexpr FalseTy False = FalseTy();
   
   struct ZeroTy
   {
@@ -48,7 +48,7 @@ namespace embree
     __forceinline operator unsigned char     ( ) const { return 0; }
   }; 
 
-  extern MAYBE_UNUSED ZeroTy zero;
+  const constexpr ZeroTy zero = ZeroTy();
 
   struct OneTy
   {
@@ -66,7 +66,7 @@ namespace embree
     __forceinline operator unsigned char     ( ) const { return 1; }
   };
 
-  extern MAYBE_UNUSED OneTy one;
+  const constexpr OneTy one = OneTy();
 
   struct NegInfTy
   {
@@ -85,7 +85,7 @@ namespace embree
 
   };
 
-  extern MAYBE_UNUSED NegInfTy neg_inf;
+  const constexpr NegInfTy neg_inf = NegInfTy();
 
   struct PosInfTy
   {
@@ -103,8 +103,8 @@ namespace embree
     __forceinline operator unsigned char     ( ) const { return std::numeric_limits<unsigned char>::max(); }
   };
 
-  extern MAYBE_UNUSED PosInfTy inf;
-  extern MAYBE_UNUSED PosInfTy pos_inf;
+  const constexpr PosInfTy     inf = PosInfTy();
+  const constexpr PosInfTy pos_inf = PosInfTy();
 
   struct NaNTy
   {
@@ -112,15 +112,15 @@ namespace embree
     __forceinline operator float ( ) const { return std::numeric_limits<float>::quiet_NaN(); }
   };
 
-  extern MAYBE_UNUSED NaNTy nan;
+  const constexpr NaNTy nan = NaNTy();
 
   struct UlpTy
   {
     __forceinline operator double( ) const { return std::numeric_limits<double>::epsilon(); }
     __forceinline operator float ( ) const { return std::numeric_limits<float>::epsilon(); }
   };
-
-  extern MAYBE_UNUSED UlpTy ulp;
+  
+  const constexpr UlpTy ulp = UlpTy();
 
   struct PiTy
   {
@@ -128,7 +128,7 @@ namespace embree
     __forceinline operator float ( ) const { return float(M_PI); }
   };
 
-  extern MAYBE_UNUSED PiTy pi;
+  const constexpr PiTy pi = PiTy();
 
   struct OneOverPiTy
   {
@@ -136,7 +136,7 @@ namespace embree
     __forceinline operator float ( ) const { return float(M_1_PI); }
   };
 
-  extern MAYBE_UNUSED OneOverPiTy one_over_pi;
+  const constexpr OneOverPiTy one_over_pi = OneOverPiTy();
 
   struct TwoPiTy
   {
@@ -144,7 +144,7 @@ namespace embree
     __forceinline operator float ( ) const { return float(2.0*M_PI); }
   };
 
-  extern MAYBE_UNUSED TwoPiTy two_pi;
+  const constexpr TwoPiTy two_pi = TwoPiTy();
 
   struct OneOverTwoPiTy
   {
@@ -152,7 +152,7 @@ namespace embree
     __forceinline operator float ( ) const { return float(0.5*M_1_PI); }
   };
 
-  extern MAYBE_UNUSED OneOverTwoPiTy one_over_two_pi;
+  const constexpr OneOverTwoPiTy one_over_two_pi = OneOverTwoPiTy();
 
   struct FourPiTy
   {
@@ -160,7 +160,7 @@ namespace embree
     __forceinline operator float ( ) const { return float(4.0*M_PI); }
   };
 
-  extern MAYBE_UNUSED FourPiTy four_pi;
+  const constexpr FourPiTy four_pi = FourPiTy();
 
   struct OneOverFourPiTy
   {
@@ -168,30 +168,42 @@ namespace embree
     __forceinline operator float ( ) const { return float(0.25*M_1_PI); }
   };
 
-  extern MAYBE_UNUSED OneOverFourPiTy one_over_four_pi;
+  const constexpr OneOverFourPiTy one_over_four_pi = OneOverFourPiTy();
 
   struct StepTy {
+    __forceinline operator          double   ( ) const { return 0; }
+    __forceinline operator          float    ( ) const { return 0; }
+    __forceinline operator          long long( ) const { return 0; }
+    __forceinline operator unsigned long long( ) const { return 0; }
+    __forceinline operator          long     ( ) const { return 0; }
+    __forceinline operator unsigned long     ( ) const { return 0; }
+    __forceinline operator          int      ( ) const { return 0; }
+    __forceinline operator unsigned int      ( ) const { return 0; }
+    __forceinline operator          short    ( ) const { return 0; }
+    __forceinline operator unsigned short    ( ) const { return 0; }
+    __forceinline operator          char     ( ) const { return 0; }
+    __forceinline operator unsigned char     ( ) const { return 0; }
   };
 
-  extern MAYBE_UNUSED StepTy step;
+  const constexpr StepTy step = StepTy();
 
   struct ReverseStepTy {
   };
 
-  extern MAYBE_UNUSED ReverseStepTy reverse_step;
+  const constexpr ReverseStepTy reverse_step = ReverseStepTy();
 
   struct EmptyTy {
   };
 
-  extern MAYBE_UNUSED EmptyTy empty;
+  const constexpr EmptyTy empty = EmptyTy();
 
   struct FullTy {
   };
 
-  extern MAYBE_UNUSED FullTy full;
+  const constexpr FullTy full = FullTy();
 
   struct UndefinedTy {
   };
 
-  extern MAYBE_UNUSED UndefinedTy undefined;
+  const constexpr UndefinedTy undefined = UndefinedTy();
 }
diff --git a/thirdparty/embree/common/math/math.h b/thirdparty/embree/common/math/math.h
index 4bc54c1a6a..7930c17727 100644
--- a/thirdparty/embree/common/math/math.h
+++ b/thirdparty/embree/common/math/math.h
@@ -53,6 +53,16 @@ namespace embree
 
   __forceinline float rcp  ( const float x )
   {
+#if defined(__aarch64__)
+      // Move scalar to vector register and do rcp.
+      __m128 a;
+      a[0] = x;
+      float32x4_t reciprocal = vrecpeq_f32(a);
+      reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+      reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+      return reciprocal[0];
+#else
+
     const __m128 a = _mm_set_ss(x);
 
 #if defined(__AVX512VL__)
@@ -66,30 +76,71 @@ namespace embree
 #else
     return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
 #endif
+
+#endif  //defined(__aarch64__)
   }
 
   __forceinline float signmsk ( const float x ) {
+#if defined(__aarch64__)
+      // FP and Neon shares same vector register in arm64
+      __m128 a;
+      __m128i b;
+      a[0] = x;
+      b[0] = 0x80000000;
+      a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
+      return a[0];
+#else
     return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+#endif
   }
   __forceinline float xorf( const float x, const float y ) {
+#if defined(__aarch64__)
+      // FP and Neon shares same vector register in arm64
+      __m128 a;
+      __m128 b;
+      a[0] = x;
+      b[0] = y;
+      a = _mm_xor_ps(a, b);
+      return a[0];
+#else
     return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
+#endif
   }
   __forceinline float andf( const float x, const unsigned y ) {
+#if defined(__aarch64__)
+      // FP and Neon shares same vector register in arm64
+      __m128 a;
+      __m128i b;
+      a[0] = x;
+      b[0] = y;
+      a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
+      return a[0];
+#else
     return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
+#endif
   }
   __forceinline float rsqrt( const float x )
   {
+#if defined(__aarch64__)
+      // FP and Neon shares same vector register in arm64
+      __m128 a;
+      a[0] = x;
+      __m128 value = _mm_rsqrt_ps(a);
+      value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
+      value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
+      return value[0];
+#else
+
     const __m128 a = _mm_set_ss(x);
 #if defined(__AVX512VL__)
     __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a);
 #else
     __m128 r = _mm_rsqrt_ss(a);
 #endif
-    r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
-#if defined(__ARM_NEON)
-    r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
+    const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r),
+                                _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
+    return _mm_cvtss_f32(c);
 #endif
-    return _mm_cvtss_f32(r);
   }
 
 #if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700)
@@ -146,7 +197,17 @@ namespace embree
   __forceinline double floor( const double x ) { return ::floor (x); }
   __forceinline double ceil ( const double x ) { return ::ceil (x); }
 
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+    __forceinline float mini(float a, float b) {
+        // FP and Neon shares same vector register in arm64
+        __m128 x;
+        __m128 y;
+        x[0] = a;
+        y[0] = b;
+        x = _mm_min_ps(x, y);
+        return x[0];
+    }
+#elif defined(__SSE4_1__)
   __forceinline float mini(float a, float b) {
     const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
     const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
@@ -155,7 +216,17 @@ namespace embree
   }
 #endif
 
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+    __forceinline float maxi(float a, float b) {
+        // FP and Neon shares same vector register in arm64
+        __m128 x;
+        __m128 y;
+        x[0] = a;
+        y[0] = b;
+        x = _mm_max_ps(x, y);
+        return x[0];
+    }
+#elif defined(__SSE4_1__)
   __forceinline float maxi(float a, float b) {
     const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
     const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
@@ -172,9 +243,12 @@ namespace embree
   __forceinline  int64_t min(int64_t  a, int64_t  b) { return a<b ? a:b; }
   __forceinline    float min(float    a, float    b) { return a<b ? a:b; }
   __forceinline   double min(double   a, double   b) { return a<b ? a:b; }
-#if defined(__64BIT__)
+#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
   __forceinline   size_t min(size_t   a, size_t   b) { return a<b ? a:b; }
 #endif
+#if defined(__EMSCRIPTEN__)
+  __forceinline   long   min(long     a, long     b) { return a<b ? a:b; }
+#endif
 
   template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); }
   template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); }
@@ -189,9 +263,12 @@ namespace embree
   __forceinline  int64_t max(int64_t  a, int64_t  b) { return a<b ? b:a; }
   __forceinline    float max(float    a, float    b) { return a<b ? b:a; }
   __forceinline   double max(double   a, double   b) { return a<b ? b:a; }
-#if defined(__64BIT__)
+#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
   __forceinline   size_t max(size_t   a, size_t   b) { return a<b ? b:a; }
 #endif
+#if defined(__EMSCRIPTEN__)
+  __forceinline   long   max(long     a, long     b) { return a<b ? b:a; }
+#endif
 
   template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); }
   template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); }
@@ -231,6 +308,15 @@ namespace embree
   __forceinline float msub  ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
   __forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
   __forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
+  
+#elif defined (__aarch64__) && defined(__clang__)
+#pragma clang fp contract(fast)
+__forceinline float madd  ( const float a, const float b, const float c) { return a*b + c; }
+__forceinline float msub  ( const float a, const float b, const float c) { return a*b - c; }
+__forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; }
+__forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); }
+#pragma clang fp contract(on)
+  
 #else
   __forceinline float madd  ( const float a, const float b, const float c) { return a*b+c; }
   __forceinline float msub  ( const float a, const float b, const float c) { return a*b-c; }
@@ -326,7 +412,7 @@ namespace embree
     return x | (y << 1) | (z << 2);
   }
 
-#if defined(__AVX2__)
+#if defined(__AVX2__) && !defined(__aarch64__)
 
   template<>
     __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi)
diff --git a/thirdparty/embree/common/math/quaternion.h b/thirdparty/embree/common/math/quaternion.h
index 080800efcd..78efccda72 100644
--- a/thirdparty/embree/common/math/quaternion.h
+++ b/thirdparty/embree/common/math/quaternion.h
@@ -242,13 +242,17 @@ namespace embree
     T cosTheta = dot(q0, q1_);
     QuaternionT<T> q1 = select(cosTheta < 0.f, -q1_, q1_);
     cosTheta          = select(cosTheta < 0.f, -cosTheta, cosTheta);
-    if (unlikely(all(cosTheta > 0.9995f))) {
-      return normalize(lerp(q0, q1, t));
-    }
+
+    // spherical linear interpolation
     const T phi = t * fastapprox::acos(cosTheta);
     T sinPhi, cosPhi;
     fastapprox::sincos(phi, sinPhi, cosPhi);
     QuaternionT<T> qperp = sinPhi * normalize(msub(cosTheta, q0, q1));
-    return msub(cosPhi, q0, qperp);
+    QuaternionT<T> qslerp = msub(cosPhi, q0, qperp);
+
+    // regular linear interpolation as fallback
+    QuaternionT<T> qlerp = normalize(lerp(q0, q1, t));
+
+    return select(cosTheta > 0.9995f, qlerp, qslerp);
   }
 }
diff --git a/thirdparty/embree/common/math/transcendental.h b/thirdparty/embree/common/math/transcendental.h
index fd16c26e81..daf9dd96d2 100644
--- a/thirdparty/embree/common/math/transcendental.h
+++ b/thirdparty/embree/common/math/transcendental.h
@@ -27,7 +27,7 @@ __forceinline T sin(const T &v)
   // Reduced range version of x
   auto x = v - kReal * piOverTwoVec;
   auto kMod4 = k & 3;
-  auto sinUseCos = (kMod4 == 1 | kMod4 == 3);
+  auto sinUseCos = (kMod4 == 1) | (kMod4 == 3);
   auto flipSign = (kMod4 > 1);
 
   // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2,
@@ -76,8 +76,8 @@ __forceinline T cos(const T &v)
   auto x = v - kReal * piOverTwoVec;
 
   auto kMod4 = k & 3;
-  auto cosUseCos = (kMod4 == 0 | kMod4 == 2);
-  auto flipSign = (kMod4 == 1 | kMod4 == 2);
+  auto cosUseCos = (kMod4 == 0) | (kMod4 == 2);
+  auto flipSign = (kMod4 == 1) | (kMod4 == 2);
 
   const float sinC2  = -0.16666667163372039794921875;
   const float sinC4  = +8.333347737789154052734375e-3;
diff --git a/thirdparty/embree/common/math/vec2.h b/thirdparty/embree/common/math/vec2.h
index d62aef51f3..f6d98ffa0d 100644
--- a/thirdparty/embree/common/math/vec2.h
+++ b/thirdparty/embree/common/math/vec2.h
@@ -144,7 +144,7 @@ namespace embree
   }
   
   ////////////////////////////////////////////////////////////////////////////////
-  /// Euclidian Space Operators
+  /// Euclidean Space Operators
   ////////////////////////////////////////////////////////////////////////////////
 
   template<typename T> __forceinline T       dot      ( const Vec2<T>& a, const Vec2<T>& b ) { return madd(a.x,b.x,a.y*b.y); }
@@ -205,11 +205,11 @@ namespace embree
 
 #include "vec2fa.h"
 
-#if defined __SSE__
+#if defined(__SSE__) || defined(__ARM_NEON)
 #include "../simd/sse.h"
 #endif
 
-#if defined __AVX__
+#if defined(__AVX__)
 #include "../simd/avx.h"
 #endif
 
@@ -221,7 +221,7 @@ namespace embree
 {
   template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
 
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
   template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
 #endif
 
diff --git a/thirdparty/embree/common/math/vec2fa.h b/thirdparty/embree/common/math/vec2fa.h
index a51fb68fd0..4f222894c2 100644
--- a/thirdparty/embree/common/math/vec2fa.h
+++ b/thirdparty/embree/common/math/vec2fa.h
@@ -97,6 +97,12 @@ namespace embree
 
   __forceinline Vec2fa rcp  ( const Vec2fa& a )
   {
+#if defined(__aarch64__)
+        __m128 reciprocal = _mm_rcp_ps(a.m128);
+        reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
+        reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
+        return (const Vec2fa)reciprocal;
+#else
 #if defined(__AVX512VL__)
     const Vec2fa r = _mm_rcp14_ps(a.m128);
 #else
@@ -104,13 +110,15 @@ namespace embree
 #endif
 
 #if defined(__AVX2__)
-    const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f)));
+    const Vec2fa h_n = _mm_fnmadd_ps(a, r, vfloat4(1.0));  // First, compute 1 - a * r (which will be very close to 0)
+    const Vec2fa res = _mm_fmadd_ps(r, h_n, r);            // Then compute r + r * h_n
 #else
-    const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a)));
-    //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+    const Vec2fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r));  // First, compute 1 - a * r (which will be very close to 0)
+    const Vec2fa res = _mm_add_ps(r,_mm_mul_ps(r, h_n));             // Then compute r + r * h_n  
 #endif
 
     return res;
+#endif  //defined(__aarch64__)
   }
 
   __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); }
@@ -118,12 +126,21 @@ namespace embree
 
   __forceinline Vec2fa rsqrt( const Vec2fa& a )
   {
+#if defined(__aarch64__)
+        __m128 r = _mm_rsqrt_ps(a.m128);
+        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+        return r;
+#else
+
 #if defined(__AVX512VL__)
     __m128 r = _mm_rsqrt14_ps(a.m128);
 #else
     __m128 r = _mm_rsqrt_ps(a.m128);
 #endif
     return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+
+#endif
   }
 
   __forceinline Vec2fa zero_fix(const Vec2fa& a) {
@@ -156,7 +173,7 @@ namespace embree
   __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); }
   __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); }
 
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
     __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
       const vint4 ai = _mm_castps_si128(a);
       const vint4 bi = _mm_castps_si128(b);
@@ -165,7 +182,7 @@ namespace embree
     }
 #endif
 
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
     __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
       const vint4 ai = _mm_castps_si128(a);
       const vint4 bi = _mm_castps_si128(b);
@@ -227,7 +244,7 @@ namespace embree
   __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; }
 
   ////////////////////////////////////////////////////////////////////////////////
-  /// Euclidian Space Operators
+  /// Euclidean Space Operators
   ////////////////////////////////////////////////////////////////////////////////
 
 #if defined(__SSE4_1__)
diff --git a/thirdparty/embree/common/math/vec3.h b/thirdparty/embree/common/math/vec3.h
index ce94eff327..254f6c4011 100644
--- a/thirdparty/embree/common/math/vec3.h
+++ b/thirdparty/embree/common/math/vec3.h
@@ -197,7 +197,7 @@ namespace embree
   template<typename T> __forceinline Vec3<bool> ge_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x>=b.x,a.y>=b.y,a.z>=b.z); }
 
   ////////////////////////////////////////////////////////////////////////////////
-  /// Euclidian Space Operators
+  /// Euclidean Space Operators
   ////////////////////////////////////////////////////////////////////////////////
 
   template<typename T> __forceinline T       sqr      ( const Vec3<T>& a )                   { return dot(a,a); }
@@ -207,7 +207,6 @@ namespace embree
   template<typename T> __forceinline Vec3<T> normalize( const Vec3<T>& a )                   { return a*rsqrt(sqr(a)); }
   template<typename T> __forceinline T       distance ( const Vec3<T>& a, const Vec3<T>& b ) { return length(a-b); }
   template<typename T> __forceinline Vec3<T> cross    ( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(msub(a.y,b.z,a.z*b.y), msub(a.z,b.x,a.x*b.z), msub(a.x,b.y,a.y*b.x)); }
-
   template<typename T> __forceinline Vec3<T> stable_triangle_normal( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c )
   {
     const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x;
@@ -266,11 +265,11 @@ namespace embree
 /// SSE / AVX / MIC specializations
 ////////////////////////////////////////////////////////////////////////////////
 
-#if defined __SSE__
+#if defined(__SSE__) || defined(__ARM_NEON)
 #include "../simd/sse.h"
 #endif
 
-#if defined __AVX__
+#if defined(__AVX__)
 #include "../simd/avx.h"
 #endif
 
@@ -291,14 +290,14 @@ namespace embree
   template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
     x = a.x; y = a.y; z = a.z;
   }
-#elif defined(__SSE__)
+#elif defined(__SSE__) || defined(__ARM_NEON)
   template<>
   __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
     const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v);
   }
 #endif
 
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
   template<>
   __forceinline Vec3<vfloat4> broadcast<vfloat4,vfloat4>(const Vec3<vfloat4>& a, const size_t k) {
     return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]));
diff --git a/thirdparty/embree/common/math/vec3fa.h b/thirdparty/embree/common/math/vec3fa.h
index 586039741d..8564cf6d10 100644
--- a/thirdparty/embree/common/math/vec3fa.h
+++ b/thirdparty/embree/common/math/vec3fa.h
@@ -55,7 +55,13 @@ namespace embree
     ////////////////////////////////////////////////////////////////////////////////
 
     static __forceinline Vec3fa load( const void* const a ) {
+#if defined(__aarch64__)
+        __m128 t = _mm_load_ps((float*)a);
+        t[3] = 0.0f;
+        return Vec3fa(t);
+#else
       return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
+#endif
     }
 
     static __forceinline Vec3fa loadu( const void* const a ) {
@@ -89,12 +95,20 @@ namespace embree
 
   __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; }
   __forceinline Vec3fa operator -( const Vec3fa& a ) {
+#if defined(__aarch64__)
+    return vnegq_f32(a.m128);
+#else
     const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
     return _mm_xor_ps(a.m128, mask);
+#endif
   }
   __forceinline Vec3fa abs  ( const Vec3fa& a ) {
+#if defined(__aarch64__)
+    return _mm_abs_ps(a.m128);
+#else
     const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
     return _mm_and_ps(a.m128, mask);
+#endif
   }
   __forceinline Vec3fa sign ( const Vec3fa& a ) {
     return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128));
@@ -102,6 +116,10 @@ namespace embree
 
   __forceinline Vec3fa rcp  ( const Vec3fa& a )
   {
+#if defined(__aarch64__)
+  return vdivq_f32(vdupq_n_f32(1.0f),a.m128);
+#else
+
 #if defined(__AVX512VL__)
     const Vec3fa r = _mm_rcp14_ps(a.m128);
 #else
@@ -109,13 +127,15 @@ namespace embree
 #endif
 
 #if defined(__AVX2__)
-    const Vec3fa res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f)));
+    const Vec3fa h_n = _mm_fnmadd_ps(a.m128, r.m128, vfloat4(1.0));  // First, compute 1 - a * r (which will be very close to 0)
+    const Vec3fa res = _mm_fmadd_ps(r.m128, h_n.m128, r.m128);       // Then compute r + r * h_n
 #else
-    const Vec3fa res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128)));
-    //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+    const Vec3fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a.m128, r.m128));  // First, compute 1 - a * r (which will be very close to 0)
+    const Vec3fa res = _mm_add_ps(r.m128,_mm_mul_ps(r.m128, h_n.m128));        // Then compute r + r * h_n  
 #endif
 
     return res;
+#endif  //defined(__aarch64__)
   }
 
   __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); }
@@ -123,12 +143,20 @@ namespace embree
 
   __forceinline Vec3fa rsqrt( const Vec3fa& a )
   {
+#if defined(__aarch64__)
+        __m128 r = _mm_rsqrt_ps(a.m128);
+        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+        return r;
+#else
+
 #if defined(__AVX512VL__)
     __m128 r = _mm_rsqrt14_ps(a.m128);
 #else
     __m128 r = _mm_rsqrt_ps(a.m128);
 #endif
     return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#endif
   }
 
   __forceinline Vec3fa zero_fix(const Vec3fa& a) {
@@ -161,7 +189,7 @@ namespace embree
   __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); }
   __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); }
 
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
     __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) {
       const vint4 ai = _mm_castps_si128(a.m128);
       const vint4 bi = _mm_castps_si128(b.m128);
@@ -170,7 +198,7 @@ namespace embree
     }
 #endif
 
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
     __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) {
       const vint4 ai = _mm_castps_si128(a.m128);
       const vint4 bi = _mm_castps_si128(b.m128);
@@ -187,16 +215,16 @@ namespace embree
   /// Ternary Operators
   ////////////////////////////////////////////////////////////////////////////////
 
-#if defined(__AVX2__)
+#if defined(__AVX2__) || defined(__ARM_NEON)
   __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
   __forceinline Vec3fa msub  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
   __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
   __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
 #else
   __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; }
-  __forceinline Vec3fa msub  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
   __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;}
   __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; }
+  __forceinline Vec3fa msub  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
 #endif
 
   __forceinline Vec3fa madd  ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); }
@@ -218,8 +246,26 @@ namespace embree
   ////////////////////////////////////////////////////////////////////////////////
   /// Reductions
   ////////////////////////////////////////////////////////////////////////////////
+#if defined(__aarch64__)
+  __forceinline float reduce_add(const Vec3fa& v) {
+    float32x4_t t = v.m128;
+    t[3] = 0.0f;
+    return vaddvq_f32(t);
+  }
 
-  __forceinline float reduce_add(const Vec3fa& v) { 
+  __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
+  __forceinline float reduce_min(const Vec3fa& v) {
+    float32x4_t t = v.m128;
+      t[3] = t[2];
+    return vminvq_f32(t);
+  }
+  __forceinline float reduce_max(const Vec3fa& v) {
+    float32x4_t t = v.m128;
+      t[3] = t[2];
+    return vmaxvq_f32(t);
+  }
+#else
+  __forceinline float reduce_add(const Vec3fa& v) {
     const vfloat4 a(v.m128);
     const vfloat4 b = shuffle<1>(a);
     const vfloat4 c = shuffle<2>(a);
@@ -229,6 +275,7 @@ namespace embree
   __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
   __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); }
   __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); }
+#endif
 
   ////////////////////////////////////////////////////////////////////////////////
   /// Comparison Operators
@@ -241,8 +288,13 @@ namespace embree
   __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
   __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
   __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); }
-  __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnle_ps(a.m128, b.m128); }
-  __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); }
+ #if defined(__aarch64__)
+  __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); }
+  __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); }
+#else
+  __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); }
+  __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); }
+#endif
 
   __forceinline bool isvalid ( const Vec3fa& v ) {
     return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE)));
@@ -261,7 +313,7 @@ namespace embree
   }
 
   ////////////////////////////////////////////////////////////////////////////////
-  /// Euclidian Space Operators
+  /// Euclidean Space Operators
   ////////////////////////////////////////////////////////////////////////////////
 
 #if defined(__SSE4_1__)
@@ -335,7 +387,11 @@ namespace embree
   /// Rounding Functions
   ////////////////////////////////////////////////////////////////////////////////
 
-#if defined (__SSE4_1__)
+#if defined(__aarch64__)
+  __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); }
+  __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); }
+  __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); }
+#elif defined (__SSE4_1__)
   __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
   __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF    ); }
   __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF    ); }
@@ -393,8 +449,10 @@ namespace embree
 
     __forceinline Vec3fx( const Vec3fa& other, const int      a1) { m128 = other.m128; a = a1; }
     __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; }
-    __forceinline Vec3fx( const Vec3fa& other, const float    w1) {      
-#if defined (__SSE4_1__)
+    __forceinline Vec3fx( const Vec3fa& other, const float    w1) {
+#if defined (__aarch64__)
+      m128 = other.m128; m128[3] = w1;
+#elif defined (__SSE4_1__)
       m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4);
 #else
       const vint4 mask(-1,-1,-1,0);
@@ -526,7 +584,7 @@ namespace embree
   __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); }
   __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); }
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(__aarch64__)
     __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) {
       const vint4 ai = _mm_castps_si128(a.m128);
       const vint4 bi = _mm_castps_si128(b.m128);
@@ -535,7 +593,7 @@ namespace embree
     }
 #endif
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(__aarch64__)
     __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) {
       const vint4 ai = _mm_castps_si128(a.m128);
       const vint4 bi = _mm_castps_si128(b.m128);
@@ -626,7 +684,7 @@ namespace embree
   }
 
   ////////////////////////////////////////////////////////////////////////////////
-  /// Euclidian Space Operators
+  /// Euclidean Space Operators
   ////////////////////////////////////////////////////////////////////////////////
 
 #if defined(__SSE4_1__)
diff --git a/thirdparty/embree/common/math/vec3ia.h b/thirdparty/embree/common/math/vec3ia.h
index 694804c40d..d4cc3125cd 100644
--- a/thirdparty/embree/common/math/vec3ia.h
+++ b/thirdparty/embree/common/math/vec3ia.h
@@ -65,7 +65,9 @@ namespace embree
 
   __forceinline Vec3ia operator +( const Vec3ia& a ) { return a; }
   __forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); }
-#if defined(__SSSE3__)
+#if (defined(__aarch64__))
+  __forceinline Vec3ia abs       ( const Vec3ia& a ) { return vabsq_s32(a.m128); }
+#elif defined(__SSSE3__)
   __forceinline Vec3ia abs       ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); }
 #endif
 
@@ -81,7 +83,7 @@ namespace embree
   __forceinline Vec3ia operator -( const Vec3ia& a, const int     b ) { return a-Vec3ia(b); }
   __forceinline Vec3ia operator -( const int     a, const Vec3ia& b ) { return Vec3ia(a)-b; }
 
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
   __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); }
   __forceinline Vec3ia operator *( const Vec3ia& a, const int     b ) { return a * Vec3ia(b); }
   __forceinline Vec3ia operator *( const int     a, const Vec3ia& b ) { return Vec3ia(a) * b; }
@@ -116,7 +118,7 @@ namespace embree
   __forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { return a = a - b; }
   __forceinline Vec3ia& operator -=( Vec3ia& a, const int&   b ) { return a = a - b; }
   
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
   __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; }
   __forceinline Vec3ia& operator *=( Vec3ia& a, const int&    b ) { return a = a * b; }
 #endif
@@ -127,18 +129,38 @@ namespace embree
   __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; }
   __forceinline Vec3ia& operator |=( Vec3ia& a, const int&    b ) { return a = a | b; }
   
+#if !defined(__ARM_NEON)
   __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; }
   __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; }
+#endif
 
   ////////////////////////////////////////////////////////////////////////////////
-  /// Reductions
+  /// Select
   ////////////////////////////////////////////////////////////////////////////////
 
+  __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) {
+#if defined(__aarch64__) || defined(__SSE4_1__)
+    return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
+#else
+    return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f)); 
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+#if defined(__aarch64__)
+  __forceinline int reduce_add(const Vec3ia& v) { return vaddvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0))); }
+  __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; }
+  __forceinline int reduce_min(const Vec3ia& v) { return vminvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0x7FFFFFFF))); }
+  __forceinline int reduce_max(const Vec3ia& v) { return vmaxvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0x80000000))); }
+#else
   __forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; }
   __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; }
   __forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); }
   __forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); }
-
+#endif
+  
   ////////////////////////////////////////////////////////////////////////////////
   /// Comparison Operators
   ////////////////////////////////////////////////////////////////////////////////
@@ -156,19 +178,7 @@ namespace embree
   __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.m128, b.m128)); }
   __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.m128, b.m128)); }
 
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Select
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) {
-#if defined(__SSE4_1__)
-    return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
-#else
-    return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f)); 
-#endif
-  }
-
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
   __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); }
   __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); }
 #else
diff --git a/thirdparty/embree/common/math/vec4.h b/thirdparty/embree/common/math/vec4.h
index 0ed107928a..10c53f47b4 100644
--- a/thirdparty/embree/common/math/vec4.h
+++ b/thirdparty/embree/common/math/vec4.h
@@ -149,7 +149,7 @@ namespace embree
   }
 
   ////////////////////////////////////////////////////////////////////////////////
-  /// Euclidian Space Operators
+  /// Euclidean Space Operators
   ////////////////////////////////////////////////////////////////////////////////
 
   template<typename T> __forceinline T       dot      ( const Vec4<T>& a, const Vec4<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,madd(a.z,b.z,a.w*b.w))); }
@@ -205,7 +205,7 @@ namespace embree
 /// SSE / AVX / MIC specializations
 ////////////////////////////////////////////////////////////////////////////////
 
-#if defined __SSE__
+#if defined(__SSE__) || defined(__ARM_NEON)
 #include "../simd/sse.h"
 #endif
 
@@ -225,7 +225,7 @@ namespace embree
   template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
     x = a.x; y = a.y; z = a.z; w = a.w;
   }
-#elif defined(__SSE__)
+#elif defined(__SSE__) || defined(__ARM_NEON)
   template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
     const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v);
   }
diff --git a/thirdparty/embree/common/simd/arm/avx2neon.h b/thirdparty/embree/common/simd/arm/avx2neon.h
new file mode 100644
index 0000000000..dd321d3d64
--- /dev/null
+++ b/thirdparty/embree/common/simd/arm/avx2neon.h
@@ -0,0 +1,1196 @@
+#pragma once
+
+#if !defined(__aarch64__)
+#error "avx2neon is only supported for AARCH64"
+#endif
+
+#include "sse2neon.h"
+
+#define AVX2NEON_ABI static inline  __attribute__((always_inline))
+
+
+struct __m256 {
+    __m128 lo,hi;
+    __m256() {}
+};
+
+
+
+
+struct __m256i {
+    __m128i lo,hi;
+    explicit __m256i(const __m256 a) : lo(__m128i(a.lo)),hi(__m128i(a.hi)) {}
+    operator __m256() const {__m256 res; res.lo = __m128(lo);res.hi = __m128(hi); return res;}
+    __m256i() {}
+};
+
+
+
+
+struct __m256d {
+    float64x2_t lo,hi;
+    __m256d() {}
+    __m256d(const __m256& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {}
+    __m256d(const __m256i& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {}
+};
+
+#define UNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a) {type res;res.lo=basic_func(a.lo);res.hi=basic_func(a.hi);return res;}
+
+
+#define BINARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=basic_func(a.lo,b.lo);res.hi=basic_func(a.hi,b.hi);return res;}
+#define BINARY_AVX_OP_CAST(type,func,basic_func,bdst,bsrc) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=bdst(basic_func(bsrc(a.lo),bsrc(b.lo)));res.hi=bdst(basic_func(bsrc(a.hi),bsrc(b.hi)));return res;}
+
+#define TERNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b,const type& c) {type res;res.lo=basic_func(a.lo,b.lo,c.lo);res.hi=basic_func(a.hi,b.hi,c.hi);return res;}
+
+
+#define CAST_SIMD_TYPE(to,name,from,basic_dst) AVX2NEON_ABI to name(const from& a) { to res; res.lo = basic_dst(a.lo); res.hi=basic_dst(a.hi); return res;}
+
+
+
+#define _mm_stream_load_si128 _mm_load_si128
+#define _mm256_stream_load_si256 _mm256_load_si256
+
+
+AVX2NEON_ABI
+__m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8)
+{
+    __m128 af = _mm_castsi128_ps(a);
+    __m128 bf = _mm_castsi128_ps(b);
+    __m128 blendf = _mm_blend_ps(af, bf, imm8);
+    return _mm_castps_si128(blendf);
+}
+
+AVX2NEON_ABI
+int _mm_movemask_popcnt(__m128 a)
+{
+    return __builtin_popcount(_mm_movemask_ps(a));
+}
+
+AVX2NEON_ABI
+__m128 _mm_maskload_ps (float const * mem_addr, __m128i mask)
+{
+    float32x4_t res;
+    uint32x4_t mask_u32 = vreinterpretq_u32_m128i(mask);
+    for (int i=0;i<4;i++) {
+        if (mask_u32[i] & 0x80000000) res[i] = mem_addr[i]; else res[i] = 0;
+    }
+    return vreinterpretq_m128_f32(res);
+}
+
+AVX2NEON_ABI
+void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a)
+{
+    float32x4_t a_f32 = vreinterpretq_f32_m128(a);
+    uint32x4_t mask_u32 = vreinterpretq_u32_m128i(mask);
+    for (int i=0;i<4;i++) {
+        if (mask_u32[i] & 0x80000000) mem_addr[i] = a_f32[i];
+    }
+}
+
+AVX2NEON_ABI
+void _mm_maskstore_epi32 (int * mem_addr, __m128i mask, __m128i a)
+{
+    uint32x4_t mask_u32 = vreinterpretq_u32_m128i(mask);
+    int32x4_t a_s32 = vreinterpretq_s32_m128i(a);
+    for (int i=0;i<4;i++) {
+        if (mask_u32[i] & 0x80000000) mem_addr[i] = a_s32[i];
+    }
+}
+
+
+#define _mm_fmadd_ss _mm_fmadd_ps
+#define _mm_fmsub_ss _mm_fmsub_ps
+#define _mm_fnmsub_ss _mm_fnmsub_ps
+#define _mm_fnmadd_ss _mm_fnmadd_ps
+
+template<int code>
+AVX2NEON_ABI float32x4_t dpps_neon(const float32x4_t& a,const float32x4_t& b)
+{
+    float v;
+    v = 0;
+    v += (code & 0x10) ? a[0]*b[0] : 0;
+    v += (code & 0x20) ? a[1]*b[1] : 0;
+    v += (code & 0x40) ? a[2]*b[2] : 0;
+    v += (code & 0x80) ? a[3]*b[3] : 0;
+    float32x4_t res;
+    res[0] = (code & 0x1) ? v : 0;
+    res[1] = (code & 0x2) ? v : 0;
+    res[2] = (code & 0x4) ? v : 0;
+    res[3] = (code & 0x8) ? v : 0;
+    return res;
+}
+
+template<>
+inline float32x4_t dpps_neon<0x7f>(const float32x4_t& a,const float32x4_t& b)
+{
+    float v;
+    float32x4_t m = _mm_mul_ps(a,b);
+    m[3] = 0;
+    v = vaddvq_f32(m);
+    return _mm_set1_ps(v);
+}
+
+template<>
+inline float32x4_t dpps_neon<0xff>(const float32x4_t& a,const float32x4_t& b)
+{
+    float v;
+    float32x4_t m = _mm_mul_ps(a,b);
+    v = vaddvq_f32(m);
+    return _mm_set1_ps(v);
+}
+
+#define _mm_dp_ps(a,b,c) dpps_neon<c>((a),(b))
+
+
+AVX2NEON_ABI
+__m128 _mm_permutevar_ps (__m128 a, __m128i b)
+{
+    uint32x4_t b_u32 = vreinterpretq_u32_m128i(b);
+    float32x4_t x;
+    for (int i=0;i<4;i++)
+    {
+        x[i] = a[b_u32[i]];
+    }
+    return vreinterpretq_m128_f32(x);
+}
+
+AVX2NEON_ABI
+__m256i _mm256_setzero_si256()
+{
+    __m256i res;
+    res.lo = res.hi = vdupq_n_s32(0);
+    return res;
+}
+
+AVX2NEON_ABI
+__m256 _mm256_setzero_ps()
+{
+    __m256 res;
+    res.lo = res.hi = vdupq_n_f32(0.0f);
+    return res;
+}
+
+AVX2NEON_ABI
+__m256i _mm256_undefined_si256()
+{
+    return _mm256_setzero_si256();
+}
+
+AVX2NEON_ABI
+__m256 _mm256_undefined_ps()
+{
+    return _mm256_setzero_ps();
+}
+
+CAST_SIMD_TYPE(__m256d, _mm256_castps_pd,    __m256,  float64x2_t)
+CAST_SIMD_TYPE(__m256i, _mm256_castps_si256, __m256,  __m128i)
+CAST_SIMD_TYPE(__m256,  _mm256_castsi256_ps, __m256i, __m128)
+CAST_SIMD_TYPE(__m256,  _mm256_castpd_ps ,   __m256d, __m128)
+CAST_SIMD_TYPE(__m256d, _mm256_castsi256_pd, __m256i, float64x2_t)
+CAST_SIMD_TYPE(__m256i, _mm256_castpd_si256, __m256d, __m128i)
+
+
+
+
+AVX2NEON_ABI
+__m128 _mm256_castps256_ps128 (__m256 a)
+{
+    return a.lo;
+}
+
+AVX2NEON_ABI
+__m256i _mm256_castsi128_si256 (__m128i a)
+{
+    __m256i res;
+    res.lo = a ;
+    res.hi = vdupq_n_s32(0);
+    return res;
+}
+
+AVX2NEON_ABI
+__m128i _mm256_castsi256_si128 (__m256i a)
+{
+    return a.lo;
+}
+
+AVX2NEON_ABI
+__m256 _mm256_castps128_ps256 (__m128 a)
+{
+    __m256 res;
+    res.lo = a;
+    res.hi = vdupq_n_f32(0);
+    return res;
+}
+
+
+AVX2NEON_ABI
+__m256 _mm256_broadcast_ss (float const * mem_addr)
+{
+    __m256 res;
+    res.lo = res.hi = vdupq_n_f32(*mem_addr);
+    return res;
+}
+
+
+AVX2NEON_ABI
+__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
+{
+    __m256i res;
+    res.lo = _mm_set_epi32(e3,e2,e1,e0);
+    res.hi = _mm_set_epi32(e7,e6,e5,e4);
+    return res;
+
+}
+
+AVX2NEON_ABI
+__m256i _mm256_set1_epi32 (int a)
+{
+    __m256i res;
+    res.lo = res.hi = vdupq_n_s32(a);
+    return res;
+}
+AVX2NEON_ABI
+__m256i _mm256_set1_epi8 (int a)
+{
+    __m256i res;
+    res.lo = res.hi = vdupq_n_s8(a);
+    return res;
+}
+AVX2NEON_ABI
+__m256i _mm256_set1_epi16 (int a)
+{
+    __m256i res;
+    res.lo = res.hi = vdupq_n_s16(a);
+    return res;
+}
+
+
+
+
+AVX2NEON_ABI
+int _mm256_movemask_ps(const __m256& v)
+{
+    return (_mm_movemask_ps(v.hi) << 4) | _mm_movemask_ps(v.lo);
+}
+
+template<int imm8>
+AVX2NEON_ABI
+__m256 __mm256_permute_ps (const __m256& a)
+{
+    __m256 res;
+    res.lo = _mm_shuffle_ps(a.lo,a.lo,imm8);
+    res.hi = _mm_shuffle_ps(a.hi,a.hi,imm8);
+    return res;
+
+}
+
+#define _mm256_permute_ps(a,c) __mm256_permute_ps<c>(a)
+
+
+template<int imm8>
+AVX2NEON_ABI
+__m256 __mm256_shuffle_ps (const __m256 a,const __m256& b)
+{
+    __m256 res;
+    res.lo = _mm_shuffle_ps(a.lo,b.lo,imm8);
+    res.hi = _mm_shuffle_ps(a.hi,b.hi,imm8);
+    return res;
+
+}
+
+template<int imm8>
+AVX2NEON_ABI
+__m256i __mm256_shuffle_epi32 (const __m256i a)
+{
+    __m256i res;
+    res.lo = _mm_shuffle_epi32(a.lo,imm8);
+    res.hi = _mm_shuffle_epi32(a.hi,imm8);
+    return res;
+
+}
+
+template<int imm8>
+AVX2NEON_ABI
+__m256i __mm256_srli_si256 (__m256i a)
+{
+    __m256i res;
+    res.lo = _mm_srli_si128(a.lo,imm8);
+    res.hi = _mm_srli_si128(a.hi,imm8);
+    return res;
+}
+
+template<int imm8>
+AVX2NEON_ABI
+__m256i __mm256_slli_si256 (__m256i a)
+{
+    __m256i res;
+    res.lo = _mm_slli_si128(a.lo,imm8);
+    res.hi = _mm_slli_si128(a.hi,imm8);
+    return res;
+}
+
+
+#define _mm256_srli_si256(a,b) __mm256_srli_si256<b>(a)
+#define _mm256_slli_si256(a,b) __mm256_slli_si256<b>(a)
+
+
+
+#define _mm256_shuffle_ps(a,b,c) __mm256_shuffle_ps<c>(a,b)
+#define _mm256_shuffle_epi32(a,c) __mm256_shuffle_epi32<c>(a)
+
+
+AVX2NEON_ABI
+__m256i _mm256_set1_epi64x (long long a)
+{
+    __m256i res;
+    int64x2_t t = vdupq_n_s64(a);
+    res.lo = res.hi = __m128i(t);
+    return res;
+}
+
+
+AVX2NEON_ABI
+__m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8)
+{
+    __m256 res;
+    __m128 tmp;
+    switch (imm8 & 0x7)
+    {
+        case 0: tmp = a.lo; break;
+        case 1: tmp = a.hi; break;
+        case 2: tmp = b.lo; break;
+        case 3: tmp = b.hi; break;
+    }
+    if (imm8 & 0x8)
+        tmp = _mm_setzero_ps();
+
+
+
+    res.lo = tmp;
+    imm8 >>= 4;
+
+    switch (imm8 & 0x7)
+    {
+        case 0: tmp = a.lo; break;
+        case 1: tmp = a.hi; break;
+        case 2: tmp = b.lo; break;
+        case 3: tmp = b.hi; break;
+    }
+    if (imm8 & 0x8)
+        tmp = _mm_setzero_ps();
+
+    res.hi = tmp;
+
+    return res;
+}
+
+AVX2NEON_ABI
+__m256 _mm256_moveldup_ps (__m256 a)
+{
+    __m256 res;
+    res.lo = _mm_moveldup_ps(a.lo);
+    res.hi = _mm_moveldup_ps(a.hi);
+    return res;
+}
+
+AVX2NEON_ABI
+__m256 _mm256_movehdup_ps (__m256 a)
+{
+    __m256 res;
+    res.lo = _mm_movehdup_ps(a.lo);
+    res.hi = _mm_movehdup_ps(a.hi);
+    return res;
+}
+
+AVX2NEON_ABI
+__m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8)
+{
+    __m256 res = a;
+    if (imm8 & 1) res.hi = b;
+    else res.lo = b;
+    return res;
+}
+
+
+AVX2NEON_ABI
+__m128 _mm256_extractf128_ps (__m256 a, const int imm8)
+{
+    if (imm8 & 1) return a.hi;
+    return a.lo;
+}
+
+
+AVX2NEON_ABI
+__m256d _mm256_movedup_pd (__m256d a)
+{
+    __m256d res;
+    res.lo = _mm_movedup_pd(a.lo);
+    res.hi = _mm_movedup_pd(a.hi);
+    return res;
+}
+
+AVX2NEON_ABI
+__m256i _mm256_abs_epi32(__m256i a)
+{
+   __m256i res;
+   res.lo = vabsq_s32(a.lo);
+   res.hi = vabsq_s32(a.hi);
+   return res;
+}
+
+UNARY_AVX_OP(__m256,_mm256_sqrt_ps,_mm_sqrt_ps)
+UNARY_AVX_OP(__m256,_mm256_rsqrt_ps,_mm_rsqrt_ps)
+UNARY_AVX_OP(__m256,_mm256_rcp_ps,_mm_rcp_ps)
+UNARY_AVX_OP(__m256,_mm256_floor_ps,vrndmq_f32)
+UNARY_AVX_OP(__m256,_mm256_ceil_ps,vrndpq_f32)
+UNARY_AVX_OP(__m256i,_mm256_abs_epi16,_mm_abs_epi16)
+
+
+BINARY_AVX_OP(__m256i,_mm256_add_epi8,_mm_add_epi8)
+BINARY_AVX_OP(__m256i,_mm256_adds_epi8,_mm_adds_epi8)
+
+BINARY_AVX_OP(__m256i,_mm256_hadd_epi32,_mm_hadd_epi32)
+BINARY_AVX_OP(__m256i,_mm256_add_epi32,_mm_add_epi32)
+BINARY_AVX_OP(__m256i,_mm256_sub_epi32,_mm_sub_epi32)
+BINARY_AVX_OP(__m256i,_mm256_mullo_epi32,_mm_mullo_epi32)
+
+BINARY_AVX_OP(__m256i,_mm256_min_epi32,_mm_min_epi32)
+BINARY_AVX_OP(__m256i,_mm256_max_epi32,_mm_max_epi32)
+BINARY_AVX_OP(__m256i,_mm256_min_epi16,_mm_min_epi16)
+BINARY_AVX_OP(__m256i,_mm256_max_epi16,_mm_max_epi16)
+BINARY_AVX_OP(__m256i,_mm256_min_epi8,_mm_min_epi8)
+BINARY_AVX_OP(__m256i,_mm256_max_epi8,_mm_max_epi8)
+BINARY_AVX_OP(__m256i,_mm256_min_epu16,_mm_min_epu16)
+BINARY_AVX_OP(__m256i,_mm256_max_epu16,_mm_max_epu16)
+BINARY_AVX_OP(__m256i,_mm256_min_epu8,_mm_min_epu8)
+BINARY_AVX_OP(__m256i,_mm256_max_epu8,_mm_max_epu8)
+BINARY_AVX_OP(__m256i,_mm256_sign_epi16,_mm_sign_epi16)
+
+
+BINARY_AVX_OP_CAST(__m256i,_mm256_min_epu32,vminq_u32,__m128i,uint32x4_t)
+BINARY_AVX_OP_CAST(__m256i,_mm256_max_epu32,vmaxq_u32,__m128i,uint32x4_t)
+
+BINARY_AVX_OP(__m256,_mm256_min_ps,_mm_min_ps)
+BINARY_AVX_OP(__m256,_mm256_max_ps,_mm_max_ps)
+
+BINARY_AVX_OP(__m256,_mm256_add_ps,_mm_add_ps)
+BINARY_AVX_OP(__m256,_mm256_mul_ps,_mm_mul_ps)
+BINARY_AVX_OP(__m256,_mm256_sub_ps,_mm_sub_ps)
+BINARY_AVX_OP(__m256,_mm256_div_ps,_mm_div_ps)
+
+BINARY_AVX_OP(__m256,_mm256_and_ps,_mm_and_ps)
+BINARY_AVX_OP(__m256,_mm256_andnot_ps,_mm_andnot_ps)
+BINARY_AVX_OP(__m256,_mm256_or_ps,_mm_or_ps)
+BINARY_AVX_OP(__m256,_mm256_xor_ps,_mm_xor_ps)
+
+BINARY_AVX_OP_CAST(__m256d,_mm256_and_pd,vandq_s64,float64x2_t,int64x2_t)
+BINARY_AVX_OP_CAST(__m256d,_mm256_or_pd,vorrq_s64,float64x2_t,int64x2_t)
+BINARY_AVX_OP_CAST(__m256d,_mm256_xor_pd,veorq_s64,float64x2_t,int64x2_t)
+
+
+
+BINARY_AVX_OP(__m256i,_mm256_and_si256,_mm_and_si128)
+BINARY_AVX_OP(__m256i,_mm256_andnot_si256,_mm_andnot_si128)
+BINARY_AVX_OP(__m256i,_mm256_or_si256,_mm_or_si128)
+BINARY_AVX_OP(__m256i,_mm256_xor_si256,_mm_xor_si128)
+
+
+BINARY_AVX_OP(__m256,_mm256_unpackhi_ps,_mm_unpackhi_ps)
+BINARY_AVX_OP(__m256,_mm256_unpacklo_ps,_mm_unpacklo_ps)
+TERNARY_AVX_OP(__m256,_mm256_blendv_ps,_mm_blendv_ps)
+TERNARY_AVX_OP(__m256i,_mm256_blendv_epi8,_mm_blendv_epi8)
+
+
+TERNARY_AVX_OP(__m256,_mm256_fmadd_ps,_mm_fmadd_ps)
+TERNARY_AVX_OP(__m256,_mm256_fnmadd_ps,_mm_fnmadd_ps)
+TERNARY_AVX_OP(__m256,_mm256_fmsub_ps,_mm_fmsub_ps)
+TERNARY_AVX_OP(__m256,_mm256_fnmsub_ps,_mm_fnmsub_ps)
+
+
+
+BINARY_AVX_OP(__m256i,_mm256_packs_epi32,_mm_packs_epi32)
+BINARY_AVX_OP(__m256i,_mm256_packs_epi16,_mm_packs_epi16)
+BINARY_AVX_OP(__m256i,_mm256_packus_epi32,_mm_packus_epi32)
+BINARY_AVX_OP(__m256i,_mm256_packus_epi16,_mm_packus_epi16)
+
+
+BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi64,_mm_unpackhi_epi64)
+BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi32,_mm_unpackhi_epi32)
+BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi16,_mm_unpackhi_epi16)
+BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi8,_mm_unpackhi_epi8)
+
+BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi64,_mm_unpacklo_epi64)
+BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi32,_mm_unpacklo_epi32)
+BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi16,_mm_unpacklo_epi16)
+BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi8,_mm_unpacklo_epi8)
+
+BINARY_AVX_OP(__m256i,_mm256_mulhrs_epi16,_mm_mulhrs_epi16)
+BINARY_AVX_OP(__m256i,_mm256_mulhi_epu16,_mm_mulhi_epu16)
+BINARY_AVX_OP(__m256i,_mm256_mulhi_epi16,_mm_mulhi_epi16)
+//BINARY_AVX_OP(__m256i,_mm256_mullo_epu16,_mm_mullo_epu16)
+BINARY_AVX_OP(__m256i,_mm256_mullo_epi16,_mm_mullo_epi16)
+
+BINARY_AVX_OP(__m256i,_mm256_subs_epu16,_mm_subs_epu16)
+BINARY_AVX_OP(__m256i,_mm256_adds_epu16,_mm_adds_epu16)
+BINARY_AVX_OP(__m256i,_mm256_subs_epi16,_mm_subs_epi16)
+BINARY_AVX_OP(__m256i,_mm256_adds_epi16,_mm_adds_epi16)
+BINARY_AVX_OP(__m256i,_mm256_sub_epi16,_mm_sub_epi16)
+BINARY_AVX_OP(__m256i,_mm256_add_epi16,_mm_add_epi16)
+BINARY_AVX_OP(__m256i,_mm256_sub_epi8,_mm_sub_epi8)
+
+
+BINARY_AVX_OP(__m256i,_mm256_hadd_epi16,_mm_hadd_epi16)
+BINARY_AVX_OP(__m256i,_mm256_hadds_epi16,_mm_hadds_epi16)
+
+
+
+
+BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi32,_mm_cmpeq_epi32)
+BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi32,_mm_cmpgt_epi32)
+
+BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi8,_mm_cmpeq_epi8)
+BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi8,_mm_cmpgt_epi8)
+
+BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi16,_mm_cmpeq_epi16)
+BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi16,_mm_cmpgt_epi16)
+
+
+BINARY_AVX_OP(__m256i,_mm256_shuffle_epi8,_mm_shuffle_epi8)
+
+
+BINARY_AVX_OP(__m256,_mm256_cmpeq_ps,_mm_cmpeq_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpneq_ps,_mm_cmpneq_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpnlt_ps,_mm_cmpnlt_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpngt_ps,_mm_cmpngt_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpge_ps,_mm_cmpge_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpnge_ps,_mm_cmpnge_ps)
+BINARY_AVX_OP(__m256,_mm256_cmplt_ps,_mm_cmplt_ps)
+BINARY_AVX_OP(__m256,_mm256_cmple_ps,_mm_cmple_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpgt_ps,_mm_cmpgt_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpnle_ps,_mm_cmpnle_ps)
+
+
+AVX2NEON_ABI
+__m256i _mm256_cvtps_epi32 (__m256 a)
+{
+    __m256i res;
+    res.lo = _mm_cvtps_epi32(a.lo);
+    res.hi = _mm_cvtps_epi32(a.hi);
+    return res;
+
+}
+
+AVX2NEON_ABI
+__m256i _mm256_cvttps_epi32 (__m256 a)
+{
+    __m256i res;
+    res.lo = _mm_cvttps_epi32(a.lo);
+    res.hi = _mm_cvttps_epi32(a.hi);
+    return res;
+
+}
+
+AVX2NEON_ABI
+__m256 _mm256_loadu_ps (float const * mem_addr)
+{
+    __m256 res;
+    res.lo = *(__m128 *)(mem_addr + 0);
+    res.hi = *(__m128 *)(mem_addr + 4);
+    return res;
+}
+#define _mm256_load_ps _mm256_loadu_ps
+
+
+AVX2NEON_ABI
+int _mm256_testz_ps (const __m256& a, const __m256& b)
+{
+    __m256 t = a;
+    if (&a != &b)
+        t = _mm256_and_ps(a,b);
+
+    int32x4_t l  = vshrq_n_s32(vreinterpretq_s32_m128(t.lo),31);
+    int32x4_t h  = vshrq_n_s32(vreinterpretq_s32_m128(t.hi),31);
+    return vaddvq_s32(vaddq_s32(l,h)) == 0;
+}
+
+
+AVX2NEON_ABI
+__m256i _mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0)
+{
+    __m256i res;
+    int64x2_t t0 = {e0,e1};
+    int64x2_t t1 = {e2,e3};
+    res.lo = __m128i(t0);
+    res.hi = __m128i(t1);
+    return res;
+}
+AVX2NEON_ABI
+__m256i _mm256_setr_epi64x (int64_t e0, int64_t e1, int64_t e2, int64_t e3)
+{
+    __m256i res;
+    int64x2_t t0 = {e0,e1};
+    int64x2_t t1 = {e2,e3};
+    res.lo = __m128i(t0);
+    res.hi = __m128i(t1);
+    return res;
+}
+
+
+
+AVX2NEON_ABI
+__m256i _mm256_set_epi8 (char e31, char e30, char e29, char e28, char e27, char e26, char e25, char e24, char e23, char e22, char e21, char e20, char e19, char e18, char e17, char e16, char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
+{
+    int8x16_t lo = {e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15};
+    int8x16_t hi = {e16,e17,e18,e19,e20,e21,e22,e23,e24,e25,e26,e27,e28,e29,e30,e31};
+    __m256i res;
+    res.lo = lo; res.hi = hi;
+    return res;
+}
+
+AVX2NEON_ABI
+__m256i _mm256_setr_epi8 (char e0, char e1, char e2, char e3, char e4, char e5, char e6, char e7, char e8, char e9, char e10, char e11, char e12, char e13, char e14, char e15, char e16, char e17, char e18, char e19, char e20, char e21, char e22, char e23, char e24, char e25, char e26, char e27, char e28, char e29, char e30, char e31)
+{
+    int8x16_t lo = {e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15};
+    int8x16_t hi = {e16,e17,e18,e19,e20,e21,e22,e23,e24,e25,e26,e27,e28,e29,e30,e31};
+    __m256i res;
+    res.lo = lo; res.hi = hi;
+    return res;
+}
+
+
+AVX2NEON_ABI
+__m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
+{
+    int16x8_t lo = {e0,e1,e2,e3,e4,e5,e6,e7};
+    int16x8_t hi = {e8,e9,e10,e11,e12,e13,e14,e15};
+    __m256i res;
+    res.lo = lo; res.hi = hi;
+    return res;
+}
+
+AVX2NEON_ABI
+__m256i _mm256_setr_epi16 (short e0, short e1, short e2, short e3, short e4, short e5, short e6, short e7, short e8, short e9, short e10, short e11, short e12, short e13, short e14, short e15)
+{
+    int16x8_t lo = {e0,e1,e2,e3,e4,e5,e6,e7};
+    int16x8_t hi = {e8,e9,e10,e11,e12,e13,e14,e15};
+    __m256i res;
+    res.lo = lo; res.hi = hi;
+    return res;
+}
+
+
+
+
+AVX2NEON_ABI
+int _mm256_movemask_epi8(const __m256i& a)
+{
+    return (_mm_movemask_epi8(a.hi) << 16) | _mm_movemask_epi8(a.lo);
+}
+
+
+AVX2NEON_ABI
+int _mm256_testz_si256(const __m256i& a,const __m256i& b)
+{
+    uint32x4_t lo = vandq_u32(a.lo,b.lo);
+    uint32x4_t hi = vandq_u32(a.hi,b.hi);
+
+    return (vaddvq_u32(lo) + vaddvq_u32(hi)) == 0;
+}
+
+AVX2NEON_ABI
+__m256d _mm256_setzero_pd ()
+{
+    __m256d res;
+    res.lo = res.hi = vdupq_n_f64(0);
+    return res;
+}
+
+AVX2NEON_ABI
+int _mm256_movemask_pd (__m256d a)
+{
+    return (_mm_movemask_pd(a.hi) << 2) | _mm_movemask_pd(a.lo);
+}
+
+AVX2NEON_ABI
+__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b)
+{
+    __m256i res;
+    res.lo = _mm_cmpeq_epi64(a.lo, b.lo);
+    res.hi = _mm_cmpeq_epi64(a.hi, b.hi);
+    return res;
+}
+
+AVX2NEON_ABI
+__m256d _mm256_cmpeq_pd (__m256d a, __m256d b)
+{
+    __m256d res;
+    res.lo = _mm_cmpeq_pd(a.lo, b.lo);
+    res.hi = _mm_cmpeq_pd(a.hi, b.hi);
+    return res;
+}
+
+
+AVX2NEON_ABI
+int _mm256_testz_pd (const __m256d& a, const __m256d& b)
+{
+    __m256d t = a;
+
+    if (&a != &b)
+        t = _mm256_and_pd(a,b);
+
+    return _mm256_movemask_pd(t) == 0;
+}
+
+AVX2NEON_ABI
+__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask)
+{
+    __m256d res;
+    res.lo = _mm_blendv_pd(a.lo, b.lo, mask.lo);
+    res.hi = _mm_blendv_pd(a.hi, b.hi, mask.hi);
+    return res;
+}
+
+template<int imm8>
+AVX2NEON_ABI
+__m256 __mm256_dp_ps (__m256 a, __m256 b)
+{
+    __m256 res;
+    res.lo = _mm_dp_ps(a.lo, b.lo, imm8);
+    res.hi = _mm_dp_ps(a.hi, b.hi, imm8);
+    return res;
+}
+
+#define _mm256_dp_ps(a,b,c) __mm256_dp_ps<c>(a,b)
+
+AVX2NEON_ABI
+double _mm256_permute4x64_pd_select(__m256d a, const int imm8)
+{
+    switch (imm8 & 3) {
+        case 0:
+            return ((float64x2_t)a.lo)[0];
+        case 1:
+            return ((float64x2_t)a.lo)[1];
+        case 2:
+            return ((float64x2_t)a.hi)[0];
+        case 3:
+            return ((float64x2_t)a.hi)[1];
+    }
+    __builtin_unreachable();
+    return 0;
+}
+
+AVX2NEON_ABI
+__m256d _mm256_permute4x64_pd (__m256d a, const int imm8)
+{
+    float64x2_t lo,hi;
+    lo[0] = _mm256_permute4x64_pd_select(a,imm8 >> 0);
+    lo[1] = _mm256_permute4x64_pd_select(a,imm8 >> 2);
+    hi[0] = _mm256_permute4x64_pd_select(a,imm8 >> 4);
+    hi[1] = _mm256_permute4x64_pd_select(a,imm8 >> 6);
+
+    __m256d res;
+    res.lo = lo; res.hi = hi;
+    return res;
+}
+
+AVX2NEON_ABI
+__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8)
+{
+    return __m256i(_mm256_insertf128_ps((__m256)a,(__m128)b,imm8));
+}
+
+
+AVX2NEON_ABI
+__m256i _mm256_loadu_si256 (__m256i const * mem_addr)
+{
+    __m256i res;
+    res.lo = *(__m128i *)((int32_t *)mem_addr + 0);
+    res.hi = *(__m128i *)((int32_t *)mem_addr + 4);
+    return res;
+}
+
+#define _mm256_load_si256 _mm256_loadu_si256
+
+AVX2NEON_ABI
+void _mm256_storeu_ps (float * mem_addr, __m256 a)
+{
+    *(__m128 *)(mem_addr + 0) = a.lo;
+    *(__m128 *)(mem_addr + 4) = a.hi;
+}
+
+#define _mm256_store_ps _mm256_storeu_ps
+#define _mm256_stream_ps _mm256_storeu_ps
+
+
+AVX2NEON_ABI
+void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a)
+{
+    *(__m128i *)((int32_t *)mem_addr + 0) = a.lo;
+    *(__m128i *)((int32_t *)mem_addr + 4) = a.hi;
+}
+
+#define _mm256_store_si256 _mm256_storeu_si256
+
+
+
+AVX2NEON_ABI
+__m256i _mm256_permute4x64_epi64 (const __m256i a, const int imm8)
+{
+    uint8x16x2_t tbl = {a.lo, a.hi};
+
+    uint8_t sz = sizeof(uint64_t);
+    uint8_t u64[4] = {
+        (uint8_t)(((imm8 >> 0) & 0x3) * sz),
+        (uint8_t)(((imm8 >> 2) & 0x3) * sz),
+        (uint8_t)(((imm8 >> 4) & 0x3) * sz),
+        (uint8_t)(((imm8 >> 6) & 0x3) * sz),
+    };
+
+    uint8x16_t idx_lo = {
+        // lo[0] bytes
+        (uint8_t)(u64[0]+0), (uint8_t)(u64[0]+1), (uint8_t)(u64[0]+2), (uint8_t)(u64[0]+3),
+        (uint8_t)(u64[0]+4), (uint8_t)(u64[0]+5), (uint8_t)(u64[0]+6), (uint8_t)(u64[0]+7),
+
+        // lo[1] bytes
+        (uint8_t)(u64[1]+0), (uint8_t)(u64[1]+1), (uint8_t)(u64[1]+2), (uint8_t)(u64[1]+3),
+        (uint8_t)(u64[1]+4), (uint8_t)(u64[1]+5), (uint8_t)(u64[1]+6), (uint8_t)(u64[1]+7),
+    };
+    uint8x16_t idx_hi = {
+        // hi[0] bytes
+        (uint8_t)(u64[2]+0), (uint8_t)(u64[2]+1), (uint8_t)(u64[2]+2), (uint8_t)(u64[2]+3),
+        (uint8_t)(u64[2]+4), (uint8_t)(u64[2]+5), (uint8_t)(u64[2]+6), (uint8_t)(u64[2]+7),
+
+        // hi[1] bytes
+        (uint8_t)(u64[3]+0), (uint8_t)(u64[3]+1), (uint8_t)(u64[3]+2), (uint8_t)(u64[3]+3),
+        (uint8_t)(u64[3]+4), (uint8_t)(u64[3]+5), (uint8_t)(u64[3]+6), (uint8_t)(u64[3]+7),
+    };
+
+    uint8x16_t lo = vqtbl2q_u8(tbl, idx_lo);
+    uint8x16_t hi = vqtbl2q_u8(tbl, idx_hi);
+
+    __m256i res;
+    res.lo = lo; res.hi = hi;
+    return res;
+}
+
+
+AVX2NEON_ABI
+__m256i _mm256_permute2x128_si256(const __m256i a,const __m256i b, const int imm8)
+{
+    return __m256i(_mm256_permute2f128_ps(__m256(a),__m256(b),imm8));
+}
+
+
+
+AVX2NEON_ABI
+__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask)
+{
+    __m256 res;
+    res.lo = _mm_maskload_ps(mem_addr,mask.lo);
+    res.hi = _mm_maskload_ps(mem_addr + 4,mask.hi);
+    return res;
+}
+
+
+AVX2NEON_ABI
+__m256i _mm256_cvtepu8_epi32 (__m128i a)
+{
+    uint8x16_t a_u8 = vreinterpretq_u8_m128i(a);     // xxxx xxxx xxxx xxxx HHGG FFEE DDCC BBAA
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(a_u8));  // 00HH 00GG 00FF 00EE 00DD 00CC 00BB 00AA
+    uint32x4_t lo = vmovl_u16(vget_low_u16(u16x8));  // 0000 00DD 0000 00CC 0000 00BB 0000 00AA
+    uint32x4_t hi = vmovl_high_u16(u16x8);           // 0000 00HH 0000 00GG 0000 00FF 0000 00EE
+
+    __m256i res;
+    res.lo = lo; res.hi = hi;
+    return res;
+}
+
+
+AVX2NEON_ABI
+__m256i _mm256_cvtepi8_epi32 (__m128i a)
+{
+    int8x16_t a_s8 = vreinterpretq_s8_m128i(a);     // xxxx xxxx xxxx xxxx HHGG FFEE DDCC BBAA
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(a_s8));  // ssHH ssGG ssFF ssEE ssDD ssCC ssBB ssAA
+    int32x4_t lo = vmovl_s16(vget_low_s16(s16x8));  // ssss ssDD ssss ssCC ssss ssBB ssss ssAA
+    int32x4_t hi = vmovl_high_s16(s16x8);           // ssss ssHH ssss ssGG ssss ssFF ssss ssEE
+
+    __m256i res;
+    res.lo = lo; res.hi = hi;
+    return res;
+}
+
+
+AVX2NEON_ABI
+__m256i _mm256_cvtepi16_epi32 (__m128i a)
+{
+    int16x8_t a_s16 = vreinterpretq_s16_m128i(a);   // HHHH GGGG FFFF EEEE DDDD CCCC BBBB AAAA
+    int32x4_t lo = vmovl_s16(vget_low_s16(a_s16));  // ssss DDDD ssss CCCC ssss BBBB ssss AAAA
+    int32x4_t hi = vmovl_high_s16(a_s16);           // ssss HHHH ssss GGGG ssss FFFF ssss EEEE
+
+    __m256i res;
+    res.lo = lo; res.hi = hi;
+    return res;
+}
+
+
+
+AVX2NEON_ABI
+void _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a)
+{
+    _mm_maskstore_epi32(mem_addr,mask.lo,a.lo);
+    _mm_maskstore_epi32(mem_addr + 4,mask.hi,a.hi);
+}
+
+AVX2NEON_ABI
+__m256i _mm256_slli_epi64 (__m256i a, int imm8)
+{
+    __m256i res;
+    res.lo = _mm_slli_epi64(a.lo,imm8);
+    res.hi = _mm_slli_epi64(a.hi,imm8);
+    return res;
+}
+
+AVX2NEON_ABI
+__m256i _mm256_slli_epi32 (__m256i a, int imm8)
+{
+    __m256i res;
+    res.lo = _mm_slli_epi32(a.lo,imm8);
+    res.hi = _mm_slli_epi32(a.hi,imm8);
+    return res;
+}
+
+
+AVX2NEON_ABI
+__m256i __mm256_slli_epi16 (__m256i a, int imm8)
+{
+    __m256i res;
+    res.lo = _mm_slli_epi16(a.lo,imm8);
+    res.hi = _mm_slli_epi16(a.hi,imm8);
+    return res;
+}
+
+
+AVX2NEON_ABI
+__m256i _mm256_srli_epi32 (__m256i a, int imm8)
+{
+    __m256i res;
+    res.lo = _mm_srli_epi32(a.lo,imm8);
+    res.hi = _mm_srli_epi32(a.hi,imm8);
+    return res;
+}
+
+AVX2NEON_ABI
+__m256i __mm256_srli_epi16 (__m256i a, int imm8)
+{
+    __m256i res;
+    res.lo = _mm_srli_epi16(a.lo,imm8);
+    res.hi = _mm_srli_epi16(a.hi,imm8);
+    return res;
+}
+
+AVX2NEON_ABI
+__m256i _mm256_cvtepu16_epi32(__m128i a)
+{
+    __m256i res;
+    res.lo = vmovl_u16(vget_low_u16(a));
+    res.hi = vmovl_high_u16(a);
+    return res;
+}
+
+AVX2NEON_ABI
+__m256i _mm256_cvtepu8_epi16(__m128i a)
+{
+    __m256i res;
+    res.lo = vmovl_u8(vget_low_u8(a));
+    res.hi = vmovl_high_u8(a);
+    return res;
+}
+
+
+AVX2NEON_ABI
+__m256i _mm256_srai_epi32 (__m256i a, int imm8)
+{
+    __m256i res;
+    res.lo = _mm_srai_epi32(a.lo,imm8);
+    res.hi = _mm_srai_epi32(a.hi,imm8);
+    return res;
+}
+
+AVX2NEON_ABI
+__m256i _mm256_srai_epi16 (__m256i a, int imm8)
+{
+    __m256i res;
+    res.lo = _mm_srai_epi16(a.lo,imm8);
+    res.hi = _mm_srai_epi16(a.hi,imm8);
+    return res;
+}
+
+
+AVX2NEON_ABI
+__m256i _mm256_sllv_epi32 (__m256i a, __m256i count)
+{
+    __m256i res;
+    res.lo = vshlq_s32(a.lo,count.lo);
+    res.hi = vshlq_s32(a.hi,count.hi);
+    return res;
+
+}
+
+
+AVX2NEON_ABI
+__m256i _mm256_srav_epi32 (__m256i a, __m256i count)
+{
+    __m256i res;
+    res.lo = vshlq_s32(a.lo,vnegq_s32(count.lo));
+    res.hi = vshlq_s32(a.hi,vnegq_s32(count.hi));
+    return res;
+
+}
+
+AVX2NEON_ABI
+__m256i _mm256_srlv_epi32 (__m256i a, __m256i count)
+{
+    __m256i res;
+    res.lo = __m128i(vshlq_u32(uint32x4_t(a.lo),vnegq_s32(count.lo)));
+    res.hi = __m128i(vshlq_u32(uint32x4_t(a.hi),vnegq_s32(count.hi)));
+    return res;
+
+}
+
+
+AVX2NEON_ABI
+__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8)
+{
+    return __m256i(_mm256_permute2f128_ps(__m256(a),__m256(b),imm8));
+}
+
+
+AVX2NEON_ABI
+__m128i _mm256_extractf128_si256 (__m256i a, const int imm8)
+{
+    if (imm8 & 1) return a.hi;
+    return a.lo;
+}
+
+AVX2NEON_ABI
+__m256 _mm256_set1_ps(float x)
+{
+    __m256 res;
+    res.lo = res.hi = vdupq_n_f32(x);
+    return res;
+}
+
+AVX2NEON_ABI
+__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
+{
+    __m256 res;
+    res.lo = _mm_set_ps(e3,e2,e1,e0);
+    res.hi = _mm_set_ps(e7,e6,e5,e4);
+    return res;
+}
+
+AVX2NEON_ABI
+__m256 _mm256_broadcast_ps (__m128 const * mem_addr)
+{
+    __m256 res;
+    res.lo = res.hi = *mem_addr;
+    return res;
+}
+
+AVX2NEON_ABI
+__m256 _mm256_cvtepi32_ps (__m256i a)
+{
+    __m256 res;
+    res.lo = _mm_cvtepi32_ps(a.lo);
+    res.hi = _mm_cvtepi32_ps(a.hi);
+    return res;
+}
+AVX2NEON_ABI
+void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a)
+{
+    uint32x4_t mask_lo = mask.lo;
+    uint32x4_t mask_hi = mask.hi;
+    float32x4_t a_lo = a.lo;
+    float32x4_t a_hi = a.hi;
+
+    for (int i=0;i<4;i++) {
+        if (mask_lo[i] & 0x80000000) mem_addr[i] = a_lo[i];
+        if (mask_hi[i] & 0x80000000) mem_addr[i+4] = a_hi[i];
+    }
+}
+
+AVX2NEON_ABI
+__m256d _mm256_andnot_pd (__m256d a, __m256d b)
+{
+    __m256d res;
+    res.lo = float64x2_t(_mm_andnot_ps(__m128(a.lo),__m128(b.lo)));
+    res.hi = float64x2_t(_mm_andnot_ps(__m128(a.hi),__m128(b.hi)));
+    return res;
+}
+
+AVX2NEON_ABI
+__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8)
+{
+    __m256 res;
+    res.lo = _mm_blend_ps(a.lo,b.lo,imm8 & 0xf);
+    res.hi = _mm_blend_ps(a.hi,b.hi,imm8 >> 4);
+    return res;
+
+}
+
+
+AVX2NEON_ABI
+__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8)
+{
+    return __m256i(_mm256_blend_ps(__m256(a),__m256(b),imm8));
+
+}
+
+AVX2NEON_ABI
+__m256i _mm256_blend_epi16 (__m256i a, __m256i b, const int imm8)
+{
+    __m256i res;
+    res.lo = _mm_blend_epi16(a.lo,b.lo,imm8);
+    res.hi = _mm_blend_epi16(a.hi,b.hi,imm8);
+    return res;
+}
+
+
+
+AVX2NEON_ABI
+__m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
+{
+    int32x4_t vindex_lo = vindex.lo;
+    int32x4_t vindex_hi = vindex.hi;
+    int32x4_t lo,hi;
+    for (int i=0;i<4;i++)
+    {
+        lo[i] = *(int32_t *)((char *) base_addr + (vindex_lo[i]*scale));
+        hi[i] = *(int32_t *)((char *) base_addr + (vindex_hi[i]*scale));
+    }
+
+    __m256i res;
+    res.lo = lo; res.hi = hi;
+    return res;
+}
+
+
+AVX2NEON_ABI
+__m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale)
+{
+    uint32x4_t mask_lo = mask.lo;
+    uint32x4_t mask_hi = mask.hi;
+    int32x4_t vindex_lo = vindex.lo;
+    int32x4_t vindex_hi = vindex.hi;
+    int32x4_t lo,hi;
+    lo = hi = _mm_setzero_si128();
+    for (int i=0;i<4;i++)
+    {
+        if (mask_lo[i] >> 31) lo[i] = *(int32_t *)((char *) base_addr + (vindex_lo[i]*scale));
+        if (mask_hi[i] >> 31) hi[i] = *(int32_t *)((char *) base_addr + (vindex_hi[i]*scale));
+    }
+
+    __m256i res;
+    res.lo = lo; res.hi = hi;
+    return res;
+}
diff --git a/thirdparty/embree/common/simd/arm/emulation.h b/thirdparty/embree/common/simd/arm/emulation.h
index 1c3875fb27..4327298019 100644
--- a/thirdparty/embree/common/simd/arm/emulation.h
+++ b/thirdparty/embree/common/simd/arm/emulation.h
@@ -11,33 +11,28 @@
 
 #include "sse2neon.h"
 
-__forceinline __m128 _mm_fmsub_ps(__m128 a, __m128 b, __m128 c) {
-   __m128 neg_c = vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(c)));
-  return _mm_fmadd_ps(a, b, neg_c);
-}
-
-__forceinline __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) {
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(vfmsq_f32(vreinterpretq_f32_m128(c),
-                                            vreinterpretq_f32_m128(b),
-                                            vreinterpretq_f32_m128(a)));
-#else
-    return _mm_sub_ps(c, _mm_mul_ps(a, b));
-#endif
-}
+__forceinline __m128 _mm_abs_ps(__m128 a) { return vabsq_f32(a); }
+
+__forceinline __m128 _mm_fmadd_ps (__m128 a, __m128 b, __m128 c) { return vfmaq_f32(c, a, b); }
+__forceinline __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) { return vfmsq_f32(c, a, b); }
+__forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) { return vnegq_f32(vfmaq_f32(c, a, b)); }
+__forceinline __m128 _mm_fmsub_ps (__m128 a, __m128 b, __m128 c) { return vnegq_f32(vfmsq_f32(c, a, b)); }
 
-__forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) {
-  return vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(_mm_fmadd_ps(a,b,c))));
+__forceinline __m128 _mm_broadcast_ss (float const * mem_addr)
+{
+    return vdupq_n_f32(*mem_addr);
 }
 
+// AVX2 emulation leverages Intel FMA defs above.  Include after them.
+#include "avx2neon.h"
 
 /* Dummy defines for floating point control */
 #define _MM_MASK_MASK 0x1f80
 #define _MM_MASK_DIV_ZERO 0x200
-#define _MM_FLUSH_ZERO_ON 0x8000
+// #define _MM_FLUSH_ZERO_ON 0x8000
 #define _MM_MASK_DENORM 0x100
 #define _MM_SET_EXCEPTION_MASK(x)
-#define _MM_SET_FLUSH_ZERO_MODE(x)
+// #define _MM_SET_FLUSH_ZERO_MODE(x)
 
 __forceinline int _mm_getcsr()
 {
@@ -48,3 +43,43 @@ __forceinline void _mm_mfence()
 {
   __sync_synchronize();
 }
+
+__forceinline __m128i _mm_load4epu8_epi32(__m128i *ptr)
+{
+    uint8x8_t  t0 = vld1_u8((uint8_t*)ptr);
+    uint16x8_t t1 = vmovl_u8(t0);
+    uint32x4_t t2 = vmovl_u16(vget_low_u16(t1));
+    return vreinterpretq_s32_u32(t2);
+}
+
+__forceinline __m128i _mm_load4epu16_epi32(__m128i *ptr)
+{
+    uint16x8_t t0 = vld1q_u16((uint16_t*)ptr);
+    uint32x4_t t1 = vmovl_u16(vget_low_u16(t0));
+    return vreinterpretq_s32_u32(t1);
+}
+
+__forceinline __m128i _mm_load4epi8_f32(__m128i *ptr)
+{
+    int8x8_t    t0 = vld1_s8((int8_t*)ptr);
+    int16x8_t   t1 = vmovl_s8(t0);
+    int32x4_t   t2 = vmovl_s16(vget_low_s16(t1));
+    float32x4_t t3 = vcvtq_f32_s32(t2);
+    return vreinterpretq_s32_f32(t3);
+}
+
+__forceinline __m128i _mm_load4epu8_f32(__m128i *ptr)
+{
+    uint8x8_t   t0 = vld1_u8((uint8_t*)ptr);
+    uint16x8_t  t1 = vmovl_u8(t0);
+    uint32x4_t  t2 = vmovl_u16(vget_low_u16(t1));
+    return vreinterpretq_s32_u32(t2);
+}
+
+__forceinline __m128i _mm_load4epi16_f32(__m128i *ptr)
+{
+    int16x8_t   t0 = vld1q_s16((int16_t*)ptr);
+    int32x4_t   t1 = vmovl_s16(vget_low_s16(t0));
+    float32x4_t t2 = vcvtq_f32_s32(t1);
+    return vreinterpretq_s32_f32(t2);
+}
diff --git a/thirdparty/embree/common/simd/arm/sse2neon.h b/thirdparty/embree/common/simd/arm/sse2neon.h
index 7eb25cf2c5..43416662d7 100644
--- a/thirdparty/embree/common/simd/arm/sse2neon.h
+++ b/thirdparty/embree/common/simd/arm/sse2neon.h
@@ -52,7 +52,7 @@
 
 /* Enable precise implementation of math operations
  * This would slow down the computation a bit, but gives consistent result with
- * x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result)
+ * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result)
  */
 /* _mm_min_ps and _mm_max_ps */
 #ifndef SSE2NEON_PRECISE_MINMAX
@@ -66,36 +66,29 @@
 #ifndef SSE2NEON_PRECISE_SQRT
 #define SSE2NEON_PRECISE_SQRT (0)
 #endif
-#ifndef SSE2NEON_PRECISE_RSQRT
-#define SSE2NEON_PRECISE_RSQRT (0)
+/* _mm_dp_pd */
+#ifndef SSE2NEON_PRECISE_DP
+#define SSE2NEON_PRECISE_DP (0)
 #endif
 
+/* compiler specific definitions */
 #if defined(__GNUC__) || defined(__clang__)
 #pragma push_macro("FORCE_INLINE")
 #pragma push_macro("ALIGN_STRUCT")
 #define FORCE_INLINE static inline __attribute__((always_inline))
 #define ALIGN_STRUCT(x) __attribute__((aligned(x)))
-#ifndef likely
-#define likely(x) __builtin_expect(!!(x), 1)
-#endif
-#ifndef unlikely
-#define unlikely(x) __builtin_expect(!!(x), 0)
-#endif
-#else
-#error "Macro name collisions may happen with unsupported compiler."
-#ifdef FORCE_INLINE
-#undef FORCE_INLINE
-#endif
+#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
+#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
+#else /* non-GNU / non-clang compilers */
+#warning "Macro name collisions may happen with unsupported compiler."
+#ifndef FORCE_INLINE
 #define FORCE_INLINE static inline
+#endif
 #ifndef ALIGN_STRUCT
 #define ALIGN_STRUCT(x) __declspec(align(x))
 #endif
-#endif
-#ifndef likely
-#define likely(x) (x)
-#endif
-#ifndef unlikely
-#define unlikely(x) (x)
+#define _sse2neon_likely(x) (x)
+#define _sse2neon_unlikely(x) (x)
 #endif
 
 #include <stdint.h>
@@ -155,6 +148,14 @@
  * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
  * fp0 is the same for fp0 of result.
  */
+#if defined(__aarch64__)
+#define _MN_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3),  (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+\
+2), (((fp2)*4)+3),  (((fp1)*4)+0), (((fp1)*4)+1), (((fp1)*4)+2), (((fp1)*4)+3),  (((fp0)*4)+0), (((fp0)*4)+1), (((fp0)*4)+2), (((fp0)*4)+3) } )
+#define _MF_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3),  (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+\
+2), (((fp2)*4)+3),  (((fp1)*4)+16+0), (((fp1)*4)+16+1), (((fp1)*4)+16+2), (((fp1)*4)+16+3),  (((fp0)*4)+16+0), (((fp0)*4)+16+1), (((fp0)*4)+16+2), (((fp0)*\
+4)+16+3) } )
+#endif
+
 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
     (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
 
@@ -169,6 +170,14 @@
 #define _MM_ROUND_DOWN 0x2000
 #define _MM_ROUND_UP 0x4000
 #define _MM_ROUND_TOWARD_ZERO 0x6000
+/* Flush zero mode macros. */
+#define _MM_FLUSH_ZERO_MASK 0x8000
+#define _MM_FLUSH_ZERO_ON 0x8000
+#define _MM_FLUSH_ZERO_OFF 0x0000
+/* Denormals are zeros mode macros. */
+#define _MM_DENORMALS_ZERO_MASK 0x0040
+#define _MM_DENORMALS_ZERO_ON 0x0040
+#define _MM_DENORMALS_ZERO_OFF 0x0000
 
 /* indicate immediate constant argument in a given range */
 #define __constrange(a, b) const
@@ -189,7 +198,10 @@ typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
 #else
 typedef float32x4_t __m128d;
 #endif
-typedef int64x2_t __m128i; /* 128-bit vector containing integers */
+// Note: upstream sse2neon declares __m128i as int64x2_t.  However, there's
+// many places within embree that assume __m128i can be indexed as a
+// 4 element u32.
+typedef int32x4_t __m128i; /* 128-bit vector containing integers */
 
 /* type-safe casting between types */
 
@@ -221,28 +233,28 @@ typedef int64x2_t __m128i; /* 128-bit vector containing integers */
 #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
 #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
 
-#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
-#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
-#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
-#define vreinterpretq_m128i_s64(x) (x)
+#define vreinterpretq_m128i_s8(x) vreinterpretq_s32_s8(x)
+#define vreinterpretq_m128i_s16(x) vreinterpretq_s32_s16(x)
+#define vreinterpretq_m128i_s32(x) (x)
+#define vreinterpretq_m128i_s64(x) vreinterpretq_s32_s64(x)
 
-#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
-#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
-#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
-#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
+#define vreinterpretq_m128i_u8(x) vreinterpretq_s32_u8(x)
+#define vreinterpretq_m128i_u16(x) vreinterpretq_s32_u16(x)
+#define vreinterpretq_m128i_u32(x) vreinterpretq_s32_u32(x)
+#define vreinterpretq_m128i_u64(x) vreinterpretq_s32_u64(x)
 
-#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
-#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
+#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s32(x)
 
-#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
-#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
-#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
-#define vreinterpretq_s64_m128i(x) (x)
+#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s32(x)
+#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s32(x)
+#define vreinterpretq_s32_m128i(x) (x)
+#define vreinterpretq_s64_m128i(x) vreinterpretq_s64_s32(x)
 
-#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
-#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
-#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
-#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
+#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s32(x)
+#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s32(x)
+#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s32(x)
+#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s32(x)
 
 #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
 #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
@@ -281,6 +293,7 @@ typedef int64x2_t __m128i; /* 128-bit vector containing integers */
 
 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
 
+#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
 
 #define vreinterpretq_f64_m128d(x) (x)
@@ -303,10 +316,10 @@ typedef int64x2_t __m128i; /* 128-bit vector containing integers */
 #endif
 
 // A struct is defined in this header file called 'SIMDVec' which can be used
-// by applications which attempt to access the contents of an _m128 struct
+// by applications which attempt to access the contents of an __m128 struct
 // directly.  It is important to note that accessing the __m128 struct directly
 // is bad coding practice by Microsoft: @see:
-// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
+// https://docs.microsoft.com/en-us/cpp/cpp/m128
 //
 // However, some legacy source code may try to access the contents of an __m128
 // struct directly so the developer can use the SIMDVec as an alias for it.  Any
@@ -342,13 +355,48 @@ typedef union ALIGN_STRUCT(16) SIMDVec {
 #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
 #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
 
+/* SSE macros */
+#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
+#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
+#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
+#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
+
+// Function declaration
+// SSE
+FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
+FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
+FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
+FORCE_INLINE __m128 _mm_set_ps1(float);
+FORCE_INLINE __m128 _mm_setzero_ps(void);
+// SSE2
+FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_castps_si128(__m128);
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
+FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
+FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
+FORCE_INLINE __m128d _mm_set_pd(double, double);
+FORCE_INLINE __m128i _mm_set1_epi32(int);
+FORCE_INLINE __m128i _mm_setzero_si128();
+// SSE4.1
+FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
+FORCE_INLINE __m128 _mm_ceil_ps(__m128);
+FORCE_INLINE __m128d _mm_floor_pd(__m128d);
+FORCE_INLINE __m128 _mm_floor_ps(__m128);
+FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
+FORCE_INLINE __m128 _mm_round_ps(__m128, int);
+// SSE4.2
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
+
 /* Backwards compatibility for compilers with lack of specific type support */
 
 // Older gcc does not define vld1q_u8_x4 type
-#if defined(__GNUC__) && !defined(__clang__) &&   \
-    ((__GNUC__ == 10 && (__GNUC_MINOR__ <= 1)) || \
-     (__GNUC__ == 9 && (__GNUC_MINOR__ <= 3)) ||  \
-     (__GNUC__ == 8 && (__GNUC_MINOR__ <= 4)) || __GNUC__ <= 7)
+#if defined(__GNUC__) && !defined(__clang__) &&                        \
+    ((__GNUC__ <= 10 && defined(__arm__)) ||                           \
+     (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
+     (__GNUC__ <= 9 && defined(__aarch64__)))
 FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
 {
     uint8x16x4_t ret;
@@ -443,8 +491,6 @@ FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
     +------+------+------+------+------+------+-------------+
  */
 
-/* Set/get methods */
-
 /* Constants for use with _mm_prefetch.  */
 enum _mm_hint {
     _MM_HINT_NTA = 0,  /* load data to L1 and L2 cache, mark it as NTA */
@@ -457,1098 +503,1568 @@ enum _mm_hint {
     _MM_HINT_ET2 = 7   /* exclusive version of _MM_HINT_T2 */
 };
 
-// Loads one cache line of data from address p to a location closer to the
-// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
-FORCE_INLINE void _mm_prefetch(const void *p, int i)
+// The bit field mapping to the FPCR(floating-point control register)
+typedef struct {
+    uint16_t res0;
+    uint8_t res1 : 6;
+    uint8_t bit22 : 1;
+    uint8_t bit23 : 1;
+    uint8_t bit24 : 1;
+    uint8_t res2 : 7;
+#if defined(__aarch64__)
+    uint32_t res3;
+#endif
+} fpcr_bitfield;
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of b and places it into the high end of the result.
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
 {
-    (void) i;
-    __builtin_prefetch(p);
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
 }
 
-// Pause the processor. This is typically used in spin-wait loops and depending
-// on the x86 processor typical values are in the 40-100 cycle range. The
-// 'yield' instruction isn't a good fit beacuse it's effectively a nop on most
-// Arm cores. Experience with several databases has shown has shown an 'isb' is
-// a reasonable approximation.
-FORCE_INLINE void _mm_pause()
+// takes the lower two 32-bit values from a and swaps them and places in high
+// end of result takes the higher two 32 bit values from b and swaps them and
+// places in low end of result.
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
 {
-    __asm__ __volatile__("isb\n");
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
 }
 
-// Copy the lower single-precision (32-bit) floating-point element of a to dst.
-//
-//   dst[31:0] := a[31:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
-FORCE_INLINE float _mm_cvtss_f32(__m128 a)
+FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
 {
-    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32x2_t a21 = vget_high_f32(
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b03 = vget_low_f32(
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
 }
 
-// Convert the lower single-precision (32-bit) floating-point element in b to a
-// double-precision (64-bit) floating-point element, store the result in the
-// lower element of dst, and copy the upper element from a to the upper element
-// of dst.
+FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
+{
+    float32x2_t a03 = vget_low_f32(
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b21 = vget_high_f32(
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
+}
+
+// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
+// high
+FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
+{
+    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
+{
+    float32x2_t a22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
+{
+    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
+{
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32x2_t a22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
+{
+    float32x2_t a33 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32_t b2 = vgetq_lane_f32(b, 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(b, 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
+}
+
+// Kahan summation for accurate summation of floating-point numbers.
+// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
+FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
+{
+    y -= *c;
+    float t = *sum + y;
+    *c = (t - *sum) - y;
+    *sum = t;
+}
+
+#if defined(__ARM_FEATURE_CRYPTO)
+// Wraps vmull_p64
+FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
+    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
+    return vreinterpretq_u64_p128(vmull_p64(a, b));
+}
+#else  // ARMv7 polyfill
+// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
 //
-//   dst[63:0] := Convert_FP32_To_FP64(b[31:0])
-//   dst[127:64] := a[127:64]
+// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
+// 64-bit->128-bit polynomial multiply.
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
-FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
+// It needs some work and is somewhat slow, but it is still faster than all
+// known scalar methods.
+//
+// Algorithm adapted to C from
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
+// from "Fast Software Polynomial Multiplication on ARM Processors Using the
+// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
+// (https://hal.inria.fr/hal-01506572)
+static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
 {
-    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+    poly8x8_t a = vreinterpret_p8_u64(_a);
+    poly8x8_t b = vreinterpret_p8_u64(_b);
+
+    // Masks
+    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
+                                    vcreate_u8(0x00000000ffffffff));
+    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
+                                    vcreate_u8(0x0000000000000000));
+
+    // Do the multiplies, rotating with vext to get all combinations
+    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
+    uint8x16_t e =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
+    uint8x16_t f =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
+    uint8x16_t g =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
+    uint8x16_t h =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
+    uint8x16_t i =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
+    uint8x16_t j =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
+    uint8x16_t k =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4
+
+    // Add cross products
+    uint8x16_t l = veorq_u8(e, f);  // L = E + F
+    uint8x16_t m = veorq_u8(g, h);  // M = G + H
+    uint8x16_t n = veorq_u8(i, j);  // N = I + J
+
+    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
+    // instructions.
 #if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
+    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
+        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
+        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
+        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
+        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
 #else
-    return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
+    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
+    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
+    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
+    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
 #endif
-}
+    // t0 = (L) (P0 + P1) << 8
+    // t1 = (M) (P2 + P3) << 16
+    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
+    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
+    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
 
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer, and store the result in dst.
-//
-//   dst[31:0] := Convert_FP32_To_Int32(a[31:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
-#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
+    // t2 = (N) (P4 + P5) << 24
+    // t3 = (K) (P6 + P7) << 32
+    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
+    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
+    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
 
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 64-bit integer, and store the result in dst.
-//
-//   dst[63:0] := Convert_FP32_To_Int64(a[31:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
-FORCE_INLINE int _mm_cvtss_si64(__m128 a)
-{
+    // De-interleave
 #if defined(__aarch64__)
-    return vgetq_lane_s64(
-        vreinterpretq_s64_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))), 0);
+    uint8x16_t t0 = vreinterpretq_u8_u64(
+        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t1 = vreinterpretq_u8_u64(
+        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t2 = vreinterpretq_u8_u64(
+        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+    uint8x16_t t3 = vreinterpretq_u8_u64(
+        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
 #else
-    float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    float32_t diff = data - floor(data);
-    if (diff > 0.5)
-        return (int64_t) ceil(data);
-    if (unlikely(diff == 0.5)) {
-        int64_t f = (int64_t) floor(data);
-        int64_t c = (int64_t) ceil(data);
-        return c & 1 ? f : c;
-    }
-    return (int64_t) floor(data);
+    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
+    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
+    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
+    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
 #endif
+    // Shift the cross products
+    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
+    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
+    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
+    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32
+
+    // Accumulate the products
+    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
+    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
+    uint8x16_t mix = veorq_u8(d, cross1);
+    uint8x16_t r = veorq_u8(mix, cross2);
+    return vreinterpretq_u64_u8(r);
 }
+#endif  // ARMv7 polyfill
 
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-//
-//   FOR j := 0 to 1
-//      i := 32*j
-//      dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
-FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
+// C equivalent:
+//   __m128i _mm_shuffle_epi32_default(__m128i a,
+//                                     __constrange(0, 255) int imm) {
+//       __m128i ret;
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+#define _mm_shuffle_epi32_default(a, imm)                                   \
+    __extension__({                                                         \
+        int32x4_t ret;                                                      \
+        ret = vmovq_n_s32(                                                  \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
+            ret, 1);                                                        \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
+            ret, 2);                                                        \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
+            ret, 3);                                                        \
+        vreinterpretq_m128i_s32(ret);                                       \
+    })
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of a and places it into the high end of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
 {
-    return vreinterpret_m64_s32(
-        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
 }
 
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer with truncation, and store the result in dst.
-//
-//   dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
-FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
+// takes the lower two 32-bit values from a and swaps them and places in low end
+// of result takes the higher two 32 bit values from a and swaps them and places
+// in high end of result.
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
 {
-    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
 }
 
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-//
-//   FOR j := 0 to 1
-//      i := 32*j
-//      dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
-#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
+// rotates the least significant 32 bits into the most significant 32 bits, and
+// shifts the rest down
+FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
+}
 
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer with truncation, and store the result in dst.
-//
-//   dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
-#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
+// rotates the most significant 32 bits into the least significant 32 bits, and
+// shifts the rest up
+FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
+}
 
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
-//
-//   dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
-FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
+// gets the lower 64 bits of a, and places it in the upper 64 bits
+// gets the lower 64 bits of a and places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
 {
-    return vgetq_lane_s64(
-        vmovl_s32(vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))), 0);
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
 }
 
-// Sets the 128-bit value to zero
-// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_setzero_si128(void)
+// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
+// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
 {
-    return vreinterpretq_m128i_s32(vdupq_n_s32(0));
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
 }
 
-// Clears the four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_setzero_ps(void)
+// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
+// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
+// places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
 {
-    return vreinterpretq_m128_f32(vdupq_n_f32(0));
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
 }
 
-// Return vector of type __m128d with all elements set to zero.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
-FORCE_INLINE __m128d _mm_setzero_pd(void)
+FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
 {
+    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
+{
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
+}
+
+// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
+// int imm)
 #if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vdupq_n_f64(0));
+#define _mm_shuffle_epi32_splat(a, imm)                          \
+    __extension__({                                              \
+        vreinterpretq_m128i_s32(                                 \
+            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
+    })
 #else
-    return vreinterpretq_m128d_f32(vdupq_n_f32(0));
+#define _mm_shuffle_epi32_splat(a, imm)                                      \
+    __extension__({                                                          \
+        vreinterpretq_m128i_s32(                                             \
+            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
+    })
 #endif
-}
 
-// Sets the four single-precision, floating-point values to w.
+// NEON does not support a general purpose permute intrinsic
+// Selects four specific single-precision, floating-point values from a and b,
+// based on the mask i.
 //
-//   r0 := r1 := r2 := r3 := w
+// C equivalent:
+//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
+//                                 __constrange(0, 255) int imm) {
+//       __m128 ret;
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
+//       return ret;
+//   }
 //
-// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set1_ps(float _w)
+// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
+#define _mm_shuffle_ps_default(a, b, imm)                                  \
+    __extension__({                                                        \
+        float32x4_t ret;                                                   \
+        ret = vmovq_n_f32(                                                 \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
+            ret, 1);                                                       \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
+            ret, 2);                                                       \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
+            ret, 3);                                                       \
+        vreinterpretq_m128_f32(ret);                                       \
+    })
+
+// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
+// by imm.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
+// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
+//                                                   __constrange(0,255) int
+//                                                   imm)
+#define _mm_shufflelo_epi16_function(a, imm)                                  \
+    __extension__({                                                           \
+        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
+        int16x4_t lowBits = vget_low_s16(ret);                                \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
+                             1);                                              \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
+                             2);                                              \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
+                             3);                                              \
+        vreinterpretq_m128i_s16(ret);                                         \
+    })
+
+// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
+// by imm.
+// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
+// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
+//                                                   __constrange(0,255) int
+//                                                   imm)
+#define _mm_shufflehi_epi16_function(a, imm)                                   \
+    __extension__({                                                            \
+        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
+        int16x4_t highBits = vget_high_s16(ret);                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
+                             5);                                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
+                             6);                                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
+                             7);                                               \
+        vreinterpretq_m128i_s16(ret);                                          \
+    })
+
+/* SSE */
+
+// Adds the four single-precision, floating-point values of a and b.
+//
+//   r0 := a0 + b0
+//   r1 := a1 + b1
+//   r2 := a2 + b2
+//   r3 := a3 + b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
 {
-    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+    return vreinterpretq_m128_f32(
+        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 }
 
-// Sets the four single-precision, floating-point values to w.
-// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set_ps1(float _w)
+// adds the scalar single-precision floating point values of a and b.
+// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
 {
-    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
+    // the upper values in the result must be the remnants of <a>.
+    return vreinterpretq_m128_f32(vaddq_f32(a, value));
 }
 
-// Sets the four single-precision, floating-point values to the four inputs.
-// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
+// Computes the bitwise AND of the four single-precision, floating-point values
+// of a and b.
+//
+//   r0 := a0 & b0
+//   r1 := a1 & b1
+//   r2 := a2 & b2
+//   r3 := a3 & b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
 {
-    float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
+    return vreinterpretq_m128_s32(
+        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
 }
 
-// Copy single-precision (32-bit) floating-point element a to the lower element
-// of dst, and zero the upper 3 elements.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
-FORCE_INLINE __m128 _mm_set_ss(float a)
+// Computes the bitwise AND-NOT of the four single-precision, floating-point
+// values of a and b.
+//
+//   r0 := ~a0 & b0
+//   r1 := ~a1 & b1
+//   r2 := ~a2 & b2
+//   r3 := ~a3 & b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
 {
-    float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
+    return vreinterpretq_m128_s32(
+        vbicq_s32(vreinterpretq_s32_m128(b),
+                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
 }
 
-// Sets the four single-precision, floating-point values to the four inputs in
-// reverse order.
-// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+//
+//   FOR j := 0 to 3
+//     i := j*16
+//     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
+FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
 {
-    float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
+    return vreinterpret_m64_u16(
+        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
 }
 
-// Sets the 8 signed 16-bit integer values in reverse order.
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
 //
-// Return Value
-//   r0 := w0
-//   r1 := w1
-//   ...
-//   r7 := w7
-FORCE_INLINE __m128i _mm_setr_epi16(short w0,
-                                    short w1,
-                                    short w2,
-                                    short w3,
-                                    short w4,
-                                    short w5,
-                                    short w6,
-                                    short w7)
+//   FOR j := 0 to 7
+//     i := j*8
+//     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
+FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
 {
-    int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
-    return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
+    return vreinterpret_m64_u8(
+        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
 }
 
-// Sets the 4 signed 32-bit integer values in reverse order
-// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
+// Compares for equality.
+// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
 {
-    int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
-    return vreinterpretq_m128i_s32(vld1q_s32(data));
+    return vreinterpretq_m128_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 }
 
-// Set packed 64-bit integers in dst with the supplied values in reverse order.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
-FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
+// Compares for equality.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
 {
-    return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
+    return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
 }
 
-// Sets the 16 signed 8-bit integer values to b.
+// Compares for greater than or equal.
+// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for greater than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
+}
+
+// Compares for greater than.
 //
-//   r0 := b
-//   r1 := b
-//   ...
-//   r15 := b
+//   r0 := (a0 > b0) ? 0xffffffff : 0x0
+//   r1 := (a1 > b1) ? 0xffffffff : 0x0
+//   r2 := (a2 > b2) ? 0xffffffff : 0x0
+//   r3 := (a3 > b3) ? 0xffffffff : 0x0
 //
-// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
+// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
 {
-    return vreinterpretq_m128i_s8(vdupq_n_s8(w));
+    return vreinterpretq_m128_u32(
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 }
 
-// Broadcast double-precision (64-bit) floating-point value a to all elements of
-// dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
-FORCE_INLINE __m128d _mm_set1_pd(double d)
+// Compares for greater than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
 {
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vdupq_n_f64(d));
-#else
-    return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
-#endif
+    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
 }
 
-// Sets the 8 signed 16-bit integer values to w.
+// Compares for less than or equal.
 //
-//   r0 := w
-//   r1 := w
-//   ...
-//   r7 := w
+//   r0 := (a0 <= b0) ? 0xffffffff : 0x0
+//   r1 := (a1 <= b1) ? 0xffffffff : 0x0
+//   r2 := (a2 <= b2) ? 0xffffffff : 0x0
+//   r3 := (a3 <= b3) ? 0xffffffff : 0x0
 //
-// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_set1_epi16(short w)
+// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
 {
-    return vreinterpretq_m128i_s16(vdupq_n_s16(w));
+    return vreinterpretq_m128_u32(
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 }
 
-// Sets the 16 signed 8-bit integer values.
-// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
-                                  signed char b14,
-                                  signed char b13,
-                                  signed char b12,
-                                  signed char b11,
-                                  signed char b10,
-                                  signed char b9,
-                                  signed char b8,
-                                  signed char b7,
-                                  signed char b6,
-                                  signed char b5,
-                                  signed char b4,
-                                  signed char b3,
-                                  signed char b2,
-                                  signed char b1,
-                                  signed char b0)
+// Compares for less than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
+FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
 {
-    int8_t ALIGN_STRUCT(16)
-        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
-                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
-                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
-                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
-    return (__m128i) vld1q_s8(data);
+    return _mm_move_ss(a, _mm_cmple_ps(a, b));
 }
 
-// Sets the 8 signed 16-bit integer values.
-// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_set_epi16(short i7,
-                                   short i6,
-                                   short i5,
-                                   short i4,
-                                   short i3,
-                                   short i2,
-                                   short i1,
-                                   short i0)
+// Compares for less than
+// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
 {
-    int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
-    return vreinterpretq_m128i_s16(vld1q_s16(data));
+    return vreinterpretq_m128_u32(
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 }
 
-// Sets the 16 signed 8-bit integer values in reverse order.
-// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
-                                   signed char b1,
-                                   signed char b2,
-                                   signed char b3,
-                                   signed char b4,
-                                   signed char b5,
-                                   signed char b6,
-                                   signed char b7,
-                                   signed char b8,
-                                   signed char b9,
-                                   signed char b10,
-                                   signed char b11,
-                                   signed char b12,
-                                   signed char b13,
-                                   signed char b14,
-                                   signed char b15)
+// Compares for less than
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
+FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
 {
-    int8_t ALIGN_STRUCT(16)
-        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
-                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
-                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
-                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
-    return (__m128i) vld1q_s8(data);
+    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
 }
 
-// Sets the 4 signed 32-bit integer values to i.
-//
-//   r0 := i
-//   r1 := i
-//   r2 := i
-//   r3 := I
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set1_epi32(int _i)
+// Compares for inequality.
+// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
 {
-    return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
 }
 
-// Sets the 2 signed 64-bit integer values to i.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
-FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
+// Compares for inequality.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
 {
-    return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
+    return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
 }
 
-// Sets the 2 signed 64-bit integer values to i.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
-FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
+// Compares for not greater than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
 {
-    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
 }
 
-// Sets the 4 signed 32-bit integer values.
-// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
+// Compares for not greater than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
 {
-    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
-    return vreinterpretq_m128i_s32(vld1q_s32(data));
+    return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
 }
 
-// Returns the __m128i structure with its two 64-bit integer values
-// initialized to the values of the two 64-bit integers passed in.
-// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
-FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
+// Compares for not greater than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
 {
-    return vreinterpretq_m128i_s64(
-        vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
 }
 
-// Returns the __m128i structure with its two 64-bit integer values
-// initialized to the values of the two 64-bit integers passed in.
-// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
-FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
+// Compares for not greater than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
 {
-    return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
+    return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
 }
 
-// Set packed double-precision (64-bit) floating-point elements in dst with the
-// supplied values.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
-FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
+// Compares for not less than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
 {
-    double ALIGN_STRUCT(16) data[2] = {e0, e1};
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
-#else
-    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
-#endif
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
 }
 
-// Set packed double-precision (64-bit) floating-point elements in dst with the
-// supplied values in reverse order.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
-FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
+// Compares for not less than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
 {
-    return _mm_set_pd(e0, e1);
+    return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
 }
 
-// Copy double-precision (64-bit) floating-point element a to the lower element
-// of dst, and zero the upper element.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
-FORCE_INLINE __m128d _mm_set_sd(double a)
+// Compares for not less than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
 {
-    return _mm_set_pd(0, a);
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
 }
 
-// Broadcast double-precision (64-bit) floating-point value a to all elements of
-// dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
-#define _mm_set_pd1 _mm_set1_pd
+// Compares for not less than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
+}
 
-// Stores four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
-FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
+// Compares the four 32-bit floats in a and b to check if any values are NaN.
+// Ordered compare between each value returns true for "orderable" and false for
+// "not orderable" (NaN).
+// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
+// also:
+// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
+// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
+FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
 {
-    vst1q_f32(p, vreinterpretq_f32_m128(a));
+    // Note: NEON does not have ordered compare builtin
+    // Need to compare a eq a and b eq b to check for NaN
+    // Do AND of results to get final
+    uint32x4_t ceqaa =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t ceqbb =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
 }
 
-// Store the lower single-precision (32-bit) floating-point element from a into
-// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-//
-//   MEM[mem_addr+31:mem_addr] := a[31:0]
-//   MEM[mem_addr+63:mem_addr+32] := a[31:0]
-//   MEM[mem_addr+95:mem_addr+64] := a[31:0]
-//   MEM[mem_addr+127:mem_addr+96] := a[31:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
-FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
+// Compares for ordered.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
 {
-    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    vst1q_f32(p, vdupq_n_f32(a0));
+    return _mm_move_ss(a, _mm_cmpord_ps(a, b));
 }
 
-// Store the lower single-precision (32-bit) floating-point element from a into
-// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-//
-//   MEM[mem_addr+31:mem_addr] := a[31:0]
-//   MEM[mem_addr+63:mem_addr+32] := a[31:0]
-//   MEM[mem_addr+95:mem_addr+64] := a[31:0]
-//   MEM[mem_addr+127:mem_addr+96] := a[31:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
-#define _mm_store1_ps _mm_store_ps1
+// Compares for unordered.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
+{
+    uint32x4_t f32a =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t f32b =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
+}
 
-// Store 4 single-precision (32-bit) floating-point elements from a into memory
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-//
-//   MEM[mem_addr+31:mem_addr] := a[127:96]
-//   MEM[mem_addr+63:mem_addr+32] := a[95:64]
-//   MEM[mem_addr+95:mem_addr+64] := a[63:32]
-//   MEM[mem_addr+127:mem_addr+96] := a[31:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
-FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
+// Compares for unordered.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
 {
-    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
-    float32x4_t rev = vextq_f32(tmp, tmp, 2);
-    vst1q_f32(p, rev);
+    return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
 }
 
-// Stores four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
-FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
+// Compares the lower single-precision floating point scalar values of a and b
+// using an equality operation. :
+// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
+FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
 {
-    vst1q_f32(p, vreinterpretq_f32_m128(a));
+    uint32x4_t a_eq_b =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_eq_b, 0) & 0x1;
 }
 
-// Stores four 32-bit integer values as (as a __m128i value) at the address p.
-// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
-FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
+// Compares the lower single-precision floating point scalar values of a and b
+// using a greater than or equal operation. :
+// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
+FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
 {
-    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+    uint32x4_t a_ge_b =
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_ge_b, 0) & 0x1;
 }
 
-// Stores four 32-bit integer values as (as a __m128i value) at the address p.
-// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
-FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
+// Compares the lower single-precision floating point scalar values of a and b
+// using a greater than operation. :
+// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
+FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
 {
-    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+    uint32x4_t a_gt_b =
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_gt_b, 0) & 0x1;
 }
 
-// Stores the lower single - precision, floating - point value.
-// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
-FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
+// Compares the lower single-precision floating point scalar values of a and b
+// using a less than or equal operation. :
+// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
+FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
 {
-    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
+    uint32x4_t a_le_b =
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_le_b, 0) & 0x1;
 }
 
-// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
-// or a general-protection exception may be generated.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
-FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
+// Compares the lower single-precision floating point scalar values of a and b
+// using a less than operation. :
+// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
+// note!! The documentation on MSDN is incorrect!  If either of the values is a
+// NAN the docs say you will get a one, but in fact, it will return a zero!!
+FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
 {
-#if defined(__aarch64__)
-    vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
-#else
-    vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
-#endif
+    uint32x4_t a_lt_b =
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_lt_b, 0) & 0x1;
 }
 
-// Store the upper double-precision (64-bit) floating-point element from a into
-// memory.
+// Compares the lower single-precision floating point scalar values of a and b
+// using an inequality operation. :
+// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
+FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
+{
+    return !_mm_comieq_ss(a, b);
+}
+
+// Convert packed signed 32-bit integers in b to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, and copy the upper 2 packed elements from a to the upper elements of
+// dst.
 //
-//   MEM[mem_addr+63:mem_addr] := a[127:64]
+//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+//   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
+//   dst[95:64] := a[95:64]
+//   dst[127:96] := a[127:96]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
-FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
+FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
 {
-#if defined(__aarch64__)
-    vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
-#else
-    vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
-#endif
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+                     vget_high_f32(vreinterpretq_f32_m128(a))));
 }
 
-// Store the lower double-precision (64-bit) floating-point element from a into
-// memory.
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
 //
-//   MEM[mem_addr+63:mem_addr] := a[63:0]
+//   FOR j := 0 to 1
+//       i := 32*j
+//       dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+//   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
-FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
+FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
 {
 #if defined(__aarch64__)
-    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
+    return vreinterpret_m64_s32(
+        vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
 #else
-    vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
+    return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
+        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
 #endif
 }
 
-// Store 2 double-precision (64-bit) floating-point elements from a into memory
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
 //
-//   MEM[mem_addr+63:mem_addr] := a[127:64]
-//   MEM[mem_addr+127:mem_addr+64] := a[63:0]
+//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+//   dst[127:32] := a[127:32]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
-FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
+FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
 {
-    float32x4_t f = vreinterpretq_f32_m128d(a);
-    _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
 }
 
-// Store the lower double-precision (64-bit) floating-point element from a into
-// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
-FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
+FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
 {
 #if defined(__aarch64__)
-    float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
-    vst1q_f64((float64_t *) mem_addr,
-              vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
+    return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
+                          0);
 #else
-    float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
-    vst1q_f32((float32_t *) mem_addr,
-              vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
+    float32_t data = vgetq_lane_f32(
+        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
+    return (int32_t) data;
 #endif
 }
 
-// Store the lower double-precision (64-bit) floating-point element from a into
-// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
-#define _mm_store1_pd _mm_store_pd1
-
-// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from a into memory. mem_addr does not need to be aligned on any
-// particular boundary.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
-FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
+// Convert packed 16-bit integers in a to packed single-precision (32-bit)
+// floating-point elements, and store the results in dst.
+//
+//   FOR j := 0 to 3
+//      i := j*16
+//      m := j*32
+//      dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
+FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
 {
-    _mm_store_pd(mem_addr, a);
+    return vreinterpretq_m128_f32(
+        vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
 }
 
-// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
-// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
+// Convert packed 32-bit integers in b to packed single-precision (32-bit)
+// floating-point elements, store the results in the lower 2 elements of dst,
+// and copy the upper 2 packed elements from a to the upper elements of dst.
+//
+//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+//   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
+//   dst[95:64] := a[95:64]
+//   dst[127:96] := a[127:96]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
+FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
 {
-    uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
-    uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
-    *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+                     vget_high_f32(vreinterpretq_f32_m128(a))));
 }
 
-// Stores the lower two single-precision floating point values of a to the
-// address p.
+// Convert packed signed 32-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, then covert the packed signed 32-bit integers in b to
+// single-precision (32-bit) floating-point element, and store the results in
+// the upper 2 elements of dst.
 //
-//   *p0 := a0
-//   *p1 := a1
+//   dst[31:0] := Convert_Int32_To_FP32(a[31:0])
+//   dst[63:32] := Convert_Int32_To_FP32(a[63:32])
+//   dst[95:64] := Convert_Int32_To_FP32(b[31:0])
+//   dst[127:96] := Convert_Int32_To_FP32(b[63:32])
 //
-// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
-FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
+FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
 {
-    *p = vreinterpret_m64_f32(vget_low_f32(a));
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(
+        vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
 }
 
-// Stores the upper two single-precision, floating-point values of a to the
-// address p.
+// Convert the lower packed 8-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
 //
-//   *p0 := a2
-//   *p1 := a3
+//   FOR j := 0 to 3
+//      i := j*8
+//      m := j*32
+//      dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
+//   ENDFOR
 //
-// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
-FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
+FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
 {
-    *p = vreinterpret_m64_f32(vget_high_f32(a));
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(
+        vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
 }
 
-// Loads a single single-precision, floating-point value, copying it into all
-// four words
-// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_load1_ps(const float *p)
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 16-bit integers, and store the results in dst. Note: this intrinsic
+// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
+// 0x7FFFFFFF.
+//
+//   FOR j := 0 to 3
+//     i := 16*j
+//     k := 32*j
+//     IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF)
+//       dst[i+15:i] := 0x7FFF
+//     ELSE
+//       dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
+FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
 {
-    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
+    const __m128 i16Min = _mm_set_ps1((float) INT16_MIN);
+    const __m128 i16Max = _mm_set_ps1((float) INT16_MAX);
+    const __m128 i32Max = _mm_set_ps1((float) INT32_MAX);
+    const __m128i maxMask = _mm_castps_si128(
+        _mm_and_ps(_mm_cmpge_ps(a, i16Max), _mm_cmple_ps(a, i32Max)));
+    const __m128i betweenMask = _mm_castps_si128(
+        _mm_and_ps(_mm_cmpgt_ps(a, i16Min), _mm_cmplt_ps(a, i16Max)));
+    const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
+                                            _mm_setzero_si128());
+    __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT16_MAX));
+    __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT16_MIN));
+    __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
+    __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
+    return vreinterpret_m64_s16(vmovn_s32(vreinterpretq_s32_m128i(res32)));
 }
 
-// Load a single-precision (32-bit) floating-point element from memory into all
-// elements of dst.
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
 //
-//   dst[31:0] := MEM[mem_addr+31:mem_addr]
-//   dst[63:32] := MEM[mem_addr+31:mem_addr]
-//   dst[95:64] := MEM[mem_addr+31:mem_addr]
-//   dst[127:96] := MEM[mem_addr+31:mem_addr]
+//   FOR j := 0 to 1
+//       i := 32*j
+//       dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+//   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
-#define _mm_load_ps1 _mm_load1_ps
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
+#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
 
-// Sets the lower two single-precision, floating-point values with 64
-// bits of data loaded from the address p; the upper two values are passed
-// through from a.
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 8-bit integers, and store the results in lower 4 elements of dst.
+// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
+// between 0x7F and 0x7FFFFFFF.
 //
-// Return Value
-//   r0 := *p0
-//   r1 := *p1
-//   r2 := a2
-//   r3 := a3
+//   FOR j := 0 to 3
+//     i := 8*j
+//     k := 32*j
+//     IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF)
+//       dst[i+7:i] := 0x7F
+//     ELSE
+//       dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])
+//     FI
+//   ENDFOR
 //
-// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi8
+FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
+{
+    const __m128 i8Min = _mm_set_ps1((float) INT8_MIN);
+    const __m128 i8Max = _mm_set_ps1((float) INT8_MAX);
+    const __m128 i32Max = _mm_set_ps1((float) INT32_MAX);
+    const __m128i maxMask = _mm_castps_si128(
+        _mm_and_ps(_mm_cmpge_ps(a, i8Max), _mm_cmple_ps(a, i32Max)));
+    const __m128i betweenMask = _mm_castps_si128(
+        _mm_and_ps(_mm_cmpgt_ps(a, i8Min), _mm_cmplt_ps(a, i8Max)));
+    const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
+                                            _mm_setzero_si128());
+    __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT8_MAX));
+    __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT8_MIN));
+    __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
+    __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
+    int16x4_t res16 = vmovn_s32(vreinterpretq_s32_m128i(res32));
+    int8x8_t res8 = vmovn_s16(vcombine_s16(res16, res16));
+    uint32_t bitMask[2] = {0xFFFFFFFF, 0};
+    int8x8_t mask = vreinterpret_s8_u32(vld1_u32(bitMask));
+
+    return vreinterpret_m64_s8(vorr_s8(vand_s8(mask, res8), vdup_n_s8(0)));
+}
+
+// Convert packed unsigned 16-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+//
+//   FOR j := 0 to 3
+//      i := j*16
+//      m := j*32
+//      dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
+FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
 {
     return vreinterpretq_m128_f32(
-        vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
+        vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
 }
 
-// Load 4 single-precision (32-bit) floating-point elements from memory into dst
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
+// Convert the lower packed unsigned 8-bit integers in a to packed
+// single-precision (32-bit) floating-point elements, and store the results in
+// dst.
 //
-//   dst[31:0] := MEM[mem_addr+127:mem_addr+96]
-//   dst[63:32] := MEM[mem_addr+95:mem_addr+64]
-//   dst[95:64] := MEM[mem_addr+63:mem_addr+32]
-//   dst[127:96] := MEM[mem_addr+31:mem_addr]
+//   FOR j := 0 to 3
+//      i := j*8
+//      m := j*32
+//      dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
+//   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
-FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
+FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
 {
-    float32x4_t v = vrev64q_f32(vld1q_f32(p));
-    return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
+    return vreinterpretq_m128_f32(vcvtq_f32_u32(
+        vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
 }
 
-// Sets the upper two single-precision, floating-point values with 64
-// bits of data loaded from the address p; the lower two values are passed
-// through from a.
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
 //
-//   r0 := a0
-//   r1 := a1
-//   r2 := *p0
-//   r3 := *p1
+//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+//   dst[127:32] := a[127:32]
 //
-// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
-FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
+#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
+
+// Convert the signed 64-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+//
+//   dst[31:0] := Convert_Int64_To_FP32(b[63:0])
+//   dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
+FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
 {
     return vreinterpretq_m128_f32(
-        vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
+        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
 }
 
-// Loads four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_load_ps(const float *p)
+// Copy the lower single-precision (32-bit) floating-point element of a to dst.
+//
+//   dst[31:0] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
+FORCE_INLINE float _mm_cvtss_f32(__m128 a)
 {
-    return vreinterpretq_m128_f32(vld1q_f32(p));
+    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
 }
 
-// Loads four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+//
+//   dst[31:0] := Convert_FP32_To_Int32(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
+#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+//
+//   dst[63:0] := Convert_FP32_To_Int64(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
+FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
 {
-    // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
-    // equivalent for neon
-    return vreinterpretq_m128_f32(vld1q_f32(p));
+#if defined(__aarch64__)
+    return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
+#else
+    float32_t data = vgetq_lane_f32(
+        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
+    return (int64_t) data;
+#endif
 }
 
-// Load unaligned 16-bit integer from memory into the first element of dst.
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
 //
-//   dst[15:0] := MEM[mem_addr+15:mem_addr]
-//   dst[MAX:16] := 0
+//   FOR j := 0 to 1
+//      i := 32*j
+//      dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+//   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
-FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
+FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
 {
-    return vreinterpretq_m128i_s16(
-        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
+    return vreinterpret_m64_s32(
+        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
 }
 
-// Load unaligned 64-bit integer from memory into the first element of dst.
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
 //
-//   dst[63:0] := MEM[mem_addr+63:mem_addr]
-//   dst[MAX:64] := 0
+//   dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
-FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
+FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
 {
-    return vreinterpretq_m128i_s64(
-        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
+    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
 }
 
-// Load a double-precision (64-bit) floating-point element from memory into the
-// lower of dst, and zero the upper element. mem_addr does not need to be
-// aligned on any particular boundary.
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
 //
-//   dst[63:0] := MEM[mem_addr+63:mem_addr]
-//   dst[127:64] := 0
+//   FOR j := 0 to 1
+//      i := 32*j
+//      dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+//   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
-FORCE_INLINE __m128d _mm_load_sd(const double *p)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
+#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+//
+//   dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
+#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+//
+//   dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
+FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
 {
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
-#else
-    const float *fp = (const float *) p;
-    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
-    return vreinterpretq_m128d_f32(vld1q_f32(data));
-#endif
+    return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
 }
 
-// Loads two double-precision from 16-byte aligned memory, floating-point
-// values.
+// Divides the four single-precision, floating-point values of a and b.
 //
-//   dst[127:0] := MEM[mem_addr+127:mem_addr]
+//   r0 := a0 / b0
+//   r1 := a1 / b1
+//   r2 := a2 / b2
+//   r3 := a3 / b3
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
-FORCE_INLINE __m128d _mm_load_pd(const double *p)
+// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vld1q_f64(p));
+#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
+    return vreinterpretq_m128_f32(
+        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
-    const float *fp = (const float *) p;
-    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
-    return vreinterpretq_m128d_f32(vld1q_f32(data));
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Netwon-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#endif
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
 #endif
 }
 
-// Loads two double-precision from unaligned memory, floating-point values.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
-FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
+// Divides the scalar single-precision floating point value of a by b.
+// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
 {
-    return _mm_load_pd(p);
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
 }
 
-// Loads an single - precision, floating - point value into the low word and
-// clears the upper three words.
-// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_load_ss(const float *p)
-{
-    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
-}
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16
+#define _mm_extract_pi16(a, imm) \
+    (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
 
-// Load 64-bit integer from memory into the first element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
-FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
+// Free aligned memory that was allocated with _mm_malloc.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free
+FORCE_INLINE void _mm_free(void *addr)
 {
-    /* Load the lower 64 bits of the value pointed to by p into the
-     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
-     */
-    return vreinterpretq_m128i_s32(
-        vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
+    free(addr);
 }
 
-// Load a double-precision (64-bit) floating-point element from memory into the
-// lower element of dst, and copy the upper element from a to dst. mem_addr does
-// not need to be aligned on any particular boundary.
-//
-//   dst[63:0] := MEM[mem_addr+63:mem_addr]
-//   dst[127:64] := a[127:64]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
-FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
+// Macro: Get the flush zero bits from the MXCSR control and status register.
+// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
+// _MM_FLUSH_ZERO_OFF
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE
+FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
 {
+    union {
+        fpcr_bitfield field;
 #if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
+        uint64_t value;
 #else
-    return vreinterpretq_m128d_f32(
-        vcombine_f32(vld1_f32((const float *) p),
-                     vget_high_f32(vreinterpretq_f32_m128d(a))));
+        uint32_t value;
 #endif
-}
+    } r;
 
-// Load 2 double-precision (64-bit) floating-point elements from memory into dst
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-//
-//   dst[63:0] := MEM[mem_addr+127:mem_addr+64]
-//   dst[127:64] := MEM[mem_addr+63:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
-FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
-{
 #if defined(__aarch64__)
-    float64x2_t v = vld1q_f64(p);
-    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
+    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
 #else
-    int64x2_t v = vld1q_s64((const int64_t *) p);
-    return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
+    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
 #endif
+
+    return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
 }
 
-// Sets the low word to the single-precision, floating-point value of b
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
-FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
+// Macro: Get the rounding mode bits from the MXCSR control and status register.
+// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
+// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE
+FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
 {
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
-                       vreinterpretq_f32_m128(a), 0));
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    if (r.field.bit22) {
+        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
+    } else {
+        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
+    }
 }
 
-// Move the lower double-precision (64-bit) floating-point element from b to the
-// lower element of dst, and copy the upper element from a to the upper element
-// of dst.
-//
-//   dst[63:0] := b[63:0]
-//   dst[127:64] := a[127:64]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
-FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
+#define _mm_insert_pi16(a, b, imm)                               \
+    __extension__({                                              \
+        vreinterpret_m64_s16(                                    \
+            vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
+    })
+
+// Loads four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_load_ps(const float *p)
 {
-    return vreinterpretq_m128d_f32(
-        vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
-                     vget_high_f32(vreinterpretq_f32_m128d(a))));
+    return vreinterpretq_m128_f32(vld1q_f32(p));
 }
 
-// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
-// upper element.
+// Load a single-precision (32-bit) floating-point element from memory into all
+// elements of dst.
 //
-//   dst[63:0] := a[63:0]
-//   dst[127:64] := 0
+//   dst[31:0] := MEM[mem_addr+31:mem_addr]
+//   dst[63:32] := MEM[mem_addr+31:mem_addr]
+//   dst[95:64] := MEM[mem_addr+31:mem_addr]
+//   dst[127:96] := MEM[mem_addr+31:mem_addr]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
-FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
+#define _mm_load_ps1 _mm_load1_ps
+
+// Loads an single - precision, floating - point value into the low word and
+// clears the upper three words.
+// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_load_ss(const float *p)
 {
-    return vreinterpretq_m128i_s64(
-        vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
+    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
 }
 
-// Return vector of type __m128 with undefined elements.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
-FORCE_INLINE __m128 _mm_undefined_ps(void)
+// Loads a single single-precision, floating-point value, copying it into all
+// four words
+// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_load1_ps(const float *p)
 {
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#endif
-    __m128 a;
-    return a;
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
+    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
 }
 
-/* Logic/Binary operations */
-
-// Computes the bitwise AND-NOT of the four single-precision, floating-point
-// values of a and b.
+// Sets the upper two single-precision, floating-point values with 64
+// bits of data loaded from the address p; the lower two values are passed
+// through from a.
 //
-//   r0 := ~a0 & b0
-//   r1 := ~a1 & b1
-//   r2 := ~a2 & b2
-//   r3 := ~a3 & b3
+//   r0 := a0
+//   r1 := a1
+//   r2 := *p0
+//   r3 := *p1
 //
-// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
+// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
+FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
 {
-    return vreinterpretq_m128_s32(
-        vbicq_s32(vreinterpretq_s32_m128(b),
-                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
 }
 
-// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
-// elements in a and then AND with b, and store the results in dst.
+// Sets the lower two single-precision, floating-point values with 64
+// bits of data loaded from the address p; the upper two values are passed
+// through from a.
 //
-//   FOR j := 0 to 1
-// 	     i := j*64
-// 	     dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
-//   ENDFOR
+// Return Value
+//   r0 := *p0
+//   r1 := *p1
+//   r2 := a2
+//   r3 := a3
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
-FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
+// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
 {
-    // *NOTE* argument swap
-    return vreinterpretq_m128d_s64(
-        vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
 }
 
-// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
-// 128-bit value in a.
+// Load 4 single-precision (32-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
 //
-//   r := (~a) & b
+//   dst[31:0] := MEM[mem_addr+127:mem_addr+96]
+//   dst[63:32] := MEM[mem_addr+95:mem_addr+64]
+//   dst[95:64] := MEM[mem_addr+63:mem_addr+32]
+//   dst[127:96] := MEM[mem_addr+31:mem_addr]
 //
-// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
+FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
 {
-    return vreinterpretq_m128i_s32(
-        vbicq_s32(vreinterpretq_s32_m128i(b),
-                  vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
+    float32x4_t v = vrev64q_f32(vld1q_f32(p));
+    return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
 }
 
-// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
-// b.
-//
-//   r := a & b
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
+// Loads four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
 {
-    return vreinterpretq_m128i_s32(
-        vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+    // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
+    // equivalent for neon
+    return vreinterpretq_m128_f32(vld1q_f32(p));
 }
 
-// Computes the bitwise AND of the four single-precision, floating-point values
-// of a and b.
+// Load unaligned 16-bit integer from memory into the first element of dst.
 //
-//   r0 := a0 & b0
-//   r1 := a1 & b1
-//   r2 := a2 & b2
-//   r3 := a3 & b3
+//   dst[15:0] := MEM[mem_addr+15:mem_addr]
+//   dst[MAX:16] := 0
 //
-// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
+FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
 {
-    return vreinterpretq_m128_s32(
-        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+    return vreinterpretq_m128i_s16(
+        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
 }
 
-// Compute the bitwise AND of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
+// Load unaligned 64-bit integer from memory into the first element of dst.
 //
-//   FOR j := 0 to 1
-//     i := j*64
-//     dst[i+63:i] := a[i+63:i] AND b[i+63:i]
-//   ENDFOR
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[MAX:64] := 0
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
-FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
+FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
 {
-    return vreinterpretq_m128d_s64(
-        vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
 }
 
-// Computes the bitwise OR of the four single-precision, floating-point values
-// of a and b.
-// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
+// Allocate aligned blocks of memory.
+// https://software.intel.com/en-us/
+//         cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
+FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
 {
-    return vreinterpretq_m128_s32(
-        vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+    void *ptr;
+    if (align == 1)
+        return malloc(size);
+    if (align == 2 || (sizeof(void *) == 8 && align == 4))
+        align = sizeof(void *);
+    if (!posix_memalign(&ptr, align, size))
+        return ptr;
+    return NULL;
 }
 
-// Computes bitwise EXOR (exclusive-or) of the four single-precision,
-// floating-point values of a and b.
-// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64
+FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
 {
-    return vreinterpretq_m128_s32(
-        veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+    int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
+    __m128 b = _mm_load_ps((const float *) mem_addr);
+    int8x8_t masked =
+        vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
+                vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
+    vst1_s8((int8_t *) mem_addr, masked);
 }
 
-// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq
+#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
 //
-//   FOR j := 0 to 1
-//      i := j*64
-//      dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+//   FOR j := 0 to 3
+//      i := j*16
+//      dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
-FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
+FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
 {
-    return vreinterpretq_m128d_s64(
-        veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+    return vreinterpret_m64_s16(
+        vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
 }
 
-// Compute the bitwise OR of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
-FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
+// Computes the maximums of the four single-precision, floating-point values of
+// a and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
 {
-    return vreinterpretq_m128d_s64(
-        vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+#if SSE2NEON_PRECISE_MINMAX
+    float32x4_t _a = vreinterpretq_f32_m128(a);
+    float32x4_t _b = vreinterpretq_f32_m128(b);
+    return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
+#else
+    return vreinterpretq_m128_f32(
+        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#endif
 }
 
-// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
 //
-//   r := a | b
+//   FOR j := 0 to 7
+//      i := j*8
+//      dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+//   ENDFOR
 //
-// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
+FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
 {
-    return vreinterpretq_m128i_s32(
-        vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+    return vreinterpret_m64_u8(
+        vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
 }
 
-// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
-// b.  https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
+// Computes the maximum of the two lower scalar single-precision floating point
+// values of a and b.
+// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
 {
-    return vreinterpretq_m128i_s32(
-        veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+    float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
 }
 
-// Duplicate the low double-precision (64-bit) floating-point element from a,
-// and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd
-FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+//
+//   FOR j := 0 to 3
+//      i := j*16
+//      dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
+FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
 {
-#if (__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
-#else
-    return vreinterpretq_m128d_u64(
-        vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
-#endif
+    return vreinterpret_m64_s16(
+        vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
 }
 
-// Duplicate odd-indexed single-precision (32-bit) floating-point elements
-// from a, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
-FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
+// Computes the minima of the four single-precision, floating-point values of a
+// and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
 {
-#if __has_builtin(__builtin_shufflevector)
-    return vreinterpretq_m128_f32(__builtin_shufflevector(
-        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
+#if SSE2NEON_PRECISE_MINMAX
+    float32x4_t _a = vreinterpretq_f32_m128(a);
+    float32x4_t _b = vreinterpretq_f32_m128(b);
+    return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
 #else
-    float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
-    float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
-    float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
+    return vreinterpretq_m128_f32(
+        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #endif
 }
 
-// Duplicate even-indexed single-precision (32-bit) floating-point elements
-// from a, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
-FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+//
+//   FOR j := 0 to 7
+//      i := j*8
+//      dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
+FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
 {
-#if __has_builtin(__builtin_shufflevector)
-    return vreinterpretq_m128_f32(__builtin_shufflevector(
-        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
-#else
-    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
-    float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-#endif
+    return vreinterpret_m64_u8(
+        vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Computes the minimum of the two lower scalar single-precision floating point
+// values of a and b.
+// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
+{
+    float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Sets the low word to the single-precision, floating-point value of b
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
+FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
+                       vreinterpretq_f32_m128(a), 0));
 }
 
 // Moves the upper two values of B into the lower two values of A.
@@ -1577,315 +2093,419 @@ FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
     return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
 }
 
-// Compute the absolute value of packed signed 32-bit integers in a, and store
-// the unsigned results in dst.
-//
-//   FOR j := 0 to 3
-//     i := j*32
-//     dst[i+31:i] := ABS(a[i+31:i])
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
-FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
+// Create mask from the most significant bit of each 8-bit element in a, and
+// store the result in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8
+FORCE_INLINE int _mm_movemask_pi8(__m64 a)
 {
-    return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
+    uint8x8_t input = vreinterpret_u8_m64(a);
+#if defined(__aarch64__)
+    static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+    uint8x8_t tmp = vshr_n_u8(input, 7);
+    return vaddv_u8(vshl_u8(tmp, shift));
+#else
+    // Refer the implementation of `_mm_movemask_epi8`
+    uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
+    uint32x2_t paired16 =
+        vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
+    uint8x8_t paired32 =
+        vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
+    return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
+#endif
 }
 
-// Compute the absolute value of packed signed 16-bit integers in a, and store
-// the unsigned results in dst.
-//
-//   FOR j := 0 to 7
-//     i := j*16
-//     dst[i+15:i] := ABS(a[i+15:i])
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
-FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
+// NEON does not provide this method
+// Creates a 4-bit mask from the most significant bits of the four
+// single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
+FORCE_INLINE int _mm_movemask_ps(__m128 a)
 {
-    return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
+    uint32x4_t input = vreinterpretq_u32_m128(a);
+#if defined(__aarch64__)
+    static const int32x4_t shift = {0, 1, 2, 3};
+    uint32x4_t tmp = vshrq_n_u32(input, 31);
+    return vaddvq_u32(vshlq_u32(tmp, shift));
+#else
+    // Uses the exact same method as _mm_movemask_epi8, see that for details.
+    // Shift out everything but the sign bits with a 32-bit unsigned shift
+    // right.
+    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
+    // Merge the two pairs together with a 64-bit unsigned shift right + add.
+    uint8x16_t paired =
+        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
+    // Extract the result.
+    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
+#endif
 }
 
-// Compute the absolute value of packed signed 8-bit integers in a, and store
-// the unsigned results in dst.
+// Multiplies the four single-precision, floating-point values of a and b.
 //
-//   FOR j := 0 to 15
-//     i := j*8
-//     dst[i+7:i] := ABS(a[i+7:i])
-//   ENDFOR
+//   r0 := a0 * b0
+//   r1 := a1 * b1
+//   r2 := a2 * b2
+//   r3 := a3 * b3
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
-FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
+// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
 {
-    return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
+    return vreinterpretq_m128_f32(
+        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 }
 
-// Compute the absolute value of packed signed 32-bit integers in a, and store
-// the unsigned results in dst.
+// Multiply the lower single-precision (32-bit) floating-point element in a and
+// b, store the result in the lower element of dst, and copy the upper 3 packed
+// elements from a to the upper elements of dst.
 //
-//   FOR j := 0 to 1
-//     i := j*32
-//     dst[i+31:i] := ABS(a[i+31:i])
-//   ENDFOR
+//   dst[31:0] := a[31:0] * b[31:0]
+//   dst[127:32] := a[127:32]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
-FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
+FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
 {
-    return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
+    return _mm_move_ss(a, _mm_mul_ps(a, b));
 }
 
-// Compute the absolute value of packed signed 16-bit integers in a, and store
-// the unsigned results in dst.
-//
-//   FOR j := 0 to 3
-//     i := j*16
-//     dst[i+15:i] := ABS(a[i+15:i])
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
-FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
+FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
 {
-    return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
+    return vreinterpret_m64_u16(vshrn_n_u32(
+        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
 }
 
-// Compute the absolute value of packed signed 8-bit integers in a, and store
-// the unsigned results in dst.
+// Computes the bitwise OR of the four single-precision, floating-point values
+// of a and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
 //
 //   FOR j := 0 to 7
 //     i := j*8
-//     dst[i+7:i] := ABS(a[i+7:i])
+//     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
-FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
-{
-    return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
-}
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
+#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
 
-// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
-// the result right by imm8 bytes, and store the low 16 bytes in dst.
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
 //
-//   tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
-//   dst[127:0] := tmp[127:0]
+//   FOR j := 0 to 3
+//     i := j*16
+//     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
+//   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
-#define _mm_alignr_epi8(a, b, imm)                                            \
-    __extension__({                                                           \
-        __m128i ret;                                                          \
-        if (unlikely((imm) >= 32)) {                                          \
-            ret = _mm_setzero_si128();                                        \
-        } else {                                                              \
-            uint8x16_t tmp_low, tmp_high;                                     \
-            if (imm >= 16) {                                                  \
-                const int idx = imm - 16;                                     \
-                tmp_low = vreinterpretq_u8_m128i(a);                          \
-                tmp_high = vdupq_n_u8(0);                                     \
-                ret =                                                         \
-                    vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \
-            } else {                                                          \
-                const int idx = imm;                                          \
-                tmp_low = vreinterpretq_u8_m128i(b);                          \
-                tmp_high = vreinterpretq_u8_m128i(a);                         \
-                ret =                                                         \
-                    vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \
-            }                                                                 \
-        }                                                                     \
-        ret;                                                                  \
-    })
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
+#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
 
-// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
-// the result right by imm8 bytes, and store the low 8 bytes in dst.
-//
-//   tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
-//   dst[63:0] := tmp[63:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
-#define _mm_alignr_pi8(a, b, imm)                                           \
-    __extension__({                                                         \
-        __m64 ret;                                                          \
-        if (unlikely((imm) >= 16)) {                                        \
-            ret = vreinterpret_m64_s8(vdup_n_s8(0));                        \
-        } else {                                                            \
-            uint8x8_t tmp_low, tmp_high;                                    \
-            if (imm >= 8) {                                                 \
-                const int idx = imm - 8;                                    \
-                tmp_low = vreinterpret_u8_m64(a);                           \
-                tmp_high = vdup_n_u8(0);                                    \
-                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
-            } else {                                                        \
-                const int idx = imm;                                        \
-                tmp_low = vreinterpret_u8_m64(b);                           \
-                tmp_high = vreinterpret_u8_m64(a);                          \
-                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
-            }                                                               \
-        }                                                                   \
-        ret;                                                                \
-    })
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
+#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
 
-// Takes the upper 64 bits of a and places it in the low end of the result
-// Takes the lower 64 bits of b and places it into the high end of the result.
-FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
-{
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
-}
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
+#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
 
-// takes the lower two 32-bit values from a and swaps them and places in high
-// end of result takes the higher two 32 bit values from b and swaps them and
-// places in low end of result.
-FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
+#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
+#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
+#define _m_pminsw(a, b) _mm_min_pi16(a, b)
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
+#define _m_pminub(a, b) _mm_min_pu8(a, b)
+
+// Create mask from the most significant bit of each 8-bit element in a, and
+// store the result in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb
+#define _m_pmovmskb(a) _mm_movemask_pi8(a)
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
+#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
+
+// Loads one cache line of data from address p to a location closer to the
+// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
+FORCE_INLINE void _mm_prefetch(const void *p, int i)
 {
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
+    (void) i;
+    __builtin_prefetch(p);
 }
 
-FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw
+#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
+
+// Shuffle 16-bit integers in a using the control in imm8, and store the results
+// in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw
+#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
+
+// Compute the approximate reciprocal of packed single-precision (32-bit)
+// floating-point elements in a, and store the results in dst. The maximum
+// relative error for this approximation is less than 1.5*2^-12.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
+FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
 {
-    float32x2_t a21 = vget_high_f32(
-        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
-    float32x2_t b03 = vget_low_f32(
-        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
-    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Netwon-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#endif
+    return vreinterpretq_m128_f32(recip);
 }
 
-FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
+// Compute the approximate reciprocal of the lower single-precision (32-bit)
+// floating-point element in a, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
+//
+//   dst[31:0] := (1.0 / a[31:0])
+//   dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
+FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
 {
-    float32x2_t a03 = vget_low_f32(
-        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
-    float32x2_t b21 = vget_high_f32(
-        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
-    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
+    return _mm_move_ss(a, _mm_rcp_ps(a));
 }
 
-FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
+// Computes the approximations of the reciprocal square roots of the four
+// single-precision floating point values of in.
+// The current precision is 1% error.
+// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
 {
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+#if SSE2NEON_PRECISE_SQRT
+    // Additional Netwon-Raphson iteration for accuracy
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+#endif
+    return vreinterpretq_m128_f32(out);
 }
 
-FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
+// Compute the approximate reciprocal square root of the lower single-precision
+// (32-bit) floating-point element in a, store the result in the lower element
+// of dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
+FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
 {
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
+    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
 }
 
-FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
+FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
 {
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
+    uint64x1_t t = vpaddl_u32(vpaddl_u16(
+        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
+    return vreinterpret_m64_u16(
+        vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
 }
 
-// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
-// high
-FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
+// Macro: Set the flush zero bits of the MXCSR control and status register to
+// the value in unsigned 32-bit integer a. The flush zero may contain any of the
+// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE
+FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
 {
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
+    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
+    // regardless of the value of the FZ bit.
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
+
+#if defined(__aarch64__)
+    asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
+#else
+    asm volatile("vmsr FPSCR, %0" ::"r"(r));        /* write */
+#endif
 }
 
-FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
+// Sets the four single-precision, floating-point values to the four inputs.
+// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
 {
-    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
+    float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
 }
 
-FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
+// Sets the four single-precision, floating-point values to w.
+// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_set_ps1(float _w)
 {
-    float32x2_t a22 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
 }
 
-FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
+// Macro: Set the rounding mode bits of the MXCSR control and status register to
+// the value in unsigned 32-bit integer a. The rounding mode may contain any of
+// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
+// _MM_ROUND_TOWARD_ZERO
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
+FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
 {
-    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t b22 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    switch (rounding) {
+    case _MM_ROUND_TOWARD_ZERO:
+        r.field.bit22 = 1;
+        r.field.bit23 = 1;
+        break;
+    case _MM_ROUND_DOWN:
+        r.field.bit22 = 0;
+        r.field.bit23 = 1;
+        break;
+    case _MM_ROUND_UP:
+        r.field.bit22 = 1;
+        r.field.bit23 = 0;
+        break;
+    default:  //_MM_ROUND_NEAREST
+        r.field.bit22 = 0;
+        r.field.bit23 = 0;
+    }
+
+#if defined(__aarch64__)
+    asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
+#else
+    asm volatile("vmsr FPSCR, %0" ::"r"(r));        /* write */
+#endif
 }
 
-FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
+// Copy single-precision (32-bit) floating-point element a to the lower element
+// of dst, and zero the upper 3 elements.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
+FORCE_INLINE __m128 _mm_set_ss(float a)
 {
-    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    float32x2_t a22 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
+    float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
 }
 
-FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
+// Sets the four single-precision, floating-point values to w.
+//
+//   r0 := r1 := r2 := r3 := w
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_set1_ps(float _w)
 {
-    float32x2_t a33 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
-    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
 }
 
-FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
+FORCE_INLINE void _mm_setcsr(unsigned int a)
 {
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
+    _MM_SET_ROUNDING_MODE(a);
 }
 
-FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
+// Sets the four single-precision, floating-point values to the four inputs in
+// reverse order.
+// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
 {
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32_t b2 = vgetq_lane_f32(b, 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
+    float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
 }
 
-FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
+// Clears the four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_setzero_ps(void)
 {
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32_t b2 = vgetq_lane_f32(b, 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
+    return vreinterpretq_m128_f32(vdupq_n_f32(0));
 }
 
-// NEON does not support a general purpose permute intrinsic
-// Selects four specific single-precision, floating-point values from a and b,
-// based on the mask i.
-//
-// C equivalent:
-//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
-//                                 __constrange(0, 255) int imm) {
-//       __m128 ret;
-//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
-//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
-//       return ret;
-//   }
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
-#define _mm_shuffle_ps_default(a, b, imm)                                  \
+// Shuffle 16-bit integers in a using the control in imm8, and store the results
+// in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_pi16(a, imm)                                           \
     __extension__({                                                        \
-        float32x4_t ret;                                                   \
-        ret = vmovq_n_f32(                                                 \
-            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
-        ret = vsetq_lane_f32(                                              \
-            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
-            ret, 1);                                                       \
-        ret = vsetq_lane_f32(                                              \
-            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
-            ret, 2);                                                       \
-        ret = vsetq_lane_f32(                                              \
-            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
-            ret, 3);                                                       \
-        vreinterpretq_m128_f32(ret);                                       \
+        vreinterpret_m64_s16(__builtin_shufflevector(                      \
+            vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
+            ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)));  \
     })
+#else
+#define _mm_shuffle_pi16(a, imm)                                               \
+    __extension__({                                                            \
+        int16x4_t ret;                                                         \
+        ret =                                                                  \
+            vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
+        ret = vset_lane_s16(                                                   \
+            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret,   \
+            1);                                                                \
+        ret = vset_lane_s16(                                                   \
+            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret,   \
+            2);                                                                \
+        ret = vset_lane_s16(                                                   \
+            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret,   \
+            3);                                                                \
+        vreinterpret_m64_s16(ret);                                             \
+    })
+#endif
+
+// Guarantees that every preceding store is globally visible before any
+// subsequent store.
+// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_sfence(void)
+{
+    __sync_synchronize();
+}
 
 // FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
 // int imm)
@@ -1963,1876 +2583,1721 @@ FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
     })
 #endif
 
-// Takes the upper 64 bits of a and places it in the low end of the result
-// Takes the lower 64 bits of a and places it into the high end of the result.
-FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
+// Computes the approximations of square roots of the four single-precision,
+// floating-point values of a. First computes reciprocal square roots and then
+// reciprocals of the four values.
+//
+//   r0 := sqrt(a0)
+//   r1 := sqrt(a1)
+//   r2 := sqrt(a2)
+//   r3 := sqrt(a3)
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
 {
-    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
+#if SSE2NEON_PRECISE_SQRT
+    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+
+    // Test for vrsqrteq_f32(0) -> positive infinity case.
+    // Change to zero, so that s * 1/sqrt(s) result is zero too.
+    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
+    const uint32x4_t div_by_zero =
+        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
+    recip = vreinterpretq_f32_u32(
+        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
+
+    // Additional Netwon-Raphson iteration for accuracy
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+
+    // sqrt(s) = s * 1/sqrt(s)
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
+#elif defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
+#else
+    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+    float32x4_t sq = vrecpeq_f32(recipsq);
+    return vreinterpretq_m128_f32(sq);
+#endif
 }
 
-// takes the lower two 32-bit values from a and swaps them and places in low end
-// of result takes the higher two 32 bit values from a and swaps them and places
-// in high end of result.
-FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
+// Computes the approximation of the square root of the scalar single-precision
+// floating point value of in.
+// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
 {
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
 }
 
-// rotates the least significant 32 bits into the most signficant 32 bits, and
-// shifts the rest down
-FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
+// Stores four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
+FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
 {
-    return vreinterpretq_m128i_s32(
-        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
 }
 
-// rotates the most significant 32 bits into the least signficant 32 bits, and
-// shifts the rest up
-FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+//
+//   MEM[mem_addr+31:mem_addr] := a[31:0]
+//   MEM[mem_addr+63:mem_addr+32] := a[31:0]
+//   MEM[mem_addr+95:mem_addr+64] := a[31:0]
+//   MEM[mem_addr+127:mem_addr+96] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
+FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
 {
-    return vreinterpretq_m128i_s32(
-        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    vst1q_f32(p, vdupq_n_f32(a0));
 }
 
-// gets the lower 64 bits of a, and places it in the upper 64 bits
-// gets the lower 64 bits of a and places it in the lower 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
+// Stores the lower single - precision, floating - point value.
+// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
+FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
 {
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
+    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
 }
 
-// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
-// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+//
+//   MEM[mem_addr+31:mem_addr] := a[31:0]
+//   MEM[mem_addr+63:mem_addr+32] := a[31:0]
+//   MEM[mem_addr+95:mem_addr+64] := a[31:0]
+//   MEM[mem_addr+127:mem_addr+96] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
+#define _mm_store1_ps _mm_store_ps1
+
+// Stores the upper two single-precision, floating-point values of a to the
+// address p.
+//
+//   *p0 := a2
+//   *p1 := a3
+//
+// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
+FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
 {
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
+    *p = vreinterpret_m64_f32(vget_high_f32(a));
 }
 
-// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
-// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
-// places it in the lower 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
+// Stores the lower two single-precision floating point values of a to the
+// address p.
+//
+//   *p0 := a0
+//   *p1 := a1
+//
+// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
+FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
 {
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
+    *p = vreinterpret_m64_f32(vget_low_f32(a));
 }
 
-FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
+// Store 4 single-precision (32-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+//   MEM[mem_addr+31:mem_addr] := a[127:96]
+//   MEM[mem_addr+63:mem_addr+32] := a[95:64]
+//   MEM[mem_addr+95:mem_addr+64] := a[63:32]
+//   MEM[mem_addr+127:mem_addr+96] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
+FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
 {
-    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
-    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
-    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
+    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
+    float32x4_t rev = vextq_f32(tmp, tmp, 2);
+    vst1q_f32(p, rev);
 }
 
-FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
+// Stores four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
+FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
 {
-    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
 }
 
-FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
+// Stores 16-bits of integer data a at the address p.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16
+FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
 {
-    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
-    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
+    vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
 }
 
-// Shuffle packed 8-bit integers in a according to shuffle control mask in the
-// corresponding 8-bit element of b, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
-FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
+// Stores 64-bits of integer data a at the address p.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64
+FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
 {
-    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
-    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
-    uint8x16_t idx_masked =
-        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
-#elif defined(__GNUC__)
-    int8x16_t ret;
-    // %e and %f represent the even and odd D registers
-    // respectively.
-    __asm__ __volatile__(
-        "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
-        "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
-        : [ret] "=&w"(ret)
-        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
-    return vreinterpretq_m128i_s8(ret);
-#else
-    // use this line if testing on aarch64
-    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
-    return vreinterpretq_m128i_s8(
-        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
-                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
-#endif
+    vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
 }
 
-// C equivalent:
-//   __m128i _mm_shuffle_epi32_default(__m128i a,
-//                                     __constrange(0, 255) int imm) {
-//       __m128i ret;
-//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
-//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
-//       return ret;
-//   }
-#define _mm_shuffle_epi32_default(a, imm)                                   \
-    __extension__({                                                         \
-        int32x4_t ret;                                                      \
-        ret = vmovq_n_s32(                                                  \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
-        ret = vsetq_lane_s32(                                               \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
-            ret, 1);                                                        \
-        ret = vsetq_lane_s32(                                               \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
-            ret, 2);                                                        \
-        ret = vsetq_lane_s32(                                               \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
-            ret, 3);                                                        \
-        vreinterpretq_m128i_s32(ret);                                       \
-    })
+// Store 64-bits of integer data from a into memory using a non-temporal memory
+// hint.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi
+FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
+{
+    vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
+}
 
-// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
-// int imm)
-#if defined(__aarch64__)
-#define _mm_shuffle_epi32_splat(a, imm)                          \
-    __extension__({                                              \
-        vreinterpretq_m128i_s32(                                 \
-            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
-    })
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
+// point elements) from a into memory using a non-temporal memory hint.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
+FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, (float32x4_t *) p);
 #else
-#define _mm_shuffle_epi32_splat(a, imm)                                      \
-    __extension__({                                                          \
-        vreinterpretq_m128i_s32(                                             \
-            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
-    })
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
 #endif
+}
 
-// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
-// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
-// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
-//                                        __constrange(0,255) int imm)
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shuffle_epi32(a, imm)                              \
-    __extension__({                                            \
-        int32x4_t _input = vreinterpretq_s32_m128i(a);         \
-        int32x4_t _shuf = __builtin_shufflevector(             \
-            _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
-            ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
-        vreinterpretq_m128i_s32(_shuf);                        \
-    })
-#else  // generic
-#define _mm_shuffle_epi32(a, imm)                        \
-    __extension__({                                      \
-        __m128i ret;                                     \
-        switch (imm) {                                   \
-        case _MM_SHUFFLE(1, 0, 3, 2):                    \
-            ret = _mm_shuffle_epi_1032((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(2, 3, 0, 1):                    \
-            ret = _mm_shuffle_epi_2301((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 3, 2, 1):                    \
-            ret = _mm_shuffle_epi_0321((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(2, 1, 0, 3):                    \
-            ret = _mm_shuffle_epi_2103((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(1, 0, 1, 0):                    \
-            ret = _mm_shuffle_epi_1010((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(1, 0, 0, 1):                    \
-            ret = _mm_shuffle_epi_1001((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 1, 0, 1):                    \
-            ret = _mm_shuffle_epi_0101((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(2, 2, 1, 1):                    \
-            ret = _mm_shuffle_epi_2211((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 1, 2, 2):                    \
-            ret = _mm_shuffle_epi_0122((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(3, 3, 3, 2):                    \
-            ret = _mm_shuffle_epi_3332((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 0, 0, 0):                    \
-            ret = _mm_shuffle_epi32_splat((a), 0);       \
-            break;                                       \
-        case _MM_SHUFFLE(1, 1, 1, 1):                    \
-            ret = _mm_shuffle_epi32_splat((a), 1);       \
-            break;                                       \
-        case _MM_SHUFFLE(2, 2, 2, 2):                    \
-            ret = _mm_shuffle_epi32_splat((a), 2);       \
-            break;                                       \
-        case _MM_SHUFFLE(3, 3, 3, 3):                    \
-            ret = _mm_shuffle_epi32_splat((a), 3);       \
-            break;                                       \
-        default:                                         \
-            ret = _mm_shuffle_epi32_default((a), (imm)); \
-            break;                                       \
-        }                                                \
-        ret;                                             \
-    })
-#endif
+// Subtracts the four single-precision, floating-point values of a and b.
+//
+//   r0 := a0 - b0
+//   r1 := a1 - b1
+//   r2 := a2 - b2
+//   r3 := a3 - b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
 
-// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
-// by imm.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
-// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
-//                                                   __constrange(0,255) int
-//                                                   imm)
-#define _mm_shufflelo_epi16_function(a, imm)                                  \
-    __extension__({                                                           \
-        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
-        int16x4_t lowBits = vget_low_s16(ret);                                \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
-                             1);                                              \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
-                             2);                                              \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
-                             3);                                              \
-        vreinterpretq_m128i_s16(ret);                                         \
-    })
+// Subtract the lower single-precision (32-bit) floating-point element in b from
+// the lower single-precision (32-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper 3 packed elements from
+// a to the upper elements of dst.
+//
+//   dst[31:0] := a[31:0] - b[31:0]
+//   dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
+FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_sub_ps(a, b));
+}
 
-// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
-//                                          __constrange(0,255) int imm)
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shufflelo_epi16(a, imm)                                  \
-    __extension__({                                                  \
-        int16x8_t _input = vreinterpretq_s16_m128i(a);               \
-        int16x8_t _shuf = __builtin_shufflevector(                   \
-            _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
-            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
-        vreinterpretq_m128i_s16(_shuf);                              \
-    })
-#else  // generic
-#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
-#endif
+// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
+// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
+// transposed matrix in these vectors (row0 now contains column 0, etc.).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
+    do {                                                  \
+        float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
+        float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
+        row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
+                            vget_low_f32(ROW23.val[0]));  \
+        row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
+                            vget_low_f32(ROW23.val[1]));  \
+        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
+                            vget_high_f32(ROW23.val[0])); \
+        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
+                            vget_high_f32(ROW23.val[1])); \
+    } while (0)
 
-// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
-// by imm.
-// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
-// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
-//                                                   __constrange(0,255) int
-//                                                   imm)
-#define _mm_shufflehi_epi16_function(a, imm)                                   \
-    __extension__({                                                            \
-        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
-        int16x4_t highBits = vget_high_s16(ret);                               \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
-                             5);                                               \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
-                             6);                                               \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
-                             7);                                               \
-        vreinterpretq_m128i_s16(ret);                                          \
-    })
+// according to the documentation, these intrinsics behave the same as the
+// non-'u' versions.  We'll just alias them here.
+#define _mm_ucomieq_ss _mm_comieq_ss
+#define _mm_ucomige_ss _mm_comige_ss
+#define _mm_ucomigt_ss _mm_comigt_ss
+#define _mm_ucomile_ss _mm_comile_ss
+#define _mm_ucomilt_ss _mm_comilt_ss
+#define _mm_ucomineq_ss _mm_comineq_ss
 
-// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
-//                                          __constrange(0,255) int imm)
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shufflehi_epi16(a, imm)                             \
-    __extension__({                                             \
-        int16x8_t _input = vreinterpretq_s16_m128i(a);          \
-        int16x8_t _shuf = __builtin_shufflevector(              \
-            _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
-            (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
-            (((imm) >> 6) & 0x3) + 4);                          \
-        vreinterpretq_m128i_s16(_shuf);                         \
-    })
-#else  // generic
-#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
+// Return vector of type __m128i with undefined elements.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_undefined_si128
+FORCE_INLINE __m128i _mm_undefined_si128(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
 #endif
+    __m128i a;
+    return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
 
-// Shuffle double-precision (64-bit) floating-point elements using the control
-// in imm8, and store the results in dst.
+// Return vector of type __m128 with undefined elements.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
+FORCE_INLINE __m128 _mm_undefined_ps(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+    __m128 a;
+    return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+// Selects and interleaves the upper two single-precision, floating-point values
+// from a and b.
 //
-//   dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
-//   dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+//   r0 := a2
+//   r1 := b2
+//   r2 := a3
+//   r3 := b3
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shuffle_pd(a, b, imm8)                                          \
-    vreinterpretq_m128d_s64(__builtin_shufflevector(                        \
-        vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \
-        ((imm8 & 0x2) >> 1) + 2))
+// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
-#define _mm_shuffle_pd(a, b, imm8)                                     \
-    _mm_castsi128_pd(_mm_set_epi64x(                                   \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
+    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
+    float32x2x2_t result = vzip_f32(a1, b1);
+    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
 #endif
+}
 
-// Blend packed 16-bit integers from a and b using control mask imm8, and store
-// the results in dst.
+// Selects and interleaves the lower two single-precision, floating-point values
+// from a and b.
 //
-//   FOR j := 0 to 7
-//       i := j*16
-//       IF imm8[j]
-//           dst[i+15:i] := b[i+15:i]
-//       ELSE
-//           dst[i+15:i] := a[i+15:i]
-//       FI
-//   ENDFOR
-// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
-//                                      __constrange(0,255) int imm)
-#define _mm_blend_epi16(a, b, imm)                                        \
-    __extension__({                                                       \
-        const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000,  \
-                                   ((imm) & (1 << 1)) ? 0xFFFF : 0x0000,  \
-                                   ((imm) & (1 << 2)) ? 0xFFFF : 0x0000,  \
-                                   ((imm) & (1 << 3)) ? 0xFFFF : 0x0000,  \
-                                   ((imm) & (1 << 4)) ? 0xFFFF : 0x0000,  \
-                                   ((imm) & (1 << 5)) ? 0xFFFF : 0x0000,  \
-                                   ((imm) & (1 << 6)) ? 0xFFFF : 0x0000,  \
-                                   ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \
-        uint16x8_t _mask_vec = vld1q_u16(_mask);                          \
-        uint16x8_t _a = vreinterpretq_u16_m128i(a);                       \
-        uint16x8_t _b = vreinterpretq_u16_m128i(b);                       \
-        vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a));            \
-    })
-
-// Blend packed 8-bit integers from a and b using mask, and store the results in
-// dst.
+//   r0 := a0
+//   r1 := b0
+//   r2 := a1
+//   r3 := b1
 //
-//   FOR j := 0 to 15
-//       i := j*8
-//       IF mask[i+7]
-//           dst[i+7:i] := b[i+7:i]
-//       ELSE
-//           dst[i+7:i] := a[i+7:i]
-//       FI
-//   ENDFOR
-FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
+// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
 {
-    // Use a signed shift right to create a mask with the sign bit
-    uint8x16_t mask =
-        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
-    uint8x16_t a = vreinterpretq_u8_m128i(_a);
-    uint8x16_t b = vreinterpretq_u8_m128i(_b);
-    return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2x2_t result = vzip_f32(a1, b1);
+    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#endif
 }
 
-/* Shifts */
+// Computes bitwise EXOR (exclusive-or) of the four single-precision,
+// floating-point values of a and b.
+// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
 
+/* SSE2 */
 
-// Shift packed 16-bit integers in a right by imm while shifting in sign
-// bits, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
-FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
+// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
+// unsigned 16-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
 {
-    const int count = (imm & ~15) ? 15 : imm;
-    return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
+    return vreinterpretq_m128i_s16(
+        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 }
 
-// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
-// shifting in zeros.
+// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
+// unsigned 32-bit integers in b.
 //
-//   r0 := a0 << count
-//   r1 := a1 << count
-//   ...
-//   r7 := a7 << count
+//   r0 := a0 + b0
+//   r1 := a1 + b1
+//   r2 := a2 + b2
+//   r3 := a3 + b3
 //
-// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
-#define _mm_slli_epi16(a, imm)                                   \
-    __extension__({                                              \
-        __m128i ret;                                             \
-        if (unlikely((imm)) <= 0) {                              \
-            ret = a;                                             \
-        }                                                        \
-        if (unlikely((imm) > 15)) {                              \
-            ret = _mm_setzero_si128();                           \
-        } else {                                                 \
-            ret = vreinterpretq_m128i_s16(                       \
-                vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
-        }                                                        \
-        ret;                                                     \
-    })
-
-// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
-// shifting in zeros. :
-// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
-// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
-FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
 {
-    if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
-        return a;
-    if (unlikely(imm > 31))
-        return _mm_setzero_si128();
     return vreinterpretq_m128i_s32(
-        vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
+        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
-// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
-// store the results in dst.
-FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
+// Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or
+// unsigned 32-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
 {
-    if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
-        return a;
-    if (unlikely(imm > 63))
-        return _mm_setzero_si128();
     return vreinterpretq_m128i_s64(
-        vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
+        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
 }
 
-// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
+// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
+// unsigned 8-bit integers in b.
+// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
+FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Add packed double-precision (64-bit) floating-point elements in a and b, and
 // store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
+FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] + db[0];
+    c[1] = da[1] + db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add the lower double-precision (64-bit) floating-point element in a and b,
+// store the result in the lower element of dst, and copy the upper element from
+// a to the upper element of dst.
 //
-//   FOR j := 0 to 7
-//     i := j*16
-//     IF imm8[7:0] > 15
-//       dst[i+15:i] := 0
-//     ELSE
-//       dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
-//     FI
-//   ENDFOR
+//   dst[63:0] := a[63:0] + b[63:0]
+//   dst[127:64] := a[127:64]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
-#define _mm_srli_epi16(a, imm)                                             \
-    __extension__({                                                        \
-        __m128i ret;                                                       \
-        if (unlikely(imm) == 0) {                                          \
-            ret = a;                                                       \
-        }                                                                  \
-        if (likely(0 < (imm) && (imm) < 16)) {                             \
-            ret = vreinterpretq_m128i_u16(                                 \
-                vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \
-        } else {                                                           \
-            ret = _mm_setzero_si128();                                     \
-        }                                                                  \
-        ret;                                                               \
-    })
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
+FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_add_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] + db[0];
+    c[1] = da[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
 
-// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
+// Add 64-bit integers a and b, and store the result in dst.
 //
-//   FOR j := 0 to 3
-//     i := j*32
-//     IF imm8[7:0] > 31
-//       dst[i+31:i] := 0
-//     ELSE
-//       dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
-//     FI
-//   ENDFOR
+//   dst[63:0] := a[63:0] + b[63:0]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
-// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_epi32(a, imm)                                             \
-    __extension__({                                                        \
-        __m128i ret;                                                       \
-        if (unlikely((imm) == 0)) {                                        \
-            ret = a;                                                       \
-        }                                                                  \
-        if (likely(0 < (imm) && (imm) < 32)) {                             \
-            ret = vreinterpretq_m128i_u32(                                 \
-                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \
-        } else {                                                           \
-            ret = _mm_setzero_si128();                                     \
-        }                                                                  \
-        ret;                                                               \
-    })
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
+FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
 
-// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
+// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
+// and saturates.
 //
-//   FOR j := 0 to 1
-//     i := j*64
-//     IF imm8[7:0] > 63
-//       dst[i+63:i] := 0
-//     ELSE
-//       dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
-//     FI
-//   ENDFOR
+//   r0 := SignedSaturate(a0 + b0)
+//   r1 := SignedSaturate(a1 + b1)
+//   ...
+//   r7 := SignedSaturate(a7 + b7)
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
-#define _mm_srli_epi64(a, imm)                                             \
-    __extension__({                                                        \
-        __m128i ret;                                                       \
-        if (unlikely((imm) == 0)) {                                        \
-            ret = a;                                                       \
-        }                                                                  \
-        if (likely(0 < (imm) && (imm) < 64)) {                             \
-            ret = vreinterpretq_m128i_u64(                                 \
-                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \
-        } else {                                                           \
-            ret = _mm_setzero_si128();                                     \
-        }                                                                  \
-        ret;                                                               \
-    })
+// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
 
-// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
-// and store the results in dst.
+// Add packed signed 8-bit integers in a and b using saturation, and store the
+// results in dst.
 //
-//   FOR j := 0 to 3
-//     i := j*32
-//     IF imm8[7:0] > 31
-//       dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
-//     ELSE
-//       dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
-//     FI
+//   FOR j := 0 to 15
+//     i := j*8
+//     dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
-// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srai_epi32(a, imm)                                             \
-    __extension__({                                                        \
-        __m128i ret;                                                       \
-        if (unlikely((imm) == 0)) {                                        \
-            ret = a;                                                       \
-        }                                                                  \
-        if (likely(0 < (imm) && (imm) < 32)) {                             \
-            ret = vreinterpretq_m128i_s32(                                 \
-                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \
-        } else {                                                           \
-            ret = vreinterpretq_m128i_s32(                                 \
-                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));              \
-        }                                                                  \
-        ret;                                                               \
-    })
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
+FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
 
-// Shifts the 128 - bit value in a right by imm bytes while shifting in
-// zeros.imm must be an immediate.
-//
-//   r := srl(a, imm*8)
-//
-// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
-// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_si128(a, imm)                                              \
-    __extension__({                                                         \
-        __m128i ret;                                                        \
-        if (unlikely((imm) <= 0)) {                                         \
-            ret = a;                                                        \
-        }                                                                   \
-        if (unlikely((imm) > 15)) {                                         \
-            ret = _mm_setzero_si128();                                      \
-        } else {                                                            \
-            ret = vreinterpretq_m128i_s8(                                   \
-                vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
-        }                                                                   \
-        ret;                                                                \
-    })
+// Add packed unsigned 16-bit integers in a and b using saturation, and store
+// the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
+FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
 
-// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
-// must be an immediate.
-//
-//   r := a << (imm * 8)
-//
-// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
-// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
-#define _mm_slli_si128(a, imm)                                          \
-    __extension__({                                                     \
-        __m128i ret;                                                    \
-        if (unlikely((imm) <= 0)) {                                     \
-            ret = a;                                                    \
-        }                                                               \
-        if (unlikely((imm) > 15)) {                                     \
-            ret = _mm_setzero_si128();                                  \
-        } else {                                                        \
-            ret = vreinterpretq_m128i_s8(vextq_s8(                      \
-                vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
-        }                                                               \
-        ret;                                                            \
-    })
+// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
+// b and saturates..
+// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
 
-// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
-// shifting in zeros.
+// Compute the bitwise AND of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
 //
-//   r0 := a0 << count
-//   r1 := a1 << count
-//   ...
-//   r7 := a7 << count
+//   FOR j := 0 to 1
+//     i := j*64
+//     dst[i+63:i] := a[i+63:i] AND b[i+63:i]
+//   ENDFOR
 //
-// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
+FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
 {
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (unlikely(c > 15))
-        return _mm_setzero_si128();
-
-    int16x8_t vc = vdupq_n_s16((int16_t) c);
-    return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
+    return vreinterpretq_m128d_s64(
+        vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
 }
 
-// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
-// shifting in zeros.
+// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
+// b.
 //
-// r0 := a0 << count
-// r1 := a1 << count
-// r2 := a2 << count
-// r3 := a3 << count
+//   r := a & b
 //
-// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
+// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
 {
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (unlikely(c > 31))
-        return _mm_setzero_si128();
-
-    int32x4_t vc = vdupq_n_s32((int32_t) c);
-    return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
+    return vreinterpretq_m128i_s32(
+        vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
-// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
-// shifting in zeros.
+// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
+// elements in a and then AND with b, and store the results in dst.
 //
-// r0 := a0 << count
-// r1 := a1 << count
+//   FOR j := 0 to 1
+//       i := j*64
+//       dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+//   ENDFOR
 //
-// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
+FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
 {
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (unlikely(c > 63))
-        return _mm_setzero_si128();
-
-    int64x2_t vc = vdupq_n_s64((int64_t) c);
-    return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
+    // *NOTE* argument swap
+    return vreinterpretq_m128d_s64(
+        vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
 }
 
-// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
-// while shifting in zeros.
+// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
+// 128-bit value in a.
 //
-// r0 := srl(a0, count)
-// r1 := srl(a1, count)
-// ...
-// r7 := srl(a7, count)
+//   r := (~a) & b
 //
-// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
+// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
 {
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (unlikely(c > 15))
-        return _mm_setzero_si128();
-
-    int16x8_t vc = vdupq_n_s16(-(int16_t) c);
-    return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
+    return vreinterpretq_m128i_s32(
+        vbicq_s32(vreinterpretq_s32_m128i(b),
+                  vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
 }
 
-// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
-// while shifting in zeros.
+// Computes the average of the 8 unsigned 16-bit integers in a and the 8
+// unsigned 16-bit integers in b and rounds.
 //
-// r0 := srl(a0, count)
-// r1 := srl(a1, count)
-// r2 := srl(a2, count)
-// r3 := srl(a3, count)
+//   r0 := (a0 + b0) / 2
+//   r1 := (a1 + b1) / 2
+//   ...
+//   r7 := (a7 + b7) / 2
 //
-// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
+// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
 {
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (unlikely(c > 31))
-        return _mm_setzero_si128();
-
-    int32x4_t vc = vdupq_n_s32(-(int32_t) c);
-    return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
+    return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
+                                 vreinterpretq_u16_m128i(b));
 }
 
-// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
-// while shifting in zeros.
+// Computes the average of the 16 unsigned 8-bit integers in a and the 16
+// unsigned 8-bit integers in b and rounds.
 //
-// r0 := srl(a0, count)
-// r1 := srl(a1, count)
+//   r0 := (a0 + b0) / 2
+//   r1 := (a1 + b1) / 2
+//   ...
+//   r15 := (a15 + b15) / 2
 //
-// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
+// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
 {
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (unlikely(c > 63))
-        return _mm_setzero_si128();
-
-    int64x2_t vc = vdupq_n_s64(-(int64_t) c);
-    return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
+    return vreinterpretq_m128i_u8(
+        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
 }
 
-// NEON does not provide a version of this function.
-// Creates a 16-bit mask from the most significant bits of the 16 signed or
-// unsigned 8-bit integers in a and zero extends the upper bits.
-// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
-FORCE_INLINE int _mm_movemask_epi8(__m128i a)
-{
-    // Use increasingly wide shifts+adds to collect the sign bits
-    // together.
-    // Since the widening shifts would be rather confusing to follow in little
-    // endian, everything will be illustrated in big endian order instead. This
-    // has a different result - the bits would actually be reversed on a big
-    // endian machine.
-
-    // Starting input (only half the elements are shown):
-    // 89 ff 1d c0 00 10 99 33
-    uint8x16_t input = vreinterpretq_u8_m128i(a);
-
-    // Shift out everything but the sign bits with an unsigned shift right.
-    //
-    // Bytes of the vector::
-    // 89 ff 1d c0 00 10 99 33
-    // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
-    //  |  |  |  |  |  |  |  |
-    // 01 01 00 01 00 00 01 00
-    //
-    // Bits of first important lane(s):
-    // 10001001 (89)
-    // \______
-    //        |
-    // 00000001 (01)
-    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
-
-    // Merge the even lanes together with a 16-bit unsigned shift right + add.
-    // 'xx' represents garbage data which will be ignored in the final result.
-    // In the important bytes, the add functions like a binary OR.
-    //
-    // 01 01 00 01 00 00 01 00
-    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
-    //    \|    \|    \|    \|
-    // xx 03 xx 01 xx 00 xx 02
-    //
-    // 00000001 00000001 (01 01)
-    //        \_______ |
-    //                \|
-    // xxxxxxxx xxxxxx11 (xx 03)
-    uint32x4_t paired16 =
-        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+// Shift a left by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128
+#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
 
-    // Repeat with a wider 32-bit shift + add.
-    // xx 03 xx 01 xx 00 xx 02
-    //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
-    //     14))
-    //          \|          \|
-    // xx xx xx 0d xx xx xx 02
-    //
-    // 00000011 00000001 (03 01)
-    //        \\_____ ||
-    //         '----.\||
-    // xxxxxxxx xxxx1101 (xx 0d)
-    uint64x2_t paired32 =
-        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+// Shift a right by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128
+#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
 
-    // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
-    // lanes. xx xx xx 0d xx xx xx 02
-    //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
-    //            28))
-    //                      \|
-    // xx xx xx xx xx xx xx d2
-    //
-    // 00001101 00000010 (0d 02)
-    //     \   \___ |  |
-    //      '---.  \|  |
-    // xxxxxxxx 11010010 (xx d2)
-    uint8x16_t paired64 =
-        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+// Cast vector of type __m128d to type __m128. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
+FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
+{
+    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
+}
 
-    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
-    // xx xx xx xx xx xx xx d2
-    //                      ||  return paired64[0]
-    //                      d2
-    // Note: Little endian would return the correct value 4b (01001011) instead.
-    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
+// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
+FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
+{
+    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
 }
 
-// Copy the lower 64-bit integer in a to dst.
-//
-//   dst[63:0] := a[63:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
-FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
+// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
+FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
 {
-    return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
+    return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
 }
 
-// Copy the 64-bit integer a to the lower element of dst, and zero the upper
-// element.
-//
-//   dst[63:0] := a[63:0]
-//   dst[127:64] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
-FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
+// Applies a type cast to reinterpret four 32-bit floating point values passed
+// in as a 128-bit parameter as packed 32-bit integers.
+// https://msdn.microsoft.com/en-us/library/bb514099.aspx
+FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
 {
-    return vreinterpretq_m128i_s64(
-        vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
+    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
 }
 
-// NEON does not provide this method
-// Creates a 4-bit mask from the most significant bits of the four
-// single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
-FORCE_INLINE int _mm_movemask_ps(__m128 a)
+// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
+FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
 {
-    uint32x4_t input = vreinterpretq_u32_m128(a);
 #if defined(__aarch64__)
-    static const int32x4_t shift = {0, 1, 2, 3};
-    uint32x4_t tmp = vshrq_n_u32(input, 31);
-    return vaddvq_u32(vshlq_u32(tmp, shift));
+    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
 #else
-    // Uses the exact same method as _mm_movemask_epi8, see that for details.
-    // Shift out everything but the sign bits with a 32-bit unsigned shift
-    // right.
-    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
-    // Merge the two pairs together with a 64-bit unsigned shift right + add.
-    uint8x16_t paired =
-        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
-    // Extract the result.
-    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
+    return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
 #endif
 }
 
-// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
-// all 1's, and return 1 if the result is zero, otherwise return 0.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
-FORCE_INLINE int _mm_test_all_ones(__m128i a)
+// Applies a type cast to reinterpret four 32-bit integers passed in as a
+// 128-bit parameter as packed 32-bit floating point values.
+// https://msdn.microsoft.com/en-us/library/bb514029.aspx
+FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
 {
-    return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
-           ~(uint64_t) 0;
+    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
 }
 
-// Compute the bitwise AND of 128 bits (representing integer data) in a and
-// mask, and return 1 if the result is zero, otherwise return 0.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
-FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
+// Cache line containing p is flushed and invalidated from all caches in the
+// coherency domain. :
+// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
+FORCE_INLINE void _mm_clflush(void const *p)
 {
-    int64x2_t a_and_mask =
-        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
-    return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
-                                                                           : 1;
+    (void) p;
+    // no corollary for Neon?
 }
 
-/* Math operations */
+// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
+// unsigned 16-bit integers in b for equality.
+// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
 
-// Subtracts the four single-precision, floating-point values of a and b.
-//
-//   r0 := a0 - b0
-//   r1 := a1 - b1
-//   r2 := a2 - b2
-//   r3 := a3 - b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
+// Compare packed 32-bit integers in a and b for equality, and store the results
+// in dst
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
 {
-    return vreinterpretq_m128_f32(
-        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128i_u32(
+        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
-// Subtract the lower single-precision (32-bit) floating-point element in b from
-// the lower single-precision (32-bit) floating-point element in a, store the
-// result in the lower element of dst, and copy the upper 3 packed elements from
-// a to the upper elements of dst.
-//
-//   dst[31:0] := a[31:0] - b[31:0]
-//   dst[127:32] := a[127:32]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
-FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
+// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
+// unsigned 8-bit integers in b for equality.
+// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
 {
-    return _mm_move_ss(a, _mm_sub_ps(a, b));
+    return vreinterpretq_m128i_u8(
+        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 }
 
-// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
-// and store the results in dst.
-//    r0 := a0 - b0
-//    r1 := a1 - b1
-FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for equality, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
+FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128i_s64(
-        vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
+#endif
 }
 
-// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
-// unsigned 32-bit integers of a.
-//
-//   r0 := a0 - b0
-//   r1 := a1 - b1
-//   r2 := a2 - b2
-//   r3 := a3 - b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for equality, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd
+FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128i_s32(
-        vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+    return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
 }
 
-// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
-// store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
-FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for greater-than-or-equal, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd
+FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128i_s16(
-        vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
 }
 
-// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
-// store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
-FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for greater-than-or-equal, store the result in the lower element of dst,
+// and copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd
+FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128i_s8(
-        vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpge_pd(a, b));
+#else
+    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
 }
 
-// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
+// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
+// in b for greater than.
 //
-//   dst[63:0] := a[63:0] - b[63:0]
+//   r0 := (a0 > b0) ? 0xffff : 0x0
+//   r1 := (a1 > b1) ? 0xffff : 0x0
+//   ...
+//   r7 := (a7 > b7) ? 0xffff : 0x0
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
-FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
+// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
 {
-    return vreinterpret_m64_s64(
-        vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+    return vreinterpretq_m128i_u16(
+        vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 }
 
-// Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit
-// integers of a and saturates..
-// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
+// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
+// in b for greater than.
+// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
 {
-    return vreinterpretq_m128i_u16(
-        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+    return vreinterpretq_m128i_u32(
+        vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
-// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
-// integers of a and saturates.
+// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
+// in b for greater than.
 //
-//   r0 := UnsignedSaturate(a0 - b0)
-//   r1 := UnsignedSaturate(a1 - b1)
+//   r0 := (a0 > b0) ? 0xff : 0x0
+//   r1 := (a1 > b1) ? 0xff : 0x0
 //   ...
-//   r15 := UnsignedSaturate(a15 - b15)
+//   r15 := (a15 > b15) ? 0xff : 0x0
 //
-// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
-FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
+// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
 {
     return vreinterpretq_m128i_u8(
-        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+        vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 }
 
-// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
-// of a and saturates.
-//
-//   r0 := SignedSaturate(a0 - b0)
-//   r1 := SignedSaturate(a1 - b1)
-//   ...
-//   r15 := SignedSaturate(a15 - b15)
-//
-// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
-FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for greater-than, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd
+FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128i_s8(
-        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
 }
 
-// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
-// of a and saturates.
-//
-//   r0 := SignedSaturate(a0 - b0)
-//   r1 := SignedSaturate(a1 - b1)
-//   ...
-//   r7 := SignedSaturate(a7 - b7)
-//
-// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
-FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for greater-than, store the result in the lower element of dst, and copy
+// the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd
+FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128i_s16(
-        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
+#else
+    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
 }
 
-// Subtract packed double-precision (64-bit) floating-point elements in b from
-// packed double-precision (64-bit) floating-point elements in a, and store the
-// results in dst.
-//
-//   FOR j := 0 to 1
-//     i := j*64
-//     dst[i+63:i] := a[i+63:i] - b[i+63:i]
-//   ENDFOR
-//
-//  https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
-FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than-or-equal, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd
+FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+    return vreinterpretq_m128d_u64(
+        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] - db[0];
-    c[1] = da[1] - db[1];
-    return vld1q_f32((float32_t *) c);
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
 }
 
-// Subtract the lower double-precision (64-bit) floating-point element in b from
-// the lower double-precision (64-bit) floating-point element in a, store the
-// result in the lower element of dst, and copy the upper element from a to the
-// upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
-FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than-or-equal, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd
+FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
 {
-    return _mm_move_sd(a, _mm_sub_pd(a, b));
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmple_pd(a, b));
+#else
+    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
 }
 
-// Add packed unsigned 16-bit integers in a and b using saturation, and store
-// the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
-FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
+// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
+// in b for less than.
+//
+//   r0 := (a0 < b0) ? 0xffff : 0x0
+//   r1 := (a1 < b1) ? 0xffff : 0x0
+//   ...
+//   r7 := (a7 < b7) ? 0xffff : 0x0
+//
+// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
 {
     return vreinterpretq_m128i_u16(
-        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+        vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 }
 
-// Negate packed 8-bit integers in a when the corresponding signed
-// 8-bit integer in b is negative, and store the results in dst.
-// Element in dst are zeroed out when the corresponding element
-// in b is zero.
-//
-//   for i in 0..15
-//     if b[i] < 0
-//       r[i] := -a[i]
-//     else if b[i] == 0
-//       r[i] := 0
-//     else
-//       r[i] := a[i]
-//     fi
-//   done
-FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
+
+// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
+// in b for less than.
+// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
 {
-    int8x16_t a = vreinterpretq_s8_m128i(_a);
-    int8x16_t b = vreinterpretq_s8_m128i(_b);
+    return vreinterpretq_m128i_u32(
+        vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
 
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFF : 0
-    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
+// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
+// in b for lesser than.
+// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
 
-    // (b == 0) ? 0xFF : 0
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd
+FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
+{
 #if defined(__aarch64__)
-    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
+    return vreinterpretq_m128d_u64(
+        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
-
-    // bitwise select either a or nagative 'a' (vnegq_s8(a) return nagative 'a')
-    // based on ltMask
-    int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
-    // res = masked & (~zeroMask)
-    int8x16_t res = vbicq_s8(masked, zeroMask);
-
-    return vreinterpretq_m128i_s8(res);
 }
 
-// Negate packed 16-bit integers in a when the corresponding signed
-// 16-bit integer in b is negative, and store the results in dst.
-// Element in dst are zeroed out when the corresponding element
-// in b is zero.
-//
-//   for i in 0..7
-//     if b[i] < 0
-//       r[i] := -a[i]
-//     else if b[i] == 0
-//       r[i] := 0
-//     else
-//       r[i] := a[i]
-//     fi
-//   done
-FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd
+FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
 {
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFF : 0
-    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
-    // (b == 0) ? 0xFFFF : 0
 #if defined(__aarch64__)
-    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
+    return _mm_move_sd(a, _mm_cmplt_pd(a, b));
 #else
-    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
-
-    // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
-    // 'a') based on ltMask
-    int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
-    // res = masked & (~zeroMask)
-    int16x8_t res = vbicq_s16(masked, zeroMask);
-    return vreinterpretq_m128i_s16(res);
 }
 
-// Negate packed 32-bit integers in a when the corresponding signed
-// 32-bit integer in b is negative, and store the results in dst.
-// Element in dst are zeroed out when the corresponding element
-// in b is zero.
-//
-//   for i in 0..3
-//     if b[i] < 0
-//       r[i] := -a[i]
-//     else if b[i] == 0
-//       r[i] := 0
-//     else
-//       r[i] := a[i]
-//     fi
-//   done
-FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-equal, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd
+FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
 {
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFFFFFF : 0
-    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
-
-    // (b == 0) ? 0xFFFFFFFF : 0
 #if defined(__aarch64__)
-    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
+    return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
 #else
-    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
 #endif
-
-    // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
-    // 'a') based on ltMask
-    int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
-    // res = masked & (~zeroMask)
-    int32x4_t res = vbicq_s32(masked, zeroMask);
-    return vreinterpretq_m128i_s32(res);
 }
 
-// Negate packed 16-bit integers in a when the corresponding signed 16-bit
-// integer in b is negative, and store the results in dst. Element in dst are
-// zeroed out when the corresponding element in b is zero.
-//
-//   FOR j := 0 to 3
-//      i := j*16
-//      IF b[i+15:i] < 0
-//        dst[i+15:i] := -(a[i+15:i])
-//      ELSE IF b[i+15:i] == 0
-//        dst[i+15:i] := 0
-//      ELSE
-//        dst[i+15:i] := a[i+15:i]
-//      FI
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
-FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-equal, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd
+FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
 {
-    int16x4_t a = vreinterpret_s16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFF : 0
-    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
+    return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
+}
 
-    // (b == 0) ? 0xFFFF : 0
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-greater-than-or-equal, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd
+FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
+{
 #if defined(__aarch64__)
-    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
 #else
-    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
-
-    // bitwise select either a or nagative 'a' (vneg_s16(a) return nagative 'a')
-    // based on ltMask
-    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
-    // res = masked & (~zeroMask)
-    int16x4_t res = vbic_s16(masked, zeroMask);
-
-    return vreinterpret_m64_s16(res);
 }
 
-// Negate packed 32-bit integers in a when the corresponding signed 32-bit
-// integer in b is negative, and store the results in dst. Element in dst are
-// zeroed out when the corresponding element in b is zero.
-//
-//   FOR j := 0 to 1
-//      i := j*32
-//      IF b[i+31:i] < 0
-//        dst[i+31:i] := -(a[i+31:i])
-//      ELSE IF b[i+31:i] == 0
-//        dst[i+31:i] := 0
-//      ELSE
-//        dst[i+31:i] := a[i+31:i]
-//      FI
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
-FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-greater-than-or-equal, store the result in the lower element of
+// dst, and copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd
+FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
 {
-    int32x2_t a = vreinterpret_s32_m64(_a);
-    int32x2_t b = vreinterpret_s32_m64(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFFFFFF : 0
-    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
+    return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
+}
 
-    // (b == 0) ? 0xFFFFFFFF : 0
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-greater-than, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cmpngt_pd
+FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
+{
 #if defined(__aarch64__)
-    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
 #else
-    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
-
-    // bitwise select either a or nagative 'a' (vneg_s32(a) return nagative 'a')
-    // based on ltMask
-    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
-    // res = masked & (~zeroMask)
-    int32x2_t res = vbic_s32(masked, zeroMask);
-
-    return vreinterpret_m64_s32(res);
 }
 
-// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
-// in b is negative, and store the results in dst. Element in dst are zeroed out
-// when the corresponding element in b is zero.
-//
-//   FOR j := 0 to 7
-//      i := j*8
-//      IF b[i+7:i] < 0
-//        dst[i+7:i] := -(a[i+7:i])
-//      ELSE IF b[i+7:i] == 0
-//        dst[i+7:i] := 0
-//      ELSE
-//        dst[i+7:i] := a[i+7:i]
-//      FI
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
-FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-greater-than, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd
+FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
 {
-    int8x8_t a = vreinterpret_s8_m64(_a);
-    int8x8_t b = vreinterpret_s8_m64(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFF : 0
-    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
+    return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
+}
 
-    // (b == 0) ? 0xFF : 0
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-less-than-or-equal, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd
+FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
+{
 #if defined(__aarch64__)
-    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
 #else
-    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
-
-    // bitwise select either a or nagative 'a' (vneg_s8(a) return nagative 'a')
-    // based on ltMask
-    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
-    // res = masked & (~zeroMask)
-    int8x8_t res = vbic_s8(masked, zeroMask);
-
-    return vreinterpret_m64_s8(res);
 }
 
-// Average packed unsigned 16-bit integers in a and b, and store the results in
-// dst.
-//
-//   FOR j := 0 to 3
-//     i := j*16
-//     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
-FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-less-than-or-equal, store the result in the lower element of dst,
+// and copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd
+FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
 {
-    return vreinterpret_m64_u16(
-        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
+    return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
 }
 
-// Average packed unsigned 8-bit integers in a and b, and store the results in
-// dst.
-//
-//   FOR j := 0 to 7
-//     i := j*8
-//     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
-FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-less-than, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd
+FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
 {
-    return vreinterpret_m64_u8(
-        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
 }
 
-// Average packed unsigned 8-bit integers in a and b, and store the results in
-// dst.
-//
-//   FOR j := 0 to 7
-//     i := j*8
-//     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
-#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
-
-// Average packed unsigned 16-bit integers in a and b, and store the results in
-// dst.
-//
-//   FOR j := 0 to 3
-//     i := j*16
-//     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
-#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
-
-// Extract a 16-bit integer from a, selected with imm8, and store the result in
-// the lower element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
-#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
-
-// Copy a to dst, and insert the 16-bit integer i into dst at the location
-// specified by imm8.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
-#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
-
-// Compare packed signed 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
-#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
-#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
-
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
-#define _m_pminsw(a, b) _mm_min_pi16(a, b)
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
-#define _m_pminub(a, b) _mm_min_pu8(a, b)
-
-// Computes the average of the 16 unsigned 8-bit integers in a and the 16
-// unsigned 8-bit integers in b and rounds.
-//
-//   r0 := (a0 + b0) / 2
-//   r1 := (a1 + b1) / 2
-//   ...
-//   r15 := (a15 + b15) / 2
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-less-than, store the result in the lower element of dst, and copy
+// the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd
+FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128i_u8(
-        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+    return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
 }
 
-// Computes the average of the 8 unsigned 16-bit integers in a and the 8
-// unsigned 16-bit integers in b and rounds.
-//
-//   r0 := (a0 + b0) / 2
-//   r1 := (a1 + b1) / 2
-//   ...
-//   r7 := (a7 + b7) / 2
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// to see if neither is NaN, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd
+FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
 {
-    return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
-                                 vreinterpretq_u16_m128i(b));
+#if defined(__aarch64__)
+    // Excluding NaNs, any two floating point numbers can be compared.
+    uint64x2_t not_nan_a =
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
+    uint64x2_t not_nan_b =
+        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
+    return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? ~UINT64_C(0)
+               : UINT64_C(0);
+    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
+            (*(double *) &b1) == (*(double *) &b1))
+               ? ~UINT64_C(0)
+               : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
 }
 
-// Adds the four single-precision, floating-point values of a and b.
-//
-//   r0 := a0 + b0
-//   r1 := a1 + b1
-//   r2 := a2 + b2
-//   r3 := a3 + b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b to see if neither is NaN, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd
+FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128_f32(
-        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpord_pd(a, b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? ~UINT64_C(0)
+               : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
 }
 
-// Add packed double-precision (64-bit) floating-point elements in a and b, and
-// store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
-FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// to see if either is NaN, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd
+FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+    // Two NaNs are not equal in comparison operation.
+    uint64x2_t not_nan_a =
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
+    uint64x2_t not_nan_b =
+        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
+    return vreinterpretq_m128d_s32(
+        vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] + db[0];
-    c[1] = da[1] + db[1];
-    return vld1q_f32((float32_t *) c);
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? UINT64_C(0)
+               : ~UINT64_C(0);
+    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
+            (*(double *) &b1) == (*(double *) &b1))
+               ? UINT64_C(0)
+               : ~UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
 }
 
-// Add the lower double-precision (64-bit) floating-point element in a and b,
-// store the result in the lower element of dst, and copy the upper element from
-// a to the upper element of dst.
-//
-//   dst[63:0] := a[63:0] + b[63:0]
-//   dst[127:64] := a[127:64]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
-FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b to see if either is NaN, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd
+FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_add_pd(a, b));
+    return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] + db[0];
-    c[1] = da[1];
-    return vld1q_f32((float32_t *) c);
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? UINT64_C(0)
+               : ~UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
 }
 
-// Add 64-bit integers a and b, and store the result in dst.
-//
-//   dst[63:0] := a[63:0] + b[63:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
-FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for greater-than-or-equal, and return the boolean result (0 or 1).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd
+FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
 {
-    return vreinterpret_m64_s64(
-        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
-}
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
 
-// adds the scalar single-precision floating point values of a and b.
-// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
-{
-    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
-    // the upper values in the result must be the remnants of <a>.
-    return vreinterpretq_m128_f32(vaddq_f32(a, value));
+    return (*(double *) &a0 >= *(double *) &b0);
+#endif
 }
 
-// Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or
-// unsigned 32-bit integers in b.
-// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for greater-than, and return the boolean result (0 or 1).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd
+FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128i_s64(
-        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+    return (*(double *) &a0 > *(double *) &b0);
+#endif
 }
 
-// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
-// unsigned 32-bit integers in b.
-//
-//   r0 := a0 + b0
-//   r1 := a1 + b1
-//   r2 := a2 + b2
-//   r3 := a3 + b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for less-than-or-equal, and return the boolean result (0 or 1).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd
+FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128i_s32(
-        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+    return (*(double *) &a0 <= *(double *) &b0);
+#endif
 }
 
-// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
-// unsigned 16-bit integers in b.
-// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for less-than, and return the boolean result (0 or 1).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd
+FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128i_s16(
-        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+    return (*(double *) &a0 < *(double *) &b0);
+#endif
 }
 
-// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
-// unsigned 8-bit integers in b.
-// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
-FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for equality, and return the boolean result (0 or 1).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd
+FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128i_s8(
-        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
+#else
+    uint32x4_t a_not_nan =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
+    uint32x4_t b_not_nan =
+        vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_eq_b =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
+                                       vreinterpretq_u64_u32(a_eq_b));
+    return vgetq_lane_u64(and_results, 0) & 0x1;
+#endif
 }
 
-// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
-// and saturates.
-//
-//   r0 := SignedSaturate(a0 + b0)
-//   r1 := SignedSaturate(a1 + b1)
-//   ...
-//   r7 := SignedSaturate(a7 + b7)
-//
-// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for not-equal, and return the boolean result (0 or 1).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd
+FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128i_s16(
-        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+    return !_mm_comieq_sd(a, b);
 }
 
-// Add packed signed 8-bit integers in a and b using saturation, and store the
-// results in dst.
+// Convert packed signed 32-bit integers in a to packed double-precision
+// (64-bit) floating-point elements, and store the results in dst.
 //
-//   FOR j := 0 to 15
-//     i := j*8
-//     dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
+//   FOR j := 0 to 1
+//     i := j*32
+//     m := j*64
+//     dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
-FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd
+FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
 {
-    return vreinterpretq_m128i_s8(
-        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
+#else
+    double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
+    double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
 }
 
-// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
-// b and saturates..
-// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
+// Converts the four signed 32-bit integer values of a to single-precision,
+// floating-point values
+// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
 {
-    return vreinterpretq_m128i_u8(
-        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
 }
 
-// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
-// unsigned 16-bit integers from b.
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
 //
-//   r0 := (a0 * b0)[15:0]
-//   r1 := (a1 * b1)[15:0]
-//   ...
-//   r7 := (a7 * b7)[15:0]
+//   FOR j := 0 to 1
+//      i := 32*j
+//      k := 64*j
+//      dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
+//   ENDFOR
 //
-// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32
+FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
 {
-    return vreinterpretq_m128i_s16(
-        vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double d0 = ((double *) &rnd)[0];
+    double d1 = ((double *) &rnd)[1];
+    return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
 }
 
-// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
-// unsigned 32-bit integers from b.
-// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+//
+//   FOR j := 0 to 1
+//      i := 32*j
+//      k := 64*j
+//      dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32
+FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
 {
-    return vreinterpretq_m128i_s32(
-        vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double d0 = ((double *) &rnd)[0];
+    double d1 = ((double *) &rnd)[1];
+    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
+    return vreinterpret_m64_s32(vld1_s32(data));
 }
 
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed single-precision (32-bit) floating-point elements, and store the
+// results in dst.
 //
-//   FOR j := 0 to 3
-//      i := j*16
-//      tmp[31:0] := a[i+15:i] * b[i+15:i]
-//      dst[i+15:i] := tmp[31:16]
+//   FOR j := 0 to 1
+//     i := 32*j
+//     k := 64*j
+//     dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
 //   ENDFOR
+//   dst[127:64] := 0
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
-#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
+FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
+{
+#if defined(__aarch64__)
+    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
+    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
+#else
+    float a0 = (float) ((double *) &a)[0];
+    float a1 = (float) ((double *) &a)[1];
+    return _mm_set_ps(0, 0, a1, a0);
+#endif
+}
 
-// Multiplies the four single-precision, floating-point values of a and b.
+// Convert packed signed 32-bit integers in a to packed double-precision
+// (64-bit) floating-point elements, and store the results in dst.
 //
-//   r0 := a0 * b0
-//   r1 := a1 * b1
-//   r2 := a2 * b2
-//   r3 := a3 * b3
+//   FOR j := 0 to 1
+//     i := j*32
+//     m := j*64
+//     dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+//   ENDFOR
 //
-// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd
+FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
 {
-    return vreinterpretq_m128_f32(
-        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
+#else
+    double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
+    double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
 }
 
-// Multiply packed double-precision (64-bit) floating-point elements in a and b,
-// and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
-FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
+// Converts the four single-precision, floating-point values of a to signed
+// 32-bit integer values.
+//
+//   r0 := (int) a0
+//   r1 := (int) a1
+//   r2 := (int) a2
+//   r3 := (int) a3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
+// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
+// does not support! It is supported on ARMv8-A however.
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
 {
 #if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+    switch (_MM_GET_ROUNDING_MODE()) {
+    case _MM_ROUND_NEAREST:
+        return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
+    case _MM_ROUND_DOWN:
+        return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
+    case _MM_ROUND_UP:
+        return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
+    default:  // _MM_ROUND_TOWARD_ZERO
+        return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
+    }
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] * db[0];
-    c[1] = da[1] * db[1];
-    return vld1q_f32((float32_t *) c);
+    float *f = (float *) &a;
+    switch (_MM_GET_ROUNDING_MODE()) {
+    case _MM_ROUND_NEAREST: {
+        uint32x4_t signmask = vdupq_n_u32(0x80000000);
+        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
+                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
+        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
+            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
+        int32x4_t r_trunc = vcvtq_s32_f32(
+            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
+        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
+            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
+        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
+                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
+        float32x4_t delta = vsubq_f32(
+            vreinterpretq_f32_m128(a),
+            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
+        uint32x4_t is_delta_half =
+            vceqq_f32(delta, half); /* delta == +/- 0.5 */
+        return vreinterpretq_m128i_s32(
+            vbslq_s32(is_delta_half, r_even, r_normal));
+    }
+    case _MM_ROUND_DOWN:
+        return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
+                             floorf(f[0]));
+    case _MM_ROUND_UP:
+        return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
+                             ceilf(f[0]));
+    default:  // _MM_ROUND_TOWARD_ZERO
+        return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
+                             (int32_t) f[0]);
+    }
 #endif
 }
 
-// Multiply the lower double-precision (64-bit) floating-point element in a and
-// b, store the result in the lower element of dst, and copy the upper element
-// from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
-FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed double-precision (64-bit) floating-point elements, and store the
+// results in dst.
+//
+//   FOR j := 0 to 1
+//     i := 64*j
+//     k := 32*j
+//     dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
+FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
 {
-    return _mm_move_sd(a, _mm_mul_pd(a, b));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
+#else
+    double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
 }
 
-// Multiply the lower single-precision (32-bit) floating-point element in a and
-// b, store the result in the lower element of dst, and copy the upper 3 packed
-// elements from a to the upper elements of dst.
+// Copy the lower double-precision (64-bit) floating-point element of a to dst.
 //
-//   dst[31:0] := a[31:0] * b[31:0]
-//   dst[127:32] := a[127:32]
+//   dst[63:0] := a[63:0]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
-FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
+FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
 {
-    return _mm_move_ss(a, _mm_mul_ps(a, b));
+#if defined(__aarch64__)
+    return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
+#else
+    return ((double *) &a)[0];
+#endif
 }
 
-// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
-// a and b, and store the unsigned 64-bit results in dst.
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
 //
-//   r0 :=  (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
-//   r1 :=  (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
-FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
+//   dst[31:0] := Convert_FP64_To_Int32(a[63:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32
+FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
 {
-    // vmull_u32 upcasts instead of masking, so we downcast.
-    uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
-    uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
-    return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
+#if defined(__aarch64__)
+    return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
+#else
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double ret = ((double *) &rnd)[0];
+    return (int32_t) ret;
+#endif
 }
 
-// Multiply the low unsigned 32-bit integers from a and b, and store the
-// unsigned 64-bit result in dst.
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
 //
-//   dst[63:0] := a[31:0] * b[31:0]
+//   dst[63:0] := Convert_FP64_To_Int64(a[63:0])
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
-FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64
+FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
 {
-    return vreinterpret_m64_u64(vget_low_u64(
-        vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
+#if defined(__aarch64__)
+    return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
+#else
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double ret = ((double *) &rnd)[0];
+    return (int64_t) ret;
+#endif
 }
 
-// Multiply the low signed 32-bit integers from each packed 64-bit element in
-// a and b, and store the signed 64-bit results in dst.
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
 //
-//   r0 :=  (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
-//   r1 :=  (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
-FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
+//   dst[63:0] := Convert_FP64_To_Int64(a[63:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x
+#define _mm_cvtsd_si64x _mm_cvtsd_si64
+
+// Convert the lower double-precision (64-bit) floating-point element in b to a
+// single-precision (32-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper 3 packed elements from a to the
+// upper elements of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss
+FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
 {
-    // vmull_s32 upcasts instead of masking, so we downcast.
-    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
-    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
-    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsetq_lane_f32(
+        vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
+        vreinterpretq_f32_m128(a), 0));
+#else
+    return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
+                                                 vreinterpretq_f32_m128(a), 0));
+#endif
 }
 
-// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
-// integers from b.
+// Copy the lower 32-bit integer in a to dst.
 //
-//   r0 := (a0 * b0) + (a1 * b1)
-//   r1 := (a2 * b2) + (a3 * b3)
-//   r2 := (a4 * b4) + (a5 * b5)
-//   r3 := (a6 * b6) + (a7 * b7)
-// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
+//   dst[31:0] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
+FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
 {
-    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
-                              vget_low_s16(vreinterpretq_s16_m128i(b)));
-    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
-                               vget_high_s16(vreinterpretq_s16_m128i(b)));
-
-    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
-    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
-
-    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
+    return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
 }
 
-// Multiply packed signed 16-bit integers in a and b, producing intermediate
-// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
-// the packed 16-bit integers in dst.
+// Copy the lower 64-bit integer in a to dst.
 //
-//   r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
-//   r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
-//   r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
-//   ...
-//   r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
-FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
+//   dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
+FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
 {
-    // Has issues due to saturation
-    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
-
-    // Multiply
-    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
-                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
-    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
-                                 vget_high_s16(vreinterpretq_s16_m128i(b)));
-
-    // Rounding narrowing shift right
-    // narrow = (int16_t)((mul + 16384) >> 15);
-    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
-    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
-
-    // Join together
-    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
+    return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
 }
 
-// Vertically multiply each unsigned 8-bit integer from a with the corresponding
-// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
-// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
-// and pack the saturated results in dst.
-//
-//   FOR j := 0 to 7
-//      i := j*16
-//      dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
-//      a[i+7:i]*b[i+7:i] )
-//   ENDFOR
-FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
+// Copy the lower 64-bit integer in a to dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
+#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
+
+// Convert the signed 32-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd
+FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
 {
 #if defined(__aarch64__)
-    uint8x16_t a = vreinterpretq_u8_m128i(_a);
-    int8x16_t b = vreinterpretq_s8_m128i(_b);
-    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
-                             vmovl_s8(vget_low_s8(b)));
-    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
-                             vmovl_s8(vget_high_s8(b)));
-    return vreinterpretq_m128i_s16(
-        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
 #else
-    // This would be much simpler if x86 would choose to zero extend OR sign
-    // extend, not both. This could probably be optimized better.
-    uint16x8_t a = vreinterpretq_u16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-
-    // Zero extend a
-    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
-    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
-
-    // Sign extend by shifting left then shifting right.
-    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
-    int16x8_t b_odd = vshrq_n_s16(b, 8);
-
-    // multiply
-    int16x8_t prod1 = vmulq_s16(a_even, b_even);
-    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
-
-    // saturated add
-    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
+    double bf = (double) b;
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
 #endif
 }
 
-// Computes the fused multiple add product of 32-bit floating point numbers.
+// Copy the lower 64-bit integer in a to dst.
 //
-// Return Value
-// Multiplies A and B, and adds C to the temporary result before returning it.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd
-FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c)
+//   dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
+#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
+
+// Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
+// zero extending the upper bits.
+//
+//   r0 := a
+//   r1 := 0x0
+//   r2 := 0x0
+//   r3 := 0x0
+//
+// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
+{
+    return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
+}
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd
+FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
 {
 #if defined(__aarch64__)
-    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c),
-                                            vreinterpretq_f32_m128(b),
-                                            vreinterpretq_f32_m128(a)));
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
 #else
-    return _mm_add_ps(_mm_mul_ps(a, b), c);
+    double bf = (double) b;
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
 #endif
 }
 
-// Alternatively add and subtract packed single-precision (32-bit)
-// floating-point elements in a to/from packed elements in b, and store the
-// results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
-FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
+// Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
+// zero extending the upper bits.
+//
+//   r0 := a
+//   r1 := 0x0
+FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
 {
-    __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
-    return _mm_fmadd_ps(b, mask, a);
+    return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
 }
 
-// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
-// elements in a and b, and pack the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd
-FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
+// Copy 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128
+#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd
+#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
+
+// Convert the lower single-precision (32-bit) floating-point element in b to a
+// double-precision (64-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+//
+//   dst[63:0] := Convert_FP32_To_FP64(b[31:0])
+//   dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
+FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
 {
+    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
 #if defined(__aarch64__)
     return vreinterpretq_m128d_f64(
-        vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[] = {da[0] + da[1], db[0] + db[1]};
-    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
 #endif
 }
 
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce two
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of 64-bit elements in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
-FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32
+FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
 {
-    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
-    uint16_t r0 = t[0] + t[1] + t[2] + t[3];
-    uint16_t r4 = t[4] + t[5] + t[6] + t[7];
-    uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
-    return (__m128i) vsetq_lane_u16(r4, r, 4);
+    double a0 = ((double *) &a)[0];
+    double a1 = ((double *) &a)[1];
+    return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
 }
 
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce four
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
-FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32
+FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
 {
-    uint16x4_t t =
-        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-    uint16_t r0 = t[0] + t[1] + t[2] + t[3];
-    return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0));
+    double a0 = ((double *) &a)[0];
+    double a1 = ((double *) &a)[1];
+    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
+    return vreinterpret_m64_s32(vld1_s32(data));
 }
 
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce four
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of dst.
+// Converts the four single-precision, floating-point values of a to signed
+// 32-bit integer values using truncate.
+// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
+{
+    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
 //
-//   FOR j := 0 to 7
-//      i := j*8
-//      tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
-//   ENDFOR
-//   dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] +
-//   tmp[47:40] + tmp[55:48] + tmp[63:56] dst[63:16] := 0
+//   dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw
-#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32
+FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
+{
+    double ret = *((double *) &a);
+    return (int32_t) ret;
+}
 
-// Divides the four single-precision, floating-point values of a and b.
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
 //
-//   r0 := a0 / b0
-//   r1 := a1 / b1
-//   r2 := a2 / b2
-//   r3 := a3 / b3
+//   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
 //
-// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
+FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
 {
-#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
-    return vreinterpretq_m128_f32(
-        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#if defined(__aarch64__)
+    return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
-    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
-#if SSE2NEON_PRECISE_DIV
-    // Additional Netwon-Raphson iteration for accuracy
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
-#endif
-    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
+    double ret = *((double *) &a);
+    return (int64_t) ret;
 #endif
 }
 
-// Divides the scalar single-precision floating point value of a by b.
-// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
-{
-    float32_t value =
-        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+//
+//   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
+#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
 
 // Divide packed double-precision (64-bit) floating-point elements in a by
 // packed elements in b, and store the results in dst.
@@ -3875,266 +4340,230 @@ FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
 #endif
 }
 
-// Compute the approximate reciprocal of packed single-precision (32-bit)
-// floating-point elements in a, and store the results in dst. The maximum
-// relative error for this approximation is less than 1.5*2^-12.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
-FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
+// Extracts the selected signed or unsigned 16-bit integer from a and zero
+// extends.
+// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
+// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
+#define _mm_extract_epi16(a, imm) \
+    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
+
+// Inserts the least significant 16 bits of b into the selected 16-bit integer
+// of a.
+// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
+// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
+//                                       __constrange(0,8) int imm)
+#define _mm_insert_epi16(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s16(                                     \
+            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
+    })
+
+// Loads two double-precision from 16-byte aligned memory, floating-point
+// values.
+//
+//   dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
+FORCE_INLINE __m128d _mm_load_pd(const double *p)
 {
-    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
-#if SSE2NEON_PRECISE_DIV
-    // Additional Netwon-Raphson iteration for accuracy
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_f64(p));
+#else
+    const float *fp = (const float *) p;
+    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
+    return vreinterpretq_m128d_f32(vld1q_f32(data));
 #endif
-    return vreinterpretq_m128_f32(recip);
 }
 
-// Compute the approximate reciprocal of the lower single-precision (32-bit)
-// floating-point element in a, store the result in the lower element of dst,
-// and copy the upper 3 packed elements from a to the upper elements of dst. The
-// maximum relative error for this approximation is less than 1.5*2^-12.
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
 //
-//   dst[31:0] := (1.0 / a[31:0])
-//   dst[127:32] := a[127:32]
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[127:64] := MEM[mem_addr+63:mem_addr]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
-FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
-{
-    return _mm_move_ss(a, _mm_rcp_ps(a));
-}
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
+#define _mm_load_pd1 _mm_load1_pd
 
-// Computes the approximations of square roots of the four single-precision,
-// floating-point values of a. First computes reciprocal square roots and then
-// reciprocals of the four values.
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower of dst, and zero the upper element. mem_addr does not need to be
+// aligned on any particular boundary.
 //
-//   r0 := sqrt(a0)
-//   r1 := sqrt(a1)
-//   r2 := sqrt(a2)
-//   r3 := sqrt(a3)
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[127:64] := 0
 //
-// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
+FORCE_INLINE __m128d _mm_load_sd(const double *p)
 {
-#if SSE2NEON_PRECISE_SQRT
-    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-
-    // Test for vrsqrteq_f32(0) -> positive infinity case.
-    // Change to zero, so that s * 1/sqrt(s) result is zero too.
-    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
-    const uint32x4_t div_by_zero =
-        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
-    recip = vreinterpretq_f32_u32(
-        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
-
-    // Additional Netwon-Raphson iteration for accuracy
-    recip = vmulq_f32(
-        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
-        recip);
-    recip = vmulq_f32(
-        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
-        recip);
-
-    // sqrt(s) = s * 1/sqrt(s)
-    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
-#elif defined(__aarch64__)
-    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
 #else
-    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-    float32x4_t sq = vrecpeq_f32(recipsq);
-    return vreinterpretq_m128_f32(sq);
+    const float *fp = (const float *) p;
+    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
+    return vreinterpretq_m128d_f32(vld1q_f32(data));
 #endif
 }
 
-// Computes the approximation of the square root of the scalar single-precision
-// floating point value of in.
-// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
+// Loads 128-bit value. :
+// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
+FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
 {
-    float32_t value =
-        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
 }
 
-// Computes the approximations of the reciprocal square roots of the four
-// single-precision floating point values of in.
-// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+//
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
+FORCE_INLINE __m128d _mm_load1_pd(const double *p)
 {
-    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-#if SSE2NEON_PRECISE_RSQRT
-    // Additional Netwon-Raphson iteration for accuracy
-    out = vmulq_f32(
-        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
-    out = vmulq_f32(
-        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
+#else
+    return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
 #endif
-    return vreinterpretq_m128_f32(out);
 }
 
-// Compute the approximate reciprocal square root of the lower single-precision
-// (32-bit) floating-point element in a, store the result in the lower element
-// of dst, and copy the upper 3 packed elements from a to the upper elements of
-// dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
-FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
+// Load a double-precision (64-bit) floating-point element from memory into the
+// upper element of dst, and copy the lower element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+//
+//   dst[63:0] := a[63:0]
+//   dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
+FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
 {
-    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
+#else
+    return vreinterpretq_m128d_f32(vcombine_f32(
+        vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
+#endif
 }
 
-// Compare packed signed 16-bit integers in a and b, and store packed maximum
-// values in dst.
-//
-//   FOR j := 0 to 3
-//      i := j*16
-//      dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
-FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
+// Load 64-bit integer from memory into the first element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
+FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
 {
-    return vreinterpret_m64_s16(
-        vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+    /* Load the lower 64 bits of the value pointed to by p into the
+     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
+     */
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
 }
 
-// Compare packed signed 16-bit integers in a and b, and store packed maximum
-// values in dst.
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower element of dst, and copy the upper element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
 //
-//   FOR j := 0 to 3
-//      i := j*16
-//      dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
-//   ENDFOR
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[127:64] := a[127:64]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
-#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
-
-// Computes the maximums of the four single-precision, floating-point values of
-// a and b.
-// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
+FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
 {
-#if SSE2NEON_PRECISE_MINMAX
-    float32x4_t _a = vreinterpretq_f32_m128(a);
-    float32x4_t _b = vreinterpretq_f32_m128(b);
-    return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
 #else
-    return vreinterpretq_m128_f32(
-        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128d_f32(
+        vcombine_f32(vld1_f32((const float *) p),
+                     vget_high_f32(vreinterpretq_f32_m128d(a))));
 #endif
 }
 
-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
+// Load 2 double-precision (64-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
 //
-//   FOR j := 0 to 7
-//      i := j*8
-//      dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
-//   ENDFOR
+//   dst[63:0] := MEM[mem_addr+127:mem_addr+64]
+//   dst[127:64] := MEM[mem_addr+63:mem_addr]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
-FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
+FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
 {
-    return vreinterpret_m64_u8(
-        vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+#if defined(__aarch64__)
+    float64x2_t v = vld1q_f64(p);
+    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
+#else
+    int64x2_t v = vld1q_s64((const int64_t *) p);
+    return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
+#endif
 }
 
-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
-//
-//   FOR j := 0 to 7
-//      i := j*8
-//      dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
-#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
-
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-//
-//   FOR j := 0 to 3
-//      i := j*16
-//      dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
-FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
+// Loads two double-precision from unaligned memory, floating-point values.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
+FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
 {
-    return vreinterpret_m64_s16(
-        vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+    return _mm_load_pd(p);
 }
 
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-//
-//   FOR j := 0 to 3
-//      i := j*16
-//      dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
-#define _m_pminsw(a, b) _mm_min_pi16(a, b)
-
-// Computes the minima of the four single-precision, floating-point values of a
-// and b.
-// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
+// Loads 128-bit value. :
+// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
 {
-#if SSE2NEON_PRECISE_MINMAX
-    float32x4_t _a = vreinterpretq_f32_m128(a);
-    float32x4_t _b = vreinterpretq_f32_m128(b);
-    return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
-#else
-    return vreinterpretq_m128_f32(
-        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#endif
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
 }
 
-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
+// Load unaligned 32-bit integer from memory into the first element of dst.
 //
-//   FOR j := 0 to 7
-//      i := j*8
-//      dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
-//   ENDFOR
+//   dst[31:0] := MEM[mem_addr+31:mem_addr]
+//   dst[MAX:32] := 0
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
-FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
+FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
 {
-    return vreinterpret_m64_u8(
-        vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+    return vreinterpretq_m128i_s32(
+        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
 }
 
-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
-//
-//   FOR j := 0 to 7
-//      i := j*8
-//      dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
-//   ENDFOR
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
+// integers from b.
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
-#define _m_pminub(a, b) _mm_min_pu8(a, b)
+//   r0 := (a0 * b0) + (a1 * b1)
+//   r1 := (a2 * b2) + (a3 * b3)
+//   r2 := (a4 * b4) + (a5 * b5)
+//   r3 := (a6 * b6) + (a7 * b7)
+// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
+{
+    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                              vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                               vget_high_s16(vreinterpretq_s16_m128i(b)));
 
-// Computes the maximum of the two lower scalar single-precision floating point
-// values of a and b.
-// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
+    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
+    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
+
+    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
+}
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint. mem_addr does not need to be aligned
+// on any particular boundary.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128
+FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
 {
-    float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+    int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
+    __m128 b = _mm_load_ps((const float *) mem_addr);
+    int8x16_t masked =
+        vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
+                 vreinterpretq_s8_m128(b));
+    vst1q_s8((int8_t *) mem_addr, masked);
 }
 
-// Computes the minimum of the two lower scalar single-precision floating point
-// values of a and b.
-// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
+// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
+// signed 16-bit integers from b.
+// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
 {
-    float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+    return vreinterpretq_m128i_s16(
+        vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 }
 
 // Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
@@ -4146,13 +4575,41 @@ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
         vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
 }
 
-// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
-// 16 unsigned 8-bit integers from b.
-// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
-FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
+// Compare packed double-precision (64-bit) floating-point elements in a and b,
+// and store packed maximum values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd
+FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128i_u8(
-        vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
+    d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b, store the maximum value in the lower element of dst, and copy the upper
+// element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd
+FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_max_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2] = {fmax(da[0], db[0]), da[1]};
+    return vld1q_f32((float32_t *) c);
+#endif
 }
 
 // Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
@@ -4164,110 +4621,246 @@ FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
         vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 }
 
-// Compare packed signed 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
-FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
+// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
+// 16 unsigned 8-bit integers from b.
+// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
+FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
 {
-    return vreinterpretq_m128i_s8(
-        vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+    return vreinterpretq_m128i_u8(
+        vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
 }
 
-// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
-FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
+// Compare packed double-precision (64-bit) floating-point elements in a and b,
+// and store packed minimum values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd
+FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128i_u16(
-        vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
+    d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
 }
 
-// Compare packed signed 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
-FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b, store the minimum value in the lower element of dst, and copy the upper
+// element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd
+FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128i_s8(
-        vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_min_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2] = {fmin(da[0], db[0]), da[1]};
+    return vld1q_f32((float32_t *) c);
+#endif
 }
 
-// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
-FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
+// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
+// upper element.
+//
+//   dst[63:0] := a[63:0]
+//   dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
+FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
 {
-    return vreinterpretq_m128i_u16(
-        vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+    return vreinterpretq_m128i_s64(
+        vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
 }
 
-// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
-// signed 16-bit integers from b.
-// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
+// Move the lower double-precision (64-bit) floating-point element from b to the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+//
+//   dst[63:0] := b[63:0]
+//   dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
+FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128i_s16(
-        vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+    return vreinterpretq_m128d_f32(
+        vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
+                     vget_high_f32(vreinterpretq_f32_m128d(a))));
 }
 
-// epi versions of min/max
-// Computes the pariwise maximums of the four signed 32-bit integer values of a
-// and b.
+// NEON does not provide a version of this function.
+// Creates a 16-bit mask from the most significant bits of the 16 signed or
+// unsigned 8-bit integers in a and zero extends the upper bits.
+// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
+FORCE_INLINE int _mm_movemask_epi8(__m128i a)
+{
+    // Use increasingly wide shifts+adds to collect the sign bits
+    // together.
+    // Since the widening shifts would be rather confusing to follow in little
+    // endian, everything will be illustrated in big endian order instead. This
+    // has a different result - the bits would actually be reversed on a big
+    // endian machine.
+
+    // Starting input (only half the elements are shown):
+    // 89 ff 1d c0 00 10 99 33
+    uint8x16_t input = vreinterpretq_u8_m128i(a);
+
+    // Shift out everything but the sign bits with an unsigned shift right.
+    //
+    // Bytes of the vector::
+    // 89 ff 1d c0 00 10 99 33
+    // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
+    //  |  |  |  |  |  |  |  |
+    // 01 01 00 01 00 00 01 00
+    //
+    // Bits of first important lane(s):
+    // 10001001 (89)
+    // \______
+    //        |
+    // 00000001 (01)
+    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
+
+    // Merge the even lanes together with a 16-bit unsigned shift right + add.
+    // 'xx' represents garbage data which will be ignored in the final result.
+    // In the important bytes, the add functions like a binary OR.
+    //
+    // 01 01 00 01 00 00 01 00
+    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
+    //    \|    \|    \|    \|
+    // xx 03 xx 01 xx 00 xx 02
+    //
+    // 00000001 00000001 (01 01)
+    //        \_______ |
+    //                \|
+    // xxxxxxxx xxxxxx11 (xx 03)
+    uint32x4_t paired16 =
+        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+
+    // Repeat with a wider 32-bit shift + add.
+    // xx 03 xx 01 xx 00 xx 02
+    //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
+    //     14))
+    //          \|          \|
+    // xx xx xx 0d xx xx xx 02
+    //
+    // 00000011 00000001 (03 01)
+    //        \\_____ ||
+    //         '----.\||
+    // xxxxxxxx xxxx1101 (xx 0d)
+    uint64x2_t paired32 =
+        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+
+    // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
+    // lanes. xx xx xx 0d xx xx xx 02
+    //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
+    //            28))
+    //                      \|
+    // xx xx xx xx xx xx xx d2
+    //
+    // 00001101 00000010 (0d 02)
+    //     \   \___ |  |
+    //      '---.  \|  |
+    // xxxxxxxx 11010010 (xx d2)
+    uint8x16_t paired64 =
+        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+
+    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
+    // xx xx xx xx xx xx xx d2
+    //                      ||  return paired64[0]
+    //                      d2
+    // Note: Little endian would return the correct value 4b (01001011) instead.
+    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
+}
+
+// Set each bit of mask dst based on the most significant bit of the
+// corresponding packed double-precision (64-bit) floating-point element in a.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd
+FORCE_INLINE int _mm_movemask_pd(__m128d a)
+{
+    uint64x2_t input = vreinterpretq_u64_m128d(a);
+    uint64x2_t high_bits = vshrq_n_u64(input, 63);
+    return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
+}
+
+// Copy the lower 64-bit integer in a to dst.
 //
-// A 128-bit parameter that can be defined with the following equations:
-//   r0 := (a0 > b0) ? a0 : b0
-//   r1 := (a1 > b1) ? a1 : b1
-//   r2 := (a2 > b2) ? a2 : b2
-//   r3 := (a3 > b3) ? a3 : b3
+//   dst[63:0] := a[63:0]
 //
-// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
+FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
 {
-    return vreinterpretq_m128i_s32(
-        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+    return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
 }
 
-// Computes the pariwise minima of the four signed 32-bit integer values of a
-// and b.
+// Copy the 64-bit integer a to the lower element of dst, and zero the upper
+// element.
 //
-// A 128-bit parameter that can be defined with the following equations:
-//   r0 := (a0 < b0) ? a0 : b0
-//   r1 := (a1 < b1) ? a1 : b1
-//   r2 := (a2 < b2) ? a2 : b2
-//   r3 := (a3 < b3) ? a3 : b3
+//   dst[63:0] := a[63:0]
+//   dst[127:64] := 0
 //
-// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
+FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
 {
-    return vreinterpretq_m128i_s32(
-        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
 }
 
-// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
-FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
+// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
+// a and b, and store the unsigned 64-bit results in dst.
+//
+//   r0 :=  (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
+//   r1 :=  (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
+FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
 {
-    return vreinterpretq_m128i_u32(
-        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+    // vmull_u32 upcasts instead of masking, so we downcast.
+    uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
+    uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
+    return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
 }
 
-// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
-FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
+// Multiply packed double-precision (64-bit) floating-point elements in a and b,
+// and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
+FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128i_u32(
-        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] * db[0];
+    c[1] = da[1] * db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
 }
 
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
-FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
+// Multiply the lower double-precision (64-bit) floating-point element in a and
+// b, store the result in the lower element of dst, and copy the upper element
+// from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
+FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
 {
-    return vreinterpret_m64_u16(vshrn_n_u32(
-        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
+    return _mm_move_sd(a, _mm_mul_pd(a, b));
+}
+
+// Multiply the low unsigned 32-bit integers from a and b, and store the
+// unsigned 64-bit result in dst.
+//
+//   dst[63:0] := a[31:0] * b[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
+FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u64(vget_low_u64(
+        vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
 }
 
 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
@@ -4321,1341 +4914,2413 @@ FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
 #endif
 }
 
-// Computes pairwise add of each argument as single-precision, floating-point
-// values a and b.
-// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
-FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
+// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
+// unsigned 16-bit integers from b.
+//
+//   r0 := (a0 * b0)[15:0]
+//   r1 := (a1 * b1)[15:0]
+//   ...
+//   r7 := (a7 * b7)[15:0]
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
 {
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
-#endif
+    return vreinterpretq_m128i_s16(
+        vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 }
 
-// Computes pairwise add of each argument as a 16-bit signed or unsigned integer
-// values a and b.
-FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
+// Compute the bitwise OR of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
+FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
 {
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
-#else
-    return vreinterpretq_m128i_s16(
-        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
-                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
-#endif
+    return vreinterpretq_m128d_s64(
+        vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
 }
 
-// Horizontally substract adjacent pairs of single-precision (32-bit)
-// floating-point elements in a and b, and pack the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
-FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
+// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
+//
+//   r := a | b
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
 {
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(vsubq_f32(
-        vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
-        vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
-#else
-    float32x4x2_t c =
-        vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
-    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
-#endif
+    return vreinterpretq_m128i_s32(
+        vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
-// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
-// signed 16-bit results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
-FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
+// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
+// saturates.
+// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
 {
-    return vreinterpret_m64_s16(
-        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
+                    vqmovn_s16(vreinterpretq_s16_m128i(b))));
 }
 
-// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
-// signed 32-bit results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
-FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
+// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
+// and saturates.
+//
+//   r0 := SignedSaturate(a0)
+//   r1 := SignedSaturate(a1)
+//   r2 := SignedSaturate(a2)
+//   r3 := SignedSaturate(a3)
+//   r4 := SignedSaturate(b0)
+//   r5 := SignedSaturate(b1)
+//   r6 := SignedSaturate(b2)
+//   r7 := SignedSaturate(b3)
+//
+// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
 {
-    return vreinterpret_m64_s32(
-        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
+                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
 }
 
-// Computes pairwise difference of each argument as a 16-bit signed or unsigned
-// integer values a and b.
-FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
+// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned
+// integers and saturates.
+//
+//   r0 := UnsignedSaturate(a0)
+//   r1 := UnsignedSaturate(a1)
+//   ...
+//   r7 := UnsignedSaturate(a7)
+//   r8 := UnsignedSaturate(b0)
+//   r9 := UnsignedSaturate(b1)
+//   ...
+//   r15 := UnsignedSaturate(b7)
+//
+// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
 {
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-    // Interleave using vshrn/vmovn
-    // [a0|a2|a4|a6|b0|b2|b4|b6]
-    // [a1|a3|a5|a7|b1|b3|b5|b7]
-    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
-    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
-    // Subtract
-    return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
+    return vreinterpretq_m128i_u8(
+        vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
+                    vqmovun_s16(vreinterpretq_s16_m128i(b))));
 }
 
-// Computes saturated pairwise sub of each argument as a 16-bit signed
-// integer values a and b.
-FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
+// Pause the processor. This is typically used in spin-wait loops and depending
+// on the x86 processor typical values are in the 40-100 cycle range. The
+// 'yield' instruction isn't a good fit beacuse it's effectively a nop on most
+// Arm cores. Experience with several databases has shown has shown an 'isb' is
+// a reasonable approximation.
+FORCE_INLINE void _mm_pause()
 {
-#if defined(__aarch64__)
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-    return vreinterpretq_s64_s16(
-        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
-#else
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-    // Interleave using vshrn/vmovn
-    // [a0|a2|a4|a6|b0|b2|b4|b6]
-    // [a1|a3|a5|a7|b1|b3|b5|b7]
-    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
-    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
-    // Saturated add
-    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
-#endif
+    __asm__ __volatile__("isb\n");
 }
 
-// Computes saturated pairwise difference of each argument as a 16-bit signed
-// integer values a and b.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
-FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce two
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of 64-bit elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
+FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
 {
-#if defined(__aarch64__)
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-    return vreinterpretq_s64_s16(
-        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
-#else
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-    // Interleave using vshrn/vmovn
-    // [a0|a2|a4|a6|b0|b2|b4|b6]
-    // [a1|a3|a5|a7|b1|b3|b5|b7]
-    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
-    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
-    // Saturated subtract
-    return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
-#endif
+    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
+    return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
 }
 
-// Computes pairwise add of each argument as a 32-bit signed or unsigned integer
-// values a and b.
-FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
+// Sets the 8 signed 16-bit integer values.
+// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_set_epi16(short i7,
+                                   short i6,
+                                   short i5,
+                                   short i4,
+                                   short i3,
+                                   short i2,
+                                   short i1,
+                                   short i0)
 {
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-    return vreinterpretq_m128i_s32(
-        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
-                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
+    int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+    return vreinterpretq_m128i_s16(vld1q_s16(data));
 }
 
-// Computes pairwise difference of each argument as a 32-bit signed or unsigned
-// integer values a and b.
-FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
+// Sets the 4 signed 32-bit integer values.
+// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
 {
-    int64x2_t a = vreinterpretq_s64_m128i(_a);
-    int64x2_t b = vreinterpretq_s64_m128i(_b);
-    // Interleave using vshrn/vmovn
-    // [a0|a2|b0|b2]
-    // [a1|a2|b1|b3]
-    int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
-    int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
-    // Subtract
-    return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
+    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
 }
 
-// Kahan summation for accurate summation of floating-point numbers.
-// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
-FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
+// Returns the __m128i structure with its two 64-bit integer values
+// initialized to the values of the two 64-bit integers passed in.
+// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
 {
-    y -= *c;
-    float t = *sum + y;
-    *c = (t - *sum) - y;
-    *sum = t;
+    return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
 }
 
-// Conditionally multiply the packed single-precision (32-bit) floating-point
-// elements in a and b using the high 4 bits in imm8, sum the four products,
-// and conditionally store the sum in dst using the low 4 bits of imm.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
-FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
+// Returns the __m128i structure with its two 64-bit integer values
+// initialized to the values of the two 64-bit integers passed in.
+// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
 {
-#if defined(__aarch64__)
-    /* shortcuts */
-    if (imm == 0xFF) {
-        return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
-    }
-    if (imm == 0x7F) {
-        float32x4_t m = _mm_mul_ps(a, b);
-        m[3] = 0;
-        return _mm_set1_ps(vaddvq_f32(m));
-    }
-#endif
-
-    float s = 0, c = 0;
-    float32x4_t f32a = vreinterpretq_f32_m128(a);
-    float32x4_t f32b = vreinterpretq_f32_m128(b);
-
-    /* To improve the accuracy of floating-point summation, Kahan algorithm
-     * is used for each operation.
-     */
-    if (imm & (1 << 4))
-        _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
-    if (imm & (1 << 5))
-        _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
-    if (imm & (1 << 6))
-        _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
-    if (imm & (1 << 7))
-        _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
-    s += c;
-
-    float32x4_t res = {
-        (imm & 0x1) ? s : 0,
-        (imm & 0x2) ? s : 0,
-        (imm & 0x4) ? s : 0,
-        (imm & 0x8) ? s : 0,
-    };
-    return vreinterpretq_m128_f32(res);
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
 }
 
-/* Compare operations */
+// Sets the 16 signed 8-bit integer values.
+// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
+                                  signed char b14,
+                                  signed char b13,
+                                  signed char b12,
+                                  signed char b11,
+                                  signed char b10,
+                                  signed char b9,
+                                  signed char b8,
+                                  signed char b7,
+                                  signed char b6,
+                                  signed char b5,
+                                  signed char b4,
+                                  signed char b3,
+                                  signed char b2,
+                                  signed char b1,
+                                  signed char b0)
+{
+    int8_t ALIGN_STRUCT(16)
+        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return (__m128i) vld1q_s8(data);
+}
 
-// Compares for less than
-// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
+FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
 {
-    return vreinterpretq_m128_u32(
-        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    double ALIGN_STRUCT(16) data[2] = {e0, e1};
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
+#else
+    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
+#endif
 }
 
-// Compares for less than
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
-FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
+// Broadcast double-precision (64-bit) floating-point value a to all elements of
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
+#define _mm_set_pd1 _mm_set1_pd
+
+// Copy double-precision (64-bit) floating-point element a to the lower element
+// of dst, and zero the upper element.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
+FORCE_INLINE __m128d _mm_set_sd(double a)
 {
-    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
+    return _mm_set_pd(0, a);
 }
 
-// Compares for greater than.
+// Sets the 8 signed 16-bit integer values to w.
 //
-//   r0 := (a0 > b0) ? 0xffffffff : 0x0
-//   r1 := (a1 > b1) ? 0xffffffff : 0x0
-//   r2 := (a2 > b2) ? 0xffffffff : 0x0
-//   r3 := (a3 > b3) ? 0xffffffff : 0x0
+//   r0 := w
+//   r1 := w
+//   ...
+//   r7 := w
 //
-// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
+// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_set1_epi16(short w)
 {
-    return vreinterpretq_m128_u32(
-        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128i_s16(vdupq_n_s16(w));
 }
 
-// Compares for greater than.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
+// Sets the 4 signed 32-bit integer values to i.
+//
+//   r0 := i
+//   r1 := i
+//   r2 := i
+//   r3 := I
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set1_epi32(int _i)
 {
-    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
+    return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
 }
 
-// Compares for greater than or equal.
-// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
+// Sets the 2 signed 64-bit integer values to i.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
+FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
 {
-    return vreinterpretq_m128_u32(
-        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
 }
 
-// Compares for greater than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
+// Sets the 2 signed 64-bit integer values to i.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
+FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
 {
-    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
+    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
 }
 
-// Compares for less than or equal.
+// Sets the 16 signed 8-bit integer values to b.
 //
-//   r0 := (a0 <= b0) ? 0xffffffff : 0x0
-//   r1 := (a1 <= b1) ? 0xffffffff : 0x0
-//   r2 := (a2 <= b2) ? 0xffffffff : 0x0
-//   r3 := (a3 <= b3) ? 0xffffffff : 0x0
+//   r0 := b
+//   r1 := b
+//   ...
+//   r15 := b
 //
-// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
+// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
 {
-    return vreinterpretq_m128_u32(
-        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128i_s8(vdupq_n_s8(w));
 }
 
-// Compares for less than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
-FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
+// Broadcast double-precision (64-bit) floating-point value a to all elements of
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
+FORCE_INLINE __m128d _mm_set1_pd(double d)
 {
-    return _mm_move_ss(a, _mm_cmple_ps(a, b));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vdupq_n_f64(d));
+#else
+    return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
+#endif
 }
 
-// Compares for equality.
-// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
+// Sets the 8 signed 16-bit integer values in reverse order.
+//
+// Return Value
+//   r0 := w0
+//   r1 := w1
+//   ...
+//   r7 := w7
+FORCE_INLINE __m128i _mm_setr_epi16(short w0,
+                                    short w1,
+                                    short w2,
+                                    short w3,
+                                    short w4,
+                                    short w5,
+                                    short w6,
+                                    short w7)
 {
-    return vreinterpretq_m128_u32(
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
+    return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
 }
 
-// Compares for equality.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
+// Sets the 4 signed 32-bit integer values in reverse order
+// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
 {
-    return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
+    int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
 }
 
-// Compares for inequality.
-// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
+// Set packed 64-bit integers in dst with the supplied values in reverse order.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
+FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
 {
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+    return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
 }
 
-// Compares for inequality.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
+// Sets the 16 signed 8-bit integer values in reverse order.
+// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
+                                   signed char b1,
+                                   signed char b2,
+                                   signed char b3,
+                                   signed char b4,
+                                   signed char b5,
+                                   signed char b6,
+                                   signed char b7,
+                                   signed char b8,
+                                   signed char b9,
+                                   signed char b10,
+                                   signed char b11,
+                                   signed char b12,
+                                   signed char b13,
+                                   signed char b14,
+                                   signed char b15)
 {
-    return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
+    int8_t ALIGN_STRUCT(16)
+        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return (__m128i) vld1q_s8(data);
 }
 
-// Compares for not greater than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values in reverse order.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
+FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
 {
-    return _mm_cmplt_ps(a, b);
+    return _mm_set_pd(e0, e1);
 }
 
-// Compares for not greater than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
+// Return vector of type __m128d with all elements set to zero.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
+FORCE_INLINE __m128d _mm_setzero_pd(void)
 {
-    return _mm_cmplt_ss(a, b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vdupq_n_f64(0));
+#else
+    return vreinterpretq_m128d_f32(vdupq_n_f32(0));
+#endif
 }
 
-// Compares for not greater than.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
+// Sets the 128-bit value to zero
+// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_setzero_si128(void)
 {
-    return _mm_cmple_ps(a, b);
+    return vreinterpretq_m128i_s32(vdupq_n_s32(0));
 }
 
-// Compares for not greater than.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
+// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
+// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
+// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
+//                                        __constrange(0,255) int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_epi32(a, imm)                              \
+    __extension__({                                            \
+        int32x4_t _input = vreinterpretq_s32_m128i(a);         \
+        int32x4_t _shuf = __builtin_shufflevector(             \
+            _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+            ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
+        vreinterpretq_m128i_s32(_shuf);                        \
+    })
+#else  // generic
+#define _mm_shuffle_epi32(a, imm)                        \
+    __extension__({                                      \
+        __m128i ret;                                     \
+        switch (imm) {                                   \
+        case _MM_SHUFFLE(1, 0, 3, 2):                    \
+            ret = _mm_shuffle_epi_1032((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 3, 0, 1):                    \
+            ret = _mm_shuffle_epi_2301((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 3, 2, 1):                    \
+            ret = _mm_shuffle_epi_0321((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 1, 0, 3):                    \
+            ret = _mm_shuffle_epi_2103((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(1, 0, 1, 0):                    \
+            ret = _mm_shuffle_epi_1010((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(1, 0, 0, 1):                    \
+            ret = _mm_shuffle_epi_1001((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 1, 0, 1):                    \
+            ret = _mm_shuffle_epi_0101((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 2, 1, 1):                    \
+            ret = _mm_shuffle_epi_2211((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 1, 2, 2):                    \
+            ret = _mm_shuffle_epi_0122((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(3, 3, 3, 2):                    \
+            ret = _mm_shuffle_epi_3332((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 0, 0, 0):                    \
+            ret = _mm_shuffle_epi32_splat((a), 0);       \
+            break;                                       \
+        case _MM_SHUFFLE(1, 1, 1, 1):                    \
+            ret = _mm_shuffle_epi32_splat((a), 1);       \
+            break;                                       \
+        case _MM_SHUFFLE(2, 2, 2, 2):                    \
+            ret = _mm_shuffle_epi32_splat((a), 2);       \
+            break;                                       \
+        case _MM_SHUFFLE(3, 3, 3, 3):                    \
+            ret = _mm_shuffle_epi32_splat((a), 3);       \
+            break;                                       \
+        default:                                         \
+            ret = _mm_shuffle_epi32_default((a), (imm)); \
+            break;                                       \
+        }                                                \
+        ret;                                             \
+    })
+#endif
+
+// Shuffle double-precision (64-bit) floating-point elements using the control
+// in imm8, and store the results in dst.
+//
+//   dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+//   dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_pd(a, b, imm8)                                          \
+    vreinterpretq_m128d_s64(__builtin_shufflevector(                        \
+        vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \
+        ((imm8 & 0x2) >> 1) + 2))
+#else
+#define _mm_shuffle_pd(a, b, imm8)                                     \
+    _mm_castsi128_pd(_mm_set_epi64x(                                   \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
+#endif
+
+// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
+//                                          __constrange(0,255) int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shufflehi_epi16(a, imm)                             \
+    __extension__({                                             \
+        int16x8_t _input = vreinterpretq_s16_m128i(a);          \
+        int16x8_t _shuf = __builtin_shufflevector(              \
+            _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
+            (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
+            (((imm) >> 6) & 0x3) + 4);                          \
+        vreinterpretq_m128i_s16(_shuf);                         \
+    })
+#else  // generic
+#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
+#endif
+
+// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
+//                                          __constrange(0,255) int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shufflelo_epi16(a, imm)                                  \
+    __extension__({                                                  \
+        int16x8_t _input = vreinterpretq_s16_m128i(a);               \
+        int16x8_t _shuf = __builtin_shufflevector(                   \
+            _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
+            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
+        vreinterpretq_m128i_s16(_shuf);                              \
+    })
+#else  // generic
+#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
+#endif
+
+// Shift packed 16-bit integers in a left by count while shifting in zeros, and
+// store the results in dst.
+//
+//   FOR j := 0 to 7
+//     i := j*16
+//     IF count[63:0] > 15
+//       dst[i+15:i] := 0
+//     ELSE
+//       dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi16
+FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
 {
-    return _mm_cmple_ss(a, b);
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~15))
+        return _mm_setzero_si128();
+
+    int16x8_t vc = vdupq_n_s16((int16_t) c);
+    return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
 }
 
-// Compares for not less than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
+// Shift packed 32-bit integers in a left by count while shifting in zeros, and
+// store the results in dst.
+//
+//   FOR j := 0 to 3
+//     i := j*32
+//     IF count[63:0] > 31
+//       dst[i+31:i] := 0
+//     ELSE
+//       dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi32
+FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
 {
-    return _mm_cmpgt_ps(a, b);
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~31))
+        return _mm_setzero_si128();
+
+    int32x4_t vc = vdupq_n_s32((int32_t) c);
+    return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
 }
 
-// Compares for not less than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
+// Shift packed 64-bit integers in a left by count while shifting in zeros, and
+// store the results in dst.
+//
+//   FOR j := 0 to 1
+//     i := j*64
+//     IF count[63:0] > 63
+//       dst[i+63:i] := 0
+//     ELSE
+//       dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi64
+FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
 {
-    return _mm_cmpgt_ss(a, b);
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~63))
+        return _mm_setzero_si128();
+
+    int64x2_t vc = vdupq_n_s64((int64_t) c);
+    return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
 }
 
-// Compares for not less than.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
+// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+//   FOR j := 0 to 7
+//     i := j*16
+//     IF imm8[7:0] > 15
+//       dst[i+15:i] := 0
+//     ELSE
+//       dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi16
+FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
 {
-    return _mm_cmpge_ps(a, b);
+    if (_sse2neon_unlikely(imm & ~15))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s16(
+        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
 }
 
-// Compares for not less than.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
+// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+//   FOR j := 0 to 3
+//     i := j*32
+//     IF imm8[7:0] > 31
+//       dst[i+31:i] := 0
+//     ELSE
+//       dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi32
+FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
 {
-    return _mm_cmpge_ss(a, b);
+    if (_sse2neon_unlikely(imm & ~31))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s32(
+        vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
 }
 
-// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
-// unsigned 8-bit integers in b for equality.
-// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
+// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+//   FOR j := 0 to 1
+//     i := j*64
+//     IF imm8[7:0] > 63
+//       dst[i+63:i] := 0
+//     ELSE
+//       dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi64
+FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
+{
+    if (_sse2neon_unlikely(imm & ~63))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s64(
+        vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
+}
+
+// Shift a left by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+//
+//   tmp := imm8[7:0]
+//   IF tmp > 15
+//     tmp := 16
+//   FI
+//   dst[127:0] := a[127:0] << (tmp*8)
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_si128
+FORCE_INLINE __m128i _mm_slli_si128(__m128i a, int imm)
 {
+    if (_sse2neon_unlikely(imm & ~15))
+        return _mm_setzero_si128();
+    uint8x16_t tmp[2] = {vdupq_n_u8(0), vreinterpretq_u8_m128i(a)};
     return vreinterpretq_m128i_u8(
-        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+        vld1q_u8(((uint8_t const *) tmp) + (16 - imm)));
 }
 
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for equality, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
-FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
+// Compute the square root of packed double-precision (64-bit) floating-point
+// elements in a, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd
+FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
 {
 #if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+    return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
 #else
-    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
-    uint32x4_t cmp =
-        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
-    uint32x4_t swapped = vrev64q_u32(cmp);
-    return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
+    double a0 = sqrt(((double *) &a)[0]);
+    double a1 = sqrt(((double *) &a)[1]);
+    return _mm_set_pd(a1, a0);
 #endif
 }
 
-// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
-// unsigned 16-bit integers in b for equality.
-// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
+// Compute the square root of the lower double-precision (64-bit) floating-point
+// element in b, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd
+FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128i_u16(
-        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_sqrt_pd(b));
+#else
+    return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
+#endif
 }
 
-// Compare packed 32-bit integers in a and b for equality, and store the results
-// in dst
-FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
+// Shift packed 16-bit integers in a right by count while shifting in sign bits,
+// and store the results in dst.
+//
+//   FOR j := 0 to 7
+//     i := j*16
+//     IF count[63:0] > 15
+//       dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+//     ELSE
+//       dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0])
+//     FI
+//  ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi16
+FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
 {
-    return vreinterpretq_m128i_u32(
-        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+    if (_sse2neon_unlikely(c & ~15))
+        return _mm_cmplt_epi16(a, _mm_setzero_si128());
+    return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
 }
 
-// Compare packed 64-bit integers in a and b for equality, and store the results
-// in dst
-FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
+// Shift packed 32-bit integers in a right by count while shifting in sign bits,
+// and store the results in dst.
+//
+//   FOR j := 0 to 3
+//     i := j*32
+//     IF count[63:0] > 31
+//       dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+//     ELSE
+//       dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0])
+//     FI
+//  ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi32
+FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
 {
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_u64(
-        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
-#else
-    // ARMv7 lacks vceqq_u64
-    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
-    uint32x4_t cmp =
-        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
-    uint32x4_t swapped = vrev64q_u32(cmp);
-    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
-#endif
+    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+    if (_sse2neon_unlikely(c & ~31))
+        return _mm_cmplt_epi32(a, _mm_setzero_si128());
+    return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
 }
 
-// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
-// in b for lesser than.
-// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
+// Shift packed 16-bit integers in a right by imm8 while shifting in sign
+// bits, and store the results in dst.
+//
+//   FOR j := 0 to 7
+//     i := j*16
+//     IF imm8[7:0] > 15
+//       dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+//     ELSE
+//       dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
+FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
 {
-    return vreinterpretq_m128i_u8(
-        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+    const int count = (imm & ~15) ? 15 : imm;
+    return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
 }
 
-// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
-// in b for greater than.
+// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
+// and store the results in dst.
 //
-//   r0 := (a0 > b0) ? 0xff : 0x0
-//   r1 := (a1 > b1) ? 0xff : 0x0
-//   ...
-//   r15 := (a15 > b15) ? 0xff : 0x0
+//   FOR j := 0 to 3
+//     i := j*32
+//     IF imm8[7:0] > 31
+//       dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+//     ELSE
+//       dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
+//     FI
+//   ENDFOR
 //
-// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
+// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srai_epi32(a, imm)                                             \
+    __extension__({                                                        \
+        __m128i ret;                                                       \
+        if (_sse2neon_unlikely((imm) == 0)) {                              \
+            ret = a;                                                       \
+        } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) {            \
+            ret = vreinterpretq_m128i_s32(                                 \
+                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \
+        } else {                                                           \
+            ret = vreinterpretq_m128i_s32(                                 \
+                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));              \
+        }                                                                  \
+        ret;                                                               \
+    })
+
+// Shift packed 16-bit integers in a right by count while shifting in zeros, and
+// store the results in dst.
+//
+//   FOR j := 0 to 7
+//     i := j*16
+//     IF count[63:0] > 15
+//       dst[i+15:i] := 0
+//     ELSE
+//       dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi16
+FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
 {
-    return vreinterpretq_m128i_u8(
-        vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~15))
+        return _mm_setzero_si128();
+
+    int16x8_t vc = vdupq_n_s16(-(int16_t) c);
+    return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
 }
 
-// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
-// in b for less than.
+// Shift packed 32-bit integers in a right by count while shifting in zeros, and
+// store the results in dst.
 //
-//   r0 := (a0 < b0) ? 0xffff : 0x0
-//   r1 := (a1 < b1) ? 0xffff : 0x0
-//   ...
-//   r7 := (a7 < b7) ? 0xffff : 0x0
+//   FOR j := 0 to 3
+//     i := j*32
+//     IF count[63:0] > 31
+//       dst[i+31:i] := 0
+//     ELSE
+//       dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0])
+//     FI
+//   ENDFOR
 //
-// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi32
+FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
 {
-    return vreinterpretq_m128i_u16(
-        vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~31))
+        return _mm_setzero_si128();
+
+    int32x4_t vc = vdupq_n_s32(-(int32_t) c);
+    return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
 }
 
-// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
-// in b for greater than.
+// Shift packed 64-bit integers in a right by count while shifting in zeros, and
+// store the results in dst.
 //
-//   r0 := (a0 > b0) ? 0xffff : 0x0
-//   r1 := (a1 > b1) ? 0xffff : 0x0
-//   ...
-//   r7 := (a7 > b7) ? 0xffff : 0x0
+//   FOR j := 0 to 1
+//     i := j*64
+//     IF count[63:0] > 63
+//       dst[i+63:i] := 0
+//     ELSE
+//       dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0])
+//     FI
+//   ENDFOR
 //
-// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi64
+FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
 {
-    return vreinterpretq_m128i_u16(
-        vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~63))
+        return _mm_setzero_si128();
+
+    int64x2_t vc = vdupq_n_s64(-(int64_t) c);
+    return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
 }
 
+// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+//   FOR j := 0 to 7
+//     i := j*16
+//     IF imm8[7:0] > 15
+//       dst[i+15:i] := 0
+//     ELSE
+//       dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
+#define _mm_srli_epi16(a, imm)                                             \
+    __extension__({                                                        \
+        __m128i ret;                                                       \
+        if (_sse2neon_unlikely(imm & ~15)) {                               \
+            ret = _mm_setzero_si128();                                     \
+        } else {                                                           \
+            ret = vreinterpretq_m128i_u16(                                 \
+                vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \
+        }                                                                  \
+        ret;                                                               \
+    })
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+//   FOR j := 0 to 3
+//     i := j*32
+//     IF imm8[7:0] > 31
+//       dst[i+31:i] := 0
+//     ELSE
+//       dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
+// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srli_epi32(a, imm)                                             \
+    __extension__({                                                        \
+        __m128i ret;                                                       \
+        if (_sse2neon_unlikely(imm & ~31)) {                               \
+            ret = _mm_setzero_si128();                                     \
+        } else {                                                           \
+            ret = vreinterpretq_m128i_u32(                                 \
+                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \
+        }                                                                  \
+        ret;                                                               \
+    })
 
-// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
-// in b for less than.
-// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
+// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+//   FOR j := 0 to 1
+//     i := j*64
+//     IF imm8[7:0] > 63
+//       dst[i+63:i] := 0
+//     ELSE
+//       dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
+#define _mm_srli_epi64(a, imm)                                             \
+    __extension__({                                                        \
+        __m128i ret;                                                       \
+        if (_sse2neon_unlikely(imm & ~63)) {                               \
+            ret = _mm_setzero_si128();                                     \
+        } else {                                                           \
+            ret = vreinterpretq_m128i_u64(                                 \
+                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \
+        }                                                                  \
+        ret;                                                               \
+    })
+
+// Shift a right by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+//
+//   tmp := imm8[7:0]
+//   IF tmp > 15
+//     tmp := 16
+//   FI
+//   dst[127:0] := a[127:0] >> (tmp*8)
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_si128
+FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm)
 {
-    return vreinterpretq_m128i_u32(
-        vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+    if (_sse2neon_unlikely(imm & ~15))
+        return _mm_setzero_si128();
+    uint8x16_t tmp[2] = {vreinterpretq_u8_m128i(a), vdupq_n_u8(0)};
+    return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + imm));
 }
 
-// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
-// in b for greater than.
-// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
+// or a general-protection exception may be generated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
+FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
 {
-    return vreinterpretq_m128i_u32(
-        vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#if defined(__aarch64__)
+    vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
+#else
+    vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
+#endif
 }
 
-// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
-// in b for greater than.
-FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
+// Store the lower double-precision (64-bit) floating-point element from a into
+// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
+FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
 {
 #if defined(__aarch64__)
-    return vreinterpretq_m128i_u64(
-        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+    float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
+    vst1q_f64((float64_t *) mem_addr,
+              vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
 #else
-    // ARMv7 lacks vcgtq_s64.
-    // This is based off of Clang's SSE2 polyfill:
-    // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))
-
-    // Mask the sign bit out since we need a signed AND an unsigned comparison
-    // and it is ugly to try and split them.
-    int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
-    int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
-    int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
-    // Check if a > b
-    int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
-    // Copy upper mask to lower mask
-    // a_hi > b_hi
-    int64x2_t gt_hi = vshrq_n_s64(greater, 63);
-    // Copy lower mask to upper mask
-    // a_lo > b_lo
-    int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
-    // Compare for equality
-    int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
-    // Copy upper mask to lower mask
-    // a_hi == b_hi
-    int64x2_t eq_hi = vshrq_n_s64(equal, 63);
-    // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
-    int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
-    return vreinterpretq_m128i_s64(ret);
+    float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
+    vst1q_f32((float32_t *) mem_addr,
+              vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
 #endif
 }
 
-// Compares the four 32-bit floats in a and b to check if any values are NaN.
-// Ordered compare between each value returns true for "orderable" and false for
-// "not orderable" (NaN).
-// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
-// also:
-// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
-// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
-FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
+// Store the lower double-precision (64-bit) floating-point element from a into
+// memory. mem_addr does not need to be aligned on any particular boundary.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd
+FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
 {
-    // Note: NEON does not have ordered compare builtin
-    // Need to compare a eq a and b eq b to check for NaN
-    // Do AND of results to get final
-    uint32x4_t ceqaa =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-    uint32x4_t ceqbb =
-        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
+#endif
 }
 
-// Compares for ordered.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
+// Stores four 32-bit integer values as (as a __m128i value) at the address p.
+// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
+FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
 {
-    return _mm_move_ss(a, _mm_cmpord_ps(a, b));
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
 }
 
-// Compares for unordered.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
+// Store the lower double-precision (64-bit) floating-point element from a into
+// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
+#define _mm_store1_pd _mm_store_pd1
+
+// Store the upper double-precision (64-bit) floating-point element from a into
+// memory.
+//
+//   MEM[mem_addr+63:mem_addr] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
+FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
 {
-    uint32x4_t f32a =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-    uint32x4_t f32b =
-        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
+#endif
 }
 
-// Compares for unordered.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
+// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
+// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
 {
-    return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
+    uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
+    uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
+    *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
 }
 
-// Compares the lower single-precision floating point scalar values of a and b
-// using a less than operation. :
-// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
-// note!! The documentation on MSDN is incorrect!  If either of the values is a
-// NAN the docs say you will get a one, but in fact, it will return a zero!!
-FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
+// Store the lower double-precision (64-bit) floating-point element from a into
+// memory.
+//
+//   MEM[mem_addr+63:mem_addr] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
+FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
 {
-    uint32x4_t a_not_nan =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-    uint32x4_t b_not_nan =
-        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
-    uint32x4_t a_lt_b =
-        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
+#endif
 }
 
-// Compares the lower single-precision floating point scalar values of a and b
-// using a greater than operation. :
-// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
-FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
+// Store 2 double-precision (64-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+//   MEM[mem_addr+63:mem_addr] := a[127:64]
+//   MEM[mem_addr+127:mem_addr+64] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
+FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
 {
-    // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
-    // vreinterpretq_f32_m128(b)), 0);
-    uint32x4_t a_not_nan =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-    uint32x4_t b_not_nan =
-        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
-    uint32x4_t a_gt_b =
-        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
+    float32x4_t f = vreinterpretq_f32_m128d(a);
+    _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
 }
 
-// Compares the lower single-precision floating point scalar values of a and b
-// using a less than or equal operation. :
-// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
-FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
+FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
 {
-    // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
-    // vreinterpretq_f32_m128(b)), 0);
-    uint32x4_t a_not_nan =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-    uint32x4_t b_not_nan =
-        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
-    uint32x4_t a_le_b =
-        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
+    _mm_store_pd(mem_addr, a);
 }
 
-// Compares the lower single-precision floating point scalar values of a and b
-// using a greater than or equal operation. :
-// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
-FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
+// Stores 128-bits of integer data a at the address p.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128
+FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
 {
-    // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
-    // vreinterpretq_f32_m128(b)), 0);
-    uint32x4_t a_not_nan =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-    uint32x4_t b_not_nan =
-        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
-    uint32x4_t a_ge_b =
-        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
 }
 
-// Compares the lower single-precision floating point scalar values of a and b
-// using an equality operation. :
-// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
-FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
+// Stores 32-bits of integer data a at the address p.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32
+FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
 {
-    // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
-    // vreinterpretq_f32_m128(b)), 0);
-    uint32x4_t a_not_nan =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-    uint32x4_t b_not_nan =
-        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
-    uint32x4_t a_eq_b =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
+    vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
 }
 
-// Compares the lower single-precision floating point scalar values of a and b
-// using an inequality operation. :
-// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
-FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory using a non-temporal memory hint. mem_addr must
+// be aligned on a 16-byte boundary or a general-protection exception may be
+// generated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd
+FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
 {
-    // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
-    // vreinterpretq_f32_m128(b)), 0);
-    uint32x4_t a_not_nan =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-    uint32x4_t b_not_nan =
-        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-    uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
-    uint32x4_t a_neq_b = vmvnq_u32(
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-    return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, (float32x4_t *) p);
+#elif defined(__aarch64__)
+    vst1q_f64(p, vreinterpretq_f64_m128d(a));
+#else
+    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
+#endif
 }
 
-// according to the documentation, these intrinsics behave the same as the
-// non-'u' versions.  We'll just alias them here.
-#define _mm_ucomieq_ss _mm_comieq_ss
-#define _mm_ucomige_ss _mm_comige_ss
-#define _mm_ucomigt_ss _mm_comigt_ss
-#define _mm_ucomile_ss _mm_comile_ss
-#define _mm_ucomilt_ss _mm_comilt_ss
-#define _mm_ucomineq_ss _mm_comineq_ss
+// Stores the data in a to the address p without polluting the caches.  If the
+// cache line containing address p is already in the cache, the cache will be
+// updated.
+// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, p);
+#else
+    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
+#endif
+}
 
-/* Conversions */
+// Store 32-bit integer a into memory using a non-temporal hint to minimize
+// cache pollution. If the cache line containing address mem_addr is already in
+// the cache, the cache will be updated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32
+FORCE_INLINE void _mm_stream_si32(int *p, int a)
+{
+    vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
+}
 
-// Convert packed signed 32-bit integers in b to packed single-precision
-// (32-bit) floating-point elements, store the results in the lower 2 elements
-// of dst, and copy the upper 2 packed elements from a to the upper elements of
-// dst.
+// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
+// store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
+FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
+// unsigned 32-bit integers of a.
 //
-//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
-//   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
-//   dst[95:64] := a[95:64]
-//   dst[127:96] := a[127:96]
+//   r0 := a0 - b0
+//   r1 := a1 - b1
+//   r2 := a2 - b2
+//   r3 := a3 - b3
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
-FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
+// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
 {
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
-                     vget_high_f32(vreinterpretq_f32_m128(a))));
+    return vreinterpretq_m128i_s32(
+        vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
-// Convert the signed 32-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
+// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
+// and store the results in dst.
+//    r0 := a0 - b0
+//    r1 := a1 - b1
+FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s64(
+        vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
+// store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
+FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtract packed double-precision (64-bit) floating-point elements in b from
+// packed double-precision (64-bit) floating-point elements in a, and store the
+// results in dst.
 //
-//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
-//   dst[127:32] := a[127:32]
+//   FOR j := 0 to 1
+//     i := j*64
+//     dst[i+63:i] := a[i+63:i] - b[i+63:i]
+//   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
-FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
+//  https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
+FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] - db[0];
+    c[1] = da[1] - db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
 }
 
-// Convert the signed 32-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
+// Subtract the lower double-precision (64-bit) floating-point element in b from
+// the lower double-precision (64-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
+FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_sub_pd(a, b));
+}
+
+// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
 //
-//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
-//   dst[127:32] := a[127:32]
+//   dst[63:0] := a[63:0] - b[63:0]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
-#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
+FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
 
-// Convert the signed 64-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
+// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
+// of a and saturates.
 //
-//   dst[31:0] := Convert_Int64_To_FP32(b[63:0])
-//   dst[127:32] := a[127:32]
+//   r0 := SignedSaturate(a0 - b0)
+//   r1 := SignedSaturate(a1 - b1)
+//   ...
+//   r7 := SignedSaturate(a7 - b7)
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
-FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
+// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
 {
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+    return vreinterpretq_m128i_s16(
+        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 }
 
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer, and store the result in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
-FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
+// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
+// of a and saturates.
+//
+//   r0 := SignedSaturate(a0 - b0)
+//   r1 := SignedSaturate(a1 - b1)
+//   ...
+//   r15 := SignedSaturate(a15 - b15)
+//
+// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
 {
-#if defined(__aarch64__)
-    return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
-#else
-    float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    float32_t diff = data - floor(data);
-    if (diff > 0.5)
-        return (int32_t) ceil(data);
-    if (unlikely(diff == 0.5)) {
-        int32_t f = (int32_t) floor(data);
-        int32_t c = (int32_t) ceil(data);
-        return c & 1 ? f : c;
-    }
-    return (int32_t) floor(data);
-#endif
+    return vreinterpretq_m128i_s8(
+        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 }
 
-// Convert packed 16-bit integers in a to packed single-precision (32-bit)
-// floating-point elements, and store the results in dst.
+// Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit
+// integers of a and saturates..
+// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
+// integers of a and saturates.
 //
-//   FOR j := 0 to 3
-//      i := j*16
-//      m := j*32
-//      dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
-//   ENDFOR
+//   r0 := UnsignedSaturate(a0 - b0)
+//   r1 := UnsignedSaturate(a1 - b1)
+//   ...
+//   r15 := UnsignedSaturate(a15 - b15)
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
-FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
+// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
 {
-    return vreinterpretq_m128_f32(
-        vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
+    return vreinterpretq_m128i_u8(
+        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
 }
 
-// Convert packed 32-bit integers in b to packed single-precision (32-bit)
-// floating-point elements, store the results in the lower 2 elements of dst,
-// and copy the upper 2 packed elements from a to the upper elements of dst.
+#define _mm_ucomieq_sd _mm_comieq_sd
+#define _mm_ucomige_sd _mm_comige_sd
+#define _mm_ucomigt_sd _mm_comigt_sd
+#define _mm_ucomile_sd _mm_comile_sd
+#define _mm_ucomilt_sd _mm_comilt_sd
+#define _mm_ucomineq_sd _mm_comineq_sd
+
+// Return vector of type __m128d with undefined elements.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd
+FORCE_INLINE __m128d _mm_undefined_pd(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+    __m128d a;
+    return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
+// upper 4 signed or unsigned 16-bit integers in b.
 //
-//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
-//   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
-//   dst[95:64] := a[95:64]
-//   dst[127:96] := a[127:96]
+//   r0 := a4
+//   r1 := b4
+//   r2 := a5
+//   r3 := b5
+//   r4 := a6
+//   r5 := b6
+//   r6 := a7
+//   r7 := b7
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
-FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
+// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
 {
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
-                     vget_high_f32(vreinterpretq_f32_m128(a))));
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
+    int16x4x2_t result = vzip_s16(a1, b1);
+    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
 }
 
-// Convert packed signed 32-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, store the results in the lower 2 elements
-// of dst, then covert the packed signed 32-bit integers in b to
-// single-precision (32-bit) floating-point element, and store the results in
-// the upper 2 elements of dst.
+// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
+// upper 2 signed or unsigned 32-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(
+        vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
+    int32x2x2_t result = vzip_s32(a1, b1);
+    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the upper signed or unsigned 64-bit integer in a with the
+// upper signed or unsigned 64-bit integer in b.
 //
-//   dst[31:0] := Convert_Int32_To_FP32(a[31:0])
-//   dst[63:32] := Convert_Int32_To_FP32(a[63:32])
-//   dst[95:64] := Convert_Int32_To_FP32(b[31:0])
-//   dst[127:96] := Convert_Int32_To_FP32(b[63:32])
+//   r0 := a1
+//   r1 := b1
+FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
+{
+    int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
+    int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
+}
+
+// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
+// 8 signed or unsigned 8-bit integers in b.
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
-FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
+//   r0 := a8
+//   r1 := b8
+//   r2 := a9
+//   r3 := b9
+//   ...
+//   r14 := a15
+//   r15 := b15
+//
+// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
 {
-    return vreinterpretq_m128_f32(vcvtq_f32_s32(
-        vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(
+        vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+    int8x8_t a1 =
+        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
+    int8x8_t b1 =
+        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
+    int8x8x2_t result = vzip_s8(a1, b1);
+    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
 }
 
-// Convert the lower packed 8-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, and store the results in dst.
+// Unpack and interleave double-precision (64-bit) floating-point elements from
+// the high half of a and b, and store the results in dst.
 //
-//   FOR j := 0 to 3
-//      i := j*8
-//      m := j*32
-//      dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
-//   ENDFOR
+//   DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+//     dst[63:0] := src1[127:64]
+//     dst[127:64] := src2[127:64]
+//     RETURN dst[127:0]
+//   }
+//   dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
-FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd
+FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128_f32(vcvtq_f32_s32(
-        vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    return vreinterpretq_m128d_s64(
+        vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
+                     vget_high_s64(vreinterpretq_s64_m128d(b))));
+#endif
 }
 
-// Convert packed unsigned 16-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, and store the results in dst.
+// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
+// lower 4 signed or unsigned 16-bit integers in b.
 //
-//   FOR j := 0 to 3
-//      i := j*16
-//      m := j*32
-//      dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
-//   ENDFOR
+//   r0 := a0
+//   r1 := b0
+//   r2 := a1
+//   r3 := b1
+//   r4 := a2
+//   r5 := b2
+//   r6 := a3
+//   r7 := b3
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
-FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
+// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
 {
-    return vreinterpretq_m128_f32(
-        vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int16x4x2_t result = vzip_s16(a1, b1);
+    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
 }
 
-// Convert the lower packed unsigned 8-bit integers in a to packed
-// single-precision (32-bit) floating-point elements, and store the results in
-// dst.
+// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the
+// lower 2 signed or unsigned 32 - bit integers in b.
 //
-//   FOR j := 0 to 3
-//      i := j*8
-//      m := j*32
-//      dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
-//   ENDFOR
+//   r0 := a0
+//   r1 := b0
+//   r2 := a1
+//   r3 := b1
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
-FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
+// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
 {
-    return vreinterpretq_m128_f32(vcvtq_f32_u32(
-        vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(
+        vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+    int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
+    int32x2x2_t result = vzip_s32(a1, b1);
+    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
 }
 
-// Converts the four single-precision, floating-point values of a to signed
-// 32-bit integer values using truncate.
-// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
+FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
 {
-    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
+    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
+    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
 }
 
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
+// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
+// 8 signed or unsigned 8-bit integers in b.
 //
-//   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+//   r0 := a0
+//   r1 := b0
+//   r2 := a1
+//   r3 := b1
+//   ...
+//   r14 := a7
+//   r15 := b7
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
-FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
+// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
 {
 #if defined(__aarch64__)
-    return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
+    return vreinterpretq_m128i_s8(
+        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 #else
-    double ret = *((double *) &a);
-    return (int64_t) ret;
+    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
+    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int8x8x2_t result = vzip_s8(a1, b1);
+    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
 #endif
 }
 
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
+// Unpack and interleave double-precision (64-bit) floating-point elements from
+// the low half of a and b, and store the results in dst.
 //
-//   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+//   DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+//     dst[63:0] := src1[63:0]
+//     dst[127:64] := src2[63:0]
+//     RETURN dst[127:0]
+//   }
+//   dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
-#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
-
-// Converts the four signed 32-bit integer values of a to single-precision,
-// floating-point values
-// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd
+FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
 {
-    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    return vreinterpretq_m128d_s64(
+        vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
+                     vget_low_s64(vreinterpretq_s64_m128d(b))));
+#endif
 }
 
-// Converts the four unsigned 8-bit integers in the lower 16 bits to four
-// unsigned 32-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
+// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+//
+//   FOR j := 0 to 1
+//      i := j*64
+//      dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
+FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
 {
-    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx xxxx DCBA */
-    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
-    return vreinterpretq_m128i_u16(u16x8);
+    return vreinterpretq_m128d_s64(
+        veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
 }
 
-// Converts the four unsigned 8-bit integers in the lower 32 bits to four
-// unsigned 32-bit integers.
-// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
-FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
+// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
+// b.  https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
 {
-    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
-    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
-    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
-    return vreinterpretq_m128i_u32(u32x4);
+    return vreinterpretq_m128i_s32(
+        veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
-// Converts the two unsigned 8-bit integers in the lower 16 bits to two
-// unsigned 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
+/* SSE3 */
+
+// Alternatively add and subtract packed double-precision (64-bit)
+// floating-point elements in a to/from packed elements in b, and store the
+// results in dst.
+//
+// FOR j := 0 to 1
+//   i := j*64
+//   IF ((j & 1) == 0)
+//     dst[i+63:i] := a[i+63:i] - b[i+63:i]
+//   ELSE
+//     dst[i+63:i] := a[i+63:i] + b[i+63:i]
+//   FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd
+FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
 {
-    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
-    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
-    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
-    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_u64(u64x2);
+    __m128d mask = _mm_set_pd(1.0f, -1.0f);
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
+                                             vreinterpretq_f64_m128d(b),
+                                             vreinterpretq_f64_m128d(mask)));
+#else
+    return _mm_add_pd(_mm_mul_pd(b, mask), a);
+#endif
 }
 
-// Converts the four unsigned 8-bit integers in the lower 16 bits to four
-// unsigned 32-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
+// Alternatively add and subtract packed single-precision (32-bit)
+// floating-point elements in a to/from packed elements in b, and store the
+// results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
+FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
 {
-    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
-    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
-    return vreinterpretq_m128i_s16(s16x8);
+    __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
+#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
+    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
+                                            vreinterpretq_f32_m128(mask),
+                                            vreinterpretq_f32_m128(b)));
+#else
+    return _mm_add_ps(_mm_mul_ps(b, mask), a);
+#endif
 }
 
-// Converts the four unsigned 8-bit integers in the lower 32 bits to four
-// unsigned 32-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
+// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
+// elements in a and b, and pack the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd
+FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
 {
-    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
-    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
-    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
-    return vreinterpretq_m128i_s32(s32x4);
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[] = {da[0] + da[1], db[0] + db[1]};
+    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
 }
 
-// Converts the two signed 8-bit integers in the lower 32 bits to four
-// signed 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
+// Computes pairwise add of each argument as single-precision, floating-point
+// values a and b.
+// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
+FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
 {
-    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
-    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
-    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
-    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_s64(s64x2);
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
+#endif
 }
 
-// Converts the four signed 16-bit integers in the lower 64 bits to four signed
-// 32-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
+// Horizontally subtract adjacent pairs of double-precision (64-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd
+FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
 {
-    return vreinterpretq_m128i_s32(
-        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsubq_f64(
+        vuzp1q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b)),
+        vuzp2q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b))));
+#else
+    double *da = (double *) &_a;
+    double *db = (double *) &_b;
+    double c[] = {da[0] - da[1], db[0] - db[1]};
+    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
 }
 
-// Converts the two signed 16-bit integers in the lower 32 bits two signed
-// 32-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
+// Horizontally substract adjacent pairs of single-precision (32-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
+FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
 {
-    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
-    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
-    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_s64(s64x2);
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsubq_f32(
+        vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
+        vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
+#else
+    float32x4x2_t c =
+        vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
+    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
+#endif
 }
 
-// Converts the four unsigned 16-bit integers in the lower 64 bits to four
-// unsigned 32-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
+// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
+// may perform better than _mm_loadu_si128 when the data crosses a cache line
+// boundary.
+//
+//   dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
+#define _mm_lddqu_si128 _mm_loadu_si128
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+//
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
+#define _mm_loaddup_pd _mm_load1_pd
+
+// Duplicate the low double-precision (64-bit) floating-point element from a,
+// and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd
+FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
 {
-    return vreinterpretq_m128i_u32(
-        vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
+#if (__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
+#else
+    return vreinterpretq_m128d_u64(
+        vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
+#endif
 }
 
-// Converts the two unsigned 16-bit integers in the lower 32 bits to two
-// unsigned 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
+// Duplicate odd-indexed single-precision (32-bit) floating-point elements
+// from a, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
+FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
 {
-    uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
-    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
-    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_u64(u64x2);
+#if __has_builtin(__builtin_shufflevector)
+    return vreinterpretq_m128_f32(__builtin_shufflevector(
+        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
+#else
+    float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+    float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
+    float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+#endif
 }
 
-// Converts the two unsigned 32-bit integers in the lower 64 bits to two
-// unsigned 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
+// Duplicate even-indexed single-precision (32-bit) floating-point elements
+// from a, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
+FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
 {
-    return vreinterpretq_m128i_u64(
-        vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
+#if __has_builtin(__builtin_shufflevector)
+    return vreinterpretq_m128_f32(__builtin_shufflevector(
+        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
+#else
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
+    float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+#endif
 }
 
-// Converts the two signed 32-bit integers in the lower 64 bits to two signed
-// 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
+/* SSSE3 */
+
+// Compute the absolute value of packed signed 16-bit integers in a, and store
+// the unsigned results in dst.
+//
+//   FOR j := 0 to 7
+//     i := j*16
+//     dst[i+15:i] := ABS(a[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
+FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
 {
-    return vreinterpretq_m128i_s64(
-        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
+    return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
 }
 
-// Converts the four single-precision, floating-point values of a to signed
-// 32-bit integer values.
+// Compute the absolute value of packed signed 32-bit integers in a, and store
+// the unsigned results in dst.
 //
-//   r0 := (int) a0
-//   r1 := (int) a1
-//   r2 := (int) a2
-//   r3 := (int) a3
+//   FOR j := 0 to 3
+//     i := j*32
+//     dst[i+31:i] := ABS(a[i+31:i])
+//   ENDFOR
 //
-// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
-// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
-// does not support! It is supported on ARMv8-A however.
-FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
+FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
 {
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
-#else
-    uint32x4_t signmask = vdupq_n_u32(0x80000000);
-    float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
-                                 vdupq_n_f32(0.5f)); /* +/- 0.5 */
-    int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
-        vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
-    int32x4_t r_trunc =
-        vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
-    int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
-        vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
-    int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
-                                 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
-    float32x4_t delta = vsubq_f32(
-        vreinterpretq_f32_m128(a),
-        vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
-    uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
-    return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
-#endif
+    return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
 }
 
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 16-bit integers, and store the results in dst. Note: this intrinsic
-// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
-// 0x7FFFFFFF.
+// Compute the absolute value of packed signed 8-bit integers in a, and store
+// the unsigned results in dst.
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
-FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
+//   FOR j := 0 to 15
+//     i := j*8
+//     dst[i+7:i] := ABS(a[i+7:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
+FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
 {
-    return vreinterpret_m64_s16(
-        vmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
+    return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
 }
 
-// Copy the lower 32-bit integer in a to dst.
+// Compute the absolute value of packed signed 16-bit integers in a, and store
+// the unsigned results in dst.
 //
-//   dst[31:0] := a[31:0]
+//   FOR j := 0 to 3
+//     i := j*16
+//     dst[i+15:i] := ABS(a[i+15:i])
+//   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
-FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
+FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
 {
-    return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
+    return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
 }
 
-// Copy the lower 64-bit integer in a to dst.
+// Compute the absolute value of packed signed 32-bit integers in a, and store
+// the unsigned results in dst.
 //
-//   dst[63:0] := a[63:0]
+//   FOR j := 0 to 1
+//     i := j*32
+//     dst[i+31:i] := ABS(a[i+31:i])
+//   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
-FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
+FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
 {
-    return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
+    return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
 }
 
-// Copy the lower 64-bit integer in a to dst.
+// Compute the absolute value of packed signed 8-bit integers in a, and store
+// the unsigned results in dst.
 //
-//   dst[63:0] := a[63:0]
+//   FOR j := 0 to 7
+//     i := j*8
+//     dst[i+7:i] := ABS(a[i+7:i])
+//   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
-#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
+FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
+{
+    return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
+}
 
-// Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
-// zero extending the upper bits.
+// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
+// the result right by imm8 bytes, and store the low 16 bytes in dst.
 //
-//   r0 := a
-//   r1 := 0x0
-//   r2 := 0x0
-//   r3 := 0x0
+//   tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
+//   dst[127:0] := tmp[127:0]
 //
-// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
+FORCE_INLINE __m128i _mm_alignr_epi8(__m128i a, __m128i b, int imm)
 {
-    return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
+    if (_sse2neon_unlikely(imm & ~31))
+        return _mm_setzero_si128();
+    int idx;
+    uint8x16_t tmp[2];
+    if (imm >= 16) {
+        idx = imm - 16;
+        tmp[0] = vreinterpretq_u8_m128i(a);
+        tmp[1] = vdupq_n_u8(0);
+    } else {
+        idx = imm;
+        tmp[0] = vreinterpretq_u8_m128i(b);
+        tmp[1] = vreinterpretq_u8_m128i(a);
+    }
+    return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + idx));
 }
 
-// Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
-// zero extending the upper bits.
+// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
+// the result right by imm8 bytes, and store the low 8 bytes in dst.
 //
-//   r0 := a
-//   r1 := 0x0
-FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
+//   tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
+//   dst[63:0] := tmp[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
+#define _mm_alignr_pi8(a, b, imm)                                           \
+    __extension__({                                                         \
+        __m64 ret;                                                          \
+        if (_sse2neon_unlikely((imm) >= 16)) {                              \
+            ret = vreinterpret_m64_s8(vdup_n_s8(0));                        \
+        } else {                                                            \
+            uint8x8_t tmp_low, tmp_high;                                    \
+            if (imm >= 8) {                                                 \
+                const int idx = imm - 8;                                    \
+                tmp_low = vreinterpret_u8_m64(a);                           \
+                tmp_high = vdup_n_u8(0);                                    \
+                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
+            } else {                                                        \
+                const int idx = imm;                                        \
+                tmp_low = vreinterpret_u8_m64(b);                           \
+                tmp_high = vreinterpret_u8_m64(a);                          \
+                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
+            }                                                               \
+        }                                                                   \
+        ret;                                                                \
+    })
+
+// Computes pairwise add of each argument as a 16-bit signed or unsigned integer
+// values a and b.
+FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
 {
-    return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
+#else
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
+                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
+#endif
 }
 
-// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
-FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
+// Computes pairwise add of each argument as a 32-bit signed or unsigned integer
+// values a and b.
+FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
 {
-    return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
+                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
 }
 
-// Applies a type cast to reinterpret four 32-bit floating point values passed
-// in as a 128-bit parameter as packed 32-bit integers.
-// https://msdn.microsoft.com/en-us/library/bb514099.aspx
-FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
+// signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
+FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
 {
-    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
+    return vreinterpret_m64_s16(
+        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
 }
 
-// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
-FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
+// signed 32-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
+FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s32(
+        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
+}
+
+// Computes saturated pairwise sub of each argument as a 16-bit signed
+// integer values a and b.
+FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
 {
 #if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+    return vreinterpretq_s64_s16(
+        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
 #else
-    return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
 #endif
 }
 
-// Applies a type cast to reinterpret four 32-bit integers passed in as a
-// 128-bit parameter as packed 32-bit floating point values.
-// https://msdn.microsoft.com/en-us/library/bb514029.aspx
-FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
+// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
+// saturation, and pack the signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_pi16
+FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
 {
-    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+#else
+    int16x4x2_t res = vuzp_s16(a, b);
+    return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
+#endif
 }
 
-// Loads 128-bit value. :
-// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
-FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
+// Computes pairwise difference of each argument as a 16-bit signed or unsigned
+// integer values a and b.
+FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
 {
-    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Subtract
+    return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
 }
 
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-//
-//   dst[63:0] := MEM[mem_addr+63:mem_addr]
-//   dst[127:64] := MEM[mem_addr+63:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
-FORCE_INLINE __m128d _mm_load1_pd(const double *p)
+// Computes pairwise difference of each argument as a 32-bit signed or unsigned
+// integer values a and b.
+FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
+{
+    int64x2_t a = vreinterpretq_s64_m128i(_a);
+    int64x2_t b = vreinterpretq_s64_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|b0|b2]
+    // [a1|a2|b1|b3]
+    int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
+    int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
+    // Subtract
+    return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
+}
+
+// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
+// the signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi16
+FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
+{
+    int32x4_t ab =
+        vcombine_s32(vreinterpret_s32_m64(_a), vreinterpret_s32_m64(_b));
+
+    int16x4_t ab_low_bits = vmovn_s32(ab);
+    int16x4_t ab_high_bits = vshrn_n_s32(ab, 16);
+
+    return vreinterpret_m64_s16(vsub_s16(ab_low_bits, ab_high_bits));
+}
+
+// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
+// the signed 32-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_hsub_pi32
+FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
 {
 #if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
+    int32x2_t a = vreinterpret_s32_m64(_a);
+    int32x2_t b = vreinterpret_s32_m64(_b);
+    return vreinterpret_m64_s32(vsub_s32(vtrn1_s32(a, b), vtrn2_s32(a, b)));
 #else
-    return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
+    int32x2x2_t trn_ab =
+        vtrn_s32(vreinterpret_s32_m64(_a), vreinterpret_s32_m64(_b));
+    return vreinterpret_m64_s32(vsub_s32(trn_ab.val[0], trn_ab.val[1]));
 #endif
 }
 
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-//
-//   dst[63:0] := MEM[mem_addr+63:mem_addr]
-//   dst[127:64] := MEM[mem_addr+63:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
-#define _mm_load_pd1 _mm_load1_pd
+// Computes saturated pairwise difference of each argument as a 16-bit signed
+// integer values a and b.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
+FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+    return vreinterpretq_s64_s16(
+        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Saturated subtract
+    return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
+#endif
+}
 
-// Load a double-precision (64-bit) floating-point element from memory into the
-// upper element of dst, and copy the lower element from a to dst. mem_addr does
-// not need to be aligned on any particular boundary.
-//
-//   dst[63:0] := a[63:0]
-//   dst[127:64] := MEM[mem_addr+63:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
-FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
+// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
+// using saturation, and pack the signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_pi16
+FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
 {
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
 #if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
+    return vreinterpret_s64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
 #else
-    return vreinterpretq_m128d_f32(vcombine_f32(
-        vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
+    int16x4x2_t res = vuzp_s16(a, b);
+    return vreinterpret_s64_s16(vqsub_s16(res.val[0], res.val[1]));
 #endif
 }
 
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-//
-//   dst[63:0] := MEM[mem_addr+63:mem_addr]
-//   dst[127:64] := MEM[mem_addr+63:mem_addr]
+// Vertically multiply each unsigned 8-bit integer from a with the corresponding
+// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
+// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
+// and pack the saturated results in dst.
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
-#define _mm_load_pd1 _mm_load1_pd
+//   FOR j := 0 to 7
+//      i := j*16
+//      dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
+//      a[i+7:i]*b[i+7:i] )
+//   ENDFOR
+FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    uint8x16_t a = vreinterpretq_u8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
+                             vmovl_s8(vget_low_s8(b)));
+    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
+                             vmovl_s8(vget_high_s8(b)));
+    return vreinterpretq_m128i_s16(
+        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
+#else
+    // This would be much simpler if x86 would choose to zero extend OR sign
+    // extend, not both. This could probably be optimized better.
+    uint16x8_t a = vreinterpretq_u16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
 
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-//
-//   dst[63:0] := MEM[mem_addr+63:mem_addr]
-//   dst[127:64] := MEM[mem_addr+63:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
-#define _mm_loaddup_pd _mm_load1_pd
+    // Zero extend a
+    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
+    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
 
-// Loads 128-bit value. :
-// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
+    // Sign extend by shifting left then shifting right.
+    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
+    int16x8_t b_odd = vshrq_n_s16(b, 8);
+
+    // multiply
+    int16x8_t prod1 = vmulq_s16(a_even, b_even);
+    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
+
+    // saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
+#endif
+}
+
+// Vertically multiply each unsigned 8-bit integer from a with the corresponding
+// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
+// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
+// pack the saturated results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_pi16
+FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
 {
-    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+    uint16x4_t a = vreinterpret_u16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+
+    // Zero extend a
+    int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
+    int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
+
+    // Sign extend by shifting left then shifting right.
+    int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
+    int16x4_t b_odd = vshr_n_s16(b, 8);
+
+    // multiply
+    int16x4_t prod1 = vmul_s16(a_even, b_even);
+    int16x4_t prod2 = vmul_s16(a_odd, b_odd);
+
+    // saturated add
+    return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
 }
 
-// Load unaligned 32-bit integer from memory into the first element of dst.
-//
-//   dst[31:0] := MEM[mem_addr+31:mem_addr]
-//   dst[MAX:32] := 0
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
+// the packed 16-bit integers in dst.
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
-FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
+//   r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
+//   r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
+//   r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
+//   ...
+//   r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
+FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
 {
-    return vreinterpretq_m128i_s32(
-        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
+    // Has issues due to saturation
+    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
+
+    // Multiply
+    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    // Rounding narrowing shift right
+    // narrow = (int16_t)((mul + 16384) >> 15);
+    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
+    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
+
+    // Join together
+    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
 }
 
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed single-precision (32-bit) floating-point elements, and store the
-// results in dst.
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Truncate each intermediate integer to the 18 most
+// significant bits, round by adding 1, and store bits [16:1] to dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_pi16
+FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
+{
+    int32x4_t mul_extend =
+        vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
+
+    // Rounding narrowing shift right
+    return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
+}
+
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
+FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
+{
+    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
+    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
+    uint8x16_t idx_masked =
+        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
+#elif defined(__GNUC__)
+    int8x16_t ret;
+    // %e and %f represent the even and odd D registers
+    // respectively.
+    __asm__ __volatile__(
+        "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
+        "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
+        : [ret] "=&w"(ret)
+        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
+    return vreinterpretq_m128i_s8(ret);
+#else
+    // use this line if testing on aarch64
+    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
+                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
+#endif
+}
+
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
 //
-//   FOR j := 0 to 1
-//     i := 32*j
-//     k := 64*j
-//     dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
+//   FOR j := 0 to 7
+//     i := j*8
+//     IF b[i+7] == 1
+//       dst[i+7:i] := 0
+//     ELSE
+//       index[2:0] := b[i+2:i]
+//       dst[i+7:i] := a[index*8+7:index*8]
+//     FI
 //   ENDFOR
-//   dst[127:64] := 0
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
-FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi8
+FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
+{
+    const int8x8_t controlMask =
+        vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t)(0x1 << 7 | 0x07)));
+    int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
+    return vreinterpret_m64_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed
+// 16-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+//   for i in 0..7
+//     if b[i] < 0
+//       r[i] := -a[i]
+//     else if b[i] == 0
+//       r[i] := 0
+//     else
+//       r[i] := a[i]
+//     fi
+//   done
+FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
 {
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
+    // (b == 0) ? 0xFFFF : 0
 #if defined(__aarch64__)
-    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
-    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
 #else
-    float a0 = (float) ((double *) &a)[0];
-    float a1 = (float) ((double *) &a)[1];
-    return _mm_set_ps(0, 0, a1, a0);
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
 #endif
+
+    // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
+    // 'a') based on ltMask
+    int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x8_t res = vbicq_s16(masked, zeroMask);
+    return vreinterpretq_m128i_s16(res);
 }
 
-// Copy the lower double-precision (64-bit) floating-point element of a to dst.
+// Negate packed 32-bit integers in a when the corresponding signed
+// 32-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
 //
-//   dst[63:0] := a[63:0]
+//   for i in 0..3
+//     if b[i] < 0
+//       r[i] := -a[i]
+//     else if b[i] == 0
+//       r[i] := 0
+//     else
+//       r[i] := a[i]
+//     fi
+//   done
+FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
+#else
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
+    // 'a') based on ltMask
+    int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x4_t res = vbicq_s32(masked, zeroMask);
+    return vreinterpretq_m128i_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed
+// 8-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
-FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
+//   for i in 0..15
+//     if b[i] < 0
+//       r[i] := -a[i]
+//     else if b[i] == 0
+//       r[i] := 0
+//     else
+//       r[i] := a[i]
+//     fi
+//   done
+FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
 {
+    int8x16_t a = vreinterpretq_s8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
 #if defined(__aarch64__)
-    return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
 #else
-    return ((double *) &a)[0];
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
 #endif
+
+    // bitwise select either a or nagative 'a' (vnegq_s8(a) return nagative 'a')
+    // based on ltMask
+    int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x16_t res = vbicq_s8(masked, zeroMask);
+
+    return vreinterpretq_m128i_s8(res);
 }
 
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed double-precision (64-bit) floating-point elements, and store the
-// results in dst.
+// Negate packed 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative, and store the results in dst. Element in dst are
+// zeroed out when the corresponding element in b is zero.
 //
-//   FOR j := 0 to 1
-//     i := 64*j
-//     k := 32*j
-//     dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
+//   FOR j := 0 to 3
+//      i := j*16
+//      IF b[i+15:i] < 0
+//        dst[i+15:i] := -(a[i+15:i])
+//      ELSE IF b[i+15:i] == 0
+//        dst[i+15:i] := 0
+//      ELSE
+//        dst[i+15:i] := a[i+15:i]
+//      FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
-FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
+FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
 {
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
+
+    // (b == 0) ? 0xFFFF : 0
 #if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
 #else
-    double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
-    return _mm_set_pd(a1, a0);
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
 #endif
-}
 
-// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
-FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
-{
-    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
+    // bitwise select either a or nagative 'a' (vneg_s16(a) return nagative 'a')
+    // based on ltMask
+    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x4_t res = vbic_s16(masked, zeroMask);
+
+    return vreinterpret_m64_s16(res);
 }
 
-// Cast vector of type __m128d to type __m128. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
-FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
+// Negate packed 32-bit integers in a when the corresponding signed 32-bit
+// integer in b is negative, and store the results in dst. Element in dst are
+// zeroed out when the corresponding element in b is zero.
+//
+//   FOR j := 0 to 1
+//      i := j*32
+//      IF b[i+31:i] < 0
+//        dst[i+31:i] := -(a[i+31:i])
+//      ELSE IF b[i+31:i] == 0
+//        dst[i+31:i] := 0
+//      ELSE
+//        dst[i+31:i] := a[i+31:i]
+//      FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
+FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
 {
-    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
+    int32x2_t a = vreinterpret_s32_m64(_a);
+    int32x2_t b = vreinterpret_s32_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
+#else
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
+#endif
+
+    // bitwise select either a or nagative 'a' (vneg_s32(a) return nagative 'a')
+    // based on ltMask
+    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x2_t res = vbic_s32(masked, zeroMask);
+
+    return vreinterpret_m64_s32(res);
 }
 
-// Blend packed single-precision (32-bit) floating-point elements from a and b
-// using mask, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
-FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
+// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
+// in b is negative, and store the results in dst. Element in dst are zeroed out
+// when the corresponding element in b is zero.
+//
+//   FOR j := 0 to 7
+//      i := j*8
+//      IF b[i+7:i] < 0
+//        dst[i+7:i] := -(a[i+7:i])
+//      ELSE IF b[i+7:i] == 0
+//        dst[i+7:i] := 0
+//      ELSE
+//        dst[i+7:i] := a[i+7:i]
+//      FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
+FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
 {
-    // Use a signed shift right to create a mask with the sign bit
-    uint32x4_t mask =
-        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
-    float32x4_t a = vreinterpretq_f32_m128(_a);
-    float32x4_t b = vreinterpretq_f32_m128(_b);
-    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
+    int8x8_t a = vreinterpret_s8_m64(_a);
+    int8x8_t b = vreinterpret_s8_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
+#else
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
+#endif
+
+    // bitwise select either a or nagative 'a' (vneg_s8(a) return nagative 'a')
+    // based on ltMask
+    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x8_t res = vbic_s8(masked, zeroMask);
+
+    return vreinterpret_m64_s8(res);
 }
 
+/* SSE4.1 */
+
+// Blend packed 16-bit integers from a and b using control mask imm8, and store
+// the results in dst.
+//
+//   FOR j := 0 to 7
+//       i := j*16
+//       IF imm8[j]
+//           dst[i+15:i] := b[i+15:i]
+//       ELSE
+//           dst[i+15:i] := a[i+15:i]
+//       FI
+//   ENDFOR
+// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
+//                                      __constrange(0,255) int imm)
+#define _mm_blend_epi16(a, b, imm)                                            \
+    __extension__({                                                           \
+        const uint16_t ones = 0xffff;                                         \
+        const uint16_t zeros = 0x0000;                                        \
+        const uint16_t _mask[8] = {((imm) & (1 << 0)) ? ones : zeros,         \
+                                   ((imm) & (1 << 1)) ? ones : zeros,         \
+                                   ((imm) & (1 << 2)) ? ones : zeros,         \
+                                   ((imm) & (1 << 3)) ? ones : zeros,         \
+                                   ((imm) & (1 << 4)) ? ones : zeros,         \
+                                   ((imm) & (1 << 5)) ? ones : zeros,         \
+                                   ((imm) & (1 << 6)) ? ones : zeros,         \
+                                   ((imm) & (1 << 7)) ? ones : zeros};        \
+        uint16x8_t _mask_vec = vld1q_u16(_mask);                              \
+        uint16x8_t _a = vreinterpretq_u16_m128i(a);                           \
+        uint16x8_t _b = vreinterpretq_u16_m128i(b);                           \
+        vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a));                \
+    })
+
+// Blend packed double-precision (64-bit) floating-point elements from a and b
+// using control mask imm8, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd
+#define _mm_blend_pd(a, b, imm)                                \
+    __extension__({                                            \
+        const uint64_t _mask[2] = {                            \
+            ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0),   \
+            ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)};  \
+        uint64x2_t _mask_vec = vld1q_u64(_mask);               \
+        uint64x2_t _a = vreinterpretq_u64_m128d(a);            \
+        uint64x2_t _b = vreinterpretq_u64_m128d(b);            \
+        vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
+    })
+
 // Blend packed single-precision (32-bit) floating-point elements from a and b
 // using mask, and store the results in dst.
 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps
@@ -5672,6 +7337,27 @@ FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
     return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
 }
 
+// Blend packed 8-bit integers from a and b using mask, and store the results in
+// dst.
+//
+//   FOR j := 0 to 15
+//       i := j*8
+//       IF mask[i+7]
+//           dst[i+7:i] := b[i+7:i]
+//       ELSE
+//           dst[i+7:i] := a[i+7:i]
+//       FI
+//   ENDFOR
+FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
+{
+    // Use a signed shift right to create a mask with the sign bit
+    uint8x16_t mask =
+        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
+    uint8x16_t a = vreinterpretq_u8_m128i(_a);
+    uint8x16_t b = vreinterpretq_u8_m128i(_b);
+    return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
+}
+
 // Blend packed double-precision (64-bit) floating-point elements from a and b
 // using mask, and store the results in dst.
 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd
@@ -5690,154 +7376,55 @@ FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
 #endif
 }
 
-typedef struct {
-    uint16_t res0;
-    uint8_t res1 : 6;
-    uint8_t bit22 : 1;
-    uint8_t bit23 : 1;
-    uint8_t res2;
-#if defined(__aarch64__)
-    uint32_t res3;
-#endif
-} fpcr_bitfield;
-
-// Macro: Set the rounding mode bits of the MXCSR control and status register to
-// the value in unsigned 32-bit integer a. The rounding mode may contain any of
-// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
-// _MM_ROUND_TOWARD_ZERO
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
-FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
-{
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    switch (rounding) {
-    case _MM_ROUND_TOWARD_ZERO:
-        r.field.bit22 = 1;
-        r.field.bit23 = 1;
-        break;
-    case _MM_ROUND_DOWN:
-        r.field.bit22 = 0;
-        r.field.bit23 = 1;
-        break;
-    case _MM_ROUND_UP:
-        r.field.bit22 = 1;
-        r.field.bit23 = 0;
-        break;
-    default:  //_MM_ROUND_NEAREST
-        r.field.bit22 = 0;
-        r.field.bit23 = 0;
-    }
-
-#if defined(__aarch64__)
-    asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
-#else
-    asm volatile("vmsr FPSCR, %0" ::"r"(r));        /* write */
-#endif
-}
-
-FORCE_INLINE void _mm_setcsr(unsigned int a)
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
+FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
 {
-    _MM_SET_ROUNDING_MODE(a);
+    // Use a signed shift right to create a mask with the sign bit
+    uint32x4_t mask =
+        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
+    float32x4_t a = vreinterpretq_f32_m128(_a);
+    float32x4_t b = vreinterpretq_f32_m128(_b);
+    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
 }
 
-// Round the packed single-precision (32-bit) floating-point elements in a using
-// the rounding parameter, and store the results as packed single-precision
+// Round the packed double-precision (64-bit) floating-point elements in a up
+// to an integer value, and store the results as packed double-precision
 // floating-point elements in dst.
-// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
-FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd
+FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
 {
 #if defined(__aarch64__)
-    switch (rounding) {
-    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
-    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
-    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
-    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
-    default:  //_MM_FROUND_CUR_DIRECTION
-        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
-    }
+    return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
 #else
-    float *v_float = (float *) &a;
-    __m128 zero, neg_inf, pos_inf;
-
-    switch (rounding) {
-    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
-        return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
-    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
-        return (__m128){floorf(v_float[0]), floorf(v_float[1]),
-                        floorf(v_float[2]), floorf(v_float[3])};
-    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
-        return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]),
-                        ceilf(v_float[3])};
-    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
-        zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
-        neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]),
-                             floorf(v_float[2]), floorf(v_float[3]));
-        pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]),
-                             ceilf(v_float[2]), ceilf(v_float[3]));
-        return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero));
-    default:  //_MM_FROUND_CUR_DIRECTION
-        return (__m128){roundf(v_float[0]), roundf(v_float[1]),
-                        roundf(v_float[2]), roundf(v_float[3])};
-    }
+    double *f = (double *) &a;
+    return _mm_set_pd(ceil(f[1]), ceil(f[0]));
 #endif
 }
 
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-//
-//   FOR j := 0 to 1
-//       i := 32*j
-//       dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
-FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
+// Round the packed single-precision (32-bit) floating-point elements in a up to
+// an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
+FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
 {
 #if defined(__aarch64__)
-    return vreinterpret_m64_s32(
-        vget_low_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))));
+    return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
 #else
-    return vreinterpret_m64_s32(
-        vcvt_s32_f32(vget_low_f32(vreinterpretq_f32_m128(
-            _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)))));
+    float *f = (float *) &a;
+    return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
 #endif
 }
 
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-//
-//   FOR j := 0 to 1
-//       i := 32*j
-//       dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
-#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
-
-// Round the packed single-precision (32-bit) floating-point elements in a up to
-// an integer value, and store the results as packed single-precision
-// floating-point elements in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
-FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
+// Round the lower double-precision (64-bit) floating-point element in b up to
+// an integer value, store the result as a double-precision floating-point
+// element in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd
+FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
 {
-    return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+    return _mm_move_sd(a, _mm_ceil_pd(b));
 }
 
 // Round the lower single-precision (32-bit) floating-point element in b up to
@@ -5851,396 +7438,442 @@ FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss
 FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
 {
-    return _mm_move_ss(
-        a, _mm_round_ps(b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
+    return _mm_move_ss(a, _mm_ceil_ps(b));
 }
 
-// Round the packed single-precision (32-bit) floating-point elements in a down
-// to an integer value, and store the results as packed single-precision
-// floating-point elements in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
-FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
+// Compare packed 64-bit integers in a and b for equality, and store the results
+// in dst
+FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
 {
-    return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_u64(
+        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
+#else
+    // ARMv7 lacks vceqq_u64
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
+#endif
 }
 
-// Round the lower single-precision (32-bit) floating-point element in b down to
-// an integer value, store the result as a single-precision floating-point
-// element in the lower element of dst, and copy the upper 3 packed elements
-// from a to the upper elements of dst.
-//
-//   dst[31:0] := FLOOR(b[31:0])
-//   dst[127:32] := a[127:32]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
-FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
+// Converts the four signed 16-bit integers in the lower 64 bits to four signed
+// 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
 {
-    return _mm_move_ss(
-        a, _mm_round_ps(b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
+    return vreinterpretq_m128i_s32(
+        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
 }
 
-// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
-// may perform better than _mm_loadu_si128 when the data crosses a cache line
-// boundary.
-//
-//   dst[127:0] := MEM[mem_addr+127:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
-#define _mm_lddqu_si128 _mm_loadu_si128
+// Converts the two signed 16-bit integers in the lower 32 bits two signed
+// 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
+{
+    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
 
-/* Miscellaneous Operations */
+// Converts the two signed 32-bit integers in the lower 64 bits to two signed
+// 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_s64(
+        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
+}
 
-// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
-// in the sign bit.
-//
-//   r0 := a0 >> count
-//   r1 := a1 >> count
-//   ...
-//   r7 := a7 >> count
-//
-// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
+// Converts the four unsigned 8-bit integers in the lower 16 bits to four
+// unsigned 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
 {
-    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
-    if (unlikely(c > 15))
-        return _mm_cmplt_epi16(a, _mm_setzero_si128());
-    return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+    return vreinterpretq_m128i_s16(s16x8);
 }
 
-// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
-// in the sign bit.
-//
-//   r0 := a0 >> count
-//   r1 := a1 >> count
-//   r2 := a2 >> count
-//   r3 := a3 >> count
-//
-// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
-FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
+// Converts the four unsigned 8-bit integers in the lower 32 bits to four
+// unsigned 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
 {
-    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
-    if (unlikely(c > 31))
-        return _mm_cmplt_epi32(a, _mm_setzero_si128());
-    return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_s32(s32x4);
 }
 
-// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
-// saturates.
-// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
+// Converts the two signed 8-bit integers in the lower 32 bits to four
+// signed 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
 {
-    return vreinterpretq_m128i_s8(
-        vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
-                    vqmovn_s16(vreinterpretq_s16_m128i(b))));
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
 }
 
-// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned
-// integers and saturates.
-//
-//   r0 := UnsignedSaturate(a0)
-//   r1 := UnsignedSaturate(a1)
-//   ...
-//   r7 := UnsignedSaturate(a7)
-//   r8 := UnsignedSaturate(b0)
-//   r9 := UnsignedSaturate(b1)
-//   ...
-//   r15 := UnsignedSaturate(b7)
-//
-// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
+// Converts the four unsigned 16-bit integers in the lower 64 bits to four
+// unsigned 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
 {
-    return vreinterpretq_m128i_u8(
-        vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
-                    vqmovun_s16(vreinterpretq_s16_m128i(b))));
+    return vreinterpretq_m128i_u32(
+        vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
 }
 
-// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
-// and saturates.
-//
-//   r0 := SignedSaturate(a0)
-//   r1 := SignedSaturate(a1)
-//   r2 := SignedSaturate(a2)
-//   r3 := SignedSaturate(a3)
-//   r4 := SignedSaturate(b0)
-//   r5 := SignedSaturate(b1)
-//   r6 := SignedSaturate(b2)
-//   r7 := SignedSaturate(b3)
-//
-// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
+// Converts the two unsigned 16-bit integers in the lower 32 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
 {
-    return vreinterpretq_m128i_s16(
-        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
-                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
+    uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
 }
 
-// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
-// integers and saturates.
-//
-//   r0 := UnsignedSaturate(a0)
-//   r1 := UnsignedSaturate(a1)
-//   r2 := UnsignedSaturate(a2)
-//   r3 := UnsignedSaturate(a3)
-//   r4 := UnsignedSaturate(b0)
-//   r5 := UnsignedSaturate(b1)
-//   r6 := UnsignedSaturate(b2)
-//   r7 := UnsignedSaturate(b3)
-FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
+// Converts the two unsigned 32-bit integers in the lower 64 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
 {
-    return vreinterpretq_m128i_u16(
-        vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
-                     vqmovun_s32(vreinterpretq_s32_m128i(b))));
+    return vreinterpretq_m128i_u64(
+        vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
 }
 
-// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
-// 8 signed or unsigned 8-bit integers in b.
-//
-//   r0 := a0
-//   r1 := b0
-//   r2 := a1
-//   r3 := b1
-//   ...
-//   r14 := a7
-//   r15 := b7
-//
-// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
+// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
+// and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16
+FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
 {
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s8(
-        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-#else
-    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
-    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
-    int8x8x2_t result = vzip_s8(a1, b1);
-    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
-#endif
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx HGFE DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
+    return vreinterpretq_m128i_u16(u16x8);
 }
 
-// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
-// lower 4 signed or unsigned 16-bit integers in b.
-//
-//   r0 := a0
-//   r1 := b0
-//   r2 := a1
-//   r3 := b1
-//   r4 := a2
-//   r5 := b2
-//   r6 := a3
-//   r7 := b3
-//
-// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
+// Converts the four unsigned 8-bit integers in the lower 32 bits to four
+// unsigned 32-bit integers.
+// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
+FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
 {
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(
-        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-#else
-    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
-    int16x4x2_t result = vzip_s16(a1, b1);
-    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
-#endif
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_u32(u32x4);
 }
 
-// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the
-// lower 2 signed or unsigned 32 - bit integers in b.
-//
-//   r0 := a0
-//   r1 := b0
-//   r2 := a1
-//   r3 := b1
-//
-// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
+// Converts the two unsigned 8-bit integers in the lower 16 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Conditionally multiply the packed double-precision (64-bit) floating-point
+// elements in a and b using the high 4 bits in imm8, sum the four products, and
+// conditionally store the sum in dst using the low 4 bits of imm8.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd
+FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
 {
+    // Generate mask value from constant immediate bit value
+    const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
+    const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
+#if !SSE2NEON_PRECISE_DP
+    const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
+    const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
+#endif
+    // Conditional multiplication
+#if !SSE2NEON_PRECISE_DP
+    __m128d mul = _mm_mul_pd(a, b);
+    const __m128d mulMask =
+        _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
+    __m128d tmp = _mm_and_pd(mul, mulMask);
+#else
 #if defined(__aarch64__)
-    return vreinterpretq_m128i_s32(
-        vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+    double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
+                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
+                             : 0;
+    double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
+                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
+                             : 0;
 #else
-    int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
-    int32x2x2_t result = vzip_s32(a1, b1);
-    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+    double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
+    double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
+#endif
+    __m128d tmp = _mm_set_pd(d1, d0);
 #endif
+    // Sum the products
+#if defined(__aarch64__)
+    double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
+#else
+    double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
+#endif
+    // Conditionally store the sum
+    const __m128d sumMask =
+        _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
+    __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
+    return res;
 }
 
-FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
+// Conditionally multiply the packed single-precision (32-bit) floating-point
+// elements in a and b using the high 4 bits in imm8, sum the four products,
+// and conditionally store the sum in dst using the low 4 bits of imm.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
+FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
 {
-    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
-    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
-    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
+#if defined(__aarch64__)
+    /* shortcuts */
+    if (imm == 0xFF) {
+        return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
+    }
+    if (imm == 0x7F) {
+        float32x4_t m = _mm_mul_ps(a, b);
+        m[3] = 0;
+        return _mm_set1_ps(vaddvq_f32(m));
+    }
+#endif
+
+    float s = 0, c = 0;
+    float32x4_t f32a = vreinterpretq_f32_m128(a);
+    float32x4_t f32b = vreinterpretq_f32_m128(b);
+
+    /* To improve the accuracy of floating-point summation, Kahan algorithm
+     * is used for each operation.
+     */
+    if (imm & (1 << 4))
+        _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
+    if (imm & (1 << 5))
+        _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
+    if (imm & (1 << 6))
+        _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
+    if (imm & (1 << 7))
+        _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
+    s += c;
+
+    float32x4_t res = {
+        (imm & 0x1) ? s : 0,
+        (imm & 0x2) ? s : 0,
+        (imm & 0x4) ? s : 0,
+        (imm & 0x8) ? s : 0,
+    };
+    return vreinterpretq_m128_f32(res);
 }
 
-// Selects and interleaves the lower two single-precision, floating-point values
-// from a and b.
-//
-//   r0 := a0
-//   r1 := b0
-//   r2 := a1
-//   r3 := b1
-//
-// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
+// Extracts the selected signed or unsigned 32-bit integer from a and zero
+// extends.
+// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
+#define _mm_extract_epi32(a, imm) \
+    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
+
+// Extracts the selected signed or unsigned 64-bit integer from a and zero
+// extends.
+// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
+#define _mm_extract_epi64(a, imm) \
+    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
+
+// Extracts the selected signed or unsigned 8-bit integer from a and zero
+// extends.
+// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8
+#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
+
+// Extracts the selected single-precision (32-bit) floating-point from a.
+// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
+#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
+
+// Round the packed double-precision (64-bit) floating-point elements in a down
+// to an integer value, and store the results as packed double-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd
+FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
 {
 #if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
 #else
-    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
-    float32x2x2_t result = vzip_f32(a1, b1);
-    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+    double *f = (double *) &a;
+    return _mm_set_pd(floor(f[1]), floor(f[0]));
 #endif
 }
 
-// Unpack and interleave double-precision (64-bit) floating-point elements from
-// the low half of a and b, and store the results in dst.
-//
-//   DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
-//     dst[63:0] := src1[63:0]
-//     dst[127:64] := src2[63:0]
-//     RETURN dst[127:0]
-//   }
-//   dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd
-FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
+// Round the packed single-precision (32-bit) floating-point elements in a down
+// to an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
+FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
 {
 #if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+    return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
 #else
-    return vreinterpretq_m128d_s64(
-        vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
-                     vget_low_s64(vreinterpretq_s64_m128d(b))));
+    float *f = (float *) &a;
+    return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
 #endif
 }
 
-// Unpack and interleave double-precision (64-bit) floating-point elements from
-// the high half of a and b, and store the results in dst.
-//
-//   DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
-//     dst[63:0] := src1[127:64]
-//     dst[127:64] := src2[127:64]
-//     RETURN dst[127:0]
-//   }
-//   dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd
-FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
+// Round the lower double-precision (64-bit) floating-point element in b down to
+// an integer value, store the result as a double-precision floating-point
+// element in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd
+FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
 {
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    return vreinterpretq_m128d_s64(
-        vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
-                     vget_high_s64(vreinterpretq_s64_m128d(b))));
-#endif
+    return _mm_move_sd(a, _mm_floor_pd(b));
 }
 
-// Selects and interleaves the upper two single-precision, floating-point values
-// from a and b.
+// Round the lower single-precision (32-bit) floating-point element in b down to
+// an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
 //
-//   r0 := a2
-//   r1 := b2
-//   r2 := a3
-//   r3 := b3
+//   dst[31:0] := FLOOR(b[31:0])
+//   dst[127:32] := a[127:32]
 //
-// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
+FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
 {
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
-    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
-    float32x2x2_t result = vzip_f32(a1, b1);
-    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
-#endif
+    return _mm_move_ss(a, _mm_floor_ps(b));
 }
 
-// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
-// 8 signed or unsigned 8-bit integers in b.
+// Inserts the least significant 32 bits of b into the selected 32-bit integer
+// of a.
+// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
+//                                       __constrange(0,4) int imm)
+#define _mm_insert_epi32(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s32(                                     \
+            vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
+    })
+
+// Inserts the least significant 64 bits of b into the selected 64-bit integer
+// of a.
+// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
+//                                       __constrange(0,2) int imm)
+#define _mm_insert_epi64(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s64(                                     \
+            vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
+    })
+
+// Inserts the least significant 8 bits of b into the selected 8-bit integer
+// of a.
+// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
+//                                      __constrange(0,16) int imm)
+#define _mm_insert_epi8(a, b, imm)                                 \
+    __extension__({                                                \
+        vreinterpretq_m128i_s8(                                    \
+            vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
+    })
+
+// Copy a to tmp, then insert a single-precision (32-bit) floating-point
+// element from b into tmp using the control in imm8. Store tmp to dst using
+// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=insert_ps
+#define _mm_insert_ps(a, b, imm8)                                              \
+    __extension__({                                                            \
+        float32x4_t tmp1 =                                                     \
+            vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3),               \
+                           vreinterpretq_f32_m128(a), 0);                      \
+        float32x4_t tmp2 =                                                     \
+            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
+                           ((imm8 >> 4) & 0x3));                               \
+        const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,        \
+                                  ((imm8) & (1 << 1)) ? UINT32_MAX : 0,        \
+                                  ((imm8) & (1 << 2)) ? UINT32_MAX : 0,        \
+                                  ((imm8) & (1 << 3)) ? UINT32_MAX : 0};       \
+        uint32x4_t mask = vld1q_u32(data);                                     \
+        float32x4_t all_zeros = vdupq_n_f32(0);                                \
+                                                                               \
+        vreinterpretq_m128_f32(                                                \
+            vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2)));         \
+    })
+
+// epi versions of min/max
+// Computes the pariwise maximums of the four signed 32-bit integer values of a
+// and b.
 //
-//   r0 := a8
-//   r1 := b8
-//   r2 := a9
-//   r3 := b9
-//   ...
-//   r14 := a15
-//   r15 := b15
+// A 128-bit parameter that can be defined with the following equations:
+//   r0 := (a0 > b0) ? a0 : b0
+//   r1 := (a1 > b1) ? a1 : b1
+//   r2 := (a2 > b2) ? a2 : b2
+//   r3 := (a3 > b3) ? a3 : b3
 //
-// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
+// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
+FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
 {
-#if defined(__aarch64__)
     return vreinterpretq_m128i_s8(
-        vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-#else
-    int8x8_t a1 =
-        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
-    int8x8_t b1 =
-        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
-    int8x8x2_t result = vzip_s8(a1, b1);
-    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
-#endif
+        vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 }
 
-// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
-// upper 4 signed or unsigned 16-bit integers in b.
+// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
+FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Computes the pariwise minima of the four signed 32-bit integer values of a
+// and b.
 //
-//   r0 := a4
-//   r1 := b4
-//   r2 := a5
-//   r3 := b5
-//   r4 := a6
-//   r5 := b6
-//   r6 := a7
-//   r7 := b7
+// A 128-bit parameter that can be defined with the following equations:
+//   r0 := (a0 < b0) ? a0 : b0
+//   r1 := (a1 < b1) ? a1 : b1
+//   r2 := (a2 < b2) ? a2 : b2
+//   r3 := (a3 < b3) ? a3 : b3
 //
-// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
+// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
 {
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(
-        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-#else
-    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
-    int16x4x2_t result = vzip_s16(a1, b1);
-    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
-#endif
+    return vreinterpretq_m128i_s32(
+        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
-// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
-// upper 2 signed or unsigned 32-bit integers in b.
-// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
+// Compare packed signed 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
+FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
 {
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s32(
-        vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-#else
-    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
-    int32x2x2_t result = vzip_s32(a1, b1);
-    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
-#endif
+    return vreinterpretq_m128i_s8(
+        vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 }
 
-// Interleaves the upper signed or unsigned 64-bit integer in a with the
-// upper signed or unsigned 64-bit integer in b.
-//
-//   r0 := a1
-//   r1 := b1
-FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
+// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
+FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
 {
-    int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
-    int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
-    return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
+    return vreinterpretq_m128i_u16(
+        vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
 }
 
 // Horizontally compute the minimum amongst the packed unsigned 16-bit integers
@@ -6296,6 +7929,339 @@ FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
     return dst;
 }
 
+// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
+// 8-bit integers in a compared to those in b, and store the 16-bit results in
+// dst. Eight SADs are performed using one quadruplet from b and eight
+// quadruplets from a. One quadruplet is selected from b starting at on the
+// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
+// integers selected from a starting at the offset specified in imm8.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8
+FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
+{
+    uint8x16_t _a, _b;
+
+    switch (imm & 0x4) {
+    case 0:
+        // do nothing
+        _a = vreinterpretq_u8_m128i(a);
+        break;
+    case 4:
+        _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
+                                            vreinterpretq_u32_m128i(a), 1));
+        break;
+    default:
+#if defined(__GNUC__) || defined(__clang__)
+        __builtin_unreachable();
+#endif
+        break;
+    }
+
+    switch (imm & 0x3) {
+    case 0:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
+        break;
+    case 1:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
+        break;
+    case 2:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
+        break;
+    case 3:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
+        break;
+    default:
+#if defined(__GNUC__) || defined(__clang__)
+        __builtin_unreachable();
+#endif
+        break;
+    }
+
+    int16x8_t c04, c15, c26, c37;
+    uint8x8_t low_b = vget_low_u8(_b);
+    c04 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
+    _a = vextq_u8(_a, _a, 1);
+    c15 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
+    _a = vextq_u8(_a, _a, 1);
+    c26 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
+    _a = vextq_u8(_a, _a, 1);
+    c37 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
+#if defined(__aarch64__)
+    // |0|4|2|6|
+    c04 = vpaddq_s16(c04, c26);
+    // |1|5|3|7|
+    c15 = vpaddq_s16(c15, c37);
+
+    int32x4_t trn1_c =
+        vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
+    int32x4_t trn2_c =
+        vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
+    return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
+                                              vreinterpretq_s16_s32(trn2_c)));
+#else
+    int16x4_t c01, c23, c45, c67;
+    c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
+    c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
+    c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
+    c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
+
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
+#endif
+}
+
+// Multiply the low signed 32-bit integers from each packed 64-bit element in
+// a and b, and store the signed 64-bit results in dst.
+//
+//   r0 :=  (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
+//   r1 :=  (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
+FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
+{
+    // vmull_s32 upcasts instead of masking, so we downcast.
+    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
+    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
+}
+
+// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
+// unsigned 32-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
+// integers and saturates.
+//
+//   r0 := UnsignedSaturate(a0)
+//   r1 := UnsignedSaturate(a1)
+//   r2 := UnsignedSaturate(a2)
+//   r3 := UnsignedSaturate(a3)
+//   r4 := UnsignedSaturate(b0)
+//   r5 := UnsignedSaturate(b1)
+//   r6 := UnsignedSaturate(b2)
+//   r7 := UnsignedSaturate(b3)
+FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
+                     vqmovun_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Round the packed double-precision (64-bit) floating-point elements in a using
+// the rounding parameter, and store the results as packed double-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd
+FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
+{
+#if defined(__aarch64__)
+    switch (rounding) {
+    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
+    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+        return _mm_floor_pd(a);
+    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+        return _mm_ceil_pd(a);
+    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
+    default:  //_MM_FROUND_CUR_DIRECTION
+        return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
+    }
+#else
+    double *v_double = (double *) &a;
+
+    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
+        (rounding == _MM_FROUND_CUR_DIRECTION &&
+         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
+        double res[2], tmp;
+        for (int i = 0; i < 2; i++) {
+            tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
+            double roundDown = floor(tmp);  // Round down value
+            double roundUp = ceil(tmp);     // Round up value
+            double diffDown = tmp - roundDown;
+            double diffUp = roundUp - tmp;
+            if (diffDown < diffUp) {
+                /* If it's closer to the round down value, then use it */
+                res[i] = roundDown;
+            } else if (diffDown > diffUp) {
+                /* If it's closer to the round up value, then use it */
+                res[i] = roundUp;
+            } else {
+                /* If it's equidistant between round up and round down value,
+                 * pick the one which is an even number */
+                double half = roundDown / 2;
+                if (half != floor(half)) {
+                    /* If the round down value is odd, return the round up value
+                     */
+                    res[i] = roundUp;
+                } else {
+                    /* If the round up value is odd, return the round down value
+                     */
+                    res[i] = roundDown;
+                }
+            }
+            res[i] = (v_double[i] < 0) ? -res[i] : res[i];
+        }
+        return _mm_set_pd(res[1], res[0]);
+    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
+        return _mm_floor_pd(a);
+    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
+        return _mm_ceil_pd(a);
+    }
+    return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
+                      v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
+#endif
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a using
+// the rounding parameter, and store the results as packed single-precision
+// floating-point elements in dst.
+// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
+FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
+{
+#if defined(__aarch64__)
+    switch (rounding) {
+    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
+    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+        return _mm_floor_ps(a);
+    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+        return _mm_ceil_ps(a);
+    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
+    default:  //_MM_FROUND_CUR_DIRECTION
+        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
+    }
+#else
+    float *v_float = (float *) &a;
+
+    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
+        (rounding == _MM_FROUND_CUR_DIRECTION &&
+         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
+        uint32x4_t signmask = vdupq_n_u32(0x80000000);
+        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
+                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
+        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
+            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
+        int32x4_t r_trunc = vcvtq_s32_f32(
+            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
+        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
+            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
+        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
+                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
+        float32x4_t delta = vsubq_f32(
+            vreinterpretq_f32_m128(a),
+            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
+        uint32x4_t is_delta_half =
+            vceqq_f32(delta, half); /* delta == +/- 0.5 */
+        return vreinterpretq_m128_f32(
+            vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
+    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
+        return _mm_floor_ps(a);
+    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
+        return _mm_ceil_ps(a);
+    }
+    return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
+                      v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
+                      v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
+                      v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
+#endif
+}
+
+// Round the lower double-precision (64-bit) floating-point element in b using
+// the rounding parameter, store the result as a double-precision floating-point
+// element in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd
+FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
+{
+    return _mm_move_sd(a, _mm_round_pd(b, rounding));
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b using
+// the rounding parameter, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst. Rounding is done according to the
+// rounding[3:0] parameter, which can be one of:
+//     (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and
+//     suppress exceptions
+//     (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and
+//     suppress exceptions
+//     (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress
+//     exceptions
+//     (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress
+//     exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
+//     _MM_SET_ROUNDING_MODE
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss
+FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
+{
+    return _mm_move_ss(a, _mm_round_ps(b, rounding));
+}
+
+// Load 128-bits of integer data from memory into dst using a non-temporal
+// memory hint. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+//   dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
+FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    return __builtin_nontemporal_load(p);
+#else
+    return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
+#endif
+}
+
+// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
+// all 1's, and return 1 if the result is zero, otherwise return 0.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
+FORCE_INLINE int _mm_test_all_ones(__m128i a)
+{
+    return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
+           ~(uint64_t) 0;
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and
+// mask, and return 1 if the result is zero, otherwise return 0.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
+FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
+{
+    int64x2_t a_and_mask =
+        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
+    return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and
+// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
+// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
+// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
+// otherwise return 0.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_test_mix_ones_zero
+FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
+{
+    uint64x2_t zf =
+        vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
+    uint64x2_t cf =
+        vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
+    uint64x2_t result = vandq_u64(zf, cf);
+    return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
+}
+
 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
@@ -6312,6 +8278,14 @@ FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
+// otherwise return 0.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128
+#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
 // otherwise set CF to 0. Return the ZF value.
 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
@@ -6321,299 +8295,93 @@ FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
     return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
 }
 
-// Extracts the selected signed or unsigned 8-bit integer from a and zero
-// extends.
-// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
-#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
-
-// Inserts the least significant 8 bits of b into the selected 8-bit integer
-// of a.
-// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
-//                                      __constrange(0,16) int imm)
-#define _mm_insert_epi8(a, b, imm)                                 \
-    __extension__({                                                \
-        vreinterpretq_m128i_s8(                                    \
-            vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
-    })
-
-// Extracts the selected signed or unsigned 16-bit integer from a and zero
-// extends.
-// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
-// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
-#define _mm_extract_epi16(a, imm) \
-    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
-
-// Inserts the least significant 16 bits of b into the selected 16-bit integer
-// of a.
-// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
-// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
-//                                       __constrange(0,8) int imm)
-#define _mm_insert_epi16(a, b, imm)                                  \
-    __extension__({                                                  \
-        vreinterpretq_m128i_s16(                                     \
-            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
-    })
-
-// Copy a to dst, and insert the 16-bit integer i into dst at the location
-// specified by imm8.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
-#define _mm_insert_pi16(a, b, imm)                               \
-    __extension__({                                              \
-        vreinterpret_m64_s16(                                    \
-            vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
-    })
-
-// Extracts the selected signed or unsigned 32-bit integer from a and zero
-// extends.
-// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
-#define _mm_extract_epi32(a, imm) \
-    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
-
-// Extracts the selected single-precision (32-bit) floating-point from a.
-// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
-#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
-
-// Inserts the least significant 32 bits of b into the selected 32-bit integer
-// of a.
-// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
-//                                       __constrange(0,4) int imm)
-#define _mm_insert_epi32(a, b, imm)                                  \
-    __extension__({                                                  \
-        vreinterpretq_m128i_s32(                                     \
-            vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
-    })
-
-// Extracts the selected signed or unsigned 64-bit integer from a and zero
-// extends.
-// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
-#define _mm_extract_epi64(a, imm) \
-    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
-
-// Inserts the least significant 64 bits of b into the selected 64-bit integer
-// of a.
-// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
-//                                       __constrange(0,2) int imm)
-#define _mm_insert_epi64(a, b, imm)                                  \
-    __extension__({                                                  \
-        vreinterpretq_m128i_s64(                                     \
-            vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
-    })
+/* SSE4.2 */
 
-// Count the number of bits set to 1 in unsigned 32-bit integer a, and
-// return that count in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
-FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
+// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
+// in b for greater than.
+FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
 {
 #if defined(__aarch64__)
-#if __has_builtin(__builtin_popcount)
-    return __builtin_popcount(a);
+    return vreinterpretq_m128i_u64(
+        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
 #else
-    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
-#endif
-#else
-    uint32_t count = 0;
-    uint8x8_t input_val, count8x8_val;
-    uint16x4_t count16x4_val;
-    uint32x2_t count32x2_val;
-
-    input_val = vld1_u8((uint8_t *) &a);
-    count8x8_val = vcnt_u8(input_val);
-    count16x4_val = vpaddl_u8(count8x8_val);
-    count32x2_val = vpaddl_u16(count16x4_val);
-
-    vst1_u32(&count, count32x2_val);
-    return count;
+    return vreinterpretq_m128i_s64(vshrq_n_s64(
+        vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
+        63));
 #endif
 }
 
-// Count the number of bits set to 1 in unsigned 64-bit integer a, and
-// return that count in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
-FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 16-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
+FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
 {
-#if defined(__aarch64__)
-#if __has_builtin(__builtin_popcountll)
-    return __builtin_popcountll(a);
-#else
-    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
-#endif
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
 #else
-    uint64_t count = 0;
-    uint8x8_t input_val, count8x8_val;
-    uint16x4_t count16x4_val;
-    uint32x2_t count32x2_val;
-    uint64x1_t count64x1_val;
-
-    input_val = vld1_u8((uint8_t *) &a);
-    count8x8_val = vcnt_u8(input_val);
-    count16x4_val = vpaddl_u8(count8x8_val);
-    count32x2_val = vpaddl_u16(count16x4_val);
-    count64x1_val = vpaddl_u32(count32x2_val);
-    vst1_u64(&count, count64x1_val);
-    return count;
+    crc = _mm_crc32_u8(crc, v & 0xff);
+    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
 #endif
+    return crc;
 }
 
-// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
-// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
-// transposed matrix in these vectors (row0 now contains column 0, etc.).
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
-#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
-    do {                                                  \
-        float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
-        float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
-        row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
-                            vget_low_f32(ROW23.val[0]));  \
-        row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
-                            vget_low_f32(ROW23.val[1]));  \
-        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
-                            vget_high_f32(ROW23.val[0])); \
-        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
-                            vget_high_f32(ROW23.val[1])); \
-    } while (0)
-
-/* Crypto Extensions */
-
-#if defined(__ARM_FEATURE_CRYPTO)
-// Wraps vmull_p64
-FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
-{
-    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
-    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
-    return vreinterpretq_u64_p128(vmull_p64(a, b));
-}
-#else  // ARMv7 polyfill
-// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
-//
-// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
-// 64-bit->128-bit polynomial multiply.
-//
-// It needs some work and is somewhat slow, but it is still faster than all
-// known scalar methods.
-//
-// Algorithm adapted to C from
-// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
-// from "Fast Software Polynomial Multiplication on ARM Processors Using the
-// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
-// (https://hal.inria.fr/hal-01506572)
-static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 32-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
+FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
 {
-    poly8x8_t a = vreinterpret_p8_u64(_a);
-    poly8x8_t b = vreinterpret_p8_u64(_b);
-
-    // Masks
-    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
-                                    vcreate_u8(0x00000000ffffffff));
-    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
-                                    vcreate_u8(0x0000000000000000));
-
-    // Do the multiplies, rotating with vext to get all combinations
-    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
-    uint8x16_t e =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
-    uint8x16_t f =
-        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
-    uint8x16_t g =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
-    uint8x16_t h =
-        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
-    uint8x16_t i =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
-    uint8x16_t j =
-        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
-    uint8x16_t k =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4
-
-    // Add cross products
-    uint8x16_t l = veorq_u8(e, f);  // L = E + F
-    uint8x16_t m = veorq_u8(g, h);  // M = G + H
-    uint8x16_t n = veorq_u8(i, j);  // N = I + J
-
-    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
-    // instructions.
-#if defined(__aarch64__)
-    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
-        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
-    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
-        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
-    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
-        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
-    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
-        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
 #else
-    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
-    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
-    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
-    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
+    crc = _mm_crc32_u16(crc, v & 0xffff);
+    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
 #endif
-    // t0 = (L) (P0 + P1) << 8
-    // t1 = (M) (P2 + P3) << 16
-    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
-    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
-    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
-
-    // t2 = (N) (P4 + P5) << 24
-    // t3 = (K) (P6 + P7) << 32
-    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
-    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
-    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
+    return crc;
+}
 
-    // De-interleave
-#if defined(__aarch64__)
-    uint8x16_t t0 = vreinterpretq_u8_u64(
-        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
-    uint8x16_t t1 = vreinterpretq_u8_u64(
-        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
-    uint8x16_t t2 = vreinterpretq_u8_u64(
-        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
-    uint8x16_t t3 = vreinterpretq_u8_u64(
-        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 64-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
+FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
 #else
-    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
-    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
-    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
-    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
+    crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
+    crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
 #endif
-    // Shift the cross products
-    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
-    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
-    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
-    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32
-
-    // Accumulate the products
-    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
-    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
-    uint8x16_t mix = veorq_u8(d, cross1);
-    uint8x16_t r = veorq_u8(mix, cross2);
-    return vreinterpretq_u64_u8(r);
+    return crc;
 }
-#endif  // ARMv7 polyfill
 
-// Perform a carry-less multiplication of two 64-bit integers, selected from a
-// and b according to imm8, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
-FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 8-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
 {
-    uint64x2_t a = vreinterpretq_u64_m128i(_a);
-    uint64x2_t b = vreinterpretq_u64_m128i(_b);
-    switch (imm & 0x11) {
-    case 0x00:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
-    case 0x01:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
-    case 0x10:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
-    case 0x11:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
-    default:
-        abort();
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#else
+    crc ^= v;
+    for (int bit = 0; bit < 8; bit++) {
+        if (crc & 1)
+            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
+        else
+            crc = (crc >> 1);
     }
+#endif
+    return crc;
 }
 
+/* AES */
+
 #if !defined(__ARM_FEATURE_CRYPTO)
 /* clang-format off */
 #define SSE2NEON_AES_DATA(w)                                           \
@@ -6752,22 +8520,22 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
 {
     /* FIXME: optimized for NEON */
     uint8_t v[4][4] = {
-        [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
-               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
-               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
-               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
-        [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
-               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
-               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
-               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
-        [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
-               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
-               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
-               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
-        [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
-               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
-               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
-               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
+        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
+        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
+        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
+        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
     };
     for (int i = 0; i < 16; i++)
         vreinterpretq_nth_u8_m128i(a, i) =
@@ -6833,155 +8601,134 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
 }
 #endif
 
-/* Streaming Extensions */
+/* Others */
 
-// Guarantees that every preceding store is globally visible before any
-// subsequent store.
-// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_sfence(void)
+// Perform a carry-less multiplication of two 64-bit integers, selected from a
+// and b according to imm8, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
+FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
 {
-    __sync_synchronize();
+    uint64x2_t a = vreinterpretq_u64_m128i(_a);
+    uint64x2_t b = vreinterpretq_u64_m128i(_b);
+    switch (imm & 0x11) {
+    case 0x00:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
+    case 0x01:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
+    case 0x10:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
+    case 0x11:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
+    default:
+        abort();
+    }
 }
 
-// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
-// point elements) from a into memory using a non-temporal memory hint.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
-FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
+FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
 {
-#if __has_builtin(__builtin_nontemporal_store)
-    __builtin_nontemporal_store(a, (float32x4_t *) p);
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
 #else
-    vst1q_f32(p, vreinterpretq_f32_m128(a));
+        uint32_t value;
 #endif
-}
+    } r;
 
-// Stores the data in a to the address p without polluting the caches.  If the
-// cache line containing address p is already in the cache, the cache will be
-// updated.
-// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
-{
-#if __has_builtin(__builtin_nontemporal_store)
-    __builtin_nontemporal_store(a, p);
+#if defined(__aarch64__)
+    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
 #else
-    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
+    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
 #endif
+
+    return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
 }
 
-// Load 128-bits of integer data from memory into dst using a non-temporal
-// memory hint. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-//
-//   dst[127:0] := MEM[mem_addr+127:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
-FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
+// Count the number of bits set to 1 in unsigned 32-bit integer a, and
+// return that count in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
+FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
 {
-#if __has_builtin(__builtin_nontemporal_store)
-    return __builtin_nontemporal_load(p);
+#if defined(__aarch64__)
+#if __has_builtin(__builtin_popcount)
+    return __builtin_popcount(a);
 #else
-    return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
+    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
 #endif
-}
-
-// Cache line containing p is flushed and invalidated from all caches in the
-// coherency domain. :
-// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
-FORCE_INLINE void _mm_clflush(void const *p)
-{
-    (void) p;
-    // no corollary for Neon?
-}
+#else
+    uint32_t count = 0;
+    uint8x8_t input_val, count8x8_val;
+    uint16x4_t count16x4_val;
+    uint32x2_t count32x2_val;
 
-// Allocate aligned blocks of memory.
-// https://software.intel.com/en-us/
-//         cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
-FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
-{
-    void *ptr;
-    if (align == 1)
-        return malloc(size);
-    if (align == 2 || (sizeof(void *) == 8 && align == 4))
-        align = sizeof(void *);
-    if (!posix_memalign(&ptr, align, size))
-        return ptr;
-    return NULL;
-}
+    input_val = vld1_u8((uint8_t *) &a);
+    count8x8_val = vcnt_u8(input_val);
+    count16x4_val = vpaddl_u8(count8x8_val);
+    count32x2_val = vpaddl_u16(count16x4_val);
 
-// Free aligned memory that was allocated with _mm_malloc.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free
-FORCE_INLINE void _mm_free(void *addr)
-{
-    free(addr);
+    vst1_u32(&count, count32x2_val);
+    return count;
+#endif
 }
 
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 8-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
-FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
+// Count the number of bits set to 1 in unsigned 64-bit integer a, and
+// return that count in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
+FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
 {
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
+#if defined(__aarch64__)
+#if __has_builtin(__builtin_popcountll)
+    return __builtin_popcountll(a);
 #else
-    crc ^= v;
-    for (int bit = 0; bit < 8; bit++) {
-        if (crc & 1)
-            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
-        else
-            crc = (crc >> 1);
-    }
+    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
+#endif
+#else
+    uint64_t count = 0;
+    uint8x8_t input_val, count8x8_val;
+    uint16x4_t count16x4_val;
+    uint32x2_t count32x2_val;
+    uint64x1_t count64x1_val;
+
+    input_val = vld1_u8((uint8_t *) &a);
+    count8x8_val = vcnt_u8(input_val);
+    count16x4_val = vpaddl_u8(count8x8_val);
+    count32x2_val = vpaddl_u16(count16x4_val);
+    count64x1_val = vpaddl_u32(count32x2_val);
+    vst1_u64(&count, count64x1_val);
+    return count;
 #endif
-    return crc;
 }
 
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 16-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
-FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
+FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
 {
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
+    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
+    // regardless of the value of the FZ bit.
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
 #else
-    crc = _mm_crc32_u8(crc, v & 0xff);
-    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
+        uint32_t value;
 #endif
-    return crc;
-}
+    } r;
 
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 32-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
-FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
+#if defined(__aarch64__)
+    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
 #else
-    crc = _mm_crc32_u16(crc, v & 0xffff);
-    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
+    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
 #endif
-    return crc;
-}
 
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 64-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
-FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
+    r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
+
+#if defined(__aarch64__)
+    asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
 #else
-    crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
-    crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
+    asm volatile("vmsr FPSCR, %0" ::"r"(r));        /* write */
 #endif
-    return crc;
 }
 
 #if defined(__GNUC__) || defined(__clang__)
@@ -6993,4 +8740,4 @@ FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
 #pragma GCC pop_options
 #endif
 
-#endif
+#endif
+\ No newline at end of file
diff --git a/thirdparty/embree/common/simd/simd.h b/thirdparty/embree/common/simd/simd.h
index 195506b530..34e37b08b1 100644
--- a/thirdparty/embree/common/simd/simd.h
+++ b/thirdparty/embree/common/simd/simd.h
@@ -6,7 +6,7 @@
 #include "../math/math.h"
 
 /* include SSE wrapper classes */
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
 #  include "sse.h"
 #endif
 
diff --git a/thirdparty/embree/common/simd/sse.h b/thirdparty/embree/common/simd/sse.h
index 1465fb4fb0..04d90533dd 100644
--- a/thirdparty/embree/common/simd/sse.h
+++ b/thirdparty/embree/common/simd/sse.h
@@ -11,7 +11,7 @@
 
 namespace embree 
 {
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
   __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { 
     return _mm_blendv_ps(f,t,mask);
   }
diff --git a/thirdparty/embree/common/simd/vboold4_avx.h b/thirdparty/embree/common/simd/vboold4_avx.h
index 7db0d1c5c1..450bd7a4eb 100644
--- a/thirdparty/embree/common/simd/vboold4_avx.h
+++ b/thirdparty/embree/common/simd/vboold4_avx.h
@@ -62,7 +62,11 @@ namespace embree
     ////////////////////////////////////////////////////////////////////////////////
 
     __forceinline vboold(FalseTy) : v(_mm256_setzero_pd()) {}
+#if !defined(__aarch64__)
     __forceinline vboold(TrueTy)  : v(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ)) {}
+#else
+    __forceinline vboold(TrueTy)  : v(_mm256_cmpeq_pd(_mm256_setzero_pd(), _mm256_setzero_pd())) {}
+#endif
 
     ////////////////////////////////////////////////////////////////////////////////
     /// Array Access
@@ -107,9 +111,10 @@ namespace embree
   /// Movement/Shifting/Shuffling Functions
   ////////////////////////////////////////////////////////////////////////////////
 
+#if !defined(__aarch64__)
   __forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a, b); }
   __forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a, b); }
-
+#endif
 
 #if defined(__AVX2__)
   template<int i0, int i1, int i2, int i3>
diff --git a/thirdparty/embree/common/simd/vboolf16_avx512.h b/thirdparty/embree/common/simd/vboolf16_avx512.h
index 19841dcea8..86b718f025 100644
--- a/thirdparty/embree/common/simd/vboolf16_avx512.h
+++ b/thirdparty/embree/common/simd/vboolf16_avx512.h
@@ -116,7 +116,7 @@ namespace embree
   __forceinline size_t popcnt  (const vboolf16& a) { return popcnt(a.v); }
   
   ////////////////////////////////////////////////////////////////////////////////
-  /// Convertion Operations
+  /// Conversion Operations
   ////////////////////////////////////////////////////////////////////////////////
 
   __forceinline unsigned int toInt (const vboolf16& a) { return mm512_mask2int(a); }
diff --git a/thirdparty/embree/common/simd/vboolf4_sse2.h b/thirdparty/embree/common/simd/vboolf4_sse2.h
index fa84b1b6ee..9e0fdf5c6f 100644
--- a/thirdparty/embree/common/simd/vboolf4_sse2.h
+++ b/thirdparty/embree/common/simd/vboolf4_sse2.h
@@ -36,9 +36,11 @@ namespace embree
 
     __forceinline vboolf(__m128 input) : v(input) {}
     __forceinline operator const __m128&() const { return v; }
+    #if !defined(__EMSCRIPTEN__)
     __forceinline operator const __m128i() const { return _mm_castps_si128(v); }
     __forceinline operator const __m128d() const { return _mm_castps_pd(v); }
-    
+    #endif
+
     __forceinline vboolf(bool a)
       : v(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
     __forceinline vboolf(bool a, bool b)
@@ -100,7 +102,7 @@ namespace embree
   __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); }
   
   __forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) {
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
     return _mm_blendv_ps(f, t, m); 
 #else
     return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); 
@@ -114,6 +116,17 @@ namespace embree
   __forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a, b); }
   __forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a, b); }
 
+#if defined(__aarch64__)
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vboolf4 shuffle(const vboolf4& v) {
+    return vreinterpretq_f32_u8(vqtbl1q_u8( vreinterpretq_u8_s32(v), _MN_SHUFFLE(i0, i1, i2, i3)));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) {
+    return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
+  }
+#else
   template<int i0, int i1, int i2, int i3>
   __forceinline vboolf4 shuffle(const vboolf4& v) {
     return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)));
@@ -123,6 +136,7 @@ namespace embree
   __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) {
     return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
   }
+#endif
 
   template<int i0>
   __forceinline vboolf4 shuffle(const vboolf4& v) {
@@ -135,7 +149,7 @@ namespace embree
   template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v)); }
 #endif
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) && !defined(__aarch64__)
   template<int dst, int src, int clr> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
   template<int dst, int src> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return insert<dst, src, 0>(a, b); }
   template<int dst> __forceinline vboolf4 insert(const vboolf4& a, const bool b) { return insert<dst, 0>(a, vboolf4(b)); }
@@ -157,7 +171,9 @@ namespace embree
   __forceinline bool none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); }
   
   __forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a); }
-#if defined(__SSE4_2__)
+#if defined(__aarch64__)
+  __forceinline size_t popcnt(const vboolf4& a) { return vaddvq_s32(vandq_u32(vreinterpretq_u32_f32(a.v),_mm_set1_epi32(1))); }
+#elif defined(__SSE4_2__)
   __forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a)); }
 #else
   __forceinline size_t popcnt(const vboolf4& a) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); }
diff --git a/thirdparty/embree/common/simd/vboolf8_avx.h b/thirdparty/embree/common/simd/vboolf8_avx.h
index ba77cc3c5e..18cede19c6 100644
--- a/thirdparty/embree/common/simd/vboolf8_avx.h
+++ b/thirdparty/embree/common/simd/vboolf8_avx.h
@@ -76,7 +76,7 @@ namespace embree
     ////////////////////////////////////////////////////////////////////////////////
 
     __forceinline vboolf(FalseTy) : v(_mm256_setzero_ps()) {}
-    __forceinline vboolf(TrueTy)  : v(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_OQ)) {}
+    __forceinline vboolf(TrueTy)  : v(_mm256_castsi256_ps(_mm256_set1_epi32(0xFFFFFFFF))) {}
 
     ////////////////////////////////////////////////////////////////////////////////
     /// Array Access
diff --git a/thirdparty/embree/common/simd/vdouble4_avx.h b/thirdparty/embree/common/simd/vdouble4_avx.h
index 55326de7dd..208bb7ac99 100644
--- a/thirdparty/embree/common/simd/vdouble4_avx.h
+++ b/thirdparty/embree/common/simd/vdouble4_avx.h
@@ -189,13 +189,20 @@ namespace embree
   __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GE); }
   __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GT); }
   __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LE); }
-#else
+#elif !defined(__aarch64__)
   __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ);  }
   __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); }
   __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS);  }
   __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); }
   __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); }
   __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS);  }
+#else
+  __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmpeq_pd(a, b);  }
+  __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpneq_pd(a, b); }
+  __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmplt_pd(a, b);  }
+  __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpnlt_pd(a, b); }
+  __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmpnle_pd(a, b); }
+  __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmple_pd(a, b);  }
 #endif
 
   __forceinline vboold4 operator ==(const vdouble4& a, double          b) { return a == vdouble4(b); }
diff --git a/thirdparty/embree/common/simd/vfloat16_avx512.h b/thirdparty/embree/common/simd/vfloat16_avx512.h
index 9f1e2459c4..75c471cc0c 100644
--- a/thirdparty/embree/common/simd/vfloat16_avx512.h
+++ b/thirdparty/embree/common/simd/vfloat16_avx512.h
@@ -177,9 +177,10 @@ namespace embree
   __forceinline vfloat16 abs    (const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x7FFFFFFF))); }
   __forceinline vfloat16 signmsk(const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x80000000))); }
 
-  __forceinline vfloat16 rcp(const vfloat16& a) {
+  __forceinline vfloat16 rcp(const vfloat16& a)
+  {
     const vfloat16 r = _mm512_rcp14_ps(a);
-    return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vfloat16(2.0f)));
+    return _mm512_fmadd_ps(r, _mm512_fnmadd_ps(a, r, vfloat16(1.0)), r);  // computes r + r * (1 - a*r)
   }
 
   __forceinline vfloat16 sqr (const vfloat16& a) { return _mm512_mul_ps(a,a); }
diff --git a/thirdparty/embree/common/simd/vfloat4_sse2.h b/thirdparty/embree/common/simd/vfloat4_sse2.h
index 5215bf9730..6d7e11fe72 100644
--- a/thirdparty/embree/common/simd/vfloat4_sse2.h
+++ b/thirdparty/embree/common/simd/vfloat4_sse2.h
@@ -42,6 +42,11 @@ namespace embree
     __forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {}
 
     __forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {}
+#if defined(__aarch64__)
+    __forceinline explicit vfloat(const vuint4& x) {
+        v = vcvtq_f32_u32(vreinterpretq_u32_s32(x.v));
+    }
+#else
     __forceinline explicit vfloat(const vuint4& x) {
       const __m128i a   = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF));
       const __m128i b   = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31 
@@ -49,7 +54,7 @@ namespace embree
       const __m128  bf  = _mm_castsi128_ps(b);  
       v  = _mm_add_ps(af,bf);
     }
-
+#endif
     ////////////////////////////////////////////////////////////////////////////////
     /// Constants
     ////////////////////////////////////////////////////////////////////////////////
@@ -107,7 +112,11 @@ namespace embree
 #endif
   }
 
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+    static __forceinline vfloat4 load(const char* ptr) {
+        return __m128(_mm_load4epi8_f32(((__m128i*)ptr)));
+    }
+#elif defined(__SSE4_1__)
     static __forceinline vfloat4 load(const char* ptr) {
       return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr)));
     }
@@ -117,7 +126,11 @@ namespace embree
     }
 #endif
 
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+    static __forceinline vfloat4 load(const unsigned char* ptr) {
+        return __m128(_mm_load4epu8_f32(((__m128i*)ptr)));
+    }
+#elif defined(__SSE4_1__)
     static __forceinline vfloat4 load(const unsigned char* ptr) {
       return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)));
     }
@@ -128,7 +141,11 @@ namespace embree
     }
 #endif
 
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+    static __forceinline vfloat4 load(const short* ptr) {
+        return __m128(_mm_load4epi16_f32(((__m128i*)ptr)));
+    }
+#elif defined(__SSE4_1__)
     static __forceinline vfloat4 load(const short* ptr) {
       return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr)));
     }
@@ -145,15 +162,19 @@ namespace embree
     static __forceinline void store_nt(void* ptr, const vfloat4& v)
     {
 #if defined (__SSE4_1__)
+#if defined(__aarch64__)
       _mm_stream_ps((float*)ptr,v);
 #else
+      _mm_stream_ps((float*)ptr,v);
+#endif
+#else
       _mm_store_ps((float*)ptr,v);
 #endif
     }
 
     template<int scale = 4>
     static __forceinline vfloat4 gather(const float* ptr, const vint4& index) {
-#if defined(__AVX2__)
+#if defined(__AVX2__) && !defined(__aarch64__)
       return _mm_i32gather_ps(ptr, index, scale);
 #else
       return vfloat4(
@@ -169,7 +190,7 @@ namespace embree
       vfloat4 r = zero;
 #if defined(__AVX512VL__)
       return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale);
-#elif defined(__AVX2__)
+#elif defined(__AVX2__)  && !defined(__aarch64__)
       return _mm_mask_i32gather_ps(r, ptr, index, mask, scale);
 #else
       if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]);
@@ -223,8 +244,8 @@ namespace embree
     friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) {
 #if defined(__AVX512VL__)
       return _mm_mask_blend_ps(m, f, t);
-#elif defined(__SSE4_1__)
-      return _mm_blendv_ps(f, t, m); 
+#elif defined(__SSE4_1__) || (defined(__aarch64__))
+      return _mm_blendv_ps(f, t, m);
 #else
       return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); 
 #endif
@@ -256,18 +277,34 @@ namespace embree
   __forceinline vfloat4 toFloat(const vint4&   a) { return vfloat4(a); }
 
   __forceinline vfloat4 operator +(const vfloat4& a) { return a; }
+#if defined(__aarch64__)
+  __forceinline vfloat4 operator -(const vfloat4& a) {
+    return vnegq_f32(a);
+  }
+#else
   __forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
+#endif
 
+#if defined(__aarch64__)
+  __forceinline vfloat4 abs(const vfloat4& a) { return _mm_abs_ps(a); }
+#else
   __forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); }
+#endif
+
 #if defined(__AVX512VL__)
   __forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); }
 #else
   __forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); }
 #endif
+
   __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
-  
+
   __forceinline vfloat4 rcp(const vfloat4& a)
   {
+#if defined(__aarch64__)
+    return vfloat4(vdivq_f32(vdupq_n_f32(1.0f),a.v));
+#else
+
 #if defined(__AVX512VL__)
     const vfloat4 r = _mm_rcp14_ps(a);
 #else
@@ -275,30 +312,39 @@ namespace embree
 #endif
 
 #if defined(__AVX2__)
-    return _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f)));
+    return _mm_fmadd_ps(r, _mm_fnmadd_ps(a, r, vfloat4(1.0f)), r);                    // computes r + r * (1 - a * r)
 #else
-    return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a)));
+    return _mm_add_ps(r,_mm_mul_ps(r, _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r))));  // computes r + r * (1 - a * r)
 #endif
+
+#endif  //defined(__aarch64__)
   }
   __forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); }
   __forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); }
 
   __forceinline vfloat4 rsqrt(const vfloat4& a)
   {
+#if defined(__aarch64__)
+    vfloat4 r = _mm_rsqrt_ps(a);
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
+    return r;
+#else
+
 #if defined(__AVX512VL__)
     vfloat4 r = _mm_rsqrt14_ps(a);
 #else
     vfloat4 r = _mm_rsqrt_ps(a);
 #endif
 
-#if defined(__ARM_NEON)
-    r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
-    r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
-#elif defined(__AVX2__)
+#if defined(__AVX2__)
     r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
 #else
     r = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
 #endif
+
+#endif
     return r;
   }
 
@@ -344,7 +390,8 @@ namespace embree
   __forceinline vfloat4 max(const vfloat4& a, float          b) { return _mm_max_ps(a,vfloat4(b)); }
   __forceinline vfloat4 max(float          a, const vfloat4& b) { return _mm_max_ps(vfloat4(a),b); }
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(__aarch64__)
+
     __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) {
       const vint4 ai = _mm_castps_si128(a);
       const vint4 bi = _mm_castps_si128(b);
@@ -393,9 +440,10 @@ namespace embree
   __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); }
 #else
   __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; }
-  __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; }
   __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;}
   __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b-c; }
+  __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; }
+
 #endif
 
   ////////////////////////////////////////////////////////////////////////////////
@@ -429,8 +477,13 @@ namespace embree
   __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); }
   __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a, b); }
   __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); }
+#if defined(__aarch64__)
+  __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpge_ps (a, b); }
+  __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpgt_ps (a, b); }
+#else
   __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); }
   __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); }
+#endif
   __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); }
 #endif
 
@@ -484,7 +537,7 @@ namespace embree
     return select(vboolf4(mask), t, f);
 #endif
   }
-  
+
   __forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t) {
     return madd(t,b-a,a);
   }
@@ -506,10 +559,10 @@ namespace embree
   ////////////////////////////////////////////////////////////////////////////////
 
 #if defined(__aarch64__)
-  __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); }
-  __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); }
-  __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); }
-  __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); }
+  __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } // towards -inf
+  __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } // toward +inf
+  __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } // towards 0
+  __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } // to nearest, ties to even. NOTE(LTE): arm clang uses vrndnq, old gcc uses vrndqn?
 #elif defined (__SSE4_1__)
   __forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF    ); }
   __forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF    ); }
@@ -524,7 +577,9 @@ namespace embree
   __forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); }
 
   __forceinline vint4 floori(const vfloat4& a) {
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+    return vcvtq_s32_f32(floor(a));
+#elif defined(__SSE4_1__)
     return vint4(floor(a));
 #else
     return vint4(a-vfloat4(0.5f));
@@ -538,6 +593,16 @@ namespace embree
   __forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a, b); }
   __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); }
 
+#if defined(__aarch64__)
+      template<int i0, int i1, int i2, int i3>
+      __forceinline vfloat4 shuffle(const vfloat4& v) {
+          return vreinterpretq_f32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
+      }
+      template<int i0, int i1, int i2, int i3>
+      __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) {
+          return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
+      }
+#else
   template<int i0, int i1, int i2, int i3>
   __forceinline vfloat4 shuffle(const vfloat4& v) {
     return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0)));
@@ -547,8 +612,9 @@ namespace embree
   __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) {
     return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
   }
+#endif
 
-#if defined(__SSE3__)
+#if defined(__SSE3__) && !defined(__aarch64__)
   template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); }
   template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); }
   template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); }
@@ -559,10 +625,14 @@ namespace embree
     return shuffle<i,i,i,i>(v);
   }
 
+#if defined(__aarch64__)
+  template<int i> __forceinline float extract(const vfloat4& a) { return a[i]; }
+#else
   template<int i> __forceinline float extract   (const vfloat4& a) { return _mm_cvtss_f32(shuffle<i>(a)); }
   template<>      __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); }
+#endif
 
-#if defined (__SSE4_1__)
+#if defined (__SSE4_1__) && !defined(__aarch64__)
   template<int dst, int src, int clr> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
   template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return insert<dst, src, 0>(a, b); }
   template<int dst> __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert<dst, 0>(a, _mm_set_ss(b)); }
@@ -664,14 +734,25 @@ namespace embree
   ////////////////////////////////////////////////////////////////////////////////
   /// Reductions
   ////////////////////////////////////////////////////////////////////////////////
-
+#if defined(__aarch64__)
+  __forceinline vfloat4 vreduce_min(const vfloat4& v) { float h = vminvq_f32(v); return vdupq_n_f32(h); }
+  __forceinline vfloat4 vreduce_max(const vfloat4& v) { float h = vmaxvq_f32(v); return vdupq_n_f32(h); }
+  __forceinline vfloat4 vreduce_add(const vfloat4& v) { float h = vaddvq_f32(v); return vdupq_n_f32(h); }
+#else
   __forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
   __forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
   __forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v)   + v ; return shuffle<2,3,0,1>(h)   + h ; }
+#endif
 
+#if defined(__aarch64__)
+  __forceinline float reduce_min(const vfloat4& v) { return vminvq_f32(v); }
+  __forceinline float reduce_max(const vfloat4& v) { return vmaxvq_f32(v); }
+  __forceinline float reduce_add(const vfloat4& v) { return vaddvq_f32(v); }
+#else
   __forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); }
   __forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); }
   __forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); }
+#endif
 
   __forceinline size_t select_min(const vboolf4& valid, const vfloat4& v) 
   { 
@@ -687,7 +768,7 @@ namespace embree
   }
   
   ////////////////////////////////////////////////////////////////////////////////
-  /// Euclidian Space Operators
+  /// Euclidean Space Operators
   ////////////////////////////////////////////////////////////////////////////////
 
   __forceinline float dot(const vfloat4& a, const vfloat4& b) {
diff --git a/thirdparty/embree/common/simd/vfloat8_avx.h b/thirdparty/embree/common/simd/vfloat8_avx.h
index 13446454e8..b09d5e641d 100644
--- a/thirdparty/embree/common/simd/vfloat8_avx.h
+++ b/thirdparty/embree/common/simd/vfloat8_avx.h
@@ -107,11 +107,11 @@ namespace embree
     static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask,v); }
     static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask,v); }
 #else
-    static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); }
-    static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); }
+    static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,_mm256_castps_si256(mask.v)); }
+    static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,_mm256_castps_si256(mask.v)); }
 
-    static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); }
-    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); }
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),v); }
 #endif
     
 #if defined(__AVX2__)
@@ -126,7 +126,7 @@ namespace embree
 
     template<int scale = 4>
     static __forceinline vfloat8 gather(const float* ptr, const vint8& index) {
-#if defined(__AVX2__)
+#if defined(__AVX2__) && !defined(__aarch64__)
       return _mm256_i32gather_ps(ptr, index ,scale);
 #else
       return vfloat8(
@@ -146,7 +146,7 @@ namespace embree
       vfloat8 r = zero;
 #if defined(__AVX512VL__)
       return _mm256_mmask_i32gather_ps(r, mask, index, ptr, scale);
-#elif defined(__AVX2__)
+#elif defined(__AVX2__) && !defined(__aarch64__)
       return _mm256_mask_i32gather_ps(r, ptr, index, mask, scale);
 #else
       if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]);
@@ -215,20 +215,52 @@ namespace embree
   __forceinline vfloat8 toFloat(const vint8&   a) { return vfloat8(a); }
 
   __forceinline vfloat8 operator +(const vfloat8& a) { return a; }
+#if !defined(__aarch64__)
   __forceinline vfloat8 operator -(const vfloat8& a) {
     const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); 
     return _mm256_xor_ps(a, mask);
   }
+#else
+  __forceinline vfloat8 operator -(const vfloat8& a) {
+      __m256 res;
+      res.lo = vnegq_f32(a.v.lo);
+      res.hi = vnegq_f32(a.v.hi);
+      return res;
+}
+#endif
+
+#if !defined(__aarch64__)
   __forceinline vfloat8 abs(const vfloat8& a) {
     const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
     return _mm256_and_ps(a, mask);
   }
+#else
+__forceinline vfloat8 abs(const vfloat8& a) {
+    __m256 res;
+    res.lo = vabsq_f32(a.v.lo);
+    res.hi = vabsq_f32(a.v.hi);
+    return res;
+}
+#endif
+
+#if !defined(__aarch64__)
   __forceinline vfloat8 sign   (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmp_ps(a, vfloat8(zero), _CMP_NGE_UQ)); }
+#else
+  __forceinline vfloat8 sign   (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmplt_ps(a, vfloat8(zero))); }
+#endif
   __forceinline vfloat8 signmsk(const vfloat8& a) { return _mm256_and_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); }
 
 
   static __forceinline vfloat8 rcp(const vfloat8& a)
   {
+#if defined(__aarch64__)
+    vfloat8 ret;
+    const float32x4_t one = vdupq_n_f32(1.0f);
+    ret.v.lo = vdivq_f32(one, a.v.lo);
+    ret.v.hi = vdivq_f32(one, a.v.hi);
+    return ret;
+#endif
+
 #if defined(__AVX512VL__)
     const vfloat8 r = _mm256_rcp14_ps(a);
 #else
@@ -236,9 +268,12 @@ namespace embree
 #endif
 
 #if defined(__AVX2__)
-    return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vfloat8(2.0f)));
+    // First, compute 1 - a * r (which will be very close to 0)
+    const vfloat8 h_n = _mm256_fnmadd_ps(a, r, vfloat8(1.0f));
+    // Then compute r + r * h_n
+    return _mm256_fmadd_ps(r, h_n, r);
 #else
-    return _mm256_mul_ps(r, _mm256_sub_ps(vfloat8(2.0f), _mm256_mul_ps(r, a)));
+    return _mm256_add_ps(r,_mm256_mul_ps(r, _mm256_sub_ps(vfloat8(1.0f), _mm256_mul_ps(a, r))));  // computes r + r * (1 - a * r)
 #endif
   }
   __forceinline vfloat8 sqr (const vfloat8& a) { return _mm256_mul_ps(a,a); }
@@ -384,7 +419,7 @@ namespace embree
   static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
     return _mm256_mask_blend_ps(m, f, t);
   }
-#else
+#elif !defined(__aarch64__)
   static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);  }
   static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
   static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS);  }
@@ -395,6 +430,18 @@ namespace embree
   static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
     return _mm256_blendv_ps(f, t, m); 
   }
+#else
+  static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmpeq_ps(a, b);  }
+  static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpneq_ps(a, b); }
+  static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmplt_ps(a, b);  }
+  static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpge_ps(a, b);  }
+  static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmpgt_ps(a, b);  }
+  static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmple_ps(a, b);  }
+
+  static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
+    return _mm256_blendv_ps(f, t, m);
+  }
+
 #endif
 
   template<int mask>
@@ -463,10 +510,17 @@ namespace embree
   /// Rounding Functions
   ////////////////////////////////////////////////////////////////////////////////
 
+#if !defined(__aarch64__)
   __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF    ); }
   __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF    ); }
   __forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO       ); }
   __forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
+#else
+  __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_floor_ps(a); }
+  __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_ceil_ps(a); }
+#endif
+
+
   __forceinline vfloat8 frac (const vfloat8& a) { return a-floor(a); }
 
   ////////////////////////////////////////////////////////////////////////////////
@@ -501,9 +555,11 @@ namespace embree
     return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
   }
 
+#if !defined(__aarch64__)
   template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v); }
   template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v); }
   template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); }
+#endif
 
   __forceinline vfloat8 broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); }
   template<size_t i> __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a, b, i); }
@@ -512,7 +568,7 @@ namespace embree
 
   __forceinline float toScalar(const vfloat8& v) { return _mm_cvtss_f32(_mm256_castps256_ps128(v)); }
 
-#if defined (__AVX2__)
+#if defined (__AVX2__) && !defined(__aarch64__)
   static __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) {
     return _mm256_permutevar8x32_ps(a, index);
   }
@@ -609,7 +665,7 @@ namespace embree
   ////////////////////////////////////////////////////////////////////////////////
   /// Reductions
   ////////////////////////////////////////////////////////////////////////////////
-
+#if !defined(__aarch64__)
   __forceinline vfloat8 vreduce_min2(const vfloat8& v) { return min(v,shuffle<1,0,3,2>(v)); }
   __forceinline vfloat8 vreduce_min4(const vfloat8& v) { vfloat8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
   __forceinline vfloat8 vreduce_min (const vfloat8& v) { vfloat8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
@@ -625,7 +681,14 @@ namespace embree
   __forceinline float reduce_min(const vfloat8& v) { return toScalar(vreduce_min(v)); }
   __forceinline float reduce_max(const vfloat8& v) { return toScalar(vreduce_max(v)); }
   __forceinline float reduce_add(const vfloat8& v) { return toScalar(vreduce_add(v)); }
+#else
+  __forceinline float reduce_min(const vfloat8& v) { return vminvq_f32(_mm_min_ps(v.v.lo,v.v.hi)); }
+  __forceinline float reduce_max(const vfloat8& v) { return vmaxvq_f32(_mm_max_ps(v.v.lo,v.v.hi)); }
+  __forceinline vfloat8 vreduce_min(const vfloat8& v) { return vfloat8(reduce_min(v)); }
+  __forceinline vfloat8 vreduce_max(const vfloat8& v) { return vfloat8(reduce_max(v)); }
+  __forceinline float reduce_add(const vfloat8& v) { return vaddvq_f32(_mm_add_ps(v.v.lo,v.v.hi)); }
 
+#endif
   __forceinline size_t select_min(const vboolf8& valid, const vfloat8& v) 
   { 
     const vfloat8 a = select(valid,v,vfloat8(pos_inf)); 
@@ -642,7 +705,7 @@ namespace embree
 
 
   ////////////////////////////////////////////////////////////////////////////////
-  /// Euclidian Space Operators (pairs of Vec3fa's)
+  /// Euclidean Space Operators (pairs of Vec3fa's)
   ////////////////////////////////////////////////////////////////////////////////
 
   //__forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) {
diff --git a/thirdparty/embree/common/simd/vint4_sse2.h b/thirdparty/embree/common/simd/vint4_sse2.h
index 9814d5c71c..eea03a771e 100644
--- a/thirdparty/embree/common/simd/vint4_sse2.h
+++ b/thirdparty/embree/common/simd/vint4_sse2.h
@@ -106,7 +106,14 @@ namespace embree
 #endif
 
 
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+    static __forceinline vint4 load(const unsigned char* ptr) {
+        return _mm_load4epu8_epi32(((__m128i*)ptr));
+    }
+    static __forceinline vint4 loadu(const unsigned char* ptr) {
+        return  _mm_load4epu8_epi32(((__m128i*)ptr));
+    }
+#elif defined(__SSE4_1__)
     static __forceinline vint4 load(const unsigned char* ptr) {
       return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
     }
@@ -127,7 +134,9 @@ namespace embree
 #endif
 
     static __forceinline vint4 load(const unsigned short* ptr) {
-#if defined (__SSE4_1__)
+#if defined(__aarch64__)
+      return __m128i(vmovl_u16(vld1_u16(ptr)));
+#elif defined (__SSE4_1__)
       return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr));
 #else
       return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
@@ -135,7 +144,12 @@ namespace embree
     } 
 
     static __forceinline void store(unsigned char* ptr, const vint4& v) {
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+        int32x4_t x = v;
+        uint16x4_t y = vqmovn_u32(uint32x4_t(x));
+        uint8x8_t z = vqmovn_u16(vcombine_u16(y, y));
+        vst1_lane_u32((uint32_t *)ptr,uint32x2_t(z), 0);
+#elif defined(__SSE4_1__)
       __m128i x = v;
       x = _mm_packus_epi32(x, x);
       x = _mm_packus_epi16(x, x);
@@ -147,20 +161,26 @@ namespace embree
     }
 
     static __forceinline void store(unsigned short* ptr, const vint4& v) {
+#if defined(__aarch64__)
+      uint32x4_t x = uint32x4_t(v.v);
+      uint16x4_t y = vqmovn_u32(x);
+      vst1_u16(ptr, y);
+#else
       for (size_t i=0;i<4;i++)
         ptr[i] = (unsigned short)v[i];
+#endif
     }
 
     static __forceinline vint4 load_nt(void* ptr) {
-#if defined(__SSE4_1__)
-      return _mm_stream_load_si128((__m128i*)ptr); 
+#if defined(__aarch64__) || defined(__SSE4_1__)
+      return _mm_stream_load_si128((__m128i*)ptr);
 #else
       return _mm_load_si128((__m128i*)ptr); 
 #endif
     }
     
     static __forceinline void store_nt(void* ptr, const vint4& v) {
-#if defined(__SSE4_1__)
+#if !defined(__aarch64__) && defined(__SSE4_1__)
       _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v));
 #else
       _mm_store_si128((__m128i*)ptr,v);
@@ -169,7 +189,7 @@ namespace embree
 
     template<int scale = 4>
     static __forceinline vint4 gather(const int* ptr, const vint4& index) {
-#if defined(__AVX2__)
+#if defined(__AVX2__) && !defined(__aarch64__)
       return _mm_i32gather_epi32(ptr, index, scale);
 #else
       return vint4(
@@ -185,7 +205,7 @@ namespace embree
       vint4 r = zero;
 #if defined(__AVX512VL__)
       return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
-#elif defined(__AVX2__)
+#elif defined(__AVX2__) && !defined(__aarch64__)
       return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale);
 #else
       if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]);
@@ -222,7 +242,7 @@ namespace embree
 #endif
     }
 
-#if defined(__x86_64__)
+#if defined(__x86_64__) || defined(__aarch64__)
     static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); }
 #endif
 
@@ -236,6 +256,8 @@ namespace embree
     friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) {
 #if defined(__AVX512VL__)
       return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t);
+#elif defined(__aarch64__)
+      return _mm_castps_si128(_mm_blendv_ps((__m128)f.v,(__m128) t.v, (__m128)m.v));
 #elif defined(__SSE4_1__)
       return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); 
 #else
@@ -256,7 +278,9 @@ namespace embree
 
   __forceinline vint4 operator +(const vint4& a) { return a; }
   __forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); }
-#if defined(__SSSE3__)
+#if defined(__aarch64__)
+  __forceinline vint4 abs(const vint4& a) { return vabsq_s32(a.v); }
+#elif defined(__SSSE3__)
   __forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); }
 #endif
 
@@ -272,7 +296,7 @@ namespace embree
   __forceinline vint4 operator -(const vint4& a, int          b) { return a - vint4(b); }
   __forceinline vint4 operator -(int          a, const vint4& b) { return vint4(a) - b; }
 
-#if defined(__SSE4_1__)
+#if (defined(__aarch64__)) || defined(__SSE4_1__)
   __forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); }
 #else
   __forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); }
@@ -292,8 +316,8 @@ namespace embree
   __forceinline vint4 operator ^(const vint4& a, int          b) { return a ^ vint4(b); }
   __forceinline vint4 operator ^(int          a, const vint4& b) { return vint4(a) ^ b; }
 
-  __forceinline vint4 operator <<(const vint4& a, int n) { return _mm_slli_epi32(a, n); }
-  __forceinline vint4 operator >>(const vint4& a, int n) { return _mm_srai_epi32(a, n); }
+  __forceinline vint4 operator <<(const vint4& a, const int n) { return _mm_slli_epi32(a, n); }
+  __forceinline vint4 operator >>(const vint4& a, const int n) { return _mm_srai_epi32(a, n); }
 
   __forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); }
   __forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); }
@@ -309,7 +333,7 @@ namespace embree
   __forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; }
   __forceinline vint4& operator -=(vint4& a, int          b) { return a = a - b; }
 
-#if defined(__SSE4_1__)
+#if (defined(__aarch64__)) || defined(__SSE4_1__)
   __forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; }
   __forceinline vint4& operator *=(vint4& a, int          b) { return a = a * b; }
 #endif
@@ -393,7 +417,7 @@ namespace embree
 #endif    
   }
 
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
   __forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); }
   __forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); }
 
@@ -417,6 +441,16 @@ namespace embree
   __forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
   __forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
 
+#if defined(__aarch64__)
+    template<int i0, int i1, int i2, int i3>
+    __forceinline vint4 shuffle(const vint4& v) {
+        return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
+    }
+    template<int i0, int i1, int i2, int i3>
+    __forceinline vint4 shuffle(const vint4& a, const vint4& b) {
+        return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
+    }
+#else
   template<int i0, int i1, int i2, int i3>
   __forceinline vint4 shuffle(const vint4& v) {
     return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
@@ -426,7 +460,7 @@ namespace embree
   __forceinline vint4 shuffle(const vint4& a, const vint4& b) {
     return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
   }
-
+#endif
 #if defined(__SSE3__)
   template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
   template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
@@ -438,7 +472,7 @@ namespace embree
     return shuffle<i,i,i,i>(v);
   }
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) && !defined(__aarch64__)
   template<int src> __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); }
   template<int dst> __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); }
 #else
@@ -446,18 +480,27 @@ namespace embree
   template<int dst> __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; }
 #endif
 
-
   template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); }
-
+  
   __forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); }
-
-  __forceinline size_t toSizeT(const vint4& v) { 
+  
+#if defined(__aarch64__)
+  __forceinline size_t toSizeT(const vint4& v) {
+    uint64x2_t x = uint64x2_t(v.v);
+    return x[0];
+  }
+#else
+__forceinline size_t toSizeT(const vint4& v) { 
 #if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround
     return toScalar(v);
+#elif defined(__ARM_NEON)
+    // FIXME(LTE): Do we need a swap(i.e. use lane 1)?
+    return vgetq_lane_u64(*(reinterpret_cast<const uint64x2_t *>(&v)), 0);
 #else
     return _mm_cvtsi128_si64(v); 
 #endif
   }
+#endif
 
 #if defined(__AVX512VL__)
 
@@ -475,7 +518,17 @@ namespace embree
   /// Reductions
   ////////////////////////////////////////////////////////////////////////////////
 
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
+
+#if defined(__aarch64__)
+    __forceinline vint4 vreduce_min(const vint4& v) { int h = vminvq_s32(v); return vdupq_n_s32(h); }
+    __forceinline vint4 vreduce_max(const vint4& v) { int h = vmaxvq_s32(v); return vdupq_n_s32(h); }
+    __forceinline vint4 vreduce_add(const vint4& v) { int h = vaddvq_s32(v); return vdupq_n_s32(h); }
+
+    __forceinline int reduce_min(const vint4& v) { return vminvq_s32(v); }
+    __forceinline int reduce_max(const vint4& v) { return vmaxvq_s32(v); }
+    __forceinline int reduce_add(const vint4& v) { return vaddvq_s32(v); }
+#else
   __forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
   __forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
   __forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<1,0,3,2>(v)   + v ; return shuffle<2,3,0,1>(h)   + h ; }
@@ -483,6 +536,7 @@ namespace embree
   __forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); }
   __forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); }
   __forceinline int reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); }
+#endif
 
   __forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); }
   __forceinline size_t select_max(const vint4& v) { return bsf(movemask(v == vreduce_max(v))); }
@@ -502,7 +556,7 @@ namespace embree
   /// Sorting networks
   ////////////////////////////////////////////////////////////////////////////////
 
-#if defined(__SSE4_1__)
+#if (defined(__aarch64__)) || defined(__SSE4_1__)
 
   __forceinline vint4 usort_ascending(const vint4& v)
   {
diff --git a/thirdparty/embree/common/simd/vint8_avx.h b/thirdparty/embree/common/simd/vint8_avx.h
index f43e9a8c22..48f5a9b203 100644
--- a/thirdparty/embree/common/simd/vint8_avx.h
+++ b/thirdparty/embree/common/simd/vint8_avx.h
@@ -79,8 +79,8 @@ namespace embree
     static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); }
     static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); }
     
-    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
-    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); }
 
     static __forceinline void store_nt(void* ptr, const vint8& v) {
       _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
diff --git a/thirdparty/embree/common/simd/vint8_avx2.h b/thirdparty/embree/common/simd/vint8_avx2.h
index e04737ffbe..d48efac3f4 100644
--- a/thirdparty/embree/common/simd/vint8_avx2.h
+++ b/thirdparty/embree/common/simd/vint8_avx2.h
@@ -393,6 +393,7 @@ namespace embree
 
   __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
 
+#if !defined(__aarch64__)
   __forceinline vint8 permute(const vint8& v, const __m256i& index) {
     return _mm256_permutevar8x32_epi32(v, index);
   }
@@ -410,6 +411,9 @@ namespace embree
 #endif
   }  
 
+#endif
+
+
   ////////////////////////////////////////////////////////////////////////////////
   /// Reductions
   ////////////////////////////////////////////////////////////////////////////////
diff --git a/thirdparty/embree/common/simd/vuint4_sse2.h b/thirdparty/embree/common/simd/vuint4_sse2.h
index 0601b9ab80..f7817da6be 100644
--- a/thirdparty/embree/common/simd/vuint4_sse2.h
+++ b/thirdparty/embree/common/simd/vuint4_sse2.h
@@ -95,7 +95,14 @@ namespace embree
     static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); }
 #endif
 
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+    static __forceinline vuint4 load(const unsigned char* ptr) {
+        return _mm_load4epu8_epi32(((__m128i*)ptr));
+    }
+    static __forceinline vuint4 loadu(const unsigned char* ptr) {
+        return _mm_load4epu8_epi32(((__m128i*)ptr));
+    }
+#elif defined(__SSE4_1__)
     static __forceinline vuint4 load(const unsigned char* ptr) {
       return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
     }
@@ -107,7 +114,9 @@ namespace embree
 #endif
 
     static __forceinline vuint4 load(const unsigned short* ptr) {
-#if defined (__SSE4_1__)
+#if defined(__aarch64__)
+      return _mm_load4epu16_epi32(((__m128i*)ptr));
+#elif defined (__SSE4_1__)
       return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr));
 #else
       return vuint4(ptr[0],ptr[1],ptr[2],ptr[3]);
@@ -115,7 +124,7 @@ namespace embree
     } 
 
     static __forceinline vuint4 load_nt(void* ptr) {
-#if defined(__SSE4_1__)
+#if (defined(__aarch64__)) || defined(__SSE4_1__)
       return _mm_stream_load_si128((__m128i*)ptr); 
 #else
       return _mm_load_si128((__m128i*)ptr); 
@@ -123,8 +132,8 @@ namespace embree
     }
     
     static __forceinline void store_nt(void* ptr, const vuint4& v) {
-#if defined(__SSE4_1__)
-      _mm_stream_ps((float*)ptr,_mm_castsi128_ps(v)); 
+#if !defined(__aarch64__) && defined(__SSE4_1__)
+      _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v));
 #else
       _mm_store_si128((__m128i*)ptr,v);
 #endif
@@ -132,7 +141,7 @@ namespace embree
 
     template<int scale = 4>
     static __forceinline vuint4 gather(const unsigned int* ptr, const vint4& index) {
-#if defined(__AVX2__)
+#if defined(__AVX2__) && !defined(__aarch64__)
       return _mm_i32gather_epi32((const int*)ptr, index, scale);
 #else
       return vuint4(
@@ -148,7 +157,7 @@ namespace embree
       vuint4 r = zero;
 #if defined(__AVX512VL__)
       return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
-#elif defined(__AVX2__)
+#elif defined(__AVX2__) && !defined(__aarch64__)
       return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale);
 #else
       if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]);
@@ -344,6 +353,16 @@ namespace embree
   __forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
   __forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
 
+#if defined(__aarch64__)
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint4 shuffle(const vuint4& v) {
+    return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
+  }
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) {
+    return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
+  }
+#else
   template<int i0, int i1, int i2, int i3>
   __forceinline vuint4 shuffle(const vuint4& v) {
     return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
@@ -353,7 +372,7 @@ namespace embree
   __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) {
     return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
   }
-
+#endif
 #if defined(__SSE3__)
   template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
   template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
@@ -365,7 +384,7 @@ namespace embree
     return shuffle<i,i,i,i>(v);
   }
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) && !defined(__aarch64__)
   template<int src> __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b, src); }
   template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a, b, dst); }
 #else
@@ -373,7 +392,6 @@ namespace embree
   template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { vuint4 c = a; c[dst&3] = b; return c; }
 #endif
 
-
   template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b); }
 
   __forceinline unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v); }
diff --git a/thirdparty/embree/common/simd/vuint8_avx.h b/thirdparty/embree/common/simd/vuint8_avx.h
index 589cd9d731..cb8b5158c1 100644
--- a/thirdparty/embree/common/simd/vuint8_avx.h
+++ b/thirdparty/embree/common/simd/vuint8_avx.h
@@ -77,8 +77,8 @@ namespace embree
     static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); }
     static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); }
     
-    static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
-    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); }
 
     static __forceinline void store_nt(void* ptr, const vuint8& v) {
       _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
diff --git a/thirdparty/embree/common/simd/vuint8_avx2.h b/thirdparty/embree/common/simd/vuint8_avx2.h
index 17b994522f..959143724b 100644
--- a/thirdparty/embree/common/simd/vuint8_avx2.h
+++ b/thirdparty/embree/common/simd/vuint8_avx2.h
@@ -385,6 +385,7 @@ namespace embree
 
   __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
 
+#if !defined(__aarch64__)
   __forceinline vuint8 permute(const vuint8& v, const __m256i& index) {
     return _mm256_permutevar8x32_epi32(v, index);
   }
@@ -401,6 +402,7 @@ namespace embree
     return _mm256_alignr_epi8(a, b, 4*i);
 #endif
   }  
+#endif // !defined(__aarch64__)
 
   ////////////////////////////////////////////////////////////////////////////////
   /// Reductions
diff --git a/thirdparty/embree/common/simd/wasm/emulation.h b/thirdparty/embree/common/simd/wasm/emulation.h
new file mode 100644
index 0000000000..778ab4ae6a
--- /dev/null
+++ b/thirdparty/embree/common/simd/wasm/emulation.h
@@ -0,0 +1,13 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+// According to https://emscripten.org/docs/porting/simd.html, _MM_SET_EXCEPTION_MASK and
+// _mm_setcsr are unavailable in WebAssembly.
+
+#define _MM_SET_EXCEPTION_MASK(x)
+
+__forceinline void _mm_setcsr(unsigned int)
+{
+}
diff --git a/thirdparty/embree/common/sys/array.h b/thirdparty/embree/common/sys/array.h
index dd9190c52a..e96939b63d 100644
--- a/thirdparty/embree/common/sys/array.h
+++ b/thirdparty/embree/common/sys/array.h
@@ -59,8 +59,8 @@ namespace embree
 
       /********************** Iterators  ****************************/
 
-      __forceinline T* begin() const { return items; };
-      __forceinline T* end  () const { return items+M; };
+      __forceinline T* begin() const { return (T*)items; };
+      __forceinline T* end  () const { return (T*)items+M; };
 
 
       /********************** Capacity ****************************/
@@ -101,8 +101,8 @@ namespace embree
       __forceinline       T& at(size_t i)       { assert(i < M); return items[i]; }
       __forceinline const T& at(size_t i) const { assert(i < M); return items[i]; }
 
-      __forceinline T& front() const { assert(M > 0); return items[0]; };
-      __forceinline T& back () const { assert(M > 0); return items[M-1]; };
+      __forceinline T& front() { assert(M > 0); return items[0]; };
+      __forceinline T& back () { assert(M > 0); return items[M-1]; };
 
       __forceinline       T* data()       { return items; };
       __forceinline const T* data() const { return items; };
@@ -139,7 +139,7 @@ namespace embree
     __forceinline       Ty& operator[](const unsigned i)       { assert(i<N); return data[i]; }
     __forceinline const Ty& operator[](const unsigned i) const { assert(i<N); return data[i]; }
 
-#if defined(__64BIT__)
+#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
     __forceinline       Ty& operator[](const size_t i)       { assert(i<N); return data[i]; }
     __forceinline const Ty& operator[](const size_t i) const { assert(i<N); return data[i]; }
 #endif
@@ -196,7 +196,7 @@ namespace embree
     __forceinline       Ty& operator[](const int i)      { assert(i>=0 && i<max_total_elements); resize(i+1); return data[i]; }
     __forceinline       Ty& operator[](const unsigned i) { assert(i<max_total_elements); resize(i+1); return data[i]; }
 
-#if defined(__64BIT__)
+#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
     __forceinline       Ty& operator[](const size_t i) { assert(i<max_total_elements); resize(i+1); return data[i]; }
 #endif
 
diff --git a/thirdparty/embree/common/sys/barrier.h b/thirdparty/embree/common/sys/barrier.h
index 37fc036291..c56513a2ed 100644
--- a/thirdparty/embree/common/sys/barrier.h
+++ b/thirdparty/embree/common/sys/barrier.h
@@ -24,7 +24,7 @@ namespace embree
     BarrierSys& operator= (const BarrierSys& other) DELETED; // do not implement
 
   public:
-    /*! intializes the barrier with some number of threads */
+    /*! initializes the barrier with some number of threads */
     void init(size_t count);
 
     /*! lets calling thread wait in barrier */
@@ -94,7 +94,7 @@ namespace embree
     LinearBarrierActive& operator= (const LinearBarrierActive& other) DELETED; // do not implement
 
   public:
-    /*! intializes the barrier with some number of threads */
+    /*! initializes the barrier with some number of threads */
     void init(size_t threadCount);
     
     /*! thread with threadIndex waits in the barrier */
diff --git a/thirdparty/embree/common/sys/intrinsics.h b/thirdparty/embree/common/sys/intrinsics.h
index ed8dd7d40a..2c2f6eccda 100644
--- a/thirdparty/embree/common/sys/intrinsics.h
+++ b/thirdparty/embree/common/sys/intrinsics.h
@@ -13,6 +13,9 @@
 #include "../simd/arm/emulation.h"
 #else
 #include <immintrin.h>
+#if defined(__EMSCRIPTEN__)
+#include "../simd/wasm/emulation.h"
+#endif
 #endif
 
 #if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
@@ -24,24 +27,26 @@
   #endif
 #endif
 
-#if defined(__LZCNT__)
+#if defined(__aarch64__)
   #if !defined(_lzcnt_u32)
-    #define _lzcnt_u32 __lzcnt32
+    #define _lzcnt_u32 __builtin_clz
   #endif
-  #if !defined(_lzcnt_u64)
-    #define _lzcnt_u64 __lzcnt64
+#else
+  #if defined(__LZCNT__)
+    #if !defined(_lzcnt_u32)
+      #define _lzcnt_u32 __lzcnt32
+    #endif
+    #if !defined(_lzcnt_u64)
+      #define _lzcnt_u64 __lzcnt64
+    #endif
   #endif
 #endif
 
 #if defined(__WIN32__)
-// -- GODOT start --
-#if !defined(NOMINMAX)
-// -- GODOT end --
-#define NOMINMAX
-// -- GODOT start --
-#endif
-#include "windows.h"
-// -- GODOT end --
+#  if !defined(NOMINMAX)
+#    define NOMINMAX
+#  endif
+#  include <windows.h>
 #endif
 
 /* normally defined in pmmintrin.h, but we always need this */
@@ -69,7 +74,7 @@ namespace embree
   }
   
   __forceinline int bsf(int v) {
-#if defined(__AVX2__) 
+#if defined(__AVX2__) && !defined(__aarch64__)
     return _tzcnt_u32(v);
 #else
     unsigned long r = 0; _BitScanForward(&r,v); return r;
@@ -77,7 +82,7 @@ namespace embree
   }
   
   __forceinline unsigned bsf(unsigned v) {
-#if defined(__AVX2__) 
+#if defined(__AVX2__) && !defined(__aarch64__)
     return _tzcnt_u32(v);
 #else
     unsigned long r = 0; _BitScanForward(&r,v); return r;
@@ -118,7 +123,7 @@ namespace embree
 #endif
   
   __forceinline int bsr(int v) {
-#if defined(__AVX2__) 
+#if defined(__AVX2__)  && !defined(__aarch64__)
     return 31 - _lzcnt_u32(v);
 #else
     unsigned long r = 0; _BitScanReverse(&r,v); return r;
@@ -126,7 +131,7 @@ namespace embree
   }
   
   __forceinline unsigned bsr(unsigned v) {
-#if defined(__AVX2__) 
+#if defined(__AVX2__) && !defined(__aarch64__)
     return 31 - _lzcnt_u32(v);
 #else
     unsigned long r = 0; _BitScanReverse(&r,v); return r;
@@ -145,7 +150,7 @@ namespace embree
   
   __forceinline int lzcnt(const int x)
   {
-#if defined(__AVX2__)
+#if defined(__AVX2__) && !defined(__aarch64__)
     return _lzcnt_u32(x);
 #else
     if (unlikely(x == 0)) return 32;
@@ -214,15 +219,26 @@ namespace embree
 #elif defined(__X86_ASM__)
 
   __forceinline void __cpuid(int out[4], int op) {
-    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op)); 
+#if defined(__ARM_NEON)
+    if (op == 0) { // Get CPU name
+      out[0] = 0x41524d20;
+      out[1] = 0x41524d20;
+      out[2] = 0x41524d20;
+      out[3] = 0x41524d20;
+    }
+#else
+    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
+#endif
   }
-  
+
+#if !defined(__ARM_NEON)
   __forceinline void __cpuid_count(int out[4], int op1, int op2) {
     asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2)); 
   }
-  
 #endif
-  
+
+#endif
+
   __forceinline uint64_t read_tsc()  {
 #if defined(__X86_ASM__)
     uint32_t high,low;
@@ -235,30 +251,38 @@ namespace embree
   }
   
   __forceinline int bsf(int v) {
-#if defined(__AVX2__) 
+#if defined(__ARM_NEON)
+    return __builtin_ctz(v);
+#else
+#if defined(__AVX2__)
     return _tzcnt_u32(v);
 #elif defined(__X86_ASM__)
     int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
 #else
     return __builtin_ctz(v);
 #endif
+#endif
   }
   
 #if defined(__64BIT__)
   __forceinline unsigned bsf(unsigned v) 
   {
-#if defined(__AVX2__) 
+#if defined(__ARM_NEON)
+    return __builtin_ctz(v);
+#else
+#if defined(__AVX2__)
     return _tzcnt_u32(v);
 #elif defined(__X86_ASM__)
     unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
 #else
     return __builtin_ctz(v);
 #endif
+#endif
   }
 #endif
   
   __forceinline size_t bsf(size_t v) {
-#if defined(__AVX2__)
+#if defined(__AVX2__) && !defined(__aarch64__)
 #if defined(__X86_64__)
     return _tzcnt_u64(v);
 #else
@@ -295,7 +319,7 @@ namespace embree
   }
   
   __forceinline int bsr(int v) {
-#if defined(__AVX2__) 
+#if defined(__AVX2__) && !defined(__aarch64__)
     return 31 - _lzcnt_u32(v);
 #elif defined(__X86_ASM__)
     int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
@@ -304,7 +328,7 @@ namespace embree
 #endif
   }
   
-#if defined(__64BIT__)
+#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
   __forceinline unsigned bsr(unsigned v) {
 #if defined(__AVX2__) 
     return 31 - _lzcnt_u32(v);
@@ -317,7 +341,7 @@ namespace embree
 #endif
   
   __forceinline size_t bsr(size_t v) {
-#if defined(__AVX2__)
+#if defined(__AVX2__) && !defined(__aarch64__)
 #if defined(__X86_64__)
     return 63 - _lzcnt_u64(v);
 #else
@@ -332,7 +356,7 @@ namespace embree
   
   __forceinline int lzcnt(const int x)
   {
-#if defined(__AVX2__)
+#if defined(__AVX2__) && !defined(__aarch64__)
     return _lzcnt_u32(x);
 #else
     if (unlikely(x == 0)) return 32;
@@ -341,18 +365,18 @@ namespace embree
   }
 
   __forceinline size_t blsr(size_t v) {
-#if defined(__AVX2__) 
-#if defined(__INTEL_COMPILER)
+#if defined(__AVX2__) && !defined(__aarch64__)
+  #if defined(__INTEL_COMPILER)
     return _blsr_u64(v);
+  #else
+    #if defined(__X86_64__)
+       return __blsr_u64(v);
+    #else
+       return __blsr_u32(v);
+    #endif
+  #endif
 #else
-#if defined(__X86_64__)
-    return __blsr_u64(v);
-#else
-    return __blsr_u32(v);
-#endif
-#endif
-#else
-    return v & (v-1);
+       return v & (v-1);
 #endif
   }
   
@@ -368,7 +392,7 @@ namespace embree
 #if defined(__X86_ASM__)
     int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
 #else
-    return (v | (v << i));
+    return (v | (1 << i));
 #endif
   }
   
@@ -376,7 +400,7 @@ namespace embree
 #if defined(__X86_ASM__)
     int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
 #else
-    return (v & ~(v << i));
+    return (v & ~(1 << i));
 #endif
   }
   
@@ -392,7 +416,7 @@ namespace embree
 #if defined(__X86_ASM__)
     size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
 #else
-    return (v | (v << i));
+    return (v | (1 << i));
 #endif
   }
   
@@ -400,7 +424,7 @@ namespace embree
 #if defined(__X86_ASM__)
     size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
 #else
-    return (v & ~(v << i));
+    return (v & ~(1 << i));
 #endif
   }
 
@@ -435,8 +459,8 @@ namespace embree
 #endif
 #endif
 
-#if defined(__SSE4_2__)
-  
+#if defined(__SSE4_2__) || defined(__ARM_NEON)
+
   __forceinline int popcnt(int in) {
     return _mm_popcnt_u32(in);
   }
@@ -483,14 +507,14 @@ namespace embree
 #endif
   }
 
-  __forceinline void prefetchL1EX(const void* ptr) { 
-    prefetchEX(ptr); 
+  __forceinline void prefetchL1EX(const void* ptr) {
+    prefetchEX(ptr);
   }
-  
-  __forceinline void prefetchL2EX(const void* ptr) { 
-    prefetchEX(ptr); 
+
+  __forceinline void prefetchL2EX(const void* ptr) {
+    prefetchEX(ptr);
   }
-#if defined(__AVX2__)
+#if defined(__AVX2__) && !defined(__aarch64__)
    __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
    __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
 #if defined(__X86_64__)
diff --git a/thirdparty/embree/common/sys/mutex.cpp b/thirdparty/embree/common/sys/mutex.cpp
index 789feaf2d8..8212deaa49 100644
--- a/thirdparty/embree/common/sys/mutex.cpp
+++ b/thirdparty/embree/common/sys/mutex.cpp
@@ -36,6 +36,7 @@ namespace embree
     MAYBE_UNUSED bool ok = pthread_mutex_destroy((pthread_mutex_t*)mutex) == 0;
     assert(ok);
     delete (pthread_mutex_t*)mutex; 
+    mutex = nullptr;
   }
   
   void MutexSys::lock() 
diff --git a/thirdparty/embree/common/sys/mutex.h b/thirdparty/embree/common/sys/mutex.h
index 4cb3626d92..26af6c582c 100644
--- a/thirdparty/embree/common/sys/mutex.h
+++ b/thirdparty/embree/common/sys/mutex.h
@@ -7,6 +7,7 @@
 #include "intrinsics.h"
 #include "atomic.h"
 
+#define CPU_CACHELINE_SIZE 64
 namespace embree
 {
   /*! system mutex */
@@ -83,6 +84,11 @@ namespace embree
     atomic<bool> flag;
   };
 
+  class PaddedSpinLock : public SpinLock
+  {
+    private:
+      char padding[CPU_CACHELINE_SIZE - sizeof(SpinLock)];
+  };
   /*! safe mutex lock and unlock helper */
   template<typename Mutex> class Lock {
   public:
diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h
index 3e386c4944..728bf6ed7d 100644
--- a/thirdparty/embree/common/sys/platform.h
+++ b/thirdparty/embree/common/sys/platform.h
@@ -92,16 +92,19 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 #ifdef __WIN32__
-#define dll_export __declspec(dllexport)
-#define dll_import __declspec(dllimport)
+#  if defined(EMBREE_STATIC_LIB)
+#    define dll_export
+#    define dll_import
+#  else
+#    define dll_export __declspec(dllexport)
+#    define dll_import __declspec(dllimport)
+#  endif
 #else
-#define dll_export __attribute__ ((visibility ("default")))
-#define dll_import 
+#  define dll_export __attribute__ ((visibility ("default")))
+#  define dll_import
 #endif
 
-// -- GODOT start --
 #if defined(__WIN32__) && !defined(__MINGW32__)
-// -- GODOT end --
 #if !defined(__noinline)
 #define __noinline             __declspec(noinline)
 #endif
@@ -151,9 +154,7 @@
   #define DELETED  = delete
 #endif
 
-// -- GODOT start --
 #if !defined(likely)
-// -- GODOT end --
 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
 #define   likely(expr) (expr)
 #define unlikely(expr) (expr)
@@ -161,9 +162,7 @@
 #define   likely(expr) __builtin_expect((bool)(expr),true )
 #define unlikely(expr) __builtin_expect((bool)(expr),false)
 #endif
-// -- GODOT start --
 #endif
-// -- GODOT end --
 
 ////////////////////////////////////////////////////////////////////////////////
 /// Error handling and debugging
@@ -252,6 +251,7 @@ __forceinline std::string toString(long long value) {
 #pragma warning(disable:4800) // forcing value to bool 'true' or 'false' (performance warning)
 //#pragma warning(disable:4267) // '=' : conversion from 'size_t' to 'unsigned long', possible loss of data
 #pragma warning(disable:4244) // 'argument' : conversion from 'ssize_t' to 'unsigned int', possible loss of data
+#pragma warning(disable:4267) // conversion from 'size_t' to 'const int', possible loss of data
 //#pragma warning(disable:4355) // 'this' : used in base member initializer list
 //#pragma warning(disable:391 ) // '<=' : signed / unsigned mismatch
 //#pragma warning(disable:4018) // '<' : signed / unsigned mismatch
diff --git a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp
index f1a59e511e..c98f61fa53 100644
--- a/thirdparty/embree/common/sys/sysinfo.cpp
+++ b/thirdparty/embree/common/sys/sysinfo.cpp
@@ -21,7 +21,11 @@ namespace embree
   
   std::string getPlatformName() 
   {
-#if defined(__LINUX__) && !defined(__64BIT__)
+#if defined(__ANDROID__) && !defined(__64BIT__)
+    return "Android (32bit)";
+#elif defined(__ANDROID__) && defined(__64BIT__)
+    return "Android (64bit)";
+#elif defined(__LINUX__) && !defined(__64BIT__)
     return "Linux (32bit)";
 #elif defined(__LINUX__) && defined(__64BIT__)
     return "Linux (64bit)";
@@ -248,9 +252,7 @@ namespace embree
 #if defined(__X86_ASM__)
   __noinline int64_t get_xcr0() 
   {
-// -- GODOT start --
-#if defined (__WIN32__) && !defined (__MINGW32__)
-// -- GODOT end --
+#if defined (__WIN32__) && !defined (__MINGW32__) && defined(_XCR_XFEATURE_ENABLED_MASK)
     int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32
     xcr0 = _xgetbv(0);
     return xcr0;
@@ -337,9 +339,24 @@ namespace embree
     if (cpuid_leaf_7[ECX] & CPU_FEATURE_BIT_AVX512VBMI) cpu_features |= CPU_FEATURE_AVX512VBMI;
 
     return cpu_features;
-#elif defined(__ARM_NEON)
-    /* emulated features with sse2neon */
-    return CPU_FEATURE_SSE|CPU_FEATURE_SSE2|CPU_FEATURE_XMM_ENABLED;
+
+#elif defined(__ARM_NEON) || defined(__EMSCRIPTEN__)
+
+    int cpu_features = CPU_FEATURE_NEON|CPU_FEATURE_SSE|CPU_FEATURE_SSE2;
+    cpu_features |= CPU_FEATURE_SSE3|CPU_FEATURE_SSSE3|CPU_FEATURE_SSE42;
+    cpu_features |= CPU_FEATURE_XMM_ENABLED;
+    cpu_features |= CPU_FEATURE_YMM_ENABLED;
+    cpu_features |= CPU_FEATURE_SSE41 | CPU_FEATURE_RDRAND | CPU_FEATURE_F16C;
+    cpu_features |= CPU_FEATURE_POPCNT;
+    cpu_features |= CPU_FEATURE_AVX;
+    cpu_features |= CPU_FEATURE_AVX2;
+    cpu_features |= CPU_FEATURE_FMA3;
+    cpu_features |= CPU_FEATURE_LZCNT;
+    cpu_features |= CPU_FEATURE_BMI1;
+    cpu_features |= CPU_FEATURE_BMI2;
+    cpu_features |= CPU_FEATURE_NEON_2X;
+    return cpu_features;
+
 #else
     /* Unknown CPU. */
     return 0;
@@ -376,6 +393,8 @@ namespace embree
     if (features & CPU_FEATURE_AVX512VL) str += "AVX512VL ";
     if (features & CPU_FEATURE_AVX512IFMA) str += "AVX512IFMA ";
     if (features & CPU_FEATURE_AVX512VBMI) str += "AVX512VBMI ";
+    if (features & CPU_FEATURE_NEON) str += "NEON ";
+    if (features & CPU_FEATURE_NEON_2X) str += "2xNEON ";
     return str;
   }
   
@@ -390,6 +409,9 @@ namespace embree
     if (isa == AVX) return "AVX";
     if (isa == AVX2) return "AVX2";
     if (isa == AVX512) return "AVX512";
+
+    if (isa == NEON) return "NEON";
+    if (isa == NEON_2X) return "2xNEON";
     return "UNKNOWN";
   }
 
@@ -410,6 +432,9 @@ namespace embree
     if (hasISA(features,AVXI)) v += "AVXI ";
     if (hasISA(features,AVX2)) v += "AVX2 ";
     if (hasISA(features,AVX512)) v += "AVX512 ";
+
+    if (hasISA(features,NEON)) v += "NEON ";
+    if (hasISA(features,NEON_2X)) v += "2xNEON ";
     return v;
   }
 }
@@ -613,6 +638,10 @@ namespace embree
 #include <sys/time.h>
 #include <pthread.h>
 
+#if defined(__EMSCRIPTEN__)
+#include <emscripten.h>
+#endif
+
 namespace embree
 {
   unsigned int getNumberOfLogicalThreads() 
@@ -620,12 +649,25 @@ namespace embree
     static int nThreads = -1;
     if (nThreads != -1) return nThreads;
 
-// -- GODOT start --
-// #if defined(__MACOSX__)
 #if defined(__MACOSX__) || defined(__ANDROID__)
-// -- GODOT end --
     nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container
     assert(nThreads);
+#elif defined(__EMSCRIPTEN__)
+    // WebAssembly supports pthreads, but not pthread_getaffinity_np. Get the number of logical
+    // threads from the browser or Node.js using JavaScript.
+    nThreads = MAIN_THREAD_EM_ASM_INT({
+        const isBrowser = typeof window !== 'undefined';
+        const isNode = typeof process !== 'undefined' && process.versions != null &&
+            process.versions.node != null;
+        if (isBrowser) {
+            // Return 1 if the browser does not expose hardwareConcurrency.
+            return window.navigator.hardwareConcurrency || 1;
+        } else if (isNode) {
+            return require('os').cpus().length;
+        } else {
+            return 1;
+        }
+    });
 #else
     cpu_set_t set;
     if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0)
diff --git a/thirdparty/embree/common/sys/sysinfo.h b/thirdparty/embree/common/sys/sysinfo.h
index 72351d12e4..cefd39a0f6 100644
--- a/thirdparty/embree/common/sys/sysinfo.h
+++ b/thirdparty/embree/common/sys/sysinfo.h
@@ -55,7 +55,12 @@
 #  define isa sse
 #  define ISA SSE
 #  define ISA_STR "SSE"
-#else 
+#elif defined(__ARM_NEON)
+// NOTE(LTE): Use sse2 for `isa` for the compatibility at the moment.
+#define isa sse2
+#define ISA NEON
+#define ISA_STR "NEON"
+#else
 #error Unknown ISA
 #endif
 
@@ -133,7 +138,9 @@ namespace embree
   static const int CPU_FEATURE_XMM_ENABLED = 1 << 25;
   static const int CPU_FEATURE_YMM_ENABLED = 1 << 26;
   static const int CPU_FEATURE_ZMM_ENABLED = 1 << 27;
- 
+  static const int CPU_FEATURE_NEON = 1 << 28;
+  static const int CPU_FEATURE_NEON_2X = 1 << 29;
+
   /*! get CPU features */
   int getCPUFeatures();
 
@@ -154,6 +161,8 @@ namespace embree
   static const int AVXI   = AVX | CPU_FEATURE_F16C | CPU_FEATURE_RDRAND;
   static const int AVX2   = AVXI | CPU_FEATURE_AVX2 | CPU_FEATURE_FMA3 | CPU_FEATURE_BMI1 | CPU_FEATURE_BMI2 | CPU_FEATURE_LZCNT;
   static const int AVX512 = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512DQ | CPU_FEATURE_AVX512CD | CPU_FEATURE_AVX512BW | CPU_FEATURE_AVX512VL | CPU_FEATURE_ZMM_ENABLED;
+  static const int NEON = CPU_FEATURE_NEON | CPU_FEATURE_SSE | CPU_FEATURE_SSE2;
+  static const int NEON_2X = CPU_FEATURE_NEON_2X | AVX2;
 
   /*! converts ISA bitvector into a string */
   std::string stringOfISA(int features);
diff --git a/thirdparty/embree/common/sys/thread.cpp b/thirdparty/embree/common/sys/thread.cpp
index f4014be89b..530c3c7810 100644
--- a/thirdparty/embree/common/sys/thread.cpp
+++ b/thirdparty/embree/common/sys/thread.cpp
@@ -10,6 +10,9 @@
 #include "../simd/arm/emulation.h"
 #else
 #include <xmmintrin.h>
+#if defined(__EMSCRIPTEN__)
+#include "../simd/wasm/emulation.h"
+#endif
 #endif
 
 #if defined(PTHREADS_WIN32)
@@ -158,9 +161,7 @@ namespace embree
 /// Linux Platform
 ////////////////////////////////////////////////////////////////////////////////
 
-// -- GODOT start --
 #if defined(__LINUX__) && !defined(__ANDROID__)
-// -- GODOT end --
 
 #include <fstream>
 #include <sstream>
@@ -219,6 +220,8 @@ namespace embree
 
     /* find correct thread to affinitize to */
     cpu_set_t set;
+    CPU_ZERO(&set);
+    
     if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0)
     {
       for (int i=0, j=0; i<CPU_SETSIZE; i++)
@@ -241,7 +244,8 @@ namespace embree
   {
     cpu_set_t cset;
     CPU_ZERO(&cset);
-    size_t threadID = mapThreadID(affinity);
+    //size_t threadID = mapThreadID(affinity); // this is not working properly in LXC containers when some processors are disabled
+    size_t threadID = affinity;
     CPU_SET(threadID, &cset);
 
     pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset);
@@ -249,7 +253,6 @@ namespace embree
 }
 #endif
 
-// -- GODOT start --
 ////////////////////////////////////////////////////////////////////////////////
 /// Android Platform
 ////////////////////////////////////////////////////////////////////////////////
@@ -269,7 +272,6 @@ namespace embree
   }
 }
 #endif
-// -- GODOT end --
 
 ////////////////////////////////////////////////////////////////////////////////
 /// FreeBSD Platform
@@ -294,6 +296,21 @@ namespace embree
 #endif
 
 ////////////////////////////////////////////////////////////////////////////////
+/// WebAssembly Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__EMSCRIPTEN__)
+namespace embree
+{
+  /*! set affinity of the calling thread */
+  void setAffinity(ssize_t affinity)
+  {
+      // Setting thread affinity is not supported in WASM.
+  }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
 /// MacOSX Platform
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -379,9 +396,7 @@ namespace embree
     pthread_attr_destroy(&attr);
 
     /* set affinity */
-// -- GODOT start --
 #if defined(__LINUX__) && !defined(__ANDROID__)
-// -- GODOT end --
     if (threadID >= 0) {
       cpu_set_t cset;
       CPU_ZERO(&cset);
@@ -396,7 +411,6 @@ namespace embree
       CPU_SET(threadID, &cset);
       pthread_setaffinity_np(*tid, sizeof(cset), &cset);
     }
-// -- GODOT start --
 #elif defined(__ANDROID__)
     if (threadID >= 0) {
       cpu_set_t cset;
@@ -405,7 +419,6 @@ namespace embree
       sched_setaffinity(pthread_gettid_np(*tid), sizeof(cset), &cset);
     }
 #endif
-// -- GODOT end --
 
     return thread_t(tid);
   }
@@ -424,14 +437,12 @@ namespace embree
 
   /*! destroy a hardware thread by its handle */
   void destroyThread(thread_t tid) {
-// -- GODOT start --
 #if defined(__ANDROID__)
-    FATAL("Can't destroy threads on Android.");
+    FATAL("Can't destroy threads on Android."); // pthread_cancel not implemented.
 #else
     pthread_cancel(*(pthread_t*)tid);
     delete (pthread_t*)tid;
 #endif
-// -- GODOT end --
   }
 
   /*! creates thread local storage */
diff --git a/thirdparty/embree/common/sys/vector.h b/thirdparty/embree/common/sys/vector.h
index f832626789..d05e1deb18 100644
--- a/thirdparty/embree/common/sys/vector.h
+++ b/thirdparty/embree/common/sys/vector.h
@@ -127,14 +127,15 @@ namespace embree
       {
         assert(!empty());
         size_active--;
-        alloc.destroy(&items[size_active]);
+        items[size_active].~T();
       }
 
       __forceinline void clear() 
       {
         /* destroy elements */
-        for (size_t i=0; i<size_active; i++)
-          alloc.destroy(&items[i]);
+        for (size_t i=0; i<size_active; i++){
+          items[i].~T();
+        }
         
         /* free memory */
         alloc.deallocate(items,size_alloced); 
@@ -178,8 +179,9 @@ namespace embree
         /* destroy elements */
         if (new_active < size_active) 
         {
-          for (size_t i=new_active; i<size_active; i++)
-            alloc.destroy(&items[i]);
+          for (size_t i=new_active; i<size_active; i++){
+            items[i].~T();
+          }
           size_active = new_active;
         }
 
@@ -195,7 +197,7 @@ namespace embree
         items = alloc.allocate(new_alloced);
         for (size_t i=0; i<size_active; i++) {
           ::new (&items[i]) T(std::move(old_items[i]));
-          alloc.destroy(&old_items[i]);
+          old_items[i].~T();
         }
 
         for (size_t i=size_active; i<new_active; i++) {
diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.h b/thirdparty/embree/common/tasking/taskschedulerinternal.h
index 8fa6bb12fa..6cc2495195 100644
--- a/thirdparty/embree/common/tasking/taskschedulerinternal.h
+++ b/thirdparty/embree/common/tasking/taskschedulerinternal.h
@@ -143,7 +143,7 @@ namespace embree
 	/* allocate new task on right side of stack */
         size_t oldStackPtr = stackPtr;
         TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction<Closure>))) ClosureTaskFunction<Closure>(closure);
-        new (&tasks[right]) Task(func,thread.task,oldStackPtr,size);
+        new (&(tasks[right.load()])) Task(func,thread.task,oldStackPtr,size);
         right++;
 
 	/* also move left pointer */
diff --git a/thirdparty/embree/common/tasking/taskschedulertbb.h b/thirdparty/embree/common/tasking/taskschedulertbb.h
index 35bd49849f..042ba7bc4c 100644
--- a/thirdparty/embree/common/tasking/taskschedulertbb.h
+++ b/thirdparty/embree/common/tasking/taskschedulertbb.h
@@ -11,14 +11,8 @@
 #include "../sys/condition.h"
 #include "../sys/ref.h"
 
-#if defined(__WIN32__)
-// -- GODOT start --
-#if !defined(NOMINMAX)
-// -- GODOT end --
+#if defined(__WIN32__) && !defined(NOMINMAX)
 #  define NOMINMAX
-// -- GODOT start --
-#endif
-// -- GODOT end --
 #endif
 
 // We need to define these to avoid implicit linkage against
diff --git a/thirdparty/embree/include/embree3/rtcore_common.h b/thirdparty/embree/include/embree3/rtcore_common.h
index 4857e1e05e..894628e47c 100644
--- a/thirdparty/embree/include/embree3/rtcore_common.h
+++ b/thirdparty/embree/include/embree3/rtcore_common.h
@@ -19,9 +19,7 @@ typedef int ssize_t;
 #endif
 #endif
 
-// -- GODOT start --
-#if defined(_WIN32) && defined(_MSC_VER)
-// -- GODOT end --
+#if defined(_WIN32) && !defined(__MINGW32__)
 #  define RTC_ALIGN(...) __declspec(align(__VA_ARGS__))
 #else
 #  define RTC_ALIGN(...) __attribute__((aligned(__VA_ARGS__)))
diff --git a/thirdparty/embree/include/embree3/rtcore_config.h b/thirdparty/embree/include/embree3/rtcore_config.h
index 62b7b6f4dc..0b399ef040 100644
--- a/thirdparty/embree/include/embree3/rtcore_config.h
+++ b/thirdparty/embree/include/embree3/rtcore_config.h
@@ -1,4 +1,3 @@
-
 // Copyright 2009-2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
@@ -6,23 +5,25 @@
 
 #define RTC_VERSION_MAJOR 3
 #define RTC_VERSION_MINOR 13
-#define RTC_VERSION_PATCH 1
-#define RTC_VERSION 31301
-#define RTC_VERSION_STRING "3.13.1"
+#define RTC_VERSION_PATCH 5
+#define RTC_VERSION 31305
+#define RTC_VERSION_STRING "3.13.5"
 
 #define RTC_MAX_INSTANCE_LEVEL_COUNT 1
 
 #define EMBREE_MIN_WIDTH 0
 #define RTC_MIN_WIDTH EMBREE_MIN_WIDTH
 
-#define EMBREE_STATIC_LIB
-/* #undef EMBREE_API_NAMESPACE */
+#if !defined(EMBREE_STATIC_LIB)
+#   define EMBREE_STATIC_LIB
+#endif
+/* #undef EMBREE_API_NAMESPACE*/
 
 #if defined(EMBREE_API_NAMESPACE)
 #  define RTC_NAMESPACE
-#  define RTC_NAMESPACE_BEGIN namespace  {
+#  define RTC_NAMESPACE_BEGIN namespace {
 #  define RTC_NAMESPACE_END }
-#  define RTC_NAMESPACE_USE using namespace ;
+#  define RTC_NAMESPACE_USE using namespace;
 #  define RTC_API_EXTERN_C
 #  undef EMBREE_API_NAMESPACE
 #else
diff --git a/thirdparty/embree/include/embree3/rtcore_quaternion.h b/thirdparty/embree/include/embree3/rtcore_quaternion.h
index 6489fa3467..bd5fe1d89a 100644
--- a/thirdparty/embree/include/embree3/rtcore_quaternion.h
+++ b/thirdparty/embree/include/embree3/rtcore_quaternion.h
@@ -8,7 +8,7 @@
 RTC_NAMESPACE_BEGIN
 
 /*
- * Structure for transformation respresentation as a matrix decomposition using
+ * Structure for transformation representation as a matrix decomposition using
  * a quaternion
  */
 struct RTC_ALIGN(16) RTCQuaternionDecomposition
diff --git a/thirdparty/embree/include/embree3/rtcore_scene.h b/thirdparty/embree/include/embree3/rtcore_scene.h
index 5878a3d402..34d87a2ce4 100644
--- a/thirdparty/embree/include/embree3/rtcore_scene.h
+++ b/thirdparty/embree/include/embree3/rtcore_scene.h
@@ -47,9 +47,12 @@ RTC_API void rtcAttachGeometryByID(RTCScene scene, RTCGeometry geometry, unsigne
 /* Detaches the geometry from the scene. */
 RTC_API void rtcDetachGeometry(RTCScene scene, unsigned int geomID);
 
-/* Gets a geometry handle from the scene. */
+/* Gets a geometry handle from the scene. This function is not thread safe and should get used during rendering. */
 RTC_API RTCGeometry rtcGetGeometry(RTCScene scene, unsigned int geomID);
 
+/* Gets a geometry handle from the scene. This function is thread safe and should NOT get used during rendering. */
+RTC_API RTCGeometry rtcGetGeometryThreadSafe(RTCScene scene, unsigned int geomID);
+
 
 /* Commits the scene. */
 RTC_API void rtcCommitScene(RTCScene scene);
diff --git a/thirdparty/embree/kernels/builders/bvh_builder_morton.h b/thirdparty/embree/kernels/builders/bvh_builder_morton.h
index 8f21e3254f..cba32ca73c 100644
--- a/thirdparty/embree/kernels/builders/bvh_builder_morton.h
+++ b/thirdparty/embree/kernels/builders/bvh_builder_morton.h
@@ -411,7 +411,7 @@ namespace embree
           ReductionTy bounds[MAX_BRANCHING_FACTOR];
           if (current.size() > singleThreadThreshold)
           {
-            /*! parallel_for is faster than spawing sub-tasks */
+            /*! parallel_for is faster than spawning sub-tasks */
             parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
                 for (size_t i=r.begin(); i<r.end(); i++) {
                   bounds[i] = recurse(depth+1,children[i],nullptr,true);
diff --git a/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h b/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h
index f9a08d65cd..6e73c0d250 100644
--- a/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h
+++ b/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h
@@ -374,7 +374,7 @@ namespace embree
 
             const size_t begin = set.begin();
             const size_t end   = set.end();
-            const size_t center = (begin + end)/2;
+            const size_t center = (begin + end + 1) / 2;
 
             PrimInfoMB linfo = empty;
             for (size_t i=begin; i<center; i++)
@@ -594,7 +594,7 @@ namespace embree
             /* spawn tasks */
             if (unlikely(current.size() > cfg.singleThreadThreshold))
             {
-              /*! parallel_for is faster than spawing sub-tasks */
+              /*! parallel_for is faster than spawning sub-tasks */
               parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
                   for (size_t i=r.begin(); i<r.end(); i++) {
                     values[i] = recurse(children[i],nullptr,true);
diff --git a/thirdparty/embree/kernels/builders/bvh_builder_sah.h b/thirdparty/embree/kernels/builders/bvh_builder_sah.h
index fff4bf2a35..24c5faf8be 100644
--- a/thirdparty/embree/kernels/builders/bvh_builder_sah.h
+++ b/thirdparty/embree/kernels/builders/bvh_builder_sah.h
@@ -298,7 +298,7 @@ namespace embree
             /* spawn tasks */
             if (current.size() > cfg.singleThreadThreshold)
             {
-              /*! parallel_for is faster than spawing sub-tasks */
+              /*! parallel_for is faster than spawning sub-tasks */
               parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { // FIXME: no range here
                   for (size_t i=r.begin(); i<r.end(); i++) {
                     values[i] = recurse(children[i],nullptr,true);
diff --git a/thirdparty/embree/kernels/builders/heuristic_binning.h b/thirdparty/embree/kernels/builders/heuristic_binning.h
index ee29d09ac9..41be6183b8 100644
--- a/thirdparty/embree/kernels/builders/heuristic_binning.h
+++ b/thirdparty/embree/kernels/builders/heuristic_binning.h
@@ -57,14 +57,12 @@ namespace embree
         __forceinline Vec3ia bin(const Vec3fa& p) const 
         {
           const vint4 i = floori((vfloat4(p)-ofs)*scale);
-#if 1
           assert(i[0] >= 0 && (size_t)i[0] < num); 
           assert(i[1] >= 0 && (size_t)i[1] < num);
           assert(i[2] >= 0 && (size_t)i[2] < num);
-          return Vec3ia(i);
-#else
+          
+          // we clamp to handle corner cases that could calculate out of bounds bin
           return Vec3ia(clamp(i,vint4(0),vint4(num-1)));
-#endif
         }
 
         /*! faster but unsafe binning */
diff --git a/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h b/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h
index 4249d16ea1..354e283557 100644
--- a/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h
+++ b/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h
@@ -275,7 +275,7 @@ namespace embree
               openNodesBasedOnExtend(set);
 #endif
 
-            /* disable opening when unsufficient space for opening a node available */
+            /* disable opening when insufficient space for opening a node available */
             if (set.ext_range_size() < max_open_size-1) 
               set.set_ext_range(set.end()); /* disable opening */
           }
diff --git a/thirdparty/embree/kernels/builders/heuristic_spatial.h b/thirdparty/embree/kernels/builders/heuristic_spatial.h
index a6939ba258..8b3499ac8d 100644
--- a/thirdparty/embree/kernels/builders/heuristic_spatial.h
+++ b/thirdparty/embree/kernels/builders/heuristic_spatial.h
@@ -159,27 +159,25 @@ namespace embree
         assert(binID < BINS);
         bounds  [binID][dim].extend(b);        
       }
-      
-      /*! bins an array of triangles */
-      template<typename SplitPrimitive>
-        __forceinline void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t N, const SpatialBinMapping<BINS>& mapping)
+
+      /*! bins an array of primitives */
+      template<typename PrimitiveSplitterFactory>
+        __forceinline void bin2(const PrimitiveSplitterFactory& splitterFactory, const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping)
       {
-        for (size_t i=0; i<N; i++)
+        for (size_t i=begin; i<end; i++)
         {
-          const PrimRef prim = prims[i];
+          const PrimRef& prim = source[i];
           unsigned splits = prim.geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
-
-          if (unlikely(splits == 1))
+          
+          if (unlikely(splits <= 1))
           {
             const vint4 bin = mapping.bin(center(prim.bounds()));
             for (size_t dim=0; dim<3; dim++) 
             {
               assert(bin[dim] >= (int)0 && bin[dim] < (int)BINS);
-              numBegin[bin[dim]][dim]++;
-              numEnd  [bin[dim]][dim]++;
-              bounds  [bin[dim]][dim].extend(prim.bounds());
+              add(dim,bin[dim],bin[dim],bin[dim],prim.bounds());
             }
-          } 
+          }
           else
           {
             const vint4 bin0 = mapping.bin(prim.bounds().lower);
@@ -187,89 +185,44 @@ namespace embree
             
             for (size_t dim=0; dim<3; dim++) 
             {
+              if (unlikely(mapping.invalid(dim))) 
+                continue;
+              
               size_t bin;
-              PrimRef rest = prim;
               size_t l = bin0[dim];
               size_t r = bin1[dim];
-
+              
               // same bin optimization
               if (likely(l == r)) 
               {
-                numBegin[l][dim]++;
-                numEnd  [l][dim]++;
-                bounds  [l][dim].extend(prim.bounds());
+                add(dim,l,l,l,prim.bounds());
                 continue;
               }
-
-              for (bin=(size_t)bin0[dim]; bin<(size_t)bin1[dim]; bin++) 
+              size_t bin_start = bin0[dim];
+              size_t bin_end   = bin1[dim];
+              BBox3fa rest = prim.bounds();
+              
+              /* assure that split position always overlaps the primitive bounds */
+              while (bin_start < bin_end && mapping.pos(bin_start+1,dim) <= rest.lower[dim]) bin_start++;
+              while (bin_start < bin_end && mapping.pos(bin_end    ,dim) >= rest.upper[dim]) bin_end--;
+              
+              const auto splitter = splitterFactory(prim);
+              for (bin=bin_start; bin<bin_end; bin++) 
               {
                 const float pos = mapping.pos(bin+1,dim);
+                BBox3fa left,right;
+                splitter(rest,dim,pos,left,right);
                 
-                PrimRef left,right;
-                splitPrimitive(rest,(int)dim,pos,left,right);
-                if (unlikely(left.bounds().empty())) l++;                
-                bounds[bin][dim].extend(left.bounds());
+                if (unlikely(left.empty())) l++;                
+                extend(dim,bin,left);
                 rest = right;
               }
-              if (unlikely(rest.bounds().empty())) r--;
-              numBegin[l][dim]++;
-              numEnd  [r][dim]++;
-              bounds  [bin][dim].extend(rest.bounds());
+              if (unlikely(rest.empty())) r--;
+              add(dim,l,r,bin,rest);
             }
-          }
+          }              
         }
       }
-      
-      /*! bins a range of primitives inside an array */
-      template<typename SplitPrimitive>
-        void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) {
-	bin(splitPrimitive,prims+begin,end-begin,mapping);
-      }
-
-      /*! bins an array of primitives */
-      template<typename PrimitiveSplitterFactory>
-        __forceinline void bin2(const PrimitiveSplitterFactory& splitterFactory, const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping)
-      {
-        for (size_t i=begin; i<end; i++)
-        {
-          const PrimRef &prim = source[i];
-          const vint4 bin0 = mapping.bin(prim.bounds().lower);
-          const vint4 bin1 = mapping.bin(prim.bounds().upper);
-          
-          for (size_t dim=0; dim<3; dim++) 
-          {
-            if (unlikely(mapping.invalid(dim))) 
-              continue;
-            
-            size_t bin;
-            size_t l = bin0[dim];
-            size_t r = bin1[dim];
-            
-            // same bin optimization
-            if (likely(l == r)) 
-            {
-              add(dim,l,l,l,prim.bounds());
-              continue;
-            }
-            const size_t bin_start = bin0[dim];
-            const size_t bin_end   = bin1[dim];
-            BBox3fa rest = prim.bounds();
-            const auto splitter = splitterFactory(prim);
-            for (bin=bin_start; bin<bin_end; bin++) 
-            {
-              const float pos = mapping.pos(bin+1,dim);
-              BBox3fa left,right;
-              splitter(rest,dim,pos,left,right);
-              if (unlikely(left.empty())) l++;                
-              extend(dim,bin,left);
-              rest = right;
-            }
-            if (unlikely(rest.empty())) r--;
-            add(dim,l,r,bin,rest);
-          }
-        }              
-      }
-
 
 
       /*! bins an array of primitives */
diff --git a/thirdparty/embree/kernels/builders/heuristic_spatial_array.h b/thirdparty/embree/kernels/builders/heuristic_spatial_array.h
index 60d235f48d..2584c19bda 100644
--- a/thirdparty/embree/kernels/builders/heuristic_spatial_array.h
+++ b/thirdparty/embree/kernels/builders/heuristic_spatial_array.h
@@ -241,7 +241,7 @@ namespace embree
           SpatialBinner binner(empty); 
           const SpatialBinMapping<SPATIAL_BINS> mapping(set);
           binner.bin2(splitterFactory,prims0,set.begin(),set.end(),mapping);
-          /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/
+          /* todo: best spatial split not exceeding the extended range does not provide any benefit ?*/
           return binner.best(mapping,logBlockSize); //,set.ext_size());
         }
 
@@ -256,7 +256,7 @@ namespace embree
                                      binner.bin2(splitterFactory,prims0,r.begin(),r.end(),_mapping);
                                      return binner; },
                                    [&] (const SpatialBinner& b0, const SpatialBinner& b1) -> SpatialBinner { return SpatialBinner::reduce(b0,b1); });
-          /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/
+          /* todo: best spatial split not exceeding the extended range does not provide any benefit ?*/
           return binner.best(mapping,logBlockSize); //,set.ext_size());
         }
 
@@ -286,6 +286,7 @@ namespace embree
                 //int bin0 = split.mapping.bin(prims0[i].lower)[split.dim];
                 //int bin1 = split.mapping.bin(prims0[i].upper)[split.dim];
                 //if (unlikely(bin0 < split.pos && bin1 >= split.pos))
+
                 if (unlikely(prims0[i].lower[split.dim] < fpos && prims0[i].upper[split.dim] > fpos))
                 {
                   assert(splits > 1);
@@ -384,8 +385,8 @@ namespace embree
           new (&lset) PrimInfoExtRange(begin,center,center,local_left);
           new (&rset) PrimInfoExtRange(center,end,end,local_right);
 
-          assert(area(lset.geomBounds) >= 0.0f);
-          assert(area(rset.geomBounds) >= 0.0f);
+          assert(!lset.geomBounds.empty() && area(lset.geomBounds) >= 0.0f);
+          assert(!rset.geomBounds.empty() && area(rset.geomBounds) >= 0.0f);
           return std::pair<size_t,size_t>(left_weight,right_weight);
         }
 
@@ -410,7 +411,7 @@ namespace embree
                                               begin,end,local_left,local_right,
                                               [&] (const PrimRef& ref) {
                                                 const Vec3fa c = ref.bounds().center();
-                                                return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); 
+                                                return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask);
                                               },
                                               [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); });
 
@@ -419,8 +420,8 @@ namespace embree
           
           new (&lset) PrimInfoExtRange(begin,center,center,local_left);
           new (&rset) PrimInfoExtRange(center,end,end,local_right);
-          assert(area(lset.geomBounds) >= 0.0f);
-          assert(area(rset.geomBounds) >= 0.0f);
+          assert(!lset.geomBounds.empty() && area(lset.geomBounds) >= 0.0f);
+          assert(!rset.geomBounds.empty() && area(rset.geomBounds) >= 0.0f);
           return std::pair<size_t,size_t>(left_weight,right_weight);
         }
 
diff --git a/thirdparty/embree/kernels/builders/primrefgen.cpp b/thirdparty/embree/kernels/builders/primrefgen.cpp
index d279dc4993..e2d7c27bd8 100644
--- a/thirdparty/embree/kernels/builders/primrefgen.cpp
+++ b/thirdparty/embree/kernels/builders/primrefgen.cpp
@@ -184,9 +184,7 @@ namespace embree
 
     // special variants for grid meshes
 
-// -- GODOT start --
 #if defined(EMBREE_GEOMETRY_GRID)
-// -- GODOT end --
     PrimInfo createPrimRefArrayGrids(Scene* scene, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids)
     {
       PrimInfo pinfo(empty);
@@ -296,9 +294,7 @@ namespace embree
 
       return pinfo;
     }
-// -- GODOT start --
 #endif
-// -- GODOT end --
     
     // ====================================================================================================
     // ====================================================================================================
diff --git a/thirdparty/embree/kernels/builders/primrefgen_presplit.h b/thirdparty/embree/kernels/builders/primrefgen_presplit.h
index 8cd251ddd2..aa2026a85e 100644
--- a/thirdparty/embree/kernels/builders/primrefgen_presplit.h
+++ b/thirdparty/embree/kernels/builders/primrefgen_presplit.h
@@ -266,7 +266,7 @@ namespace embree
       /* anything to split ? */
       if (center < numPrimitives)
       {
-        const size_t numPrimitivesToSplit = numPrimitives - center;
+        size_t numPrimitivesToSplit = numPrimitives - center;
         assert(presplitItem[center].priority >= 1.0f);
 
         /* sort presplit items in ascending order */
@@ -279,8 +279,8 @@ namespace embree
             });
           );
 
-        unsigned int *const primOffset0 = (unsigned int*)tmp_presplitItem;
-        unsigned int *const primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit;
+        unsigned int* primOffset0 = (unsigned int*)tmp_presplitItem;
+        unsigned int* primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit;
 
         /* compute actual number of sub-primitives generated within the [center;numPrimitives-1] range */
         const size_t totalNumSubPrims = parallel_reduce( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), size_t(0), [&](const range<size_t>& t) -> size_t {
@@ -317,11 +317,16 @@ namespace embree
             sum += numSubPrims;
           }
           new_center++;
+
+          primOffset0 += new_center - center;
+          numPrimitivesToSplit -= new_center - center;
           center = new_center;
+          assert(numPrimitivesToSplit == (numPrimitives - center));
         }
 
         /* parallel prefix sum to compute offsets for storing sub-primitives */
         const unsigned int offset = parallel_prefix_sum(primOffset0,primOffset1,numPrimitivesToSplit,(unsigned int)0,std::plus<unsigned int>());
+        assert(numPrimitives+offset <= alloc_numPrimitives);
 
         /* iterate over range, and split primitives into sub primitives and append them to prims array */		    
         parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& rn) -> void {
@@ -338,7 +343,7 @@ namespace embree
               unsigned int numSubPrims = 0;
               splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
               const size_t newID = numPrimitives + primOffset1[j-center];              
-              assert(newID+numSubPrims <= alloc_numPrimitives);
+              assert(newID+numSubPrims-1 <= alloc_numPrimitives);
               prims[primrefID] = subPrims[0];
               for (size_t i=1;i<numSubPrims;i++)
                 prims[newID+i-1] = subPrims[i];
diff --git a/thirdparty/embree/kernels/builders/splitter.h b/thirdparty/embree/kernels/builders/splitter.h
index f7720bd284..da89d0b178 100644
--- a/thirdparty/embree/kernels/builders/splitter.h
+++ b/thirdparty/embree/kernels/builders/splitter.h
@@ -128,28 +128,30 @@ namespace embree
         const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;
         const QuadMesh* mesh = (const QuadMesh*) scene->get(prim.geomID() & mask );  
         QuadMesh::Quad quad = mesh->quad(prim.primID());
-        v[0] = mesh->vertex(quad.v[0]);
-        v[1] = mesh->vertex(quad.v[1]);
-        v[2] = mesh->vertex(quad.v[2]);
-        v[3] = mesh->vertex(quad.v[3]);
-        v[4] = mesh->vertex(quad.v[0]);
-        inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]);
-        inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]);
-        inv_length[2] = Vec3fa(1.0f) / (v[3]-v[2]);
-        inv_length[3] = Vec3fa(1.0f) / (v[0]-v[3]);
+        v[0] = mesh->vertex(quad.v[1]);
+        v[1] = mesh->vertex(quad.v[2]);
+        v[2] = mesh->vertex(quad.v[3]);
+        v[3] = mesh->vertex(quad.v[0]);
+        v[4] = mesh->vertex(quad.v[1]);
+        v[5] = mesh->vertex(quad.v[3]);
+        inv_length[0] = Vec3fa(1.0f) / (v[1] - v[0]);
+        inv_length[1] = Vec3fa(1.0f) / (v[2] - v[1]);
+        inv_length[2] = Vec3fa(1.0f) / (v[3] - v[2]);
+        inv_length[3] = Vec3fa(1.0f) / (v[4] - v[3]);
+        inv_length[4] = Vec3fa(1.0f) / (v[5] - v[4]);
       }
       
       __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const {
-        splitPolygon<4>(prim,dim,pos,v,left_o,right_o);
+        splitPolygon<5>(prim,dim,pos,v,left_o,right_o);
       }
       
       __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const {
-        splitPolygon<4>(prim,dim,pos,v,inv_length,left_o,right_o);
+        splitPolygon<5>(prim,dim,pos,v,inv_length,left_o,right_o);
       }
       
     private:
-      Vec3fa v[5];
-      Vec3fa inv_length[4];
+      Vec3fa v[6];
+      Vec3fa inv_length[5];
     };
     
     struct QuadSplitterFactory
diff --git a/thirdparty/embree/kernels/bvh/bvh.cpp b/thirdparty/embree/kernels/bvh/bvh.cpp
index a84295f0da..f6cf626465 100644
--- a/thirdparty/embree/kernels/bvh/bvh.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh.cpp
@@ -183,7 +183,7 @@ namespace embree
   template class BVHN<8>;
 #endif
 
-#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)
+#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) || defined(__aarch64__)
   template class BVHN<4>;
 #endif
 }
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp
index 6e9a5a538e..1d393fd06b 100644
--- a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp
@@ -230,7 +230,7 @@ namespace embree
             continue;
 
           /* switch to single ray traversal */
-#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__)
+#if (!defined(__WIN32__) || defined(__X86_64__)) && ((defined(__aarch64__)) || defined(__SSE4_2__))
 #if FORCE_SINGLE_MODE == 0
           if (single)
 #endif
@@ -676,7 +676,7 @@ namespace embree
           continue;
 
         /* switch to single ray traversal */
-#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__)
+#if (!defined(__WIN32__) || defined(__X86_64__)) && ((defined(__aarch64__)) || defined(__SSE4_2__))
 #if FORCE_SINGLE_MODE == 0
         if (single)
 #endif
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h
index 717f559677..c7e040fadb 100644
--- a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h
@@ -170,12 +170,23 @@ namespace embree
           TravRayKStream<K,robust> &p = packets[rayID / K];
           const size_t i = rayID % K;
           const vint<N> bitmask(shiftTable[rayID]);
+
+#if defined (__aarch64__)
+          const vfloat<N> tNearX = madd(bminX, p.rdir.x[i], p.neg_org_rdir.x[i]);
+          const vfloat<N> tNearY = madd(bminY, p.rdir.y[i], p.neg_org_rdir.y[i]);
+          const vfloat<N> tNearZ = madd(bminZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
+          const vfloat<N> tFarX  = madd(bmaxX, p.rdir.x[i], p.neg_org_rdir.x[i]);
+          const vfloat<N> tFarY  = madd(bmaxY, p.rdir.y[i], p.neg_org_rdir.y[i]);
+          const vfloat<N> tFarZ  = madd(bmaxZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
+#else
           const vfloat<N> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]);
           const vfloat<N> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]);
           const vfloat<N> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]);
           const vfloat<N> tFarX  = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]);
           const vfloat<N> tFarY  = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]);
           const vfloat<N> tFarZ  = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]); 
+#endif
+
           const vfloat<N> tNear  = maxi(tNearX, tNearY, tNearZ, vfloat<N>(p.tnear[i]));
           const vfloat<N> tFar   = mini(tFarX , tFarY , tFarZ,  vfloat<N>(p.tfar[i]));      
 
diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb.h
index 57530692bc..3fd9fc7d18 100644
--- a/thirdparty/embree/kernels/bvh/bvh_node_aabb.h
+++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb.h
@@ -46,6 +46,14 @@ namespace embree
       template<typename BuildRecord>
       __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
       {
+#if defined(DEBUG)
+        // check that empty children are only at the end of the child list
+        bool emptyChild = false;
+        for (size_t i=0; i<num; i++) {
+          emptyChild |= (children[i] == NodeRef::emptyNode);
+          assert(emptyChild == (children[i] == NodeRef::emptyNode));
+        }
+#endif
         AABBNode_t* node = ref.getAABBNode();
         for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
         return ref;
@@ -60,6 +68,14 @@ namespace embree
       template<typename BuildRecord>
       __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
       {
+#if defined(DEBUG)
+        // check that empty children are only at the end of the child list
+        bool emptyChild = false;
+        for (size_t i=0; i<num; i++) {
+          emptyChild |= (children[i] == NodeRef::emptyNode);
+          assert(emptyChild == (children[i] == NodeRef::emptyNode));
+        }
+#endif
         AABBNode_t* node = ref.getAABBNode();
         for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
         
diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h
index c4cea7d8ba..001f526c25 100644
--- a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h
+++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h
@@ -31,6 +31,14 @@ namespace embree
       template<typename BuildRecord>
       __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const
       {
+#if defined(DEBUG)
+        // check that empty children are only at the end of the child list
+        bool emptyChild = false;
+        for (size_t i=0; i<num; i++) {
+          emptyChild |= (children[i].ref == NodeRef::emptyNode);
+          assert(emptyChild == (children[i].ref == NodeRef::emptyNode));
+        }
+#endif
         AABBNodeMB_t* node = ref.getAABBNodeMB();
         
         LBBox3fa bounds = empty;
diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h
index 46a81d7581..3b966fd054 100644
--- a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h
+++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h
@@ -41,6 +41,14 @@ namespace embree
       template<typename BuildRecord>
       __forceinline void operator() (const BuildRecord&, const BuildRecord*, NodeRef ref, NodeRecordMB4D* children, const size_t num) const
       {
+#if defined(DEBUG)
+        // check that empty children are only at the end of the child list
+        bool emptyChild = false;
+        for (size_t i=0; i<num; i++) {
+          emptyChild |= (children[i].ref == NodeRef::emptyNode);
+          assert(emptyChild == (children[i].ref == NodeRef::emptyNode));
+        }
+#endif
         if (likely(ref.isAABBNodeMB())) {
           for (size_t i=0; i<num; i++)
             ref.getAABBNodeMB()->set(i, children[i]);
diff --git a/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h b/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h
index 2afc8c98e7..99671ddc5a 100644
--- a/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h
+++ b/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h
@@ -190,6 +190,14 @@ namespace embree
       template<typename BuildRecord>
       __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
       {
+#if defined(DEBUG)
+        // check that empty children are only at the end of the child list
+        bool emptyChild = false;
+        for (size_t i=0; i<num; i++) {
+          emptyChild |= (children[i] == NodeRef::emptyNode);
+          assert(emptyChild == (children[i] == NodeRef::emptyNode));
+        }
+#endif
         QuantizedNode_t* node = ref.quantizedNode();
         for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
         return ref;
diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
index d857ff7d95..57f75bfd7e 100644
--- a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
@@ -162,7 +162,7 @@ namespace embree
   template class BVHNStatistics<8>;
 #endif
 
-#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)
+#if !defined(__AVX__) || (!defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)) || defined(__aarch64__)
   template class BVHNStatistics<4>;
 #endif
 }
diff --git a/thirdparty/embree/kernels/bvh/node_intersector1.h b/thirdparty/embree/kernels/bvh/node_intersector1.h
index 1ec4fc63fc..17641fa888 100644
--- a/thirdparty/embree/kernels/bvh/node_intersector1.h
+++ b/thirdparty/embree/kernels/bvh/node_intersector1.h
@@ -5,6 +5,15 @@
 
 #include "node_intersector.h"
 
+#if defined(__AVX2__)
+#define __FMA_X4__
+#endif
+
+#if defined(__aarch64__)
+#define __FMA_X4__
+#endif
+
+
 namespace embree
 {
   namespace isa
@@ -29,9 +38,15 @@ namespace embree
         org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z);
         dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z);
         rdir = Vec3vf<N>(ray_rdir.x,ray_rdir.y,ray_rdir.z);
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__FMA_X4__)
         const Vec3fa ray_org_rdir = ray_org*ray_rdir;
+#if !defined(__aarch64__)
         org_rdir = Vec3vf<N>(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z);
+#else
+          //for aarch64, we do not have msub equal instruction, so we negeate orig and use madd
+          //x86 will use msub
+        neg_org_rdir = Vec3vf<N>(-ray_org_rdir.x,-ray_org_rdir.y,-ray_org_rdir.z);
+#endif
 #endif
         nearX = ray_rdir.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>);
         nearY = ray_rdir.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>);
@@ -49,8 +64,12 @@ namespace embree
         org  = Vec3vf<N>(ray_org.x[k], ray_org.y[k], ray_org.z[k]);
         dir  = Vec3vf<N>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]);
         rdir = Vec3vf<N>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]);
-#if defined(__AVX2__) || defined(__ARM_NEON)
-	org_rdir = org*rdir;
+#if defined(__FMA_X4__)
+#if !defined(__aarch64__)
+        org_rdir = org*rdir;
+#else
+        neg_org_rdir = -(org*rdir);
+#endif
 #endif
 	nearX = nearXYZ.x[k];
 	nearY = nearXYZ.y[k];
@@ -62,8 +81,14 @@ namespace embree
 
       Vec3fa org_xyz, dir_xyz;
       Vec3vf<N> org, dir, rdir;
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__FMA_X4__)
+#if !defined(__aarch64__)
       Vec3vf<N> org_rdir;
+#else
+        //aarch64 version are keeping negation of the org_rdir and use madd
+        //x86 uses msub
+      Vec3vf<N> neg_org_rdir;
+#endif
 #endif
       size_t nearX, nearY, nearZ;
       size_t farX, farY, farZ;
@@ -404,13 +429,22 @@ namespace embree
     template<>
       __forceinline size_t intersectNode<4>(const typename BVH4::AABBNode* node, const TravRay<4,false>& ray, vfloat4& dist)
     {
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__FMA_X4__)
+#if defined(__aarch64__)
+      const vfloat4 tNearX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat4 tNearY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat4 tNearZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat4 tFarX  = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat4 tFarY  = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat4 tFarZ  = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z);
+#else
       const vfloat4 tNearX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
       const vfloat4 tNearY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
       const vfloat4 tNearZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
       const vfloat4 tFarX  = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
       const vfloat4 tFarY  = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
       const vfloat4 tFarZ  = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
+#endif
 #else
       const vfloat4 tNearX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x;
       const vfloat4 tNearY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y;
@@ -450,13 +484,23 @@ namespace embree
     template<>
       __forceinline size_t intersectNode<8>(const typename BVH8::AABBNode* node, const TravRay<8,false>& ray, vfloat8& dist)
     {
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__AVX2__)
+#if defined(__aarch64__)
+      const vfloat8 tNearX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat8 tNearY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat8 tNearZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat8 tFarX  = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat8 tFarY  = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat8 tFarZ  = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z);
+#else
       const vfloat8 tNearX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
       const vfloat8 tNearY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
       const vfloat8 tNearZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
       const vfloat8 tFarX  = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
       const vfloat8 tFarY  = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
       const vfloat8 tFarZ  = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
+#endif
+
 #else
       const vfloat8 tNearX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x;
       const vfloat8 tNearY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y;
@@ -522,13 +566,22 @@ namespace embree
       const vfloat<N>* pFarX  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
       const vfloat<N>* pFarY  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
       const vfloat<N>* pFarZ  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__FMA_X4__)
+#if defined(__aarch64__)
+      const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<N> tFarX  = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tFarY  = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tFarZ  = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z);
+#else
       const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x);
       const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y);
       const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z);
       const vfloat<N> tFarX  = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x);
       const vfloat<N> tFarY  = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y);
       const vfloat<N> tFarZ  = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z);
+#endif
 #else
       const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x;
       const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y;
@@ -537,7 +590,7 @@ namespace embree
       const vfloat<N> tFarY  = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y;
       const vfloat<N> tFarZ  = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z;
 #endif
-#if defined(__AVX2__) && !defined(__AVX512F__) // HSW
+#if defined(__FMA_X4__) && !defined(__AVX512F__) // HSW
       const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
       const vfloat<N> tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
       const vbool<N> vmask = asInt(tNear) > asInt(tFar);
@@ -598,13 +651,22 @@ namespace embree
       const vfloat<N>* pFarX  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
       const vfloat<N>* pFarY  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
       const vfloat<N>* pFarZ  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
-#if defined (__AVX2__) || defined(__ARM_NEON)
+#if defined (__FMA_X4__)
+#if defined(__aarch64__)
+      const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<N> tFarX  = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tFarY  = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tFarZ  = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z);
+#else
       const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x);
       const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y);
       const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z);
       const vfloat<N> tFarX  = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x);
       const vfloat<N> tFarY  = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y);
       const vfloat<N> tFarZ  = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z);
+#endif
 #else
       const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x;
       const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y;
@@ -613,7 +675,7 @@ namespace embree
       const vfloat<N> tFarY  = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y;
       const vfloat<N> tFarZ  = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z;
 #endif
-#if defined(__AVX2__) && !defined(__AVX512F__)
+#if defined(__FMA_X4__) && !defined(__AVX512F__)
       const vfloat<N> tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray.tnear));
       const vfloat<N> tFar  = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray.tfar ));
 #else
@@ -687,13 +749,22 @@ namespace embree
       const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z);
       const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ  >> 2),scale_z,start_z);
 
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__FMA_X4__)
+#if defined(__aarch64__)
+      const vfloat4 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat4 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat4 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat4 tFarX  = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat4 tFarY  = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat4 tFarZ  = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#else
       const vfloat4 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
       const vfloat4 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
       const vfloat4 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
       const vfloat4 tFarX  = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
       const vfloat4 tFarY  = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
       const vfloat4 tFarZ  = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#endif
 #else
       const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir.x;
       const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir.y;
@@ -703,7 +774,7 @@ namespace embree
       const vfloat4 tFarZ  = (upper_z - ray.org.z) * ray.rdir.z;
 #endif
       
-#if defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW
+#if defined(__aarch64__) || defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW
       const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
       const vfloat4 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
       const vbool4 vmask = asInt(tNear) > asInt(tFar);
@@ -775,13 +846,22 @@ namespace embree
       const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z);
       const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ  >> 2),scale_z,start_z);
 
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__AVX2__)
+#if defined(__aarch64__)
+      const vfloat8 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat8 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat8 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat8 tFarX  = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat8 tFarY  = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat8 tFarZ  = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#else
       const vfloat8 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
       const vfloat8 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
       const vfloat8 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
       const vfloat8 tFarX  = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
       const vfloat8 tFarY  = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
       const vfloat8 tFarZ  = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#endif
 #else
       const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir.x;
       const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir.y;
@@ -857,13 +937,22 @@ namespace embree
       const vfloat<N> upper_y   = node->dequantizeUpperY(time);
       const vfloat<N> lower_z   = node->dequantizeLowerZ(time);
       const vfloat<N> upper_z   = node->dequantizeUpperZ(time);     
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__FMA_X4__)
+#if defined(__aarch64__)
+      const vfloat<N> tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<N> tFarX  = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tFarY  = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tFarZ  = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#else
       const vfloat<N> tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
       const vfloat<N> tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
       const vfloat<N> tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
       const vfloat<N> tFarX  = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
       const vfloat<N> tFarY  = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
       const vfloat<N> tFarZ  = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#endif
 #else
       const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir.x;
       const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir.y;
diff --git a/thirdparty/embree/kernels/bvh/node_intersector_frustum.h b/thirdparty/embree/kernels/bvh/node_intersector_frustum.h
index 1f7215e5df..cad4e6de2d 100644
--- a/thirdparty/embree/kernels/bvh/node_intersector_frustum.h
+++ b/thirdparty/embree/kernels/bvh/node_intersector_frustum.h
@@ -75,9 +75,13 @@ namespace embree
         min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir);
         max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir);
 
+#if defined (__aarch64__)
+        neg_min_org_rdir = -(min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org));
+        neg_max_org_rdir = -(max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org));
+#else
         min_org_rdir = min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org);
         max_org_rdir = max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org);
-
+#endif
         min_dist = reduced_min_dist;
         max_dist = reduced_max_dist;
 
@@ -95,9 +99,13 @@ namespace embree
       Vec3fa min_rdir;
       Vec3fa max_rdir;
 
+#if defined (__aarch64__)
+      Vec3fa neg_min_org_rdir;
+      Vec3fa neg_max_org_rdir;
+#else
       Vec3fa min_org_rdir;
       Vec3fa max_org_rdir;
-
+#endif
       float min_dist;
       float max_dist;
     };
@@ -191,13 +199,21 @@ namespace embree
       const vfloat<N> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY);
       const vfloat<N> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ);
 
+#if defined (__aarch64__)
+      const vfloat<N> fminX = madd(bminX, vfloat<N>(frustum.min_rdir.x), vfloat<N>(frustum.neg_min_org_rdir.x));
+      const vfloat<N> fminY = madd(bminY, vfloat<N>(frustum.min_rdir.y), vfloat<N>(frustum.neg_min_org_rdir.y));
+      const vfloat<N> fminZ = madd(bminZ, vfloat<N>(frustum.min_rdir.z), vfloat<N>(frustum.neg_min_org_rdir.z));
+      const vfloat<N> fmaxX = madd(bmaxX, vfloat<N>(frustum.max_rdir.x), vfloat<N>(frustum.neg_max_org_rdir.x));
+      const vfloat<N> fmaxY = madd(bmaxY, vfloat<N>(frustum.max_rdir.y), vfloat<N>(frustum.neg_max_org_rdir.y));
+      const vfloat<N> fmaxZ = madd(bmaxZ, vfloat<N>(frustum.max_rdir.z), vfloat<N>(frustum.neg_max_org_rdir.z));
+#else
       const vfloat<N> fminX = msub(bminX, vfloat<N>(frustum.min_rdir.x), vfloat<N>(frustum.min_org_rdir.x));
       const vfloat<N> fminY = msub(bminY, vfloat<N>(frustum.min_rdir.y), vfloat<N>(frustum.min_org_rdir.y));
       const vfloat<N> fminZ = msub(bminZ, vfloat<N>(frustum.min_rdir.z), vfloat<N>(frustum.min_org_rdir.z));
       const vfloat<N> fmaxX = msub(bmaxX, vfloat<N>(frustum.max_rdir.x), vfloat<N>(frustum.max_org_rdir.x));
       const vfloat<N> fmaxY = msub(bmaxY, vfloat<N>(frustum.max_rdir.y), vfloat<N>(frustum.max_org_rdir.y));
       const vfloat<N> fmaxZ = msub(bmaxZ, vfloat<N>(frustum.max_rdir.z), vfloat<N>(frustum.max_org_rdir.z));
-
+#endif
       const vfloat<N> fmin  = maxi(fminX, fminY, fminZ, vfloat<N>(frustum.min_dist));
       dist = fmin;
       const vfloat<N> fmax  = mini(fmaxX, fmaxY, fmaxZ, vfloat<N>(frustum.max_dist));
diff --git a/thirdparty/embree/kernels/bvh/node_intersector_packet.h b/thirdparty/embree/kernels/bvh/node_intersector_packet.h
index d5498fc5db..4deacd620d 100644
--- a/thirdparty/embree/kernels/bvh/node_intersector_packet.h
+++ b/thirdparty/embree/kernels/bvh/node_intersector_packet.h
@@ -39,7 +39,9 @@ namespace embree
         org = ray_org;
         dir = ray_dir;
         rdir = rcp_safe(ray_dir);
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__aarch64__)
+        neg_org_rdir = -(org * rdir);
+#elif defined(__AVX2__)
         org_rdir = org * rdir;
 #endif
 
@@ -55,7 +57,9 @@ namespace embree
       Vec3vf<K> org;
       Vec3vf<K> dir;
       Vec3vf<K> rdir;
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__aarch64__)
+      Vec3vf<K> neg_org_rdir;
+#elif defined(__AVX2__)
       Vec3vf<K> org_rdir;
 #endif
       Vec3vi<K> nearXYZ;
@@ -119,7 +123,14 @@ namespace embree
                                          const TravRayKFast<K>& ray, vfloat<K>& dist)
 
     {
-  #if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__aarch64__)
+      const vfloat<K> lclipMinX = madd(node->lower_x[i], ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMinY = madd(node->lower_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMinZ = madd(node->lower_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> lclipMaxX = madd(node->upper_x[i], ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMaxY = madd(node->upper_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMaxZ = madd(node->upper_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__)
       const vfloat<K> lclipMinX = msub(node->lower_x[i], ray.rdir.x, ray.org_rdir.x);
       const vfloat<K> lclipMinY = msub(node->lower_y[i], ray.rdir.y, ray.org_rdir.y);
       const vfloat<K> lclipMinZ = msub(node->lower_z[i], ray.rdir.z, ray.org_rdir.z);
@@ -199,7 +210,14 @@ namespace embree
       const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
       const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
 
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__aarch64__)
+      const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__)
       const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x);
       const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y);
       const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z);
@@ -302,7 +320,14 @@ namespace embree
       const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
       const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
 
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__aarch64__)
+      const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__)
       const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x);
       const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y);
       const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z);
@@ -464,7 +489,14 @@ namespace embree
       const vfloat<N> lower_z = node->dequantizeLowerZ();
       const vfloat<N> upper_z = node->dequantizeUpperZ();
 
-  #if defined(__AVX2__) || defined(__ARM_NEON)
+  #if defined(__aarch64__)
+      const vfloat<K> lclipMinX = madd(lower_x[i], ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMinY = madd(lower_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMinZ = madd(lower_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> lclipMaxX = madd(upper_x[i], ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMaxY = madd(upper_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMaxZ = madd(upper_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+  #elif defined(__AVX2__)
       const vfloat<K> lclipMinX = msub(lower_x[i], ray.rdir.x, ray.org_rdir.x);
       const vfloat<K> lclipMinY = msub(lower_y[i], ray.rdir.y, ray.org_rdir.y);
       const vfloat<K> lclipMinZ = msub(lower_z[i], ray.rdir.z, ray.org_rdir.z);
@@ -549,7 +581,14 @@ namespace embree
         const vfloat<K> lower_z = node->template dequantizeLowerZ<K>(i,time);
         const vfloat<K> upper_z = node->template dequantizeUpperZ<K>(i,time);
         
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__aarch64__)
+        const vfloat<K> lclipMinX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+        const vfloat<K> lclipMinY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+        const vfloat<K> lclipMinZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+        const vfloat<K> lclipMaxX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+        const vfloat<K> lclipMaxY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+        const vfloat<K> lclipMaxZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__)
         const vfloat<K> lclipMinX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
         const vfloat<K> lclipMinY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
         const vfloat<K> lclipMinZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
diff --git a/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h b/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h
index 55b2c27231..943fd7043f 100644
--- a/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h
+++ b/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h
@@ -32,11 +32,19 @@ namespace embree
       __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir)
       {
         rdir = rcp_safe(ray_dir);
+#if defined(__aarch64__)
+        neg_org_rdir = -(ray_org * rdir);
+#else
         org_rdir = ray_org * rdir;
+#endif
       }
 
       Vec3vf<K> rdir;
+#if defined(__aarch64__)
+      Vec3vf<K> neg_org_rdir;
+#else
       Vec3vf<K> org_rdir;
+#endif
       vfloat<K> tnear;
       vfloat<K> tfar;
     };
@@ -87,12 +95,21 @@ namespace embree
       const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
       const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
 
+#if defined (__aarch64__)
+      const vfloat<N> rminX = madd(bminX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.neg_org_rdir.x[k]));
+      const vfloat<N> rminY = madd(bminY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.neg_org_rdir.y[k]));
+      const vfloat<N> rminZ = madd(bminZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.neg_org_rdir.z[k]));
+      const vfloat<N> rmaxX = madd(bmaxX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.neg_org_rdir.x[k]));
+      const vfloat<N> rmaxY = madd(bmaxY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.neg_org_rdir.y[k]));
+      const vfloat<N> rmaxZ = madd(bmaxZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.neg_org_rdir.z[k]));
+#else
       const vfloat<N> rminX = msub(bminX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.org_rdir.x[k]));
       const vfloat<N> rminY = msub(bminY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.org_rdir.y[k]));
       const vfloat<N> rminZ = msub(bminZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.org_rdir.z[k]));
       const vfloat<N> rmaxX = msub(bmaxX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.org_rdir.x[k]));
       const vfloat<N> rmaxY = msub(bmaxY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.org_rdir.y[k]));
       const vfloat<N> rmaxZ = msub(bmaxZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.org_rdir.z[k]));
+#endif
       const vfloat<N> rmin  = maxi(rminX, rminY, rminZ, vfloat<N>(ray.tnear[k]));
       const vfloat<N> rmax  = mini(rmaxX, rmaxY, rmaxZ, vfloat<N>(ray.tfar[k]));
 
@@ -113,12 +130,21 @@ namespace embree
       const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY);
       const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ);
 
+#if defined (__aarch64__)
+      const vfloat<K> rminX = madd(bminX, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> rminY = madd(bminY, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> rminZ = madd(bminZ, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> rmaxX = madd(bmaxX, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> rmaxY = madd(bmaxY, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> rmaxZ = madd(bmaxZ, ray.rdir.z, ray.neg_org_rdir.z);
+#else
       const vfloat<K> rminX = msub(bminX, ray.rdir.x, ray.org_rdir.x);
       const vfloat<K> rminY = msub(bminY, ray.rdir.y, ray.org_rdir.y);
       const vfloat<K> rminZ = msub(bminZ, ray.rdir.z, ray.org_rdir.z);
       const vfloat<K> rmaxX = msub(bmaxX, ray.rdir.x, ray.org_rdir.x);
       const vfloat<K> rmaxY = msub(bmaxY, ray.rdir.y, ray.org_rdir.y);
       const vfloat<K> rmaxZ = msub(bmaxZ, ray.rdir.z, ray.org_rdir.z);
+#endif
 
       const vfloat<K> rmin  = maxi(rminX, rminY, rminZ, ray.tnear);
       const vfloat<K> rmax  = mini(rmaxX, rmaxY, rmaxZ, ray.tfar);
diff --git a/thirdparty/embree/kernels/common/accel.h b/thirdparty/embree/kernels/common/accel.h
index cc4ea1805b..d24326ce92 100644
--- a/thirdparty/embree/kernels/common/accel.h
+++ b/thirdparty/embree/kernels/common/accel.h
@@ -332,7 +332,7 @@ namespace embree
         intersectorN.intersect(this,rayN,N,context);
       }
       
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
       __forceinline void intersect(const vbool4& valid, RayHitK<4>& ray, IntersectContext* context) {
         const vint<4> mask = valid.mask32();
         intersect4(&mask,(RTCRayHit4&)ray,context);
@@ -388,7 +388,7 @@ namespace embree
         intersectorN.occluded(this,rayN,N,context);
       }
       
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
       __forceinline void occluded(const vbool4& valid, RayK<4>& ray, IntersectContext* context) {
         const vint<4> mask = valid.mask32();
         occluded4(&mask,(RTCRay4&)ray,context);
diff --git a/thirdparty/embree/kernels/common/acceln.cpp b/thirdparty/embree/kernels/common/acceln.cpp
index 32a27c560a..111c62083d 100644
--- a/thirdparty/embree/kernels/common/acceln.cpp
+++ b/thirdparty/embree/kernels/common/acceln.cpp
@@ -97,7 +97,7 @@ namespace embree
     for (size_t i=0; i<This->accels.size(); i++) {
       if (This->accels[i]->isEmpty()) continue;
       This->accels[i]->intersectors.occluded4(valid,ray,context);
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(__ARM_NEON)
       vbool4 valid0 = asBool(((vint4*)valid)[0]);
       vbool4 hit0   = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
       if (unlikely(none(valid0 & hit0))) break;
@@ -111,7 +111,7 @@ namespace embree
     for (size_t i=0; i<This->accels.size(); i++) {
       if (This->accels[i]->isEmpty()) continue;
       This->accels[i]->intersectors.occluded8(valid,ray,context);
-#if defined(__SSE2__) // FIXME: use higher ISA
+#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA
       vbool4 valid0 = asBool(((vint4*)valid)[0]);
       vbool4 hit0   = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
       vbool4 valid1 = asBool(((vint4*)valid)[1]);
@@ -127,7 +127,7 @@ namespace embree
     for (size_t i=0; i<This->accels.size(); i++) {
       if (This->accels[i]->isEmpty()) continue;
       This->accels[i]->intersectors.occluded16(valid,ray,context);
-#if defined(__SSE2__) // FIXME: use higher ISA
+#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA
       vbool4 valid0 = asBool(((vint4*)valid)[0]);
       vbool4 hit0   = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
       vbool4 valid1 = asBool(((vint4*)valid)[1]);
diff --git a/thirdparty/embree/kernels/common/accelset.h b/thirdparty/embree/kernels/common/accelset.h
index 90b184a07b..1b67120c97 100644
--- a/thirdparty/embree/kernels/common/accelset.h
+++ b/thirdparty/embree/kernels/common/accelset.h
@@ -14,21 +14,14 @@ namespace embree
   struct IntersectFunctionNArguments;
   struct OccludedFunctionNArguments;
   
-  typedef void (*ReportIntersectionFunc) (IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args);
-  typedef void (*ReportOcclusionFunc) (OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args);
-  
   struct IntersectFunctionNArguments : public RTCIntersectFunctionNArguments
   {
-    IntersectContext* internal_context;
     Geometry* geometry;
-    ReportIntersectionFunc report;
   };
 
   struct OccludedFunctionNArguments : public RTCOccludedFunctionNArguments
   {
-    IntersectContext* internal_context;
     Geometry* geometry;
-    ReportOcclusionFunc report;
   };
 
   /*! Base class for set of acceleration structures. */
@@ -145,7 +138,7 @@ namespace embree
   public:
 
       /*! Intersects a single ray with the scene. */
-      __forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) 
+      __forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context) 
       {
         assert(primID < size());
         assert(intersectorN.intersect);
@@ -159,15 +152,13 @@ namespace embree
         args.N = 1;
         args.geomID = geomID;
         args.primID = primID;
-        args.internal_context = context;
         args.geometry = this;
-        args.report = report;
         
         intersectorN.intersect(&args);
       }
 
       /*! Tests if single ray is occluded by the scene. */
-      __forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report)
+      __forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context)
       {
         assert(primID < size());
         assert(intersectorN.occluded);
@@ -181,16 +172,14 @@ namespace embree
         args.N = 1;
         args.geomID = geomID;
         args.primID = primID;
-        args.internal_context = context;
         args.geometry = this;
-        args.report = report;
         
         intersectorN.occluded(&args);
       }
    
       /*! Intersects a packet of K rays with the scene. */
       template<int K>
-        __forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) 
+        __forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context) 
       {
         assert(primID < size());
         assert(intersectorN.intersect);
@@ -204,16 +193,14 @@ namespace embree
         args.N = K;
         args.geomID = geomID;
         args.primID = primID;
-        args.internal_context = context;
         args.geometry = this;
-        args.report = report;
          
         intersectorN.intersect(&args);
       }
 
       /*! Tests if a packet of K rays is occluded by the scene. */
       template<int K>
-        __forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report)
+        __forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context)
       {
         assert(primID < size());
         assert(intersectorN.occluded);
@@ -227,9 +214,7 @@ namespace embree
         args.N = K;
         args.geomID = geomID;
         args.primID = primID;
-        args.internal_context = context;
         args.geometry = this;
-        args.report = report;
         
         intersectorN.occluded(&args);
       }
diff --git a/thirdparty/embree/kernels/common/alloc.cpp b/thirdparty/embree/kernels/common/alloc.cpp
index 1a0e1aeed3..38a76225f4 100644
--- a/thirdparty/embree/kernels/common/alloc.cpp
+++ b/thirdparty/embree/kernels/common/alloc.cpp
@@ -3,6 +3,9 @@
 
 #include "alloc.h"
 #include "../../common/sys/thread.h"
+#if defined(APPLE) && defined(__aarch64__)
+#include "../../common/sys/barrier.h"
+#endif
 
 namespace embree
 {
diff --git a/thirdparty/embree/kernels/common/alloc.h b/thirdparty/embree/kernels/common/alloc.h
index 4458e35c24..12769df2c8 100644
--- a/thirdparty/embree/kernels/common/alloc.h
+++ b/thirdparty/embree/kernels/common/alloc.h
@@ -8,6 +8,10 @@
 #include "scene.h"
 #include "primref.h"
 
+#if defined(APPLE) && defined(__aarch64__)
+#include <mutex>
+#endif
+
 namespace embree
 {
   class FastAllocator
@@ -26,7 +30,7 @@ namespace embree
   public:
 
     struct ThreadLocal2;
-    enum AllocationType { ALIGNED_MALLOC, OS_MALLOC, SHARED, ANY_TYPE };
+    enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE };
 
     /*! Per thread structure holding the current memory block. */
     struct __aligned(64) ThreadLocal
@@ -132,7 +136,11 @@ namespace embree
       {
         assert(alloc_i);
         if (alloc.load() == alloc_i) return;
+#if defined(APPLE) && defined(__aarch64__)
+        std::scoped_lock lock(mutex);
+#else
         Lock<SpinLock> lock(mutex);
+#endif
         //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
         if (alloc.load()) {
           alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
@@ -150,7 +158,11 @@ namespace embree
       {
         assert(alloc_i);
         if (alloc.load() != alloc_i) return;
+#if defined(APPLE) && defined(__aarch64__)
+        std::scoped_lock lock(mutex);
+#else
         Lock<SpinLock> lock(mutex);
+#endif
         if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
         alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
         alloc.load()->bytesFree   += alloc0.getFreeBytes()   + alloc1.getFreeBytes();
@@ -161,7 +173,11 @@ namespace embree
       }
 
     public:
+#if defined(APPLE) && defined(__aarch64__)
+      std::mutex mutex;
+#else
       SpinLock mutex;        //!< required as unbind is called from other threads
+#endif
       std::atomic<FastAllocator*> alloc;  //!< parent allocator
       ThreadLocal alloc0;
       ThreadLocal alloc1;
@@ -169,7 +185,7 @@ namespace embree
 
     FastAllocator (Device* device, bool osAllocation) 
       : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0),
-        growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? OS_MALLOC : ALIGNED_MALLOC),
+        growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC),
         primrefarray(device,0)
     {
       for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
@@ -206,7 +222,7 @@ namespace embree
 
     void setOSallocation(bool flag)
     {
-      atype = flag ? OS_MALLOC : ALIGNED_MALLOC;
+      atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC;
     }
 
   private:
@@ -217,7 +233,11 @@ namespace embree
       ThreadLocal2* alloc = thread_local_allocator2;
       if (alloc == nullptr) {
         thread_local_allocator2 = alloc = new ThreadLocal2;
+#if defined(APPLE) && defined(__aarch64__)
+        std::scoped_lock lock(s_thread_local_allocators_lock);
+#else
         Lock<SpinLock> lock(s_thread_local_allocators_lock);
+#endif
         s_thread_local_allocators.push_back(make_unique(alloc));
       }
       return alloc;
@@ -227,7 +247,11 @@ namespace embree
 
     __forceinline void join(ThreadLocal2* alloc)
     {
+#if defined(APPLE) && defined(__aarch64__)
+      std::scoped_lock lock(s_thread_local_allocators_lock);
+#else
       Lock<SpinLock> lock(thread_local_allocators_lock);
+#endif
       thread_local_allocators.push_back(alloc);
     }
 
@@ -492,7 +516,11 @@ namespace embree
         /* parallel block creation in case of no freeBlocks, avoids single global mutex */
         if (likely(freeBlocks.load() == nullptr))
         {
+#if defined(APPLE) && defined(__aarch64__)
+          std::scoped_lock lock(slotMutex[slot]);
+#else
           Lock<SpinLock> lock(slotMutex[slot]);
+#endif
           if (myUsedBlocks == threadUsedBlocks[slot]) {
             const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
             const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
@@ -505,7 +533,11 @@ namespace embree
 
         /* if this fails allocate new block */
         {
-          Lock<SpinLock> lock(mutex);
+#if defined(APPLE) && defined(__aarch64__)
+            std::scoped_lock lock(mutex);
+#else
+            Lock<SpinLock> lock(mutex);
+#endif
 	  if (myUsedBlocks == threadUsedBlocks[slot])
 	  {
             if (freeBlocks.load() != nullptr) {
@@ -527,7 +559,11 @@ namespace embree
     /*! add new block */
     void addBlock(void* ptr, ssize_t bytes)
     {
+#if defined(APPLE) && defined(__aarch64__)
+      std::scoped_lock lock(mutex);
+#else
       Lock<SpinLock> lock(mutex);
+#endif
       const size_t sizeof_Header = offsetof(Block,data[0]);
       void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
       size_t ofs = (size_t) aptr - (size_t) ptr;
@@ -613,8 +649,8 @@ namespace embree
         bytesWasted(alloc->bytesWasted),
         stat_all(alloc,ANY_TYPE),
         stat_malloc(alloc,ALIGNED_MALLOC),
-        stat_4K(alloc,OS_MALLOC,false),
-        stat_2M(alloc,OS_MALLOC,true),
+        stat_4K(alloc,EMBREE_OS_MALLOC,false),
+        stat_2M(alloc,EMBREE_OS_MALLOC,true),
         stat_shared(alloc,SHARED) {}
 
       AllStatistics (size_t bytesUsed,
@@ -707,7 +743,7 @@ namespace embree
         /* We avoid using os_malloc for small blocks as this could
          * cause a risk of fragmenting the virtual address space and
          * reach the limit of vm.max_map_count = 65k under Linux. */
-        if (atype == OS_MALLOC && bytesAllocate < maxAllocationSize)
+        if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize)
           atype = ALIGNED_MALLOC;
 
         /* we need to additionally allocate some header */
@@ -716,7 +752,7 @@ namespace embree
         bytesReserve  = sizeof_Header+bytesReserve;
 
         /* consume full 4k pages with using os_malloc */
-        if (atype == OS_MALLOC) {
+        if (atype == EMBREE_OS_MALLOC) {
           bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
           bytesReserve  = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
         }
@@ -748,11 +784,11 @@ namespace embree
             return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
           }
         }
-        else if (atype == OS_MALLOC)
+        else if (atype == EMBREE_OS_MALLOC)
         {
           if (device) device->memoryMonitor(bytesAllocate,false);
           bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
-          return new (ptr) Block(OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
+          return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
         }
         else
           assert(false);
@@ -796,7 +832,7 @@ namespace embree
           if (device) device->memoryMonitor(-sizeof_Alloced,true);
         }
 
-        else if (atype == OS_MALLOC) {
+        else if (atype == EMBREE_OS_MALLOC) {
          size_t sizeof_This = sizeof_Header+reserveEnd;
          os_free(this,sizeof_This,huge_pages);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
@@ -857,7 +893,7 @@ namespace embree
       bool hasType(AllocationType atype_i, bool huge_pages_i) const
       {
         if      (atype_i == ANY_TYPE ) return true;
-        else if (atype   == OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
+        else if (atype   == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
         else                           return atype_i == atype;
       }
 
@@ -906,7 +942,7 @@ namespace embree
       void print_block() const
       {
         if (atype == ALIGNED_MALLOC) std::cout << "A";
-        else if (atype == OS_MALLOC) std::cout << "O";
+        else if (atype == EMBREE_OS_MALLOC) std::cout << "O";
         else if (atype == SHARED) std::cout << "S";
         if (huge_pages) std::cout << "H";
         size_t bytesUsed = getBlockUsedBytes();
@@ -936,7 +972,11 @@ namespace embree
     std::atomic<Block*> freeBlocks;
 
     std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
-    SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
+#if defined(APPLE) && defined(__aarch64__)
+    std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
+#else
+    PaddedSpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
+#endif
 
     bool use_single_mode;
     size_t defaultBlockSize;
@@ -950,7 +990,11 @@ namespace embree
     static __thread ThreadLocal2* thread_local_allocator2;
     static SpinLock s_thread_local_allocators_lock;
     static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;
+#if defined(APPLE) && defined(__aarch64__)
+    std::mutex thread_local_allocators_lock;
+#else
     SpinLock thread_local_allocators_lock;
+#endif
     std::vector<ThreadLocal2*> thread_local_allocators;
     AllocationType atype;
     mvector<PrimRef> primrefarray;     //!< primrefarray used to allocate nodes
diff --git a/thirdparty/embree/kernels/common/device.cpp b/thirdparty/embree/kernels/common/device.cpp
index 068e0c2983..833ec65139 100644
--- a/thirdparty/embree/kernels/common/device.cpp
+++ b/thirdparty/embree/kernels/common/device.cpp
@@ -66,7 +66,11 @@ namespace embree
     case CPU::CORE1:           frequency_level = FREQUENCY_SIMD128; break;
     case CPU::XEON_PHI_KNIGHTS_MILL   : frequency_level = FREQUENCY_SIMD512; break;
     case CPU::XEON_PHI_KNIGHTS_LANDING: frequency_level = FREQUENCY_SIMD512; break;
+#if defined(__APPLE__)
+    case CPU::ARM:             frequency_level = FREQUENCY_SIMD256; break; // Apple M1 supports high throughput for SIMD4
+#else
     case CPU::ARM:             frequency_level = FREQUENCY_SIMD128; break;
+#endif
     }
 
     /* initialize global state */
diff --git a/thirdparty/embree/kernels/common/geometry.h b/thirdparty/embree/kernels/common/geometry.h
index 2f9f2e7c94..593990f5b1 100644
--- a/thirdparty/embree/kernels/common/geometry.h
+++ b/thirdparty/embree/kernels/common/geometry.h
@@ -91,7 +91,7 @@ namespace embree
 
     size_t numFilterFunctions;       //!< number of geometries with filter functions enabled
     size_t numTriangles;             //!< number of enabled triangles
-    size_t numMBTriangles;           //!< number of enabled motion blured triangles
+    size_t numMBTriangles;           //!< number of enabled motion blurred triangles
     size_t numQuads;                 //!< number of enabled quads
     size_t numMBQuads;               //!< number of enabled motion blurred quads
     size_t numBezierCurves;          //!< number of enabled curves
@@ -99,7 +99,7 @@ namespace embree
     size_t numLineSegments;          //!< number of enabled line segments
     size_t numMBLineSegments;        //!< number of enabled line motion blurred segments
     size_t numSubdivPatches;         //!< number of enabled subdivision patches
-    size_t numMBSubdivPatches;       //!< number of enabled motion blured subdivision patches
+    size_t numMBSubdivPatches;       //!< number of enabled motion blurred subdivision patches
     size_t numUserGeometries;        //!< number of enabled user geometries
     size_t numMBUserGeometries;      //!< number of enabled motion blurred user geometries
     size_t numInstancesCheap;        //!< number of enabled cheap instances
diff --git a/thirdparty/embree/kernels/common/isa.h b/thirdparty/embree/kernels/common/isa.h
index ae6556336c..9e1132e1a0 100644
--- a/thirdparty/embree/kernels/common/isa.h
+++ b/thirdparty/embree/kernels/common/isa.h
@@ -44,7 +44,7 @@ namespace embree
 #define SELECT_SYMBOL_DEFAULT(features,intersector) \
   intersector = isa::intersector;
 
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
 #if !defined(EMBREE_TARGET_SIMD4)
 #define EMBREE_TARGET_SIMD4
 #endif
diff --git a/thirdparty/embree/kernels/common/ray.h b/thirdparty/embree/kernels/common/ray.h
index 7b951cc1e8..3c8ee3989c 100644
--- a/thirdparty/embree/kernels/common/ray.h
+++ b/thirdparty/embree/kernels/common/ray.h
@@ -6,7 +6,7 @@
 #include "default.h"
 #include "instance_stack.h"
 
-// FIXME: if ray gets seperated into ray* and hit, uload4 needs to be adjusted
+// FIXME: if ray gets separated into ray* and hit, uload4 needs to be adjusted
 
 namespace embree
 {
diff --git a/thirdparty/embree/kernels/common/rtcore.cpp b/thirdparty/embree/kernels/common/rtcore.cpp
index 94b3819e42..a6ea55bfc4 100644
--- a/thirdparty/embree/kernels/common/rtcore.cpp
+++ b/thirdparty/embree/kernels/common/rtcore.cpp
@@ -7,6 +7,7 @@
 #include "device.h"
 #include "scene.h"
 #include "context.h"
+#include "../geometry/filter.h"
 #include "../../include/embree3/rtcore_ray.h"
 using namespace embree;
 
@@ -482,7 +483,7 @@ RTC_NAMESPACE_BEGIN;
 
     IntersectContext context(scene,user_context);
 #if !defined(EMBREE_RAY_PACKETS)
-    Ray4* ray4 = (Ray4*) rayhit;
+    RayHit4* ray4 = (RayHit4*) rayhit;
     for (size_t i=0; i<4; i++) {
       if (!valid[i]) continue;
       RayHit ray1; ray4->get(i,ray1);
@@ -513,7 +514,7 @@ RTC_NAMESPACE_BEGIN;
 
     IntersectContext context(scene,user_context);
 #if !defined(EMBREE_RAY_PACKETS)
-    Ray8* ray8 = (Ray8*) rayhit;
+    RayHit8* ray8 = (RayHit8*) rayhit;
     for (size_t i=0; i<8; i++) {
       if (!valid[i]) continue;
       RayHit ray1; ray8->get(i,ray1);
@@ -546,7 +547,7 @@ RTC_NAMESPACE_BEGIN;
 
     IntersectContext context(scene,user_context);
 #if !defined(EMBREE_RAY_PACKETS)
-    Ray16* ray16 = (Ray16*) rayhit;
+    RayHit16* ray16 = (RayHit16*) rayhit;
     for (size_t i=0; i<16; i++) {
       if (!valid[i]) continue;
       RayHit ray1; ray16->get(i,ray1);
@@ -1097,13 +1098,13 @@ RTC_NAMESPACE_BEGIN;
   RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
   {
     IntersectFunctionNArguments* args = (IntersectFunctionNArguments*) args_i;
-    args->report(args,filter_args);
+    isa::reportIntersection1(args, filter_args);
   }
 
   RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
   {
     OccludedFunctionNArguments* args = (OccludedFunctionNArguments*) args_i;
-    args->report(args,filter_args);
+    isa::reportOcclusion1(args,filter_args);
   }
   
   RTC_API RTCGeometry rtcNewGeometry (RTCDevice hdevice, RTCGeometryType type)
@@ -1763,4 +1764,19 @@ RTC_NAMESPACE_BEGIN;
     return nullptr;
   }
 
+  RTC_API RTCGeometry rtcGetGeometryThreadSafe (RTCScene hscene, unsigned int geomID)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryThreadSafe);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_GEOMID(geomID);
+#endif
+    Ref<Geometry> geom = scene->get_locked(geomID);
+    return (RTCGeometry) geom.ptr; 
+    RTC_CATCH_END2(scene);
+    return nullptr;
+  }
+
 RTC_NAMESPACE_END
diff --git a/thirdparty/embree/kernels/common/rtcore.h b/thirdparty/embree/kernels/common/rtcore.h
index f8aad7c7cb..ac58a84d6f 100644
--- a/thirdparty/embree/kernels/common/rtcore.h
+++ b/thirdparty/embree/kernels/common/rtcore.h
@@ -26,56 +26,59 @@ namespace embree
 
 /*! Macros used in the rtcore API implementation */
 // -- GODOT start --
-// #define RTC_CATCH_BEGIN try {
 #define RTC_CATCH_BEGIN
-  
-// #define RTC_CATCH_END(device)                                                \
-//   } catch (std::bad_alloc&) {                                                   \
-//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
-//   } catch (rtcore_error& e) {                                                   \
-//     Device::process_error(device,e.error,e.what());                             \
-//   } catch (std::exception& e) {                                                 \
-//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
-//   } catch (...) {                                                               \
-//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
-//   }
 #define RTC_CATCH_END(device)
-  
-// #define RTC_CATCH_END2(scene)                                                \
-//   } catch (std::bad_alloc&) {                                                   \
-//     Device* device = scene ? scene->device : nullptr;                           \
-//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
-//   } catch (rtcore_error& e) {                                                   \
-//     Device* device = scene ? scene->device : nullptr;                           \
-//     Device::process_error(device,e.error,e.what());                             \
-//   } catch (std::exception& e) {                                                 \
-//     Device* device = scene ? scene->device : nullptr;                           \
-//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
-//   } catch (...) {                                                               \
-//     Device* device = scene ? scene->device : nullptr;                           \
-//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
-//   }
 #define RTC_CATCH_END2(scene)
-
-// #define RTC_CATCH_END2_FALSE(scene)                                             \
-//   } catch (std::bad_alloc&) {                                                   \
-//     Device* device = scene ? scene->device : nullptr;                           \
-//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
-//     return false;                                                               \
-//   } catch (rtcore_error& e) {                                                   \
-//     Device* device = scene ? scene->device : nullptr;                           \
-//     Device::process_error(device,e.error,e.what());                             \
-//     return false;                                                               \
-//   } catch (std::exception& e) {                                                 \
-//     Device* device = scene ? scene->device : nullptr;                           \
-//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
-//     return false;                                                               \
-//   } catch (...) {                                                               \
-//     Device* device = scene ? scene->device : nullptr;                           \
-//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
-//     return false;                                                               \
-//   }
 #define RTC_CATCH_END2_FALSE(scene) return false;
+
+#if 0
+#define RTC_CATCH_BEGIN try {
+  
+#define RTC_CATCH_END(device)                                                \
+  } catch (std::bad_alloc&) {                                                   \
+    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+  } catch (rtcore_error& e) {                                                   \
+    Device::process_error(device,e.error,e.what());                             \
+  } catch (std::exception& e) {                                                 \
+    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+  } catch (...) {                                                               \
+    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+  }
+  
+#define RTC_CATCH_END2(scene)                                                \
+  } catch (std::bad_alloc&) {                                                   \
+    Device* device = scene ? scene->device : nullptr;                           \
+    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+  } catch (rtcore_error& e) {                                                   \
+    Device* device = scene ? scene->device : nullptr;                           \
+    Device::process_error(device,e.error,e.what());                             \
+  } catch (std::exception& e) {                                                 \
+    Device* device = scene ? scene->device : nullptr;                           \
+    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+  } catch (...) {                                                               \
+    Device* device = scene ? scene->device : nullptr;                           \
+    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+  }
+
+#define RTC_CATCH_END2_FALSE(scene)                                             \
+  } catch (std::bad_alloc&) {                                                   \
+    Device* device = scene ? scene->device : nullptr;                           \
+    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+    return false;                                                               \
+  } catch (rtcore_error& e) {                                                   \
+    Device* device = scene ? scene->device : nullptr;                           \
+    Device::process_error(device,e.error,e.what());                             \
+    return false;                                                               \
+  } catch (std::exception& e) {                                                 \
+    Device* device = scene ? scene->device : nullptr;                           \
+    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+    return false;                                                               \
+  } catch (...) {                                                               \
+    Device* device = scene ? scene->device : nullptr;                           \
+    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+    return false;                                                               \
+  }
+#endif
 // -- GODOT end --
 
 #define RTC_VERIFY_HANDLE(handle)                               \
@@ -103,39 +106,35 @@ namespace embree
 #define RTC_TRACE(x) 
 #endif
 
-// -- GODOT begin --
-//   /*! used to throw embree API errors */
-//   struct rtcore_error : public std::exception
-//   {
-//     __forceinline rtcore_error(RTCError error, const std::string& str)
-//       : error(error), str(str) {}
-//     
-//     ~rtcore_error() throw() {}
-//     
-//     const char* what () const throw () {
-//       return str.c_str();
-//     }
-//     
-//     RTCError error;
-//     std::string str;
-//   };
-// -- GODOT end --
+// -- GODOT start --
+#if 0
+  /*! used to throw embree API errors */
+  struct rtcore_error : public std::exception
+  {
+    __forceinline rtcore_error(RTCError error, const std::string& str)
+      : error(error), str(str) {}
+    
+    ~rtcore_error() throw() {}
+    
+    const char* what () const throw () {
+      return str.c_str();
+    }
+    
+    RTCError error;
+    std::string str;
+  };
+#endif
 
 #if defined(DEBUG) // only report file and line in debug mode
-  // -- GODOT begin --
-  // #define throw_RTCError(error,str) \
-  //   throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
   #define throw_RTCError(error,str) \
     printf("%s (%d): %s", __FILE__, __LINE__, std::string(str).c_str()), abort();
-  // -- GODOT end --
+    // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
 #else
-  // -- GODOT begin --
-  // #define throw_RTCError(error,str) \
-  //   throw rtcore_error(error,str);
   #define throw_RTCError(error,str) \
     abort();
-  // -- GODOT end --
+    // throw rtcore_error(error,str);
 #endif
+// -- GODOT end --
 
 #define RTC_BUILD_ARGUMENTS_HAS(settings,member) \
   (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member))) 
diff --git a/thirdparty/embree/kernels/common/rtcore_builder.cpp b/thirdparty/embree/kernels/common/rtcore_builder.cpp
index 1f1b6f6ddf..29e3bdca20 100644
--- a/thirdparty/embree/kernels/common/rtcore_builder.cpp
+++ b/thirdparty/embree/kernels/common/rtcore_builder.cpp
@@ -371,7 +371,7 @@ RTC_NAMESPACE_BEGIN
       bvh->allocator.init_estimate(arguments->primitiveCount*sizeof(BBox3fa));
       bvh->allocator.reset();
 
-      /* switch between differnet builders based on quality level */
+      /* switch between different builders based on quality level */
       if (arguments->buildQuality == RTC_BUILD_QUALITY_LOW)
         return rtcBuildBVHMorton(arguments);
       else if (arguments->buildQuality == RTC_BUILD_QUALITY_MEDIUM)
diff --git a/thirdparty/embree/kernels/common/scene.cpp b/thirdparty/embree/kernels/common/scene.cpp
index 408d7eae6f..65d31d0f81 100644
--- a/thirdparty/embree/kernels/common/scene.cpp
+++ b/thirdparty/embree/kernels/common/scene.cpp
@@ -629,9 +629,7 @@ namespace embree
     if (geometry == null)
       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry");
     
-    if (geometry->isEnabled()) {
-      setModified ();
-    }
+    setModified ();
     accels_deleteGeometry(unsigned(geomID));
     id_pool.deallocate((unsigned)geomID);
     geometries[geomID] = null;
diff --git a/thirdparty/embree/kernels/common/scene_curves.h b/thirdparty/embree/kernels/common/scene_curves.h
index a5a39e42d4..a1ea45d3c7 100644
--- a/thirdparty/embree/kernels/common/scene_curves.h
+++ b/thirdparty/embree/kernels/common/scene_curves.h
@@ -452,6 +452,10 @@ namespace embree
           const Vec3fa n1 = normal(index+1,itime);
           if (!isvalid(n0) || !isvalid(n1))
             return false;
+
+	  const BBox3fa b = getOrientedCurveScaledRadius(i,itime).accurateBounds();
+	  if (!isvalid(b))
+	    return false;
         }
       }
       
@@ -612,6 +616,10 @@ namespace embree
           const Vec3fa dn1 = dnormal(index+1,itime);
           if (!isvalid(dn0) || !isvalid(dn1))
             return false;
+
+	  const BBox3fa b = getOrientedCurveScaledRadius(i,itime).accurateBounds();
+	  if (!isvalid(b))
+	    return false;
         }
       }
       
diff --git a/thirdparty/embree/kernels/common/state.cpp b/thirdparty/embree/kernels/common/state.cpp
index 01c862da0c..db6b803041 100644
--- a/thirdparty/embree/kernels/common/state.cpp
+++ b/thirdparty/embree/kernels/common/state.cpp
@@ -144,7 +144,20 @@ namespace embree
   }
 
   bool State::checkISASupport() {
+#if defined(__ARM_NEON)
+    /*
+     * NEON CPU type is a mixture of NEON and SSE2
+     */
+
+    bool hasSSE2 = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_SSE2;
+
+    /* this will be true when explicitly initialize Device with `isa=neon` config */
+    bool hasNEON = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_NEON;
+
+    return hasSSE2 || hasNEON;
+#else
     return (getCPUFeatures() & enabled_cpu_features) == enabled_cpu_features;
+#endif
   }
   
   void State::verify()
@@ -157,8 +170,10 @@ namespace embree
      * functions */
 #if defined(DEBUG)
 #if defined(EMBREE_TARGET_SSE2)
+#if !defined(__ARM_NEON)
     assert(sse2::getISA() <= SSE2);
 #endif
+#endif
 #if defined(EMBREE_TARGET_SSE42)
     assert(sse42::getISA() <= SSE42);
 #endif
diff --git a/thirdparty/embree/kernels/config.h b/thirdparty/embree/kernels/config.h
index 2bf7e93587..84ac27d103 100644
--- a/thirdparty/embree/kernels/config.h
+++ b/thirdparty/embree/kernels/config.h
@@ -1,5 +1,4 @@
-
-// Copyright 2009-2020 Intel Corporation
+// Copyright 2009-2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
 /* #undef EMBREE_RAY_MASK */
@@ -20,6 +19,7 @@
 /* #undef EMBREE_COMPACT_POLYS */
 
 #define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0
+#define EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE
 
 #if defined(EMBREE_GEOMETRY_TRIANGLE)
   #define IF_ENABLED_TRIS(x) x
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h b/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h
index 3d8900c2aa..75532f5ae0 100644
--- a/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h
+++ b/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h
@@ -225,7 +225,7 @@ namespace embree
           /* exit if convergence cannot get proven, but terminate if we are very small */
           if (unlikely(!subset(K,x) && !very_small)) return false;
 
-          /* solve using newton raphson iteration of convergence is guarenteed */
+          /* solve using newton raphson iteration of convergence is guaranteed */
           solve_newton_raphson_loop(cu,cv,c1,dfdu,dfdv,rcp_J);
           return true;
         }
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h b/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h
index 2d4abd73ac..ed827d583f 100644
--- a/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h
+++ b/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h
@@ -60,7 +60,7 @@ namespace embree
       const Vec3fa dir = ray.dir;
       const float length_ray_dir = length(dir);
 
-      /* error of curve evaluations is propertional to largest coordinate */
+      /* error of curve evaluations is proportional to largest coordinate */
       const BBox3ff box = curve.bounds();
       const float P_err = 16.0f*float(ulp)*reduce_max(max(abs(box.lower),abs(box.upper)));
      
diff --git a/thirdparty/embree/kernels/geometry/disc_intersector.h b/thirdparty/embree/kernels/geometry/disc_intersector.h
index 816c066899..ec6fa9c4f3 100644
--- a/thirdparty/embree/kernels/geometry/disc_intersector.h
+++ b/thirdparty/embree/kernels/geometry/disc_intersector.h
@@ -68,15 +68,15 @@ namespace embree
         const Vec3vf<M> center = v0.xyz();
         const vfloat<M> radius = v0.w;
 
+        /* compute ray distance projC0 to hit point with ray oriented plane */
         const Vec3vf<M> c0     = center - ray_org;
         const vfloat<M> projC0 = dot(c0, ray_dir) * rd2;
 
         valid &= (vfloat<M>(ray.tnear()) <= projC0) & (projC0 <= vfloat<M>(ray.tfar));
-        if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
-          valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale;  // ignore self intersections
         if (unlikely(none(valid)))
           return false;
-        
+
+        /* check if hit point lies inside disc */
         const Vec3vf<M> perp   = c0 - projC0 * ray_dir;
         const vfloat<M> l2     = dot(perp, perp);
         const vfloat<M> r2     = radius * radius;
@@ -84,6 +84,15 @@ namespace embree
         if (unlikely(none(valid)))
           return false;
 
+        /* We reject hits where the ray origin lies inside the ray
+         * oriented disc to avoid self intersections. */
+#if defined(EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE)
+        const vfloat<M> m2 = dot(c0, c0);
+        valid &= (m2 > r2);
+        if (unlikely(none(valid)))
+          return false;
+#endif
+        
         DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir);
         return epilog(valid, hit);
       }
@@ -152,15 +161,15 @@ namespace embree
         const Vec3vf<M> center = v0.xyz();
         const vfloat<M> radius = v0.w;
 
+        /* compute ray distance projC0 to hit point with ray oriented plane */
         const Vec3vf<M> c0     = center - ray_org;
         const vfloat<M> projC0 = dot(c0, ray_dir) * rd2;
 
         valid &= (vfloat<M>(ray.tnear()[k]) <= projC0) & (projC0 <= vfloat<M>(ray.tfar[k]));
-        if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
-          valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale[k];  // ignore self intersections
         if (unlikely(none(valid)))
           return false;
 
+        /* check if hit point lies inside disc */
         const Vec3vf<M> perp   = c0 - projC0 * ray_dir;
         const vfloat<M> l2     = dot(perp, perp);
         const vfloat<M> r2     = radius * radius;
@@ -168,6 +177,15 @@ namespace embree
         if (unlikely(none(valid)))
           return false;
 
+        /* We reject hits where the ray origin lies inside the ray
+         * oriented disc to avoid self intersections. */
+#if defined(EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE)
+        const vfloat<M> m2 = dot(c0, c0);
+        valid &= (m2 > r2);
+        if (unlikely(none(valid)))
+          return false;
+#endif
+
         DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir);
         return epilog(valid, hit);
       }
diff --git a/thirdparty/embree/kernels/geometry/filter.h b/thirdparty/embree/kernels/geometry/filter.h
index 3b4d924ea7..d64320bf78 100644
--- a/thirdparty/embree/kernels/geometry/filter.h
+++ b/thirdparty/embree/kernels/geometry/filter.h
@@ -51,20 +51,11 @@ namespace embree
     __forceinline void reportIntersection1(IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args)
     {
 #if defined(EMBREE_FILTER_FUNCTION)
-      IntersectContext* MAYBE_UNUSED context = args->internal_context;
-      const Geometry* const geometry = args->geometry;
-      if (geometry->intersectionFilterN) {
-        assert(context->scene->hasGeometryFilterFunction());
-        geometry->intersectionFilterN(filter_args);
-      }
+      if (args->geometry->intersectionFilterN)
+        args->geometry->intersectionFilterN(filter_args);
       
-      //if (args->valid[0] == 0)
-      //  return;
-
-      if (context->user->filter) {
-        assert(context->scene->hasContextFilterFunction());
-        context->user->filter(filter_args);
-      }
+      if (args->context->filter)
+        args->context->filter(filter_args);
 #endif
     }
     
@@ -105,20 +96,11 @@ namespace embree
     __forceinline void reportOcclusion1(OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args)
     {
 #if defined(EMBREE_FILTER_FUNCTION)
-      IntersectContext* MAYBE_UNUSED context = args->internal_context;
-      const Geometry* const geometry = args->geometry;
-      if (geometry->occlusionFilterN) {
-        assert(context->scene->hasGeometryFilterFunction());
-        geometry->occlusionFilterN(filter_args);
-      }
-      
-      //if (args->valid[0] == 0)
-      //  return false;
+      if (args->geometry->occlusionFilterN)
+        args->geometry->occlusionFilterN(filter_args);
       
-      if (context->user->filter) {
-        assert(context->scene->hasContextFilterFunction());
-        context->user->filter(filter_args);
-      }
+      if (args->context->filter)
+        args->context->filter(filter_args);
 #endif
     }
 
diff --git a/thirdparty/embree/kernels/geometry/object_intersector.h b/thirdparty/embree/kernels/geometry/object_intersector.h
index 11ceb2f7fe..e4ad01852f 100644
--- a/thirdparty/embree/kernels/geometry/object_intersector.h
+++ b/thirdparty/embree/kernels/geometry/object_intersector.h
@@ -32,7 +32,7 @@ namespace embree
           return;
 #endif
 
-        accel->intersect(ray,prim.geomID(),prim.primID(),context,reportIntersection1);
+        accel->intersect(ray,prim.geomID(),prim.primID(),context);
       }
       
       static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
@@ -44,7 +44,7 @@ namespace embree
           return false;
 #endif
 
-        accel->occluded(ray,prim.geomID(),prim.primID(),context,&reportOcclusion1);
+        accel->occluded(ray,prim.geomID(),prim.primID(),context);
         return ray.tfar < 0.0f;
       }
       
@@ -89,7 +89,7 @@ namespace embree
         valid &= (ray.mask & accel->mask) != 0;
         if (none(valid)) return;
 #endif
-        accel->intersect(valid,ray,prim.geomID(),prim.primID(),context,&reportIntersection1);
+        accel->intersect(valid,ray,prim.geomID(),prim.primID(),context);
       }
 
       static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim)
@@ -102,7 +102,7 @@ namespace embree
         valid &= (ray.mask & accel->mask) != 0;
         if (none(valid)) return false;
 #endif
-        accel->occluded(valid,ray,prim.geomID(),prim.primID(),context,&reportOcclusion1);
+        accel->occluded(valid,ray,prim.geomID(),prim.primID(),context);
         return ray.tfar < 0.0f;
       }
       
diff --git a/thirdparty/embree/kernels/geometry/quadv.h b/thirdparty/embree/kernels/geometry/quadv.h
index 2137356ff2..514e519b0c 100644
--- a/thirdparty/embree/kernels/geometry/quadv.h
+++ b/thirdparty/embree/kernels/geometry/quadv.h
@@ -152,7 +152,7 @@ namespace embree
     Vec3vf<M> v0;      // 1st vertex of the quads
     Vec3vf<M> v1;      // 2nd vertex of the quads
     Vec3vf<M> v2;      // 3rd vertex of the quads
-    Vec3vf<M> v3;      // 4rd vertex of the quads
+    Vec3vf<M> v3;      // 4th vertex of the quads
   private:
     vuint<M> geomIDs; // geometry ID
     vuint<M> primIDs; // primitive ID
diff --git a/thirdparty/embree/kernels/geometry/roundline_intersector.h b/thirdparty/embree/kernels/geometry/roundline_intersector.h
index 0e9393442b..764ff93fec 100644
--- a/thirdparty/embree/kernels/geometry/roundline_intersector.h
+++ b/thirdparty/embree/kernels/geometry/roundline_intersector.h
@@ -19,7 +19,7 @@
 
   For multiple connected round linear curve segments this construction
   yield a proper shape when viewed from the outside. Using the
-  following CSG we can also handle the interiour in most common cases:
+  following CSG we can also handle the interior in most common cases:
 
      round_linear_curve(pl,rl,p0,r0,p1,r1,pr,rr) =
        cone_sphere(p0,r0,p1,r1) - cone(pl,rl,p0,r0) - cone(p1,r1,pr,rr)
@@ -431,7 +431,7 @@ namespace embree
              Ng' = (h-u*dP) - (w0+u*dw)*dw/dP^2*dP
            
            Inserting the definition of w0 and dw and refactoring
-           yield a furhter scaled Ng'':
+           yield a further scaled Ng'':
            
              Ng'' = (dP^2 - dr^2) (h-q) - (r0+u*dr)*dr*dP
            
diff --git a/thirdparty/embree/kernels/geometry/subgrid_intersector.h b/thirdparty/embree/kernels/geometry/subgrid_intersector.h
index ad5fee2e4e..e241073812 100644
--- a/thirdparty/embree/kernels/geometry/subgrid_intersector.h
+++ b/thirdparty/embree/kernels/geometry/subgrid_intersector.h
@@ -264,8 +264,8 @@ namespace embree
           const Vec3vf<K> p2 = vtx[i*4+2];
           const Vec3vf<K> p3 = vtx[i*4+3];
           STAT3(shadow.trav_prims,1,popcnt(valid0),K);
-          if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i)))
-            break;
+          pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i));
+          if (none(valid0)) break;
         }
         return !valid0;
       }
@@ -408,10 +408,8 @@ namespace embree
           const Vec3vf<K> p2 = vtx[i*4+2];
           const Vec3vf<K> p3 = vtx[i*4+3];
           STAT3(shadow.trav_prims,1,popcnt(valid0),K);
-          //if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i)))
-          if (pre.occludedK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i)))
-	    
-            break;
+          pre.occludedK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i));
+          if (none(valid0)) break;
         }
         return !valid0;
       }
diff --git a/thirdparty/embree/kernels/hash.h b/thirdparty/embree/kernels/hash.h
index 470e15f03e..39d50e2354 100644
--- a/thirdparty/embree/kernels/hash.h
+++ b/thirdparty/embree/kernels/hash.h
@@ -1,5 +1,4 @@
-
-// Copyright 2009-2020 Intel Corporation
+// Copyright 2009-2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
-#define RTC_HASH "12b99393438a4cc9e478e33459eed78bec6233fd"
+#define RTC_HASH "698442324ccddd11725fb8875275dc1384f7fb40"
diff --git a/thirdparty/embree/kernels/subdiv/bezier_patch.h b/thirdparty/embree/kernels/subdiv/bezier_patch.h
index 2ff03902a7..0a2aef321f 100644
--- a/thirdparty/embree/kernels/subdiv/bezier_patch.h
+++ b/thirdparty/embree/kernels/subdiv/bezier_patch.h
@@ -94,7 +94,7 @@ namespace embree
       matrix[0][1] = computeRightEdgeBezierControlPoint(source.v,1,1);
       matrix[0][2] = computeLeftEdgeBezierControlPoint(source.v,1,2); 
       
-      /* compute buttom edge control points */
+      /* compute bottom edge control points */
       matrix[3][1] = computeRightEdgeBezierControlPoint(source.v,2,1);
       matrix[3][2] = computeLeftEdgeBezierControlPoint(source.v,2,2);
       
diff --git a/thirdparty/embree/kernels/subdiv/catmullclark_ring.h b/thirdparty/embree/kernels/subdiv/catmullclark_ring.h
index e5ad5dadfe..eab91d9ee6 100644
--- a/thirdparty/embree/kernels/subdiv/catmullclark_ring.h
+++ b/thirdparty/embree/kernels/subdiv/catmullclark_ring.h
@@ -388,7 +388,7 @@ namespace embree
       return (Vertex_t)(n*n*vtx+4.0f*E+F) / ((n+5.0f)*n);      
     }
     
-    /* gets limit tangent in the direction of egde vtx -> ring[0] */
+    /* gets limit tangent in the direction of edge vtx -> ring[0] */
     __forceinline Vertex getLimitTangent() const 
     {
       if (unlikely(std::isinf(vertex_crease_weight)))
@@ -429,7 +429,7 @@ namespace embree
       return sigma * (alpha + beta);
     }
     
-    /* gets limit tangent in the direction of egde vtx -> ring[edge_valence-2] */
+    /* gets limit tangent in the direction of edge vtx -> ring[edge_valence-2] */
     __forceinline Vertex getSecondLimitTangent() const 
     {
       if (unlikely(std::isinf(vertex_crease_weight)))
@@ -763,7 +763,7 @@ namespace embree
     }
 
 
-    /* gets limit tangent in the direction of egde vtx -> ring[0] */
+    /* gets limit tangent in the direction of edge vtx -> ring[0] */
     __forceinline Vertex getLimitTangent() const 
     {
       CatmullClark1Ring cc_vtx;
@@ -779,7 +779,7 @@ namespace embree
       return 2.0f * cc_vtx.getLimitTangent();
     }
 
-    /* gets limit tangent in the direction of egde vtx -> ring[edge_valence-2] */
+    /* gets limit tangent in the direction of edge vtx -> ring[edge_valence-2] */
     __forceinline Vertex getSecondLimitTangent() const 
     {
       CatmullClark1Ring cc_vtx;
diff --git a/thirdparty/embree/kernels/subdiv/catmullrom_curve.h b/thirdparty/embree/kernels/subdiv/catmullrom_curve.h
index 74fc4c1230..9532287d98 100644
--- a/thirdparty/embree/kernels/subdiv/catmullrom_curve.h
+++ b/thirdparty/embree/kernels/subdiv/catmullrom_curve.h
@@ -8,7 +8,7 @@
 
 /*
 
-  Implements Catmul Rom curves with control points p0, p1, p2, p3. At
+  Implements Catmull-Rom curves with control points p0, p1, p2, p3. At
   t=0 the curve goes through p1, with tangent (p2-p0)/3, and for t=1
   the curve goes through p2 with tangent (p3-p2)/2.
 
@@ -91,11 +91,11 @@ namespace embree
         : v0(v0), v1(v1), v2(v2), v3(v3) {}
 
       __forceinline Vertex begin() const {
-        return madd(1.0f/6.0f,v0,madd(2.0f/3.0f,v1,1.0f/6.0f*v2));
+        return v1;
       }
 
       __forceinline Vertex end() const {
-        return madd(1.0f/6.0f,v1,madd(2.0f/3.0f,v2,1.0f/6.0f*v3));
+        return v2;
       }
 
       __forceinline Vertex center() const {
diff --git a/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h b/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h
index f8e8a25f35..dcdb101d7c 100644
--- a/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h
+++ b/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h
@@ -81,29 +81,29 @@ namespace embree
         {
           SourceCurve<Vec3ff> vcurve = center;
           SourceCurve<Vec3fa> ncurve = normal;
-          
+
           /* here we construct a patch which follows the curve l(t) =
            * p(t) +/- r(t)*normalize(cross(n(t),dp(t))) */
           
           const Vec3ff p0   = vcurve.eval(0.0f);
           const Vec3ff dp0  = vcurve.eval_du(0.0f);
-          const Vec3ff ddp0 = vcurve.eval_dudu(0.0f);
+          //const Vec3ff ddp0 = vcurve.eval_dudu(0.0f); // ddp0 is assumed to be 0
 
           const Vec3fa n0   = ncurve.eval(0.0f);
           const Vec3fa dn0  = ncurve.eval_du(0.0f);
 
           const Vec3ff p1   = vcurve.eval(1.0f);
           const Vec3ff dp1  = vcurve.eval_du(1.0f);
-          const Vec3ff ddp1 = vcurve.eval_dudu(1.0f);
+          //const Vec3ff ddp1 = vcurve.eval_dudu(1.0f);  // ddp1 is assumed to be 0
 
           const Vec3fa n1   = ncurve.eval(1.0f);
           const Vec3fa dn1  = ncurve.eval_du(1.0f);
 
           const Vec3fa bt0  = cross(n0,dp0);
-          const Vec3fa dbt0 = cross(dn0,dp0) + cross(n0,ddp0);
+          const Vec3fa dbt0 = cross(dn0,dp0);// + cross(n0,ddp0);
 
           const Vec3fa bt1  = cross(n1,dp1);
-          const Vec3fa dbt1 = cross(dn1,dp1) + cross(n1,ddp1);
+          const Vec3fa dbt1 = cross(dn1,dp1);// + cross(n1,ddp1);
             
           const Vec3fa k0  = normalize(bt0);
           const Vec3fa dk0 = dnormalize(bt0,dbt0);
diff --git a/thirdparty/embree/patches/godot-changes-android.patch b/thirdparty/embree/patches/godot-changes-android.patch
deleted file mode 100644
index a27f924bde..0000000000
--- a/thirdparty/embree/patches/godot-changes-android.patch
+++ /dev/null
@@ -1,103 +0,0 @@
-diff --git a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp
-index ba97dc227b..1679599608 100644
---- a/thirdparty/embree/common/sys/sysinfo.cpp
-+++ b/thirdparty/embree/common/sys/sysinfo.cpp
-@@ -618,7 +618,10 @@ namespace embree
-     static int nThreads = -1;
-     if (nThreads != -1) return nThreads;
- 
--#if defined(__MACOSX__)
-+// -- GODOT start --
-+// #if defined(__MACOSX__)
-+#if defined(__MACOSX__) || defined(__ANDROID__)
-+// -- GODOT end --
-     nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container
-     assert(nThreads);
- #else
-diff --git a/thirdparty/embree/common/sys/thread.cpp b/thirdparty/embree/common/sys/thread.cpp
-index a7827e18f7..f4014be89b 100644
---- a/thirdparty/embree/common/sys/thread.cpp
-+++ b/thirdparty/embree/common/sys/thread.cpp
-@@ -158,7 +158,9 @@ namespace embree
- /// Linux Platform
- ////////////////////////////////////////////////////////////////////////////////
- 
--#if defined(__LINUX__)
-+// -- GODOT start --
-+#if defined(__LINUX__) && !defined(__ANDROID__)
-+// -- GODOT end --
- 
- #include <fstream>
- #include <sstream>
-@@ -247,6 +249,28 @@ namespace embree
- }
- #endif
- 
-+// -- GODOT start --
-+////////////////////////////////////////////////////////////////////////////////
-+/// Android Platform
-+////////////////////////////////////////////////////////////////////////////////
-+
-+#if defined(__ANDROID__)
-+
-+namespace embree
-+{
-+  /*! set affinity of the calling thread */
-+  void setAffinity(ssize_t affinity)
-+  {
-+    cpu_set_t cset;
-+    CPU_ZERO(&cset);
-+    CPU_SET(affinity, &cset);
-+
-+    sched_setaffinity(0, sizeof(cset), &cset);
-+  }
-+}
-+#endif
-+// -- GODOT end --
-+
- ////////////////////////////////////////////////////////////////////////////////
- /// FreeBSD Platform
- ////////////////////////////////////////////////////////////////////////////////
-@@ -355,7 +379,9 @@ namespace embree
-     pthread_attr_destroy(&attr);
- 
-     /* set affinity */
--#if defined(__LINUX__)
-+// -- GODOT start --
-+#if defined(__LINUX__) && !defined(__ANDROID__)
-+// -- GODOT end --
-     if (threadID >= 0) {
-       cpu_set_t cset;
-       CPU_ZERO(&cset);
-@@ -370,7 +396,16 @@ namespace embree
-       CPU_SET(threadID, &cset);
-       pthread_setaffinity_np(*tid, sizeof(cset), &cset);
-     }
-+// -- GODOT start --
-+#elif defined(__ANDROID__)
-+    if (threadID >= 0) {
-+      cpu_set_t cset;
-+      CPU_ZERO(&cset);
-+      CPU_SET(threadID, &cset);
-+      sched_setaffinity(pthread_gettid_np(*tid), sizeof(cset), &cset);
-+    }
- #endif
-+// -- GODOT end --
- 
-     return thread_t(tid);
-   }
-@@ -389,8 +424,14 @@ namespace embree
- 
-   /*! destroy a hardware thread by its handle */
-   void destroyThread(thread_t tid) {
-+// -- GODOT start --
-+#if defined(__ANDROID__)
-+    FATAL("Can't destroy threads on Android.");
-+#else
-     pthread_cancel(*(pthread_t*)tid);
-     delete (pthread_t*)tid;
-+#endif
-+// -- GODOT end --
-   }
- 
-   /*! creates thread local storage */
diff --git a/thirdparty/embree/patches/godot-changes-misc.patch b/thirdparty/embree/patches/godot-changes-misc.patch
deleted file mode 100644
index 8bf0d9fa97..0000000000
--- a/thirdparty/embree/patches/godot-changes-misc.patch
+++ /dev/null
@@ -1,105 +0,0 @@
-diff --git a/thirdparty/embree/common/sys/intrinsics.h b/thirdparty/embree/common/sys/intrinsics.h
-index 79729c87ab..ed8dd7d40a 100644
---- a/thirdparty/embree/common/sys/intrinsics.h
-+++ b/thirdparty/embree/common/sys/intrinsics.h
-@@ -34,8 +34,14 @@
- #endif
- 
- #if defined(__WIN32__)
--#  define NOMINMAX
--#  include <windows.h>
-+// -- GODOT start --
-+#if !defined(NOMINMAX)
-+// -- GODOT end --
-+#define NOMINMAX
-+// -- GODOT start --
-+#endif
-+#include "windows.h"
-+// -- GODOT end --
- #endif
- 
- /* normally defined in pmmintrin.h, but we always need this */
-diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h
-index 3fc5e99b8d..697e07bb86 100644
---- a/thirdparty/embree/common/sys/platform.h
-+++ b/thirdparty/embree/common/sys/platform.h
-@@ -99,7 +99,9 @@
- #define dll_import 
- #endif
- 
--#ifdef __WIN32__
-+// -- GODOT start --
-+#if defined(__WIN32__) && !defined(__MINGW32__)
-+// -- GODOT end --
- #if !defined(__noinline)
- #define __noinline             __declspec(noinline)
- #endif
-@@ -149,6 +151,9 @@
-   #define DELETED  = delete
- #endif
- 
-+// -- GODOT start --
-+#if !defined(likely)
-+// -- GODOT end --
- #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
- #define   likely(expr) (expr)
- #define unlikely(expr) (expr)
-@@ -156,6 +161,9 @@
- #define   likely(expr) __builtin_expect((bool)(expr),true )
- #define unlikely(expr) __builtin_expect((bool)(expr),false)
- #endif
-+// -- GODOT start --
-+#endif
-+// -- GODOT end --
- 
- ////////////////////////////////////////////////////////////////////////////////
- /// Error handling and debugging
-diff --git a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp
-index ba97dc227b..f1a59e511e 100644
---- a/thirdparty/embree/common/sys/sysinfo.cpp
-+++ b/thirdparty/embree/common/sys/sysinfo.cpp
-@@ -248,7 +248,9 @@ namespace embree
- #if defined(__X86_ASM__)
-   __noinline int64_t get_xcr0() 
-   {
--#if defined (__WIN32__)
-+// -- GODOT start --
-+#if defined (__WIN32__) && !defined (__MINGW32__)
-+// -- GODOT end --
-     int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32
-     xcr0 = _xgetbv(0);
-     return xcr0;
-diff --git a/thirdparty/embree/include/embree3/rtcore_common.h b/thirdparty/embree/include/embree3/rtcore_common.h
-index 9c14b28745..4857e1e05e 100644
---- a/thirdparty/embree/include/embree3/rtcore_common.h
-+++ b/thirdparty/embree/include/embree3/rtcore_common.h
-@@ -19,7 +19,9 @@ typedef int ssize_t;
- #endif
- #endif
- 
--#ifdef _WIN32
-+// -- GODOT start --
-+#if defined(_WIN32) && defined(_MSC_VER)
-+// -- GODOT end --
- #  define RTC_ALIGN(...) __declspec(align(__VA_ARGS__))
- #else
- #  define RTC_ALIGN(...) __attribute__((aligned(__VA_ARGS__)))
-diff --git a/thirdparty/embree/common/tasking/taskschedulertbb.h b/thirdparty/embree/common/tasking/taskschedulertbb.h
-index 3fd15816e9..35bd49849f 100644
---- a/thirdparty/embree/common/tasking/taskschedulertbb.h
-+++ b/thirdparty/embree/common/tasking/taskschedulertbb.h
-@@ -12,7 +12,13 @@
- #include "../sys/ref.h"
- 
- #if defined(__WIN32__)
-+// -- GODOT start --
-+#if !defined(NOMINMAX)
-+// -- GODOT end --
- #  define NOMINMAX
-+// -- GODOT start --
-+#endif
-+// -- GODOT end --
- #endif
- 
- // We need to define these to avoid implicit linkage against
- 
-\ No newline at end of file
diff --git a/thirdparty/embree/patches/godot-changes-noexcept.patch b/thirdparty/embree/patches/godot-changes-noexcept.patch
index 598a7f2ddc..84169c36e4 100644
--- a/thirdparty/embree/patches/godot-changes-noexcept.patch
+++ b/thirdparty/embree/patches/godot-changes-noexcept.patch
@@ -1,5 +1,5 @@
 diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h
-index f052d8b468..645681ac63 100644
+index f2969a88f1..6d411e4852 100644
 --- a/thirdparty/embree/common/algorithms/parallel_for.h
 +++ b/thirdparty/embree/common/algorithms/parallel_for.h
 @@ -21,7 +21,10 @@ namespace embree
@@ -12,9 +12,9 @@ index f052d8b468..645681ac63 100644
 +        abort();
 +        // -- GODOT end --
      }
-     
  #elif defined(TASKING_TBB)
-@@ -31,13 +34,19 @@ namespace embree
+   #if TBB_INTERFACE_VERSION >= 12002
+@@ -30,13 +33,19 @@ namespace embree
          func(i);
        },context);
      if (context.is_group_execution_cancelled())
@@ -36,7 +36,7 @@ index f052d8b468..645681ac63 100644
    #endif
  
  #elif defined(TASKING_PPL)
-@@ -57,7 +66,10 @@ namespace embree
+@@ -56,7 +65,10 @@ namespace embree
  #if defined(TASKING_INTERNAL)
      TaskScheduler::spawn(first,last,minStepSize,func);
      if (!TaskScheduler::wait())
@@ -48,7 +48,7 @@ index f052d8b468..645681ac63 100644
  
  #elif defined(TASKING_TBB)
    #if TBB_INTERFACE_VERSION >= 12002
-@@ -66,13 +78,19 @@ namespace embree
+@@ -65,13 +77,19 @@ namespace embree
          func(range<Index>(r.begin(),r.end()));
        },context);
      if (context.is_group_execution_cancelled())
@@ -70,7 +70,7 @@ index f052d8b468..645681ac63 100644
    #endif
  
  #elif defined(TASKING_PPL)
-@@ -104,13 +122,19 @@ namespace embree
+@@ -103,13 +121,19 @@ namespace embree
            func(i);
          },tbb::simple_partitioner(),context);
        if (context.is_group_execution_cancelled())
@@ -92,7 +92,7 @@ index f052d8b468..645681ac63 100644
      #endif
    }
  
-@@ -125,13 +149,19 @@ namespace embree
+@@ -124,13 +148,19 @@ namespace embree
            func(i);
          },ap,context);
        if (context.is_group_execution_cancelled())
@@ -115,7 +115,7 @@ index f052d8b468..645681ac63 100644
    }
  
 diff --git a/thirdparty/embree/common/algorithms/parallel_reduce.h b/thirdparty/embree/common/algorithms/parallel_reduce.h
-index f42ae2ec50..8271372ea4 100644
+index 1a94aad8c4..cd0078f2e6 100644
 --- a/thirdparty/embree/common/algorithms/parallel_reduce.h
 +++ b/thirdparty/embree/common/algorithms/parallel_reduce.h
 @@ -58,15 +58,19 @@ namespace embree
@@ -247,10 +247,10 @@ index 1bc30fe9a5..abdd269069 100644
  
    /* hint for transparent huge pages (THP) */
 diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h
-index 8a6d9fa0a9..697e07bb86 100644
+index be3ec36436..728bf6ed7d 100644
 --- a/thirdparty/embree/common/sys/platform.h
 +++ b/thirdparty/embree/common/sys/platform.h
-@@ -179,11 +179,19 @@
+@@ -178,11 +178,19 @@
  #define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl
  
  #if defined(DEBUG) // only report file and line in debug mode
@@ -351,7 +351,7 @@ index dca835a716..ad438588a3 100644
  
    bool TaskScheduler::steal_from_other_threads(Thread& thread)
 diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.h b/thirdparty/embree/common/tasking/taskschedulerinternal.h
-index c766a0bb6a..8fa6bb12fa 100644
+index 61a0e57c5b..6cc2495195 100644
 --- a/thirdparty/embree/common/tasking/taskschedulerinternal.h
 +++ b/thirdparty/embree/common/tasking/taskschedulerinternal.h
 @@ -123,7 +123,10 @@ namespace embree
@@ -391,7 +391,7 @@ index c766a0bb6a..8fa6bb12fa 100644
      /*! steals a task from a different thread */
      bool steal_from_other_threads(Thread& thread);
 diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
-index d8da78eed7..d857ff7d95 100644
+index 40f9043736..57f75bfd7e 100644
 --- a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
 +++ b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
 @@ -150,7 +150,10 @@ namespace embree
@@ -407,10 +407,10 @@ index d8da78eed7..d857ff7d95 100644
      return s;
    } 
 diff --git a/thirdparty/embree/kernels/common/rtcore.cpp b/thirdparty/embree/kernels/common/rtcore.cpp
-index 74e9fb335c..94b3819e42 100644
+index 95a94319ec..a6ea55bfc4 100644
 --- a/thirdparty/embree/kernels/common/rtcore.cpp
 +++ b/thirdparty/embree/kernels/common/rtcore.cpp
-@@ -197,7 +197,10 @@ RTC_NAMESPACE_BEGIN;
+@@ -198,7 +198,10 @@ RTC_NAMESPACE_BEGIN;
      if (quality != RTC_BUILD_QUALITY_LOW &&
          quality != RTC_BUILD_QUALITY_MEDIUM &&
          quality != RTC_BUILD_QUALITY_HIGH)
@@ -422,7 +422,7 @@ index 74e9fb335c..94b3819e42 100644
      scene->setBuildQuality(quality);
      RTC_CATCH_END2(scene);
    }
-@@ -1350,7 +1353,10 @@ RTC_NAMESPACE_BEGIN;
+@@ -1351,7 +1354,10 @@ RTC_NAMESPACE_BEGIN;
          quality != RTC_BUILD_QUALITY_MEDIUM &&
          quality != RTC_BUILD_QUALITY_HIGH &&
          quality != RTC_BUILD_QUALITY_REFIT)
@@ -435,172 +435,67 @@ index 74e9fb335c..94b3819e42 100644
      RTC_CATCH_END2(geometry);
    }
 diff --git a/thirdparty/embree/kernels/common/rtcore.h b/thirdparty/embree/kernels/common/rtcore.h
-index 4e4b24e9c2..373e49a689 100644
+index 4e4b24e9c2..ac58a84d6f 100644
 --- a/thirdparty/embree/kernels/common/rtcore.h
 +++ b/thirdparty/embree/kernels/common/rtcore.h
-@@ -25,52 +25,58 @@ namespace embree
+@@ -25,6 +25,13 @@ namespace embree
  #endif
  
  /*! Macros used in the rtcore API implementation */
--#define RTC_CATCH_BEGIN try {
 +// -- GODOT start --
-+// #define RTC_CATCH_BEGIN try {
 +#define RTC_CATCH_BEGIN
-   
--#define RTC_CATCH_END(device)                                                \
--  } catch (std::bad_alloc&) {                                                   \
--    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
--  } catch (rtcore_error& e) {                                                   \
--    Device::process_error(device,e.error,e.what());                             \
--  } catch (std::exception& e) {                                                 \
--    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
--  } catch (...) {                                                               \
--    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
--  }
-+// #define RTC_CATCH_END(device)                                                \
-+//   } catch (std::bad_alloc&) {                                                   \
-+//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
-+//   } catch (rtcore_error& e) {                                                   \
-+//     Device::process_error(device,e.error,e.what());                             \
-+//   } catch (std::exception& e) {                                                 \
-+//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
-+//   } catch (...) {                                                               \
-+//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
-+//   }
 +#define RTC_CATCH_END(device)
-   
--#define RTC_CATCH_END2(scene)                                                \
--  } catch (std::bad_alloc&) {                                                   \
--    Device* device = scene ? scene->device : nullptr;                           \
--    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
--  } catch (rtcore_error& e) {                                                   \
--    Device* device = scene ? scene->device : nullptr;                           \
--    Device::process_error(device,e.error,e.what());                             \
--  } catch (std::exception& e) {                                                 \
--    Device* device = scene ? scene->device : nullptr;                           \
--    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
--  } catch (...) {                                                               \
--    Device* device = scene ? scene->device : nullptr;                           \
--    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
--  }
-+// #define RTC_CATCH_END2(scene)                                                \
-+//   } catch (std::bad_alloc&) {                                                   \
-+//     Device* device = scene ? scene->device : nullptr;                           \
-+//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
-+//   } catch (rtcore_error& e) {                                                   \
-+//     Device* device = scene ? scene->device : nullptr;                           \
-+//     Device::process_error(device,e.error,e.what());                             \
-+//   } catch (std::exception& e) {                                                 \
-+//     Device* device = scene ? scene->device : nullptr;                           \
-+//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
-+//   } catch (...) {                                                               \
-+//     Device* device = scene ? scene->device : nullptr;                           \
-+//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
-+//   }
 +#define RTC_CATCH_END2(scene)
- 
--#define RTC_CATCH_END2_FALSE(scene)                                             \
--  } catch (std::bad_alloc&) {                                                   \
--    Device* device = scene ? scene->device : nullptr;                           \
--    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
--    return false;                                                               \
--  } catch (rtcore_error& e) {                                                   \
--    Device* device = scene ? scene->device : nullptr;                           \
--    Device::process_error(device,e.error,e.what());                             \
--    return false;                                                               \
--  } catch (std::exception& e) {                                                 \
--    Device* device = scene ? scene->device : nullptr;                           \
--    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
--    return false;                                                               \
--  } catch (...) {                                                               \
--    Device* device = scene ? scene->device : nullptr;                           \
--    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
--    return false;                                                               \
--  }
-+// #define RTC_CATCH_END2_FALSE(scene)                                             \
-+//   } catch (std::bad_alloc&) {                                                   \
-+//     Device* device = scene ? scene->device : nullptr;                           \
-+//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
-+//     return false;                                                               \
-+//   } catch (rtcore_error& e) {                                                   \
-+//     Device* device = scene ? scene->device : nullptr;                           \
-+//     Device::process_error(device,e.error,e.what());                             \
-+//     return false;                                                               \
-+//   } catch (std::exception& e) {                                                 \
-+//     Device* device = scene ? scene->device : nullptr;                           \
-+//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
-+//     return false;                                                               \
-+//   } catch (...) {                                                               \
-+//     Device* device = scene ? scene->device : nullptr;                           \
-+//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
-+//     return false;                                                               \
-+//   }
 +#define RTC_CATCH_END2_FALSE(scene) return false;
++
++#if 0
+ #define RTC_CATCH_BEGIN try {
+   
+ #define RTC_CATCH_END(device)                                                \
+@@ -71,6 +78,8 @@ namespace embree
+     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+     return false;                                                               \
+   }
++#endif
 +// -- GODOT end --
  
  #define RTC_VERIFY_HANDLE(handle)                               \
    if (handle == nullptr) {                                         \
-@@ -97,28 +103,38 @@ namespace embree
+@@ -97,6 +106,8 @@ namespace embree
  #define RTC_TRACE(x) 
  #endif
  
--  /*! used to throw embree API errors */
--  struct rtcore_error : public std::exception
--  {
--    __forceinline rtcore_error(RTCError error, const std::string& str)
--      : error(error), str(str) {}
--    
--    ~rtcore_error() throw() {}
--    
--    const char* what () const throw () {
--      return str.c_str();
--    }
--    
--    RTCError error;
--    std::string str;
--  };
-+// -- GODOT begin --
-+//   /*! used to throw embree API errors */
-+//   struct rtcore_error : public std::exception
-+//   {
-+//     __forceinline rtcore_error(RTCError error, const std::string& str)
-+//       : error(error), str(str) {}
-+//     
-+//     ~rtcore_error() throw() {}
-+//     
-+//     const char* what () const throw () {
-+//       return str.c_str();
-+//     }
-+//     
-+//     RTCError error;
-+//     std::string str;
-+//   };
-+// -- GODOT end --
++// -- GODOT start --
++#if 0
+   /*! used to throw embree API errors */
+   struct rtcore_error : public std::exception
+   {
+@@ -112,14 +123,18 @@ namespace embree
+     RTCError error;
+     std::string str;
+   };
++#endif
  
  #if defined(DEBUG) // only report file and line in debug mode
-+  // -- GODOT begin --
-+  // #define throw_RTCError(error,str) \
-+  //   throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
    #define throw_RTCError(error,str) \
 -    throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
 +    printf("%s (%d): %s", __FILE__, __LINE__, std::string(str).c_str()), abort();
-+  // -- GODOT end --
++    // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
  #else
-+  // -- GODOT begin --
-+  // #define throw_RTCError(error,str) \
-+  //   throw rtcore_error(error,str);
    #define throw_RTCError(error,str) \
 -    throw rtcore_error(error,str);
 +    abort();
-+  // -- GODOT end --
++    // throw rtcore_error(error,str);
  #endif
++// -- GODOT end --
  
  #define RTC_BUILD_ARGUMENTS_HAS(settings,member) \
+   (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member))) 
 diff --git a/thirdparty/embree/kernels/common/scene.cpp b/thirdparty/embree/kernels/common/scene.cpp
-index 0149055f2c..408d7eae6f 100644
+index ad1916c54e..65d31d0f81 100644
 --- a/thirdparty/embree/kernels/common/scene.cpp
 +++ b/thirdparty/embree/kernels/common/scene.cpp
-@@ -792,16 +792,18 @@ namespace embree
+@@ -790,16 +790,18 @@ namespace embree
      }
  
      /* initiate build */
diff --git a/thirdparty/embree/patches/godot-changes-ubsan.patch b/thirdparty/embree/patches/godot-changes-ubsan.patch
deleted file mode 100644
index 1336246f0d..0000000000
--- a/thirdparty/embree/patches/godot-changes-ubsan.patch
+++ /dev/null
@@ -1,24 +0,0 @@
-diff --git a/thirdparty/embree/kernels/builders/primrefgen.cpp b/thirdparty/embree/kernels/builders/primrefgen.cpp
-index bb4fc81dfe..d279dc4993 100644
---- a/thirdparty/embree/kernels/builders/primrefgen.cpp
-+++ b/thirdparty/embree/kernels/builders/primrefgen.cpp
-@@ -184,6 +184,9 @@ namespace embree
- 
-     // special variants for grid meshes
- 
-+// -- GODOT start --
-+#if defined(EMBREE_GEOMETRY_GRID)
-+// -- GODOT end --
-     PrimInfo createPrimRefArrayGrids(Scene* scene, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids)
-     {
-       PrimInfo pinfo(empty);
-@@ -293,6 +296,9 @@ namespace embree
- 
-       return pinfo;
-     }
-+// -- GODOT start --
-+#endif
-+// -- GODOT end --
-     
-     // ====================================================================================================
-     // ====================================================================================================