summaryrefslogtreecommitdiff
path: root/thirdparty/embree-aarch64/kernels/common/stack_item.h
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/embree-aarch64/kernels/common/stack_item.h')
-rw-r--r--thirdparty/embree-aarch64/kernels/common/stack_item.h125
1 files changed, 125 insertions, 0 deletions
diff --git a/thirdparty/embree-aarch64/kernels/common/stack_item.h b/thirdparty/embree-aarch64/kernels/common/stack_item.h
new file mode 100644
index 0000000000..533c385365
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/stack_item.h
@@ -0,0 +1,125 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+ /*! An item on the stack holds the node ID and distance of that node. */
+ template<typename T>
+ struct __aligned(16) StackItemT
+ {
+ /*! assert that the xchg function works */
+ static_assert(sizeof(T) <= 12, "sizeof(T) <= 12 failed");
+
+ __forceinline StackItemT() {}
+
+ __forceinline StackItemT(T &ptr, unsigned &dist) : ptr(ptr), dist(dist) {}
+
+ /*! use SSE instructions to swap stack items */
+ __forceinline static void xchg(StackItemT& a, StackItemT& b)
+ {
+ const vfloat4 sse_a = vfloat4::load((float*)&a);
+ const vfloat4 sse_b = vfloat4::load((float*)&b);
+ vfloat4::store(&a,sse_b);
+ vfloat4::store(&b,sse_a);
+ }
+
+ /*! Sort 2 stack items. */
+ __forceinline friend void sort(StackItemT& s1, StackItemT& s2) {
+ if (s2.dist < s1.dist) xchg(s2,s1);
+ }
+
+ /*! Sort 3 stack items. */
+ __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3)
+ {
+ if (s2.dist < s1.dist) xchg(s2,s1);
+ if (s3.dist < s2.dist) xchg(s3,s2);
+ if (s2.dist < s1.dist) xchg(s2,s1);
+ }
+
+ /*! Sort 4 stack items. */
+ __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3, StackItemT& s4)
+ {
+ if (s2.dist < s1.dist) xchg(s2,s1);
+ if (s4.dist < s3.dist) xchg(s4,s3);
+ if (s3.dist < s1.dist) xchg(s3,s1);
+ if (s4.dist < s2.dist) xchg(s4,s2);
+ if (s3.dist < s2.dist) xchg(s3,s2);
+ }
+
+ /*! use SSE instructions to swap stack items */
+ __forceinline static void cmp_xchg(vint4& a, vint4& b)
+ {
+#if defined(__AVX512VL__)
+ const vboolf4 mask(shuffle<2,2,2,2>(b) < shuffle<2,2,2,2>(a));
+#else
+ const vboolf4 mask0(b < a);
+ const vboolf4 mask(shuffle<2,2,2,2>(mask0));
+#endif
+ const vint4 c = select(mask,b,a);
+ const vint4 d = select(mask,a,b);
+ a = c;
+ b = d;
+ }
+
+ /*! Sort 3 stack items. */
+ __forceinline static void sort3(vint4& s1, vint4& s2, vint4& s3)
+ {
+ cmp_xchg(s2,s1);
+ cmp_xchg(s3,s2);
+ cmp_xchg(s2,s1);
+ }
+
+ /*! Sort 4 stack items. */
+ __forceinline static void sort4(vint4& s1, vint4& s2, vint4& s3, vint4& s4)
+ {
+ cmp_xchg(s2,s1);
+ cmp_xchg(s4,s3);
+ cmp_xchg(s3,s1);
+ cmp_xchg(s4,s2);
+ cmp_xchg(s3,s2);
+ }
+
+
+ /*! Sort N stack items. */
+ __forceinline friend void sort(StackItemT* begin, StackItemT* end)
+ {
+ for (StackItemT* i = begin+1; i != end; ++i)
+ {
+ const vfloat4 item = vfloat4::load((float*)i);
+ const unsigned dist = i->dist;
+ StackItemT* j = i;
+
+ while ((j != begin) && ((j-1)->dist < dist))
+ {
+ vfloat4::store(j, vfloat4::load((float*)(j-1)));
+ --j;
+ }
+
+ vfloat4::store(j, item);
+ }
+ }
+
+ public:
+ T ptr;
+ unsigned dist;
+ };
+
+ /*! An item on the stack holds the node ID and active ray mask. */
+ template<typename T>
+ struct __aligned(8) StackItemMaskT
+ {
+ T ptr;
+ size_t mask;
+ };
+
+ struct __aligned(8) StackItemMaskCoherent
+ {
+ size_t mask;
+ size_t parent;
+ size_t child;
+ };
+}