summaryrefslogtreecommitdiff
path: root/thirdparty/embree/common
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/embree/common')
-rw-r--r--thirdparty/embree/common/algorithms/parallel_any_of.h55
-rw-r--r--thirdparty/embree/common/algorithms/parallel_filter.h93
-rw-r--r--thirdparty/embree/common/algorithms/parallel_for.h186
-rw-r--r--thirdparty/embree/common/algorithms/parallel_for_for.h149
-rw-r--r--thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h112
-rw-r--r--thirdparty/embree/common/algorithms/parallel_map.h85
-rw-r--r--thirdparty/embree/common/algorithms/parallel_partition.h283
-rw-r--r--thirdparty/embree/common/algorithms/parallel_prefix_sum.h85
-rw-r--r--thirdparty/embree/common/algorithms/parallel_reduce.h150
-rw-r--r--thirdparty/embree/common/algorithms/parallel_set.h52
-rw-r--r--thirdparty/embree/common/algorithms/parallel_sort.h454
-rw-r--r--thirdparty/embree/common/lexers/parsestream.h101
-rw-r--r--thirdparty/embree/common/lexers/stream.h215
-rw-r--r--thirdparty/embree/common/lexers/streamfilters.h39
-rw-r--r--thirdparty/embree/common/lexers/stringstream.cpp51
-rw-r--r--thirdparty/embree/common/lexers/stringstream.h29
-rw-r--r--thirdparty/embree/common/lexers/tokenstream.cpp181
-rw-r--r--thirdparty/embree/common/lexers/tokenstream.h164
-rw-r--r--thirdparty/embree/common/math/affinespace.h361
-rw-r--r--thirdparty/embree/common/math/bbox.h331
-rw-r--r--thirdparty/embree/common/math/col3.h47
-rw-r--r--thirdparty/embree/common/math/col4.h47
-rw-r--r--thirdparty/embree/common/math/color.h241
-rw-r--r--thirdparty/embree/common/math/constants.cpp27
-rw-r--r--thirdparty/embree/common/math/constants.h197
-rw-r--r--thirdparty/embree/common/math/interval.h161
-rw-r--r--thirdparty/embree/common/math/lbbox.h289
-rw-r--r--thirdparty/embree/common/math/linearspace2.h148
-rw-r--r--thirdparty/embree/common/math/linearspace3.h213
-rw-r--r--thirdparty/embree/common/math/math.h369
-rw-r--r--thirdparty/embree/common/math/obbox.h39
-rw-r--r--thirdparty/embree/common/math/quaternion.h254
-rw-r--r--thirdparty/embree/common/math/range.h137
-rw-r--r--thirdparty/embree/common/math/transcendental.h525
-rw-r--r--thirdparty/embree/common/math/vec2.h235
-rw-r--r--thirdparty/embree/common/math/vec2fa.h301
-rw-r--r--thirdparty/embree/common/math/vec3.h337
-rw-r--r--thirdparty/embree/common/math/vec3ba.h120
-rw-r--r--thirdparty/embree/common/math/vec3fa.h727
-rw-r--r--thirdparty/embree/common/math/vec3ia.h186
-rw-r--r--thirdparty/embree/common/math/vec4.h243
-rw-r--r--thirdparty/embree/common/simd/arm/emulation.h50
-rw-r--r--thirdparty/embree/common/simd/arm/sse2neon.h6996
-rw-r--r--thirdparty/embree/common/simd/avx.h34
-rw-r--r--thirdparty/embree/common/simd/avx512.h41
-rw-r--r--thirdparty/embree/common/simd/simd.h110
-rw-r--r--thirdparty/embree/common/simd/sse.cpp34
-rw-r--r--thirdparty/embree/common/simd/sse.h35
-rw-r--r--thirdparty/embree/common/simd/varying.h145
-rw-r--r--thirdparty/embree/common/simd/vboold4_avx.h169
-rw-r--r--thirdparty/embree/common/simd/vboold4_avx512.h156
-rw-r--r--thirdparty/embree/common/simd/vboold8_avx512.h151
-rw-r--r--thirdparty/embree/common/simd/vboolf16_avx512.h153
-rw-r--r--thirdparty/embree/common/simd/vboolf4_avx512.h159
-rw-r--r--thirdparty/embree/common/simd/vboolf4_sse2.h189
-rw-r--r--thirdparty/embree/common/simd/vboolf8_avx.h202
-rw-r--r--thirdparty/embree/common/simd/vboolf8_avx512.h159
-rw-r--r--thirdparty/embree/common/simd/vdouble4_avx.h321
-rw-r--r--thirdparty/embree/common/simd/vdouble8_avx512.h351
-rw-r--r--thirdparty/embree/common/simd/vfloat16_avx512.h615
-rw-r--r--thirdparty/embree/common/simd/vfloat4_sse2.h722
-rw-r--r--thirdparty/embree/common/simd/vfloat8_avx.h758
-rw-r--r--thirdparty/embree/common/simd/vint16_avx512.h472
-rw-r--r--thirdparty/embree/common/simd/vint4_sse2.h598
-rw-r--r--thirdparty/embree/common/simd/vint8_avx.h470
-rw-r--r--thirdparty/embree/common/simd/vint8_avx2.h518
-rw-r--r--thirdparty/embree/common/simd/vllong4_avx2.h352
-rw-r--r--thirdparty/embree/common/simd/vllong8_avx512.h358
-rw-r--r--thirdparty/embree/common/simd/vuint16_avx512.h424
-rw-r--r--thirdparty/embree/common/simd/vuint4_sse2.h426
-rw-r--r--thirdparty/embree/common/simd/vuint8_avx.h386
-rw-r--r--thirdparty/embree/common/simd/vuint8_avx2.h446
-rw-r--r--thirdparty/embree/common/sys/alloc.cpp327
-rw-r--r--thirdparty/embree/common/sys/alloc.h164
-rw-r--r--thirdparty/embree/common/sys/array.h222
-rw-r--r--thirdparty/embree/common/sys/atomic.h59
-rw-r--r--thirdparty/embree/common/sys/barrier.cpp289
-rw-r--r--thirdparty/embree/common/sys/barrier.h112
-rw-r--r--thirdparty/embree/common/sys/condition.cpp85
-rw-r--r--thirdparty/embree/common/sys/condition.h31
-rw-r--r--thirdparty/embree/common/sys/filename.cpp138
-rw-r--r--thirdparty/embree/common/sys/filename.h81
-rw-r--r--thirdparty/embree/common/sys/intrinsics.h525
-rw-r--r--thirdparty/embree/common/sys/library.cpp83
-rw-r--r--thirdparty/embree/common/sys/library.h21
-rw-r--r--thirdparty/embree/common/sys/mutex.cpp57
-rw-r--r--thirdparty/embree/common/sys/mutex.h98
-rw-r--r--thirdparty/embree/common/sys/platform.h392
-rw-r--r--thirdparty/embree/common/sys/ref.h122
-rw-r--r--thirdparty/embree/common/sys/regression.cpp30
-rw-r--r--thirdparty/embree/common/sys/regression.h25
-rw-r--r--thirdparty/embree/common/sys/string.cpp42
-rw-r--r--thirdparty/embree/common/sys/string.h37
-rw-r--r--thirdparty/embree/common/sys/sysinfo.cpp656
-rw-r--r--thirdparty/embree/common/sys/sysinfo.h178
-rw-r--r--thirdparty/embree/common/sys/thread.cpp474
-rw-r--r--thirdparty/embree/common/sys/thread.h49
-rw-r--r--thirdparty/embree/common/sys/vector.h242
-rw-r--r--thirdparty/embree/common/tasking/taskscheduler.h15
-rw-r--r--thirdparty/embree/common/tasking/taskschedulerinternal.cpp420
-rw-r--r--thirdparty/embree/common/tasking/taskschedulerinternal.h385
-rw-r--r--thirdparty/embree/common/tasking/taskschedulerppl.h46
-rw-r--r--thirdparty/embree/common/tasking/taskschedulertbb.h73
103 files changed, 29497 insertions, 0 deletions
diff --git a/thirdparty/embree/common/algorithms/parallel_any_of.h b/thirdparty/embree/common/algorithms/parallel_any_of.h
new file mode 100644
index 0000000000..a64e4a1889
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_any_of.h
@@ -0,0 +1,55 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <functional>
+#include "parallel_reduce.h"
+
+namespace embree
+{
+
+ template<typename Index, class UnaryPredicate>
+ __forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred)
+ {
+ bool ret = false;
+
+#if defined(TASKING_TBB)
+#if TBB_INTERFACE_VERSION >= 12002
+ tbb::task_group_context context;
+ tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred,&context](const tbb::blocked_range<size_t>& r) {
+ if (context.is_group_execution_cancelled()) return;
+ for (size_t i = r.begin(); i != r.end(); ++i) {
+ if (pred(i)) {
+ ret = true;
+ context.cancel_group_execution();
+ }
+ }
+ });
+#else
+ tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred](const tbb::blocked_range<size_t>& r) {
+ if (tbb::task::self().is_cancelled()) return;
+ for (size_t i = r.begin(); i != r.end(); ++i) {
+ if (pred(i)) {
+ ret = true;
+ tbb::task::self().cancel_group_execution();
+ }
+ }
+ });
+#endif
+#else
+ ret = parallel_reduce (first, last, false, [pred](const range<size_t>& r)->bool {
+ bool localret = false;
+ for (auto i=r.begin(); i<r.end(); ++i) {
+ localret |= pred(i);
+ }
+ return localret;
+ },
+ std::bit_or<bool>()
+ );
+#endif
+
+ return ret;
+ }
+
+} // end namespace
diff --git a/thirdparty/embree/common/algorithms/parallel_filter.h b/thirdparty/embree/common/algorithms/parallel_filter.h
new file mode 100644
index 0000000000..090ef164c2
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_filter.h
@@ -0,0 +1,93 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+ template<typename Ty, typename Index, typename Predicate>
+ inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate)
+ {
+ Index j = first;
+ for (Index i=first; i<last; i++)
+ if (predicate(data[i]))
+ data[j++] = data[i];
+
+ return j;
+ }
+
+ template<typename Ty, typename Index, typename Predicate>
+ inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate)
+ {
+ /* sequential fallback */
+ if (end-begin <= minStepSize)
+ return sequential_filter(data,begin,end,predicate);
+
+ /* calculate number of tasks to use */
+ enum { MAX_TASKS = 64 };
+ const Index numThreads = TaskScheduler::threadCount();
+ const Index numBlocks = (end-begin+minStepSize-1)/minStepSize;
+ const Index taskCount = min(numThreads,numBlocks,(Index)MAX_TASKS);
+
+ /* filter blocks */
+ Index nused[MAX_TASKS];
+ Index nfree[MAX_TASKS];
+ parallel_for(taskCount, [&](const Index taskIndex)
+ {
+ const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount;
+ const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount;
+ const Index i2 = sequential_filter(data,i0,i1,predicate);
+ nused[taskIndex] = i2-i0;
+ nfree[taskIndex] = i1-i2;
+ });
+
+ /* calculate offsets */
+ Index sused=0;
+ Index sfree=0;
+ Index pfree[MAX_TASKS];
+ for (Index i=0; i<taskCount; i++)
+ {
+ sused+=nused[i];
+ Index cfree = nfree[i]; pfree[i] = sfree; sfree+=cfree;
+ }
+
+ /* return if we did not filter out any element */
+ assert(sfree <= end-begin);
+ assert(sused <= end-begin);
+ if (sused == end-begin)
+ return end;
+
+ /* otherwise we have to copy misplaced elements around */
+ parallel_for(taskCount, [&](const Index taskIndex)
+ {
+ /* destination to write elements to */
+ Index dst = begin+(taskIndex+0)*(end-begin)/taskCount+nused[taskIndex];
+ Index dst_end = min(dst+nfree[taskIndex],begin+sused);
+ if (dst_end <= dst) return;
+
+ /* range of misplaced elements to copy to destination */
+ Index r0 = pfree[taskIndex];
+ Index r1 = r0+dst_end-dst;
+
+ /* find range in misplaced elements in back to front order */
+ Index k0=0;
+ for (Index i=taskCount-1; i>0; i--)
+ {
+ if (k0 > r1) break;
+ Index k1 = k0+nused[i];
+ Index src = begin+(i+0)*(end-begin)/taskCount+nused[i];
+ for (Index i=max(r0,k0); i<min(r1,k1); i++) {
+ Index isrc = src-i+k0-1;
+ assert(dst >= begin && dst < end);
+ assert(isrc >= begin && isrc < end);
+ data[dst++] = data[isrc];
+ }
+ k0 = k1;
+ }
+ });
+
+ return begin+sused;
+ }
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h
new file mode 100644
index 0000000000..645681ac63
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_for.h
@@ -0,0 +1,186 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../tasking/taskscheduler.h"
+#include "../sys/array.h"
+#include "../math/math.h"
+#include "../math/range.h"
+
+namespace embree
+{
+ /* parallel_for without range */
+ template<typename Index, typename Func>
+ __forceinline void parallel_for( const Index N, const Func& func)
+ {
+#if defined(TASKING_INTERNAL)
+ if (N) {
+ TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range<Index>& r) {
+ assert(r.size() == 1);
+ func(r.begin());
+ });
+ if (!TaskScheduler::wait())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+ }
+
+#elif defined(TASKING_TBB)
+ #if TBB_INTERFACE_VERSION >= 12002
+ tbb::task_group_context context;
+ tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+ func(i);
+ },context);
+ if (context.is_group_execution_cancelled())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+ #else
+ tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+ func(i);
+ });
+ if (tbb::task::self().is_cancelled())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+ #endif
+
+#elif defined(TASKING_PPL)
+ concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) {
+ func(i);
+ });
+#else
+# error "no tasking system enabled"
+#endif
+ }
+
+ /* parallel for with range and granulatity */
+ template<typename Index, typename Func>
+ __forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func)
+ {
+ assert(first <= last);
+#if defined(TASKING_INTERNAL)
+ TaskScheduler::spawn(first,last,minStepSize,func);
+ if (!TaskScheduler::wait())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+
+#elif defined(TASKING_TBB)
+ #if TBB_INTERFACE_VERSION >= 12002
+ tbb::task_group_context context;
+ tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
+ func(range<Index>(r.begin(),r.end()));
+ },context);
+ if (context.is_group_execution_cancelled())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+ #else
+ tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
+ func(range<Index>(r.begin(),r.end()));
+ });
+ if (tbb::task::self().is_cancelled())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+ #endif
+
+#elif defined(TASKING_PPL)
+ concurrency::parallel_for(first, last, Index(1) /*minStepSize*/, [&](Index i) {
+ func(range<Index>(i,i+1));
+ });
+
+#else
+# error "no tasking system enabled"
+#endif
+ }
+
+ /* parallel for with range */
+ template<typename Index, typename Func>
+ __forceinline void parallel_for( const Index first, const Index last, const Func& func)
+ {
+ assert(first <= last);
+ parallel_for(first,last,(Index)1,func);
+ }
+
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001)
+
+ template<typename Index, typename Func>
+ __forceinline void parallel_for_static( const Index N, const Func& func)
+ {
+ #if TBB_INTERFACE_VERSION >= 12002
+ tbb::task_group_context context;
+ tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+ func(i);
+ },tbb::simple_partitioner(),context);
+ if (context.is_group_execution_cancelled())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+ #else
+ tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+ func(i);
+ },tbb::simple_partitioner());
+ if (tbb::task::self().is_cancelled())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+ #endif
+ }
+
+ typedef tbb::affinity_partitioner affinity_partitioner;
+
+ template<typename Index, typename Func>
+ __forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap)
+ {
+ #if TBB_INTERFACE_VERSION >= 12002
+ tbb::task_group_context context;
+ tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+ func(i);
+ },ap,context);
+ if (context.is_group_execution_cancelled())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+ #else
+ tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+ func(i);
+ },ap);
+ if (tbb::task::self().is_cancelled())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+ #endif
+ }
+
+#else
+
+ template<typename Index, typename Func>
+ __forceinline void parallel_for_static( const Index N, const Func& func)
+ {
+ parallel_for(N,func);
+ }
+
+ struct affinity_partitioner {
+ };
+
+ template<typename Index, typename Func>
+ __forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap)
+ {
+ parallel_for(N,func);
+ }
+
+#endif
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_for_for.h b/thirdparty/embree/common/algorithms/parallel_for_for.h
new file mode 100644
index 0000000000..92c37a4a38
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_for_for.h
@@ -0,0 +1,149 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+ template<typename ArrayArray, typename Func>
+ __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )
+ {
+ size_t k=0;
+ for (size_t i=0; i!=array2.size(); ++i) {
+ const size_t N = array2[i]->size();
+ if (N) func(array2[i],range<size_t>(0,N),k);
+ k+=N;
+ }
+ }
+
+ class ParallelForForState
+ {
+ public:
+
+ enum { MAX_TASKS = 64 };
+
+ __forceinline ParallelForForState ()
+ : taskCount(0) {}
+
+ template<typename ArrayArray>
+ __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) {
+ init(array2,minStepSize);
+ }
+
+ template<typename ArrayArray>
+ __forceinline void init ( ArrayArray& array2, const size_t minStepSize )
+ {
+ /* first calculate total number of elements */
+ size_t N = 0;
+ for (size_t i=0; i<array2.size(); i++) {
+ N += array2[i] ? array2[i]->size() : 0;
+ }
+ this->N = N;
+
+ /* calculate number of tasks to use */
+ const size_t numThreads = TaskScheduler::threadCount();
+ const size_t numBlocks = (N+minStepSize-1)/minStepSize;
+ taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS)));
+
+ /* calculate start (i,j) for each task */
+ size_t taskIndex = 0;
+ i0[taskIndex] = 0;
+ j0[taskIndex] = 0;
+ size_t k0 = (++taskIndex)*N/taskCount;
+ for (size_t i=0, k=0; taskIndex < taskCount; i++)
+ {
+ assert(i<array2.size());
+ size_t j=0, M = array2[i] ? array2[i]->size() : 0;
+ while (j<M && k+M-j >= k0 && taskIndex < taskCount) {
+ assert(taskIndex<taskCount);
+ i0[taskIndex] = i;
+ j0[taskIndex] = j += k0-k;
+ k=k0;
+ k0 = (++taskIndex)*N/taskCount;
+ }
+ k+=M-j;
+ }
+ }
+
+ __forceinline size_t size() const {
+ return N;
+ }
+
+ public:
+ size_t i0[MAX_TASKS];
+ size_t j0[MAX_TASKS];
+ size_t taskCount;
+ size_t N;
+ };
+
+ template<typename ArrayArray, typename Func>
+ __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )
+ {
+ ParallelForForState state(array2,minStepSize);
+
+ parallel_for(state.taskCount, [&](const size_t taskIndex)
+ {
+ /* calculate range */
+ const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
+ const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
+ size_t i0 = state.i0[taskIndex];
+ size_t j0 = state.j0[taskIndex];
+
+ /* iterate over arrays */
+ size_t k=k0;
+ for (size_t i=i0; k<k1; i++) {
+ const size_t N = array2[i] ? array2[i]->size() : 0;
+ const size_t r0 = j0, r1 = min(N,r0+k1-k);
+ if (r1 > r0) func(array2[i],range<size_t>(r0,r1),k);
+ k+=r1-r0; j0 = 0;
+ }
+ });
+ }
+
+ template<typename ArrayArray, typename Func>
+ __forceinline void parallel_for_for( ArrayArray& array2, const Func& func )
+ {
+ parallel_for_for(array2,1,func);
+ }
+
+ template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+ {
+ ParallelForForState state(array2,minStepSize);
+ Value temp[ParallelForForState::MAX_TASKS];
+
+ for (size_t i=0; i<state.taskCount; i++)
+ temp[i] = identity;
+
+ parallel_for(state.taskCount, [&](const size_t taskIndex)
+ {
+ /* calculate range */
+ const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
+ const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
+ size_t i0 = state.i0[taskIndex];
+ size_t j0 = state.j0[taskIndex];
+
+ /* iterate over arrays */
+ size_t k=k0;
+ for (size_t i=i0; k<k1; i++) {
+ const size_t N = array2[i] ? array2[i]->size() : 0;
+ const size_t r0 = j0, r1 = min(N,r0+k1-k);
+ if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range<size_t>(r0,r1),k));
+ k+=r1-r0; j0 = 0;
+ }
+ });
+
+ Value ret = identity;
+ for (size_t i=0; i<state.taskCount; i++)
+ ret = reduction(ret,temp[i]);
+ return ret;
+ }
+
+ template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction)
+ {
+ return parallel_for_for_reduce(array2,1,identity,func,reduction);
+ }
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h b/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h
new file mode 100644
index 0000000000..b15b44a991
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h
@@ -0,0 +1,112 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for_for.h"
+#include "parallel_prefix_sum.h"
+
+namespace embree
+{
+ template<typename Value>
+ struct ParallelForForPrefixSumState : public ParallelForForState
+ {
+ __forceinline ParallelForForPrefixSumState () {}
+
+ template<typename ArrayArray>
+ __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize)
+ : ParallelForForState(array2,minStepSize) {}
+
+ ParallelPrefixSumState<Value> prefix_state;
+ };
+
+ template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize,
+ const Value& identity, const Func& func, const Reduction& reduction)
+ {
+ /* calculate number of tasks to use */
+ const size_t taskCount = state.taskCount;
+ /* perform parallel prefix sum */
+ parallel_for(taskCount, [&](const size_t taskIndex)
+ {
+ const size_t k0 = (taskIndex+0)*state.size()/taskCount;
+ const size_t k1 = (taskIndex+1)*state.size()/taskCount;
+ size_t i0 = state.i0[taskIndex];
+ size_t j0 = state.j0[taskIndex];
+
+ /* iterate over arrays */
+ size_t k=k0;
+ Value N=identity;
+ for (size_t i=i0; k<k1; i++) {
+ const size_t size = array2[i] ? array2[i]->size() : 0;
+ const size_t r0 = j0, r1 = min(size,r0+k1-k);
+ if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i));
+ k+=r1-r0; j0 = 0;
+ }
+ state.prefix_state.counts[taskIndex] = N;
+ });
+
+ /* calculate prefix sum */
+ Value sum=identity;
+ for (size_t i=0; i<taskCount; i++)
+ {
+ const Value c = state.prefix_state.counts[i];
+ state.prefix_state.sums[i] = sum;
+ sum=reduction(sum,c);
+ }
+
+ return sum;
+ }
+
+ template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize,
+ const Value& identity, const Func& func, const Reduction& reduction)
+ {
+ /* calculate number of tasks to use */
+ const size_t taskCount = state.taskCount;
+ /* perform parallel prefix sum */
+ parallel_for(taskCount, [&](const size_t taskIndex)
+ {
+ const size_t k0 = (taskIndex+0)*state.size()/taskCount;
+ const size_t k1 = (taskIndex+1)*state.size()/taskCount;
+ size_t i0 = state.i0[taskIndex];
+ size_t j0 = state.j0[taskIndex];
+
+ /* iterate over arrays */
+ size_t k=k0;
+ Value N=identity;
+ for (size_t i=i0; k<k1; i++) {
+ const size_t size = array2[i] ? array2[i]->size() : 0;
+ const size_t r0 = j0, r1 = min(size,r0+k1-k);
+ if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N)));
+ k+=r1-r0; j0 = 0;
+ }
+ state.prefix_state.counts[taskIndex] = N;
+ });
+
+ /* calculate prefix sum */
+ Value sum=identity;
+ for (size_t i=0; i<taskCount; i++)
+ {
+ const Value c = state.prefix_state.counts[i];
+ state.prefix_state.sums[i] = sum;
+ sum=reduction(sum,c);
+ }
+
+ return sum;
+ }
+
+ template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2,
+ const Value& identity, const Func& func, const Reduction& reduction)
+ {
+ return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction);
+ }
+
+ template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2,
+ const Value& identity, const Func& func, const Reduction& reduction)
+ {
+ return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction);
+ }
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_map.h b/thirdparty/embree/common/algorithms/parallel_map.h
new file mode 100644
index 0000000000..15c098fe20
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_map.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_sort.h"
+
+namespace embree
+{
+ /*! implementation of a key/value map with parallel construction */
+ template<typename Key, typename Val>
+ class parallel_map
+ {
+ /* key/value pair to build the map */
+ struct KeyValue
+ {
+ __forceinline KeyValue () {}
+
+ __forceinline KeyValue (const Key key, const Val val)
+ : key(key), val(val) {}
+
+ __forceinline operator Key() const {
+ return key;
+ }
+
+ public:
+ Key key;
+ Val val;
+ };
+
+ public:
+
+ /*! parallel map constructors */
+ parallel_map () {}
+
+ /*! construction from pair of vectors */
+ template<typename KeyVector, typename ValVector>
+ parallel_map (const KeyVector& keys, const ValVector& values) { init(keys,values); }
+
+ /*! initialized the parallel map from a vector with keys and values */
+ template<typename KeyVector, typename ValVector>
+ void init(const KeyVector& keys, const ValVector& values)
+ {
+ /* reserve sufficient space for all data */
+ assert(keys.size() == values.size());
+ vec.resize(keys.size());
+
+ /* generate key/value pairs */
+ parallel_for( size_t(0), keys.size(), size_t(4*4096), [&](const range<size_t>& r) {
+ for (size_t i=r.begin(); i<r.end(); i++)
+ vec[i] = KeyValue((Key)keys[i],values[i]);
+ });
+
+ /* perform parallel radix sort of the key/value pairs */
+ std::vector<KeyValue> temp(keys.size());
+ radix_sort<KeyValue,Key>(vec.data(),temp.data(),keys.size());
+ }
+
+ /*! Returns a pointer to the value associated with the specified key. The pointer will be nullptr of the key is not contained in the map. */
+ __forceinline const Val* lookup(const Key& key) const
+ {
+ typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
+ if (i == vec.end()) return nullptr;
+ if (i->key != key) return nullptr;
+ return &i->val;
+ }
+
+ /*! If the key is in the map, the function returns the value associated with the key, otherwise it returns the default value. */
+ __forceinline Val lookup(const Key& key, const Val& def) const
+ {
+ typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
+ if (i == vec.end()) return def;
+ if (i->key != key) return def;
+ return i->val;
+ }
+
+ /*! clears all state */
+ void clear() {
+ vec.clear();
+ }
+
+ private:
+ std::vector<KeyValue> vec; //!< vector containing sorted elements
+ };
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_partition.h b/thirdparty/embree/common/algorithms/parallel_partition.h
new file mode 100644
index 0000000000..a1cbdc8e04
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_partition.h
@@ -0,0 +1,283 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+#include "../math/range.h"
+
+namespace embree
+{
+ /* serial partitioning */
+ template<typename T, typename V, typename IsLeft, typename Reduction_T>
+ __forceinline size_t serial_partitioning(T* array,
+ const size_t begin,
+ const size_t end,
+ V& leftReduction,
+ V& rightReduction,
+ const IsLeft& is_left,
+ const Reduction_T& reduction_t)
+ {
+ T* l = array + begin;
+ T* r = array + end - 1;
+
+ while(1)
+ {
+ /* *l < pivot */
+ while (likely(l <= r && is_left(*l) ))
+ {
+ //prefetchw(l+4); // FIXME: enable?
+ reduction_t(leftReduction,*l);
+ ++l;
+ }
+ /* *r >= pivot) */
+ while (likely(l <= r && !is_left(*r)))
+ {
+ //prefetchw(r-4); FIXME: enable?
+ reduction_t(rightReduction,*r);
+ --r;
+ }
+ if (r<l) break;
+
+ reduction_t(leftReduction ,*r);
+ reduction_t(rightReduction,*l);
+ xchg(*l,*r);
+ l++; r--;
+ }
+
+ return l - array;
+ }
+
+ template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+ class __aligned(64) parallel_partition_task
+ {
+ ALIGNED_CLASS_(64);
+ private:
+
+ static const size_t MAX_TASKS = 64;
+
+ T* array;
+ size_t N;
+ const IsLeft& is_left;
+ const Reduction_T& reduction_t;
+ const Reduction_V& reduction_v;
+ const Vi& identity;
+
+ size_t numTasks;
+ __aligned(64) size_t counter_start[MAX_TASKS+1];
+ __aligned(64) size_t counter_left[MAX_TASKS+1];
+ __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS];
+ __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS];
+ __aligned(64) V leftReductions[MAX_TASKS];
+ __aligned(64) V rightReductions[MAX_TASKS];
+
+ public:
+
+ __forceinline parallel_partition_task(T* array,
+ const size_t N,
+ const Vi& identity,
+ const IsLeft& is_left,
+ const Reduction_T& reduction_t,
+ const Reduction_V& reduction_v,
+ const size_t BLOCK_SIZE)
+
+ : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity),
+ numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {}
+
+ __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges)
+ {
+ size_t i = 0;
+ while(index >= (size_t)r[i].size())
+ {
+ assert(i < numRanges);
+ index -= (size_t)r[i].size();
+ i++;
+ }
+ return &r[i];
+ }
+
+ __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges,
+ const size_t numRightMisplacedRanges,
+ const size_t startID,
+ const size_t endID)
+ {
+ size_t leftLocalIndex = startID;
+ size_t rightLocalIndex = startID;
+ const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges);
+ const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges);
+
+ size_t l_left = l_range->size() - leftLocalIndex;
+ size_t r_left = r_range->size() - rightLocalIndex;
+ T *__restrict__ l = &array[l_range->begin() + leftLocalIndex];
+ T *__restrict__ r = &array[r_range->begin() + rightLocalIndex];
+ size_t size = endID - startID;
+ size_t items = min(size,min(l_left,r_left));
+
+ while (size)
+ {
+ if (unlikely(l_left == 0))
+ {
+ l_range++;
+ l_left = l_range->size();
+ l = &array[l_range->begin()];
+ items = min(size,min(l_left,r_left));
+ }
+
+ if (unlikely(r_left == 0))
+ {
+ r_range++;
+ r_left = r_range->size();
+ r = &array[r_range->begin()];
+ items = min(size,min(l_left,r_left));
+ }
+
+ size -= items;
+ l_left -= items;
+ r_left -= items;
+
+ while(items) {
+ items--;
+ xchg(*l++,*r++);
+ }
+ }
+ }
+
+ __forceinline size_t partition(V& leftReduction, V& rightReduction)
+ {
+ /* partition the individual ranges for each task */
+ parallel_for(numTasks,[&] (const size_t taskID) {
+ const size_t startID = (taskID+0)*N/numTasks;
+ const size_t endID = (taskID+1)*N/numTasks;
+ V local_left(identity);
+ V local_right(identity);
+ const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t);
+ counter_start[taskID] = startID;
+ counter_left [taskID] = mid-startID;
+ leftReductions[taskID] = local_left;
+ rightReductions[taskID] = local_right;
+ });
+ counter_start[numTasks] = N;
+ counter_left[numTasks] = 0;
+
+ /* finalize the reductions */
+ for (size_t i=0; i<numTasks; i++) {
+ reduction_v(leftReduction,leftReductions[i]);
+ reduction_v(rightReduction,rightReductions[i]);
+ }
+
+ /* calculate mid point for partitioning */
+ size_t mid = counter_left[0];
+ for (size_t i=1; i<numTasks; i++)
+ mid += counter_left[i];
+ const range<ssize_t> globalLeft (0,mid);
+ const range<ssize_t> globalRight(mid,N);
+
+ /* calculate all left and right ranges that are on the wrong global side */
+ size_t numMisplacedRangesLeft = 0;
+ size_t numMisplacedRangesRight = 0;
+ size_t numMisplacedItemsLeft = 0;
+ size_t numMisplacedItemsRight = 0;
+
+ for (size_t i=0; i<numTasks; i++)
+ {
+ const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]);
+ const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]);
+ const range<ssize_t> left_misplaced = globalLeft. intersect(right_range);
+ const range<ssize_t> right_misplaced = globalRight.intersect(left_range);
+
+ if (!left_misplaced.empty())
+ {
+ numMisplacedItemsLeft += left_misplaced.size();
+ leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced;
+ }
+
+ if (!right_misplaced.empty())
+ {
+ numMisplacedItemsRight += right_misplaced.size();
+ rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced;
+ }
+ }
+ assert( numMisplacedItemsLeft == numMisplacedItemsRight );
+
+ /* if no items are misplaced we are done */
+ if (numMisplacedItemsLeft == 0)
+ return mid;
+
+ /* otherwise we copy the items to the right place in parallel */
+ parallel_for(numTasks,[&] (const size_t taskID) {
+ const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks;
+ const size_t endID = (taskID+1)*numMisplacedItemsLeft/numTasks;
+ swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID);
+ });
+
+ return mid;
+ }
+ };
+
+ template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+ __noinline size_t parallel_partitioning(T* array,
+ const size_t begin,
+ const size_t end,
+ const Vi &identity,
+ V &leftReduction,
+ V &rightReduction,
+ const IsLeft& is_left,
+ const Reduction_T& reduction_t,
+ const Reduction_V& reduction_v,
+ size_t BLOCK_SIZE = 128)
+ {
+ /* fall back to single threaded partitioning for small N */
+ if (unlikely(end-begin < BLOCK_SIZE))
+ return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
+
+ /* otherwise use parallel code */
+ else {
+ typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
+ std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
+ return begin+p->partition(leftReduction,rightReduction);
+ }
+ }
+
+ template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+ __noinline size_t parallel_partitioning(T* array,
+ const size_t begin,
+ const size_t end,
+ const Vi &identity,
+ V &leftReduction,
+ V &rightReduction,
+ const IsLeft& is_left,
+ const Reduction_T& reduction_t,
+ const Reduction_V& reduction_v,
+ size_t BLOCK_SIZE,
+ size_t PARALLEL_THRESHOLD)
+ {
+ /* fall back to single threaded partitioning for small N */
+ if (unlikely(end-begin < PARALLEL_THRESHOLD))
+ return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
+
+ /* otherwise use parallel code */
+ else {
+ typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
+ std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
+ return begin+p->partition(leftReduction,rightReduction);
+ }
+ }
+
+
+ template<typename T, typename IsLeft>
+ inline size_t parallel_partitioning(T* array,
+ const size_t begin,
+ const size_t end,
+ const IsLeft& is_left,
+ size_t BLOCK_SIZE = 128)
+ {
+ size_t leftReduction = 0;
+ size_t rightReduction = 0;
+ return parallel_partitioning(
+ array,begin,end,0,leftReduction,rightReduction,is_left,
+ [] (size_t& t,const T& ref) { },
+ [] (size_t& t0,size_t& t1) { },
+ BLOCK_SIZE);
+ }
+
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_prefix_sum.h b/thirdparty/embree/common/algorithms/parallel_prefix_sum.h
new file mode 100644
index 0000000000..208bb4e480
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_prefix_sum.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+ template<typename Value>
+ struct ParallelPrefixSumState
+ {
+ enum { MAX_TASKS = 64 };
+ Value counts[MAX_TASKS];
+ Value sums [MAX_TASKS];
+ };
+
+ template<typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction)
+ {
+ /* calculate number of tasks to use */
+ const size_t numThreads = TaskScheduler::threadCount();
+ const size_t numBlocks = (last-first+minStepSize-1)/minStepSize;
+ const size_t taskCount = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS));
+
+ /* perform parallel prefix sum */
+ parallel_for(taskCount, [&](const size_t taskIndex)
+ {
+ const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount;
+ const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount;
+ state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]);
+ });
+
+ /* calculate prefix sum */
+ Value sum=identity;
+ for (size_t i=0; i<taskCount; i++)
+ {
+ const Value c = state.counts[i];
+ state.sums[i] = sum;
+ sum=reduction(sum,c);
+ }
+
+ return sum;
+ }
+
+ /*! parallel calculation of prefix sums */
+ template<typename SrcArray, typename DstArray, typename Value, typename Add>
+ __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096)
+ {
+ /* perform single threaded prefix operation for small N */
+ if (N < SINGLE_THREAD_THRESHOLD)
+ {
+ Value sum=identity;
+ for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum;
+ return sum;
+ }
+
+ /* perform parallel prefix operation for large N */
+ else
+ {
+ ParallelPrefixSumState<Value> state;
+
+ /* initial run just sets up start values for subtasks */
+ parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
+
+ Value s = identity;
+ for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]);
+ return s;
+
+ }, add);
+
+ /* final run calculates prefix sum */
+ return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
+
+ Value s = identity;
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ dst[i] = add(sum,s);
+ s = add(s,src[i]);
+ }
+ return s;
+
+ }, add);
+ }
+ }
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_reduce.h b/thirdparty/embree/common/algorithms/parallel_reduce.h
new file mode 100644
index 0000000000..8271372ea4
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_reduce.h
@@ -0,0 +1,150 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+ template<typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
+ {
+ return func(range<Index>(first,last));
+ }
+
+ template<typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+ {
+ return func(range<Index>(first,last));
+ }
+
+ template<typename Index, typename Value, typename Func, typename Reduction>
+ __noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+ {
+ const Index maxTasks = 512;
+ const Index threadCount = (Index) TaskScheduler::threadCount();
+ taskCount = min(taskCount,threadCount,maxTasks);
+
+ /* parallel invokation of all tasks */
+ dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack
+ parallel_for(taskCount, [&](const Index taskIndex) {
+ const Index k0 = first+(taskIndex+0)*(last-first)/taskCount;
+ const Index k1 = first+(taskIndex+1)*(last-first)/taskCount;
+ values[taskIndex] = func(range<Index>(k0,k1));
+ });
+
+ /* perform reduction over all tasks */
+ Value v = identity;
+ for (Index i=0; i<taskCount; i++) v = reduction(v,values[i]);
+ return v;
+ }
+
+ template<typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+ {
+#if defined(TASKING_INTERNAL)
+
+ /* fast path for small number of iterations */
+ Index taskCount = (last-first+minStepSize-1)/minStepSize;
+ if (likely(taskCount == 1)) {
+ return func(range<Index>(first,last));
+ }
+ return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction);
+
+#elif defined(TASKING_TBB)
+ #if TBB_INTERFACE_VERSION >= 12002
+ tbb::task_group_context context;
+ const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
+ [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
+ reduction,context);
+ // -- GODOT start --
+ // if (context.is_group_execution_cancelled())
+ // throw std::runtime_error("task cancelled");
+ // -- GODOT end --
+ return v;
+ #else
+ const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
+ [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
+ reduction);
+ // -- GODOT start --
+ // if (tbb::task::self().is_cancelled())
+ // throw std::runtime_error("task cancelled");
+ // -- GODOT end --
+ return v;
+ #endif
+#else // TASKING_PPL
+ struct AlignedValue
+ {
+ char storage[__alignof(Value)+sizeof(Value)];
+ static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); };
+ Value* getValuePtr() { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
+ const Value* getValuePtr() const { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
+ AlignedValue(const Value& v) { new(getValuePtr()) Value(v); }
+ AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); }
+ AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); };
+ AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
+ AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
+ operator Value() const { return *getValuePtr(); }
+ };
+
+ struct Iterator_Index
+ {
+ Index v;
+ typedef std::forward_iterator_tag iterator_category;
+ typedef AlignedValue value_type;
+ typedef Index difference_type;
+ typedef Index distance_type;
+ typedef AlignedValue* pointer;
+ typedef AlignedValue& reference;
+ __forceinline Iterator_Index() {}
+ __forceinline Iterator_Index(Index v) : v(v) {}
+ __forceinline bool operator== (Iterator_Index other) { return v == other.v; }
+ __forceinline bool operator!= (Iterator_Index other) { return v != other.v; }
+ __forceinline Iterator_Index operator++() { return Iterator_Index(++v); }
+ __forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); }
+ };
+
+ auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) {
+ assert(begin.v < end.v);
+ return reduction(start, func(range<Index>(begin.v, end.v)));
+ };
+ const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction);
+ return v;
+#endif
+ }
+
+ template<typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
+ {
+ if (likely(last-first < parallel_threshold)) {
+ return func(range<Index>(first,last));
+ } else {
+ return parallel_reduce(first,last,minStepSize,identity,func,reduction);
+ }
+ }
+
+ template<typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_reduce( const range<Index> range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
+ {
+ return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction);
+ }
+
+ template<typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
+ {
+ auto funcr = [&] ( const range<Index> r ) {
+ Value v = identity;
+ for (Index i=r.begin(); i<r.end(); i++)
+ v = reduction(v,func(i));
+ return v;
+ };
+ return parallel_reduce(first,last,Index(1),identity,funcr,reduction);
+ }
+
+ template<typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_reduce( const range<Index> range, const Value& identity, const Func& func, const Reduction& reduction )
+ {
+ return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction);
+ }
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_set.h b/thirdparty/embree/common/algorithms/parallel_set.h
new file mode 100644
index 0000000000..7eae577457
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_set.h
@@ -0,0 +1,52 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_sort.h"
+
+namespace embree
+{
+ /* implementation of a set of values with parallel construction */
+ template<typename T>
+ class parallel_set
+ {
+ public:
+
+ /*! default constructor for the parallel set */
+ parallel_set () {}
+
+ /*! construction from vector */
+ template<typename Vector>
+ parallel_set (const Vector& in) { init(in); }
+
+ /*! initialized the parallel set from a vector */
+ template<typename Vector>
+ void init(const Vector& in)
+ {
+ /* copy data to internal vector */
+ vec.resize(in.size());
+ parallel_for( size_t(0), in.size(), size_t(4*4096), [&](const range<size_t>& r) {
+ for (size_t i=r.begin(); i<r.end(); i++)
+ vec[i] = in[i];
+ });
+
+ /* sort the data */
+ std::vector<T> temp(in.size());
+ radix_sort<T>(vec.data(),temp.data(),vec.size());
+ }
+
+ /*! tests if some element is in the set */
+ __forceinline bool lookup(const T& elt) const {
+ return std::binary_search(vec.begin(), vec.end(), elt);
+ }
+
+ /*! clears all state */
+ void clear() {
+ vec.clear();
+ }
+
+ private:
+ std::vector<T> vec; //!< vector containing sorted elements
+ };
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_sort.h b/thirdparty/embree/common/algorithms/parallel_sort.h
new file mode 100644
index 0000000000..30e56c2bfc
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_sort.h
@@ -0,0 +1,454 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../simd/simd.h"
+#include "parallel_for.h"
+#include <algorithm>
+
+namespace embree
+{
+ template<class T>
+ __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length)
+ {
+ for(size_t i = 1;i<length;++i)
+ {
+ T v = array[i];
+ size_t j = i;
+ while(j > 0 && v < array[j-1])
+ {
+ array[j] = array[j-1];
+ --j;
+ }
+ array[j] = v;
+ }
+ }
+
+ template<class T>
+ __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length)
+ {
+ for(size_t i = 1;i<length;++i)
+ {
+ T v = array[i];
+ size_t j = i;
+ while(j > 0 && v > array[j-1])
+ {
+ array[j] = array[j-1];
+ --j;
+ }
+ array[j] = v;
+ }
+ }
+
+ template<class T>
+ void quicksort_ascending(T *__restrict__ t,
+ const ssize_t begin,
+ const ssize_t end)
+ {
+ if (likely(begin < end))
+ {
+ const T pivotvalue = t[begin];
+ ssize_t left = begin - 1;
+ ssize_t right = end + 1;
+
+ while(1)
+ {
+ while (t[--right] > pivotvalue);
+ while (t[++left] < pivotvalue);
+
+ if (left >= right) break;
+
+ const T temp = t[right];
+ t[right] = t[left];
+ t[left] = temp;
+ }
+
+ const int pivot = right;
+ quicksort_ascending(t, begin, pivot);
+ quicksort_ascending(t, pivot + 1, end);
+ }
+ }
+
+ template<class T>
+ void quicksort_decending(T *__restrict__ t,
+ const ssize_t begin,
+ const ssize_t end)
+ {
+ if (likely(begin < end))
+ {
+ const T pivotvalue = t[begin];
+ ssize_t left = begin - 1;
+ ssize_t right = end + 1;
+
+ while(1)
+ {
+ while (t[--right] < pivotvalue);
+ while (t[++left] > pivotvalue);
+
+ if (left >= right) break;
+
+ const T temp = t[right];
+ t[right] = t[left];
+ t[left] = temp;
+ }
+
+ const int pivot = right;
+ quicksort_decending(t, begin, pivot);
+ quicksort_decending(t, pivot + 1, end);
+ }
+ }
+
+
+ template<class T, ssize_t THRESHOLD>
+ void quicksort_insertionsort_ascending(T *__restrict__ t,
+ const ssize_t begin,
+ const ssize_t end)
+ {
+ if (likely(begin < end))
+ {
+ const ssize_t size = end-begin+1;
+ if (likely(size <= THRESHOLD))
+ {
+ insertionsort_ascending<T>(&t[begin],size);
+ }
+ else
+ {
+ const T pivotvalue = t[begin];
+ ssize_t left = begin - 1;
+ ssize_t right = end + 1;
+
+ while(1)
+ {
+ while (t[--right] > pivotvalue);
+ while (t[++left] < pivotvalue);
+
+ if (left >= right) break;
+
+ const T temp = t[right];
+ t[right] = t[left];
+ t[left] = temp;
+ }
+
+ const ssize_t pivot = right;
+ quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot);
+ quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end);
+ }
+ }
+ }
+
+
+ template<class T, ssize_t THRESHOLD>
+ void quicksort_insertionsort_decending(T *__restrict__ t,
+ const ssize_t begin,
+ const ssize_t end)
+ {
+ if (likely(begin < end))
+ {
+ const ssize_t size = end-begin+1;
+ if (likely(size <= THRESHOLD))
+ {
+ insertionsort_decending<T>(&t[begin],size);
+ }
+ else
+ {
+
+ const T pivotvalue = t[begin];
+ ssize_t left = begin - 1;
+ ssize_t right = end + 1;
+
+ while(1)
+ {
+ while (t[--right] < pivotvalue);
+ while (t[++left] > pivotvalue);
+
+ if (left >= right) break;
+
+ const T temp = t[right];
+ t[right] = t[left];
+ t[left] = temp;
+ }
+
+ const ssize_t pivot = right;
+ quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, pivot);
+ quicksort_insertionsort_decending<T,THRESHOLD>(t, pivot + 1, end);
+ }
+ }
+ }
+
+ template<typename T>
+ static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8)
+ {
+ static const unsigned int BITS = 8;
+ static const unsigned int BUCKETS = (1 << BITS);
+ static const unsigned int CMP_SORT_THRESHOLD = 16;
+
+ __aligned(64) unsigned int count[BUCKETS];
+
+ /* clear buckets */
+ for (size_t i=0;i<BUCKETS;i++) count[i] = 0;
+
+ /* count buckets */
+#if defined(__INTEL_COMPILER)
+#pragma nounroll
+#endif
+ for (size_t i=0;i<num;i++)
+ count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++;
+
+ /* prefix sums */
+ __aligned(64) unsigned int head[BUCKETS];
+ __aligned(64) unsigned int tail[BUCKETS];
+
+ head[0] = 0;
+ for (size_t i=1; i<BUCKETS; i++)
+ head[i] = head[i-1] + count[i-1];
+
+ for (size_t i=0; i<BUCKETS-1; i++)
+ tail[i] = head[i+1];
+
+ tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1];
+
+ assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]);
+ assert(tail[BUCKETS-1] == num);
+
+ /* in-place swap */
+ for (size_t i=0;i<BUCKETS;i++)
+ {
+ /* process bucket */
+ while(head[i] < tail[i])
+ {
+ T v = morton[head[i]];
+ while(1)
+ {
+ const size_t b = (unsigned(v) >> shift) & (BUCKETS-1);
+ if (b == i) break;
+ std::swap(v,morton[head[b]++]);
+ }
+ assert((unsigned(v) >> shift & (BUCKETS-1)) == i);
+ morton[head[i]++] = v;
+ }
+ }
+ if (shift == 0) return;
+
+ size_t offset = 0;
+ for (size_t i=0;i<BUCKETS;i++)
+ if (count[i])
+ {
+
+ for (size_t j=offset;j<offset+count[i]-1;j++)
+ assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i);
+
+ if (unlikely(count[i] < CMP_SORT_THRESHOLD))
+ insertionsort_ascending(morton + offset, count[i]);
+ else
+ radixsort32(morton + offset, count[i], shift-BITS);
+
+ for (size_t j=offset;j<offset+count[i]-1;j++)
+ assert(morton[j] <= morton[j+1]);
+
+ offset += count[i];
+ }
+ }
+
+ template<typename Ty, typename Key>
+ class ParallelRadixSort
+ {
+ static const size_t MAX_TASKS = 64;
+ static const size_t BITS = 8;
+ static const size_t BUCKETS = (1 << BITS);
+ typedef unsigned int TyRadixCount[BUCKETS];
+
+ template<typename T>
+ static bool compare(const T& v0, const T& v1) {
+ return (Key)v0 < (Key)v1;
+ }
+
+ private:
+ ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement
+ ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement
+
+
+ public:
+ ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N)
+ : radixCount(nullptr), src(src), tmp(tmp), N(N) {}
+
+ void sort(const size_t blockSize)
+ {
+ assert(blockSize > 0);
+
+ /* perform single threaded sort for small N */
+ if (N<=blockSize) // handles also special case of 0!
+ {
+ /* do inplace sort inside destination array */
+ std::sort(src,src+N,compare<Ty>);
+ }
+
+ /* perform parallel sort for large N */
+ else
+ {
+ const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS));
+ tbbRadixSort(numThreads);
+ }
+ }
+
+ ~ParallelRadixSort()
+ {
+ alignedFree(radixCount);
+ radixCount = nullptr;
+ }
+
+ private:
+
+ void tbbRadixIteration0(const Key shift,
+ const Ty* __restrict const src,
+ Ty* __restrict const dst,
+ const size_t threadIndex, const size_t threadCount)
+ {
+ const size_t startID = (threadIndex+0)*N/threadCount;
+ const size_t endID = (threadIndex+1)*N/threadCount;
+
+ /* mask to extract some number of bits */
+ const Key mask = BUCKETS-1;
+
+ /* count how many items go into the buckets */
+ for (size_t i=0; i<BUCKETS; i++)
+ radixCount[threadIndex][i] = 0;
+
+ /* iterate over src array and count buckets */
+ unsigned int * __restrict const count = radixCount[threadIndex];
+#if defined(__INTEL_COMPILER)
+#pragma nounroll
+#endif
+ for (size_t i=startID; i<endID; i++) {
+#if defined(__64BIT__)
+ const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
+#else
+ const Key index = ((Key)src[i] >> shift) & mask;
+#endif
+ count[index]++;
+ }
+ }
+
+ void tbbRadixIteration1(const Key shift,
+ const Ty* __restrict const src,
+ Ty* __restrict const dst,
+ const size_t threadIndex, const size_t threadCount)
+ {
+ const size_t startID = (threadIndex+0)*N/threadCount;
+ const size_t endID = (threadIndex+1)*N/threadCount;
+
+ /* mask to extract some number of bits */
+ const Key mask = BUCKETS-1;
+
+ /* calculate total number of items for each bucket */
+ __aligned(64) unsigned int total[BUCKETS];
+ /*
+ for (size_t i=0; i<BUCKETS; i++)
+ total[i] = 0;
+ */
+ for (size_t i=0; i<BUCKETS; i+=VSIZEX)
+ vintx::store(&total[i], zero);
+
+ for (size_t i=0; i<threadCount; i++)
+ {
+ /*
+ for (size_t j=0; j<BUCKETS; j++)
+ total[j] += radixCount[i][j];
+ */
+ for (size_t j=0; j<BUCKETS; j+=VSIZEX)
+ vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j]));
+ }
+
+ /* calculate start offset of each bucket */
+ __aligned(64) unsigned int offset[BUCKETS];
+ offset[0] = 0;
+ for (size_t i=1; i<BUCKETS; i++)
+ offset[i] = offset[i-1] + total[i-1];
+
+ /* calculate start offset of each bucket for this thread */
+ for (size_t i=0; i<threadIndex; i++)
+ {
+ /*
+ for (size_t j=0; j<BUCKETS; j++)
+ offset[j] += radixCount[i][j];
+ */
+ for (size_t j=0; j<BUCKETS; j+=VSIZEX)
+ vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j]));
+ }
+
+ /* copy items into their buckets */
+#if defined(__INTEL_COMPILER)
+#pragma nounroll
+#endif
+ for (size_t i=startID; i<endID; i++) {
+ const Ty elt = src[i];
+#if defined(__64BIT__)
+ const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
+#else
+ const size_t index = ((Key)src[i] >> shift) & mask;
+#endif
+ dst[offset[index]++] = elt;
+ }
+ }
+
+ void tbbRadixIteration(const Key shift, const bool last,
+ const Ty* __restrict src, Ty* __restrict dst,
+ const size_t numTasks)
+ {
+ affinity_partitioner ap;
+ parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap);
+ parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap);
+ }
+
+ void tbbRadixSort(const size_t numTasks)
+ {
+ radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64);
+
+ if (sizeof(Key) == sizeof(uint32_t)) {
+ tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
+ tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
+ tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
+ tbbRadixIteration(3*BITS,1,tmp,src,numTasks);
+ }
+ else if (sizeof(Key) == sizeof(uint64_t))
+ {
+ tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
+ tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
+ tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
+ tbbRadixIteration(3*BITS,0,tmp,src,numTasks);
+ tbbRadixIteration(4*BITS,0,src,tmp,numTasks);
+ tbbRadixIteration(5*BITS,0,tmp,src,numTasks);
+ tbbRadixIteration(6*BITS,0,src,tmp,numTasks);
+ tbbRadixIteration(7*BITS,1,tmp,src,numTasks);
+ }
+ }
+
+ private:
+ TyRadixCount* radixCount;
+ Ty* const src;
+ Ty* const tmp;
+ const size_t N;
+ };
+
+ template<typename Ty>
+ void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
+ {
+ ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize);
+ }
+
+ template<typename Ty, typename Key>
+ void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
+ {
+ ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize);
+ }
+
+ template<typename Ty>
+ void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
+ radix_sort<Ty,uint32_t>(src,tmp,N,blockSize);
+ }
+
+ template<typename Ty>
+ void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
+ radix_sort<Ty,uint64_t>(src,tmp,N,blockSize);
+ }
+}
diff --git a/thirdparty/embree/common/lexers/parsestream.h b/thirdparty/embree/common/lexers/parsestream.h
new file mode 100644
index 0000000000..f65a52cb47
--- /dev/null
+++ b/thirdparty/embree/common/lexers/parsestream.h
@@ -0,0 +1,101 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stringstream.h"
+#include "../sys/filename.h"
+#include "../math/vec2.h"
+#include "../math/vec3.h"
+#include "../math/col3.h"
+#include "../math/color.h"
+
+namespace embree
+{
+ /*! helper class for simple command line parsing */
+ class ParseStream : public Stream<std::string>
+ {
+ public:
+ ParseStream (const Ref<Stream<std::string> >& cin) : cin(cin) {}
+
+ ParseStream (const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ",
+ const std::string& endl = "", bool multiLine = false)
+ : cin(new StringStream(cin,seps,endl,multiLine)) {}
+
+ public:
+ ParseLocation location() { return cin->loc(); }
+ std::string next() { return cin->get(); }
+
+ void force(const std::string& next) {
+ std::string token = getString();
+ if (token != next)
+ THROW_RUNTIME_ERROR("token \""+next+"\" expected but token \""+token+"\" found");
+ }
+
+ std::string getString() {
+ return get();
+ }
+
+ FileName getFileName() {
+ return FileName(get());
+ }
+
+ int getInt () {
+ return atoi(get().c_str());
+ }
+
+ Vec2i getVec2i() {
+ int x = atoi(get().c_str());
+ int y = atoi(get().c_str());
+ return Vec2i(x,y);
+ }
+
+ Vec3ia getVec3ia() {
+ int x = atoi(get().c_str());
+ int y = atoi(get().c_str());
+ int z = atoi(get().c_str());
+ return Vec3ia(x,y,z);
+ }
+
+ float getFloat() {
+ return (float)atof(get().c_str());
+ }
+
+ Vec2f getVec2f() {
+ float x = (float)atof(get().c_str());
+ float y = (float)atof(get().c_str());
+ return Vec2f(x,y);
+ }
+
+ Vec3f getVec3f() {
+ float x = (float)atof(get().c_str());
+ float y = (float)atof(get().c_str());
+ float z = (float)atof(get().c_str());
+ return Vec3f(x,y,z);
+ }
+
+ Vec3fa getVec3fa() {
+ float x = (float)atof(get().c_str());
+ float y = (float)atof(get().c_str());
+ float z = (float)atof(get().c_str());
+ return Vec3fa(x,y,z);
+ }
+
+ Col3f getCol3f() {
+ float x = (float)atof(get().c_str());
+ float y = (float)atof(get().c_str());
+ float z = (float)atof(get().c_str());
+ return Col3f(x,y,z);
+ }
+
+ Color getColor() {
+ float r = (float)atof(get().c_str());
+ float g = (float)atof(get().c_str());
+ float b = (float)atof(get().c_str());
+ return Color(r,g,b);
+ }
+
+ private:
+ Ref<Stream<std::string> > cin;
+ };
+}
diff --git a/thirdparty/embree/common/lexers/stream.h b/thirdparty/embree/common/lexers/stream.h
new file mode 100644
index 0000000000..a40c15f8eb
--- /dev/null
+++ b/thirdparty/embree/common/lexers/stream.h
@@ -0,0 +1,215 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/ref.h"
+#include "../sys/filename.h"
+#include "../sys/string.h"
+
+#include <vector>
+#include <iostream>
+#include <cstdio>
+#include <string.h>
+
+namespace embree
+{
+ /*! stores the location of a stream element in the source */
+ class ParseLocation
+ {
+ public:
+ ParseLocation () : lineNumber(-1), colNumber(-1) {}
+ ParseLocation (std::shared_ptr<std::string> fileName, ssize_t lineNumber, ssize_t colNumber, ssize_t /*charNumber*/)
+ : fileName(fileName), lineNumber(lineNumber), colNumber(colNumber) {}
+
+ std::string str() const
+ {
+ std::string str = "unknown";
+ if (fileName) str = *fileName;
+ if (lineNumber >= 0) str += " line " + toString(lineNumber);
+ if (lineNumber >= 0 && colNumber >= 0) str += " character " + toString(colNumber);
+ return str;
+ }
+
+ private:
+ std::shared_ptr<std::string> fileName; /// name of the file (or stream) the token is from
+ ssize_t lineNumber; /// the line number the token is from
+ ssize_t colNumber; /// the character number in the current line
+ };
+
+ /*! a stream class templated over the stream elements */
+ template<typename T> class Stream : public RefCount
+ {
+ enum { BUF_SIZE = 1024 };
+
+ private:
+ virtual T next() = 0;
+ virtual ParseLocation location() = 0;
+ __forceinline std::pair<T,ParseLocation> nextHelper() {
+ ParseLocation l = location();
+ T v = next();
+ return std::pair<T,ParseLocation>(v,l);
+ }
+ __forceinline void push_back(const std::pair<T,ParseLocation>& v) {
+ if (past+future == BUF_SIZE) pop_front();
+ size_t end = (start+past+future++)%BUF_SIZE;
+ buffer[end] = v;
+ }
+ __forceinline void pop_front() {
+ if (past == 0) THROW_RUNTIME_ERROR("stream buffer empty");
+ start = (start+1)%BUF_SIZE; past--;
+ }
+ public:
+ Stream () : start(0), past(0), future(0), buffer(BUF_SIZE) {}
+ virtual ~Stream() {}
+
+ public:
+
+ const ParseLocation& loc() {
+ if (future == 0) push_back(nextHelper());
+ return buffer[(start+past)%BUF_SIZE].second;
+ }
+ T get() {
+ if (future == 0) push_back(nextHelper());
+ T t = buffer[(start+past)%BUF_SIZE].first;
+ past++; future--;
+ return t;
+ }
+ const T& peek() {
+ if (future == 0) push_back(nextHelper());
+ return buffer[(start+past)%BUF_SIZE].first;
+ }
+ const T& unget(size_t n = 1) {
+ if (past < n) THROW_RUNTIME_ERROR ("cannot unget that many items");
+ past -= n; future += n;
+ return peek();
+ }
+ void drop() {
+ if (future == 0) push_back(nextHelper());
+ past++; future--;
+ }
+ private:
+ size_t start,past,future;
+ std::vector<std::pair<T,ParseLocation> > buffer;
+ };
+
+ /*! warps an iostream stream */
+ class StdStream : public Stream<int>
+ {
+ public:
+ StdStream (std::istream& cin, const std::string& name = "std::stream")
+ : cin(cin), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {}
+ ~StdStream() {}
+ ParseLocation location() {
+ return ParseLocation(name,lineNumber,colNumber,charNumber);
+ }
+ int next() {
+ int c = cin.get();
+ if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++;
+ charNumber++;
+ return c;
+ }
+ private:
+ std::istream& cin;
+ ssize_t lineNumber; /// the line number the token is from
+ ssize_t colNumber; /// the character number in the current line
+ ssize_t charNumber; /// the character in the file
+ std::shared_ptr<std::string> name; /// name of buffer
+ };
+
+ /*! creates a stream from a file */
+ class FileStream : public Stream<int>
+ {
+ public:
+
+ FileStream (FILE* file, const std::string& name = "file")
+ : file(file), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {}
+
+ FileStream (const FileName& fileName)
+ : lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(fileName.str())))
+ {
+ file = fopen(fileName.c_str(),"r");
+ if (file == nullptr) THROW_RUNTIME_ERROR("cannot open file " + fileName.str());
+ }
+ ~FileStream() { if (file) fclose(file); }
+
+ public:
+ ParseLocation location() {
+ return ParseLocation(name,lineNumber,colNumber,charNumber);
+ }
+
+ int next() {
+ int c = fgetc(file);
+ if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++;
+ charNumber++;
+ return c;
+ }
+
+ private:
+ FILE* file;
+ ssize_t lineNumber; /// the line number the token is from
+ ssize_t colNumber; /// the character number in the current line
+ ssize_t charNumber; /// the character in the file
+ std::shared_ptr<std::string> name; /// name of buffer
+ };
+
+ /*! creates a stream from a string */
+ class StrStream : public Stream<int>
+ {
+ public:
+
+ StrStream (const char* str)
+ : str(str), lineNumber(1), colNumber(0), charNumber(0) {}
+
+ public:
+ ParseLocation location() {
+ return ParseLocation(std::shared_ptr<std::string>(),lineNumber,colNumber,charNumber);
+ }
+
+ int next() {
+ int c = str[charNumber];
+ if (c == 0) return EOF;
+ if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++;
+ charNumber++;
+ return c;
+ }
+
+ private:
+ const char* str;
+ ssize_t lineNumber; /// the line number the token is from
+ ssize_t colNumber; /// the character number in the current line
+ ssize_t charNumber; /// the character in the file
+ };
+
+ /*! creates a character stream from a command line */
+ class CommandLineStream : public Stream<int>
+ {
+ public:
+ CommandLineStream (int argc, char** argv, const std::string& name = "command line")
+ : i(0), j(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name)))
+ {
+ if (argc > 0) {
+ for (size_t i=0; argv[0][i] && i<1024; i++) charNumber++;
+ charNumber++;
+ }
+ for (ssize_t k=1; k<argc; k++) args.push_back(argv[k]);
+ }
+ ~CommandLineStream() {}
+ public:
+ ParseLocation location() {
+ return ParseLocation(name,0,charNumber,charNumber);
+ }
+ int next() {
+ if (i == args.size()) return EOF;
+ if (j == args[i].size()) { i++; j=0; charNumber++; return ' '; }
+ charNumber++;
+ return args[i][j++];
+ }
+ private:
+ size_t i,j;
+ std::vector<std::string> args;
+ ssize_t charNumber; /// the character in the file
+ std::shared_ptr<std::string> name; /// name of buffer
+ };
+}
diff --git a/thirdparty/embree/common/lexers/streamfilters.h b/thirdparty/embree/common/lexers/streamfilters.h
new file mode 100644
index 0000000000..3592b77b03
--- /dev/null
+++ b/thirdparty/embree/common/lexers/streamfilters.h
@@ -0,0 +1,39 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stream.h"
+
+namespace embree
+{
+ /* removes all line comments from a stream */
+ class LineCommentFilter : public Stream<int>
+ {
+ public:
+ LineCommentFilter (const FileName& fileName, const std::string& lineComment)
+ : cin(new FileStream(fileName)), lineComment(lineComment) {}
+ LineCommentFilter (Ref<Stream<int> > cin, const std::string& lineComment)
+ : cin(cin), lineComment(lineComment) {}
+
+ ParseLocation location() { return cin->loc(); }
+
+ int next()
+ {
+ /* look if the line comment starts here */
+ for (size_t j=0; j<lineComment.size(); j++) {
+ if (cin->peek() != lineComment[j]) { cin->unget(j); goto not_found; }
+ cin->get();
+ }
+ /* eat all characters until the end of the line (or file) */
+ while (cin->peek() != '\n' && cin->peek() != EOF) cin->get();
+
+ not_found:
+ return cin->get();
+ }
+
+ private:
+ Ref<Stream<int> > cin;
+ std::string lineComment;
+ };
+}
diff --git a/thirdparty/embree/common/lexers/stringstream.cpp b/thirdparty/embree/common/lexers/stringstream.cpp
new file mode 100644
index 0000000000..a037869506
--- /dev/null
+++ b/thirdparty/embree/common/lexers/stringstream.cpp
@@ -0,0 +1,51 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "stringstream.h"
+
+namespace embree
+{
+ static const std::string stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\";
+
+ /* creates map for fast categorization of characters */
+ static void createCharMap(bool map[256], const std::string& chrs) {
+ for (size_t i=0; i<256; i++) map[i] = false;
+ for (size_t i=0; i<chrs.size(); i++) map[uint8_t(chrs[i])] = true;
+ }
+
+ /* simple tokenizer */
+ StringStream::StringStream(const Ref<Stream<int> >& cin, const std::string& seps, const std::string& endl, bool multiLine)
+ : cin(cin), endl(endl), multiLine(multiLine)
+ {
+ createCharMap(isSepMap,seps);
+ createCharMap(isValidCharMap,stringChars);
+ }
+
+ std::string StringStream::next()
+ {
+ /* skip separators */
+ while (cin->peek() != EOF) {
+ if (endl != "" && cin->peek() == '\n') { cin->drop(); return endl; }
+ if (multiLine && cin->peek() == '\\') {
+ cin->drop();
+ if (cin->peek() == '\n') { cin->drop(); continue; }
+ cin->unget();
+ }
+ if (!isSeparator(cin->peek())) break;
+ cin->drop();
+ }
+
+ /* parse everything until the next separator */
+ std::vector<char> str; str.reserve(64);
+ while (cin->peek() != EOF && !isSeparator(cin->peek())) {
+ int c = cin->get();
+ // -- GODOT start --
+ // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
+ if (!isValidChar(c)) abort();
+ // -- GODOT end --
+ str.push_back((char)c);
+ }
+ str.push_back(0);
+ return std::string(str.data());
+ }
+}
diff --git a/thirdparty/embree/common/lexers/stringstream.h b/thirdparty/embree/common/lexers/stringstream.h
new file mode 100644
index 0000000000..6d9c27e3cd
--- /dev/null
+++ b/thirdparty/embree/common/lexers/stringstream.h
@@ -0,0 +1,29 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stream.h"
+
+namespace embree
+{
+ /*! simple tokenizer that produces a string stream */
+ class StringStream : public Stream<std::string>
+ {
+ public:
+ StringStream(const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ",
+ const std::string& endl = "", bool multiLine = false);
+ public:
+ ParseLocation location() { return cin->loc(); }
+ std::string next();
+ private:
+ __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; }
+ __forceinline bool isValidChar(unsigned int c) const { return c<256 && isValidCharMap[c]; }
+ private:
+ Ref<Stream<int> > cin; /*! source character stream */
+ bool isSepMap[256]; /*! map for fast classification of separators */
+ bool isValidCharMap[256]; /*! map for valid characters */
+ std::string endl; /*! the token of the end of line */
+ bool multiLine; /*! whether to parse lines wrapped with \ */
+ };
+}
diff --git a/thirdparty/embree/common/lexers/tokenstream.cpp b/thirdparty/embree/common/lexers/tokenstream.cpp
new file mode 100644
index 0000000000..6ed6f2045a
--- /dev/null
+++ b/thirdparty/embree/common/lexers/tokenstream.cpp
@@ -0,0 +1,181 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tokenstream.h"
+#include "../math/math.h"
+
+namespace embree
+{
+ /* shorthands for common sets of characters */
+ const std::string TokenStream::alpha = "abcdefghijklmnopqrstuvwxyz";
+ const std::string TokenStream::ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+ const std::string TokenStream::numbers = "0123456789";
+ const std::string TokenStream::separators = "\n\t\r ";
+ const std::string TokenStream::stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\";
+
+ /* creates map for fast categorization of characters */
+ static void createCharMap(bool map[256], const std::string& chrs) {
+ for (size_t i=0; i<256; i++) map[i] = false;
+ for (size_t i=0; i<chrs.size(); i++) map[uint8_t(chrs[i])] = true;
+ }
+
+ /* build full tokenizer that takes list of valid characters and keywords */
+ TokenStream::TokenStream(const Ref<Stream<int> >& cin, //< stream to read from
+ const std::string& alpha, //< valid characters for identifiers
+ const std::string& seps, //< characters that act as separators
+ const std::vector<std::string>& symbols) //< symbols
+ : cin(cin), symbols(symbols)
+ {
+ createCharMap(isAlphaMap,alpha);
+ createCharMap(isSepMap,seps);
+ createCharMap(isStringCharMap,stringChars);
+ }
+
+ bool TokenStream::decDigits(std::string& str_o)
+ {
+ bool ok = false;
+ std::string str;
+ if (cin->peek() == '+' || cin->peek() == '-') str += (char)cin->get();
+ while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); }
+ if (ok) str_o += str;
+ else cin->unget(str.size());
+ return ok;
+ }
+
+ bool TokenStream::decDigits1(std::string& str_o)
+ {
+ bool ok = false;
+ std::string str;
+ while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); }
+ if (ok) str_o += str; else cin->unget(str.size());
+ return ok;
+ }
+
+ bool TokenStream::trySymbol(const std::string& symbol)
+ {
+ size_t pos = 0;
+ while (pos < symbol.size()) {
+ if (symbol[pos] != cin->peek()) { cin->unget(pos); return false; }
+ cin->drop(); pos++;
+ }
+ return true;
+ }
+
+ bool TokenStream::trySymbols(Token& token, const ParseLocation& loc)
+ {
+ for (size_t i=0; i<symbols.size(); i++) {
+ if (!trySymbol(symbols[i])) continue;
+ token = Token(symbols[i],Token::TY_SYMBOL,loc);
+ return true;
+ }
+ return false;
+ }
+
+ bool TokenStream::tryFloat(Token& token, const ParseLocation& loc)
+ {
+ bool ok = false;
+ std::string str;
+ if (trySymbol("nan")) {
+ token = Token(float(nan));
+ return true;
+ }
+ if (trySymbol("+inf")) {
+ token = Token(float(pos_inf));
+ return true;
+ }
+ if (trySymbol("-inf")) {
+ token = Token(float(neg_inf));
+ return true;
+ }
+
+ if (decDigits(str))
+ {
+ if (cin->peek() == '.') {
+ str += (char)cin->get();
+ decDigits(str);
+ if (cin->peek() == 'e' || cin->peek() == 'E') {
+ str += (char)cin->get();
+ if (decDigits(str)) ok = true; // 1.[2]E2
+ }
+ else ok = true; // 1.[2]
+ }
+ else if (cin->peek() == 'e' || cin->peek() == 'E') {
+ str += (char)cin->get();
+ if (decDigits(str)) ok = true; // 1E2
+ }
+ }
+ else
+ {
+ if (cin->peek() == '.') {
+ str += (char)cin->get();
+ if (decDigits(str)) {
+ if (cin->peek() == 'e' || cin->peek() == 'E') {
+ str += (char)cin->get();
+ if (decDigits(str)) ok = true; // .3E2
+ }
+ else ok = true; // .3
+ }
+ }
+ }
+ if (ok) {
+ token = Token((float)atof(str.c_str()),loc);
+ }
+ else cin->unget(str.size());
+ return ok;
+ }
+
+ bool TokenStream::tryInt(Token& token, const ParseLocation& loc) {
+ std::string str;
+ if (decDigits(str)) {
+ token = Token(atoi(str.c_str()),loc);
+ return true;
+ }
+ return false;
+ }
+
+ bool TokenStream::tryString(Token& token, const ParseLocation& loc)
+ {
+ std::string str;
+ if (cin->peek() != '\"') return false;
+ cin->drop();
+ while (cin->peek() != '\"') {
+ const int c = cin->get();
+ if (!isStringChar(c)) THROW_RUNTIME_ERROR("invalid string character "+std::string(1,c)+" at "+loc.str());
+ str += (char)c;
+ }
+ cin->drop();
+ token = Token(str,Token::TY_STRING,loc);
+ return true;
+ }
+
+ bool TokenStream::tryIdentifier(Token& token, const ParseLocation& loc)
+ {
+ std::string str;
+ if (!isAlpha(cin->peek())) return false;
+ str += (char)cin->get();
+ while (isAlphaNum(cin->peek())) str += (char)cin->get();
+ token = Token(str,Token::TY_IDENTIFIER,loc);
+ return true;
+ }
+
+ void TokenStream::skipSeparators()
+ {
+ /* skip separators */
+ while (cin->peek() != EOF && isSeparator(cin->peek()))
+ cin->drop();
+ }
+
+ Token TokenStream::next()
+ {
+ Token token;
+ skipSeparators();
+ ParseLocation loc = cin->loc();
+ if (trySymbols (token,loc)) return token; /**< try to parse a symbol */
+ if (tryFloat (token,loc)) return token; /**< try to parse float */
+ if (tryInt (token,loc)) return token; /**< try to parse integer */
+ if (tryString (token,loc)) return token; /**< try to parse string */
+ if (tryIdentifier(token,loc)) return token; /**< try to parse identifier */
+ if (cin->peek() == EOF ) return Token(loc); /**< return EOF token */
+ return Token((char)cin->get(),loc); /**< return invalid character token */
+ }
+}
diff --git a/thirdparty/embree/common/lexers/tokenstream.h b/thirdparty/embree/common/lexers/tokenstream.h
new file mode 100644
index 0000000000..6e49dd0b39
--- /dev/null
+++ b/thirdparty/embree/common/lexers/tokenstream.h
@@ -0,0 +1,164 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stream.h"
+#include <string>
+#include <vector>
+
+namespace embree
+{
+ /*! token class */
+ class Token
+ {
+ public:
+
+ enum Type { TY_EOF, TY_CHAR, TY_INT, TY_FLOAT, TY_IDENTIFIER, TY_STRING, TY_SYMBOL };
+
+ Token ( const ParseLocation& loc = ParseLocation()) : ty(TY_EOF ), loc(loc) {}
+ Token (char c, const ParseLocation& loc = ParseLocation()) : ty(TY_CHAR ), c(c), loc(loc) {}
+ Token (int i, const ParseLocation& loc = ParseLocation()) : ty(TY_INT ), i(i), loc(loc) {}
+ Token (float f,const ParseLocation& loc = ParseLocation()) : ty(TY_FLOAT), f(f), loc(loc) {}
+ Token (std::string str, Type ty, const ParseLocation& loc = ParseLocation()) : ty(ty), str(str), loc(loc) {}
+
+ static Token Eof() { return Token(); }
+ static Token Sym(std::string str) { return Token(str,TY_SYMBOL); }
+ static Token Str(std::string str) { return Token(str,TY_STRING); }
+ static Token Id (std::string str) { return Token(str,TY_IDENTIFIER); }
+
+ char Char() const {
+ if (ty == TY_CHAR) return c;
+ THROW_RUNTIME_ERROR(loc.str()+": character expected");
+ }
+
+ int Int() const {
+ if (ty == TY_INT) return i;
+ THROW_RUNTIME_ERROR(loc.str()+": integer expected");
+ }
+
+ float Float(bool cast = true) const {
+ if (ty == TY_FLOAT) return f;
+ if (ty == TY_INT && cast) return (float)i;
+ THROW_RUNTIME_ERROR(loc.str()+": float expected");
+ }
+
+ std::string Identifier() const {
+ if (ty == TY_IDENTIFIER) return str;
+ THROW_RUNTIME_ERROR(loc.str()+": identifier expected");
+ }
+
+ std::string String() const {
+ if (ty == TY_STRING) return str;
+ THROW_RUNTIME_ERROR(loc.str()+": string expected");
+ }
+
+ std::string Symbol() const {
+ if (ty == TY_SYMBOL) return str;
+ THROW_RUNTIME_ERROR(loc.str()+": symbol expected");
+ }
+
+ const ParseLocation& Location() const { return loc; }
+
+ friend bool operator==(const Token& a, const Token& b)
+ {
+ if (a.ty != b.ty) return false;
+ if (a.ty == TY_CHAR) return a.c == b.c;
+ if (a.ty == TY_INT) return a.i == b.i;
+ if (a.ty == TY_FLOAT) return a.f == b.f;
+ if (a.ty == TY_IDENTIFIER) return a.str == b.str;
+ if (a.ty == TY_STRING) return a.str == b.str;
+ if (a.ty == TY_SYMBOL) return a.str == b.str;
+ return true;
+ }
+
+ friend bool operator!=(const Token& a, const Token& b) {
+ return !(a == b);
+ }
+
+ friend bool operator <( const Token& a, const Token& b ) {
+ if (a.ty != b.ty) return (int)a.ty < (int)b.ty;
+ if (a.ty == TY_CHAR) return a.c < b.c;
+ if (a.ty == TY_INT) return a.i < b.i;
+ if (a.ty == TY_FLOAT) return a.f < b.f;
+ if (a.ty == TY_IDENTIFIER) return a.str < b.str;
+ if (a.ty == TY_STRING) return a.str < b.str;
+ if (a.ty == TY_SYMBOL) return a.str < b.str;
+ return false;
+ }
+
+ friend std::ostream& operator<<(std::ostream& cout, const Token& t)
+ {
+ if (t.ty == TY_EOF) return cout << "eof";
+ if (t.ty == TY_CHAR) return cout << "Char(" << t.c << ")";
+ if (t.ty == TY_INT) return cout << "Int(" << t.i << ")";
+ if (t.ty == TY_FLOAT) return cout << "Float(" << t.f << ")";
+ if (t.ty == TY_IDENTIFIER) return cout << "Id(" << t.str << ")";
+ if (t.ty == TY_STRING) return cout << "String(" << t.str << ")";
+ if (t.ty == TY_SYMBOL) return cout << "Symbol(" << t.str << ")";
+ return cout << "unknown";
+ }
+
+ private:
+ Type ty; //< the type of the token
+ union {
+ char c; //< data for char tokens
+ int i; //< data for int tokens
+ float f; //< data for float tokens
+ };
+ std::string str; //< data for string and identifier tokens
+ ParseLocation loc; //< the location the token is from
+ };
+
+ /*! build full tokenizer that takes list of valid characters and keywords */
+ class TokenStream : public Stream<Token>
+ {
+ public:
+
+ /*! shorthands for common sets of characters */
+ static const std::string alpha;
+ static const std::string ALPHA;
+ static const std::string numbers;
+ static const std::string separators;
+ static const std::string stringChars;
+
+ public:
+ TokenStream(const Ref<Stream<int> >& cin,
+ const std::string& alpha, //< valid characters for identifiers
+ const std::string& seps, //< characters that act as separators
+ const std::vector<std::string>& symbols = std::vector<std::string>()); //< symbols
+ public:
+ ParseLocation location() { return cin->loc(); }
+ Token next();
+ bool trySymbol(const std::string& symbol);
+
+ private:
+ void skipSeparators();
+ bool decDigits(std::string& str);
+ bool decDigits1(std::string& str);
+ bool trySymbols(Token& token, const ParseLocation& loc);
+ bool tryFloat(Token& token, const ParseLocation& loc);
+ bool tryInt(Token& token, const ParseLocation& loc);
+ bool tryString(Token& token, const ParseLocation& loc);
+ bool tryIdentifier(Token& token, const ParseLocation& loc);
+
+ Ref<Stream<int> > cin;
+ bool isSepMap[256];
+ bool isAlphaMap[256];
+ bool isStringCharMap[256];
+ std::vector<std::string> symbols;
+
+ /*! checks if a character is a separator */
+ __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; }
+
+ /*! checks if a character is a number */
+ __forceinline bool isDigit(unsigned int c) const { return c >= '0' && c <= '9'; }
+
+ /*! checks if a character is valid inside a string */
+ __forceinline bool isStringChar(unsigned int c) const { return c<256 && isStringCharMap[c]; }
+
+ /*! checks if a character is legal for an identifier */
+ __forceinline bool isAlpha(unsigned int c) const { return c<256 && isAlphaMap[c]; }
+ __forceinline bool isAlphaNum(unsigned int c) const { return isAlpha(c) || isDigit(c); }
+ };
+}
diff --git a/thirdparty/embree/common/math/affinespace.h b/thirdparty/embree/common/math/affinespace.h
new file mode 100644
index 0000000000..9d4a0f0846
--- /dev/null
+++ b/thirdparty/embree/common/math/affinespace.h
@@ -0,0 +1,361 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "linearspace2.h"
+#include "linearspace3.h"
+#include "quaternion.h"
+#include "bbox.h"
+#include "vec4.h"
+
+namespace embree
+{
+ #define VectorT typename L::Vector
+ #define ScalarT typename L::Vector::Scalar
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Affine Space
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename L>
+ struct AffineSpaceT
+ {
+ L l; /*< linear part of affine space */
+ VectorT p; /*< affine part of affine space */
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Constructors, Assignment, Cast, Copy Operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline AffineSpaceT ( ) { }
+ __forceinline AffineSpaceT ( const AffineSpaceT& other ) { l = other.l; p = other.p; }
+ __forceinline AffineSpaceT ( const L & other ) { l = other ; p = VectorT(zero); }
+ __forceinline AffineSpaceT& operator=( const AffineSpaceT& other ) { l = other.l; p = other.p; return *this; }
+
+ __forceinline AffineSpaceT( const VectorT& vx, const VectorT& vy, const VectorT& vz, const VectorT& p ) : l(vx,vy,vz), p(p) {}
+ __forceinline AffineSpaceT( const L& l, const VectorT& p ) : l(l), p(p) {}
+
+ template<typename L1> __forceinline AffineSpaceT( const AffineSpaceT<L1>& s ) : l(s.l), p(s.p) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline AffineSpaceT( ZeroTy ) : l(zero), p(zero) {}
+ __forceinline AffineSpaceT( OneTy ) : l(one), p(zero) {}
+
+ /*! return matrix for scaling */
+ static __forceinline AffineSpaceT scale(const VectorT& s) { return L::scale(s); }
+
+ /*! return matrix for translation */
+ static __forceinline AffineSpaceT translate(const VectorT& p) { return AffineSpaceT(one,p); }
+
+ /*! return matrix for rotation, only in 2D */
+ static __forceinline AffineSpaceT rotate(const ScalarT& r) { return L::rotate(r); }
+
+ /*! return matrix for rotation around arbitrary point (2D) or axis (3D) */
+ static __forceinline AffineSpaceT rotate(const VectorT& u, const ScalarT& r) { return L::rotate(u,r); }
+
+ /*! return matrix for rotation around arbitrary axis and point, only in 3D */
+ static __forceinline AffineSpaceT rotate(const VectorT& p, const VectorT& u, const ScalarT& r) { return translate(+p) * rotate(u,r) * translate(-p); }
+
+ /*! return matrix for looking at given point, only in 3D */
+ static __forceinline AffineSpaceT lookat(const VectorT& eye, const VectorT& point, const VectorT& up) {
+ VectorT Z = normalize(point-eye);
+ VectorT U = normalize(cross(up,Z));
+ VectorT V = normalize(cross(Z,U));
+ return AffineSpaceT(L(U,V,Z),eye);
+ }
+
+ };
+
+ // template specialization to get correct identity matrix for type AffineSpace3fa
+ template<>
+ __forceinline AffineSpaceT<LinearSpace3ff>::AffineSpaceT( OneTy ) : l(one), p(0.f, 0.f, 0.f, 1.f) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename L> __forceinline AffineSpaceT<L> operator -( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(-a.l,-a.p); }
+ template<typename L> __forceinline AffineSpaceT<L> operator +( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(+a.l,+a.p); }
+ template<typename L> __forceinline AffineSpaceT<L> rcp( const AffineSpaceT<L>& a ) { L il = rcp(a.l); return AffineSpaceT<L>(il,-(il*a.p)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename L> __forceinline const AffineSpaceT<L> operator +( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l+b.l,a.p+b.p); }
+ template<typename L> __forceinline const AffineSpaceT<L> operator -( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l-b.l,a.p-b.p); }
+
+ template<typename L> __forceinline const AffineSpaceT<L> operator *( const ScalarT & a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a*b.l,a*b.p); }
+ template<typename L> __forceinline const AffineSpaceT<L> operator *( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l*b.l,a.l*b.p+a.p); }
+ template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a * rcp(b); }
+ template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const ScalarT & b ) { return a * rcp(b); }
+
+ template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a * b; }
+ template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const ScalarT & b ) { return a = a * b; }
+ template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a / b; }
+ template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const ScalarT & b ) { return a = a / b; }
+
+ template<typename L> __forceinline VectorT xfmPoint (const AffineSpaceT<L>& m, const VectorT& p) { return madd(VectorT(p.x),m.l.vx,madd(VectorT(p.y),m.l.vy,madd(VectorT(p.z),m.l.vz,m.p))); }
+ template<typename L> __forceinline VectorT xfmVector(const AffineSpaceT<L>& m, const VectorT& v) { return xfmVector(m.l,v); }
+ template<typename L> __forceinline VectorT xfmNormal(const AffineSpaceT<L>& m, const VectorT& n) { return xfmNormal(m.l,n); }
+
+ __forceinline const BBox<Vec3fa> xfmBounds(const AffineSpaceT<LinearSpace3<Vec3fa> >& m, const BBox<Vec3fa>& b)
+ {
+ BBox3fa dst = empty;
+ const Vec3fa p0(b.lower.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p0));
+ const Vec3fa p1(b.lower.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p1));
+ const Vec3fa p2(b.lower.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p2));
+ const Vec3fa p3(b.lower.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p3));
+ const Vec3fa p4(b.upper.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p4));
+ const Vec3fa p5(b.upper.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p5));
+ const Vec3fa p6(b.upper.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p6));
+ const Vec3fa p7(b.upper.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p7));
+ return dst;
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename L> __forceinline bool operator ==( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l == b.l && a.p == b.p; }
+ template<typename L> __forceinline bool operator !=( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l != b.l || a.p != b.p; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename L> __forceinline AffineSpaceT<L> select ( const typename L::Vector::Scalar::Bool& s, const AffineSpaceT<L>& t, const AffineSpaceT<L>& f ) {
+ return AffineSpaceT<L>(select(s,t.l,f.l),select(s,t.p,f.p));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename L> static embree_ostream operator<<(embree_ostream cout, const AffineSpaceT<L>& m) {
+ return cout << "{ l = " << m.l << ", p = " << m.p << " }";
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Template Instantiations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ typedef AffineSpaceT<LinearSpace2f> AffineSpace2f;
+ typedef AffineSpaceT<LinearSpace3f> AffineSpace3f;
+ typedef AffineSpaceT<LinearSpace3fa> AffineSpace3fa;
+ typedef AffineSpaceT<LinearSpace3fx> AffineSpace3fx;
+ typedef AffineSpaceT<LinearSpace3ff> AffineSpace3ff;
+ typedef AffineSpaceT<Quaternion3f > OrthonormalSpace3f;
+
+ template<int N> using AffineSpace3vf = AffineSpaceT<LinearSpace3<Vec3<vfloat<N>>>>;
+ typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<4>>>> AffineSpace3vf4;
+ typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<8>>>> AffineSpace3vf8;
+ typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<16>>>> AffineSpace3vf16;
+
+ template<int N> using AffineSpace3vff = AffineSpaceT<LinearSpace3<Vec4<vfloat<N>>>>;
+ typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<4>>>> AffineSpace3vfa4;
+ typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<8>>>> AffineSpace3vfa8;
+ typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<16>>>> AffineSpace3vfa16;
+
+ //////////////////////////////////////////////////////////////////////////////
+ /// Interpolation
+ //////////////////////////////////////////////////////////////////////////////
+ template<typename T, typename R>
+ __forceinline AffineSpaceT<T> lerp(const AffineSpaceT<T>& M0,
+ const AffineSpaceT<T>& M1,
+ const R& t)
+ {
+ return AffineSpaceT<T>(lerp(M0.l,M1.l,t),lerp(M0.p,M1.p,t));
+ }
+
+ // slerp interprets the 16 floats of the matrix M = D * R * S as components of
+ // three matrizes (D, R, S) that are interpolated individually.
+ template<typename T> __forceinline AffineSpaceT<LinearSpace3<Vec3<T>>>
+ slerp(const AffineSpaceT<LinearSpace3<Vec4<T>>>& M0,
+ const AffineSpaceT<LinearSpace3<Vec4<T>>>& M1,
+ const T& t)
+ {
+ QuaternionT<T> q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w);
+ QuaternionT<T> q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w);
+ QuaternionT<T> q = slerp(q0, q1, t);
+
+ AffineSpaceT<LinearSpace3<Vec3<T>>> S = lerp(M0, M1, t);
+ AffineSpaceT<LinearSpace3<Vec3<T>>> D(one);
+ D.p.x = S.l.vx.y;
+ D.p.y = S.l.vx.z;
+ D.p.z = S.l.vy.z;
+ S.l.vx.y = 0;
+ S.l.vx.z = 0;
+ S.l.vy.z = 0;
+
+ AffineSpaceT<LinearSpace3<Vec3<T>>> R = LinearSpace3<Vec3<T>>(q);
+ return D * R * S;
+ }
+
+ // this is a specialized version for Vec3fa because that does
+ // not play along nicely with the other templated Vec3/Vec4 types
+ __forceinline AffineSpace3fa slerp(const AffineSpace3ff& M0,
+ const AffineSpace3ff& M1,
+ const float& t)
+ {
+ Quaternion3f q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w);
+ Quaternion3f q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w);
+ Quaternion3f q = slerp(q0, q1, t);
+
+ AffineSpace3fa S = lerp(M0, M1, t);
+ AffineSpace3fa D(one);
+ D.p.x = S.l.vx.y;
+ D.p.y = S.l.vx.z;
+ D.p.z = S.l.vy.z;
+ S.l.vx.y = 0;
+ S.l.vx.z = 0;
+ S.l.vy.z = 0;
+
+ AffineSpace3fa R = LinearSpace3fa(q);
+ return D * R * S;
+ }
+
+ __forceinline AffineSpace3fa quaternionDecompositionToAffineSpace(const AffineSpace3ff& qd)
+ {
+ // compute affine transform from quaternion decomposition
+ Quaternion3f q(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w);
+ AffineSpace3fa M = qd;
+ AffineSpace3fa D(one);
+ D.p.x = M.l.vx.y;
+ D.p.y = M.l.vx.z;
+ D.p.z = M.l.vy.z;
+ M.l.vx.y = 0;
+ M.l.vx.z = 0;
+ M.l.vy.z = 0;
+ AffineSpace3fa R = LinearSpace3fa(q);
+ return D * R * M;
+ }
+
+ __forceinline void quaternionDecomposition(const AffineSpace3ff& qd, Vec3fa& T, Quaternion3f& q, AffineSpace3fa& S)
+ {
+ q = Quaternion3f(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w);
+ S = qd;
+ T.x = qd.l.vx.y;
+ T.y = qd.l.vx.z;
+ T.z = qd.l.vy.z;
+ S.l.vx.y = 0;
+ S.l.vx.z = 0;
+ S.l.vy.z = 0;
+ }
+
+ __forceinline AffineSpace3fx quaternionDecomposition(Vec3fa const& T, Quaternion3f const& q, AffineSpace3fa const& S)
+ {
+ AffineSpace3ff M = S;
+ M.l.vx.w = q.i;
+ M.l.vy.w = q.j;
+ M.l.vz.w = q.k;
+ M.p.w = q.r;
+ M.l.vx.y = T.x;
+ M.l.vx.z = T.y;
+ M.l.vy.z = T.z;
+ return M;
+ }
+
+ struct __aligned(16) QuaternionDecomposition
+ {
+ float scale_x = 1.f;
+ float scale_y = 1.f;
+ float scale_z = 1.f;
+ float skew_xy = 0.f;
+ float skew_xz = 0.f;
+ float skew_yz = 0.f;
+ float shift_x = 0.f;
+ float shift_y = 0.f;
+ float shift_z = 0.f;
+ float quaternion_r = 1.f;
+ float quaternion_i = 0.f;
+ float quaternion_j = 0.f;
+ float quaternion_k = 0.f;
+ float translation_x = 0.f;
+ float translation_y = 0.f;
+ float translation_z = 0.f;
+ };
+
+ __forceinline QuaternionDecomposition quaternionDecomposition(AffineSpace3ff const& M)
+ {
+ QuaternionDecomposition qd;
+ qd.scale_x = M.l.vx.x;
+ qd.scale_y = M.l.vy.y;
+ qd.scale_z = M.l.vz.z;
+ qd.shift_x = M.p.x;
+ qd.shift_y = M.p.y;
+ qd.shift_z = M.p.z;
+ qd.translation_x = M.l.vx.y;
+ qd.translation_y = M.l.vx.z;
+ qd.translation_z = M.l.vy.z;
+ qd.skew_xy = M.l.vy.x;
+ qd.skew_xz = M.l.vz.x;
+ qd.skew_yz = M.l.vz.y;
+ qd.quaternion_r = M.p.w;
+ qd.quaternion_i = M.l.vx.w;
+ qd.quaternion_j = M.l.vy.w;
+ qd.quaternion_k = M.l.vz.w;
+ return qd;
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /*
+ * ! Template Specialization for 2D: return matrix for rotation around point
+ * (rotation around arbitrarty vector is not meaningful in 2D)
+ */
+ template<> __forceinline
+ AffineSpace2f AffineSpace2f::rotate(const Vec2f& p, const float& r) {
+ return translate(+p)*AffineSpace2f(LinearSpace2f::rotate(r))*translate(-p);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Similarity Transform
+ //
+ // checks, if M is a similarity transformation, i.e if there exists a factor D
+ // such that for all x,y: distance(Mx, My) = D * distance(x, y)
+ ////////////////////////////////////////////////////////////////////////////////
+ __forceinline bool similarityTransform(const AffineSpace3fa& M, float* D)
+ {
+ if (D) *D = 0.f;
+ if (abs(dot(M.l.vx, M.l.vy)) > 1e-5f) return false;
+ if (abs(dot(M.l.vx, M.l.vz)) > 1e-5f) return false;
+ if (abs(dot(M.l.vy, M.l.vz)) > 1e-5f) return false;
+
+ const float D_x = dot(M.l.vx, M.l.vx);
+ const float D_y = dot(M.l.vy, M.l.vy);
+ const float D_z = dot(M.l.vz, M.l.vz);
+
+ if (abs(D_x - D_y) > 1e-5f ||
+ abs(D_x - D_z) > 1e-5f ||
+ abs(D_y - D_z) > 1e-5f)
+ return false;
+
+ if (D) *D = sqrtf(D_x);
+ return true;
+ }
+
+ __forceinline void AffineSpace3fa_store_unaligned(const AffineSpace3fa &source, AffineSpace3fa* ptr)
+ {
+ Vec3fa::storeu(&ptr->l.vx, source.l.vx);
+ Vec3fa::storeu(&ptr->l.vy, source.l.vy);
+ Vec3fa::storeu(&ptr->l.vz, source.l.vz);
+ Vec3fa::storeu(&ptr->p, source.p);
+ }
+
+ __forceinline AffineSpace3fa AffineSpace3fa_load_unaligned(AffineSpace3fa* ptr)
+ {
+ AffineSpace3fa space;
+ space.l.vx = Vec3fa::loadu(&ptr->l.vx);
+ space.l.vy = Vec3fa::loadu(&ptr->l.vy);
+ space.l.vz = Vec3fa::loadu(&ptr->l.vz);
+ space.p = Vec3fa::loadu(&ptr->p);
+ return space;
+ }
+
+ #undef VectorT
+ #undef ScalarT
+}
diff --git a/thirdparty/embree/common/math/bbox.h b/thirdparty/embree/common/math/bbox.h
new file mode 100644
index 0000000000..bc43155358
--- /dev/null
+++ b/thirdparty/embree/common/math/bbox.h
@@ -0,0 +1,331 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec2.h"
+#include "vec3.h"
+
+namespace embree
+{
+ namespace internal {
+
+ template <typename T> __forceinline T divideByTwo(const T& v) { return v / T(2); }
+ template <> __forceinline float divideByTwo<float>(const float& v) { return v * 0.5f; }
+ template <> __forceinline double divideByTwo<double>(const double& v) { return v * 0.5; }
+
+ } // namespace internal
+ template<typename T>
+ struct BBox
+ {
+ T lower, upper;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Construction
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline BBox ( ) { }
+ template<typename T1>
+ __forceinline BBox ( const BBox<T1>& other ) : lower(other.lower), upper(other.upper) {}
+ __forceinline BBox& operator=( const BBox& other ) { lower = other.lower; upper = other.upper; return *this; }
+
+ __forceinline BBox ( const T& v ) : lower(v), upper(v) {}
+ __forceinline BBox ( const T& lower, const T& upper ) : lower(lower), upper(upper) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Extending Bounds
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const BBox& extend(const BBox& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; }
+ __forceinline const BBox& extend(const T & other) { lower = min(lower,other ); upper = max(upper,other ); return *this; }
+
+ /*! tests if box is empty */
+ __forceinline bool empty() const { for (int i=0; i<T::N; i++) if (lower[i] > upper[i]) return true; return false; }
+
+ /*! computes the size of the box */
+ __forceinline T size() const { return upper - lower; }
+
+ /*! computes the center of the box */
+ __forceinline T center() const { return internal::divideByTwo<T>(lower+upper); }
+
+ /*! computes twice the center of the box */
+ __forceinline T center2() const { return lower+upper; }
+
+ /*! merges two boxes */
+ __forceinline static const BBox merge (const BBox& a, const BBox& b) {
+ return BBox(min(a.lower, b.lower), max(a.upper, b.upper));
+ }
+
+ /*! enlarge box by some scaling factor */
+ __forceinline BBox enlarge_by(const float a) const {
+ return BBox(lower - T(a)*abs(lower), upper + T(a)*abs(upper));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline BBox( EmptyTy ) : lower(pos_inf), upper(neg_inf) {}
+ __forceinline BBox( FullTy ) : lower(neg_inf), upper(pos_inf) {}
+ __forceinline BBox( FalseTy ) : lower(pos_inf), upper(neg_inf) {}
+ __forceinline BBox( TrueTy ) : lower(neg_inf), upper(pos_inf) {}
+ __forceinline BBox( NegInfTy ): lower(pos_inf), upper(neg_inf) {}
+ __forceinline BBox( PosInfTy ): lower(neg_inf), upper(pos_inf) {}
+ };
+
+ template<> __forceinline bool BBox<float>::empty() const {
+ return lower > upper;
+ }
+
+#if defined(__SSE__)
+ template<> __forceinline bool BBox<Vec3fa>::empty() const {
+ return !all(le_mask(lower,upper));
+ }
+ template<> __forceinline bool BBox<Vec3fx>::empty() const {
+ return !all(le_mask(lower,upper));
+ }
+#endif
+
+ /*! tests if box is finite */
+ __forceinline bool isvalid( const BBox<Vec3fa>& v ) {
+ return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)));
+ }
+
+ /*! tests if box is finite and non-empty*/
+ __forceinline bool isvalid_non_empty( const BBox<Vec3fa>& v ) {
+ return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)) & le_mask(v.lower,v.upper));
+ }
+
+ /*! tests if box has finite entries */
+ __forceinline bool is_finite( const BBox<Vec3fa>& b) {
+ return is_finite(b.lower) && is_finite(b.upper);
+ }
+
+ /*! test if point contained in box */
+ __forceinline bool inside ( const BBox<Vec3fa>& b, const Vec3fa& p ) { return all(ge_mask(p,b.lower) & le_mask(p,b.upper)); }
+
+ /*! computes the center of the box */
+ template<typename T> __forceinline const T center2(const BBox<T>& box) { return box.lower + box.upper; }
+ template<typename T> __forceinline const T center (const BBox<T>& box) { return internal::divideByTwo<T>(center2(box)); }
+
+ /*! computes the volume of a bounding box */
+ __forceinline float volume ( const BBox<Vec3fa>& b ) { return reduce_mul(b.size()); }
+ __forceinline float safeVolume( const BBox<Vec3fa>& b ) { if (b.empty()) return 0.0f; else return volume(b); }
+
+ /*! computes the volume of a bounding box */
+ __forceinline float volume( const BBox<Vec3f>& b ) { return reduce_mul(b.size()); }
+
+ /*! computes the surface area of a bounding box */
+ template<typename T> __forceinline const T area( const BBox<Vec2<T> >& b ) { const Vec2<T> d = b.size(); return d.x*d.y; }
+
+ template<typename T> __forceinline const T halfArea( const BBox<Vec3<T> >& b ) { return halfArea(b.size()); }
+ template<typename T> __forceinline const T area( const BBox<Vec3<T> >& b ) { return T(2)*halfArea(b); }
+
+ __forceinline float halfArea( const BBox<Vec3fa>& b ) { return halfArea(b.size()); }
+ __forceinline float area( const BBox<Vec3fa>& b ) { return 2.0f*halfArea(b); }
+
+ __forceinline float halfArea( const BBox<Vec3fx>& b ) { return halfArea(b.size()); }
+ __forceinline float area( const BBox<Vec3fx>& b ) { return 2.0f*halfArea(b); }
+
+ template<typename Vec> __forceinline float safeArea( const BBox<Vec>& b ) { if (b.empty()) return 0.0f; else return area(b); }
+
+ template<typename T> __forceinline float expectedApproxHalfArea(const BBox<T>& box) {
+ return halfArea(box);
+ }
+
+ /*! merges bounding boxes and points */
+ template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const T& b ) { return BBox<T>(min(a.lower, b ), max(a.upper, b )); }
+ template<typename T> __forceinline const BBox<T> merge( const T& a, const BBox<T>& b ) { return BBox<T>(min(a , b.lower), max(a , b.upper)); }
+ template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(min(a.lower, b.lower), max(a.upper, b.upper)); }
+
+ /*! Merges three boxes. */
+ template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return merge(a,merge(b,c)); }
+
+ /*! Merges four boxes. */
+ template<typename T> __forceinline BBox<T> merge(const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d) {
+ return merge(merge(a,b),merge(c,d));
+ }
+
+ /*! Comparison Operators */
+ template<typename T> __forceinline bool operator==( const BBox<T>& a, const BBox<T>& b ) { return a.lower == b.lower && a.upper == b.upper; }
+ template<typename T> __forceinline bool operator!=( const BBox<T>& a, const BBox<T>& b ) { return a.lower != b.lower || a.upper != b.upper; }
+
+ /*! scaling */
+ template<typename T> __forceinline BBox<T> operator *( const float& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); }
+ template<typename T> __forceinline BBox<T> operator *( const T& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); }
+
+ /*! translations */
+ template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower+b.lower,a.upper+b.upper); }
+ template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower-b.lower,a.upper-b.upper); }
+ template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const T & b ) { return BBox<T>(a.lower+b ,a.upper+b ); }
+ template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const T & b ) { return BBox<T>(a.lower-b ,a.upper-b ); }
+
+ /*! extension */
+ template<typename T> __forceinline BBox<T> enlarge(const BBox<T>& a, const T& b) { return BBox<T>(a.lower-b, a.upper+b); }
+
+ /*! intersect bounding boxes */
+ template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(max(a.lower, b.lower), min(a.upper, b.upper)); }
+ template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return intersect(a,intersect(b,c)); }
+ template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d ) { return intersect(intersect(a,b),intersect(c,d)); }
+
+ /*! subtract bounds from each other */
+ template<typename T> __forceinline void subtract(const BBox<T>& a, const BBox<T>& b, BBox<T>& c, BBox<T>& d)
+ {
+ c.lower = a.lower;
+ c.upper = min(a.upper,b.lower);
+ d.lower = max(a.lower,b.upper);
+ d.upper = a.upper;
+ }
+
+ /*! tests if bounding boxes (and points) are disjoint (empty intersection) */
+ template<typename T> __inline bool disjoint( const BBox<T>& a, const BBox<T>& b ) { return intersect(a,b).empty(); }
+ template<typename T> __inline bool disjoint( const BBox<T>& a, const T& b ) { return disjoint(a,BBox<T>(b)); }
+ template<typename T> __inline bool disjoint( const T& a, const BBox<T>& b ) { return disjoint(BBox<T>(a),b); }
+
+ /*! tests if bounding boxes (and points) are conjoint (non-empty intersection) */
+ template<typename T> __inline bool conjoint( const BBox<T>& a, const BBox<T>& b ) { return !intersect(a,b).empty(); }
+ template<typename T> __inline bool conjoint( const BBox<T>& a, const T& b ) { return conjoint(a,BBox<T>(b)); }
+ template<typename T> __inline bool conjoint( const T& a, const BBox<T>& b ) { return conjoint(BBox<T>(a),b); }
+
+ /*! subset relation */
+ template<typename T> __inline bool subset( const BBox<T>& a, const BBox<T>& b )
+ {
+ for ( size_t i = 0; i < T::N; i++ ) if ( a.lower[i] < b.lower[i] ) return false;
+ for ( size_t i = 0; i < T::N; i++ ) if ( a.upper[i] > b.upper[i] ) return false;
+ return true;
+ }
+
+ template<> __inline bool subset( const BBox<Vec3fa>& a, const BBox<Vec3fa>& b ) {
+ return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper));
+ }
+
+ template<> __inline bool subset( const BBox<Vec3fx>& a, const BBox<Vec3fx>& b ) {
+ return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper));
+ }
+
+ /*! blending */
+ template<typename T>
+ __forceinline BBox<T> lerp(const BBox<T>& b0, const BBox<T>& b1, const float t) {
+ return BBox<T>(lerp(b0.lower,b1.lower,t),lerp(b0.upper,b1.upper,t));
+ }
+
+ /*! output operator */
+ template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const BBox<T>& box) {
+ return cout << "[" << box.lower << "; " << box.upper << "]";
+ }
+
+ /*! default template instantiations */
+ typedef BBox<float> BBox1f;
+ typedef BBox<Vec2f> BBox2f;
+ typedef BBox<Vec2fa> BBox2fa;
+ typedef BBox<Vec3f> BBox3f;
+ typedef BBox<Vec3fa> BBox3fa;
+ typedef BBox<Vec3fx> BBox3fx;
+ typedef BBox<Vec3ff> BBox3ff;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// SSE / AVX / MIC specializations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined __SSE__
+#include "../simd/sse.h"
+#endif
+
+#if defined __AVX__
+#include "../simd/avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "../simd/avx512.h"
+#endif
+
+namespace embree
+{
+ template<int N>
+ __forceinline BBox<Vec3<vfloat<N>>> transpose(const BBox3fa* bounds);
+
+ template<>
+ __forceinline BBox<Vec3<vfloat4>> transpose<4>(const BBox3fa* bounds)
+ {
+ BBox<Vec3<vfloat4>> dest;
+
+ transpose((vfloat4&)bounds[0].lower,
+ (vfloat4&)bounds[1].lower,
+ (vfloat4&)bounds[2].lower,
+ (vfloat4&)bounds[3].lower,
+ dest.lower.x,
+ dest.lower.y,
+ dest.lower.z);
+
+ transpose((vfloat4&)bounds[0].upper,
+ (vfloat4&)bounds[1].upper,
+ (vfloat4&)bounds[2].upper,
+ (vfloat4&)bounds[3].upper,
+ dest.upper.x,
+ dest.upper.y,
+ dest.upper.z);
+
+ return dest;
+ }
+
+#if defined(__AVX__)
+ template<>
+ __forceinline BBox<Vec3<vfloat8>> transpose<8>(const BBox3fa* bounds)
+ {
+ BBox<Vec3<vfloat8>> dest;
+
+ transpose((vfloat4&)bounds[0].lower,
+ (vfloat4&)bounds[1].lower,
+ (vfloat4&)bounds[2].lower,
+ (vfloat4&)bounds[3].lower,
+ (vfloat4&)bounds[4].lower,
+ (vfloat4&)bounds[5].lower,
+ (vfloat4&)bounds[6].lower,
+ (vfloat4&)bounds[7].lower,
+ dest.lower.x,
+ dest.lower.y,
+ dest.lower.z);
+
+ transpose((vfloat4&)bounds[0].upper,
+ (vfloat4&)bounds[1].upper,
+ (vfloat4&)bounds[2].upper,
+ (vfloat4&)bounds[3].upper,
+ (vfloat4&)bounds[4].upper,
+ (vfloat4&)bounds[5].upper,
+ (vfloat4&)bounds[6].upper,
+ (vfloat4&)bounds[7].upper,
+ dest.upper.x,
+ dest.upper.y,
+ dest.upper.z);
+
+ return dest;
+ }
+#endif
+
+ template<int N>
+ __forceinline BBox3fa merge(const BBox3fa* bounds);
+
+ template<>
+ __forceinline BBox3fa merge<4>(const BBox3fa* bounds)
+ {
+ const Vec3fa lower = min(min(bounds[0].lower,bounds[1].lower),
+ min(bounds[2].lower,bounds[3].lower));
+ const Vec3fa upper = max(max(bounds[0].upper,bounds[1].upper),
+ max(bounds[2].upper,bounds[3].upper));
+ return BBox3fa(lower,upper);
+ }
+
+#if defined(__AVX__)
+ template<>
+ __forceinline BBox3fa merge<8>(const BBox3fa* bounds)
+ {
+ const Vec3fa lower = min(min(min(bounds[0].lower,bounds[1].lower),min(bounds[2].lower,bounds[3].lower)),
+ min(min(bounds[4].lower,bounds[5].lower),min(bounds[6].lower,bounds[7].lower)));
+ const Vec3fa upper = max(max(max(bounds[0].upper,bounds[1].upper),max(bounds[2].upper,bounds[3].upper)),
+ max(max(bounds[4].upper,bounds[5].upper),max(bounds[6].upper,bounds[7].upper)));
+ return BBox3fa(lower,upper);
+ }
+#endif
+}
+
diff --git a/thirdparty/embree/common/math/col3.h b/thirdparty/embree/common/math/col3.h
new file mode 100644
index 0000000000..3f50c04393
--- /dev/null
+++ b/thirdparty/embree/common/math/col3.h
@@ -0,0 +1,47 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+
+namespace embree
+{
+ ////////////////////////////////////////////////////////////////////////////////
+ /// RGB Color Class
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> struct Col3
+ {
+ T r, g, b;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Construction
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Col3 ( ) { }
+ __forceinline Col3 ( const Col3& other ) { r = other.r; g = other.g; b = other.b; }
+ __forceinline Col3& operator=( const Col3& other ) { r = other.r; g = other.g; b = other.b; return *this; }
+
+ __forceinline explicit Col3 (const T& v) : r(v), g(v), b(v) {}
+ __forceinline Col3 (const T& r, const T& g, const T& b) : r(r), g(g), b(b) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Col3 (ZeroTy) : r(zero) , g(zero) , b(zero) {}
+ __forceinline Col3 (OneTy) : r(one) , g(one) , b(one) {}
+ __forceinline Col3 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf) {}
+ __forceinline Col3 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf) {}
+ };
+
+ /*! output operator */
+ template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col3<T>& a) {
+ return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")";
+ }
+
+ /*! default template instantiations */
+ typedef Col3<unsigned char> Col3uc;
+ typedef Col3<float > Col3f;
+}
diff --git a/thirdparty/embree/common/math/col4.h b/thirdparty/embree/common/math/col4.h
new file mode 100644
index 0000000000..788508516b
--- /dev/null
+++ b/thirdparty/embree/common/math/col4.h
@@ -0,0 +1,47 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+
+namespace embree
+{
+ ////////////////////////////////////////////////////////////////////////////////
+ /// RGBA Color Class
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> struct Col4
+ {
+ T r, g, b, a;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Construction
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Col4 ( ) { }
+ __forceinline Col4 ( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; }
+ __forceinline Col4& operator=( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; return *this; }
+
+ __forceinline explicit Col4 (const T& v) : r(v), g(v), b(v), a(v) {}
+ __forceinline Col4 (const T& r, const T& g, const T& b, const T& a) : r(r), g(g), b(b), a(a) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Col4 (ZeroTy) : r(zero) , g(zero) , b(zero) , a(zero) {}
+ __forceinline Col4 (OneTy) : r(one) , g(one) , b(one) , a(one) {}
+ __forceinline Col4 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf), a(pos_inf) {}
+ __forceinline Col4 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf), a(neg_inf) {}
+ };
+
+ /*! output operator */
+ template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col4<T>& a) {
+ return cout << "(" << a.r << ", " << a.g << ", " << a.b << ", " << a.a << ")";
+ }
+
+ /*! default template instantiations */
+ typedef Col4<unsigned char> Col4uc;
+ typedef Col4<float > Col4f;
+}
diff --git a/thirdparty/embree/common/math/color.h b/thirdparty/embree/common/math/color.h
new file mode 100644
index 0000000000..529584ea16
--- /dev/null
+++ b/thirdparty/embree/common/math/color.h
@@ -0,0 +1,241 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "constants.h"
+#include "col3.h"
+#include "col4.h"
+
+#include "../simd/sse.h"
+
+namespace embree
+{
+ ////////////////////////////////////////////////////////////////////////////////
+ /// SSE RGBA Color Class
+ ////////////////////////////////////////////////////////////////////////////////
+
+ struct Color4
+ {
+ union {
+ __m128 m128;
+ struct { float r,g,b,a; };
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Construction
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Color4 () {}
+ __forceinline Color4 ( const __m128 a ) : m128(a) {}
+
+ __forceinline explicit Color4 (const float v) : m128(_mm_set1_ps(v)) {}
+ __forceinline Color4 (const float r, const float g, const float b, const float a) : m128(_mm_set_ps(a,b,g,r)) {}
+
+ __forceinline explicit Color4 ( const Col3uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(255.0f,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); }
+ __forceinline explicit Color4 ( const Col3f& other ) { m128 = _mm_set_ps(1.0f,other.b,other.g,other.r); }
+ __forceinline explicit Color4 ( const Col4uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(other.a,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); }
+ __forceinline explicit Color4 ( const Col4f& other ) { m128 = _mm_set_ps(other.a,other.b,other.g,other.r); }
+
+ __forceinline Color4 ( const Color4& other ) : m128(other.m128) {}
+ __forceinline Color4& operator=( const Color4& other ) { m128 = other.m128; return *this; }
+
+ __forceinline operator const __m128&() const { return m128; }
+ __forceinline operator __m128&() { return m128; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Set
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; }
+ __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = a; }
+ __forceinline void set(Col3uc& d) const
+ {
+ vfloat4 s = clamp(vfloat4(m128))*255.0f;
+ d.r = (unsigned char)(s[0]);
+ d.g = (unsigned char)(s[1]);
+ d.b = (unsigned char)(s[2]);
+ }
+ __forceinline void set(Col4uc& d) const
+ {
+ vfloat4 s = clamp(vfloat4(m128))*255.0f;
+ d.r = (unsigned char)(s[0]);
+ d.g = (unsigned char)(s[1]);
+ d.b = (unsigned char)(s[2]);
+ d.a = (unsigned char)(s[3]);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Color4( ZeroTy ) : m128(_mm_set1_ps(0.0f)) {}
+ __forceinline Color4( OneTy ) : m128(_mm_set1_ps(1.0f)) {}
+ __forceinline Color4( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+ __forceinline Color4( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// SSE RGB Color Class
+ ////////////////////////////////////////////////////////////////////////////////
+
+ struct Color
+ {
+ union {
+ __m128 m128;
+ struct { float r,g,b; };
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Construction
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Color () {}
+ __forceinline Color ( const __m128 a ) : m128(a) {}
+
+ __forceinline explicit Color (const float v) : m128(_mm_set1_ps(v)) {}
+ __forceinline Color (const float r, const float g, const float b) : m128(_mm_set_ps(0.0f,b,g,r)) {}
+
+ __forceinline Color ( const Color& other ) : m128(other.m128) {}
+ __forceinline Color& operator=( const Color& other ) { m128 = other.m128; return *this; }
+
+ __forceinline Color ( const Color4& other ) : m128(other.m128) {}
+ __forceinline Color& operator=( const Color4& other ) { m128 = other.m128; return *this; }
+
+ __forceinline operator const __m128&() const { return m128; }
+ __forceinline operator __m128&() { return m128; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Set
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; }
+ __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = 1.0f; }
+ __forceinline void set(Col3uc& d) const
+ {
+ vfloat4 s = clamp(vfloat4(m128))*255.0f;
+ d.r = (unsigned char)(s[0]);
+ d.g = (unsigned char)(s[1]);
+ d.b = (unsigned char)(s[2]);
+ }
+ __forceinline void set(Col4uc& d) const
+ {
+ vfloat4 s = clamp(vfloat4(m128))*255.0f;
+ d.r = (unsigned char)(s[0]);
+ d.g = (unsigned char)(s[1]);
+ d.b = (unsigned char)(s[2]);
+ d.a = 255;
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Color( ZeroTy ) : m128(_mm_set1_ps(0.0f)) {}
+ __forceinline Color( OneTy ) : m128(_mm_set1_ps(1.0f)) {}
+ __forceinline Color( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+ __forceinline Color( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const Color operator +( const Color& a ) { return a; }
+ __forceinline const Color operator -( const Color& a ) {
+ const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+ return _mm_xor_ps(a.m128, mask);
+ }
+ __forceinline const Color abs ( const Color& a ) {
+ const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+ return _mm_and_ps(a.m128, mask);
+ }
+ __forceinline const Color rcp ( const Color& a )
+ {
+#if defined(__AVX512VL__)
+ const Color r = _mm_rcp14_ps(a.m128);
+#else
+ const Color r = _mm_rcp_ps(a.m128);
+#endif
+ return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+ }
+ __forceinline const Color rsqrt( const Color& a )
+ {
+#if defined(__AVX512VL__)
+ __m128 r = _mm_rsqrt14_ps(a.m128);
+#else
+ __m128 r = _mm_rsqrt_ps(a.m128);
+#endif
+ return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+ }
+ __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const Color operator +( const Color& a, const Color& b ) { return _mm_add_ps(a.m128, b.m128); }
+ __forceinline const Color operator -( const Color& a, const Color& b ) { return _mm_sub_ps(a.m128, b.m128); }
+ __forceinline const Color operator *( const Color& a, const Color& b ) { return _mm_mul_ps(a.m128, b.m128); }
+ __forceinline const Color operator *( const Color& a, const float b ) { return a * Color(b); }
+ __forceinline const Color operator *( const float a, const Color& b ) { return Color(a) * b; }
+ __forceinline const Color operator /( const Color& a, const Color& b ) { return a * rcp(b); }
+ __forceinline const Color operator /( const Color& a, const float b ) { return a * rcp(b); }
+
+ __forceinline const Color min( const Color& a, const Color& b ) { return _mm_min_ps(a.m128,b.m128); }
+ __forceinline const Color max( const Color& a, const Color& b ) { return _mm_max_ps(a.m128,b.m128); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const Color operator+=(Color& a, const Color& b) { return a = a + b; }
+ __forceinline const Color operator-=(Color& a, const Color& b) { return a = a - b; }
+ __forceinline const Color operator*=(Color& a, const Color& b) { return a = a * b; }
+ __forceinline const Color operator/=(Color& a, const Color& b) { return a = a / b; }
+ __forceinline const Color operator*=(Color& a, const float b ) { return a = a * b; }
+ __forceinline const Color operator/=(Color& a, const float b ) { return a = a / b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline float reduce_add(const Color& v) { return v.r+v.g+v.b; }
+ __forceinline float reduce_mul(const Color& v) { return v.r*v.g*v.b; }
+ __forceinline float reduce_min(const Color& v) { return min(v.r,v.g,v.b); }
+ __forceinline float reduce_max(const Color& v) { return max(v.r,v.g,v.b); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool operator ==( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }
+ __forceinline bool operator !=( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }
+ __forceinline bool operator < ( const Color& a, const Color& b ) {
+ if (a.r != b.r) return a.r < b.r;
+ if (a.g != b.g) return a.g < b.g;
+ if (a.b != b.b) return a.b < b.b;
+ return false;
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const Color select( bool s, const Color& t, const Color& f ) {
+ __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
+ return blendv_ps(f, t, mask);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Special Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ /*! computes luminance of a color */
+ __forceinline float luminance (const Color& a) { return madd(0.212671f,a.r,madd(0.715160f,a.g,0.072169f*a.b)); }
+
+ /*! output operator */
+ __forceinline embree_ostream operator<<(embree_ostream cout, const Color& a) {
+ return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")";
+ }
+}
diff --git a/thirdparty/embree/common/math/constants.cpp b/thirdparty/embree/common/math/constants.cpp
new file mode 100644
index 0000000000..03919ae20c
--- /dev/null
+++ b/thirdparty/embree/common/math/constants.cpp
@@ -0,0 +1,27 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "constants.h"
+
+namespace embree
+{
+ TrueTy True;
+ FalseTy False;
+ ZeroTy zero;
+ OneTy one;
+ NegInfTy neg_inf;
+ PosInfTy inf;
+ PosInfTy pos_inf;
+ NaNTy nan;
+ UlpTy ulp;
+ PiTy pi;
+ OneOverPiTy one_over_pi;
+ TwoPiTy two_pi;
+ OneOverTwoPiTy one_over_two_pi;
+ FourPiTy four_pi;
+ OneOverFourPiTy one_over_four_pi;
+ StepTy step;
+ ReverseStepTy reverse_step;
+ EmptyTy empty;
+ UndefinedTy undefined;
+}
diff --git a/thirdparty/embree/common/math/constants.h b/thirdparty/embree/common/math/constants.h
new file mode 100644
index 0000000000..578473a8ab
--- /dev/null
+++ b/thirdparty/embree/common/math/constants.h
@@ -0,0 +1,197 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+
+#include <limits>
+
+#define _USE_MATH_DEFINES
+#include <math.h> // using cmath causes issues under Windows
+#include <cfloat>
+#include <climits>
+
+namespace embree
+{
+ static MAYBE_UNUSED const float one_over_255 = 1.0f/255.0f;
+ static MAYBE_UNUSED const float min_rcp_input = 1E-18f; // for abs(x) >= min_rcp_input the newton raphson rcp calculation does not fail
+
+ /* we consider floating point numbers in that range as valid input numbers */
+ static MAYBE_UNUSED float FLT_LARGE = 1.844E18f;
+
+ struct TrueTy {
+ __forceinline operator bool( ) const { return true; }
+ };
+
+ extern MAYBE_UNUSED TrueTy True;
+
+ struct FalseTy {
+ __forceinline operator bool( ) const { return false; }
+ };
+
+ extern MAYBE_UNUSED FalseTy False;
+
+ struct ZeroTy
+ {
+ __forceinline operator double ( ) const { return 0; }
+ __forceinline operator float ( ) const { return 0; }
+ __forceinline operator long long( ) const { return 0; }
+ __forceinline operator unsigned long long( ) const { return 0; }
+ __forceinline operator long ( ) const { return 0; }
+ __forceinline operator unsigned long ( ) const { return 0; }
+ __forceinline operator int ( ) const { return 0; }
+ __forceinline operator unsigned int ( ) const { return 0; }
+ __forceinline operator short ( ) const { return 0; }
+ __forceinline operator unsigned short ( ) const { return 0; }
+ __forceinline operator char ( ) const { return 0; }
+ __forceinline operator unsigned char ( ) const { return 0; }
+ };
+
+ extern MAYBE_UNUSED ZeroTy zero;
+
+ struct OneTy
+ {
+ __forceinline operator double ( ) const { return 1; }
+ __forceinline operator float ( ) const { return 1; }
+ __forceinline operator long long( ) const { return 1; }
+ __forceinline operator unsigned long long( ) const { return 1; }
+ __forceinline operator long ( ) const { return 1; }
+ __forceinline operator unsigned long ( ) const { return 1; }
+ __forceinline operator int ( ) const { return 1; }
+ __forceinline operator unsigned int ( ) const { return 1; }
+ __forceinline operator short ( ) const { return 1; }
+ __forceinline operator unsigned short ( ) const { return 1; }
+ __forceinline operator char ( ) const { return 1; }
+ __forceinline operator unsigned char ( ) const { return 1; }
+ };
+
+ extern MAYBE_UNUSED OneTy one;
+
+ struct NegInfTy
+ {
+ __forceinline operator double ( ) const { return -std::numeric_limits<double>::infinity(); }
+ __forceinline operator float ( ) const { return -std::numeric_limits<float>::infinity(); }
+ __forceinline operator long long( ) const { return std::numeric_limits<long long>::min(); }
+ __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::min(); }
+ __forceinline operator long ( ) const { return std::numeric_limits<long>::min(); }
+ __forceinline operator unsigned long ( ) const { return std::numeric_limits<unsigned long>::min(); }
+ __forceinline operator int ( ) const { return std::numeric_limits<int>::min(); }
+ __forceinline operator unsigned int ( ) const { return std::numeric_limits<unsigned int>::min(); }
+ __forceinline operator short ( ) const { return std::numeric_limits<short>::min(); }
+ __forceinline operator unsigned short ( ) const { return std::numeric_limits<unsigned short>::min(); }
+ __forceinline operator char ( ) const { return std::numeric_limits<char>::min(); }
+ __forceinline operator unsigned char ( ) const { return std::numeric_limits<unsigned char>::min(); }
+
+ };
+
+ extern MAYBE_UNUSED NegInfTy neg_inf;
+
+ struct PosInfTy
+ {
+ __forceinline operator double ( ) const { return std::numeric_limits<double>::infinity(); }
+ __forceinline operator float ( ) const { return std::numeric_limits<float>::infinity(); }
+ __forceinline operator long long( ) const { return std::numeric_limits<long long>::max(); }
+ __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::max(); }
+ __forceinline operator long ( ) const { return std::numeric_limits<long>::max(); }
+ __forceinline operator unsigned long ( ) const { return std::numeric_limits<unsigned long>::max(); }
+ __forceinline operator int ( ) const { return std::numeric_limits<int>::max(); }
+ __forceinline operator unsigned int ( ) const { return std::numeric_limits<unsigned int>::max(); }
+ __forceinline operator short ( ) const { return std::numeric_limits<short>::max(); }
+ __forceinline operator unsigned short ( ) const { return std::numeric_limits<unsigned short>::max(); }
+ __forceinline operator char ( ) const { return std::numeric_limits<char>::max(); }
+ __forceinline operator unsigned char ( ) const { return std::numeric_limits<unsigned char>::max(); }
+ };
+
+ extern MAYBE_UNUSED PosInfTy inf;
+ extern MAYBE_UNUSED PosInfTy pos_inf;
+
+ struct NaNTy
+ {
+ __forceinline operator double( ) const { return std::numeric_limits<double>::quiet_NaN(); }
+ __forceinline operator float ( ) const { return std::numeric_limits<float>::quiet_NaN(); }
+ };
+
+ extern MAYBE_UNUSED NaNTy nan;
+
+ struct UlpTy
+ {
+ __forceinline operator double( ) const { return std::numeric_limits<double>::epsilon(); }
+ __forceinline operator float ( ) const { return std::numeric_limits<float>::epsilon(); }
+ };
+
+ extern MAYBE_UNUSED UlpTy ulp;
+
+ struct PiTy
+ {
+ __forceinline operator double( ) const { return double(M_PI); }
+ __forceinline operator float ( ) const { return float(M_PI); }
+ };
+
+ extern MAYBE_UNUSED PiTy pi;
+
+ struct OneOverPiTy
+ {
+ __forceinline operator double( ) const { return double(M_1_PI); }
+ __forceinline operator float ( ) const { return float(M_1_PI); }
+ };
+
+ extern MAYBE_UNUSED OneOverPiTy one_over_pi;
+
+ struct TwoPiTy
+ {
+ __forceinline operator double( ) const { return double(2.0*M_PI); }
+ __forceinline operator float ( ) const { return float(2.0*M_PI); }
+ };
+
+ extern MAYBE_UNUSED TwoPiTy two_pi;
+
+ struct OneOverTwoPiTy
+ {
+ __forceinline operator double( ) const { return double(0.5*M_1_PI); }
+ __forceinline operator float ( ) const { return float(0.5*M_1_PI); }
+ };
+
+ extern MAYBE_UNUSED OneOverTwoPiTy one_over_two_pi;
+
+ struct FourPiTy
+ {
+ __forceinline operator double( ) const { return double(4.0*M_PI); }
+ __forceinline operator float ( ) const { return float(4.0*M_PI); }
+ };
+
+ extern MAYBE_UNUSED FourPiTy four_pi;
+
+ struct OneOverFourPiTy
+ {
+ __forceinline operator double( ) const { return double(0.25*M_1_PI); }
+ __forceinline operator float ( ) const { return float(0.25*M_1_PI); }
+ };
+
+ extern MAYBE_UNUSED OneOverFourPiTy one_over_four_pi;
+
+ struct StepTy {
+ };
+
+ extern MAYBE_UNUSED StepTy step;
+
+ struct ReverseStepTy {
+ };
+
+ extern MAYBE_UNUSED ReverseStepTy reverse_step;
+
+ struct EmptyTy {
+ };
+
+ extern MAYBE_UNUSED EmptyTy empty;
+
+ struct FullTy {
+ };
+
+ extern MAYBE_UNUSED FullTy full;
+
+ struct UndefinedTy {
+ };
+
+ extern MAYBE_UNUSED UndefinedTy undefined;
+}
diff --git a/thirdparty/embree/common/math/interval.h b/thirdparty/embree/common/math/interval.h
new file mode 100644
index 0000000000..310add2129
--- /dev/null
+++ b/thirdparty/embree/common/math/interval.h
@@ -0,0 +1,161 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec2.h"
+#include "vec3.h"
+#include "bbox.h"
+
+namespace embree
+{
+ template<typename V>
+ struct Interval
+ {
+ V lower, upper;
+
+ __forceinline Interval() {}
+ __forceinline Interval ( const Interval& other ) { lower = other.lower; upper = other.upper; }
+ __forceinline Interval& operator=( const Interval& other ) { lower = other.lower; upper = other.upper; return *this; }
+
+ __forceinline Interval(const V& a) : lower(a), upper(a) {}
+ __forceinline Interval(const V& lower, const V& upper) : lower(lower), upper(upper) {}
+ __forceinline Interval(const BBox<V>& a) : lower(a.lower), upper(a.upper) {}
+
+ /*! tests if box is empty */
+ //__forceinline bool empty() const { return lower > upper; }
+
+ /*! computes the size of the interval */
+ __forceinline V size() const { return upper - lower; }
+
+ __forceinline V center() const { return 0.5f*(lower+upper); }
+
+ __forceinline const Interval& extend(const Interval& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; }
+ __forceinline const Interval& extend(const V & other) { lower = min(lower,other ); upper = max(upper,other ); return *this; }
+
+ __forceinline friend Interval operator +( const Interval& a, const Interval& b ) {
+ return Interval(a.lower+b.lower,a.upper+b.upper);
+ }
+
+ __forceinline friend Interval operator -( const Interval& a, const Interval& b ) {
+ return Interval(a.lower-b.upper,a.upper-b.lower);
+ }
+
+ __forceinline friend Interval operator -( const Interval& a, const V& b ) {
+ return Interval(a.lower-b,a.upper-b);
+ }
+
+ __forceinline friend Interval operator *( const Interval& a, const Interval& b )
+ {
+ const V ll = a.lower*b.lower;
+ const V lu = a.lower*b.upper;
+ const V ul = a.upper*b.lower;
+ const V uu = a.upper*b.upper;
+ return Interval(min(ll,lu,ul,uu),max(ll,lu,ul,uu));
+ }
+
+ __forceinline friend Interval merge( const Interval& a, const Interval& b) {
+ return Interval(min(a.lower,b.lower),max(a.upper,b.upper));
+ }
+
+ __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c) {
+ return merge(merge(a,b),c);
+ }
+
+ __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c, const Interval& d) {
+ return merge(merge(a,b),merge(c,d));
+ }
+
+ /*! intersect bounding boxes */
+ __forceinline friend const Interval intersect( const Interval& a, const Interval& b ) { return Interval(max(a.lower, b.lower), min(a.upper, b.upper)); }
+ __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c ) { return intersect(a,intersect(b,c)); }
+ __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c, const Interval& d ) { return intersect(intersect(a,b),intersect(c,d)); }
+
+ friend embree_ostream operator<<(embree_ostream cout, const Interval& a) {
+ return cout << "[" << a.lower << ", " << a.upper << "]";
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Interval( EmptyTy ) : lower(pos_inf), upper(neg_inf) {}
+ __forceinline Interval( FullTy ) : lower(neg_inf), upper(pos_inf) {}
+ };
+
+ __forceinline bool isEmpty(const Interval<float>& v) {
+ return v.lower > v.upper;
+ }
+
+ __forceinline vboolx isEmpty(const Interval<vfloatx>& v) {
+ return v.lower > v.upper;
+ }
+
+ /*! subset relation */
+ template<typename T> __forceinline bool subset( const Interval<T>& a, const Interval<T>& b ) {
+ return (a.lower > b.lower) && (a.upper < b.upper);
+ }
+
+ template<typename T> __forceinline bool subset( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) {
+ return subset(a.x,b.x) && subset(a.y,b.y);
+ }
+
+ template<typename T> __forceinline const Vec2<Interval<T>> intersect( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) {
+ return Vec2<Interval<T>>(intersect(a.x,b.x),intersect(a.y,b.y));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Interval<T> select ( bool s, const Interval<T>& t, const Interval<T>& f ) {
+ return Interval<T>(select(s,t.lower,f.lower),select(s,t.upper,f.upper));
+ }
+
+ template<typename T> __forceinline Interval<T> select ( const typename T::Bool& s, const Interval<T>& t, const Interval<T>& f ) {
+ return Interval<T>(select(s,t.lower,f.lower),select(s,t.upper,f.upper));
+ }
+
+ __forceinline int numRoots(const Interval<float>& p0, const Interval<float>& p1)
+ {
+ float eps = 1E-4f;
+ bool neg0 = p0.lower < eps; bool pos0 = p0.upper > -eps;
+ bool neg1 = p1.lower < eps; bool pos1 = p1.upper > -eps;
+ return (neg0 && pos1) || (pos0 && neg1) || (neg0 && pos0) || (neg1 && pos1);
+ }
+
+ typedef Interval<float> Interval1f;
+ typedef Vec2<Interval<float>> Interval2f;
+ typedef Vec3<Interval<float>> Interval3f;
+
+inline void swap(float& a, float& b) { float tmp = a; a = b; b = tmp; }
+
+inline Interval1f shift(const Interval1f& v, float shift) { return Interval1f(v.lower + shift, v.upper + shift); }
+
+#define TWO_PI (2.0*M_PI)
+inline Interval1f sin(Interval1f interval)
+{
+ if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); }
+ if (interval.upper > TWO_PI) { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); }
+ if (interval.lower < 0) { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); }
+ float sinLower = sin(interval.lower);
+ float sinUpper = sin(interval.upper);
+ if (sinLower > sinUpper) swap(sinLower, sinUpper);
+ if (interval.lower < M_PI / 2.0 && interval.upper > M_PI / 2.0) sinUpper = 1.0;
+ if (interval.lower < 3.0 * M_PI / 2.0 && interval.upper > 3.0 * M_PI / 2.0) sinLower = -1.0;
+ return Interval1f(sinLower, sinUpper);
+}
+
+inline Interval1f cos(Interval1f interval)
+{
+ if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); }
+ if (interval.upper > TWO_PI) { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); }
+ if (interval.lower < 0) { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); }
+ float cosLower = cos(interval.lower);
+ float cosUpper = cos(interval.upper);
+ if (cosLower > cosUpper) swap(cosLower, cosUpper);
+ if (interval.lower < M_PI && interval.upper > M_PI) cosLower = -1.0;
+ return Interval1f(cosLower, cosUpper);
+}
+#undef TWO_PI
+}
diff --git a/thirdparty/embree/common/math/lbbox.h b/thirdparty/embree/common/math/lbbox.h
new file mode 100644
index 0000000000..2b397a05c8
--- /dev/null
+++ b/thirdparty/embree/common/math/lbbox.h
@@ -0,0 +1,289 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bbox.h"
+#include "range.h"
+
+namespace embree
+{
+ template<typename T>
+ __forceinline std::pair<T,T> globalLinear(const std::pair<T,T>& v, const BBox1f& dt)
+ {
+ const float rcp_dt_size = float(1.0f)/dt.size();
+ const T g0 = lerp(v.first,v.second,-dt.lower*rcp_dt_size);
+ const T g1 = lerp(v.first,v.second,(1.0f-dt.lower)*rcp_dt_size);
+ return std::make_pair(g0,g1);
+ }
+
+ template<typename T>
+ struct LBBox
+ {
+ public:
+ __forceinline LBBox () {}
+
+ template<typename T1>
+ __forceinline LBBox ( const LBBox<T1>& other )
+ : bounds0(other.bounds0), bounds1(other.bounds1) {}
+
+ __forceinline LBBox& operator= ( const LBBox& other ) {
+ bounds0 = other.bounds0; bounds1 = other.bounds1; return *this;
+ }
+
+ __forceinline LBBox (EmptyTy)
+ : bounds0(EmptyTy()), bounds1(EmptyTy()) {}
+
+ __forceinline explicit LBBox ( const BBox<T>& bounds)
+ : bounds0(bounds), bounds1(bounds) { }
+
+ __forceinline LBBox ( const BBox<T>& bounds0, const BBox<T>& bounds1)
+ : bounds0(bounds0), bounds1(bounds1) { }
+
+ LBBox ( const avector<BBox<T>>& bounds )
+ {
+ assert(bounds.size());
+ BBox<T> b0 = bounds.front();
+ BBox<T> b1 = bounds.back();
+ for (size_t i=1; i<bounds.size()-1; i++) {
+ const float f = float(i)/float(bounds.size()-1);
+ const BBox<T> bt = lerp(b0,b1,f);
+ const T dlower = min(bounds[i].lower-bt.lower,T(zero));
+ const T dupper = max(bounds[i].upper-bt.upper,T(zero));
+ b0.lower += dlower; b1.lower += dlower;
+ b0.upper += dupper; b1.upper += dupper;
+ }
+ bounds0 = b0;
+ bounds1 = b1;
+ }
+
+ /*! calculates the linear bounds of a primitive for the specified time range */
+ template<typename BoundsFunc>
+ __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range, float numTimeSegments)
+ {
+ const float lower = time_range.lower*numTimeSegments;
+ const float upper = time_range.upper*numTimeSegments;
+ const float ilowerf = floor(lower);
+ const float iupperf = ceil(upper);
+ const int ilower = (int)ilowerf;
+ const int iupper = (int)iupperf;
+
+ const BBox<T> blower0 = bounds(ilower);
+ const BBox<T> bupper1 = bounds(iupper);
+
+ if (iupper-ilower == 1) {
+ bounds0 = lerp(blower0, bupper1, lower-ilowerf);
+ bounds1 = lerp(bupper1, blower0, iupperf-upper);
+ return;
+ }
+
+ const BBox<T> blower1 = bounds(ilower+1);
+ const BBox<T> bupper0 = bounds(iupper-1);
+ BBox<T> b0 = lerp(blower0, blower1, lower-ilowerf);
+ BBox<T> b1 = lerp(bupper1, bupper0, iupperf-upper);
+
+ for (int i = ilower+1; i < iupper; i++)
+ {
+ const float f = (float(i)/numTimeSegments - time_range.lower) / time_range.size();
+ const BBox<T> bt = lerp(b0, b1, f);
+ const BBox<T> bi = bounds(i);
+ const T dlower = min(bi.lower-bt.lower, T(zero));
+ const T dupper = max(bi.upper-bt.upper, T(zero));
+ b0.lower += dlower; b1.lower += dlower;
+ b0.upper += dupper; b1.upper += dupper;
+ }
+
+ bounds0 = b0;
+ bounds1 = b1;
+ }
+
+ /*! calculates the linear bounds of a primitive for the specified time range */
+ template<typename BoundsFunc>
+ __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range_in, const BBox1f& geom_time_range, float geom_time_segments)
+ {
+ /* normalize global time_range_in to local geom_time_range */
+ const BBox1f time_range((time_range_in.lower-geom_time_range.lower)/geom_time_range.size(),
+ (time_range_in.upper-geom_time_range.lower)/geom_time_range.size());
+
+ const float lower = time_range.lower*geom_time_segments;
+ const float upper = time_range.upper*geom_time_segments;
+ const float ilowerf = floor(lower);
+ const float iupperf = ceil(upper);
+ const float ilowerfc = max(0.0f,ilowerf);
+ const float iupperfc = min(iupperf,geom_time_segments);
+ const int ilowerc = (int)ilowerfc;
+ const int iupperc = (int)iupperfc;
+ assert(iupperc-ilowerc > 0);
+
+ /* this larger iteration range guarantees that we process borders of geom_time_range is (partially) inside time_range_in */
+ const int ilower_iter = max(-1,(int)ilowerf);
+ const int iupper_iter = min((int)iupperf,(int)geom_time_segments+1);
+
+ const BBox<T> blower0 = bounds(ilowerc);
+ const BBox<T> bupper1 = bounds(iupperc);
+ if (iupper_iter-ilower_iter == 1) {
+ bounds0 = lerp(blower0, bupper1, max(0.0f,lower-ilowerfc));
+ bounds1 = lerp(bupper1, blower0, max(0.0f,iupperfc-upper));
+ return;
+ }
+
+ const BBox<T> blower1 = bounds(ilowerc+1);
+ const BBox<T> bupper0 = bounds(iupperc-1);
+ BBox<T> b0 = lerp(blower0, blower1, max(0.0f,lower-ilowerfc));
+ BBox<T> b1 = lerp(bupper1, bupper0, max(0.0f,iupperfc-upper));
+
+ for (int i = ilower_iter+1; i < iupper_iter; i++)
+ {
+ const float f = (float(i)/geom_time_segments - time_range.lower) / time_range.size();
+ const BBox<T> bt = lerp(b0, b1, f);
+ const BBox<T> bi = bounds(i);
+ const T dlower = min(bi.lower-bt.lower, T(zero));
+ const T dupper = max(bi.upper-bt.upper, T(zero));
+ b0.lower += dlower; b1.lower += dlower;
+ b0.upper += dupper; b1.upper += dupper;
+ }
+
+ bounds0 = b0;
+ bounds1 = b1;
+ }
+
+ /*! calculates the linear bounds of a primitive for the specified time range */
+ template<typename BoundsFunc>
+ __forceinline LBBox(const BoundsFunc& bounds, const range<int>& time_range, int numTimeSegments)
+ {
+ const int ilower = time_range.begin();
+ const int iupper = time_range.end();
+
+ BBox<T> b0 = bounds(ilower);
+ BBox<T> b1 = bounds(iupper);
+
+ if (iupper-ilower == 1)
+ {
+ bounds0 = b0;
+ bounds1 = b1;
+ return;
+ }
+
+ for (int i = ilower+1; i<iupper; i++)
+ {
+ const float f = float(i - time_range.begin()) / float(time_range.size());
+ const BBox<T> bt = lerp(b0, b1, f);
+ const BBox<T> bi = bounds(i);
+ const T dlower = min(bi.lower-bt.lower, T(zero));
+ const T dupper = max(bi.upper-bt.upper, T(zero));
+ b0.lower += dlower; b1.lower += dlower;
+ b0.upper += dupper; b1.upper += dupper;
+ }
+
+ bounds0 = b0;
+ bounds1 = b1;
+ }
+
+ public:
+
+ __forceinline bool empty() const {
+ return bounds().empty();
+ }
+
+ __forceinline BBox<T> bounds () const {
+ return merge(bounds0,bounds1);
+ }
+
+ __forceinline BBox<T> interpolate( const float t ) const {
+ return lerp(bounds0,bounds1,t);
+ }
+
+ __forceinline LBBox<T> interpolate( const BBox1f& dt ) const {
+ return LBBox<T>(interpolate(dt.lower),interpolate(dt.upper));
+ }
+
+ __forceinline void extend( const LBBox& other ) {
+ bounds0.extend(other.bounds0);
+ bounds1.extend(other.bounds1);
+ }
+
+ __forceinline float expectedHalfArea() const;
+
+ __forceinline float expectedHalfArea(const BBox1f& dt) const {
+ return interpolate(dt).expectedHalfArea();
+ }
+
+ __forceinline float expectedApproxHalfArea() const {
+ return 0.5f*(halfArea(bounds0) + halfArea(bounds1));
+ }
+
+ /* calculates bounds for [0,1] time range from bounds in dt time range */
+ __forceinline LBBox global(const BBox1f& dt) const
+ {
+ const float rcp_dt_size = 1.0f/dt.size();
+ const BBox<T> b0 = interpolate(-dt.lower*rcp_dt_size);
+ const BBox<T> b1 = interpolate((1.0f-dt.lower)*rcp_dt_size);
+ return LBBox(b0,b1);
+ }
+
+ /*! Comparison Operators */
+ //template<typename TT> friend __forceinline bool operator==( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; }
+ //template<typename TT> friend __forceinline bool operator!=( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; }
+ friend __forceinline bool operator==( const LBBox& a, const LBBox& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; }
+ friend __forceinline bool operator!=( const LBBox& a, const LBBox& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; }
+
+ /*! output operator */
+ friend __forceinline embree_ostream operator<<(embree_ostream cout, const LBBox& box) {
+ return cout << "LBBox { " << box.bounds0 << "; " << box.bounds1 << " }";
+ }
+
+ public:
+ BBox<T> bounds0, bounds1;
+ };
+
+ /*! tests if box is finite */
+ template<typename T>
+ __forceinline bool isvalid( const LBBox<T>& v ) {
+ return isvalid(v.bounds0) && isvalid(v.bounds1);
+ }
+
+ template<typename T>
+ __forceinline bool isvalid_non_empty( const LBBox<T>& v ) {
+ return isvalid_non_empty(v.bounds0) && isvalid_non_empty(v.bounds1);
+ }
+
+ template<typename T>
+ __forceinline T expectedArea(const T& a0, const T& a1, const T& b0, const T& b1)
+ {
+ const T da = a1-a0;
+ const T db = b1-b0;
+ return a0*b0+(a0*db+da*b0)*T(0.5f) + da*db*T(1.0f/3.0f);
+ }
+
+ template<> __forceinline float LBBox<Vec3fa>::expectedHalfArea() const
+ {
+ const Vec3fa d0 = bounds0.size();
+ const Vec3fa d1 = bounds1.size();
+ return reduce_add(expectedArea(Vec3fa(d0.x,d0.y,d0.z),
+ Vec3fa(d1.x,d1.y,d1.z),
+ Vec3fa(d0.y,d0.z,d0.x),
+ Vec3fa(d1.y,d1.z,d1.x)));
+ }
+
+ template<typename T>
+ __forceinline float expectedApproxHalfArea(const LBBox<T>& box) {
+ return box.expectedApproxHalfArea();
+ }
+
+ template<typename T>
+ __forceinline LBBox<T> merge(const LBBox<T>& a, const LBBox<T>& b) {
+ return LBBox<T>(merge(a.bounds0, b.bounds0), merge(a.bounds1, b.bounds1));
+ }
+
+ /*! subset relation */
+ template<typename T> __inline bool subset( const LBBox<T>& a, const LBBox<T>& b ) {
+ return subset(a.bounds0,b.bounds0) && subset(a.bounds1,b.bounds1);
+ }
+
+ /*! default template instantiations */
+ typedef LBBox<float> LBBox1f;
+ typedef LBBox<Vec2f> LBBox2f;
+ typedef LBBox<Vec3f> LBBox3f;
+ typedef LBBox<Vec3fa> LBBox3fa;
+ typedef LBBox<Vec3fx> LBBox3fx;
+}
diff --git a/thirdparty/embree/common/math/linearspace2.h b/thirdparty/embree/common/math/linearspace2.h
new file mode 100644
index 0000000000..184ee695fb
--- /dev/null
+++ b/thirdparty/embree/common/math/linearspace2.h
@@ -0,0 +1,148 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec2.h"
+
+namespace embree
+{
+ ////////////////////////////////////////////////////////////////////////////////
+ /// 2D Linear Transform (2x2 Matrix)
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> struct LinearSpace2
+ {
+ typedef T Vector;
+ typedef typename T::Scalar Scalar;
+
+ /*! default matrix constructor */
+ __forceinline LinearSpace2 ( ) {}
+ __forceinline LinearSpace2 ( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; }
+ __forceinline LinearSpace2& operator=( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; return *this; }
+
+ template<typename L1> __forceinline LinearSpace2( const LinearSpace2<L1>& s ) : vx(s.vx), vy(s.vy) {}
+
+ /*! matrix construction from column vectors */
+ __forceinline LinearSpace2(const Vector& vx, const Vector& vy)
+ : vx(vx), vy(vy) {}
+
+ /*! matrix construction from row mayor data */
+ __forceinline LinearSpace2(const Scalar& m00, const Scalar& m01,
+ const Scalar& m10, const Scalar& m11)
+ : vx(m00,m10), vy(m01,m11) {}
+
+ /*! compute the determinant of the matrix */
+ __forceinline const Scalar det() const { return vx.x*vy.y - vx.y*vy.x; }
+
+ /*! compute adjoint matrix */
+ __forceinline const LinearSpace2 adjoint() const { return LinearSpace2(vy.y,-vy.x,-vx.y,vx.x); }
+
+ /*! compute inverse matrix */
+ __forceinline const LinearSpace2 inverse() const { return adjoint()/det(); }
+
+ /*! compute transposed matrix */
+ __forceinline const LinearSpace2 transposed() const { return LinearSpace2(vx.x,vx.y,vy.x,vy.y); }
+
+ /*! returns first row of matrix */
+ __forceinline Vector row0() const { return Vector(vx.x,vy.x); }
+
+ /*! returns second row of matrix */
+ __forceinline Vector row1() const { return Vector(vx.y,vy.y); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline LinearSpace2( ZeroTy ) : vx(zero), vy(zero) {}
+ __forceinline LinearSpace2( OneTy ) : vx(one, zero), vy(zero, one) {}
+
+ /*! return matrix for scaling */
+ static __forceinline LinearSpace2 scale(const Vector& s) {
+ return LinearSpace2(s.x, 0,
+ 0 , s.y);
+ }
+
+ /*! return matrix for rotation */
+ static __forceinline LinearSpace2 rotate(const Scalar& r) {
+ Scalar s = sin(r), c = cos(r);
+ return LinearSpace2(c, -s,
+ s, c);
+ }
+
+ /*! return closest orthogonal matrix (i.e. a general rotation including reflection) */
+ LinearSpace2 orthogonal() const
+ {
+ LinearSpace2 m = *this;
+
+ // mirrored?
+ Scalar mirror(one);
+ if (m.det() < Scalar(zero)) {
+ m.vx = -m.vx;
+ mirror = -mirror;
+ }
+
+ // rotation
+ for (int i = 0; i < 99; i++) {
+ const LinearSpace2 m_next = 0.5 * (m + m.transposed().inverse());
+ const LinearSpace2 d = m_next - m;
+ m = m_next;
+ // norm^2 of difference small enough?
+ if (max(dot(d.vx, d.vx), dot(d.vy, d.vy)) < 1e-8)
+ break;
+ }
+
+ // rotation * mirror_x
+ return LinearSpace2(mirror*m.vx, m.vy);
+ }
+
+ public:
+
+ /*! the column vectors of the matrix */
+ Vector vx,vy;
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a ) { return LinearSpace2<T>(-a.vx,-a.vy); }
+ template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a ) { return LinearSpace2<T>(+a.vx,+a.vy); }
+ template<typename T> __forceinline LinearSpace2<T> rcp ( const LinearSpace2<T>& a ) { return a.inverse(); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx+b.vx,a.vy+b.vy); }
+ template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx-b.vx,a.vy-b.vy); }
+
+ template<typename T> __forceinline LinearSpace2<T> operator*(const typename T::Scalar & a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); }
+ template<typename T> __forceinline T operator*(const LinearSpace2<T>& a, const T & b) { return b.x*a.vx + b.y*a.vy; }
+ template<typename T> __forceinline LinearSpace2<T> operator*(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); }
+
+ template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const typename T::Scalar & b) { return LinearSpace2<T>(a.vx/b, a.vy/b); }
+ template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return a * rcp(b); }
+
+ template<typename T> __forceinline LinearSpace2<T>& operator *=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a * b; }
+ template<typename T> __forceinline LinearSpace2<T>& operator /=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a / b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline bool operator ==( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx == b.vx && a.vy == b.vy; }
+ template<typename T> __forceinline bool operator !=( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx != b.vx || a.vy != b.vy; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace2<T>& m) {
+ return cout << "{ vx = " << m.vx << ", vy = " << m.vy << "}";
+ }
+
+ /*! Shortcuts for common linear spaces. */
+ typedef LinearSpace2<Vec2f> LinearSpace2f;
+ typedef LinearSpace2<Vec2fa> LinearSpace2fa;
+}
diff --git a/thirdparty/embree/common/math/linearspace3.h b/thirdparty/embree/common/math/linearspace3.h
new file mode 100644
index 0000000000..9eaa2cc2bb
--- /dev/null
+++ b/thirdparty/embree/common/math/linearspace3.h
@@ -0,0 +1,213 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec3.h"
+#include "quaternion.h"
+
+namespace embree
+{
+ ////////////////////////////////////////////////////////////////////////////////
+ /// 3D Linear Transform (3x3 Matrix)
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> struct LinearSpace3
+ {
+ typedef T Vector;
+ typedef typename T::Scalar Scalar;
+
+ /*! default matrix constructor */
+ __forceinline LinearSpace3 ( ) {}
+ __forceinline LinearSpace3 ( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; }
+ __forceinline LinearSpace3& operator=( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; return *this; }
+
+ template<typename L1> __forceinline LinearSpace3( const LinearSpace3<L1>& s ) : vx(s.vx), vy(s.vy), vz(s.vz) {}
+
+ /*! matrix construction from column vectors */
+ __forceinline LinearSpace3(const Vector& vx, const Vector& vy, const Vector& vz)
+ : vx(vx), vy(vy), vz(vz) {}
+
+ /*! construction from quaternion */
+ __forceinline LinearSpace3( const QuaternionT<Scalar>& q )
+ : vx((q.r*q.r + q.i*q.i - q.j*q.j - q.k*q.k), 2.0f*(q.i*q.j + q.r*q.k), 2.0f*(q.i*q.k - q.r*q.j))
+ , vy(2.0f*(q.i*q.j - q.r*q.k), (q.r*q.r - q.i*q.i + q.j*q.j - q.k*q.k), 2.0f*(q.j*q.k + q.r*q.i))
+ , vz(2.0f*(q.i*q.k + q.r*q.j), 2.0f*(q.j*q.k - q.r*q.i), (q.r*q.r - q.i*q.i - q.j*q.j + q.k*q.k)) {}
+
+ /*! matrix construction from row mayor data */
+ __forceinline LinearSpace3(const Scalar& m00, const Scalar& m01, const Scalar& m02,
+ const Scalar& m10, const Scalar& m11, const Scalar& m12,
+ const Scalar& m20, const Scalar& m21, const Scalar& m22)
+ : vx(m00,m10,m20), vy(m01,m11,m21), vz(m02,m12,m22) {}
+
+ /*! compute the determinant of the matrix */
+ __forceinline const Scalar det() const { return dot(vx,cross(vy,vz)); }
+
+ /*! compute adjoint matrix */
+ __forceinline const LinearSpace3 adjoint() const { return LinearSpace3(cross(vy,vz),cross(vz,vx),cross(vx,vy)).transposed(); }
+
+ /*! compute inverse matrix */
+ __forceinline const LinearSpace3 inverse() const { return adjoint()/det(); }
+
+ /*! compute transposed matrix */
+ __forceinline const LinearSpace3 transposed() const { return LinearSpace3(vx.x,vx.y,vx.z,vy.x,vy.y,vy.z,vz.x,vz.y,vz.z); }
+
+ /*! returns first row of matrix */
+ __forceinline Vector row0() const { return Vector(vx.x,vy.x,vz.x); }
+
+ /*! returns second row of matrix */
+ __forceinline Vector row1() const { return Vector(vx.y,vy.y,vz.y); }
+
+ /*! returns third row of matrix */
+ __forceinline Vector row2() const { return Vector(vx.z,vy.z,vz.z); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline LinearSpace3( ZeroTy ) : vx(zero), vy(zero), vz(zero) {}
+ __forceinline LinearSpace3( OneTy ) : vx(one, zero, zero), vy(zero, one, zero), vz(zero, zero, one) {}
+
+ /*! return matrix for scaling */
+ static __forceinline LinearSpace3 scale(const Vector& s) {
+ return LinearSpace3(s.x, 0, 0,
+ 0 , s.y, 0,
+ 0 , 0, s.z);
+ }
+
+ /*! return matrix for rotation around arbitrary axis */
+ static __forceinline LinearSpace3 rotate(const Vector& _u, const Scalar& r) {
+ Vector u = normalize(_u);
+ Scalar s = sin(r), c = cos(r);
+ return LinearSpace3(u.x*u.x+(1-u.x*u.x)*c, u.x*u.y*(1-c)-u.z*s, u.x*u.z*(1-c)+u.y*s,
+ u.x*u.y*(1-c)+u.z*s, u.y*u.y+(1-u.y*u.y)*c, u.y*u.z*(1-c)-u.x*s,
+ u.x*u.z*(1-c)-u.y*s, u.y*u.z*(1-c)+u.x*s, u.z*u.z+(1-u.z*u.z)*c);
+ }
+
+ public:
+
+ /*! the column vectors of the matrix */
+ Vector vx,vy,vz;
+ };
+
+ /*! compute transposed matrix */
+ template<> __forceinline const LinearSpace3<Vec3fa> LinearSpace3<Vec3fa>::transposed() const {
+ vfloat4 rx,ry,rz; transpose((vfloat4&)vx,(vfloat4&)vy,(vfloat4&)vz,vfloat4(zero),rx,ry,rz);
+ return LinearSpace3<Vec3fa>(Vec3fa(rx),Vec3fa(ry),Vec3fa(rz));
+ }
+
+ template<typename T>
+ __forceinline const LinearSpace3<T> transposed(const LinearSpace3<T>& xfm) {
+ return xfm.transposed();
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a ) { return LinearSpace3<T>(-a.vx,-a.vy,-a.vz); }
+ template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a ) { return LinearSpace3<T>(+a.vx,+a.vy,+a.vz); }
+ template<typename T> __forceinline LinearSpace3<T> rcp ( const LinearSpace3<T>& a ) { return a.inverse(); }
+
+ /* constructs a coordinate frame form a normalized normal */
+ template<typename T> __forceinline LinearSpace3<T> frame(const T& N)
+ {
+ const T dx0(0,N.z,-N.y);
+ const T dx1(-N.z,0,N.x);
+ const T dx = normalize(select(dot(dx0,dx0) > dot(dx1,dx1),dx0,dx1));
+ const T dy = normalize(cross(N,dx));
+ return LinearSpace3<T>(dx,dy,N);
+ }
+
+ /* constructs a coordinate frame from a normal and approximate x-direction */
+ template<typename T> __forceinline LinearSpace3<T> frame(const T& N, const T& dxi)
+ {
+ if (abs(dot(dxi,N)) > 0.99f) return frame(N); // fallback in case N and dxi are very parallel
+ const T dx = normalize(cross(dxi,N));
+ const T dy = normalize(cross(N,dx));
+ return LinearSpace3<T>(dx,dy,N);
+ }
+
+ /* clamps linear space to range -1 to +1 */
+ template<typename T> __forceinline LinearSpace3<T> clamp(const LinearSpace3<T>& space) {
+ return LinearSpace3<T>(clamp(space.vx,T(-1.0f),T(1.0f)),
+ clamp(space.vy,T(-1.0f),T(1.0f)),
+ clamp(space.vz,T(-1.0f),T(1.0f)));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx+b.vx,a.vy+b.vy,a.vz+b.vz); }
+ template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx-b.vx,a.vy-b.vy,a.vz-b.vz); }
+
+ template<typename T> __forceinline LinearSpace3<T> operator*(const typename T::Scalar & a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); }
+ template<typename T> __forceinline T operator*(const LinearSpace3<T>& a, const T & b) { return madd(T(b.x),a.vx,madd(T(b.y),a.vy,T(b.z)*a.vz)); }
+ template<typename T> __forceinline LinearSpace3<T> operator*(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); }
+
+ template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const typename T::Scalar & b) { return LinearSpace3<T>(a.vx/b, a.vy/b, a.vz/b); }
+ template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return a * rcp(b); }
+
+ template<typename T> __forceinline LinearSpace3<T>& operator *=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a * b; }
+ template<typename T> __forceinline LinearSpace3<T>& operator /=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a / b; }
+
+ template<typename T> __forceinline T xfmPoint (const LinearSpace3<T>& s, const T & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); }
+ template<typename T> __forceinline T xfmVector(const LinearSpace3<T>& s, const T & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); }
+ template<typename T> __forceinline T xfmNormal(const LinearSpace3<T>& s, const T & a) { return xfmVector(s.inverse().transposed(),a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline bool operator ==( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; }
+ template<typename T> __forceinline bool operator !=( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline LinearSpace3<T> select ( const typename T::Scalar::Bool& s, const LinearSpace3<T>& t, const LinearSpace3<T>& f ) {
+ return LinearSpace3<T>(select(s,t.vx,f.vx),select(s,t.vy,f.vy),select(s,t.vz,f.vz));
+ }
+
+ /*! blending */
+ template<typename T>
+ __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0, const LinearSpace3<T>& l1, const float t)
+ {
+ return LinearSpace3<T>(lerp(l0.vx,l1.vx,t),
+ lerp(l0.vy,l1.vy,t),
+ lerp(l0.vz,l1.vz,t));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace3<T>& m) {
+ return cout << "{ vx = " << m.vx << ", vy = " << m.vy << ", vz = " << m.vz << "}";
+ }
+
+ /*! Shortcuts for common linear spaces. */
+ typedef LinearSpace3<Vec3f> LinearSpace3f;
+ typedef LinearSpace3<Vec3fa> LinearSpace3fa;
+ typedef LinearSpace3<Vec3fx> LinearSpace3fx;
+ typedef LinearSpace3<Vec3ff> LinearSpace3ff;
+
+ template<int N> using LinearSpace3vf = LinearSpace3<Vec3<vfloat<N>>>;
+ typedef LinearSpace3<Vec3<vfloat<4>>> LinearSpace3vf4;
+ typedef LinearSpace3<Vec3<vfloat<8>>> LinearSpace3vf8;
+ typedef LinearSpace3<Vec3<vfloat<16>>> LinearSpace3vf16;
+
+ /*! blending */
+ template<typename T, typename S>
+ __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0,
+ const LinearSpace3<T>& l1,
+ const S& t)
+ {
+ return LinearSpace3<T>(lerp(l0.vx,l1.vx,t),
+ lerp(l0.vy,l1.vy,t),
+ lerp(l0.vz,l1.vz,t));
+ }
+
+}
diff --git a/thirdparty/embree/common/math/math.h b/thirdparty/embree/common/math/math.h
new file mode 100644
index 0000000000..4bc54c1a6a
--- /dev/null
+++ b/thirdparty/embree/common/math/math.h
@@ -0,0 +1,369 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/intrinsics.h"
+#include "constants.h"
+#include <cmath>
+
+#if defined(__ARM_NEON)
+#include "../simd/arm/emulation.h"
+#else
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#include <immintrin.h>
+#endif
+
+#if defined(__WIN32__)
+#if defined(_MSC_VER) && (_MSC_VER <= 1700)
+namespace std
+{
+ __forceinline bool isinf ( const float x ) { return _finite(x) == 0; }
+ __forceinline bool isnan ( const float x ) { return _isnan(x) != 0; }
+ __forceinline bool isfinite (const float x) { return _finite(x) != 0; }
+}
+#endif
+#endif
+
+namespace embree
+{
+ __forceinline bool isvalid ( const float& v ) {
+ return (v > -FLT_LARGE) & (v < +FLT_LARGE);
+ }
+
+ __forceinline int cast_f2i(float f) {
+ union { float f; int i; } v; v.f = f; return v.i;
+ }
+
+ __forceinline float cast_i2f(int i) {
+ union { float f; int i; } v; v.i = i; return v.f;
+ }
+
+ __forceinline int toInt (const float& a) { return int(a); }
+ __forceinline float toFloat(const int& a) { return float(a); }
+
+#if defined(__WIN32__)
+ __forceinline bool finite ( const float x ) { return _finite(x) != 0; }
+#endif
+
+ __forceinline float sign ( const float x ) { return x<0?-1.0f:1.0f; }
+ __forceinline float sqr ( const float x ) { return x*x; }
+
+ __forceinline float rcp ( const float x )
+ {
+ const __m128 a = _mm_set_ss(x);
+
+#if defined(__AVX512VL__)
+ const __m128 r = _mm_rcp14_ss(_mm_set_ss(0.0f),a);
+#else
+ const __m128 r = _mm_rcp_ss(a);
+#endif
+
+#if defined(__AVX2__)
+ return _mm_cvtss_f32(_mm_mul_ss(r,_mm_fnmadd_ss(r, a, _mm_set_ss(2.0f))));
+#else
+ return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
+#endif
+ }
+
+ __forceinline float signmsk ( const float x ) {
+ return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+ }
+ __forceinline float xorf( const float x, const float y ) {
+ return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
+ }
+ __forceinline float andf( const float x, const unsigned y ) {
+ return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
+ }
+ __forceinline float rsqrt( const float x )
+ {
+ const __m128 a = _mm_set_ss(x);
+#if defined(__AVX512VL__)
+ __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a);
+#else
+ __m128 r = _mm_rsqrt_ss(a);
+#endif
+ r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
+#if defined(__ARM_NEON)
+ r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
+#endif
+ return _mm_cvtss_f32(r);
+ }
+
+#if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700)
+ __forceinline float nextafter(float x, float y) { if ((x<y) == (x>0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); }
+ __forceinline double nextafter(double x, double y) { return _nextafter(x, y); }
+ __forceinline int roundf(float f) { return (int)(f + 0.5f); }
+#else
+ __forceinline float nextafter(float x, float y) { return ::nextafterf(x, y); }
+ __forceinline double nextafter(double x, double y) { return ::nextafter(x, y); }
+#endif
+
+ __forceinline float abs ( const float x ) { return ::fabsf(x); }
+ __forceinline float acos ( const float x ) { return ::acosf (x); }
+ __forceinline float asin ( const float x ) { return ::asinf (x); }
+ __forceinline float atan ( const float x ) { return ::atanf (x); }
+ __forceinline float atan2( const float y, const float x ) { return ::atan2f(y, x); }
+ __forceinline float cos ( const float x ) { return ::cosf (x); }
+ __forceinline float cosh ( const float x ) { return ::coshf (x); }
+ __forceinline float exp ( const float x ) { return ::expf (x); }
+ __forceinline float fmod ( const float x, const float y ) { return ::fmodf (x, y); }
+ __forceinline float log ( const float x ) { return ::logf (x); }
+ __forceinline float log10( const float x ) { return ::log10f(x); }
+ __forceinline float pow ( const float x, const float y ) { return ::powf (x, y); }
+ __forceinline float sin ( const float x ) { return ::sinf (x); }
+ __forceinline float sinh ( const float x ) { return ::sinhf (x); }
+ __forceinline float sqrt ( const float x ) { return ::sqrtf (x); }
+ __forceinline float tan ( const float x ) { return ::tanf (x); }
+ __forceinline float tanh ( const float x ) { return ::tanhf (x); }
+ __forceinline float floor( const float x ) { return ::floorf (x); }
+ __forceinline float ceil ( const float x ) { return ::ceilf (x); }
+ __forceinline float frac ( const float x ) { return x-floor(x); }
+
+ __forceinline double abs ( const double x ) { return ::fabs(x); }
+ __forceinline double sign ( const double x ) { return x<0?-1.0:1.0; }
+ __forceinline double acos ( const double x ) { return ::acos (x); }
+ __forceinline double asin ( const double x ) { return ::asin (x); }
+ __forceinline double atan ( const double x ) { return ::atan (x); }
+ __forceinline double atan2( const double y, const double x ) { return ::atan2(y, x); }
+ __forceinline double cos ( const double x ) { return ::cos (x); }
+ __forceinline double cosh ( const double x ) { return ::cosh (x); }
+ __forceinline double exp ( const double x ) { return ::exp (x); }
+ __forceinline double fmod ( const double x, const double y ) { return ::fmod (x, y); }
+ __forceinline double log ( const double x ) { return ::log (x); }
+ __forceinline double log10( const double x ) { return ::log10(x); }
+ __forceinline double pow ( const double x, const double y ) { return ::pow (x, y); }
+ __forceinline double rcp ( const double x ) { return 1.0/x; }
+ __forceinline double rsqrt( const double x ) { return 1.0/::sqrt(x); }
+ __forceinline double sin ( const double x ) { return ::sin (x); }
+ __forceinline double sinh ( const double x ) { return ::sinh (x); }
+ __forceinline double sqr ( const double x ) { return x*x; }
+ __forceinline double sqrt ( const double x ) { return ::sqrt (x); }
+ __forceinline double tan ( const double x ) { return ::tan (x); }
+ __forceinline double tanh ( const double x ) { return ::tanh (x); }
+ __forceinline double floor( const double x ) { return ::floor (x); }
+ __forceinline double ceil ( const double x ) { return ::ceil (x); }
+
+#if defined(__SSE4_1__)
+ __forceinline float mini(float a, float b) {
+ const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
+ const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
+ const __m128i ci = _mm_min_epi32(ai,bi);
+ return _mm_cvtss_f32(_mm_castsi128_ps(ci));
+ }
+#endif
+
+#if defined(__SSE4_1__)
+ __forceinline float maxi(float a, float b) {
+ const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
+ const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
+ const __m128i ci = _mm_max_epi32(ai,bi);
+ return _mm_cvtss_f32(_mm_castsi128_ps(ci));
+ }
+#endif
+
+ template<typename T>
+ __forceinline T twice(const T& a) { return a+a; }
+
+ __forceinline int min(int a, int b) { return a<b ? a:b; }
+ __forceinline unsigned min(unsigned a, unsigned b) { return a<b ? a:b; }
+ __forceinline int64_t min(int64_t a, int64_t b) { return a<b ? a:b; }
+ __forceinline float min(float a, float b) { return a<b ? a:b; }
+ __forceinline double min(double a, double b) { return a<b ? a:b; }
+#if defined(__64BIT__)
+ __forceinline size_t min(size_t a, size_t b) { return a<b ? a:b; }
+#endif
+
+ template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); }
+ template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); }
+ template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d, const T& e) { return min(min(min(a,b),min(c,d)),e); }
+
+ template<typename T> __forceinline T mini(const T& a, const T& b, const T& c) { return mini(mini(a,b),c); }
+ template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d) { return mini(mini(a,b),mini(c,d)); }
+ template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d, const T& e) { return mini(mini(mini(a,b),mini(c,d)),e); }
+
+ __forceinline int max(int a, int b) { return a<b ? b:a; }
+ __forceinline unsigned max(unsigned a, unsigned b) { return a<b ? b:a; }
+ __forceinline int64_t max(int64_t a, int64_t b) { return a<b ? b:a; }
+ __forceinline float max(float a, float b) { return a<b ? b:a; }
+ __forceinline double max(double a, double b) { return a<b ? b:a; }
+#if defined(__64BIT__)
+ __forceinline size_t max(size_t a, size_t b) { return a<b ? b:a; }
+#endif
+
+ template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); }
+ template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); }
+ template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d, const T& e) { return max(max(max(a,b),max(c,d)),e); }
+
+ template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c) { return maxi(maxi(a,b),c); }
+ template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d) { return maxi(maxi(a,b),maxi(c,d)); }
+ template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d, const T& e) { return maxi(maxi(maxi(a,b),maxi(c,d)),e); }
+
+#if defined(__MACOSX__)
+ __forceinline ssize_t min(ssize_t a, ssize_t b) { return a<b ? a:b; }
+ __forceinline ssize_t max(ssize_t a, ssize_t b) { return a<b ? b:a; }
+#endif
+
+#if defined(__MACOSX__) && !defined(__INTEL_COMPILER)
+ __forceinline void sincosf(float x, float *sin, float *cos) {
+ __sincosf(x,sin,cos);
+ }
+#endif
+
+#if defined(__WIN32__) || defined(__FreeBSD__)
+ __forceinline void sincosf(float x, float *s, float *c) {
+ *s = sinf(x); *c = cosf(x);
+ }
+#endif
+
+ template<typename T> __forceinline T clamp(const T& x, const T& lower = T(zero), const T& upper = T(one)) { return max(min(x,upper),lower); }
+ template<typename T> __forceinline T clampz(const T& x, const T& upper) { return max(T(zero), min(x,upper)); }
+
+ template<typename T> __forceinline T deg2rad ( const T& x ) { return x * T(1.74532925199432957692e-2f); }
+ template<typename T> __forceinline T rad2deg ( const T& x ) { return x * T(5.72957795130823208768e1f); }
+ template<typename T> __forceinline T sin2cos ( const T& x ) { return sqrt(max(T(zero),T(one)-x*x)); }
+ template<typename T> __forceinline T cos2sin ( const T& x ) { return sin2cos(x); }
+
+#if defined(__AVX2__)
+ __forceinline float madd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
+ __forceinline float msub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
+ __forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
+ __forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
+#else
+ __forceinline float madd ( const float a, const float b, const float c) { return a*b+c; }
+ __forceinline float msub ( const float a, const float b, const float c) { return a*b-c; }
+ __forceinline float nmadd ( const float a, const float b, const float c) { return -a*b+c;}
+ __forceinline float nmsub ( const float a, const float b, const float c) { return -a*b-c; }
+#endif
+
+ /*! random functions */
+ template<typename T> T random() { return T(0); }
+#if defined(_WIN32)
+ template<> __forceinline int random() { return int(rand()) ^ (int(rand()) << 8) ^ (int(rand()) << 16); }
+ template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 8) ^ (uint32_t(rand()) << 16); }
+#else
+ template<> __forceinline int random() { return int(rand()); }
+ template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 16); }
+#endif
+ template<> __forceinline float random() { return rand()/float(RAND_MAX); }
+ template<> __forceinline double random() { return rand()/double(RAND_MAX); }
+
+#if _WIN32
+ __forceinline double drand48() {
+ return double(rand())/double(RAND_MAX);
+ }
+
+ __forceinline void srand48(long seed) {
+ return srand(seed);
+ }
+#endif
+
+ /*! selects */
+ __forceinline bool select(bool s, bool t , bool f) { return s ? t : f; }
+ __forceinline int select(bool s, int t, int f) { return s ? t : f; }
+ __forceinline float select(bool s, float t, float f) { return s ? t : f; }
+
+ __forceinline bool all(bool s) { return s; }
+
+ __forceinline float lerp(const float v0, const float v1, const float t) {
+ return madd(1.0f-t,v0,t*v1);
+ }
+
+ template<typename T>
+ __forceinline T lerp2(const float x0, const float x1, const float x2, const float x3, const T& u, const T& v) {
+ return madd((1.0f-u),madd((1.0f-v),T(x0),v*T(x2)),u*madd((1.0f-v),T(x1),v*T(x3)));
+ }
+
+ /*! exchange */
+ template<typename T> __forceinline void xchg ( T& a, T& b ) { const T tmp = a; a = b; b = tmp; }
+
+ /* load/store */
+ template<typename Ty> struct mem;
+
+ template<> struct mem<float> {
+ static __forceinline float load (bool mask, const void* ptr) { return mask ? *(float*)ptr : 0.0f; }
+ static __forceinline float loadu(bool mask, const void* ptr) { return mask ? *(float*)ptr : 0.0f; }
+
+ static __forceinline void store (bool mask, void* ptr, const float v) { if (mask) *(float*)ptr = v; }
+ static __forceinline void storeu(bool mask, void* ptr, const float v) { if (mask) *(float*)ptr = v; }
+ };
+
+ /*! bit reverse operation */
+ template<class T>
+ __forceinline T bitReverse(const T& vin)
+ {
+ T v = vin;
+ v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
+ v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
+ v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
+ v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
+ v = ( v >> 16 ) | ( v << 16);
+ return v;
+ }
+
+ /*! bit interleave operation */
+ template<class T>
+ __forceinline T bitInterleave(const T& xin, const T& yin, const T& zin)
+ {
+ T x = xin, y = yin, z = zin;
+ x = (x | (x << 16)) & 0x030000FF;
+ x = (x | (x << 8)) & 0x0300F00F;
+ x = (x | (x << 4)) & 0x030C30C3;
+ x = (x | (x << 2)) & 0x09249249;
+
+ y = (y | (y << 16)) & 0x030000FF;
+ y = (y | (y << 8)) & 0x0300F00F;
+ y = (y | (y << 4)) & 0x030C30C3;
+ y = (y | (y << 2)) & 0x09249249;
+
+ z = (z | (z << 16)) & 0x030000FF;
+ z = (z | (z << 8)) & 0x0300F00F;
+ z = (z | (z << 4)) & 0x030C30C3;
+ z = (z | (z << 2)) & 0x09249249;
+
+ return x | (y << 1) | (z << 2);
+ }
+
+#if defined(__AVX2__)
+
+ template<>
+ __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi)
+ {
+ const unsigned int xx = pdep(xi,0x49249249 /* 0b01001001001001001001001001001001 */ );
+ const unsigned int yy = pdep(yi,0x92492492 /* 0b10010010010010010010010010010010 */);
+ const unsigned int zz = pdep(zi,0x24924924 /* 0b00100100100100100100100100100100 */);
+ return xx | yy | zz;
+ }
+
+#endif
+
+ /*! bit interleave operation for 64bit data types*/
+ template<class T>
+ __forceinline T bitInterleave64(const T& xin, const T& yin, const T& zin){
+ T x = xin & 0x1fffff;
+ T y = yin & 0x1fffff;
+ T z = zin & 0x1fffff;
+
+ x = (x | x << 32) & 0x1f00000000ffff;
+ x = (x | x << 16) & 0x1f0000ff0000ff;
+ x = (x | x << 8) & 0x100f00f00f00f00f;
+ x = (x | x << 4) & 0x10c30c30c30c30c3;
+ x = (x | x << 2) & 0x1249249249249249;
+
+ y = (y | y << 32) & 0x1f00000000ffff;
+ y = (y | y << 16) & 0x1f0000ff0000ff;
+ y = (y | y << 8) & 0x100f00f00f00f00f;
+ y = (y | y << 4) & 0x10c30c30c30c30c3;
+ y = (y | y << 2) & 0x1249249249249249;
+
+ z = (z | z << 32) & 0x1f00000000ffff;
+ z = (z | z << 16) & 0x1f0000ff0000ff;
+ z = (z | z << 8) & 0x100f00f00f00f00f;
+ z = (z | z << 4) & 0x10c30c30c30c30c3;
+ z = (z | z << 2) & 0x1249249249249249;
+
+ return x | (y << 1) | (z << 2);
+ }
+}
diff --git a/thirdparty/embree/common/math/obbox.h b/thirdparty/embree/common/math/obbox.h
new file mode 100644
index 0000000000..2fe8bbf071
--- /dev/null
+++ b/thirdparty/embree/common/math/obbox.h
@@ -0,0 +1,39 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bbox.h"
+#include "linearspace3.h"
+
+namespace embree
+{
+ /*! Oriented bounding box */
+ template<typename T>
+ struct OBBox
+ {
+ public:
+
+ __forceinline OBBox () {}
+
+ __forceinline OBBox (EmptyTy)
+ : space(one), bounds(empty) {}
+
+ __forceinline OBBox (const BBox<T>& bounds)
+ : space(one), bounds(bounds) {}
+
+ __forceinline OBBox (const LinearSpace3<T>& space, const BBox<T>& bounds)
+ : space(space), bounds(bounds) {}
+
+ friend embree_ostream operator<<(embree_ostream cout, const OBBox& p) {
+ return cout << "{ space = " << p.space << ", bounds = " << p.bounds << "}";
+ }
+
+ public:
+ LinearSpace3<T> space; //!< orthonormal transformation
+ BBox<T> bounds; //!< bounds in transformed space
+ };
+
+ typedef OBBox<Vec3f> OBBox3f;
+ typedef OBBox<Vec3fa> OBBox3fa;
+}
diff --git a/thirdparty/embree/common/math/quaternion.h b/thirdparty/embree/common/math/quaternion.h
new file mode 100644
index 0000000000..080800efcd
--- /dev/null
+++ b/thirdparty/embree/common/math/quaternion.h
@@ -0,0 +1,254 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec3.h"
+#include "vec4.h"
+
+#include "transcendental.h"
+
+namespace embree
+{
+ ////////////////////////////////////////////////////////////////
+ // Quaternion Struct
+ ////////////////////////////////////////////////////////////////
+
+ template<typename T>
+ struct QuaternionT
+ {
+ typedef Vec3<T> Vector;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Construction
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline QuaternionT () { }
+ __forceinline QuaternionT ( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; }
+ __forceinline QuaternionT& operator=( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; return *this; }
+
+ __forceinline QuaternionT( const T& r ) : r(r), i(zero), j(zero), k(zero) {}
+ __forceinline explicit QuaternionT( const Vec3<T>& v ) : r(zero), i(v.x), j(v.y), k(v.z) {}
+ __forceinline explicit QuaternionT( const Vec4<T>& v ) : r(v.x), i(v.y), j(v.z), k(v.w) {}
+ __forceinline QuaternionT( const T& r, const T& i, const T& j, const T& k ) : r(r), i(i), j(j), k(k) {}
+ __forceinline QuaternionT( const T& r, const Vec3<T>& v ) : r(r), i(v.x), j(v.y), k(v.z) {}
+
+ __inline QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz );
+ __inline QuaternionT( const T& yaw, const T& pitch, const T& roll );
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline QuaternionT( ZeroTy ) : r(zero), i(zero), j(zero), k(zero) {}
+ __forceinline QuaternionT( OneTy ) : r( one), i(zero), j(zero), k(zero) {}
+
+ /*! return quaternion for rotation around arbitrary axis */
+ static __forceinline QuaternionT rotate(const Vec3<T>& u, const T& r) {
+ return QuaternionT<T>(cos(T(0.5)*r),sin(T(0.5)*r)*normalize(u));
+ }
+
+ /*! returns the rotation axis of the quaternion as a vector */
+ __forceinline Vec3<T> v( ) const { return Vec3<T>(i, j, k); }
+
+ public:
+ T r, i, j, k;
+ };
+
+ template<typename T> __forceinline QuaternionT<T> operator *( const T & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a * b.r, a * b.i, a * b.j, a * b.k); }
+ template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const T & b ) { return QuaternionT<T>(a.r * b, a.i * b, a.j * b, a.k * b); }
+
+ ////////////////////////////////////////////////////////////////
+ // Unary Operators
+ ////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a ) { return QuaternionT<T>(+a.r, +a.i, +a.j, +a.k); }
+ template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a ) { return QuaternionT<T>(-a.r, -a.i, -a.j, -a.k); }
+ template<typename T> __forceinline QuaternionT<T> conj ( const QuaternionT<T>& a ) { return QuaternionT<T>(a.r, -a.i, -a.j, -a.k); }
+ template<typename T> __forceinline T abs ( const QuaternionT<T>& a ) { return sqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); }
+ template<typename T> __forceinline QuaternionT<T> rcp ( const QuaternionT<T>& a ) { return conj(a)*rcp(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); }
+ template<typename T> __forceinline QuaternionT<T> normalize ( const QuaternionT<T>& a ) { return a*rsqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); }
+
+ // evaluates a*q-r
+ template<typename T> __forceinline QuaternionT<T>
+ msub(const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p)
+ {
+ return QuaternionT<T>(msub(a, q.r, p.r),
+ msub(a, q.i, p.i),
+ msub(a, q.j, p.j),
+ msub(a, q.k, p.k));
+ }
+ // evaluates a*q-r
+ template<typename T> __forceinline QuaternionT<T>
+ madd (const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p)
+ {
+ return QuaternionT<T>(madd(a, q.r, p.r),
+ madd(a, q.i, p.i),
+ madd(a, q.j, p.j),
+ madd(a, q.k, p.k));
+ }
+
+ ////////////////////////////////////////////////////////////////
+ // Binary Operators
+ ////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline QuaternionT<T> operator +( const T & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a + b.r, b.i, b.j, b.k); }
+ template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const T & b ) { return QuaternionT<T>(a.r + b, a.i, a.j, a.k); }
+ template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r + b.r, a.i + b.i, a.j + b.j, a.k + b.k); }
+ template<typename T> __forceinline QuaternionT<T> operator -( const T & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a - b.r, -b.i, -b.j, -b.k); }
+ template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const T & b ) { return QuaternionT<T>(a.r - b, a.i, a.j, a.k); }
+ template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r - b.r, a.i - b.i, a.j - b.j, a.k - b.k); }
+
+ template<typename T> __forceinline Vec3<T> operator *( const QuaternionT<T>& a, const Vec3<T> & b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
+ template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const QuaternionT<T>& b ) {
+ return QuaternionT<T>(a.r*b.r - a.i*b.i - a.j*b.j - a.k*b.k,
+ a.r*b.i + a.i*b.r + a.j*b.k - a.k*b.j,
+ a.r*b.j - a.i*b.k + a.j*b.r + a.k*b.i,
+ a.r*b.k + a.i*b.j - a.j*b.i + a.k*b.r);
+ }
+ template<typename T> __forceinline QuaternionT<T> operator /( const T & a, const QuaternionT<T>& b ) { return a*rcp(b); }
+ template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const T & b ) { return a*rcp(b); }
+ template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a*rcp(b); }
+
+ template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const T & b ) { return a = a+b; }
+ template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a+b; }
+ template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const T & b ) { return a = a-b; }
+ template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a-b; }
+ template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const T & b ) { return a = a*b; }
+ template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*b; }
+ template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const T & b ) { return a = a*rcp(b); }
+ template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*rcp(b); }
+
+ template<typename T, typename M> __forceinline QuaternionT<T>
+ select(const M& m, const QuaternionT<T>& q, const QuaternionT<T>& p)
+ {
+ return QuaternionT<T>(select(m, q.r, p.r),
+ select(m, q.i, p.i),
+ select(m, q.j, p.j),
+ select(m, q.k, p.k));
+ }
+
+
+ template<typename T> __forceinline Vec3<T> xfmPoint ( const QuaternionT<T>& a, const Vec3<T>& b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
+ template<typename T> __forceinline Vec3<T> xfmVector( const QuaternionT<T>& a, const Vec3<T>& b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
+ template<typename T> __forceinline Vec3<T> xfmNormal( const QuaternionT<T>& a, const Vec3<T>& b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
+
+ template<typename T> __forceinline T dot(const QuaternionT<T>& a, const QuaternionT<T>& b) { return a.r*b.r + a.i*b.i + a.j*b.j + a.k*b.k; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline bool operator ==( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r == b.r && a.i == b.i && a.j == b.j && a.k == b.k; }
+ template<typename T> __forceinline bool operator !=( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r != b.r || a.i != b.i || a.j != b.j || a.k != b.k; }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Orientation Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> QuaternionT<T>::QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz )
+ {
+ if ( vx.x + vy.y + vz.z >= T(zero) )
+ {
+ const T t = T(one) + (vx.x + vy.y + vz.z);
+ const T s = rsqrt(t)*T(0.5f);
+ r = t*s;
+ i = (vy.z - vz.y)*s;
+ j = (vz.x - vx.z)*s;
+ k = (vx.y - vy.x)*s;
+ }
+ else if ( vx.x >= max(vy.y, vz.z) )
+ {
+ const T t = (T(one) + vx.x) - (vy.y + vz.z);
+ const T s = rsqrt(t)*T(0.5f);
+ r = (vy.z - vz.y)*s;
+ i = t*s;
+ j = (vx.y + vy.x)*s;
+ k = (vz.x + vx.z)*s;
+ }
+ else if ( vy.y >= vz.z ) // if ( vy.y >= max(vz.z, vx.x) )
+ {
+ const T t = (T(one) + vy.y) - (vz.z + vx.x);
+ const T s = rsqrt(t)*T(0.5f);
+ r = (vz.x - vx.z)*s;
+ i = (vx.y + vy.x)*s;
+ j = t*s;
+ k = (vy.z + vz.y)*s;
+ }
+ else //if ( vz.z >= max(vy.y, vx.x) )
+ {
+ const T t = (T(one) + vz.z) - (vx.x + vy.y);
+ const T s = rsqrt(t)*T(0.5f);
+ r = (vx.y - vy.x)*s;
+ i = (vz.x + vx.z)*s;
+ j = (vy.z + vz.y)*s;
+ k = t*s;
+ }
+ }
+
+ template<typename T> QuaternionT<T>::QuaternionT( const T& yaw, const T& pitch, const T& roll )
+ {
+ const T cya = cos(yaw *T(0.5f));
+ const T cpi = cos(pitch*T(0.5f));
+ const T cro = cos(roll *T(0.5f));
+ const T sya = sin(yaw *T(0.5f));
+ const T spi = sin(pitch*T(0.5f));
+ const T sro = sin(roll *T(0.5f));
+ r = cro*cya*cpi + sro*sya*spi;
+ i = cro*cya*spi + sro*sya*cpi;
+ j = cro*sya*cpi - sro*cya*spi;
+ k = sro*cya*cpi - cro*sya*spi;
+ }
+
+ //////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ //////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> static embree_ostream operator<<(embree_ostream cout, const QuaternionT<T>& q) {
+ return cout << "{ r = " << q.r << ", i = " << q.i << ", j = " << q.j << ", k = " << q.k << " }";
+ }
+
+ /*! default template instantiations */
+ typedef QuaternionT<float> Quaternion3f;
+ typedef QuaternionT<double> Quaternion3d;
+
+ template<int N> using Quaternion3vf = QuaternionT<vfloat<N>>;
+ typedef QuaternionT<vfloat<4>> Quaternion3vf4;
+ typedef QuaternionT<vfloat<8>> Quaternion3vf8;
+ typedef QuaternionT<vfloat<16>> Quaternion3vf16;
+
+ //////////////////////////////////////////////////////////////////////////////
+ /// Interpolation
+ //////////////////////////////////////////////////////////////////////////////
+ template<typename T>
+ __forceinline QuaternionT<T>lerp(const QuaternionT<T>& q0,
+ const QuaternionT<T>& q1,
+ const T& factor)
+ {
+ QuaternionT<T> q;
+ q.r = lerp(q0.r, q1.r, factor);
+ q.i = lerp(q0.i, q1.i, factor);
+ q.j = lerp(q0.j, q1.j, factor);
+ q.k = lerp(q0.k, q1.k, factor);
+ return q;
+ }
+
+ template<typename T>
+ __forceinline QuaternionT<T> slerp(const QuaternionT<T>& q0,
+ const QuaternionT<T>& q1_,
+ const T& t)
+ {
+ T cosTheta = dot(q0, q1_);
+ QuaternionT<T> q1 = select(cosTheta < 0.f, -q1_, q1_);
+ cosTheta = select(cosTheta < 0.f, -cosTheta, cosTheta);
+ if (unlikely(all(cosTheta > 0.9995f))) {
+ return normalize(lerp(q0, q1, t));
+ }
+ const T phi = t * fastapprox::acos(cosTheta);
+ T sinPhi, cosPhi;
+ fastapprox::sincos(phi, sinPhi, cosPhi);
+ QuaternionT<T> qperp = sinPhi * normalize(msub(cosTheta, q0, q1));
+ return msub(cosPhi, q0, qperp);
+ }
+}
diff --git a/thirdparty/embree/common/math/range.h b/thirdparty/embree/common/math/range.h
new file mode 100644
index 0000000000..909fadb995
--- /dev/null
+++ b/thirdparty/embree/common/math/range.h
@@ -0,0 +1,137 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../math/math.h"
+
+namespace embree
+{
+ template<typename Ty>
+ struct range
+ {
+ __forceinline range() {}
+
+ __forceinline range(const Ty& begin)
+ : _begin(begin), _end(begin+1) {}
+
+ __forceinline range(const Ty& begin, const Ty& end)
+ : _begin(begin), _end(end) {}
+
+ __forceinline range(const range& other)
+ : _begin(other._begin), _end(other._end) {}
+
+ template<typename T1>
+ __forceinline range(const range<T1>& other)
+ : _begin(Ty(other._begin)), _end(Ty(other._end)) {}
+
+ template<typename T1>
+ __forceinline range& operator =(const range<T1>& other) {
+ _begin = other._begin;
+ _end = other._end;
+ return *this;
+ }
+
+ __forceinline Ty begin() const {
+ return _begin;
+ }
+
+ __forceinline Ty end() const {
+ return _end;
+ }
+
+ __forceinline range intersect(const range& r) const {
+ return range (max(_begin,r._begin),min(_end,r._end));
+ }
+
+ __forceinline Ty size() const {
+ return _end - _begin;
+ }
+
+ __forceinline bool empty() const {
+ return _end <= _begin;
+ }
+
+ __forceinline Ty center() const {
+ return (_begin + _end)/2;
+ }
+
+ __forceinline std::pair<range,range> split() const
+ {
+ const Ty _center = center();
+ return std::make_pair(range(_begin,_center),range(_center,_end));
+ }
+
+ __forceinline void split(range& left_o, range& right_o) const
+ {
+ const Ty _center = center();
+ left_o = range(_begin,_center);
+ right_o = range(_center,_end);
+ }
+
+ __forceinline friend bool operator< (const range& r0, const range& r1) {
+ return r0.size() < r1.size();
+ }
+
+ friend embree_ostream operator<<(embree_ostream cout, const range& r) {
+ return cout << "range [" << r.begin() << ", " << r.end() << "]";
+ }
+
+ Ty _begin, _end;
+ };
+
+ template<typename Ty>
+ range<Ty> make_range(const Ty& begin, const Ty& end) {
+ return range<Ty>(begin,end);
+ }
+
+ template<typename Ty>
+ struct extended_range : public range<Ty>
+ {
+ __forceinline extended_range () {}
+
+ __forceinline extended_range (const Ty& begin)
+ : range<Ty>(begin), _ext_end(begin+1) {}
+
+ __forceinline extended_range (const Ty& begin, const Ty& end)
+ : range<Ty>(begin,end), _ext_end(end) {}
+
+ __forceinline extended_range (const Ty& begin, const Ty& end, const Ty& ext_end)
+ : range<Ty>(begin,end), _ext_end(ext_end) {}
+
+ __forceinline Ty ext_end() const {
+ return _ext_end;
+ }
+
+ __forceinline Ty ext_size() const {
+ return _ext_end - range<Ty>::_begin;
+ }
+
+ __forceinline Ty ext_range_size() const {
+ return _ext_end - range<Ty>::_end;
+ }
+
+ __forceinline bool has_ext_range() const {
+ assert(_ext_end >= range<Ty>::_end);
+ return (_ext_end - range<Ty>::_end) > 0;
+ }
+
+ __forceinline void set_ext_range(const size_t ext_end){
+ assert(ext_end >= range<Ty>::_end);
+ _ext_end = ext_end;
+ }
+
+ __forceinline void move_right(const size_t plus){
+ range<Ty>::_begin += plus;
+ range<Ty>::_end += plus;
+ _ext_end += plus;
+ }
+
+ friend embree_ostream operator<<(embree_ostream cout, const extended_range& r) {
+ return cout << "extended_range [" << r.begin() << ", " << r.end() << " (" << r.ext_end() << ")]";
+ }
+
+ Ty _ext_end;
+ };
+}
diff --git a/thirdparty/embree/common/math/transcendental.h b/thirdparty/embree/common/math/transcendental.h
new file mode 100644
index 0000000000..fd16c26e81
--- /dev/null
+++ b/thirdparty/embree/common/math/transcendental.h
@@ -0,0 +1,525 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+// Transcendental functions from "ispc": https://github.com/ispc/ispc/
+// Most of the transcendental implementations in ispc code come from
+// Solomon Boulos's "syrah": https://github.com/boulos/syrah/
+
+#include "../simd/simd.h"
+
+namespace embree
+{
+
+namespace fastapprox
+{
+
+template <typename T>
+__forceinline T sin(const T &v)
+{
+ static const float piOverTwoVec = 1.57079637050628662109375;
+ static const float twoOverPiVec = 0.636619746685028076171875;
+ auto scaled = v * twoOverPiVec;
+ auto kReal = floor(scaled);
+ auto k = toInt(kReal);
+
+ // Reduced range version of x
+ auto x = v - kReal * piOverTwoVec;
+ auto kMod4 = k & 3;
+ auto sinUseCos = (kMod4 == 1 | kMod4 == 3);
+ auto flipSign = (kMod4 > 1);
+
+ // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2,
+ // 4, 6, 8, 10|], [|single...|], [0;Pi/2]);
+ static const float sinC2 = -0.16666667163372039794921875;
+ static const float sinC4 = +8.333347737789154052734375e-3;
+ static const float sinC6 = -1.9842604524455964565277099609375e-4;
+ static const float sinC8 = +2.760012648650445044040679931640625e-6;
+ static const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
+
+ static const float cosC2 = -0.5;
+ static const float cosC4 = +4.166664183139801025390625e-2;
+ static const float cosC6 = -1.388833043165504932403564453125e-3;
+ static const float cosC8 = +2.47562347794882953166961669921875e-5;
+ static const float cosC10 = -2.59630184018533327616751194000244140625e-7;
+
+ auto outside = select(sinUseCos, 1., x);
+ auto c2 = select(sinUseCos, T(cosC2), T(sinC2));
+ auto c4 = select(sinUseCos, T(cosC4), T(sinC4));
+ auto c6 = select(sinUseCos, T(cosC6), T(sinC6));
+ auto c8 = select(sinUseCos, T(cosC8), T(sinC8));
+ auto c10 = select(sinUseCos, T(cosC10), T(sinC10));
+
+ auto x2 = x * x;
+ auto formula = x2 * c10 + c8;
+ formula = x2 * formula + c6;
+ formula = x2 * formula + c4;
+ formula = x2 * formula + c2;
+ formula = x2 * formula + 1.;
+ formula *= outside;
+
+ formula = select(flipSign, -formula, formula);
+ return formula;
+}
+
+template <typename T>
+__forceinline T cos(const T &v)
+{
+ static const float piOverTwoVec = 1.57079637050628662109375;
+ static const float twoOverPiVec = 0.636619746685028076171875;
+ auto scaled = v * twoOverPiVec;
+ auto kReal = floor(scaled);
+ auto k = toInt(kReal);
+
+ // Reduced range version of x
+ auto x = v - kReal * piOverTwoVec;
+
+ auto kMod4 = k & 3;
+ auto cosUseCos = (kMod4 == 0 | kMod4 == 2);
+ auto flipSign = (kMod4 == 1 | kMod4 == 2);
+
+ const float sinC2 = -0.16666667163372039794921875;
+ const float sinC4 = +8.333347737789154052734375e-3;
+ const float sinC6 = -1.9842604524455964565277099609375e-4;
+ const float sinC8 = +2.760012648650445044040679931640625e-6;
+ const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
+
+ const float cosC2 = -0.5;
+ const float cosC4 = +4.166664183139801025390625e-2;
+ const float cosC6 = -1.388833043165504932403564453125e-3;
+ const float cosC8 = +2.47562347794882953166961669921875e-5;
+ const float cosC10 = -2.59630184018533327616751194000244140625e-7;
+
+ auto outside = select(cosUseCos, 1., x);
+ auto c2 = select(cosUseCos, T(cosC2), T(sinC2));
+ auto c4 = select(cosUseCos, T(cosC4), T(sinC4));
+ auto c6 = select(cosUseCos, T(cosC6), T(sinC6));
+ auto c8 = select(cosUseCos, T(cosC8), T(sinC8));
+ auto c10 = select(cosUseCos, T(cosC10), T(sinC10));
+
+ auto x2 = x * x;
+ auto formula = x2 * c10 + c8;
+ formula = x2 * formula + c6;
+ formula = x2 * formula + c4;
+ formula = x2 * formula + c2;
+ formula = x2 * formula + 1.;
+ formula *= outside;
+
+ formula = select(flipSign, -formula, formula);
+ return formula;
+}
+
+template <typename T>
+__forceinline void sincos(const T &v, T &sinResult, T &cosResult)
+{
+ const float piOverTwoVec = 1.57079637050628662109375;
+ const float twoOverPiVec = 0.636619746685028076171875;
+ auto scaled = v * twoOverPiVec;
+ auto kReal = floor(scaled);
+ auto k = toInt(kReal);
+
+ // Reduced range version of x
+ auto x = v - kReal * piOverTwoVec;
+ auto kMod4 = k & 3;
+ auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2));
+ auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3));
+ auto sinFlipSign = (kMod4 > 1);
+ auto cosFlipSign = ((kMod4 == 1) | (kMod4 == 2));
+
+ const float oneVec = +1.;
+ const float sinC2 = -0.16666667163372039794921875;
+ const float sinC4 = +8.333347737789154052734375e-3;
+ const float sinC6 = -1.9842604524455964565277099609375e-4;
+ const float sinC8 = +2.760012648650445044040679931640625e-6;
+ const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
+
+ const float cosC2 = -0.5;
+ const float cosC4 = +4.166664183139801025390625e-2;
+ const float cosC6 = -1.388833043165504932403564453125e-3;
+ const float cosC8 = +2.47562347794882953166961669921875e-5;
+ const float cosC10 = -2.59630184018533327616751194000244140625e-7;
+
+ auto x2 = x * x;
+
+ auto sinFormula = x2 * sinC10 + sinC8;
+ auto cosFormula = x2 * cosC10 + cosC8;
+ sinFormula = x2 * sinFormula + sinC6;
+ cosFormula = x2 * cosFormula + cosC6;
+
+ sinFormula = x2 * sinFormula + sinC4;
+ cosFormula = x2 * cosFormula + cosC4;
+
+ sinFormula = x2 * sinFormula + sinC2;
+ cosFormula = x2 * cosFormula + cosC2;
+
+ sinFormula = x2 * sinFormula + oneVec;
+ cosFormula = x2 * cosFormula + oneVec;
+
+ sinFormula *= x;
+
+ sinResult = select(sinUseCos, cosFormula, sinFormula);
+ cosResult = select(cosUseCos, cosFormula, sinFormula);
+
+ sinResult = select(sinFlipSign, -sinResult, sinResult);
+ cosResult = select(cosFlipSign, -cosResult, cosResult);
+}
+
+template <typename T>
+__forceinline T tan(const T &v)
+{
+ const float piOverFourVec = 0.785398185253143310546875;
+ const float fourOverPiVec = 1.27323949337005615234375;
+
+ auto xLt0 = v < 0.;
+ auto y = select(xLt0, -v, v);
+ auto scaled = y * fourOverPiVec;
+
+ auto kReal = floor(scaled);
+ auto k = toInt(kReal);
+
+ auto x = y - kReal * piOverFourVec;
+
+ // If k & 1, x -= Pi/4
+ auto needOffset = (k & 1) != 0;
+ x = select(needOffset, x - piOverFourVec, x);
+
+ // If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To...
+ auto kMod4 = k & 3;
+ auto useCotan = (kMod4 == 1) | (kMod4 == 2);
+
+ const float oneVec = 1.0;
+
+ const float tanC2 = +0.33333075046539306640625;
+ const float tanC4 = +0.13339905440807342529296875;
+ const float tanC6 = +5.3348250687122344970703125e-2;
+ const float tanC8 = +2.46033705770969390869140625e-2;
+ const float tanC10 = +2.892402000725269317626953125e-3;
+ const float tanC12 = +9.500005282461643218994140625e-3;
+
+ const float cotC2 = -0.3333333432674407958984375;
+ const float cotC4 = -2.222204394638538360595703125e-2;
+ const float cotC6 = -2.11752182804048061370849609375e-3;
+ const float cotC8 = -2.0846328698098659515380859375e-4;
+ const float cotC10 = -2.548247357481159269809722900390625e-5;
+ const float cotC12 = -3.5257363606433500535786151885986328125e-7;
+
+ auto x2 = x * x;
+ T z;
+ if (any(useCotan))
+ {
+ auto cotVal = x2 * cotC12 + cotC10;
+ cotVal = x2 * cotVal + cotC8;
+ cotVal = x2 * cotVal + cotC6;
+ cotVal = x2 * cotVal + cotC4;
+ cotVal = x2 * cotVal + cotC2;
+ cotVal = x2 * cotVal + oneVec;
+ // The equation is for x * cot(x) but we need -x * cot(x) for the tan part.
+ cotVal /= -x;
+ z = cotVal;
+ }
+ auto useTan = !useCotan;
+ if (any(useTan))
+ {
+ auto tanVal = x2 * tanC12 + tanC10;
+ tanVal = x2 * tanVal + tanC8;
+ tanVal = x2 * tanVal + tanC6;
+ tanVal = x2 * tanVal + tanC4;
+ tanVal = x2 * tanVal + tanC2;
+ tanVal = x2 * tanVal + oneVec;
+ // Equation was for tan(x)/x
+ tanVal *= x;
+ z = select(useTan, tanVal, z);
+ }
+ return select(xLt0, -z, z);
+}
+
+template <typename T>
+__forceinline T asin(const T &x0)
+{
+ auto isneg = (x0 < 0.f);
+ auto x = abs(x0);
+ auto isnan = (x > 1.f);
+
+ // sollya
+ // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|],
+ // [1e-20;.9999999999999999]);
+ // avg error: 1.1105439e-06, max error 1.3187528e-06
+ auto v = 1.57079517841339111328125f +
+ x * (-0.21450997889041900634765625f +
+ x * (8.78556668758392333984375e-2f +
+ x * (-4.489909112453460693359375e-2f +
+ x * (1.928029954433441162109375e-2f +
+ x * (-4.3095736764371395111083984375e-3f)))));
+
+ v *= -sqrt(1.f - x);
+ v = v + 1.57079637050628662109375f;
+
+ v = select(v < 0.f, T(0.f), v);
+ v = select(isneg, -v, v);
+ v = select(isnan, T(cast_i2f(0x7fc00000)), v);
+
+ return v;
+}
+
+template <typename T>
+__forceinline T acos(const T &v)
+{
+ return 1.57079637050628662109375f - asin(v);
+}
+
+template <typename T>
+__forceinline T atan(const T &v)
+{
+ const float piOverTwoVec = 1.57079637050628662109375;
+ // atan(-x) = -atan(x) (so flip from negative to positive first)
+ // If x > 1 -> atan(x) = Pi/2 - atan(1/x)
+ auto xNeg = v < 0.f;
+ auto xFlipped = select(xNeg, -v, v);
+
+ auto xGt1 = xFlipped > 1.;
+ auto x = select(xGt1, rcpSafe(xFlipped), xFlipped);
+
+ // These coefficients approximate atan(x)/x
+ const float atanC0 = +0.99999988079071044921875;
+ const float atanC2 = -0.3333191573619842529296875;
+ const float atanC4 = +0.199689209461212158203125;
+ const float atanC6 = -0.14015688002109527587890625;
+ const float atanC8 = +9.905083477497100830078125e-2;
+ const float atanC10 = -5.93664981424808502197265625e-2;
+ const float atanC12 = +2.417283318936824798583984375e-2;
+ const float atanC14 = -4.6721356920897960662841796875e-3;
+
+ auto x2 = x * x;
+ auto result = x2 * atanC14 + atanC12;
+ result = x2 * result + atanC10;
+ result = x2 * result + atanC8;
+ result = x2 * result + atanC6;
+ result = x2 * result + atanC4;
+ result = x2 * result + atanC2;
+ result = x2 * result + atanC0;
+ result *= x;
+
+ result = select(xGt1, piOverTwoVec - result, result);
+ result = select(xNeg, -result, result);
+ return result;
+}
+
+template <typename T>
+__forceinline T atan2(const T &y, const T &x)
+{
+ const float piVec = 3.1415926536;
+ // atan2(y, x) =
+ //
+ // atan2(y > 0, x = +-0) -> Pi/2
+ // atan2(y < 0, x = +-0) -> -Pi/2
+ // atan2(y = +-0, x < +0) -> +-Pi
+ // atan2(y = +-0, x >= +0) -> +-0
+ //
+ // atan2(y >= 0, x < 0) -> Pi + atan(y/x)
+ // atan2(y < 0, x < 0) -> -Pi + atan(y/x)
+ // atan2(y, x > 0) -> atan(y/x)
+ //
+ // and then a bunch of code for dealing with infinities.
+ auto yOverX = y * rcpSafe(x);
+ auto atanArg = atan(yOverX);
+ auto xLt0 = x < 0.f;
+ auto yLt0 = y < 0.f;
+ auto offset = select(xLt0,
+ select(yLt0, T(-piVec), T(piVec)), 0.f);
+ return offset + atanArg;
+}
+
+template <typename T>
+__forceinline T exp(const T &v)
+{
+ const float ln2Part1 = 0.6931457519;
+ const float ln2Part2 = 1.4286067653e-6;
+ const float oneOverLn2 = 1.44269502162933349609375;
+
+ auto scaled = v * oneOverLn2;
+ auto kReal = floor(scaled);
+ auto k = toInt(kReal);
+
+ // Reduced range version of x
+ auto x = v - kReal * ln2Part1;
+ x -= kReal * ln2Part2;
+
+ // These coefficients are for e^x in [0, ln(2)]
+ const float one = 1.;
+ const float c2 = 0.4999999105930328369140625;
+ const float c3 = 0.166668415069580078125;
+ const float c4 = 4.16539050638675689697265625e-2;
+ const float c5 = 8.378830738365650177001953125e-3;
+ const float c6 = 1.304379315115511417388916015625e-3;
+ const float c7 = 2.7555381529964506626129150390625e-4;
+
+ auto result = x * c7 + c6;
+ result = x * result + c5;
+ result = x * result + c4;
+ result = x * result + c3;
+ result = x * result + c2;
+ result = x * result + one;
+ result = x * result + one;
+
+ // Compute 2^k (should differ for float and double, but I'll avoid
+ // it for now and just do floats)
+ const int fpbias = 127;
+ auto biasedN = k + fpbias;
+ auto overflow = kReal > fpbias;
+ // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
+ // we've got underflow. -127 * ln(2) -> -88.02. So the most
+ // negative float input that doesn't result in zero is like -88.
+ auto underflow = kReal <= -fpbias;
+ const int infBits = 0x7f800000;
+ biasedN <<= 23;
+ // Reinterpret this thing as float
+ auto twoToTheN = asFloat(biasedN);
+ // Handle both doubles and floats (hopefully eliding the copy for float)
+ auto elemtype2n = twoToTheN;
+ result *= elemtype2n;
+ result = select(overflow, cast_i2f(infBits), result);
+ result = select(underflow, 0., result);
+ return result;
+}
+
+// Range reduction for logarithms takes log(x) -> log(2^n * y) -> n
+// * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)).
+template <typename T, typename R>
+__forceinline void __rangeReduceLog(const T &input,
+ T &reduced,
+ R &exponent)
+{
+ auto intVersion = asInt(input);
+ // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
+ // exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000
+ // 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0
+ // non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111
+ // = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF
+
+ //const int exponentMask(0x7F800000)
+ static const int nonexponentMask = 0x807FFFFF;
+
+ // We want the reduced version to have an exponent of -1 which is
+ // -1 + 127 after biasing or 126
+ static const int exponentNeg1 = (126l << 23);
+ // NOTE(boulos): We don't need to mask anything out since we know
+ // the sign bit has to be 0. If it's 1, we need to return infinity/nan
+ // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN).
+ auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128]
+
+ auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2
+ exponent = offsetExponent - 127; // get the real value
+
+ // Blend the offset_exponent with the original input (do this in
+ // int for now, until I decide if float can have & and &not)
+ auto blended = (intVersion & nonexponentMask) | (exponentNeg1);
+ reduced = asFloat(blended);
+}
+
+template <typename T> struct ExponentType { };
+template <int N> struct ExponentType<vfloat_impl<N>> { typedef vint<N> Ty; };
+template <> struct ExponentType<float> { typedef int Ty; };
+
+template <typename T>
+__forceinline T log(const T &v)
+{
+ T reduced;
+ typename ExponentType<T>::Ty exponent;
+
+ const int nanBits = 0x7fc00000;
+ const int negInfBits = 0xFF800000;
+ const float nan = cast_i2f(nanBits);
+ const float negInf = cast_i2f(negInfBits);
+ auto useNan = v < 0.;
+ auto useInf = v == 0.;
+ auto exceptional = useNan | useInf;
+ const float one = 1.0;
+
+ auto patched = select(exceptional, one, v);
+ __rangeReduceLog(patched, reduced, exponent);
+
+ const float ln2 = 0.693147182464599609375;
+
+ auto x1 = one - reduced;
+ const float c1 = +0.50000095367431640625;
+ const float c2 = +0.33326041698455810546875;
+ const float c3 = +0.2519190013408660888671875;
+ const float c4 = +0.17541764676570892333984375;
+ const float c5 = +0.3424419462680816650390625;
+ const float c6 = -0.599632322788238525390625;
+ const float c7 = +1.98442304134368896484375;
+ const float c8 = -2.4899270534515380859375;
+ const float c9 = +1.7491014003753662109375;
+
+ auto result = x1 * c9 + c8;
+ result = x1 * result + c7;
+ result = x1 * result + c6;
+ result = x1 * result + c5;
+ result = x1 * result + c4;
+ result = x1 * result + c3;
+ result = x1 * result + c2;
+ result = x1 * result + c1;
+ result = x1 * result + one;
+
+ // Equation was for -(ln(red)/(1-red))
+ result *= -x1;
+ result += toFloat(exponent) * ln2;
+
+ return select(exceptional,
+ select(useNan, T(nan), T(negInf)),
+ result);
+}
+
+template <typename T>
+__forceinline T pow(const T &x, const T &y)
+{
+ auto x1 = abs(x);
+ auto z = exp(y * log(x1));
+
+ // Handle special cases
+ const float twoOver23 = 8388608.0f;
+ auto yInt = y == round(y);
+ auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit
+
+ // x == 0
+ z = select(x == 0.0f,
+ select(y < 0.0f, T(inf) | signmsk(x),
+ select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z);
+
+ // x < 0
+ auto xNegative = x < 0.0f;
+ if (any(xNegative))
+ {
+ auto z1 = z | asFloat(yOddInt);
+ z1 = select(yInt, z1, std::numeric_limits<float>::quiet_NaN());
+ z = select(xNegative, z1, z);
+ }
+
+ auto xFinite = isfinite(x);
+ auto yFinite = isfinite(y);
+ if (all(xFinite & yFinite))
+ return z;
+
+ // x finite and y infinite
+ z = select(andn(xFinite, yFinite),
+ select(x1 == 1.0f, 1.0f,
+ select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z);
+
+ // x infinite
+ z = select(xFinite, z,
+ select(y == 0.0f, 1.0f,
+ select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x)));
+
+ return z;
+}
+
+template <typename T>
+__forceinline T pow(const T &x, float y)
+{
+ return pow(x, T(y));
+}
+
+} // namespace fastapprox
+
+} // namespace embree
diff --git a/thirdparty/embree/common/math/vec2.h b/thirdparty/embree/common/math/vec2.h
new file mode 100644
index 0000000000..d62aef51f3
--- /dev/null
+++ b/thirdparty/embree/common/math/vec2.h
@@ -0,0 +1,235 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+
+namespace embree
+{
+ struct Vec2fa;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Generic 2D vector Class
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> struct Vec2
+ {
+ enum { N = 2 };
+ union {
+ struct { T x, y; };
+#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
+ T components[N];
+#endif
+ };
+
+ typedef T Scalar;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Construction
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec2( ) {}
+ __forceinline explicit Vec2( const T& a ) : x(a), y(a) {}
+ __forceinline Vec2( const T& x, const T& y ) : x(x), y(y) {}
+
+ __forceinline Vec2( const Vec2& other ) { x = other.x; y = other.y; }
+ __forceinline Vec2( const Vec2fa& other );
+
+ template<typename T1> __forceinline Vec2( const Vec2<T1>& a ) : x(T(a.x)), y(T(a.y)) {}
+ template<typename T1> __forceinline Vec2& operator =( const Vec2<T1>& other ) { x = other.x; y = other.y; return *this; }
+
+ __forceinline Vec2& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec2( ZeroTy ) : x(zero), y(zero) {}
+ __forceinline Vec2( OneTy ) : x(one), y(one) {}
+ __forceinline Vec2( PosInfTy ) : x(pos_inf), y(pos_inf) {}
+ __forceinline Vec2( NegInfTy ) : x(neg_inf), y(neg_inf) {}
+
+#if defined(__WIN32__) && _MSC_VER == 1800 // workaround for older VS 2013 compiler
+ __forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return (&x)[axis]; }
+ __forceinline T& operator [](const size_t axis) { assert(axis < 2); return (&x)[axis]; }
+#else
+ __forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return components[axis]; }
+ __forceinline T& operator [](const size_t axis ) { assert(axis < 2); return components[axis]; }
+#endif
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a ) { return Vec2<T>(+a.x, +a.y); }
+ template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a ) { return Vec2<T>(-a.x, -a.y); }
+ template<typename T> __forceinline Vec2<T> abs ( const Vec2<T>& a ) { return Vec2<T>(abs (a.x), abs (a.y)); }
+ template<typename T> __forceinline Vec2<T> rcp ( const Vec2<T>& a ) { return Vec2<T>(rcp (a.x), rcp (a.y)); }
+ template<typename T> __forceinline Vec2<T> rsqrt ( const Vec2<T>& a ) { return Vec2<T>(rsqrt(a.x), rsqrt(a.y)); }
+ template<typename T> __forceinline Vec2<T> sqrt ( const Vec2<T>& a ) { return Vec2<T>(sqrt (a.x), sqrt (a.y)); }
+ template<typename T> __forceinline Vec2<T> frac ( const Vec2<T>& a ) { return Vec2<T>(frac (a.x), frac (a.y)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x + b.x, a.y + b.y); }
+ template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a, const T& b ) { return Vec2<T>(a.x + b , a.y + b ); }
+ template<typename T> __forceinline Vec2<T> operator +( const T& a, const Vec2<T>& b ) { return Vec2<T>(a + b.x, a + b.y); }
+ template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x - b.x, a.y - b.y); }
+ template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a, const T& b ) { return Vec2<T>(a.x - b , a.y - b ); }
+ template<typename T> __forceinline Vec2<T> operator -( const T& a, const Vec2<T>& b ) { return Vec2<T>(a - b.x, a - b.y); }
+ template<typename T> __forceinline Vec2<T> operator *( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x * b.x, a.y * b.y); }
+ template<typename T> __forceinline Vec2<T> operator *( const T& a, const Vec2<T>& b ) { return Vec2<T>(a * b.x, a * b.y); }
+ template<typename T> __forceinline Vec2<T> operator *( const Vec2<T>& a, const T& b ) { return Vec2<T>(a.x * b , a.y * b ); }
+ template<typename T> __forceinline Vec2<T> operator /( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x / b.x, a.y / b.y); }
+ template<typename T> __forceinline Vec2<T> operator /( const Vec2<T>& a, const T& b ) { return Vec2<T>(a.x / b , a.y / b ); }
+ template<typename T> __forceinline Vec2<T> operator /( const T& a, const Vec2<T>& b ) { return Vec2<T>(a / b.x, a / b.y); }
+
+ template<typename T> __forceinline Vec2<T> min(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(min(a.x, b.x), min(a.y, b.y)); }
+ template<typename T> __forceinline Vec2<T> max(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(max(a.x, b.x), max(a.y, b.y)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Ternary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Vec2<T> madd ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y) ); }
+ template<typename T> __forceinline Vec2<T> msub ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y) ); }
+ template<typename T> __forceinline Vec2<T> nmadd ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y) ); }
+ template<typename T> __forceinline Vec2<T> nmsub ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y) ); }
+
+ template<typename T> __forceinline Vec2<T> madd ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( madd(a,b.x,c.x), madd(a,b.y,c.y) ); }
+ template<typename T> __forceinline Vec2<T> msub ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( msub(a,b.x,c.x), msub(a,b.y,c.y) ); }
+ template<typename T> __forceinline Vec2<T> nmadd ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y) ); }
+ template<typename T> __forceinline Vec2<T> nmsub ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y) ); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Vec2<T>& operator +=( Vec2<T>& a, const Vec2<T>& b ) { a.x += b.x; a.y += b.y; return a; }
+ template<typename T> __forceinline Vec2<T>& operator -=( Vec2<T>& a, const Vec2<T>& b ) { a.x -= b.x; a.y -= b.y; return a; }
+ template<typename T> __forceinline Vec2<T>& operator *=( Vec2<T>& a, const T& b ) { a.x *= b ; a.y *= b ; return a; }
+ template<typename T> __forceinline Vec2<T>& operator /=( Vec2<T>& a, const T& b ) { a.x /= b ; a.y /= b ; return a; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reduction Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline T reduce_add( const Vec2<T>& a ) { return a.x + a.y; }
+ template<typename T> __forceinline T reduce_mul( const Vec2<T>& a ) { return a.x * a.y; }
+ template<typename T> __forceinline T reduce_min( const Vec2<T>& a ) { return min(a.x, a.y); }
+ template<typename T> __forceinline T reduce_max( const Vec2<T>& a ) { return max(a.x, a.y); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline bool operator ==( const Vec2<T>& a, const Vec2<T>& b ) { return a.x == b.x && a.y == b.y; }
+ template<typename T> __forceinline bool operator !=( const Vec2<T>& a, const Vec2<T>& b ) { return a.x != b.x || a.y != b.y; }
+ template<typename T> __forceinline bool operator < ( const Vec2<T>& a, const Vec2<T>& b ) {
+ if (a.x != b.x) return a.x < b.x;
+ if (a.y != b.y) return a.y < b.y;
+ return false;
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Shift Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Vec2<T> shift_right_1( const Vec2<T>& a ) {
+ return Vec2<T>(shift_right_1(a.x),shift_right_1(a.y));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Euclidian Space Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline T dot ( const Vec2<T>& a, const Vec2<T>& b ) { return madd(a.x,b.x,a.y*b.y); }
+ template<typename T> __forceinline Vec2<T> cross ( const Vec2<T>& a ) { return Vec2<T>(-a.y,a.x); }
+ template<typename T> __forceinline T length ( const Vec2<T>& a ) { return sqrt(dot(a,a)); }
+ template<typename T> __forceinline Vec2<T> normalize( const Vec2<T>& a ) { return a*rsqrt(dot(a,a)); }
+ template<typename T> __forceinline T distance ( const Vec2<T>& a, const Vec2<T>& b ) { return length(a-b); }
+ template<typename T> __forceinline T det ( const Vec2<T>& a, const Vec2<T>& b ) { return a.x*b.y - a.y*b.x; }
+
+ template<typename T> __forceinline Vec2<T> normalize_safe( const Vec2<T>& a ) {
+ const T d = dot(a,a); return select(d == T( zero ),a, a*rsqrt(d) );
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Vec2<T> select ( bool s, const Vec2<T>& t, const Vec2<T>& f ) {
+ return Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y));
+ }
+
+ template<typename T> __forceinline Vec2<T> select ( const Vec2<bool>& s, const Vec2<T>& t, const Vec2<T>& f ) {
+ return Vec2<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y));
+ }
+
+ template<typename T> __forceinline Vec2<T> select ( const typename T::Bool& s, const Vec2<T>& t, const Vec2<T>& f ) {
+ return Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y));
+ }
+
+ template<typename T>
+ __forceinline Vec2<T> lerp(const Vec2<T>& v0, const Vec2<T>& v1, const T& t) {
+ return madd(Vec2<T>(T(1.0f)-t),v0,t*v1);
+ }
+
+ template<typename T> __forceinline int maxDim ( const Vec2<T>& a )
+ {
+ const Vec2<T> b = abs(a);
+ if (b.x > b.y) return 0;
+ else return 1;
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2<T>& a) {
+ return cout << "(" << a.x << ", " << a.y << ")";
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Default template instantiations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ typedef Vec2<bool > Vec2b;
+ typedef Vec2<int > Vec2i;
+ typedef Vec2<float> Vec2f;
+}
+
+#include "vec2fa.h"
+
+#if defined __SSE__
+#include "../simd/sse.h"
+#endif
+
+#if defined __AVX__
+#include "../simd/avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "../simd/avx512.h"
+#endif
+
+namespace embree
+{
+ template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
+
+#if defined(__SSE__)
+ template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
+#endif
+
+#if defined(__AVX__)
+ template<> __forceinline Vec2<vfloat8>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
+#endif
+
+#if defined(__AVX512F__)
+ template<> __forceinline Vec2<vfloat16>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
+#endif
+}
diff --git a/thirdparty/embree/common/math/vec2fa.h b/thirdparty/embree/common/math/vec2fa.h
new file mode 100644
index 0000000000..a51fb68fd0
--- /dev/null
+++ b/thirdparty/embree/common/math/vec2fa.h
@@ -0,0 +1,301 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "math.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+ ////////////////////////////////////////////////////////////////////////////////
+ /// SSE Vec2fa Type
+ ////////////////////////////////////////////////////////////////////////////////
+
+ struct __aligned(16) Vec2fa
+ {
+ ALIGNED_STRUCT_(16);
+
+ typedef float Scalar;
+ enum { N = 2 };
+ union {
+ __m128 m128;
+ struct { float x,y,az,aw; };
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec2fa( ) {}
+ __forceinline Vec2fa( const __m128 a ) : m128(a) {}
+
+ __forceinline Vec2fa ( const Vec2<float>& other ) { x = other.x; y = other.y; }
+ __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; }
+
+ __forceinline Vec2fa ( const Vec2fa& other ) { m128 = other.m128; }
+ __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; }
+
+ __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {}
+ __forceinline Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {}
+
+ __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
+
+ __forceinline operator const __m128&() const { return m128; }
+ __forceinline operator __m128&() { return m128; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Loads and Stores
+ ////////////////////////////////////////////////////////////////////////////////
+
+ static __forceinline Vec2fa load( const void* const a ) {
+ return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
+ }
+
+ static __forceinline Vec2fa loadu( const void* const a ) {
+ return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
+ }
+
+ static __forceinline void storeu ( void* ptr, const Vec2fa& v ) {
+ _mm_storeu_ps((float*)ptr,v);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec2fa( ZeroTy ) : m128(_mm_setzero_ps()) {}
+ __forceinline Vec2fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {}
+ __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+ __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; }
+ __forceinline float& operator []( const size_t index ) { assert(index < 2); return (&x)[index]; }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; }
+ __forceinline Vec2fa operator -( const Vec2fa& a ) {
+ const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+ return _mm_xor_ps(a.m128, mask);
+ }
+ __forceinline Vec2fa abs ( const Vec2fa& a ) {
+ const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+ return _mm_and_ps(a.m128, mask);
+ }
+ __forceinline Vec2fa sign ( const Vec2fa& a ) {
+ return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero)));
+ }
+
+ __forceinline Vec2fa rcp ( const Vec2fa& a )
+ {
+#if defined(__AVX512VL__)
+ const Vec2fa r = _mm_rcp14_ps(a.m128);
+#else
+ const Vec2fa r = _mm_rcp_ps(a.m128);
+#endif
+
+#if defined(__AVX2__)
+ const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f)));
+#else
+ const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a)));
+ //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#endif
+
+ return res;
+ }
+
+ __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); }
+ __forceinline Vec2fa sqr ( const Vec2fa& a ) { return _mm_mul_ps(a,a); }
+
+ __forceinline Vec2fa rsqrt( const Vec2fa& a )
+ {
+#if defined(__AVX512VL__)
+ __m128 r = _mm_rsqrt14_ps(a.m128);
+#else
+ __m128 r = _mm_rsqrt_ps(a.m128);
+#endif
+ return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+ }
+
+ __forceinline Vec2fa zero_fix(const Vec2fa& a) {
+ return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
+ }
+ __forceinline Vec2fa rcp_safe(const Vec2fa& a) {
+ return rcp(zero_fix(a));
+ }
+ __forceinline Vec2fa log ( const Vec2fa& a ) {
+ return Vec2fa(logf(a.x),logf(a.y));
+ }
+
+ __forceinline Vec2fa exp ( const Vec2fa& a ) {
+ return Vec2fa(expf(a.x),expf(a.y));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); }
+ __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
+ __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
+ __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); }
+ __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; }
+ __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); }
+ __forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
+ __forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
+
+ __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); }
+ __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); }
+
+#if defined(__SSE4_1__)
+ __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
+ const vint4 ai = _mm_castps_si128(a);
+ const vint4 bi = _mm_castps_si128(b);
+ const vint4 ci = _mm_min_epi32(ai,bi);
+ return _mm_castsi128_ps(ci);
+ }
+#endif
+
+#if defined(__SSE4_1__)
+ __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
+ const vint4 ai = _mm_castps_si128(a);
+ const vint4 bi = _mm_castps_si128(b);
+ const vint4 ci = _mm_max_epi32(ai,bi);
+ return _mm_castsi128_ps(ci);
+ }
+#endif
+
+ __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) {
+ return Vec2fa(powf(a.x,b),powf(a.y,b));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Ternary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__)
+ __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); }
+ __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); }
+ __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); }
+ __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); }
+#else
+ __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; }
+ __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; }
+ __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;}
+ __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; }
+#endif
+
+ __forceinline Vec2fa madd ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); }
+ __forceinline Vec2fa msub ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); }
+ __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); }
+ __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; }
+ __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; }
+ __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; }
+ __forceinline Vec2fa& operator *=( Vec2fa& a, const float b ) { return a = a * b; }
+ __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; }
+ __forceinline Vec2fa& operator /=( Vec2fa& a, const float b ) { return a = a / b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; }
+ __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; }
+ __forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); }
+ __forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; }
+ __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Euclidian Space Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE4_1__)
+ __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
+ return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F));
+ }
+#else
+ __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
+ return reduce_add(a*b);
+ }
+#endif
+
+ __forceinline Vec2fa cross ( const Vec2fa& a ) {
+ return Vec2fa(-a.y,a.x);
+ }
+
+ __forceinline float sqr_length ( const Vec2fa& a ) { return dot(a,a); }
+ __forceinline float rcp_length ( const Vec2fa& a ) { return rsqrt(dot(a,a)); }
+ __forceinline float rcp_length2( const Vec2fa& a ) { return rcp(dot(a,a)); }
+ __forceinline float length ( const Vec2fa& a ) { return sqrt(dot(a,a)); }
+ __forceinline Vec2fa normalize( const Vec2fa& a ) { return a*rsqrt(dot(a,a)); }
+ __forceinline float distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) {
+ __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
+ return blendv_ps(f, t, mask);
+ }
+
+ __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) {
+ return madd(1.0f-t,v0,t*v1);
+ }
+
+ __forceinline int maxDim ( const Vec2fa& a )
+ {
+ const Vec2fa b = abs(a);
+ if (b.x > b.y) return 0;
+ else return 1;
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Rounding Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__)
+ //__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); }
+ __forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); }
+ __forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); }
+#elif defined (__SSE4_1__)
+ //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
+ __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
+ __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
+#else
+ //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); }
+ __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); }
+ __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) {
+ return cout << "(" << a.x << ", " << a.y << ")";
+ }
+
+ typedef Vec2fa Vec2fa_t;
+}
diff --git a/thirdparty/embree/common/math/vec3.h b/thirdparty/embree/common/math/vec3.h
new file mode 100644
index 0000000000..ce94eff327
--- /dev/null
+++ b/thirdparty/embree/common/math/vec3.h
@@ -0,0 +1,337 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+
+namespace embree
+{
+ struct Vec3fa;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Generic 3D vector Class
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> struct Vec3
+ {
+ enum { N = 3 };
+
+ union {
+ struct {
+ T x, y, z;
+ };
+#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
+ T components[N];
+#endif
+ };
+
+ typedef T Scalar;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Construction
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3( ) {}
+ __forceinline explicit Vec3( const T& a ) : x(a), y(a), z(a) {}
+ __forceinline Vec3( const T& x, const T& y, const T& z ) : x(x), y(y), z(z) {}
+
+ __forceinline Vec3( const Vec3& other ) { x = other.x; y = other.y; z = other.z; }
+ __forceinline Vec3( const Vec3fa& other );
+
+ template<typename T1> __forceinline Vec3( const Vec3<T1>& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)) {}
+ template<typename T1> __forceinline Vec3& operator =(const Vec3<T1>& other) { x = other.x; y = other.y; z = other.z; return *this; }
+
+ __forceinline Vec3& operator =(const Vec3& other) { x = other.x; y = other.y; z = other.z; return *this; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3( ZeroTy ) : x(zero), y(zero), z(zero) {}
+ __forceinline Vec3( OneTy ) : x(one), y(one), z(one) {}
+ __forceinline Vec3( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf) {}
+ __forceinline Vec3( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf) {}
+
+#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler
+ __forceinline const T& operator []( const size_t axis ) const { assert(axis < 3); return (&x)[axis]; }
+ __forceinline T& operator []( const size_t axis ) { assert(axis < 3); return (&x)[axis]; }
+#else
+ __forceinline const T& operator [](const size_t axis) const { assert(axis < 3); return components[axis]; }
+ __forceinline T& operator [](const size_t axis) { assert(axis < 3); return components[axis]; }
+#endif
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a ) { return Vec3<T>(+a.x, +a.y, +a.z); }
+ template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a ) { return Vec3<T>(-a.x, -a.y, -a.z); }
+ template<typename T> __forceinline Vec3<T> abs ( const Vec3<T>& a ) { return Vec3<T>(abs (a.x), abs (a.y), abs (a.z)); }
+ template<typename T> __forceinline Vec3<T> rcp ( const Vec3<T>& a ) { return Vec3<T>(rcp (a.x), rcp (a.y), rcp (a.z)); }
+ template<typename T> __forceinline Vec3<T> rsqrt ( const Vec3<T>& a ) { return Vec3<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z)); }
+ template<typename T> __forceinline Vec3<T> sqrt ( const Vec3<T>& a ) { return Vec3<T>(sqrt (a.x), sqrt (a.y), sqrt (a.z)); }
+
+ template<typename T> __forceinline Vec3<T> zero_fix( const Vec3<T>& a )
+ {
+ return Vec3<T>(select(abs(a.x)<min_rcp_input,T(min_rcp_input),a.x),
+ select(abs(a.y)<min_rcp_input,T(min_rcp_input),a.y),
+ select(abs(a.z)<min_rcp_input,T(min_rcp_input),a.z));
+ }
+ template<typename T> __forceinline Vec3<T> rcp_safe(const Vec3<T>& a) { return rcp(zero_fix(a)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x + b.x, a.y + b.y, a.z + b.z); }
+ template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x - b.x, a.y - b.y, a.z - b.z); }
+ template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x * b.x, a.y * b.y, a.z * b.z); }
+ template<typename T> __forceinline Vec3<T> operator *( const T& a, const Vec3<T>& b ) { return Vec3<T>(a * b.x, a * b.y, a * b.z); }
+ template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const T& b ) { return Vec3<T>(a.x * b , a.y * b , a.z * b ); }
+ template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const T& b ) { return Vec3<T>(a.x / b , a.y / b , a.z / b ); }
+ template<typename T> __forceinline Vec3<T> operator /( const T& a, const Vec3<T>& b ) { return Vec3<T>(a / b.x, a / b.y, a / b.z); }
+ template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x / b.x, a.y / b.y, a.z / b.z); }
+
+ template<typename T> __forceinline Vec3<T> min(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); }
+ template<typename T> __forceinline Vec3<T> max(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); }
+
+ template<typename T> __forceinline Vec3<T> operator >>( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x >> b, a.y >> b, a.z >> b); }
+ template<typename T> __forceinline Vec3<T> operator <<( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x << b, a.y << b, a.z << b); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Ternary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Vec3<T> madd ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z)); }
+ template<typename T> __forceinline Vec3<T> msub ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z)); }
+ template<typename T> __forceinline Vec3<T> nmadd ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z));}
+ template<typename T> __forceinline Vec3<T> nmsub ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z)); }
+
+ template<typename T> __forceinline Vec3<T> madd ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z)); }
+ template<typename T> __forceinline Vec3<T> msub ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z)); }
+ template<typename T> __forceinline Vec3<T> nmadd ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z));}
+ template<typename T> __forceinline Vec3<T> nmsub ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const T b ) { a.x += b; a.y += b; a.z += b; return a; }
+ template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const Vec3<T>& b ) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
+ template<typename T> __forceinline Vec3<T>& operator -=( Vec3<T>& a, const Vec3<T>& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
+ template<typename T> __forceinline Vec3<T>& operator *=( Vec3<T>& a, const T& b ) { a.x *= b ; a.y *= b ; a.z *= b ; return a; }
+ template<typename T> __forceinline Vec3<T>& operator /=( Vec3<T>& a, const T& b ) { a.x /= b ; a.y /= b ; a.z /= b ; return a; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reduction Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline T reduce_add( const Vec3<T>& a ) { return a.x + a.y + a.z; }
+ template<typename T> __forceinline T reduce_mul( const Vec3<T>& a ) { return a.x * a.y * a.z; }
+ template<typename T> __forceinline T reduce_min( const Vec3<T>& a ) { return min(a.x, a.y, a.z); }
+ template<typename T> __forceinline T reduce_max( const Vec3<T>& a ) { return max(a.x, a.y, a.z); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline bool operator ==( const Vec3<T>& a, const Vec3<T>& b ) { return a.x == b.x && a.y == b.y && a.z == b.z; }
+ template<typename T> __forceinline bool operator !=( const Vec3<T>& a, const Vec3<T>& b ) { return a.x != b.x || a.y != b.y || a.z != b.z; }
+ template<typename T> __forceinline bool operator < ( const Vec3<T>& a, const Vec3<T>& b ) {
+ if (a.x != b.x) return a.x < b.x;
+ if (a.y != b.y) return a.y < b.y;
+ if (a.z != b.z) return a.z < b.z;
+ return false;
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Shift Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Vec3<T> shift_right_1( const Vec3<T>& a ) {
+ return Vec3<T>(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Vec3<T> select ( bool s, const Vec3<T>& t, const Vec3<T>& f ) {
+ return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z));
+ }
+
+ template<typename T> __forceinline Vec3<T> select ( const Vec3<bool>& s, const Vec3<T>& t, const Vec3<T>& f ) {
+ return Vec3<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z));
+ }
+
+ template<typename T> __forceinline Vec3<T> select ( const typename T::Bool& s, const Vec3<T>& t, const Vec3<T>& f ) {
+ return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z));
+ }
+
+ template<typename T>
+ __forceinline Vec3<T> lerp(const Vec3<T>& v0, const Vec3<T>& v1, const T& t) {
+ return madd(Vec3<T>(T(1.0f)-t),v0,t*v1);
+ }
+
+ template<typename T> __forceinline int maxDim ( const Vec3<T>& a )
+ {
+ const Vec3<T> b = abs(a);
+ if (b.x > b.y) {
+ if (b.x > b.z) return 0; else return 2;
+ } else {
+ if (b.y > b.z) return 1; else return 2;
+ }
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Vec3<bool> eq_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x==b.x,a.y==b.y,a.z==b.z); }
+ template<typename T> __forceinline Vec3<bool> neq_mask(const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x!=b.x,a.y!=b.y,a.z!=b.z); }
+ template<typename T> __forceinline Vec3<bool> lt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x< b.x,a.y< b.y,a.z< b.z); }
+ template<typename T> __forceinline Vec3<bool> le_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x<=b.x,a.y<=b.y,a.z<=b.z); }
+ template<typename T> __forceinline Vec3<bool> gt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x> b.x,a.y> b.y,a.z> b.z); }
+ template<typename T> __forceinline Vec3<bool> ge_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x>=b.x,a.y>=b.y,a.z>=b.z); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Euclidian Space Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline T sqr ( const Vec3<T>& a ) { return dot(a,a); }
+ template<typename T> __forceinline T dot ( const Vec3<T>& a, const Vec3<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,a.z*b.z)); }
+ template<typename T> __forceinline T length ( const Vec3<T>& a ) { return sqrt(sqr(a)); }
+ template<typename T> __forceinline T rcp_length( const Vec3<T>& a ) { return rsqrt(sqr(a)); }
+ template<typename T> __forceinline Vec3<T> normalize( const Vec3<T>& a ) { return a*rsqrt(sqr(a)); }
+ template<typename T> __forceinline T distance ( const Vec3<T>& a, const Vec3<T>& b ) { return length(a-b); }
+ template<typename T> __forceinline Vec3<T> cross ( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(msub(a.y,b.z,a.z*b.y), msub(a.z,b.x,a.x*b.z), msub(a.x,b.y,a.y*b.x)); }
+
+ template<typename T> __forceinline Vec3<T> stable_triangle_normal( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c )
+ {
+ const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x;
+ const T bc_x = b.z*c.y, bc_y = b.x*c.z, bc_z = b.y*c.x;
+ const Vec3<T> cross_ab(msub(a.y,b.z,ab_x), msub(a.z,b.x,ab_y), msub(a.x,b.y,ab_z));
+ const Vec3<T> cross_bc(msub(b.y,c.z,bc_x), msub(b.z,c.x,bc_y), msub(b.x,c.y,bc_z));
+ const auto sx = abs(ab_x) < abs(bc_x);
+ const auto sy = abs(ab_y) < abs(bc_y);
+ const auto sz = abs(ab_z) < abs(bc_z);
+ return Vec3<T>(select(sx,cross_ab.x,cross_bc.x),
+ select(sy,cross_ab.y,cross_bc.y),
+ select(sz,cross_ab.z,cross_bc.z));
+ }
+
+ template<typename T> __forceinline T sum ( const Vec3<T>& a ) { return a.x+a.y+a.z; }
+
+ template<typename T> __forceinline T halfArea ( const Vec3<T>& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); }
+ template<typename T> __forceinline T area ( const Vec3<T>& d ) { return 2.0f*halfArea(d); }
+
+ template<typename T> __forceinline Vec3<T> normalize_safe( const Vec3<T>& a ) {
+ const T d = dot(a,a); return select(d == T( zero ), a , a*rsqrt(d) );
+ }
+
+ template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& P, const Vec3<T>& Q0, const Vec3<T>& Q1)
+ {
+ const Vec3<T> N = cross(P-Q0,Q1-Q0);
+ const Vec3<T> D = Q1-Q0;
+ return dot(N,N)*rcp(dot(D,D));
+ }
+
+ template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& PmQ0, const Vec3<T>& Q1mQ0)
+ {
+ const Vec3<T> N = cross(PmQ0,Q1mQ0);
+ const Vec3<T> D = Q1mQ0;
+ return dot(N,N)*rcp(dot(D,D));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3<T>& a) {
+ return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
+ }
+
+ typedef Vec3<bool > Vec3b;
+ typedef Vec3<int > Vec3i;
+ typedef Vec3<float> Vec3f;
+}
+
+#include "vec3ba.h"
+#include "vec3ia.h"
+#include "vec3fa.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// SSE / AVX / MIC specializations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined __SSE__
+#include "../simd/sse.h"
+#endif
+
+#if defined __AVX__
+#include "../simd/avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "../simd/avx512.h"
+#endif
+
+namespace embree
+{
+ template<typename Out, typename In>
+ __forceinline Vec3<Out> broadcast(const Vec3<In>& a, const size_t k) {
+ return Vec3<Out>(Out(a.x[k]), Out(a.y[k]), Out(a.z[k]));
+ }
+
+ template<> __forceinline Vec3<float>::Vec3(const Vec3fa& a) { x = a.x; y = a.y; z = a.z; }
+
+#if defined(__AVX__)
+ template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
+ x = a.x; y = a.y; z = a.z;
+ }
+#elif defined(__SSE__)
+ template<>
+ __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
+ const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v);
+ }
+#endif
+
+#if defined(__SSE__)
+ template<>
+ __forceinline Vec3<vfloat4> broadcast<vfloat4,vfloat4>(const Vec3<vfloat4>& a, const size_t k) {
+ return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline Vec3<vfloat4> shuffle(const Vec3<vfloat4>& b) {
+ return Vec3<vfloat4>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z));
+ }
+#endif
+
+#if defined(__AVX__)
+ template<>
+ __forceinline Vec3<vfloat8>::Vec3(const Vec3fa& a) {
+ x = a.x; y = a.y; z = a.z;
+ }
+
+ template<>
+ __forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat4>(const Vec3<vfloat4>& a, const size_t k) {
+ return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]));
+ }
+ template<>
+ __forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat8>(const Vec3<vfloat8>& a, const size_t k) {
+ return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline Vec3<vfloat8> shuffle(const Vec3<vfloat8>& b) {
+ return Vec3<vfloat8>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z));
+ }
+#endif
+
+#if defined(__AVX512F__)
+ template<> __forceinline Vec3<vfloat16>::Vec3(const Vec3fa& a) : x(a.x), y(a.y), z(a.z) {}
+#endif
+}
diff --git a/thirdparty/embree/common/math/vec3ba.h b/thirdparty/embree/common/math/vec3ba.h
new file mode 100644
index 0000000000..a021b522dc
--- /dev/null
+++ b/thirdparty/embree/common/math/vec3ba.h
@@ -0,0 +1,120 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "math.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+ ////////////////////////////////////////////////////////////////////////////////
+ /// SSE Vec3ba Type
+ ////////////////////////////////////////////////////////////////////////////////
+
+ struct __aligned(16) Vec3ba
+ {
+ ALIGNED_STRUCT_(16);
+
+ union {
+ __m128 m128;
+ struct { int x,y,z; };
+ };
+
+ typedef int Scalar;
+ enum { N = 3 };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3ba( ) {}
+ __forceinline Vec3ba( const __m128 input ) : m128(input) {}
+ __forceinline Vec3ba( const Vec3ba& other ) : m128(other.m128) {}
+ __forceinline Vec3ba& operator =(const Vec3ba& other) { m128 = other.m128; return *this; }
+
+ __forceinline explicit Vec3ba( bool a )
+ : m128(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
+ __forceinline Vec3ba( bool a, bool b, bool c)
+ : m128(mm_lookupmask_ps[(size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+
+ __forceinline operator const __m128&() const { return m128; }
+ __forceinline operator __m128&() { return m128; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3ba( FalseTy ) : m128(_mm_setzero_ps()) {}
+ __forceinline Vec3ba( TrueTy ) : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+ __forceinline int& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; }
+ };
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3ba operator !( const Vec3ba& a ) { return _mm_xor_ps(a.m128, Vec3ba(embree::True)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3ba operator &( const Vec3ba& a, const Vec3ba& b ) { return _mm_and_ps(a.m128, b.m128); }
+ __forceinline Vec3ba operator |( const Vec3ba& a, const Vec3ba& b ) { return _mm_or_ps (a.m128, b.m128); }
+ __forceinline Vec3ba operator ^( const Vec3ba& a, const Vec3ba& b ) { return _mm_xor_ps(a.m128, b.m128); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3ba& operator &=( Vec3ba& a, const Vec3ba& b ) { return a = a & b; }
+ __forceinline Vec3ba& operator |=( Vec3ba& a, const Vec3ba& b ) { return a = a | b; }
+ __forceinline Vec3ba& operator ^=( Vec3ba& a, const Vec3ba& b ) { return a = a ^ b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool operator ==( const Vec3ba& a, const Vec3ba& b ) {
+ return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) == 7;
+ }
+ __forceinline bool operator !=( const Vec3ba& a, const Vec3ba& b ) {
+ return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) != 7;
+ }
+ __forceinline bool operator < ( const Vec3ba& a, const Vec3ba& b ) {
+ if (a.x != b.x) return a.x < b.x;
+ if (a.y != b.y) return a.y < b.y;
+ if (a.z != b.z) return a.z < b.z;
+ return false;
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reduction Operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool reduce_and( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) == 0x7; }
+ __forceinline bool reduce_or ( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) != 0x0; }
+
+ __forceinline bool all ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x7; }
+ __forceinline bool any ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) != 0x0; }
+ __forceinline bool none ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x0; }
+
+ __forceinline size_t movemask(const Vec3ba& a) { return _mm_movemask_ps(a) & 0x7; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ba& a) {
+ return cout << "(" << (a.x ? "1" : "0") << ", " << (a.y ? "1" : "0") << ", " << (a.z ? "1" : "0") << ")";
+ }
+}
diff --git a/thirdparty/embree/common/math/vec3fa.h b/thirdparty/embree/common/math/vec3fa.h
new file mode 100644
index 0000000000..586039741d
--- /dev/null
+++ b/thirdparty/embree/common/math/vec3fa.h
@@ -0,0 +1,727 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "math.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+ ////////////////////////////////////////////////////////////////////////////////
+ /// SSE Vec3fa Type
+ ////////////////////////////////////////////////////////////////////////////////
+
+ struct __aligned(16) Vec3fa
+ {
+ ALIGNED_STRUCT_(16);
+
+ typedef float Scalar;
+ enum { N = 3 };
+ union {
+ __m128 m128;
+ struct { float x,y,z; };
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3fa( ) {}
+ __forceinline Vec3fa( const __m128 a ) : m128(a) {}
+
+ __forceinline Vec3fa ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); }
+ //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
+
+ __forceinline Vec3fa ( const Vec3fa& other ) { m128 = other.m128; }
+ __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; }
+
+ __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {}
+ __forceinline Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {}
+
+ __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
+
+ __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
+ __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); }
+ __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); }
+ __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }
+
+ //__forceinline operator const __m128&() const { return m128; }
+ //__forceinline operator __m128&() { return m128; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Loads and Stores
+ ////////////////////////////////////////////////////////////////////////////////
+
+ static __forceinline Vec3fa load( const void* const a ) {
+ return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
+ }
+
+ static __forceinline Vec3fa loadu( const void* const a ) {
+ return Vec3fa(_mm_loadu_ps((float*)a));
+ }
+
+ static __forceinline void storeu ( void* ptr, const Vec3fa& v ) {
+ _mm_storeu_ps((float*)ptr,v.m128);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3fa( ZeroTy ) : m128(_mm_setzero_ps()) {}
+ __forceinline Vec3fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {}
+ __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+ __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+ __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; }
+ __forceinline Vec3fa operator -( const Vec3fa& a ) {
+ const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+ return _mm_xor_ps(a.m128, mask);
+ }
+ __forceinline Vec3fa abs ( const Vec3fa& a ) {
+ const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+ return _mm_and_ps(a.m128, mask);
+ }
+ __forceinline Vec3fa sign ( const Vec3fa& a ) {
+ return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128));
+ }
+
+ __forceinline Vec3fa rcp ( const Vec3fa& a )
+ {
+#if defined(__AVX512VL__)
+ const Vec3fa r = _mm_rcp14_ps(a.m128);
+#else
+ const Vec3fa r = _mm_rcp_ps(a.m128);
+#endif
+
+#if defined(__AVX2__)
+ const Vec3fa res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f)));
+#else
+ const Vec3fa res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128)));
+ //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#endif
+
+ return res;
+ }
+
+ __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); }
+ __forceinline Vec3fa sqr ( const Vec3fa& a ) { return _mm_mul_ps(a.m128,a.m128); }
+
+ __forceinline Vec3fa rsqrt( const Vec3fa& a )
+ {
+#if defined(__AVX512VL__)
+ __m128 r = _mm_rsqrt14_ps(a.m128);
+#else
+ __m128 r = _mm_rsqrt_ps(a.m128);
+#endif
+ return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+ }
+
+ __forceinline Vec3fa zero_fix(const Vec3fa& a) {
+ return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
+ }
+ __forceinline Vec3fa rcp_safe(const Vec3fa& a) {
+ return rcp(zero_fix(a));
+ }
+ __forceinline Vec3fa log ( const Vec3fa& a ) {
+ return Vec3fa(logf(a.x),logf(a.y),logf(a.z));
+ }
+
+ __forceinline Vec3fa exp ( const Vec3fa& a ) {
+ return Vec3fa(expf(a.x),expf(a.y),expf(a.z));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); }
+ __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
+ __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
+ __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); }
+ __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; }
+ __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); }
+ __forceinline Vec3fa operator /( const Vec3fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
+ __forceinline Vec3fa operator /( const float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
+
+ __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); }
+ __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); }
+
+#if defined(__SSE4_1__)
+ __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) {
+ const vint4 ai = _mm_castps_si128(a.m128);
+ const vint4 bi = _mm_castps_si128(b.m128);
+ const vint4 ci = _mm_min_epi32(ai,bi);
+ return _mm_castsi128_ps(ci);
+ }
+#endif
+
+#if defined(__SSE4_1__)
+ __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) {
+ const vint4 ai = _mm_castps_si128(a.m128);
+ const vint4 bi = _mm_castps_si128(b.m128);
+ const vint4 ci = _mm_max_epi32(ai,bi);
+ return _mm_castsi128_ps(ci);
+ }
+#endif
+
+ __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) {
+ return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Ternary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__)
+ __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
+ __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
+ __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
+ __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
+#else
+ __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; }
+ __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
+ __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;}
+ __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; }
+#endif
+
+ __forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); }
+ __forceinline Vec3fa msub ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); }
+ __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); }
+ __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; }
+ __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; }
+ __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; }
+ __forceinline Vec3fa& operator *=( Vec3fa& a, const float b ) { return a = a * b; }
+ __forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { return a = a / b; }
+ __forceinline Vec3fa& operator /=( Vec3fa& a, const float b ) { return a = a / b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline float reduce_add(const Vec3fa& v) {
+ const vfloat4 a(v.m128);
+ const vfloat4 b = shuffle<1>(a);
+ const vfloat4 c = shuffle<2>(a);
+ return _mm_cvtss_f32(a+b+c);
+ }
+
+ __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
+ __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); }
+ __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }
+ __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }
+
+ __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); }
+ __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
+ __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
+ __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); }
+ __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnle_ps(a.m128, b.m128); }
+ __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); }
+
+ __forceinline bool isvalid ( const Vec3fa& v ) {
+ return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE)));
+ }
+
+ __forceinline bool is_finite ( const Vec3fa& a ) {
+ return all(ge_mask(a,Vec3fa(-FLT_MAX)) & le_mask(a,Vec3fa(+FLT_MAX)));
+ }
+
+ __forceinline bool isvalid4 ( const Vec3fa& v ) {
+ return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));
+ }
+
+ __forceinline bool is_finite4 ( const Vec3fa& a ) {
+ return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Euclidian Space Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE4_1__)
+ __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
+ return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F));
+ }
+#else
+ __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
+ return reduce_add(a*b);
+ }
+#endif
+
+ __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b )
+ {
+ vfloat4 a0 = vfloat4(a.m128);
+ vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));
+ vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));
+ vfloat4 b1 = vfloat4(b.m128);
+ return Vec3fa(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)));
+ }
+
+ __forceinline float sqr_length ( const Vec3fa& a ) { return dot(a,a); }
+ __forceinline float rcp_length ( const Vec3fa& a ) { return rsqrt(dot(a,a)); }
+ __forceinline float rcp_length2( const Vec3fa& a ) { return rcp(dot(a,a)); }
+ __forceinline float length ( const Vec3fa& a ) { return sqrt(dot(a,a)); }
+ __forceinline Vec3fa normalize( const Vec3fa& a ) { return a*rsqrt(dot(a,a)); }
+ __forceinline float distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a-b); }
+ __forceinline float halfArea ( const Vec3fa& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); }
+ __forceinline float area ( const Vec3fa& d ) { return 2.0f*halfArea(d); }
+
+ __forceinline Vec3fa normalize_safe( const Vec3fa& a ) {
+ const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
+ }
+
+ /*! differentiated normalization */
+ __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp)
+ {
+ const float pp = dot(p,p);
+ const float pdp = dot(p,dp);
+ return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) {
+ __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
+ return blendv_ps(f.m128, t.m128, mask);
+ }
+
+ __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) {
+ return blendv_ps(f.m128, t.m128, s);
+ }
+
+ __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) {
+ return madd(1.0f-t,v0,t*v1);
+ }
+
+ __forceinline int maxDim ( const Vec3fa& a )
+ {
+ const Vec3fa b = abs(a);
+ if (b.x > b.y) {
+ if (b.x > b.z) return 0; else return 2;
+ } else {
+ if (b.y > b.z) return 1; else return 2;
+ }
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Rounding Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined (__SSE4_1__)
+ __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
+ __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }
+ __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }
+#else
+ __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); }
+ __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); }
+ __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) {
+ return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
+ }
+
+ typedef Vec3fa Vec3fa_t;
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// SSE Vec3fx Type
+ ////////////////////////////////////////////////////////////////////////////////
+
+ struct __aligned(16) Vec3fx
+ {
+ ALIGNED_STRUCT_(16);
+
+ typedef float Scalar;
+ enum { N = 3 };
+ union {
+ __m128 m128;
+ struct { float x,y,z; union { int a; unsigned u; float w; }; };
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3fx( ) {}
+ __forceinline Vec3fx( const __m128 a ) : m128(a) {}
+
+ __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {}
+ __forceinline operator Vec3fa () const { return Vec3fa(m128); }
+
+ __forceinline explicit Vec3fx ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); }
+ //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
+
+ __forceinline Vec3fx ( const Vec3fx& other ) { m128 = other.m128; }
+
+ __forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; }
+
+ __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {}
+ __forceinline Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {}
+
+ __forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; }
+ __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; }
+ __forceinline Vec3fx( const Vec3fa& other, const float w1) {
+#if defined (__SSE4_1__)
+ m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4);
+#else
+ const vint4 mask(-1,-1,-1,0);
+ m128 = select(vboolf4(_mm_castsi128_ps(mask)),vfloat4(other.m128),vfloat4(w1));
+#endif
+ }
+ //__forceinline Vec3fx( const float x, const float y, const float z, const int a) : x(x), y(y), z(z), a(a) {} // not working properly!
+ //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly!
+ __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {}
+
+ //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
+
+ __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
+ __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); }
+ __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); }
+ __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }
+
+ //__forceinline operator const __m128&() const { return m128; }
+ //__forceinline operator __m128&() { return m128; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Loads and Stores
+ ////////////////////////////////////////////////////////////////////////////////
+
+ static __forceinline Vec3fx load( const void* const a ) {
+ return Vec3fx(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
+ }
+
+ static __forceinline Vec3fx loadu( const void* const a ) {
+ return Vec3fx(_mm_loadu_ps((float*)a));
+ }
+
+ static __forceinline void storeu ( void* ptr, const Vec3fx& v ) {
+ _mm_storeu_ps((float*)ptr,v.m128);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3fx( ZeroTy ) : m128(_mm_setzero_ps()) {}
+ __forceinline Vec3fx( OneTy ) : m128(_mm_set1_ps(1.0f)) {}
+ __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+ __forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+ __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; }
+ __forceinline Vec3fx operator -( const Vec3fx& a ) {
+ const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+ return _mm_xor_ps(a.m128, mask);
+ }
+ __forceinline Vec3fx abs ( const Vec3fx& a ) {
+ const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+ return _mm_and_ps(a.m128, mask);
+ }
+ __forceinline Vec3fx sign ( const Vec3fx& a ) {
+ return blendv_ps(Vec3fx(one).m128, (-Vec3fx(one)).m128, _mm_cmplt_ps (a.m128,Vec3fx(zero).m128));
+ }
+
+ __forceinline Vec3fx rcp ( const Vec3fx& a )
+ {
+#if defined(__AVX512VL__)
+ const Vec3fx r = _mm_rcp14_ps(a.m128);
+#else
+ const Vec3fx r = _mm_rcp_ps(a.m128);
+#endif
+
+#if defined(__AVX2__)
+ const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f)));
+#else
+ const Vec3fx res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128)));
+ //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#endif
+
+ return res;
+ }
+
+ __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.m128); }
+ __forceinline Vec3fx sqr ( const Vec3fx& a ) { return _mm_mul_ps(a.m128,a.m128); }
+
+ __forceinline Vec3fx rsqrt( const Vec3fx& a )
+ {
+#if defined(__AVX512VL__)
+ __m128 r = _mm_rsqrt14_ps(a.m128);
+#else
+ __m128 r = _mm_rsqrt_ps(a.m128);
+#endif
+ return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+ }
+
+ __forceinline Vec3fx zero_fix(const Vec3fx& a) {
+ return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
+ }
+ __forceinline Vec3fx rcp_safe(const Vec3fx& a) {
+ return rcp(zero_fix(a));
+ }
+ __forceinline Vec3fx log ( const Vec3fx& a ) {
+ return Vec3fx(logf(a.x),logf(a.y),logf(a.z));
+ }
+
+ __forceinline Vec3fx exp ( const Vec3fx& a ) {
+ return Vec3fx(expf(a.x),expf(a.y),expf(a.z));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.m128, b.m128); }
+ __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.m128, b.m128); }
+ __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.m128, b.m128); }
+ __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); }
+ __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; }
+ __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.m128,b.m128); }
+ __forceinline Vec3fx operator /( const Vec3fx& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
+ __forceinline Vec3fx operator /( const float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
+
+ __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); }
+ __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); }
+
+#if defined(__SSE4_1__)
+ __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) {
+ const vint4 ai = _mm_castps_si128(a.m128);
+ const vint4 bi = _mm_castps_si128(b.m128);
+ const vint4 ci = _mm_min_epi32(ai,bi);
+ return _mm_castsi128_ps(ci);
+ }
+#endif
+
+#if defined(__SSE4_1__)
+ __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) {
+ const vint4 ai = _mm_castps_si128(a.m128);
+ const vint4 bi = _mm_castps_si128(b.m128);
+ const vint4 ci = _mm_max_epi32(ai,bi);
+ return _mm_castsi128_ps(ci);
+ }
+#endif
+
+ __forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) {
+ return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Ternary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__)
+ __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
+ __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
+ __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
+ __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
+#else
+ __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; }
+ __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; }
+ __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b+c;}
+ __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b-c; }
+#endif
+
+ __forceinline Vec3fx madd ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); }
+ __forceinline Vec3fx msub ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(Vec3fx(a),b,c); }
+ __forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); }
+ __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; }
+ __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; }
+ __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; }
+ __forceinline Vec3fx& operator *=( Vec3fx& a, const float b ) { return a = a * b; }
+ __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; }
+ __forceinline Vec3fx& operator /=( Vec3fx& a, const float b ) { return a = a / b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline float reduce_add(const Vec3fx& v) {
+ const vfloat4 a(v.m128);
+ const vfloat4 b = shuffle<1>(a);
+ const vfloat4 c = shuffle<2>(a);
+ return _mm_cvtss_f32(a+b+c);
+ }
+
+ __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; }
+ __forceinline float reduce_min(const Vec3fx& v) { return min(v.x,v.y,v.z); }
+ __forceinline float reduce_max(const Vec3fx& v) { return max(v.x,v.y,v.z); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }
+ __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }
+
+ __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a.m128, b.m128); }
+ __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
+ __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
+ __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.m128, b.m128); }
+ __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.m128, b.m128); }
+ __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); }
+
+ __forceinline bool isvalid ( const Vec3fx& v ) {
+ return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE)));
+ }
+
+ __forceinline bool is_finite ( const Vec3fx& a ) {
+ return all(ge_mask(a,Vec3fx(-FLT_MAX)) & le_mask(a,Vec3fx(+FLT_MAX)));
+ }
+
+ __forceinline bool isvalid4 ( const Vec3fx& v ) {
+ return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));
+ }
+
+ __forceinline bool is_finite4 ( const Vec3fx& a ) {
+ return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Euclidian Space Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE4_1__)
+ __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
+ return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F));
+ }
+#else
+ __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
+ return reduce_add(a*b);
+ }
+#endif
+
+ __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b )
+ {
+ vfloat4 a0 = vfloat4(a.m128);
+ vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));
+ vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));
+ vfloat4 b1 = vfloat4(b.m128);
+ return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)));
+ }
+
+ __forceinline float sqr_length ( const Vec3fx& a ) { return dot(a,a); }
+ __forceinline float rcp_length ( const Vec3fx& a ) { return rsqrt(dot(a,a)); }
+ __forceinline float rcp_length2( const Vec3fx& a ) { return rcp(dot(a,a)); }
+ __forceinline float length ( const Vec3fx& a ) { return sqrt(dot(a,a)); }
+ __forceinline Vec3fx normalize( const Vec3fx& a ) { return a*rsqrt(dot(a,a)); }
+ __forceinline float distance ( const Vec3fx& a, const Vec3fx& b ) { return length(a-b); }
+ __forceinline float halfArea ( const Vec3fx& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); }
+ __forceinline float area ( const Vec3fx& d ) { return 2.0f*halfArea(d); }
+
+ __forceinline Vec3fx normalize_safe( const Vec3fx& a ) {
+ const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
+ }
+
+ /*! differentiated normalization */
+ __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp)
+ {
+ const float pp = dot(p,p);
+ const float pdp = dot(p,dp);
+ return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) {
+ __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
+ return blendv_ps(f.m128, t.m128, mask);
+ }
+
+ __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) {
+ return blendv_ps(f.m128, t.m128, s);
+ }
+
+ __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) {
+ return madd(1.0f-t,v0,t*v1);
+ }
+
+ __forceinline int maxDim ( const Vec3fx& a )
+ {
+ const Vec3fx b = abs(a);
+ if (b.x > b.y) {
+ if (b.x > b.z) return 0; else return 2;
+ } else {
+ if (b.y > b.z) return 1; else return 2;
+ }
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Rounding Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__)
+ __forceinline Vec3fx trunc(const Vec3fx& a) { return vrndq_f32(a.m128); }
+ __forceinline Vec3fx floor(const Vec3fx& a) { return vrndmq_f32(a.m128); }
+ __forceinline Vec3fx ceil (const Vec3fx& a) { return vrndpq_f32(a.m128); }
+#elif defined (__SSE4_1__)
+ __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
+ __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }
+ __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }
+#else
+ __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); }
+ __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); }
+ __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) {
+ return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
+ }
+
+
+ typedef Vec3fx Vec3ff;
+}
diff --git a/thirdparty/embree/common/math/vec3ia.h b/thirdparty/embree/common/math/vec3ia.h
new file mode 100644
index 0000000000..694804c40d
--- /dev/null
+++ b/thirdparty/embree/common/math/vec3ia.h
@@ -0,0 +1,186 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "math.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+ ////////////////////////////////////////////////////////////////////////////////
+ /// SSE Vec3ia Type
+ ////////////////////////////////////////////////////////////////////////////////
+
+ struct __aligned(16) Vec3ia
+ {
+ ALIGNED_STRUCT_(16);
+
+ union {
+ __m128i m128;
+ struct { int x,y,z; };
+ };
+
+ typedef int Scalar;
+ enum { N = 3 };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3ia( ) {}
+ __forceinline Vec3ia( const __m128i a ) : m128(a) {}
+ __forceinline Vec3ia( const Vec3ia& other ) : m128(other.m128) {}
+ __forceinline Vec3ia& operator =(const Vec3ia& other) { m128 = other.m128; return *this; }
+
+ __forceinline explicit Vec3ia( const int a ) : m128(_mm_set1_epi32(a)) {}
+ __forceinline Vec3ia( const int x, const int y, const int z) : m128(_mm_set_epi32(z, z, y, x)) {}
+ __forceinline explicit Vec3ia( const __m128 a ) : m128(_mm_cvtps_epi32(a)) {}
+
+ __forceinline operator const __m128i&() const { return m128; }
+ __forceinline operator __m128i&() { return m128; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3ia( ZeroTy ) : m128(_mm_setzero_si128()) {}
+ __forceinline Vec3ia( OneTy ) : m128(_mm_set1_epi32(1)) {}
+ __forceinline Vec3ia( PosInfTy ) : m128(_mm_set1_epi32(pos_inf)) {}
+ __forceinline Vec3ia( NegInfTy ) : m128(_mm_set1_epi32(neg_inf)) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+ __forceinline int& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; }
+ };
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3ia operator +( const Vec3ia& a ) { return a; }
+ __forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); }
+#if defined(__SSSE3__)
+ __forceinline Vec3ia abs ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3ia operator +( const Vec3ia& a, const Vec3ia& b ) { return _mm_add_epi32(a.m128, b.m128); }
+ __forceinline Vec3ia operator +( const Vec3ia& a, const int b ) { return a+Vec3ia(b); }
+ __forceinline Vec3ia operator +( const int a, const Vec3ia& b ) { return Vec3ia(a)+b; }
+
+ __forceinline Vec3ia operator -( const Vec3ia& a, const Vec3ia& b ) { return _mm_sub_epi32(a.m128, b.m128); }
+ __forceinline Vec3ia operator -( const Vec3ia& a, const int b ) { return a-Vec3ia(b); }
+ __forceinline Vec3ia operator -( const int a, const Vec3ia& b ) { return Vec3ia(a)-b; }
+
+#if defined(__SSE4_1__)
+ __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); }
+ __forceinline Vec3ia operator *( const Vec3ia& a, const int b ) { return a * Vec3ia(b); }
+ __forceinline Vec3ia operator *( const int a, const Vec3ia& b ) { return Vec3ia(a) * b; }
+#endif
+
+ __forceinline Vec3ia operator &( const Vec3ia& a, const Vec3ia& b ) { return _mm_and_si128(a.m128, b.m128); }
+ __forceinline Vec3ia operator &( const Vec3ia& a, const int b ) { return a & Vec3ia(b); }
+ __forceinline Vec3ia operator &( const int a, const Vec3ia& b ) { return Vec3ia(a) & b; }
+
+ __forceinline Vec3ia operator |( const Vec3ia& a, const Vec3ia& b ) { return _mm_or_si128(a.m128, b.m128); }
+ __forceinline Vec3ia operator |( const Vec3ia& a, const int b ) { return a | Vec3ia(b); }
+ __forceinline Vec3ia operator |( const int a, const Vec3ia& b ) { return Vec3ia(a) | b; }
+
+ __forceinline Vec3ia operator ^( const Vec3ia& a, const Vec3ia& b ) { return _mm_xor_si128(a.m128, b.m128); }
+ __forceinline Vec3ia operator ^( const Vec3ia& a, const int b ) { return a ^ Vec3ia(b); }
+ __forceinline Vec3ia operator ^( const int a, const Vec3ia& b ) { return Vec3ia(a) ^ b; }
+
+ __forceinline Vec3ia operator <<( const Vec3ia& a, const int n ) { return _mm_slli_epi32(a.m128, n); }
+ __forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) { return _mm_srai_epi32(a.m128, n); }
+
+ __forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { return _mm_slli_epi32(a.m128, b); }
+ __forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { return _mm_srai_epi32(a.m128, b); }
+ __forceinline Vec3ia srl ( const Vec3ia& a, const int b ) { return _mm_srli_epi32(a.m128, b); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3ia& operator +=( Vec3ia& a, const Vec3ia& b ) { return a = a + b; }
+ __forceinline Vec3ia& operator +=( Vec3ia& a, const int& b ) { return a = a + b; }
+
+ __forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { return a = a - b; }
+ __forceinline Vec3ia& operator -=( Vec3ia& a, const int& b ) { return a = a - b; }
+
+#if defined(__SSE4_1__)
+ __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; }
+ __forceinline Vec3ia& operator *=( Vec3ia& a, const int& b ) { return a = a * b; }
+#endif
+
+ __forceinline Vec3ia& operator &=( Vec3ia& a, const Vec3ia& b ) { return a = a & b; }
+ __forceinline Vec3ia& operator &=( Vec3ia& a, const int& b ) { return a = a & b; }
+
+ __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; }
+ __forceinline Vec3ia& operator |=( Vec3ia& a, const int& b ) { return a = a | b; }
+
+ __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; }
+ __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; }
+ __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; }
+ __forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); }
+ __forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool operator ==( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) == 7; }
+ __forceinline bool operator !=( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) != 7; }
+ __forceinline bool operator < ( const Vec3ia& a, const Vec3ia& b ) {
+ if (a.x != b.x) return a.x < b.x;
+ if (a.y != b.y) return a.y < b.y;
+ if (a.z != b.z) return a.z < b.z;
+ return false;
+ }
+
+ __forceinline Vec3ba eq_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpeq_epi32 (a.m128, b.m128)); }
+ __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.m128, b.m128)); }
+ __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.m128, b.m128)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) {
+#if defined(__SSE4_1__)
+ return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
+#else
+ return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f));
+#endif
+ }
+
+#if defined(__SSE4_1__)
+ __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); }
+ __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); }
+#else
+ __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return select(lt_mask(a,b),a,b); }
+ __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return select(gt_mask(a,b),a,b); }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ia& a) {
+ return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
+ }
+}
diff --git a/thirdparty/embree/common/math/vec4.h b/thirdparty/embree/common/math/vec4.h
new file mode 100644
index 0000000000..0ed107928a
--- /dev/null
+++ b/thirdparty/embree/common/math/vec4.h
@@ -0,0 +1,243 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+#include "vec3.h"
+
+namespace embree
+{
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Generic 4D vector Class
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> struct Vec4
+ {
+ enum { N = 4 };
+ union {
+ struct { T x, y, z, w; };
+#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
+ T components[N];
+#endif
+ };
+
+ typedef T Scalar;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Construction
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec4( ) {}
+ __forceinline explicit Vec4( const T& a ) : x(a), y(a), z(a), w(a) {}
+ __forceinline Vec4( const T& x, const T& y, const T& z, const T& w ) : x(x), y(y), z(z), w(w) {}
+ __forceinline Vec4( const Vec3<T>& xyz, const T& w ) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {}
+
+ __forceinline Vec4( const Vec4& other ) { x = other.x; y = other.y; z = other.z; w = other.w; }
+ __forceinline Vec4( const Vec3fx& other );
+
+ template<typename T1> __forceinline Vec4( const Vec4<T1>& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)), w(T(a.w)) {}
+ template<typename T1> __forceinline Vec4& operator =(const Vec4<T1>& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; }
+
+ __forceinline Vec4& operator =(const Vec4& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; }
+
+ __forceinline operator Vec3<T> () const { return Vec3<T>(x,y,z); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec4( ZeroTy ) : x(zero), y(zero), z(zero), w(zero) {}
+ __forceinline Vec4( OneTy ) : x(one), y(one), z(one), w(one) {}
+ __forceinline Vec4( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf), w(pos_inf) {}
+ __forceinline Vec4( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf), w(neg_inf) {}
+
+#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler
+ __forceinline const T& operator [](const size_t axis) const { assert(axis < 4); return (&x)[axis]; }
+ __forceinline T& operator [](const size_t axis) { assert(axis < 4); return (&x)[axis]; }
+#else
+ __forceinline const T& operator [](const size_t axis ) const { assert(axis < 4); return components[axis]; }
+ __forceinline T& operator [](const size_t axis) { assert(axis < 4); return components[axis]; }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Swizzles
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Vec3<T> xyz() const { return Vec3<T>(x, y, z); }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a ) { return Vec4<T>(+a.x, +a.y, +a.z, +a.w); }
+ template<typename T> __forceinline Vec4<T> operator -( const Vec4<T>& a ) { return Vec4<T>(-a.x, -a.y, -a.z, -a.w); }
+ template<typename T> __forceinline Vec4<T> abs ( const Vec4<T>& a ) { return Vec4<T>(abs (a.x), abs (a.y), abs (a.z), abs (a.w)); }
+ template<typename T> __forceinline Vec4<T> rcp ( const Vec4<T>& a ) { return Vec4<T>(rcp (a.x), rcp (a.y), rcp (a.z), rcp (a.w)); }
+ template<typename T> __forceinline Vec4<T> rsqrt ( const Vec4<T>& a ) { return Vec4<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z), rsqrt(a.w)); }
+ template<typename T> __forceinline Vec4<T> sqrt ( const Vec4<T>& a ) { return Vec4<T>(sqrt (a.x), sqrt (a.y), sqrt (a.z), sqrt (a.w)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+ template<typename T> __forceinline Vec4<T> operator -( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
+ template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
+ template<typename T> __forceinline Vec4<T> operator *( const T& a, const Vec4<T>& b ) { return Vec4<T>(a * b.x, a * b.y, a * b.z, a * b.w); }
+ template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const T& b ) { return Vec4<T>(a.x * b , a.y * b , a.z * b , a.w * b ); }
+ template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
+ template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const T& b ) { return Vec4<T>(a.x / b , a.y / b , a.z / b , a.w / b ); }
+ template<typename T> __forceinline Vec4<T> operator /( const T& a, const Vec4<T>& b ) { return Vec4<T>(a / b.x, a / b.y, a / b.z, a / b.w); }
+
+ template<typename T> __forceinline Vec4<T> min(const Vec4<T>& a, const Vec4<T>& b) { return Vec4<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); }
+ template<typename T> __forceinline Vec4<T> max(const Vec4<T>& a, const Vec4<T>& b) { return Vec4<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Ternary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Vec4<T> madd ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z), madd(a.w,b.w,c.w)); }
+ template<typename T> __forceinline Vec4<T> msub ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z), msub(a.w,b.w,c.w)); }
+ template<typename T> __forceinline Vec4<T> nmadd ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z),nmadd(a.w,b.w,c.w)); }
+ template<typename T> __forceinline Vec4<T> nmsub ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z),nmsub(a.w,b.w,c.w)); }
+
+ template<typename T> __forceinline Vec4<T> madd ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z), madd(a,b.w,c.w)); }
+ template<typename T> __forceinline Vec4<T> msub ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z), msub(a,b.w,c.w)); }
+ template<typename T> __forceinline Vec4<T> nmadd ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z),nmadd(a,b.w,c.w)); }
+ template<typename T> __forceinline Vec4<T> nmsub ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z),nmsub(a,b.w,c.w)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Vec4<T>& operator +=( Vec4<T>& a, const Vec4<T>& b ) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
+ template<typename T> __forceinline Vec4<T>& operator -=( Vec4<T>& a, const Vec4<T>& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
+ template<typename T> __forceinline Vec4<T>& operator *=( Vec4<T>& a, const T& b ) { a.x *= b ; a.y *= b ; a.z *= b ; a.w *= b ; return a; }
+ template<typename T> __forceinline Vec4<T>& operator /=( Vec4<T>& a, const T& b ) { a.x /= b ; a.y /= b ; a.z /= b ; a.w /= b ; return a; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reduction Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline T reduce_add( const Vec4<T>& a ) { return a.x + a.y + a.z + a.w; }
+ template<typename T> __forceinline T reduce_mul( const Vec4<T>& a ) { return a.x * a.y * a.z * a.w; }
+ template<typename T> __forceinline T reduce_min( const Vec4<T>& a ) { return min(a.x, a.y, a.z, a.w); }
+ template<typename T> __forceinline T reduce_max( const Vec4<T>& a ) { return max(a.x, a.y, a.z, a.w); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline bool operator ==( const Vec4<T>& a, const Vec4<T>& b ) { return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; }
+ template<typename T> __forceinline bool operator !=( const Vec4<T>& a, const Vec4<T>& b ) { return a.x != b.x || a.y != b.y || a.z != b.z || a.w != b.w; }
+ template<typename T> __forceinline bool operator < ( const Vec4<T>& a, const Vec4<T>& b ) {
+ if (a.x != b.x) return a.x < b.x;
+ if (a.y != b.y) return a.y < b.y;
+ if (a.z != b.z) return a.z < b.z;
+ if (a.w != b.w) return a.w < b.w;
+ return false;
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Shift Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Vec4<T> shift_right_1( const Vec4<T>& a ) {
+ return Vec4<T>(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z),shift_right_1(a.w));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Euclidian Space Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline T dot ( const Vec4<T>& a, const Vec4<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,madd(a.z,b.z,a.w*b.w))); }
+
+ template<typename T> __forceinline T length ( const Vec4<T>& a ) { return sqrt(dot(a,a)); }
+ template<typename T> __forceinline Vec4<T> normalize( const Vec4<T>& a ) { return a*rsqrt(dot(a,a)); }
+ template<typename T> __forceinline T distance ( const Vec4<T>& a, const Vec4<T>& b ) { return length(a-b); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline Vec4<T> select ( bool s, const Vec4<T>& t, const Vec4<T>& f ) {
+ return Vec4<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w));
+ }
+
+ template<typename T> __forceinline Vec4<T> select ( const Vec4<bool>& s, const Vec4<T>& t, const Vec4<T>& f ) {
+ return Vec4<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z),select(s.w,t.w,f.w));
+ }
+
+ template<typename T> __forceinline Vec4<T> select ( const typename T::Bool& s, const Vec4<T>& t, const Vec4<T>& f ) {
+ return Vec4<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w));
+ }
+
+ template<typename T>
+ __forceinline Vec4<T> lerp(const Vec4<T>& v0, const Vec4<T>& v1, const T& t) {
+ return madd(Vec4<T>(T(1.0f)-t),v0,t*v1);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec4<T>& a) {
+ return cout << "(" << a.x << ", " << a.y << ", " << a.z << ", " << a.w << ")";
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Default template instantiations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ typedef Vec4<bool > Vec4b;
+ typedef Vec4<unsigned char> Vec4uc;
+ typedef Vec4<int > Vec4i;
+ typedef Vec4<float > Vec4f;
+}
+
+#include "vec3ba.h"
+#include "vec3ia.h"
+#include "vec3fa.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// SSE / AVX / MIC specializations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined __SSE__
+#include "../simd/sse.h"
+#endif
+
+#if defined __AVX__
+#include "../simd/avx.h"
+#endif
+
+#if defined __AVX512F__
+#include "../simd/avx512.h"
+#endif
+
+namespace embree
+{
+ template<> __forceinline Vec4<float>::Vec4( const Vec3fx& a ) { x = a.x; y = a.y; z = a.z; w = a.w; }
+
+#if defined(__AVX__)
+ template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
+ x = a.x; y = a.y; z = a.z; w = a.w;
+ }
+#elif defined(__SSE__)
+ template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
+ const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v);
+ }
+#endif
+
+#if defined(__AVX__)
+ template<> __forceinline Vec4<vfloat8>::Vec4( const Vec3fx& a ) {
+ x = a.x; y = a.y; z = a.z; w = a.w;
+ }
+#endif
+
+#if defined(__AVX512F__)
+ template<> __forceinline Vec4<vfloat16>::Vec4( const Vec3fx& a ) : x(a.x), y(a.y), z(a.z), w(a.w) {}
+#endif
+}
diff --git a/thirdparty/embree/common/simd/arm/emulation.h b/thirdparty/embree/common/simd/arm/emulation.h
new file mode 100644
index 0000000000..1c3875fb27
--- /dev/null
+++ b/thirdparty/embree/common/simd/arm/emulation.h
@@ -0,0 +1,50 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+/* Make precision match SSE, at the cost of some performance */
+#if !defined(__aarch64__)
+# define SSE2NEON_PRECISE_DIV 1
+# define SSE2NEON_PRECISE_SQRT 1
+#endif
+
+#include "sse2neon.h"
+
+__forceinline __m128 _mm_fmsub_ps(__m128 a, __m128 b, __m128 c) {
+ __m128 neg_c = vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(c)));
+ return _mm_fmadd_ps(a, b, neg_c);
+}
+
+__forceinline __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) {
+#if defined(__aarch64__)
+ return vreinterpretq_m128_f32(vfmsq_f32(vreinterpretq_f32_m128(c),
+ vreinterpretq_f32_m128(b),
+ vreinterpretq_f32_m128(a)));
+#else
+ return _mm_sub_ps(c, _mm_mul_ps(a, b));
+#endif
+}
+
+__forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) {
+ return vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(_mm_fmadd_ps(a,b,c))));
+}
+
+
+/* Dummy defines for floating point control */
+#define _MM_MASK_MASK 0x1f80
+#define _MM_MASK_DIV_ZERO 0x200
+#define _MM_FLUSH_ZERO_ON 0x8000
+#define _MM_MASK_DENORM 0x100
+#define _MM_SET_EXCEPTION_MASK(x)
+#define _MM_SET_FLUSH_ZERO_MODE(x)
+
+__forceinline int _mm_getcsr()
+{
+ return 0;
+}
+
+__forceinline void _mm_mfence()
+{
+ __sync_synchronize();
+}
diff --git a/thirdparty/embree/common/simd/arm/sse2neon.h b/thirdparty/embree/common/simd/arm/sse2neon.h
new file mode 100644
index 0000000000..7eb25cf2c5
--- /dev/null
+++ b/thirdparty/embree/common/simd/arm/sse2neon.h
@@ -0,0 +1,6996 @@
+#ifndef SSE2NEON_H
+#define SSE2NEON_H
+
+// This header file provides a simple API translation layer
+// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
+//
+// This header file does not yet translate all of the SSE intrinsics.
+//
+// Contributors to this work are:
+// John W. Ratcliff <jratcliffscarab@gmail.com>
+// Brandon Rowlett <browlett@nvidia.com>
+// Ken Fast <kfast@gdeb.com>
+// Eric van Beurden <evanbeurden@nvidia.com>
+// Alexander Potylitsin <apotylitsin@nvidia.com>
+// Hasindu Gamaarachchi <hasindu2008@gmail.com>
+// Jim Huang <jserv@biilabs.io>
+// Mark Cheng <marktwtn@biilabs.io>
+// Malcolm James MacLeod <malcolm@gulden.com>
+// Devin Hussey (easyaspi314) <husseydevin@gmail.com>
+// Sebastian Pop <spop@amazon.com>
+// Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
+// Danila Kutenin <danilak@google.com>
+// François Turban (JishinMaster) <francois.turban@gmail.com>
+// Pei-Hsuan Hung <afcidk@gmail.com>
+// Yang-Hao Yuan <yanghau@biilabs.io>
+// Syoyo Fujita <syoyo@lighttransport.com>
+// Brecht Van Lommel <brecht@blender.org>
+
+/*
+ * sse2neon is freely redistributable under the MIT License.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Tunable configurations */
+
+/* Enable precise implementation of math operations
+ * This would slow down the computation a bit, but gives consistent result with
+ * x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result)
+ */
+/* _mm_min_ps and _mm_max_ps */
+#ifndef SSE2NEON_PRECISE_MINMAX
+#define SSE2NEON_PRECISE_MINMAX (0)
+#endif
+/* _mm_rcp_ps and _mm_div_ps */
+#ifndef SSE2NEON_PRECISE_DIV
+#define SSE2NEON_PRECISE_DIV (0)
+#endif
+/* _mm_sqrt_ps and _mm_rsqrt_ps */
+#ifndef SSE2NEON_PRECISE_SQRT
+#define SSE2NEON_PRECISE_SQRT (0)
+#endif
+#ifndef SSE2NEON_PRECISE_RSQRT
+#define SSE2NEON_PRECISE_RSQRT (0)
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma push_macro("FORCE_INLINE")
+#pragma push_macro("ALIGN_STRUCT")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+#ifndef likely
+#define likely(x) __builtin_expect(!!(x), 1)
+#endif
+#ifndef unlikely
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#endif
+#else
+#error "Macro name collisions may happen with unsupported compiler."
+#ifdef FORCE_INLINE
+#undef FORCE_INLINE
+#endif
+#define FORCE_INLINE static inline
+#ifndef ALIGN_STRUCT
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+#endif
+#ifndef likely
+#define likely(x) (x)
+#endif
+#ifndef unlikely
+#define unlikely(x) (x)
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/* Architecture-specific build options */
+/* FIXME: #pragma GCC push_options is only available on GCC */
+#if defined(__GNUC__)
+#if defined(__arm__) && __ARM_ARCH == 7
+/* According to ARM C Language Extensions Architecture specification,
+ * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
+ * architecture supported.
+ */
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
+#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
+#endif
+#if !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("fpu=neon")
+#endif
+#elif defined(__aarch64__)
+#if !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("+simd")
+#endif
+#else
+#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
+#endif
+#endif
+
+#include <arm_neon.h>
+
+/* Rounding functions require either Aarch64 instructions or libm failback */
+#if !defined(__aarch64__)
+#include <math.h>
+#endif
+
+/* "__has_builtin" can be used to query support for built-in functions
+ * provided by gcc/clang and other compilers that support it.
+ */
+#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
+/* Compatibility with gcc <= 9 */
+#if __GNUC__ <= 9
+#define __has_builtin(x) HAS##x
+#define HAS__builtin_popcount 1
+#define HAS__builtin_popcountll 1
+#else
+#define __has_builtin(x) 0
+#endif
+#endif
+
+/**
+ * MACRO for shuffle parameter for _mm_shuffle_ps().
+ * Argument fp3 is a digit[0123] that represents the fp from argument "b"
+ * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
+ * for fp2 in result. fp1 is a digit[0123] that represents the fp from
+ * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
+ * fp0 is the same for fp0 of result.
+ */
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
+/* Rounding mode macros. */
+#define _MM_FROUND_TO_NEAREST_INT 0x00
+#define _MM_FROUND_TO_NEG_INF 0x01
+#define _MM_FROUND_TO_POS_INF 0x02
+#define _MM_FROUND_TO_ZERO 0x03
+#define _MM_FROUND_CUR_DIRECTION 0x04
+#define _MM_FROUND_NO_EXC 0x08
+#define _MM_ROUND_NEAREST 0x0000
+#define _MM_ROUND_DOWN 0x2000
+#define _MM_ROUND_UP 0x4000
+#define _MM_ROUND_TOWARD_ZERO 0x6000
+
+/* indicate immediate constant argument in a given range */
+#define __constrange(a, b) const
+
+/* A few intrinsics accept traditional data types like ints or floats, but
+ * most operate on data types that are specific to SSE.
+ * If a vector type ends in d, it contains doubles, and if it does not have
+ * a suffix, it contains floats. An integer vector type can contain any type
+ * of integer, from chars to shorts to unsigned long longs.
+ */
+typedef int64x1_t __m64;
+typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
+// On ARM 32-bit architecture, the float64x2_t is not supported.
+// The data type __m128d should be represented in a different way for related
+// intrinsic conversion.
+#if defined(__aarch64__)
+typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
+#else
+typedef float32x4_t __m128d;
+#endif
+typedef int64x2_t __m128i; /* 128-bit vector containing integers */
+
+/* type-safe casting between types */
+
+#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
+#define vreinterpretq_m128_f32(x) (x)
+#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
+
+#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
+#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
+#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
+#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
+#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
+#define vreinterpretq_f32_m128(x) (x)
+#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
+
+#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
+#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
+#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
+#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
+#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
+#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
+#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
+#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
+#define vreinterpretq_m128i_s64(x) (x)
+
+#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
+#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
+#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
+#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
+
+#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
+#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
+
+#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
+#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
+#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
+#define vreinterpretq_s64_m128i(x) (x)
+
+#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
+#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
+#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
+#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
+
+#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
+#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
+#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
+#define vreinterpret_m64_s64(x) (x)
+
+#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
+#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
+#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
+#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
+
+#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
+#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
+#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
+
+#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
+#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
+#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
+#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
+
+#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
+#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
+#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
+#define vreinterpret_s64_m64(x) (x)
+
+#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
+
+#if defined(__aarch64__)
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
+
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
+
+#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
+#define vreinterpretq_m128d_f64(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
+
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
+
+#define vreinterpretq_f64_m128d(x) (x)
+#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
+#else
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128d_f32(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_f32_m128d(x) (x)
+#endif
+
+// A struct is defined in this header file called 'SIMDVec' which can be used
+// by applications which attempt to access the contents of an _m128 struct
+// directly. It is important to note that accessing the __m128 struct directly
+// is bad coding practice by Microsoft: @see:
+// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
+//
+// However, some legacy source code may try to access the contents of an __m128
+// struct directly so the developer can use the SIMDVec as an alias for it. Any
+// casting must be done manually by the developer, as you cannot cast or
+// otherwise alias the base NEON data type for intrinsic operations.
+//
+// union intended to allow direct access to an __m128 variable using the names
+// that the MSVC compiler provides. This union should really only be used when
+// trying to access the members of the vector as integer values. GCC/clang
+// allow native access to the float members through a simple array access
+// operator (in C since 4.6, in C++ since 4.8).
+//
+// Ideally direct accesses to SIMD vectors should not be used since it can cause
+// a performance hit. If it really is needed however, the original __m128
+// variable can be aliased with a pointer to this union and used to access
+// individual components. The use of this union should be hidden behind a macro
+// that is used throughout the codebase to access the members instead of always
+// declaring this type of variable.
+typedef union ALIGN_STRUCT(16) SIMDVec {
+ float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
+ int8_t m128_i8[16]; // as signed 8-bit integers.
+ int16_t m128_i16[8]; // as signed 16-bit integers.
+ int32_t m128_i32[4]; // as signed 32-bit integers.
+ int64_t m128_i64[2]; // as signed 64-bit integers.
+ uint8_t m128_u8[16]; // as unsigned 8-bit integers.
+ uint16_t m128_u16[8]; // as unsigned 16-bit integers.
+ uint32_t m128_u32[4]; // as unsigned 32-bit integers.
+ uint64_t m128_u64[2]; // as unsigned 64-bit integers.
+} SIMDVec;
+
+// casting using SIMDVec
+#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
+#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
+#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
+
+/* Backwards compatibility for compilers with lack of specific type support */
+
+// Older gcc does not define vld1q_u8_x4 type
+#if defined(__GNUC__) && !defined(__clang__) && \
+ ((__GNUC__ == 10 && (__GNUC_MINOR__ <= 1)) || \
+ (__GNUC__ == 9 && (__GNUC_MINOR__ <= 3)) || \
+ (__GNUC__ == 8 && (__GNUC_MINOR__ <= 4)) || __GNUC__ <= 7)
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+ uint8x16x4_t ret;
+ ret.val[0] = vld1q_u8(p + 0);
+ ret.val[1] = vld1q_u8(p + 16);
+ ret.val[2] = vld1q_u8(p + 32);
+ ret.val[3] = vld1q_u8(p + 48);
+ return ret;
+}
+#else
+// Wraps vld1q_u8_x4
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+ return vld1q_u8_x4(p);
+}
+#endif
+
+/* Function Naming Conventions
+ * The naming convention of SSE intrinsics is straightforward. A generic SSE
+ * intrinsic function is given as follows:
+ * _mm_<name>_<data_type>
+ *
+ * The parts of this format are given as follows:
+ * 1. <name> describes the operation performed by the intrinsic
+ * 2. <data_type> identifies the data type of the function's primary arguments
+ *
+ * This last part, <data_type>, is a little complicated. It identifies the
+ * content of the input values, and can be set to any of the following values:
+ * + ps - vectors contain floats (ps stands for packed single-precision)
+ * + pd - vectors cantain doubles (pd stands for packed double-precision)
+ * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ * signed integers
+ * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ * unsigned integers
+ * + si128 - unspecified 128-bit vector or 256-bit vector
+ * + m128/m128i/m128d - identifies input vector types when they are different
+ * than the type of the returned vector
+ *
+ * For example, _mm_setzero_ps. The _mm implies that the function returns
+ * a 128-bit vector. The _ps at the end implies that the argument vectors
+ * contain floats.
+ *
+ * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
+ * // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
+ * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ * // Set packed 8-bit integers
+ * // 128 bits, 16 chars, per 8 bits
+ * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
+ * 4, 5, 12, 13, 6, 7, 14, 15);
+ * // Shuffle packed 8-bit integers
+ * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
+ *
+ * Data (Number, Binary, Byte Index):
+ +------+------+-------------+------+------+-------------+
+ | 1 | 2 | 3 | 4 | Number
+ +------+------+------+------+------+------+------+------+
+ | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
+ +------+------+------+------+------+------+------+------+
+ | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index
+ +------+------+------+------+------+------+------+------+
+
+ +------+------+------+------+------+------+------+------+
+ | 5 | 6 | 7 | 8 | Number
+ +------+------+------+------+------+------+------+------+
+ | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
+ +------+------+------+------+------+------+------+------+
+ | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index
+ +------+------+------+------+------+------+------+------+
+ * Index (Byte Index):
+ +------+------+------+------+------+------+------+------+
+ | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 |
+ +------+------+------+------+------+------+------+------+
+
+ +------+------+------+------+------+------+------+------+
+ | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 |
+ +------+------+------+------+------+------+------+------+
+ * Result:
+ +------+------+------+------+------+------+------+------+
+ | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index
+ +------+------+------+------+------+------+------+------+
+ | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
+ +------+------+------+------+------+------+------+------+
+ | 256 | 2 | 5 | 6 | Number
+ +------+------+------+------+------+------+------+------+
+
+ +------+------+------+------+------+------+------+------+
+ | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index
+ +------+------+------+------+------+------+------+------+
+ | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
+ +------+------+------+------+------+------+------+------+
+ | 3 | 7 | 4 | 8 | Number
+ +------+------+------+------+------+------+-------------+
+ */
+
+/* Set/get methods */
+
+/* Constants for use with _mm_prefetch. */
+enum _mm_hint {
+ _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
+ _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */
+ _MM_HINT_T1 = 2, /* load data to L2 cache only */
+ _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */
+ _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
+ _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */
+ _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */
+ _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */
+};
+
+// Loads one cache line of data from address p to a location closer to the
+// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
+FORCE_INLINE void _mm_prefetch(const void *p, int i)
+{
+ (void) i;
+ __builtin_prefetch(p);
+}
+
+// Pause the processor. This is typically used in spin-wait loops and depending
+// on the x86 processor typical values are in the 40-100 cycle range. The
+// 'yield' instruction isn't a good fit beacuse it's effectively a nop on most
+// Arm cores. Experience with several databases has shown has shown an 'isb' is
+// a reasonable approximation.
+FORCE_INLINE void _mm_pause()
+{
+ __asm__ __volatile__("isb\n");
+}
+
+// Copy the lower single-precision (32-bit) floating-point element of a to dst.
+//
+// dst[31:0] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
+FORCE_INLINE float _mm_cvtss_f32(__m128 a)
+{
+ return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in b to a
+// double-precision (64-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+//
+// dst[63:0] := Convert_FP32_To_FP64(b[31:0])
+// dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
+FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
+{
+ double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
+#else
+ return vreinterpretq_m128d_s64(
+ vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+//
+// dst[31:0] := Convert_FP32_To_Int32(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
+#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+//
+// dst[63:0] := Convert_FP32_To_Int64(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
+FORCE_INLINE int _mm_cvtss_si64(__m128 a)
+{
+#if defined(__aarch64__)
+ return vgetq_lane_s64(
+ vreinterpretq_s64_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))), 0);
+#else
+ float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+ float32_t diff = data - floor(data);
+ if (diff > 0.5)
+ return (int64_t) ceil(data);
+ if (unlikely(diff == 0.5)) {
+ int64_t f = (int64_t) floor(data);
+ int64_t c = (int64_t) ceil(data);
+ return c & 1 ? f : c;
+ }
+ return (int64_t) floor(data);
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := 32*j
+// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
+FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
+{
+ return vreinterpret_m64_s32(
+ vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+//
+// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
+FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
+{
+ return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := 32*j
+// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
+#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+//
+// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
+#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+//
+// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
+FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
+{
+ return vgetq_lane_s64(
+ vmovl_s32(vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))), 0);
+}
+
+// Sets the 128-bit value to zero
+// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_setzero_si128(void)
+{
+ return vreinterpretq_m128i_s32(vdupq_n_s32(0));
+}
+
+// Clears the four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_setzero_ps(void)
+{
+ return vreinterpretq_m128_f32(vdupq_n_f32(0));
+}
+
+// Return vector of type __m128d with all elements set to zero.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
+FORCE_INLINE __m128d _mm_setzero_pd(void)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(vdupq_n_f64(0));
+#else
+ return vreinterpretq_m128d_f32(vdupq_n_f32(0));
+#endif
+}
+
+// Sets the four single-precision, floating-point values to w.
+//
+// r0 := r1 := r2 := r3 := w
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_set1_ps(float _w)
+{
+ return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// Sets the four single-precision, floating-point values to w.
+// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_set_ps1(float _w)
+{
+ return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// Sets the four single-precision, floating-point values to the four inputs.
+// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
+{
+ float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
+ return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Copy single-precision (32-bit) floating-point element a to the lower element
+// of dst, and zero the upper 3 elements.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
+FORCE_INLINE __m128 _mm_set_ss(float a)
+{
+ float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
+ return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Sets the four single-precision, floating-point values to the four inputs in
+// reverse order.
+// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
+{
+ float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
+ return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Sets the 8 signed 16-bit integer values in reverse order.
+//
+// Return Value
+// r0 := w0
+// r1 := w1
+// ...
+// r7 := w7
+FORCE_INLINE __m128i _mm_setr_epi16(short w0,
+ short w1,
+ short w2,
+ short w3,
+ short w4,
+ short w5,
+ short w6,
+ short w7)
+{
+ int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
+ return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
+}
+
+// Sets the 4 signed 32-bit integer values in reverse order
+// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
+{
+ int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
+ return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Set packed 64-bit integers in dst with the supplied values in reverse order.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
+FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
+{
+ return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
+}
+
+// Sets the 16 signed 8-bit integer values to b.
+//
+// r0 := b
+// r1 := b
+// ...
+// r15 := b
+//
+// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
+{
+ return vreinterpretq_m128i_s8(vdupq_n_s8(w));
+}
+
+// Broadcast double-precision (64-bit) floating-point value a to all elements of
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
+FORCE_INLINE __m128d _mm_set1_pd(double d)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(vdupq_n_f64(d));
+#else
+ return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
+#endif
+}
+
+// Sets the 8 signed 16-bit integer values to w.
+//
+// r0 := w
+// r1 := w
+// ...
+// r7 := w
+//
+// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_set1_epi16(short w)
+{
+ return vreinterpretq_m128i_s16(vdupq_n_s16(w));
+}
+
+// Sets the 16 signed 8-bit integer values.
+// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
+ signed char b14,
+ signed char b13,
+ signed char b12,
+ signed char b11,
+ signed char b10,
+ signed char b9,
+ signed char b8,
+ signed char b7,
+ signed char b6,
+ signed char b5,
+ signed char b4,
+ signed char b3,
+ signed char b2,
+ signed char b1,
+ signed char b0)
+{
+ int8_t ALIGN_STRUCT(16)
+ data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
+ (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
+ (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
+ (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+ return (__m128i) vld1q_s8(data);
+}
+
+// Sets the 8 signed 16-bit integer values.
+// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_set_epi16(short i7,
+ short i6,
+ short i5,
+ short i4,
+ short i3,
+ short i2,
+ short i1,
+ short i0)
+{
+ int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+ return vreinterpretq_m128i_s16(vld1q_s16(data));
+}
+
+// Sets the 16 signed 8-bit integer values in reverse order.
+// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
+ signed char b1,
+ signed char b2,
+ signed char b3,
+ signed char b4,
+ signed char b5,
+ signed char b6,
+ signed char b7,
+ signed char b8,
+ signed char b9,
+ signed char b10,
+ signed char b11,
+ signed char b12,
+ signed char b13,
+ signed char b14,
+ signed char b15)
+{
+ int8_t ALIGN_STRUCT(16)
+ data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
+ (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
+ (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
+ (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+ return (__m128i) vld1q_s8(data);
+}
+
+// Sets the 4 signed 32-bit integer values to i.
+//
+// r0 := i
+// r1 := i
+// r2 := i
+// r3 := I
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set1_epi32(int _i)
+{
+ return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
+}
+
+// Sets the 2 signed 64-bit integer values to i.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
+FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
+{
+ return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
+}
+
+// Sets the 2 signed 64-bit integer values to i.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
+FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
+{
+ return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
+}
+
+// Sets the 4 signed 32-bit integer values.
+// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+ int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
+ return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Returns the __m128i structure with its two 64-bit integer values
+// initialized to the values of the two 64-bit integers passed in.
+// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
+{
+ return vreinterpretq_m128i_s64(
+ vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
+}
+
+// Returns the __m128i structure with its two 64-bit integer values
+// initialized to the values of the two 64-bit integers passed in.
+// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
+{
+ return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
+FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
+{
+ double ALIGN_STRUCT(16) data[2] = {e0, e1};
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
+#else
+ return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
+#endif
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values in reverse order.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
+FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
+{
+ return _mm_set_pd(e0, e1);
+}
+
+// Copy double-precision (64-bit) floating-point element a to the lower element
+// of dst, and zero the upper element.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
+FORCE_INLINE __m128d _mm_set_sd(double a)
+{
+ return _mm_set_pd(0, a);
+}
+
+// Broadcast double-precision (64-bit) floating-point value a to all elements of
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
+#define _mm_set_pd1 _mm_set1_pd
+
+// Stores four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
+FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
+{
+ vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+//
+// MEM[mem_addr+31:mem_addr] := a[31:0]
+// MEM[mem_addr+63:mem_addr+32] := a[31:0]
+// MEM[mem_addr+95:mem_addr+64] := a[31:0]
+// MEM[mem_addr+127:mem_addr+96] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
+FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
+{
+ float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+ vst1q_f32(p, vdupq_n_f32(a0));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+//
+// MEM[mem_addr+31:mem_addr] := a[31:0]
+// MEM[mem_addr+63:mem_addr+32] := a[31:0]
+// MEM[mem_addr+95:mem_addr+64] := a[31:0]
+// MEM[mem_addr+127:mem_addr+96] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
+#define _mm_store1_ps _mm_store_ps1
+
+// Store 4 single-precision (32-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+// MEM[mem_addr+31:mem_addr] := a[127:96]
+// MEM[mem_addr+63:mem_addr+32] := a[95:64]
+// MEM[mem_addr+95:mem_addr+64] := a[63:32]
+// MEM[mem_addr+127:mem_addr+96] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
+FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
+{
+ float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
+ float32x4_t rev = vextq_f32(tmp, tmp, 2);
+ vst1q_f32(p, rev);
+}
+
+// Stores four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
+FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
+{
+ vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Stores four 32-bit integer values as (as a __m128i value) at the address p.
+// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
+FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
+{
+ vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Stores four 32-bit integer values as (as a __m128i value) at the address p.
+// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
+FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
+{
+ vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Stores the lower single - precision, floating - point value.
+// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
+FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
+{
+ vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
+}
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
+// or a general-protection exception may be generated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
+FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+ vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
+#else
+ vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
+#endif
+}
+
+// Store the upper double-precision (64-bit) floating-point element from a into
+// memory.
+//
+// MEM[mem_addr+63:mem_addr] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
+FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+ vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
+#else
+ vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
+#endif
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// memory.
+//
+// MEM[mem_addr+63:mem_addr] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
+FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+ vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
+#else
+ vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
+#endif
+}
+
+// Store 2 double-precision (64-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+// MEM[mem_addr+63:mem_addr] := a[127:64]
+// MEM[mem_addr+127:mem_addr+64] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
+FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
+{
+ float32x4_t f = vreinterpretq_f32_m128d(a);
+ _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
+FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+ float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
+ vst1q_f64((float64_t *) mem_addr,
+ vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
+#else
+ float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
+ vst1q_f32((float32_t *) mem_addr,
+ vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
+#endif
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
+#define _mm_store1_pd _mm_store_pd1
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
+FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
+{
+ _mm_store_pd(mem_addr, a);
+}
+
+// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
+// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
+{
+ uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
+ uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
+ *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
+}
+
+// Stores the lower two single-precision floating point values of a to the
+// address p.
+//
+// *p0 := a0
+// *p1 := a1
+//
+// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
+FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
+{
+ *p = vreinterpret_m64_f32(vget_low_f32(a));
+}
+
+// Stores the upper two single-precision, floating-point values of a to the
+// address p.
+//
+// *p0 := a2
+// *p1 := a3
+//
+// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
+FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
+{
+ *p = vreinterpret_m64_f32(vget_high_f32(a));
+}
+
+// Loads a single single-precision, floating-point value, copying it into all
+// four words
+// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_load1_ps(const float *p)
+{
+ return vreinterpretq_m128_f32(vld1q_dup_f32(p));
+}
+
+// Load a single-precision (32-bit) floating-point element from memory into all
+// elements of dst.
+//
+// dst[31:0] := MEM[mem_addr+31:mem_addr]
+// dst[63:32] := MEM[mem_addr+31:mem_addr]
+// dst[95:64] := MEM[mem_addr+31:mem_addr]
+// dst[127:96] := MEM[mem_addr+31:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
+#define _mm_load_ps1 _mm_load1_ps
+
+// Sets the lower two single-precision, floating-point values with 64
+// bits of data loaded from the address p; the upper two values are passed
+// through from a.
+//
+// Return Value
+// r0 := *p0
+// r1 := *p1
+// r2 := a2
+// r3 := a3
+//
+// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
+{
+ return vreinterpretq_m128_f32(
+ vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
+}
+
+// Load 4 single-precision (32-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+// dst[31:0] := MEM[mem_addr+127:mem_addr+96]
+// dst[63:32] := MEM[mem_addr+95:mem_addr+64]
+// dst[95:64] := MEM[mem_addr+63:mem_addr+32]
+// dst[127:96] := MEM[mem_addr+31:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
+FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
+{
+ float32x4_t v = vrev64q_f32(vld1q_f32(p));
+ return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
+}
+
+// Sets the upper two single-precision, floating-point values with 64
+// bits of data loaded from the address p; the lower two values are passed
+// through from a.
+//
+// r0 := a0
+// r1 := a1
+// r2 := *p0
+// r3 := *p1
+//
+// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
+FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
+{
+ return vreinterpretq_m128_f32(
+ vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
+}
+
+// Loads four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_load_ps(const float *p)
+{
+ return vreinterpretq_m128_f32(vld1q_f32(p));
+}
+
+// Loads four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
+{
+ // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
+ // equivalent for neon
+ return vreinterpretq_m128_f32(vld1q_f32(p));
+}
+
+// Load unaligned 16-bit integer from memory into the first element of dst.
+//
+// dst[15:0] := MEM[mem_addr+15:mem_addr]
+// dst[MAX:16] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
+FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
+{
+ return vreinterpretq_m128i_s16(
+ vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
+}
+
+// Load unaligned 64-bit integer from memory into the first element of dst.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[MAX:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
+FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
+{
+ return vreinterpretq_m128i_s64(
+ vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower of dst, and zero the upper element. mem_addr does not need to be
+// aligned on any particular boundary.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
+FORCE_INLINE __m128d _mm_load_sd(const double *p)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
+#else
+ const float *fp = (const float *) p;
+ float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
+ return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Loads two double-precision from 16-byte aligned memory, floating-point
+// values.
+//
+// dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
+FORCE_INLINE __m128d _mm_load_pd(const double *p)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(vld1q_f64(p));
+#else
+ const float *fp = (const float *) p;
+ float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
+ return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Loads two double-precision from unaligned memory, floating-point values.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
+FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
+{
+ return _mm_load_pd(p);
+}
+
+// Loads an single - precision, floating - point value into the low word and
+// clears the upper three words.
+// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_load_ss(const float *p)
+{
+ return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
+}
+
+// Load 64-bit integer from memory into the first element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
+FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
+{
+ /* Load the lower 64 bits of the value pointed to by p into the
+ * lower 64 bits of the result, zeroing the upper 64 bits of the result.
+ */
+ return vreinterpretq_m128i_s32(
+ vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower element of dst, and copy the upper element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
+FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
+#else
+ return vreinterpretq_m128d_f32(
+ vcombine_f32(vld1_f32((const float *) p),
+ vget_high_f32(vreinterpretq_f32_m128d(a))));
+#endif
+}
+
+// Load 2 double-precision (64-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+// dst[63:0] := MEM[mem_addr+127:mem_addr+64]
+// dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
+FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
+{
+#if defined(__aarch64__)
+ float64x2_t v = vld1q_f64(p);
+ return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
+#else
+ int64x2_t v = vld1q_s64((const int64_t *) p);
+ return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
+#endif
+}
+
+// Sets the low word to the single-precision, floating-point value of b
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
+FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_f32(
+ vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
+ vreinterpretq_f32_m128(a), 0));
+}
+
+// Move the lower double-precision (64-bit) floating-point element from b to the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+//
+// dst[63:0] := b[63:0]
+// dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
+FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
+{
+ return vreinterpretq_m128d_f32(
+ vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
+ vget_high_f32(vreinterpretq_f32_m128d(a))));
+}
+
+// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
+// upper element.
+//
+// dst[63:0] := a[63:0]
+// dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
+FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
+{
+ return vreinterpretq_m128i_s64(
+ vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
+}
+
+// Return vector of type __m128 with undefined elements.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
+FORCE_INLINE __m128 _mm_undefined_ps(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+ __m128 a;
+ return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+/* Logic/Binary operations */
+
+// Computes the bitwise AND-NOT of the four single-precision, floating-point
+// values of a and b.
+//
+// r0 := ~a0 & b0
+// r1 := ~a1 & b1
+// r2 := ~a2 & b2
+// r3 := ~a3 & b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_s32(
+ vbicq_s32(vreinterpretq_s32_m128(b),
+ vreinterpretq_s32_m128(a))); // *NOTE* argument swap
+}
+
+// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
+// elements in a and then AND with b, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := j*64
+// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
+FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
+{
+ // *NOTE* argument swap
+ return vreinterpretq_m128d_s64(
+ vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
+}
+
+// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
+// 128-bit value in a.
+//
+// r := (~a) & b
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s32(
+ vbicq_s32(vreinterpretq_s32_m128i(b),
+ vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
+}
+
+// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
+// b.
+//
+// r := a & b
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s32(
+ vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Computes the bitwise AND of the four single-precision, floating-point values
+// of a and b.
+//
+// r0 := a0 & b0
+// r1 := a1 & b1
+// r2 := a2 & b2
+// r3 := a3 & b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_s32(
+ vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Compute the bitwise AND of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := j*64
+// dst[i+63:i] := a[i+63:i] AND b[i+63:i]
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
+FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
+{
+ return vreinterpretq_m128d_s64(
+ vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Computes the bitwise OR of the four single-precision, floating-point values
+// of a and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_s32(
+ vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Computes bitwise EXOR (exclusive-or) of the four single-precision,
+// floating-point values of a and b.
+// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_s32(
+ veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := j*64
+// dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
+FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
+{
+ return vreinterpretq_m128d_s64(
+ veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Compute the bitwise OR of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
+FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
+{
+ return vreinterpretq_m128d_s64(
+ vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
+//
+// r := a | b
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s32(
+ vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
+// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s32(
+ veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Duplicate the low double-precision (64-bit) floating-point element from a,
+// and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd
+FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
+{
+#if (__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
+#else
+ return vreinterpretq_m128d_u64(
+ vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
+#endif
+}
+
+// Duplicate odd-indexed single-precision (32-bit) floating-point elements
+// from a, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
+FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
+{
+#if __has_builtin(__builtin_shufflevector)
+ return vreinterpretq_m128_f32(__builtin_shufflevector(
+ vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
+#else
+ float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+ float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
+ float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
+ return vreinterpretq_m128_f32(vld1q_f32(data));
+#endif
+}
+
+// Duplicate even-indexed single-precision (32-bit) floating-point elements
+// from a, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
+FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
+{
+#if __has_builtin(__builtin_shufflevector)
+ return vreinterpretq_m128_f32(__builtin_shufflevector(
+ vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
+#else
+ float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+ float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
+ float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
+ return vreinterpretq_m128_f32(vld1q_f32(data));
+#endif
+}
+
+// Moves the upper two values of B into the lower two values of A.
+//
+// r3 := a3
+// r2 := a2
+// r1 := b3
+// r0 := b2
+FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
+{
+ float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
+ float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
+ return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
+}
+
+// Moves the lower two values of B into the upper two values of A.
+//
+// r3 := b1
+// r2 := b0
+// r1 := a1
+// r0 := a0
+FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
+{
+ float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
+ float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
+ return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+// Compute the absolute value of packed signed 32-bit integers in a, and store
+// the unsigned results in dst.
+//
+// FOR j := 0 to 3
+// i := j*32
+// dst[i+31:i] := ABS(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
+FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
+{
+ return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 16-bit integers in a, and store
+// the unsigned results in dst.
+//
+// FOR j := 0 to 7
+// i := j*16
+// dst[i+15:i] := ABS(a[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
+FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
+{
+ return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 8-bit integers in a, and store
+// the unsigned results in dst.
+//
+// FOR j := 0 to 15
+// i := j*8
+// dst[i+7:i] := ABS(a[i+7:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
+FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
+{
+ return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 32-bit integers in a, and store
+// the unsigned results in dst.
+//
+// FOR j := 0 to 1
+// i := j*32
+// dst[i+31:i] := ABS(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
+FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
+{
+ return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
+}
+
+// Compute the absolute value of packed signed 16-bit integers in a, and store
+// the unsigned results in dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// dst[i+15:i] := ABS(a[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
+FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
+{
+ return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
+}
+
+// Compute the absolute value of packed signed 8-bit integers in a, and store
+// the unsigned results in dst.
+//
+// FOR j := 0 to 7
+// i := j*8
+// dst[i+7:i] := ABS(a[i+7:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
+FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
+{
+ return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
+}
+
+// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
+// the result right by imm8 bytes, and store the low 16 bytes in dst.
+//
+// tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
+// dst[127:0] := tmp[127:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
+#define _mm_alignr_epi8(a, b, imm) \
+ __extension__({ \
+ __m128i ret; \
+ if (unlikely((imm) >= 32)) { \
+ ret = _mm_setzero_si128(); \
+ } else { \
+ uint8x16_t tmp_low, tmp_high; \
+ if (imm >= 16) { \
+ const int idx = imm - 16; \
+ tmp_low = vreinterpretq_u8_m128i(a); \
+ tmp_high = vdupq_n_u8(0); \
+ ret = \
+ vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \
+ } else { \
+ const int idx = imm; \
+ tmp_low = vreinterpretq_u8_m128i(b); \
+ tmp_high = vreinterpretq_u8_m128i(a); \
+ ret = \
+ vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \
+ } \
+ } \
+ ret; \
+ })
+
+// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
+// the result right by imm8 bytes, and store the low 8 bytes in dst.
+//
+// tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
+// dst[63:0] := tmp[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
+#define _mm_alignr_pi8(a, b, imm) \
+ __extension__({ \
+ __m64 ret; \
+ if (unlikely((imm) >= 16)) { \
+ ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
+ } else { \
+ uint8x8_t tmp_low, tmp_high; \
+ if (imm >= 8) { \
+ const int idx = imm - 8; \
+ tmp_low = vreinterpret_u8_m64(a); \
+ tmp_high = vdup_n_u8(0); \
+ ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
+ } else { \
+ const int idx = imm; \
+ tmp_low = vreinterpret_u8_m64(b); \
+ tmp_high = vreinterpret_u8_m64(a); \
+ ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
+ } \
+ } \
+ ret; \
+ })
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of b and places it into the high end of the result.
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
+{
+ float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+ float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+ return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
+}
+
+// takes the lower two 32-bit values from a and swaps them and places in high
+// end of result takes the higher two 32 bit values from b and swaps them and
+// places in low end of result.
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
+{
+ float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+ float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
+ return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
+{
+ float32x2_t a21 = vget_high_f32(
+ vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+ float32x2_t b03 = vget_low_f32(
+ vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+ return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
+{
+ float32x2_t a03 = vget_low_f32(
+ vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+ float32x2_t b21 = vget_high_f32(
+ vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+ return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
+{
+ float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+ float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+ return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
+{
+ float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+ float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+ return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
+{
+ float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+ float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
+ return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
+}
+
+// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
+// high
+FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
+{
+ float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+ float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+ return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
+{
+ float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
+ float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+ return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
+{
+ float32x2_t a22 =
+ vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+ float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+ return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
+{
+ float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
+ float32x2_t b22 =
+ vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
+ return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
+{
+ float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+ float32x2_t a22 =
+ vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+ float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
+ float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+ return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
+{
+ float32x2_t a33 =
+ vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
+ float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
+ return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
+{
+ float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+ float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
+ float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+ float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+ return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
+{
+ float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+ float32_t b2 = vgetq_lane_f32(b, 2);
+ float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+ float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+ return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
+{
+ float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+ float32_t b2 = vgetq_lane_f32(b, 2);
+ float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+ float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+ return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
+}
+
+// NEON does not support a general purpose permute intrinsic
+// Selects four specific single-precision, floating-point values from a and b,
+// based on the mask i.
+//
+// C equivalent:
+// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
+// __constrange(0, 255) int imm) {
+// __m128 ret;
+// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
+// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
+// return ret;
+// }
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
+#define _mm_shuffle_ps_default(a, b, imm) \
+ __extension__({ \
+ float32x4_t ret; \
+ ret = vmovq_n_f32( \
+ vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \
+ ret = vsetq_lane_f32( \
+ vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
+ ret, 1); \
+ ret = vsetq_lane_f32( \
+ vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
+ ret, 2); \
+ ret = vsetq_lane_f32( \
+ vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
+ ret, 3); \
+ vreinterpretq_m128_f32(ret); \
+ })
+
+// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
+// int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_ps(a, b, imm) \
+ __extension__({ \
+ float32x4_t _input1 = vreinterpretq_f32_m128(a); \
+ float32x4_t _input2 = vreinterpretq_f32_m128(b); \
+ float32x4_t _shuf = __builtin_shufflevector( \
+ _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+ (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
+ vreinterpretq_m128_f32(_shuf); \
+ })
+#else // generic
+#define _mm_shuffle_ps(a, b, imm) \
+ __extension__({ \
+ __m128 ret; \
+ switch (imm) { \
+ case _MM_SHUFFLE(1, 0, 3, 2): \
+ ret = _mm_shuffle_ps_1032((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(2, 3, 0, 1): \
+ ret = _mm_shuffle_ps_2301((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(0, 3, 2, 1): \
+ ret = _mm_shuffle_ps_0321((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(2, 1, 0, 3): \
+ ret = _mm_shuffle_ps_2103((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(1, 0, 1, 0): \
+ ret = _mm_movelh_ps((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(1, 0, 0, 1): \
+ ret = _mm_shuffle_ps_1001((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(0, 1, 0, 1): \
+ ret = _mm_shuffle_ps_0101((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(3, 2, 1, 0): \
+ ret = _mm_shuffle_ps_3210((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(0, 0, 1, 1): \
+ ret = _mm_shuffle_ps_0011((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(0, 0, 2, 2): \
+ ret = _mm_shuffle_ps_0022((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(2, 2, 0, 0): \
+ ret = _mm_shuffle_ps_2200((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(3, 2, 0, 2): \
+ ret = _mm_shuffle_ps_3202((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(3, 2, 3, 2): \
+ ret = _mm_movehl_ps((b), (a)); \
+ break; \
+ case _MM_SHUFFLE(1, 1, 3, 3): \
+ ret = _mm_shuffle_ps_1133((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(2, 0, 1, 0): \
+ ret = _mm_shuffle_ps_2010((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(2, 0, 0, 1): \
+ ret = _mm_shuffle_ps_2001((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(2, 0, 3, 2): \
+ ret = _mm_shuffle_ps_2032((a), (b)); \
+ break; \
+ default: \
+ ret = _mm_shuffle_ps_default((a), (b), (imm)); \
+ break; \
+ } \
+ ret; \
+ })
+#endif
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of a and places it into the high end of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
+{
+ int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+ int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+ return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
+}
+
+// takes the lower two 32-bit values from a and swaps them and places in low end
+// of result takes the higher two 32 bit values from a and swaps them and places
+// in high end of result.
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
+{
+ int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+ int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
+ return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
+}
+
+// rotates the least significant 32 bits into the most signficant 32 bits, and
+// shifts the rest down
+FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
+{
+ return vreinterpretq_m128i_s32(
+ vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
+}
+
+// rotates the most significant 32 bits into the least signficant 32 bits, and
+// shifts the rest up
+FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
+{
+ return vreinterpretq_m128i_s32(
+ vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
+}
+
+// gets the lower 64 bits of a, and places it in the upper 64 bits
+// gets the lower 64 bits of a and places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
+{
+ int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+ return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
+}
+
+// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
+// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
+{
+ int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+ int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+ return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
+}
+
+// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
+// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
+// places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
+{
+ int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+ return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
+{
+ int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
+ int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+ return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
+{
+ int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+ int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+ return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
+{
+ int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+ int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
+ return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
+}
+
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
+FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
+{
+ int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
+ uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
+ uint8x16_t idx_masked =
+ vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
+#elif defined(__GNUC__)
+ int8x16_t ret;
+ // %e and %f represent the even and odd D registers
+ // respectively.
+ __asm__ __volatile__(
+ "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
+ "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
+ : [ret] "=&w"(ret)
+ : [tbl] "w"(tbl), [idx] "w"(idx_masked));
+ return vreinterpretq_m128i_s8(ret);
+#else
+ // use this line if testing on aarch64
+ int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
+ return vreinterpretq_m128i_s8(
+ vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
+ vtbl2_s8(a_split, vget_high_u8(idx_masked))));
+#endif
+}
+
+// C equivalent:
+// __m128i _mm_shuffle_epi32_default(__m128i a,
+// __constrange(0, 255) int imm) {
+// __m128i ret;
+// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
+// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
+// return ret;
+// }
+#define _mm_shuffle_epi32_default(a, imm) \
+ __extension__({ \
+ int32x4_t ret; \
+ ret = vmovq_n_s32( \
+ vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \
+ ret = vsetq_lane_s32( \
+ vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
+ ret, 1); \
+ ret = vsetq_lane_s32( \
+ vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
+ ret, 2); \
+ ret = vsetq_lane_s32( \
+ vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
+ ret, 3); \
+ vreinterpretq_m128i_s32(ret); \
+ })
+
+// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
+// int imm)
+#if defined(__aarch64__)
+#define _mm_shuffle_epi32_splat(a, imm) \
+ __extension__({ \
+ vreinterpretq_m128i_s32( \
+ vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
+ })
+#else
+#define _mm_shuffle_epi32_splat(a, imm) \
+ __extension__({ \
+ vreinterpretq_m128i_s32( \
+ vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
+ })
+#endif
+
+// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
+// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
+// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
+// __constrange(0,255) int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_epi32(a, imm) \
+ __extension__({ \
+ int32x4_t _input = vreinterpretq_s32_m128i(a); \
+ int32x4_t _shuf = __builtin_shufflevector( \
+ _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+ ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
+ vreinterpretq_m128i_s32(_shuf); \
+ })
+#else // generic
+#define _mm_shuffle_epi32(a, imm) \
+ __extension__({ \
+ __m128i ret; \
+ switch (imm) { \
+ case _MM_SHUFFLE(1, 0, 3, 2): \
+ ret = _mm_shuffle_epi_1032((a)); \
+ break; \
+ case _MM_SHUFFLE(2, 3, 0, 1): \
+ ret = _mm_shuffle_epi_2301((a)); \
+ break; \
+ case _MM_SHUFFLE(0, 3, 2, 1): \
+ ret = _mm_shuffle_epi_0321((a)); \
+ break; \
+ case _MM_SHUFFLE(2, 1, 0, 3): \
+ ret = _mm_shuffle_epi_2103((a)); \
+ break; \
+ case _MM_SHUFFLE(1, 0, 1, 0): \
+ ret = _mm_shuffle_epi_1010((a)); \
+ break; \
+ case _MM_SHUFFLE(1, 0, 0, 1): \
+ ret = _mm_shuffle_epi_1001((a)); \
+ break; \
+ case _MM_SHUFFLE(0, 1, 0, 1): \
+ ret = _mm_shuffle_epi_0101((a)); \
+ break; \
+ case _MM_SHUFFLE(2, 2, 1, 1): \
+ ret = _mm_shuffle_epi_2211((a)); \
+ break; \
+ case _MM_SHUFFLE(0, 1, 2, 2): \
+ ret = _mm_shuffle_epi_0122((a)); \
+ break; \
+ case _MM_SHUFFLE(3, 3, 3, 2): \
+ ret = _mm_shuffle_epi_3332((a)); \
+ break; \
+ case _MM_SHUFFLE(0, 0, 0, 0): \
+ ret = _mm_shuffle_epi32_splat((a), 0); \
+ break; \
+ case _MM_SHUFFLE(1, 1, 1, 1): \
+ ret = _mm_shuffle_epi32_splat((a), 1); \
+ break; \
+ case _MM_SHUFFLE(2, 2, 2, 2): \
+ ret = _mm_shuffle_epi32_splat((a), 2); \
+ break; \
+ case _MM_SHUFFLE(3, 3, 3, 3): \
+ ret = _mm_shuffle_epi32_splat((a), 3); \
+ break; \
+ default: \
+ ret = _mm_shuffle_epi32_default((a), (imm)); \
+ break; \
+ } \
+ ret; \
+ })
+#endif
+
+// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
+// by imm.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
+// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
+// __constrange(0,255) int
+// imm)
+#define _mm_shufflelo_epi16_function(a, imm) \
+ __extension__({ \
+ int16x8_t ret = vreinterpretq_s16_m128i(a); \
+ int16x4_t lowBits = vget_low_s16(ret); \
+ ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
+ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
+ 1); \
+ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
+ 2); \
+ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
+ 3); \
+ vreinterpretq_m128i_s16(ret); \
+ })
+
+// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
+// __constrange(0,255) int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shufflelo_epi16(a, imm) \
+ __extension__({ \
+ int16x8_t _input = vreinterpretq_s16_m128i(a); \
+ int16x8_t _shuf = __builtin_shufflevector( \
+ _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
+ (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
+ vreinterpretq_m128i_s16(_shuf); \
+ })
+#else // generic
+#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
+#endif
+
+// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
+// by imm.
+// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
+// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
+// __constrange(0,255) int
+// imm)
+#define _mm_shufflehi_epi16_function(a, imm) \
+ __extension__({ \
+ int16x8_t ret = vreinterpretq_s16_m128i(a); \
+ int16x4_t highBits = vget_high_s16(ret); \
+ ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
+ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
+ 5); \
+ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
+ 6); \
+ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
+ 7); \
+ vreinterpretq_m128i_s16(ret); \
+ })
+
+// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
+// __constrange(0,255) int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shufflehi_epi16(a, imm) \
+ __extension__({ \
+ int16x8_t _input = vreinterpretq_s16_m128i(a); \
+ int16x8_t _shuf = __builtin_shufflevector( \
+ _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
+ (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
+ (((imm) >> 6) & 0x3) + 4); \
+ vreinterpretq_m128i_s16(_shuf); \
+ })
+#else // generic
+#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
+#endif
+
+// Shuffle double-precision (64-bit) floating-point elements using the control
+// in imm8, and store the results in dst.
+//
+// dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+// dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_pd(a, b, imm8) \
+ vreinterpretq_m128d_s64(__builtin_shufflevector( \
+ vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \
+ ((imm8 & 0x2) >> 1) + 2))
+#else
+#define _mm_shuffle_pd(a, b, imm8) \
+ _mm_castsi128_pd(_mm_set_epi64x( \
+ vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
+ vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
+#endif
+
+// Blend packed 16-bit integers from a and b using control mask imm8, and store
+// the results in dst.
+//
+// FOR j := 0 to 7
+// i := j*16
+// IF imm8[j]
+// dst[i+15:i] := b[i+15:i]
+// ELSE
+// dst[i+15:i] := a[i+15:i]
+// FI
+// ENDFOR
+// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
+// __constrange(0,255) int imm)
+#define _mm_blend_epi16(a, b, imm) \
+ __extension__({ \
+ const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000, \
+ ((imm) & (1 << 1)) ? 0xFFFF : 0x0000, \
+ ((imm) & (1 << 2)) ? 0xFFFF : 0x0000, \
+ ((imm) & (1 << 3)) ? 0xFFFF : 0x0000, \
+ ((imm) & (1 << 4)) ? 0xFFFF : 0x0000, \
+ ((imm) & (1 << 5)) ? 0xFFFF : 0x0000, \
+ ((imm) & (1 << 6)) ? 0xFFFF : 0x0000, \
+ ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \
+ uint16x8_t _mask_vec = vld1q_u16(_mask); \
+ uint16x8_t _a = vreinterpretq_u16_m128i(a); \
+ uint16x8_t _b = vreinterpretq_u16_m128i(b); \
+ vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
+ })
+
+// Blend packed 8-bit integers from a and b using mask, and store the results in
+// dst.
+//
+// FOR j := 0 to 15
+// i := j*8
+// IF mask[i+7]
+// dst[i+7:i] := b[i+7:i]
+// ELSE
+// dst[i+7:i] := a[i+7:i]
+// FI
+// ENDFOR
+FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
+{
+ // Use a signed shift right to create a mask with the sign bit
+ uint8x16_t mask =
+ vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
+ uint8x16_t a = vreinterpretq_u8_m128i(_a);
+ uint8x16_t b = vreinterpretq_u8_m128i(_b);
+ return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
+}
+
+/* Shifts */
+
+
+// Shift packed 16-bit integers in a right by imm while shifting in sign
+// bits, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
+FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
+{
+ const int count = (imm & ~15) ? 15 : imm;
+ return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
+}
+
+// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
+// shifting in zeros.
+//
+// r0 := a0 << count
+// r1 := a1 << count
+// ...
+// r7 := a7 << count
+//
+// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
+#define _mm_slli_epi16(a, imm) \
+ __extension__({ \
+ __m128i ret; \
+ if (unlikely((imm)) <= 0) { \
+ ret = a; \
+ } \
+ if (unlikely((imm) > 15)) { \
+ ret = _mm_setzero_si128(); \
+ } else { \
+ ret = vreinterpretq_m128i_s16( \
+ vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
+ } \
+ ret; \
+ })
+
+// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
+// shifting in zeros. :
+// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
+// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
+FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
+{
+ if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
+ return a;
+ if (unlikely(imm > 31))
+ return _mm_setzero_si128();
+ return vreinterpretq_m128i_s32(
+ vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
+}
+
+// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
+{
+ if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
+ return a;
+ if (unlikely(imm > 63))
+ return _mm_setzero_si128();
+ return vreinterpretq_m128i_s64(
+ vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
+}
+
+// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+// FOR j := 0 to 7
+// i := j*16
+// IF imm8[7:0] > 15
+// dst[i+15:i] := 0
+// ELSE
+// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
+#define _mm_srli_epi16(a, imm) \
+ __extension__({ \
+ __m128i ret; \
+ if (unlikely(imm) == 0) { \
+ ret = a; \
+ } \
+ if (likely(0 < (imm) && (imm) < 16)) { \
+ ret = vreinterpretq_m128i_u16( \
+ vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \
+ } else { \
+ ret = _mm_setzero_si128(); \
+ } \
+ ret; \
+ })
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+// FOR j := 0 to 3
+// i := j*32
+// IF imm8[7:0] > 31
+// dst[i+31:i] := 0
+// ELSE
+// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
+// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srli_epi32(a, imm) \
+ __extension__({ \
+ __m128i ret; \
+ if (unlikely((imm) == 0)) { \
+ ret = a; \
+ } \
+ if (likely(0 < (imm) && (imm) < 32)) { \
+ ret = vreinterpretq_m128i_u32( \
+ vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \
+ } else { \
+ ret = _mm_setzero_si128(); \
+ } \
+ ret; \
+ })
+
+// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+// FOR j := 0 to 1
+// i := j*64
+// IF imm8[7:0] > 63
+// dst[i+63:i] := 0
+// ELSE
+// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
+#define _mm_srli_epi64(a, imm) \
+ __extension__({ \
+ __m128i ret; \
+ if (unlikely((imm) == 0)) { \
+ ret = a; \
+ } \
+ if (likely(0 < (imm) && (imm) < 64)) { \
+ ret = vreinterpretq_m128i_u64( \
+ vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \
+ } else { \
+ ret = _mm_setzero_si128(); \
+ } \
+ ret; \
+ })
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
+// and store the results in dst.
+//
+// FOR j := 0 to 3
+// i := j*32
+// IF imm8[7:0] > 31
+// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+// ELSE
+// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
+// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srai_epi32(a, imm) \
+ __extension__({ \
+ __m128i ret; \
+ if (unlikely((imm) == 0)) { \
+ ret = a; \
+ } \
+ if (likely(0 < (imm) && (imm) < 32)) { \
+ ret = vreinterpretq_m128i_s32( \
+ vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \
+ } else { \
+ ret = vreinterpretq_m128i_s32( \
+ vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
+ } \
+ ret; \
+ })
+
+// Shifts the 128 - bit value in a right by imm bytes while shifting in
+// zeros.imm must be an immediate.
+//
+// r := srl(a, imm*8)
+//
+// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
+// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
+#define _mm_srli_si128(a, imm) \
+ __extension__({ \
+ __m128i ret; \
+ if (unlikely((imm) <= 0)) { \
+ ret = a; \
+ } \
+ if (unlikely((imm) > 15)) { \
+ ret = _mm_setzero_si128(); \
+ } else { \
+ ret = vreinterpretq_m128i_s8( \
+ vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
+ } \
+ ret; \
+ })
+
+// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
+// must be an immediate.
+//
+// r := a << (imm * 8)
+//
+// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
+// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
+#define _mm_slli_si128(a, imm) \
+ __extension__({ \
+ __m128i ret; \
+ if (unlikely((imm) <= 0)) { \
+ ret = a; \
+ } \
+ if (unlikely((imm) > 15)) { \
+ ret = _mm_setzero_si128(); \
+ } else { \
+ ret = vreinterpretq_m128i_s8(vextq_s8( \
+ vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
+ } \
+ ret; \
+ })
+
+// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
+// shifting in zeros.
+//
+// r0 := a0 << count
+// r1 := a1 << count
+// ...
+// r7 := a7 << count
+//
+// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
+{
+ uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+ if (unlikely(c > 15))
+ return _mm_setzero_si128();
+
+ int16x8_t vc = vdupq_n_s16((int16_t) c);
+ return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
+}
+
+// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
+// shifting in zeros.
+//
+// r0 := a0 << count
+// r1 := a1 << count
+// r2 := a2 << count
+// r3 := a3 << count
+//
+// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
+{
+ uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+ if (unlikely(c > 31))
+ return _mm_setzero_si128();
+
+ int32x4_t vc = vdupq_n_s32((int32_t) c);
+ return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
+}
+
+// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
+// shifting in zeros.
+//
+// r0 := a0 << count
+// r1 := a1 << count
+//
+// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
+{
+ uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+ if (unlikely(c > 63))
+ return _mm_setzero_si128();
+
+ int64x2_t vc = vdupq_n_s64((int64_t) c);
+ return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
+}
+
+// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
+// while shifting in zeros.
+//
+// r0 := srl(a0, count)
+// r1 := srl(a1, count)
+// ...
+// r7 := srl(a7, count)
+//
+// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
+{
+ uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+ if (unlikely(c > 15))
+ return _mm_setzero_si128();
+
+ int16x8_t vc = vdupq_n_s16(-(int16_t) c);
+ return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
+}
+
+// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
+// while shifting in zeros.
+//
+// r0 := srl(a0, count)
+// r1 := srl(a1, count)
+// r2 := srl(a2, count)
+// r3 := srl(a3, count)
+//
+// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
+{
+ uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+ if (unlikely(c > 31))
+ return _mm_setzero_si128();
+
+ int32x4_t vc = vdupq_n_s32(-(int32_t) c);
+ return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
+}
+
+// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
+// while shifting in zeros.
+//
+// r0 := srl(a0, count)
+// r1 := srl(a1, count)
+//
+// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
+{
+ uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+ if (unlikely(c > 63))
+ return _mm_setzero_si128();
+
+ int64x2_t vc = vdupq_n_s64(-(int64_t) c);
+ return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
+}
+
+// NEON does not provide a version of this function.
+// Creates a 16-bit mask from the most significant bits of the 16 signed or
+// unsigned 8-bit integers in a and zero extends the upper bits.
+// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
+FORCE_INLINE int _mm_movemask_epi8(__m128i a)
+{
+ // Use increasingly wide shifts+adds to collect the sign bits
+ // together.
+ // Since the widening shifts would be rather confusing to follow in little
+ // endian, everything will be illustrated in big endian order instead. This
+ // has a different result - the bits would actually be reversed on a big
+ // endian machine.
+
+ // Starting input (only half the elements are shown):
+ // 89 ff 1d c0 00 10 99 33
+ uint8x16_t input = vreinterpretq_u8_m128i(a);
+
+ // Shift out everything but the sign bits with an unsigned shift right.
+ //
+ // Bytes of the vector::
+ // 89 ff 1d c0 00 10 99 33
+ // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
+ // | | | | | | | |
+ // 01 01 00 01 00 00 01 00
+ //
+ // Bits of first important lane(s):
+ // 10001001 (89)
+ // \______
+ // |
+ // 00000001 (01)
+ uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
+
+ // Merge the even lanes together with a 16-bit unsigned shift right + add.
+ // 'xx' represents garbage data which will be ignored in the final result.
+ // In the important bytes, the add functions like a binary OR.
+ //
+ // 01 01 00 01 00 00 01 00
+ // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
+ // \| \| \| \|
+ // xx 03 xx 01 xx 00 xx 02
+ //
+ // 00000001 00000001 (01 01)
+ // \_______ |
+ // \|
+ // xxxxxxxx xxxxxx11 (xx 03)
+ uint32x4_t paired16 =
+ vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+
+ // Repeat with a wider 32-bit shift + add.
+ // xx 03 xx 01 xx 00 xx 02
+ // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
+ // 14))
+ // \| \|
+ // xx xx xx 0d xx xx xx 02
+ //
+ // 00000011 00000001 (03 01)
+ // \\_____ ||
+ // '----.\||
+ // xxxxxxxx xxxx1101 (xx 0d)
+ uint64x2_t paired32 =
+ vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+
+ // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
+ // lanes. xx xx xx 0d xx xx xx 02
+ // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>
+ // 28))
+ // \|
+ // xx xx xx xx xx xx xx d2
+ //
+ // 00001101 00000010 (0d 02)
+ // \ \___ | |
+ // '---. \| |
+ // xxxxxxxx 11010010 (xx d2)
+ uint8x16_t paired64 =
+ vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+
+ // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
+ // xx xx xx xx xx xx xx d2
+ // || return paired64[0]
+ // d2
+ // Note: Little endian would return the correct value 4b (01001011) instead.
+ return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+//
+// dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
+FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
+{
+ return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
+}
+
+// Copy the 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+//
+// dst[63:0] := a[63:0]
+// dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
+FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
+{
+ return vreinterpretq_m128i_s64(
+ vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
+}
+
+// NEON does not provide this method
+// Creates a 4-bit mask from the most significant bits of the four
+// single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
+FORCE_INLINE int _mm_movemask_ps(__m128 a)
+{
+ uint32x4_t input = vreinterpretq_u32_m128(a);
+#if defined(__aarch64__)
+ static const int32x4_t shift = {0, 1, 2, 3};
+ uint32x4_t tmp = vshrq_n_u32(input, 31);
+ return vaddvq_u32(vshlq_u32(tmp, shift));
+#else
+ // Uses the exact same method as _mm_movemask_epi8, see that for details.
+ // Shift out everything but the sign bits with a 32-bit unsigned shift
+ // right.
+ uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
+ // Merge the two pairs together with a 64-bit unsigned shift right + add.
+ uint8x16_t paired =
+ vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
+ // Extract the result.
+ return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
+#endif
+}
+
+// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
+// all 1's, and return 1 if the result is zero, otherwise return 0.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
+FORCE_INLINE int _mm_test_all_ones(__m128i a)
+{
+ return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
+ ~(uint64_t) 0;
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and
+// mask, and return 1 if the result is zero, otherwise return 0.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
+FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
+{
+ int64x2_t a_and_mask =
+ vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
+ return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
+ : 1;
+}
+
+/* Math operations */
+
+// Subtracts the four single-precision, floating-point values of a and b.
+//
+// r0 := a0 - b0
+// r1 := a1 - b1
+// r2 := a2 - b2
+// r3 := a3 - b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_f32(
+ vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Subtract the lower single-precision (32-bit) floating-point element in b from
+// the lower single-precision (32-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper 3 packed elements from
+// a to the upper elements of dst.
+//
+// dst[31:0] := a[31:0] - b[31:0]
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
+FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_sub_ps(a, b));
+}
+
+// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
+// and store the results in dst.
+// r0 := a0 - b0
+// r1 := a1 - b1
+FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s64(
+ vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
+// unsigned 32-bit integers of a.
+//
+// r0 := a0 - b0
+// r1 := a1 - b1
+// r2 := a2 - b2
+// r3 := a3 - b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s32(
+ vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
+// store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
+FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s16(
+ vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
+// store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
+FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s8(
+ vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
+//
+// dst[63:0] := a[63:0] - b[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
+FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_s64(
+ vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit
+// integers of a and saturates..
+// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u16(
+ vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
+// integers of a and saturates.
+//
+// r0 := UnsignedSaturate(a0 - b0)
+// r1 := UnsignedSaturate(a1 - b1)
+// ...
+// r15 := UnsignedSaturate(a15 - b15)
+//
+// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
+// of a and saturates.
+//
+// r0 := SignedSaturate(a0 - b0)
+// r1 := SignedSaturate(a1 - b1)
+// ...
+// r15 := SignedSaturate(a15 - b15)
+//
+// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s8(
+ vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
+// of a and saturates.
+//
+// r0 := SignedSaturate(a0 - b0)
+// r1 := SignedSaturate(a1 - b1)
+// ...
+// r7 := SignedSaturate(a7 - b7)
+//
+// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s16(
+ vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Subtract packed double-precision (64-bit) floating-point elements in b from
+// packed double-precision (64-bit) floating-point elements in a, and store the
+// results in dst.
+//
+// FOR j := 0 to 1
+// i := j*64
+// dst[i+63:i] := a[i+63:i] - b[i+63:i]
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
+FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ double *da = (double *) &a;
+ double *db = (double *) &b;
+ double c[2];
+ c[0] = da[0] - db[0];
+ c[1] = da[1] - db[1];
+ return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Subtract the lower double-precision (64-bit) floating-point element in b from
+// the lower double-precision (64-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
+FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
+{
+ return _mm_move_sd(a, _mm_sub_pd(a, b));
+}
+
+// Add packed unsigned 16-bit integers in a and b using saturation, and store
+// the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
+FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u16(
+ vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed
+// 8-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+// for i in 0..15
+// if b[i] < 0
+// r[i] := -a[i]
+// else if b[i] == 0
+// r[i] := 0
+// else
+// r[i] := a[i]
+// fi
+// done
+FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
+{
+ int8x16_t a = vreinterpretq_s8_m128i(_a);
+ int8x16_t b = vreinterpretq_s8_m128i(_b);
+
+ // signed shift right: faster than vclt
+ // (b < 0) ? 0xFF : 0
+ uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
+
+ // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+ int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
+#else
+ int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
+#endif
+
+ // bitwise select either a or nagative 'a' (vnegq_s8(a) return nagative 'a')
+ // based on ltMask
+ int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
+ // res = masked & (~zeroMask)
+ int8x16_t res = vbicq_s8(masked, zeroMask);
+
+ return vreinterpretq_m128i_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed
+// 16-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+// for i in 0..7
+// if b[i] < 0
+// r[i] := -a[i]
+// else if b[i] == 0
+// r[i] := 0
+// else
+// r[i] := a[i]
+// fi
+// done
+FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
+{
+ int16x8_t a = vreinterpretq_s16_m128i(_a);
+ int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+ // signed shift right: faster than vclt
+ // (b < 0) ? 0xFFFF : 0
+ uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
+ // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+ int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
+#else
+ int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
+#endif
+
+ // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
+ // 'a') based on ltMask
+ int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
+ // res = masked & (~zeroMask)
+ int16x8_t res = vbicq_s16(masked, zeroMask);
+ return vreinterpretq_m128i_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed
+// 32-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+// for i in 0..3
+// if b[i] < 0
+// r[i] := -a[i]
+// else if b[i] == 0
+// r[i] := 0
+// else
+// r[i] := a[i]
+// fi
+// done
+FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
+{
+ int32x4_t a = vreinterpretq_s32_m128i(_a);
+ int32x4_t b = vreinterpretq_s32_m128i(_b);
+
+ // signed shift right: faster than vclt
+ // (b < 0) ? 0xFFFFFFFF : 0
+ uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
+
+ // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+ int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
+#else
+ int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
+#endif
+
+ // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
+ // 'a') based on ltMask
+ int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
+ // res = masked & (~zeroMask)
+ int32x4_t res = vbicq_s32(masked, zeroMask);
+ return vreinterpretq_m128i_s32(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative, and store the results in dst. Element in dst are
+// zeroed out when the corresponding element in b is zero.
+//
+// FOR j := 0 to 3
+// i := j*16
+// IF b[i+15:i] < 0
+// dst[i+15:i] := -(a[i+15:i])
+// ELSE IF b[i+15:i] == 0
+// dst[i+15:i] := 0
+// ELSE
+// dst[i+15:i] := a[i+15:i]
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
+FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
+{
+ int16x4_t a = vreinterpret_s16_m64(_a);
+ int16x4_t b = vreinterpret_s16_m64(_b);
+
+ // signed shift right: faster than vclt
+ // (b < 0) ? 0xFFFF : 0
+ uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
+
+ // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+ int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
+#else
+ int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
+#endif
+
+ // bitwise select either a or nagative 'a' (vneg_s16(a) return nagative 'a')
+ // based on ltMask
+ int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
+ // res = masked & (~zeroMask)
+ int16x4_t res = vbic_s16(masked, zeroMask);
+
+ return vreinterpret_m64_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed 32-bit
+// integer in b is negative, and store the results in dst. Element in dst are
+// zeroed out when the corresponding element in b is zero.
+//
+// FOR j := 0 to 1
+// i := j*32
+// IF b[i+31:i] < 0
+// dst[i+31:i] := -(a[i+31:i])
+// ELSE IF b[i+31:i] == 0
+// dst[i+31:i] := 0
+// ELSE
+// dst[i+31:i] := a[i+31:i]
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
+FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
+{
+ int32x2_t a = vreinterpret_s32_m64(_a);
+ int32x2_t b = vreinterpret_s32_m64(_b);
+
+ // signed shift right: faster than vclt
+ // (b < 0) ? 0xFFFFFFFF : 0
+ uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
+
+ // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+ int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
+#else
+ int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
+#endif
+
+ // bitwise select either a or nagative 'a' (vneg_s32(a) return nagative 'a')
+ // based on ltMask
+ int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
+ // res = masked & (~zeroMask)
+ int32x2_t res = vbic_s32(masked, zeroMask);
+
+ return vreinterpret_m64_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
+// in b is negative, and store the results in dst. Element in dst are zeroed out
+// when the corresponding element in b is zero.
+//
+// FOR j := 0 to 7
+// i := j*8
+// IF b[i+7:i] < 0
+// dst[i+7:i] := -(a[i+7:i])
+// ELSE IF b[i+7:i] == 0
+// dst[i+7:i] := 0
+// ELSE
+// dst[i+7:i] := a[i+7:i]
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
+FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
+{
+ int8x8_t a = vreinterpret_s8_m64(_a);
+ int8x8_t b = vreinterpret_s8_m64(_b);
+
+ // signed shift right: faster than vclt
+ // (b < 0) ? 0xFF : 0
+ uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
+
+ // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+ int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
+#else
+ int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
+#endif
+
+ // bitwise select either a or nagative 'a' (vneg_s8(a) return nagative 'a')
+ // based on ltMask
+ int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
+ // res = masked & (~zeroMask)
+ int8x8_t res = vbic_s8(masked, zeroMask);
+
+ return vreinterpret_m64_s8(res);
+}
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
+FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_u16(
+ vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+//
+// FOR j := 0 to 7
+// i := j*8
+// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
+FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_u8(
+ vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+//
+// FOR j := 0 to 7
+// i := j*8
+// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
+#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
+#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
+#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
+#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
+#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
+#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
+#define _m_pminsw(a, b) _mm_min_pi16(a, b)
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
+#define _m_pminub(a, b) _mm_min_pu8(a, b)
+
+// Computes the average of the 16 unsigned 8-bit integers in a and the 16
+// unsigned 8-bit integers in b and rounds.
+//
+// r0 := (a0 + b0) / 2
+// r1 := (a1 + b1) / 2
+// ...
+// r15 := (a15 + b15) / 2
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Computes the average of the 8 unsigned 16-bit integers in a and the 8
+// unsigned 16-bit integers in b and rounds.
+//
+// r0 := (a0 + b0) / 2
+// r1 := (a1 + b1) / 2
+// ...
+// r7 := (a7 + b7) / 2
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
+{
+ return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
+ vreinterpretq_u16_m128i(b));
+}
+
+// Adds the four single-precision, floating-point values of a and b.
+//
+// r0 := a0 + b0
+// r1 := a1 + b1
+// r2 := a2 + b2
+// r3 := a3 + b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_f32(
+ vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Add packed double-precision (64-bit) floating-point elements in a and b, and
+// store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
+FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ double *da = (double *) &a;
+ double *db = (double *) &b;
+ double c[2];
+ c[0] = da[0] + db[0];
+ c[1] = da[1] + db[1];
+ return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add the lower double-precision (64-bit) floating-point element in a and b,
+// store the result in the lower element of dst, and copy the upper element from
+// a to the upper element of dst.
+//
+// dst[63:0] := a[63:0] + b[63:0]
+// dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
+FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return _mm_move_sd(a, _mm_add_pd(a, b));
+#else
+ double *da = (double *) &a;
+ double *db = (double *) &b;
+ double c[2];
+ c[0] = da[0] + db[0];
+ c[1] = da[1];
+ return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add 64-bit integers a and b, and store the result in dst.
+//
+// dst[63:0] := a[63:0] + b[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
+FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_s64(
+ vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// adds the scalar single-precision floating point values of a and b.
+// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
+{
+ float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+ float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
+ // the upper values in the result must be the remnants of <a>.
+ return vreinterpretq_m128_f32(vaddq_f32(a, value));
+}
+
+// Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or
+// unsigned 32-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s64(
+ vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
+// unsigned 32-bit integers in b.
+//
+// r0 := a0 + b0
+// r1 := a1 + b1
+// r2 := a2 + b2
+// r3 := a3 + b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s32(
+ vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
+// unsigned 16-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s16(
+ vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
+// unsigned 8-bit integers in b.
+// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
+FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s8(
+ vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
+// and saturates.
+//
+// r0 := SignedSaturate(a0 + b0)
+// r1 := SignedSaturate(a1 + b1)
+// ...
+// r7 := SignedSaturate(a7 + b7)
+//
+// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s16(
+ vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Add packed signed 8-bit integers in a and b using saturation, and store the
+// results in dst.
+//
+// FOR j := 0 to 15
+// i := j*8
+// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
+FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s8(
+ vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
+// b and saturates..
+// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
+// unsigned 16-bit integers from b.
+//
+// r0 := (a0 * b0)[15:0]
+// r1 := (a1 * b1)[15:0]
+// ...
+// r7 := (a7 * b7)[15:0]
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s16(
+ vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
+// unsigned 32-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s32(
+ vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// tmp[31:0] := a[i+15:i] * b[i+15:i]
+// dst[i+15:i] := tmp[31:16]
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
+#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
+
+// Multiplies the four single-precision, floating-point values of a and b.
+//
+// r0 := a0 * b0
+// r1 := a1 * b1
+// r2 := a2 * b2
+// r3 := a3 * b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_f32(
+ vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Multiply packed double-precision (64-bit) floating-point elements in a and b,
+// and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
+FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ double *da = (double *) &a;
+ double *db = (double *) &b;
+ double c[2];
+ c[0] = da[0] * db[0];
+ c[1] = da[1] * db[1];
+ return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Multiply the lower double-precision (64-bit) floating-point element in a and
+// b, store the result in the lower element of dst, and copy the upper element
+// from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
+FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
+{
+ return _mm_move_sd(a, _mm_mul_pd(a, b));
+}
+
+// Multiply the lower single-precision (32-bit) floating-point element in a and
+// b, store the result in the lower element of dst, and copy the upper 3 packed
+// elements from a to the upper elements of dst.
+//
+// dst[31:0] := a[31:0] * b[31:0]
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
+FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_mul_ps(a, b));
+}
+
+// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
+// a and b, and store the unsigned 64-bit results in dst.
+//
+// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
+// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
+FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
+{
+ // vmull_u32 upcasts instead of masking, so we downcast.
+ uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
+ uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
+ return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
+}
+
+// Multiply the low unsigned 32-bit integers from a and b, and store the
+// unsigned 64-bit result in dst.
+//
+// dst[63:0] := a[31:0] * b[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
+FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_u64(vget_low_u64(
+ vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
+}
+
+// Multiply the low signed 32-bit integers from each packed 64-bit element in
+// a and b, and store the signed 64-bit results in dst.
+//
+// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
+// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
+FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
+{
+ // vmull_s32 upcasts instead of masking, so we downcast.
+ int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
+ int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
+ return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
+}
+
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
+// integers from b.
+//
+// r0 := (a0 * b0) + (a1 * b1)
+// r1 := (a2 * b2) + (a3 * b3)
+// r2 := (a4 * b4) + (a5 * b5)
+// r3 := (a6 * b6) + (a7 * b7)
+// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
+{
+ int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+ vget_low_s16(vreinterpretq_s16_m128i(b)));
+ int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+ vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+ int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
+ int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
+
+ return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
+// the packed 16-bit integers in dst.
+//
+// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
+// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
+// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
+// ...
+// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
+FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
+{
+ // Has issues due to saturation
+ // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
+
+ // Multiply
+ int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+ vget_low_s16(vreinterpretq_s16_m128i(b)));
+ int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+ vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+ // Rounding narrowing shift right
+ // narrow = (int16_t)((mul + 16384) >> 15);
+ int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
+ int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
+
+ // Join together
+ return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
+}
+
+// Vertically multiply each unsigned 8-bit integer from a with the corresponding
+// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
+// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
+// and pack the saturated results in dst.
+//
+// FOR j := 0 to 7
+// i := j*16
+// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
+// a[i+7:i]*b[i+7:i] )
+// ENDFOR
+FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+ uint8x16_t a = vreinterpretq_u8_m128i(_a);
+ int8x16_t b = vreinterpretq_s8_m128i(_b);
+ int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
+ vmovl_s8(vget_low_s8(b)));
+ int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
+ vmovl_s8(vget_high_s8(b)));
+ return vreinterpretq_m128i_s16(
+ vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
+#else
+ // This would be much simpler if x86 would choose to zero extend OR sign
+ // extend, not both. This could probably be optimized better.
+ uint16x8_t a = vreinterpretq_u16_m128i(_a);
+ int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+ // Zero extend a
+ int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
+ int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
+
+ // Sign extend by shifting left then shifting right.
+ int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
+ int16x8_t b_odd = vshrq_n_s16(b, 8);
+
+ // multiply
+ int16x8_t prod1 = vmulq_s16(a_even, b_even);
+ int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
+
+ // saturated add
+ return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
+#endif
+}
+
+// Computes the fused multiple add product of 32-bit floating point numbers.
+//
+// Return Value
+// Multiplies A and B, and adds C to the temporary result before returning it.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd
+FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c),
+ vreinterpretq_f32_m128(b),
+ vreinterpretq_f32_m128(a)));
+#else
+ return _mm_add_ps(_mm_mul_ps(a, b), c);
+#endif
+}
+
+// Alternatively add and subtract packed single-precision (32-bit)
+// floating-point elements in a to/from packed elements in b, and store the
+// results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
+FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
+{
+ __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
+ return _mm_fmadd_ps(b, mask, a);
+}
+
+// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
+// elements in a and b, and pack the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd
+FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ double *da = (double *) &a;
+ double *db = (double *) &b;
+ double c[] = {da[0] + da[1], db[0] + db[1]};
+ return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce two
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of 64-bit elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
+FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
+{
+ uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
+ uint16_t r0 = t[0] + t[1] + t[2] + t[3];
+ uint16_t r4 = t[4] + t[5] + t[6] + t[7];
+ uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
+ return (__m128i) vsetq_lane_u16(r4, r, 4);
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
+FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
+{
+ uint16x4_t t =
+ vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+ uint16_t r0 = t[0] + t[1] + t[2] + t[3];
+ return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0));
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+//
+// FOR j := 0 to 7
+// i := j*8
+// tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
+// ENDFOR
+// dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] +
+// tmp[47:40] + tmp[55:48] + tmp[63:56] dst[63:16] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw
+#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
+
+// Divides the four single-precision, floating-point values of a and b.
+//
+// r0 := a0 / b0
+// r1 := a1 / b1
+// r2 := a2 / b2
+// r3 := a3 / b3
+//
+// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
+ return vreinterpretq_m128_f32(
+ vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+ float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
+ recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#if SSE2NEON_PRECISE_DIV
+ // Additional Netwon-Raphson iteration for accuracy
+ recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#endif
+ return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
+#endif
+}
+
+// Divides the scalar single-precision floating point value of a by b.
+// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
+{
+ float32_t value =
+ vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
+ return vreinterpretq_m128_f32(
+ vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Divide packed double-precision (64-bit) floating-point elements in a by
+// packed elements in b, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := 64*j
+// dst[i+63:i] := a[i+63:i] / b[i+63:i]
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd
+FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ double *da = (double *) &a;
+ double *db = (double *) &b;
+ double c[2];
+ c[0] = da[0] / db[0];
+ c[1] = da[1] / db[1];
+ return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Divide the lower double-precision (64-bit) floating-point element in a by the
+// lower double-precision (64-bit) floating-point element in b, store the result
+// in the lower element of dst, and copy the upper element from a to the upper
+// element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd
+FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ float64x2_t tmp =
+ vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
+ return vreinterpretq_m128d_f64(
+ vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
+#else
+ return _mm_move_sd(a, _mm_div_pd(a, b));
+#endif
+}
+
+// Compute the approximate reciprocal of packed single-precision (32-bit)
+// floating-point elements in a, and store the results in dst. The maximum
+// relative error for this approximation is less than 1.5*2^-12.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
+FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
+{
+ float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
+ recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#if SSE2NEON_PRECISE_DIV
+ // Additional Netwon-Raphson iteration for accuracy
+ recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#endif
+ return vreinterpretq_m128_f32(recip);
+}
+
+// Compute the approximate reciprocal of the lower single-precision (32-bit)
+// floating-point element in a, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
+//
+// dst[31:0] := (1.0 / a[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
+FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
+{
+ return _mm_move_ss(a, _mm_rcp_ps(a));
+}
+
+// Computes the approximations of square roots of the four single-precision,
+// floating-point values of a. First computes reciprocal square roots and then
+// reciprocals of the four values.
+//
+// r0 := sqrt(a0)
+// r1 := sqrt(a1)
+// r2 := sqrt(a2)
+// r3 := sqrt(a3)
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
+{
+#if SSE2NEON_PRECISE_SQRT
+ float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+
+ // Test for vrsqrteq_f32(0) -> positive infinity case.
+ // Change to zero, so that s * 1/sqrt(s) result is zero too.
+ const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
+ const uint32x4_t div_by_zero =
+ vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
+ recip = vreinterpretq_f32_u32(
+ vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
+
+ // Additional Netwon-Raphson iteration for accuracy
+ recip = vmulq_f32(
+ vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+ recip);
+ recip = vmulq_f32(
+ vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+ recip);
+
+ // sqrt(s) = s * 1/sqrt(s)
+ return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
+#elif defined(__aarch64__)
+ return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
+#else
+ float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+ float32x4_t sq = vrecpeq_f32(recipsq);
+ return vreinterpretq_m128_f32(sq);
+#endif
+}
+
+// Computes the approximation of the square root of the scalar single-precision
+// floating point value of in.
+// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
+{
+ float32_t value =
+ vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
+ return vreinterpretq_m128_f32(
+ vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
+}
+
+// Computes the approximations of the reciprocal square roots of the four
+// single-precision floating point values of in.
+// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
+{
+ float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+#if SSE2NEON_PRECISE_RSQRT
+ // Additional Netwon-Raphson iteration for accuracy
+ out = vmulq_f32(
+ out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+ out = vmulq_f32(
+ out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+#endif
+ return vreinterpretq_m128_f32(out);
+}
+
+// Compute the approximate reciprocal square root of the lower single-precision
+// (32-bit) floating-point element in a, store the result in the lower element
+// of dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
+FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
+{
+ return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
+FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_s16(
+ vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
+#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
+
+// Computes the maximums of the four single-precision, floating-point values of
+// a and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
+{
+#if SSE2NEON_PRECISE_MINMAX
+ float32x4_t _a = vreinterpretq_f32_m128(a);
+ float32x4_t _b = vreinterpretq_f32_m128(b);
+ return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
+#else
+ return vreinterpretq_m128_f32(
+ vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#endif
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+// FOR j := 0 to 7
+// i := j*8
+// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
+FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_u8(
+ vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+// FOR j := 0 to 7
+// i := j*8
+// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
+#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
+FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_s16(
+ vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
+#define _m_pminsw(a, b) _mm_min_pi16(a, b)
+
+// Computes the minima of the four single-precision, floating-point values of a
+// and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
+{
+#if SSE2NEON_PRECISE_MINMAX
+ float32x4_t _a = vreinterpretq_f32_m128(a);
+ float32x4_t _b = vreinterpretq_f32_m128(b);
+ return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
+#else
+ return vreinterpretq_m128_f32(
+ vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#endif
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+//
+// FOR j := 0 to 7
+// i := j*8
+// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
+FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_u8(
+ vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+//
+// FOR j := 0 to 7
+// i := j*8
+// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
+#define _m_pminub(a, b) _mm_min_pu8(a, b)
+
+// Computes the maximum of the two lower scalar single-precision floating point
+// values of a and b.
+// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
+{
+ float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
+ return vreinterpretq_m128_f32(
+ vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Computes the minimum of the two lower scalar single-precision floating point
+// values of a and b.
+// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
+{
+ float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
+ return vreinterpretq_m128_f32(
+ vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
+// 16 unsigned 8-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
+// 16 unsigned 8-bit integers from b.
+// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
+FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
+// signed 16-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s16(
+ vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
+FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s8(
+ vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
+FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u16(
+ vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
+FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s8(
+ vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
+FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u16(
+ vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
+// signed 16-bit integers from b.
+// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s16(
+ vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// epi versions of min/max
+// Computes the pariwise maximums of the four signed 32-bit integer values of a
+// and b.
+//
+// A 128-bit parameter that can be defined with the following equations:
+// r0 := (a0 > b0) ? a0 : b0
+// r1 := (a1 > b1) ? a1 : b1
+// r2 := (a2 > b2) ? a2 : b2
+// r3 := (a3 > b3) ? a3 : b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s32(
+ vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Computes the pariwise minima of the four signed 32-bit integer values of a
+// and b.
+//
+// A 128-bit parameter that can be defined with the following equations:
+// r0 := (a0 < b0) ? a0 : b0
+// r1 := (a1 < b1) ? a1 : b1
+// r2 := (a2 < b2) ? a2 : b2
+// r3 := (a3 < b3) ? a3 : b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s32(
+ vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u32(
+ vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u32(
+ vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
+FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_u16(vshrn_n_u32(
+ vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
+}
+
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
+// integers from b.
+//
+// r0 := (a0 * b0)[31:16]
+// r1 := (a1 * b1)[31:16]
+// ...
+// r7 := (a7 * b7)[31:16]
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
+{
+ /* FIXME: issue with large values because of result saturation */
+ // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
+ // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
+ // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
+ int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
+ int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
+ int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
+ int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
+ int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
+ int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
+ uint16x8x2_t r =
+ vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
+ return vreinterpretq_m128i_u16(r.val[1]);
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
+FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
+{
+ uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
+ uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
+ uint32x4_t ab3210 = vmull_u16(a3210, b3210);
+#if defined(__aarch64__)
+ uint32x4_t ab7654 =
+ vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
+ uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
+ vreinterpretq_u16_u32(ab7654));
+ return vreinterpretq_m128i_u16(r);
+#else
+ uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
+ uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
+ uint32x4_t ab7654 = vmull_u16(a7654, b7654);
+ uint16x8x2_t r =
+ vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
+ return vreinterpretq_m128i_u16(r.val[1]);
+#endif
+}
+
+// Computes pairwise add of each argument as single-precision, floating-point
+// values a and b.
+// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
+FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128_f32(
+ vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+ float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+ float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+ float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+ float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+ return vreinterpretq_m128_f32(
+ vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
+#endif
+}
+
+// Computes pairwise add of each argument as a 16-bit signed or unsigned integer
+// values a and b.
+FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
+{
+ int16x8_t a = vreinterpretq_s16_m128i(_a);
+ int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
+#else
+ return vreinterpretq_m128i_s16(
+ vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
+ vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
+#endif
+}
+
+// Horizontally substract adjacent pairs of single-precision (32-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
+FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128_f32(vsubq_f32(
+ vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
+ vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
+#else
+ float32x4x2_t c =
+ vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
+ return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
+// signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
+FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_s16(
+ vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
+// signed 32-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
+FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_s32(
+ vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
+}
+
+// Computes pairwise difference of each argument as a 16-bit signed or unsigned
+// integer values a and b.
+FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
+{
+ int32x4_t a = vreinterpretq_s32_m128i(_a);
+ int32x4_t b = vreinterpretq_s32_m128i(_b);
+ // Interleave using vshrn/vmovn
+ // [a0|a2|a4|a6|b0|b2|b4|b6]
+ // [a1|a3|a5|a7|b1|b3|b5|b7]
+ int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+ int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+ // Subtract
+ return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
+}
+
+// Computes saturated pairwise sub of each argument as a 16-bit signed
+// integer values a and b.
+FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+ int16x8_t a = vreinterpretq_s16_m128i(_a);
+ int16x8_t b = vreinterpretq_s16_m128i(_b);
+ return vreinterpretq_s64_s16(
+ vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+ int32x4_t a = vreinterpretq_s32_m128i(_a);
+ int32x4_t b = vreinterpretq_s32_m128i(_b);
+ // Interleave using vshrn/vmovn
+ // [a0|a2|a4|a6|b0|b2|b4|b6]
+ // [a1|a3|a5|a7|b1|b3|b5|b7]
+ int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+ int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+ // Saturated add
+ return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
+#endif
+}
+
+// Computes saturated pairwise difference of each argument as a 16-bit signed
+// integer values a and b.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
+FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+ int16x8_t a = vreinterpretq_s16_m128i(_a);
+ int16x8_t b = vreinterpretq_s16_m128i(_b);
+ return vreinterpretq_s64_s16(
+ vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+ int32x4_t a = vreinterpretq_s32_m128i(_a);
+ int32x4_t b = vreinterpretq_s32_m128i(_b);
+ // Interleave using vshrn/vmovn
+ // [a0|a2|a4|a6|b0|b2|b4|b6]
+ // [a1|a3|a5|a7|b1|b3|b5|b7]
+ int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+ int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+ // Saturated subtract
+ return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
+#endif
+}
+
+// Computes pairwise add of each argument as a 32-bit signed or unsigned integer
+// values a and b.
+FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
+{
+ int32x4_t a = vreinterpretq_s32_m128i(_a);
+ int32x4_t b = vreinterpretq_s32_m128i(_b);
+ return vreinterpretq_m128i_s32(
+ vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
+ vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
+}
+
+// Computes pairwise difference of each argument as a 32-bit signed or unsigned
+// integer values a and b.
+FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
+{
+ int64x2_t a = vreinterpretq_s64_m128i(_a);
+ int64x2_t b = vreinterpretq_s64_m128i(_b);
+ // Interleave using vshrn/vmovn
+ // [a0|a2|b0|b2]
+ // [a1|a2|b1|b3]
+ int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
+ int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
+ // Subtract
+ return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
+}
+
+// Kahan summation for accurate summation of floating-point numbers.
+// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
+FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
+{
+ y -= *c;
+ float t = *sum + y;
+ *c = (t - *sum) - y;
+ *sum = t;
+}
+
+// Conditionally multiply the packed single-precision (32-bit) floating-point
+// elements in a and b using the high 4 bits in imm8, sum the four products,
+// and conditionally store the sum in dst using the low 4 bits of imm.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
+FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
+{
+#if defined(__aarch64__)
+ /* shortcuts */
+ if (imm == 0xFF) {
+ return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
+ }
+ if (imm == 0x7F) {
+ float32x4_t m = _mm_mul_ps(a, b);
+ m[3] = 0;
+ return _mm_set1_ps(vaddvq_f32(m));
+ }
+#endif
+
+ float s = 0, c = 0;
+ float32x4_t f32a = vreinterpretq_f32_m128(a);
+ float32x4_t f32b = vreinterpretq_f32_m128(b);
+
+ /* To improve the accuracy of floating-point summation, Kahan algorithm
+ * is used for each operation.
+ */
+ if (imm & (1 << 4))
+ _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
+ if (imm & (1 << 5))
+ _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
+ if (imm & (1 << 6))
+ _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
+ if (imm & (1 << 7))
+ _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
+ s += c;
+
+ float32x4_t res = {
+ (imm & 0x1) ? s : 0,
+ (imm & 0x2) ? s : 0,
+ (imm & 0x4) ? s : 0,
+ (imm & 0x8) ? s : 0,
+ };
+ return vreinterpretq_m128_f32(res);
+}
+
+/* Compare operations */
+
+// Compares for less than
+// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_u32(
+ vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for less than
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
+FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmplt_ps(a, b));
+}
+
+// Compares for greater than.
+//
+// r0 := (a0 > b0) ? 0xffffffff : 0x0
+// r1 := (a1 > b1) ? 0xffffffff : 0x0
+// r2 := (a2 > b2) ? 0xffffffff : 0x0
+// r3 := (a3 > b3) ? 0xffffffff : 0x0
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_u32(
+ vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for greater than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
+}
+
+// Compares for greater than or equal.
+// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_u32(
+ vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for greater than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmpge_ps(a, b));
+}
+
+// Compares for less than or equal.
+//
+// r0 := (a0 <= b0) ? 0xffffffff : 0x0
+// r1 := (a1 <= b1) ? 0xffffffff : 0x0
+// r2 := (a2 <= b2) ? 0xffffffff : 0x0
+// r3 := (a3 <= b3) ? 0xffffffff : 0x0
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_u32(
+ vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for less than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
+FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmple_ps(a, b));
+}
+
+// Compares for equality.
+// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_u32(
+ vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for equality.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
+}
+
+// Compares for inequality.
+// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_u32(vmvnq_u32(
+ vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compares for inequality.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
+}
+
+// Compares for not greater than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
+{
+ return _mm_cmplt_ps(a, b);
+}
+
+// Compares for not greater than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
+{
+ return _mm_cmplt_ss(a, b);
+}
+
+// Compares for not greater than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
+{
+ return _mm_cmple_ps(a, b);
+}
+
+// Compares for not greater than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
+{
+ return _mm_cmple_ss(a, b);
+}
+
+// Compares for not less than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
+{
+ return _mm_cmpgt_ps(a, b);
+}
+
+// Compares for not less than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
+{
+ return _mm_cmpgt_ss(a, b);
+}
+
+// Compares for not less than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
+{
+ return _mm_cmpge_ps(a, b);
+}
+
+// Compares for not less than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
+{
+ return _mm_cmpge_ss(a, b);
+}
+
+// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
+// unsigned 8-bit integers in b for equality.
+// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for equality, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
+FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_u64(
+ vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+ uint32x4_t cmp =
+ vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+ uint32x4_t swapped = vrev64q_u32(cmp);
+ return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
+// unsigned 16-bit integers in b for equality.
+// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u16(
+ vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed 32-bit integers in a and b for equality, and store the results
+// in dst
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u32(
+ vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed 64-bit integers in a and b for equality, and store the results
+// in dst
+FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_u64(
+ vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
+#else
+ // ARMv7 lacks vceqq_u64
+ // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+ uint32x4_t cmp =
+ vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
+ uint32x4_t swapped = vrev64q_u32(cmp);
+ return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
+// in b for lesser than.
+// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
+// in b for greater than.
+//
+// r0 := (a0 > b0) ? 0xff : 0x0
+// r1 := (a1 > b1) ? 0xff : 0x0
+// ...
+// r15 := (a15 > b15) ? 0xff : 0x0
+//
+// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
+// in b for less than.
+//
+// r0 := (a0 < b0) ? 0xffff : 0x0
+// r1 := (a1 < b1) ? 0xffff : 0x0
+// ...
+// r7 := (a7 < b7) ? 0xffff : 0x0
+//
+// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u16(
+ vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
+// in b for greater than.
+//
+// r0 := (a0 > b0) ? 0xffff : 0x0
+// r1 := (a1 > b1) ? 0xffff : 0x0
+// ...
+// r7 := (a7 > b7) ? 0xffff : 0x0
+//
+// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u16(
+ vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+
+// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
+// in b for less than.
+// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u32(
+ vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
+// in b for greater than.
+// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u32(
+ vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
+// in b for greater than.
+FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_u64(
+ vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#else
+ // ARMv7 lacks vcgtq_s64.
+ // This is based off of Clang's SSE2 polyfill:
+ // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))
+
+ // Mask the sign bit out since we need a signed AND an unsigned comparison
+ // and it is ugly to try and split them.
+ int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
+ int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
+ int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
+ // Check if a > b
+ int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
+ // Copy upper mask to lower mask
+ // a_hi > b_hi
+ int64x2_t gt_hi = vshrq_n_s64(greater, 63);
+ // Copy lower mask to upper mask
+ // a_lo > b_lo
+ int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
+ // Compare for equality
+ int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
+ // Copy upper mask to lower mask
+ // a_hi == b_hi
+ int64x2_t eq_hi = vshrq_n_s64(equal, 63);
+ // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
+ int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
+ return vreinterpretq_m128i_s64(ret);
+#endif
+}
+
+// Compares the four 32-bit floats in a and b to check if any values are NaN.
+// Ordered compare between each value returns true for "orderable" and false for
+// "not orderable" (NaN).
+// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
+// also:
+// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
+// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
+FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
+{
+ // Note: NEON does not have ordered compare builtin
+ // Need to compare a eq a and b eq b to check for NaN
+ // Do AND of results to get final
+ uint32x4_t ceqaa =
+ vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+ uint32x4_t ceqbb =
+ vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+ return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
+}
+
+// Compares for ordered.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmpord_ps(a, b));
+}
+
+// Compares for unordered.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
+{
+ uint32x4_t f32a =
+ vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+ uint32x4_t f32b =
+ vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+ return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
+}
+
+// Compares for unordered.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a less than operation. :
+// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
+// note!! The documentation on MSDN is incorrect! If either of the values is a
+// NAN the docs say you will get a one, but in fact, it will return a zero!!
+FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
+{
+ uint32x4_t a_not_nan =
+ vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+ uint32x4_t b_not_nan =
+ vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+ uint32x4_t a_lt_b =
+ vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+ return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a greater than operation. :
+// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
+FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
+{
+ // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
+ // vreinterpretq_f32_m128(b)), 0);
+ uint32x4_t a_not_nan =
+ vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+ uint32x4_t b_not_nan =
+ vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+ uint32x4_t a_gt_b =
+ vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+ return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a less than or equal operation. :
+// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
+FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
+{
+ // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
+ // vreinterpretq_f32_m128(b)), 0);
+ uint32x4_t a_not_nan =
+ vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+ uint32x4_t b_not_nan =
+ vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+ uint32x4_t a_le_b =
+ vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+ return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a greater than or equal operation. :
+// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
+FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
+{
+ // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
+ // vreinterpretq_f32_m128(b)), 0);
+ uint32x4_t a_not_nan =
+ vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+ uint32x4_t b_not_nan =
+ vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+ uint32x4_t a_ge_b =
+ vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+ return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using an equality operation. :
+// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
+FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
+{
+ // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
+ // vreinterpretq_f32_m128(b)), 0);
+ uint32x4_t a_not_nan =
+ vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+ uint32x4_t b_not_nan =
+ vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+ uint32x4_t a_eq_b =
+ vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+ return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using an inequality operation. :
+// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
+FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
+{
+ // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
+ // vreinterpretq_f32_m128(b)), 0);
+ uint32x4_t a_not_nan =
+ vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+ uint32x4_t b_not_nan =
+ vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+ uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
+ uint32x4_t a_neq_b = vmvnq_u32(
+ vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+ return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
+}
+
+// according to the documentation, these intrinsics behave the same as the
+// non-'u' versions. We'll just alias them here.
+#define _mm_ucomieq_ss _mm_comieq_ss
+#define _mm_ucomige_ss _mm_comige_ss
+#define _mm_ucomigt_ss _mm_comigt_ss
+#define _mm_ucomile_ss _mm_comile_ss
+#define _mm_ucomilt_ss _mm_comilt_ss
+#define _mm_ucomineq_ss _mm_comineq_ss
+
+/* Conversions */
+
+// Convert packed signed 32-bit integers in b to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, and copy the upper 2 packed elements from a to the upper elements of
+// dst.
+//
+// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
+// dst[95:64] := a[95:64]
+// dst[127:96] := a[127:96]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
+FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
+{
+ return vreinterpretq_m128_f32(
+ vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+ vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+//
+// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
+FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
+{
+ return vreinterpretq_m128_f32(
+ vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+}
+
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+//
+// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
+#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
+
+// Convert the signed 64-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+//
+// dst[31:0] := Convert_Int64_To_FP32(b[63:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
+FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
+{
+ return vreinterpretq_m128_f32(
+ vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
+FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
+{
+#if defined(__aarch64__)
+ return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
+#else
+ float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+ float32_t diff = data - floor(data);
+ if (diff > 0.5)
+ return (int32_t) ceil(data);
+ if (unlikely(diff == 0.5)) {
+ int32_t f = (int32_t) floor(data);
+ int32_t c = (int32_t) ceil(data);
+ return c & 1 ? f : c;
+ }
+ return (int32_t) floor(data);
+#endif
+}
+
+// Convert packed 16-bit integers in a to packed single-precision (32-bit)
+// floating-point elements, and store the results in dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// m := j*32
+// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
+FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
+{
+ return vreinterpretq_m128_f32(
+ vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
+}
+
+// Convert packed 32-bit integers in b to packed single-precision (32-bit)
+// floating-point elements, store the results in the lower 2 elements of dst,
+// and copy the upper 2 packed elements from a to the upper elements of dst.
+//
+// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
+// dst[95:64] := a[95:64]
+// dst[127:96] := a[127:96]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
+FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
+{
+ return vreinterpretq_m128_f32(
+ vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+ vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert packed signed 32-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, then covert the packed signed 32-bit integers in b to
+// single-precision (32-bit) floating-point element, and store the results in
+// the upper 2 elements of dst.
+//
+// dst[31:0] := Convert_Int32_To_FP32(a[31:0])
+// dst[63:32] := Convert_Int32_To_FP32(a[63:32])
+// dst[95:64] := Convert_Int32_To_FP32(b[31:0])
+// dst[127:96] := Convert_Int32_To_FP32(b[63:32])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
+FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
+{
+ return vreinterpretq_m128_f32(vcvtq_f32_s32(
+ vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
+}
+
+// Convert the lower packed 8-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+//
+// FOR j := 0 to 3
+// i := j*8
+// m := j*32
+// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
+FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
+{
+ return vreinterpretq_m128_f32(vcvtq_f32_s32(
+ vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
+}
+
+// Convert packed unsigned 16-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// m := j*32
+// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
+FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
+{
+ return vreinterpretq_m128_f32(
+ vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
+}
+
+// Convert the lower packed unsigned 8-bit integers in a to packed
+// single-precision (32-bit) floating-point elements, and store the results in
+// dst.
+//
+// FOR j := 0 to 3
+// i := j*8
+// m := j*32
+// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
+FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
+{
+ return vreinterpretq_m128_f32(vcvtq_f32_u32(
+ vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
+}
+
+// Converts the four single-precision, floating-point values of a to signed
+// 32-bit integer values using truncate.
+// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
+{
+ return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+//
+// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
+FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
+{
+#if defined(__aarch64__)
+ return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
+#else
+ double ret = *((double *) &a);
+ return (int64_t) ret;
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+//
+// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
+#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
+
+// Converts the four signed 32-bit integer values of a to single-precision,
+// floating-point values
+// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
+{
+ return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
+}
+
+// Converts the four unsigned 8-bit integers in the lower 16 bits to four
+// unsigned 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
+{
+ uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
+ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+ return vreinterpretq_m128i_u16(u16x8);
+}
+
+// Converts the four unsigned 8-bit integers in the lower 32 bits to four
+// unsigned 32-bit integers.
+// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
+FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
+{
+ uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
+ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
+ return vreinterpretq_m128i_u32(u32x4);
+}
+
+// Converts the two unsigned 8-bit integers in the lower 16 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
+{
+ uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
+ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
+ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+ uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+ return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Converts the four unsigned 8-bit integers in the lower 16 bits to four
+// unsigned 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
+{
+ int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
+ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+ return vreinterpretq_m128i_s16(s16x8);
+}
+
+// Converts the four unsigned 8-bit integers in the lower 32 bits to four
+// unsigned 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
+{
+ int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
+ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
+ return vreinterpretq_m128i_s32(s32x4);
+}
+
+// Converts the two signed 8-bit integers in the lower 32 bits to four
+// signed 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
+{
+ int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
+ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
+ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+ int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+ return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Converts the four signed 16-bit integers in the lower 64 bits to four signed
+// 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
+{
+ return vreinterpretq_m128i_s32(
+ vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
+}
+
+// Converts the two signed 16-bit integers in the lower 32 bits two signed
+// 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
+{
+ int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
+ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+ int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+ return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Converts the four unsigned 16-bit integers in the lower 64 bits to four
+// unsigned 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
+{
+ return vreinterpretq_m128i_u32(
+ vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
+}
+
+// Converts the two unsigned 16-bit integers in the lower 32 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
+{
+ uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
+ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+ uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+ return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Converts the two unsigned 32-bit integers in the lower 64 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
+{
+ return vreinterpretq_m128i_u64(
+ vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
+}
+
+// Converts the two signed 32-bit integers in the lower 64 bits to two signed
+// 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
+{
+ return vreinterpretq_m128i_s64(
+ vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
+}
+
+// Converts the four single-precision, floating-point values of a to signed
+// 32-bit integer values.
+//
+// r0 := (int) a0
+// r1 := (int) a1
+// r2 := (int) a2
+// r3 := (int) a3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
+// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
+// does not support! It is supported on ARMv8-A however.
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
+#else
+ uint32x4_t signmask = vdupq_n_u32(0x80000000);
+ float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
+ vdupq_n_f32(0.5f)); /* +/- 0.5 */
+ int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
+ vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
+ int32x4_t r_trunc =
+ vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
+ int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
+ vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
+ int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
+ vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
+ float32x4_t delta = vsubq_f32(
+ vreinterpretq_f32_m128(a),
+ vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
+ uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
+ return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 16-bit integers, and store the results in dst. Note: this intrinsic
+// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
+// 0x7FFFFFFF.
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
+FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
+{
+ return vreinterpret_m64_s16(
+ vmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
+}
+
+// Copy the lower 32-bit integer in a to dst.
+//
+// dst[31:0] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
+FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
+{
+ return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+//
+// dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
+FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
+{
+ return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+//
+// dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
+#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
+
+// Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
+// zero extending the upper bits.
+//
+// r0 := a
+// r1 := 0x0
+// r2 := 0x0
+// r3 := 0x0
+//
+// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
+{
+ return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
+}
+
+// Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
+// zero extending the upper bits.
+//
+// r0 := a
+// r1 := 0x0
+FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
+{
+ return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
+}
+
+// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
+FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
+{
+ return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
+}
+
+// Applies a type cast to reinterpret four 32-bit floating point values passed
+// in as a 128-bit parameter as packed 32-bit integers.
+// https://msdn.microsoft.com/en-us/library/bb514099.aspx
+FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
+{
+ return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
+}
+
+// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
+FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
+#else
+ return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
+#endif
+}
+
+// Applies a type cast to reinterpret four 32-bit integers passed in as a
+// 128-bit parameter as packed 32-bit floating point values.
+// https://msdn.microsoft.com/en-us/library/bb514029.aspx
+FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
+{
+ return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
+}
+
+// Loads 128-bit value. :
+// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
+FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
+{
+ return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
+FORCE_INLINE __m128d _mm_load1_pd(const double *p)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
+#else
+ return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
+#endif
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
+#define _mm_load_pd1 _mm_load1_pd
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// upper element of dst, and copy the lower element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+//
+// dst[63:0] := a[63:0]
+// dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
+FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
+#else
+ return vreinterpretq_m128d_f32(vcombine_f32(
+ vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
+#endif
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
+#define _mm_load_pd1 _mm_load1_pd
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
+#define _mm_loaddup_pd _mm_load1_pd
+
+// Loads 128-bit value. :
+// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
+{
+ return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load unaligned 32-bit integer from memory into the first element of dst.
+//
+// dst[31:0] := MEM[mem_addr+31:mem_addr]
+// dst[MAX:32] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
+FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
+{
+ return vreinterpretq_m128i_s32(
+ vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed single-precision (32-bit) floating-point elements, and store the
+// results in dst.
+//
+// FOR j := 0 to 1
+// i := 32*j
+// k := 64*j
+// dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
+// ENDFOR
+// dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
+FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
+{
+#if defined(__aarch64__)
+ float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
+ return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
+#else
+ float a0 = (float) ((double *) &a)[0];
+ float a1 = (float) ((double *) &a)[1];
+ return _mm_set_ps(0, 0, a1, a0);
+#endif
+}
+
+// Copy the lower double-precision (64-bit) floating-point element of a to dst.
+//
+// dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
+FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
+{
+#if defined(__aarch64__)
+ return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
+#else
+ return ((double *) &a)[0];
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed double-precision (64-bit) floating-point elements, and store the
+// results in dst.
+//
+// FOR j := 0 to 1
+// i := 64*j
+// k := 32*j
+// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
+FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
+#else
+ double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+ double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+ return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
+FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
+{
+ return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Cast vector of type __m128d to type __m128. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
+FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
+{
+ return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
+FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
+{
+ // Use a signed shift right to create a mask with the sign bit
+ uint32x4_t mask =
+ vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
+ float32x4_t a = vreinterpretq_f32_m128(_a);
+ float32x4_t b = vreinterpretq_f32_m128(_b);
+ return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
+}
+
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps
+FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
+{
+ const uint32_t ALIGN_STRUCT(16)
+ data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
+ ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
+ ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
+ ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
+ uint32x4_t mask = vld1q_u32(data);
+ float32x4_t a = vreinterpretq_f32_m128(_a);
+ float32x4_t b = vreinterpretq_f32_m128(_b);
+ return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
+}
+
+// Blend packed double-precision (64-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd
+FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
+{
+ uint64x2_t mask =
+ vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
+#if defined(__aarch64__)
+ float64x2_t a = vreinterpretq_f64_m128d(_a);
+ float64x2_t b = vreinterpretq_f64_m128d(_b);
+ return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
+#else
+ uint64x2_t a = vreinterpretq_u64_m128d(_a);
+ uint64x2_t b = vreinterpretq_u64_m128d(_b);
+ return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
+#endif
+}
+
+typedef struct {
+ uint16_t res0;
+ uint8_t res1 : 6;
+ uint8_t bit22 : 1;
+ uint8_t bit23 : 1;
+ uint8_t res2;
+#if defined(__aarch64__)
+ uint32_t res3;
+#endif
+} fpcr_bitfield;
+
+// Macro: Set the rounding mode bits of the MXCSR control and status register to
+// the value in unsigned 32-bit integer a. The rounding mode may contain any of
+// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
+// _MM_ROUND_TOWARD_ZERO
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
+FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
+{
+ union {
+ fpcr_bitfield field;
+#if defined(__aarch64__)
+ uint64_t value;
+#else
+ uint32_t value;
+#endif
+ } r;
+
+#if defined(__aarch64__)
+ asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+ asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+ switch (rounding) {
+ case _MM_ROUND_TOWARD_ZERO:
+ r.field.bit22 = 1;
+ r.field.bit23 = 1;
+ break;
+ case _MM_ROUND_DOWN:
+ r.field.bit22 = 0;
+ r.field.bit23 = 1;
+ break;
+ case _MM_ROUND_UP:
+ r.field.bit22 = 1;
+ r.field.bit23 = 0;
+ break;
+ default: //_MM_ROUND_NEAREST
+ r.field.bit22 = 0;
+ r.field.bit23 = 0;
+ }
+
+#if defined(__aarch64__)
+ asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
+#else
+ asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */
+#endif
+}
+
+FORCE_INLINE void _mm_setcsr(unsigned int a)
+{
+ _MM_SET_ROUNDING_MODE(a);
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a using
+// the rounding parameter, and store the results as packed single-precision
+// floating-point elements in dst.
+// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
+FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
+{
+#if defined(__aarch64__)
+ switch (rounding) {
+ case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+ return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
+ case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+ return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
+ case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+ return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
+ case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+ return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
+ default: //_MM_FROUND_CUR_DIRECTION
+ return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
+ }
+#else
+ float *v_float = (float *) &a;
+ __m128 zero, neg_inf, pos_inf;
+
+ switch (rounding) {
+ case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+ return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
+ case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+ return (__m128){floorf(v_float[0]), floorf(v_float[1]),
+ floorf(v_float[2]), floorf(v_float[3])};
+ case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+ return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]),
+ ceilf(v_float[3])};
+ case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+ zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
+ neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]),
+ floorf(v_float[2]), floorf(v_float[3]));
+ pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]),
+ ceilf(v_float[2]), ceilf(v_float[3]));
+ return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero));
+ default: //_MM_FROUND_CUR_DIRECTION
+ return (__m128){roundf(v_float[0]), roundf(v_float[1]),
+ roundf(v_float[2]), roundf(v_float[3])};
+ }
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := 32*j
+// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
+FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
+{
+#if defined(__aarch64__)
+ return vreinterpret_m64_s32(
+ vget_low_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))));
+#else
+ return vreinterpret_m64_s32(
+ vcvt_s32_f32(vget_low_f32(vreinterpretq_f32_m128(
+ _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)))));
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := 32*j
+// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
+#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
+
+// Round the packed single-precision (32-bit) floating-point elements in a up to
+// an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
+FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
+{
+ return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b up to
+// an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
+//
+// dst[31:0] := CEIL(b[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss
+FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(
+ a, _mm_round_ps(b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a down
+// to an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
+FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
+{
+ return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b down to
+// an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
+//
+// dst[31:0] := FLOOR(b[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
+FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(
+ a, _mm_round_ps(b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
+}
+
+// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
+// may perform better than _mm_loadu_si128 when the data crosses a cache line
+// boundary.
+//
+// dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
+#define _mm_lddqu_si128 _mm_loadu_si128
+
+/* Miscellaneous Operations */
+
+// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
+// in the sign bit.
+//
+// r0 := a0 >> count
+// r1 := a1 >> count
+// ...
+// r7 := a7 >> count
+//
+// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
+{
+ int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+ if (unlikely(c > 15))
+ return _mm_cmplt_epi16(a, _mm_setzero_si128());
+ return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
+}
+
+// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
+// in the sign bit.
+//
+// r0 := a0 >> count
+// r1 := a1 >> count
+// r2 := a2 >> count
+// r3 := a3 >> count
+//
+// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
+FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
+{
+ int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+ if (unlikely(c > 31))
+ return _mm_cmplt_epi32(a, _mm_setzero_si128());
+ return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
+}
+
+// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
+// saturates.
+// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s8(
+ vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
+ vqmovn_s16(vreinterpretq_s16_m128i(b))));
+}
+
+// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned
+// integers and saturates.
+//
+// r0 := UnsignedSaturate(a0)
+// r1 := UnsignedSaturate(a1)
+// ...
+// r7 := UnsignedSaturate(a7)
+// r8 := UnsignedSaturate(b0)
+// r9 := UnsignedSaturate(b1)
+// ...
+// r15 := UnsignedSaturate(b7)
+//
+// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
+ vqmovun_s16(vreinterpretq_s16_m128i(b))));
+}
+
+// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
+// and saturates.
+//
+// r0 := SignedSaturate(a0)
+// r1 := SignedSaturate(a1)
+// r2 := SignedSaturate(a2)
+// r3 := SignedSaturate(a3)
+// r4 := SignedSaturate(b0)
+// r5 := SignedSaturate(b1)
+// r6 := SignedSaturate(b2)
+// r7 := SignedSaturate(b3)
+//
+// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s16(
+ vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
+ vqmovn_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
+// integers and saturates.
+//
+// r0 := UnsignedSaturate(a0)
+// r1 := UnsignedSaturate(a1)
+// r2 := UnsignedSaturate(a2)
+// r3 := UnsignedSaturate(a3)
+// r4 := UnsignedSaturate(b0)
+// r5 := UnsignedSaturate(b1)
+// r6 := UnsignedSaturate(b2)
+// r7 := UnsignedSaturate(b3)
+FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u16(
+ vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
+ vqmovun_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
+// 8 signed or unsigned 8-bit integers in b.
+//
+// r0 := a0
+// r1 := b0
+// r2 := a1
+// r3 := b1
+// ...
+// r14 := a7
+// r15 := b7
+//
+// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s8(
+ vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+ int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
+ int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
+ int8x8x2_t result = vzip_s8(a1, b1);
+ return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
+// lower 4 signed or unsigned 16-bit integers in b.
+//
+// r0 := a0
+// r1 := b0
+// r2 := a1
+// r3 := b1
+// r4 := a2
+// r5 := b2
+// r6 := a3
+// r7 := b3
+//
+// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s16(
+ vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+ int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
+ int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
+ int16x4x2_t result = vzip_s16(a1, b1);
+ return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the
+// lower 2 signed or unsigned 32 - bit integers in b.
+//
+// r0 := a0
+// r1 := b0
+// r2 := a1
+// r3 := b1
+//
+// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s32(
+ vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+ int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
+ int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
+ int32x2x2_t result = vzip_s32(a1, b1);
+ return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
+}
+
+FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
+{
+ int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
+ int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
+ return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
+}
+
+// Selects and interleaves the lower two single-precision, floating-point values
+// from a and b.
+//
+// r0 := a0
+// r1 := b0
+// r2 := a1
+// r3 := b1
+//
+// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128_f32(
+ vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+ float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
+ float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
+ float32x2x2_t result = vzip_f32(a1, b1);
+ return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave double-precision (64-bit) floating-point elements from
+// the low half of a and b, and store the results in dst.
+//
+// DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+// dst[63:0] := src1[63:0]
+// dst[127:64] := src2[63:0]
+// RETURN dst[127:0]
+// }
+// dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd
+FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ return vreinterpretq_m128d_s64(
+ vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
+ vget_low_s64(vreinterpretq_s64_m128d(b))));
+#endif
+}
+
+// Unpack and interleave double-precision (64-bit) floating-point elements from
+// the high half of a and b, and store the results in dst.
+//
+// DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+// dst[63:0] := src1[127:64]
+// dst[127:64] := src2[127:64]
+// RETURN dst[127:0]
+// }
+// dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd
+FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ return vreinterpretq_m128d_s64(
+ vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
+ vget_high_s64(vreinterpretq_s64_m128d(b))));
+#endif
+}
+
+// Selects and interleaves the upper two single-precision, floating-point values
+// from a and b.
+//
+// r0 := a2
+// r1 := b2
+// r2 := a3
+// r3 := b3
+//
+// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128_f32(
+ vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+ float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
+ float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
+ float32x2x2_t result = vzip_f32(a1, b1);
+ return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
+// 8 signed or unsigned 8-bit integers in b.
+//
+// r0 := a8
+// r1 := b8
+// r2 := a9
+// r3 := b9
+// ...
+// r14 := a15
+// r15 := b15
+//
+// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s8(
+ vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+ int8x8_t a1 =
+ vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
+ int8x8_t b1 =
+ vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
+ int8x8x2_t result = vzip_s8(a1, b1);
+ return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
+// upper 4 signed or unsigned 16-bit integers in b.
+//
+// r0 := a4
+// r1 := b4
+// r2 := a5
+// r3 := b5
+// r4 := a6
+// r5 := b6
+// r6 := a7
+// r7 := b7
+//
+// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s16(
+ vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+ int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
+ int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
+ int16x4x2_t result = vzip_s16(a1, b1);
+ return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
+// upper 2 signed or unsigned 32-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s32(
+ vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+ int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
+ int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
+ int32x2x2_t result = vzip_s32(a1, b1);
+ return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the upper signed or unsigned 64-bit integer in a with the
+// upper signed or unsigned 64-bit integer in b.
+//
+// r0 := a1
+// r1 := b1
+FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
+{
+ int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
+ int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
+ return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
+}
+
+// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
+// in a, store the minimum and index in dst, and zero the remaining bits in dst.
+//
+// index[2:0] := 0
+// min[15:0] := a[15:0]
+// FOR j := 0 to 7
+// i := j*16
+// IF a[i+15:i] < min[15:0]
+// index[2:0] := j
+// min[15:0] := a[i+15:i]
+// FI
+// ENDFOR
+// dst[15:0] := min[15:0]
+// dst[18:16] := index[2:0]
+// dst[127:19] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
+FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
+{
+ __m128i dst;
+ uint16_t min, idx = 0;
+ // Find the minimum value
+#if defined(__aarch64__)
+ min = vminvq_u16(vreinterpretq_u16_m128i(a));
+#else
+ __m64 tmp;
+ tmp = vreinterpret_m64_u16(
+ vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
+ vget_high_u16(vreinterpretq_u16_m128i(a))));
+ tmp = vreinterpret_m64_u16(
+ vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+ tmp = vreinterpret_m64_u16(
+ vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+ min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
+#endif
+ // Get the index of the minimum value
+ int i;
+ for (i = 0; i < 8; i++) {
+ if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
+ idx = (uint16_t) i;
+ break;
+ }
+ a = _mm_srli_si128(a, 2);
+ }
+ // Generate result
+ dst = _mm_setzero_si128();
+ dst = vreinterpretq_m128i_u16(
+ vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
+ dst = vreinterpretq_m128i_u16(
+ vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
+ return dst;
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return the CF value.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
+FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
+{
+ int64x2_t s64 =
+ vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
+ vreinterpretq_s64_m128i(b));
+ return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return the ZF value.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
+FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
+{
+ int64x2_t s64 =
+ vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
+ return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+}
+
+// Extracts the selected signed or unsigned 8-bit integer from a and zero
+// extends.
+// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
+#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
+
+// Inserts the least significant 8 bits of b into the selected 8-bit integer
+// of a.
+// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
+// __constrange(0,16) int imm)
+#define _mm_insert_epi8(a, b, imm) \
+ __extension__({ \
+ vreinterpretq_m128i_s8( \
+ vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
+ })
+
+// Extracts the selected signed or unsigned 16-bit integer from a and zero
+// extends.
+// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
+// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
+#define _mm_extract_epi16(a, imm) \
+ vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
+
+// Inserts the least significant 16 bits of b into the selected 16-bit integer
+// of a.
+// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
+// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
+// __constrange(0,8) int imm)
+#define _mm_insert_epi16(a, b, imm) \
+ __extension__({ \
+ vreinterpretq_m128i_s16( \
+ vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
+ })
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
+#define _mm_insert_pi16(a, b, imm) \
+ __extension__({ \
+ vreinterpret_m64_s16( \
+ vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
+ })
+
+// Extracts the selected signed or unsigned 32-bit integer from a and zero
+// extends.
+// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
+#define _mm_extract_epi32(a, imm) \
+ vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
+
+// Extracts the selected single-precision (32-bit) floating-point from a.
+// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
+#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
+
+// Inserts the least significant 32 bits of b into the selected 32-bit integer
+// of a.
+// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
+// __constrange(0,4) int imm)
+#define _mm_insert_epi32(a, b, imm) \
+ __extension__({ \
+ vreinterpretq_m128i_s32( \
+ vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
+ })
+
+// Extracts the selected signed or unsigned 64-bit integer from a and zero
+// extends.
+// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
+#define _mm_extract_epi64(a, imm) \
+ vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
+
+// Inserts the least significant 64 bits of b into the selected 64-bit integer
+// of a.
+// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
+// __constrange(0,2) int imm)
+#define _mm_insert_epi64(a, b, imm) \
+ __extension__({ \
+ vreinterpretq_m128i_s64( \
+ vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
+ })
+
+// Count the number of bits set to 1 in unsigned 32-bit integer a, and
+// return that count in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
+FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
+{
+#if defined(__aarch64__)
+#if __has_builtin(__builtin_popcount)
+ return __builtin_popcount(a);
+#else
+ return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
+#endif
+#else
+ uint32_t count = 0;
+ uint8x8_t input_val, count8x8_val;
+ uint16x4_t count16x4_val;
+ uint32x2_t count32x2_val;
+
+ input_val = vld1_u8((uint8_t *) &a);
+ count8x8_val = vcnt_u8(input_val);
+ count16x4_val = vpaddl_u8(count8x8_val);
+ count32x2_val = vpaddl_u16(count16x4_val);
+
+ vst1_u32(&count, count32x2_val);
+ return count;
+#endif
+}
+
+// Count the number of bits set to 1 in unsigned 64-bit integer a, and
+// return that count in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
+FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
+{
+#if defined(__aarch64__)
+#if __has_builtin(__builtin_popcountll)
+ return __builtin_popcountll(a);
+#else
+ return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
+#endif
+#else
+ uint64_t count = 0;
+ uint8x8_t input_val, count8x8_val;
+ uint16x4_t count16x4_val;
+ uint32x2_t count32x2_val;
+ uint64x1_t count64x1_val;
+
+ input_val = vld1_u8((uint8_t *) &a);
+ count8x8_val = vcnt_u8(input_val);
+ count16x4_val = vpaddl_u8(count8x8_val);
+ count32x2_val = vpaddl_u16(count16x4_val);
+ count64x1_val = vpaddl_u32(count32x2_val);
+ vst1_u64(&count, count64x1_val);
+ return count;
+#endif
+}
+
+// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
+// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
+// transposed matrix in these vectors (row0 now contains column 0, etc.).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
+ do { \
+ float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
+ float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
+ row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
+ vget_low_f32(ROW23.val[0])); \
+ row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
+ vget_low_f32(ROW23.val[1])); \
+ row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
+ vget_high_f32(ROW23.val[0])); \
+ row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
+ vget_high_f32(ROW23.val[1])); \
+ } while (0)
+
+/* Crypto Extensions */
+
+#if defined(__ARM_FEATURE_CRYPTO)
+// Wraps vmull_p64
+FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+ poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
+ poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
+ return vreinterpretq_u64_p128(vmull_p64(a, b));
+}
+#else // ARMv7 polyfill
+// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
+//
+// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
+// 64-bit->128-bit polynomial multiply.
+//
+// It needs some work and is somewhat slow, but it is still faster than all
+// known scalar methods.
+//
+// Algorithm adapted to C from
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
+// from "Fast Software Polynomial Multiplication on ARM Processors Using the
+// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
+// (https://hal.inria.fr/hal-01506572)
+static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+ poly8x8_t a = vreinterpret_p8_u64(_a);
+ poly8x8_t b = vreinterpret_p8_u64(_b);
+
+ // Masks
+ uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
+ vcreate_u8(0x00000000ffffffff));
+ uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
+ vcreate_u8(0x0000000000000000));
+
+ // Do the multiplies, rotating with vext to get all combinations
+ uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
+ uint8x16_t e =
+ vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
+ uint8x16_t f =
+ vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
+ uint8x16_t g =
+ vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
+ uint8x16_t h =
+ vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
+ uint8x16_t i =
+ vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
+ uint8x16_t j =
+ vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
+ uint8x16_t k =
+ vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4
+
+ // Add cross products
+ uint8x16_t l = veorq_u8(e, f); // L = E + F
+ uint8x16_t m = veorq_u8(g, h); // M = G + H
+ uint8x16_t n = veorq_u8(i, j); // N = I + J
+
+ // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
+ // instructions.
+#if defined(__aarch64__)
+ uint8x16_t lm_p0 = vreinterpretq_u8_u64(
+ vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+ uint8x16_t lm_p1 = vreinterpretq_u8_u64(
+ vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+ uint8x16_t nk_p0 = vreinterpretq_u8_u64(
+ vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+ uint8x16_t nk_p1 = vreinterpretq_u8_u64(
+ vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+#else
+ uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
+ uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
+ uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
+ uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
+#endif
+ // t0 = (L) (P0 + P1) << 8
+ // t1 = (M) (P2 + P3) << 16
+ uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
+ uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
+ uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
+
+ // t2 = (N) (P4 + P5) << 24
+ // t3 = (K) (P6 + P7) << 32
+ uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
+ uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
+ uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
+
+ // De-interleave
+#if defined(__aarch64__)
+ uint8x16_t t0 = vreinterpretq_u8_u64(
+ vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+ uint8x16_t t1 = vreinterpretq_u8_u64(
+ vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+ uint8x16_t t2 = vreinterpretq_u8_u64(
+ vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+ uint8x16_t t3 = vreinterpretq_u8_u64(
+ vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+#else
+ uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
+ uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
+ uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
+ uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
+#endif
+ // Shift the cross products
+ uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
+ uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
+ uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
+ uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
+
+ // Accumulate the products
+ uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
+ uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
+ uint8x16_t mix = veorq_u8(d, cross1);
+ uint8x16_t r = veorq_u8(mix, cross2);
+ return vreinterpretq_u64_u8(r);
+}
+#endif // ARMv7 polyfill
+
+// Perform a carry-less multiplication of two 64-bit integers, selected from a
+// and b according to imm8, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
+FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
+{
+ uint64x2_t a = vreinterpretq_u64_m128i(_a);
+ uint64x2_t b = vreinterpretq_u64_m128i(_b);
+ switch (imm & 0x11) {
+ case 0x00:
+ return vreinterpretq_m128i_u64(
+ _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
+ case 0x01:
+ return vreinterpretq_m128i_u64(
+ _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
+ case 0x10:
+ return vreinterpretq_m128i_u64(
+ _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
+ case 0x11:
+ return vreinterpretq_m128i_u64(
+ _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
+ default:
+ abort();
+ }
+}
+
+#if !defined(__ARM_FEATURE_CRYPTO)
+/* clang-format off */
+#define SSE2NEON_AES_DATA(w) \
+ { \
+ w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
+ w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
+ w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
+ w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
+ w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
+ w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
+ w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
+ w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
+ w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
+ w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
+ w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
+ w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
+ w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
+ w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
+ w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
+ w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
+ w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
+ w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
+ w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
+ w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
+ w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
+ w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
+ w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
+ w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
+ w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
+ w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
+ w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
+ w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
+ w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
+ w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
+ w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
+ w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
+ w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
+ w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
+ w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
+ w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
+ w(0xb0), w(0x54), w(0xbb), w(0x16) \
+ }
+/* clang-format on */
+
+/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
+#define SSE2NEON_AES_H0(x) (x)
+static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
+#undef SSE2NEON_AES_H0
+
+// In the absence of crypto extensions, implement aesenc using regular neon
+// intrinsics instead. See:
+// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
+// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
+// for more information Reproduced with permission of the author.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+ static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
+ 0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
+ 0xc, 0x1, 0x6, 0xb};
+ static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+ 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
+
+ uint8x16_t v;
+ uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
+
+ // shift rows
+ w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
+
+ // sub bytes
+ v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
+
+ // mix columns
+ w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
+ w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+ w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+
+ // add round key
+ return vreinterpretq_m128i_u8(w) ^ RoundKey;
+
+#else /* ARMv7-A NEON implementation */
+#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
+ (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
+ (b0))
+#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
+#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
+#define SSE2NEON_AES_U0(p) \
+ SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
+#define SSE2NEON_AES_U1(p) \
+ SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
+#define SSE2NEON_AES_U2(p) \
+ SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
+#define SSE2NEON_AES_U3(p) \
+ SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
+ static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
+ SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
+ SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
+ SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
+ SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
+ };
+#undef SSE2NEON_AES_B2W
+#undef SSE2NEON_AES_F2
+#undef SSE2NEON_AES_F3
+#undef SSE2NEON_AES_U0
+#undef SSE2NEON_AES_U1
+#undef SSE2NEON_AES_U2
+#undef SSE2NEON_AES_U3
+
+ uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
+ uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
+ uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
+ uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
+
+ __m128i out = _mm_set_epi32(
+ (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
+ aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
+ (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
+ aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
+ (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
+ aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
+ (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
+ aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
+
+ return _mm_xor_si128(out, RoundKey);
+#endif
+}
+
+// Perform the last round of an AES encryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+ /* FIXME: optimized for NEON */
+ uint8_t v[4][4] = {
+ [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
+ [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
+ [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
+ [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
+ };
+ for (int i = 0; i < 16; i++)
+ vreinterpretq_nth_u8_m128i(a, i) =
+ v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
+ return a;
+}
+
+// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
+// This instruction generates a round key for AES encryption. See
+// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
+// for details.
+//
+// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
+{
+ uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
+ uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
+ for (int i = 0; i < 4; ++i) {
+ ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
+ ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
+ }
+ return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
+ ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
+}
+#undef SSE2NEON_AES_DATA
+
+#else /* __ARM_FEATURE_CRYPTO */
+// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
+// AESMC and then manually applying the real key as an xor operation. This
+// unfortunately means an additional xor op; the compiler should be able to
+// optimize this away for repeated calls however. See
+// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
+// for more details.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
+ vreinterpretq_u8_m128i(b));
+}
+
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+ return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
+ vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
+ RoundKey);
+}
+
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
+{
+ // AESE does ShiftRows and SubBytes on A
+ uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
+
+ uint8x16_t dest = {
+ // Undo ShiftRows step from AESE and extract X1 and X3
+ u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
+ u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
+ u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
+ u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
+ };
+ uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
+ return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
+}
+#endif
+
+/* Streaming Extensions */
+
+// Guarantees that every preceding store is globally visible before any
+// subsequent store.
+// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_sfence(void)
+{
+ __sync_synchronize();
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
+// point elements) from a into memory using a non-temporal memory hint.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
+FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+ __builtin_nontemporal_store(a, (float32x4_t *) p);
+#else
+ vst1q_f32(p, vreinterpretq_f32_m128(a));
+#endif
+}
+
+// Stores the data in a to the address p without polluting the caches. If the
+// cache line containing address p is already in the cache, the cache will be
+// updated.
+// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+ __builtin_nontemporal_store(a, p);
+#else
+ vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
+#endif
+}
+
+// Load 128-bits of integer data from memory into dst using a non-temporal
+// memory hint. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+// dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
+FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+ return __builtin_nontemporal_load(p);
+#else
+ return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
+#endif
+}
+
+// Cache line containing p is flushed and invalidated from all caches in the
+// coherency domain. :
+// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
+FORCE_INLINE void _mm_clflush(void const *p)
+{
+ (void) p;
+ // no corollary for Neon?
+}
+
+// Allocate aligned blocks of memory.
+// https://software.intel.com/en-us/
+// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
+FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
+{
+ void *ptr;
+ if (align == 1)
+ return malloc(size);
+ if (align == 2 || (sizeof(void *) == 8 && align == 4))
+ align = sizeof(void *);
+ if (!posix_memalign(&ptr, align, size))
+ return ptr;
+ return NULL;
+}
+
+// Free aligned memory that was allocated with _mm_malloc.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free
+FORCE_INLINE void _mm_free(void *addr)
+{
+ free(addr);
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 8-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+ __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
+ : [c] "+r"(crc)
+ : [v] "r"(v));
+#else
+ crc ^= v;
+ for (int bit = 0; bit < 8; bit++) {
+ if (crc & 1)
+ crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
+ else
+ crc = (crc >> 1);
+ }
+#endif
+ return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 16-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
+FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+ __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
+ : [c] "+r"(crc)
+ : [v] "r"(v));
+#else
+ crc = _mm_crc32_u8(crc, v & 0xff);
+ crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
+#endif
+ return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 32-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
+FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+ __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
+ : [c] "+r"(crc)
+ : [v] "r"(v));
+#else
+ crc = _mm_crc32_u16(crc, v & 0xffff);
+ crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
+#endif
+ return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 64-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
+FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+ __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
+ : [c] "+r"(crc)
+ : [v] "r"(v));
+#else
+ crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
+ crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
+#endif
+ return crc;
+}
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma pop_macro("ALIGN_STRUCT")
+#pragma pop_macro("FORCE_INLINE")
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC pop_options
+#endif
+
+#endif
diff --git a/thirdparty/embree/common/simd/avx.h b/thirdparty/embree/common/simd/avx.h
new file mode 100644
index 0000000000..d3100306ee
--- /dev/null
+++ b/thirdparty/embree/common/simd/avx.h
@@ -0,0 +1,34 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "sse.h"
+
+#if defined(__AVX512VL__)
+#include "vboolf8_avx512.h"
+#include "vboold4_avx512.h"
+#else
+#include "vboolf8_avx.h"
+#include "vboold4_avx.h"
+#endif
+
+#if defined(__AVX2__)
+#include "vint8_avx2.h"
+#include "vuint8_avx2.h"
+#if defined(__X86_64__)
+#include "vllong4_avx2.h"
+#endif
+#else
+#include "vint8_avx.h"
+#include "vuint8_avx.h"
+#endif
+#include "vfloat8_avx.h"
+#if defined(__X86_64__)
+#include "vdouble4_avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "avx512.h"
+#endif
+
diff --git a/thirdparty/embree/common/simd/avx512.h b/thirdparty/embree/common/simd/avx512.h
new file mode 100644
index 0000000000..d43bbacea1
--- /dev/null
+++ b/thirdparty/embree/common/simd/avx512.h
@@ -0,0 +1,41 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/intrinsics.h"
+#include "../math/constants.h"
+#include "../sys/alloc.h"
+#include "varying.h"
+
+#include "vboolf16_avx512.h"
+#include "vint16_avx512.h"
+#include "vuint16_avx512.h"
+#include "vfloat16_avx512.h"
+
+#include "vboold8_avx512.h"
+#include "vllong8_avx512.h"
+#include "vdouble8_avx512.h"
+
+namespace embree
+{
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Prefetching
+ ////////////////////////////////////////////////////////////////////////////////
+
+#define PFHINT_L1 0
+#define PFHINT_L2 1
+#define PFHINT_NT 2
+
+ template<const unsigned int mode>
+ __forceinline void prefetch(const void * __restrict__ const m)
+ {
+ if (mode == PFHINT_L1)
+ _mm_prefetch((const char*)m,_MM_HINT_T0);
+ else if (mode == PFHINT_L2)
+ _mm_prefetch((const char*)m,_MM_HINT_T1);
+ else if (mode == PFHINT_NT)
+ _mm_prefetch((const char*)m,_MM_HINT_NTA);
+ }
+}
diff --git a/thirdparty/embree/common/simd/simd.h b/thirdparty/embree/common/simd/simd.h
new file mode 100644
index 0000000000..195506b530
--- /dev/null
+++ b/thirdparty/embree/common/simd/simd.h
@@ -0,0 +1,110 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../math/math.h"
+
+/* include SSE wrapper classes */
+#if defined(__SSE__)
+# include "sse.h"
+#endif
+
+/* include AVX wrapper classes */
+#if defined(__AVX__)
+# include "avx.h"
+#endif
+
+/* include AVX512 wrapper classes */
+#if defined (__AVX512F__)
+# include "avx512.h"
+#endif
+
+namespace embree
+{
+ template <int N>
+ __forceinline vbool<N> isfinite(const vfloat<N>& v)
+ {
+ return (v >= vfloat<N>(-std::numeric_limits<float>::max()))
+ & (v <= vfloat<N>( std::numeric_limits<float>::max()));
+ }
+
+ /* foreach unique */
+ template<typename vbool, typename vint, typename Closure>
+ __forceinline void foreach_unique(const vbool& valid0, const vint& vi, const Closure& closure)
+ {
+ vbool valid1 = valid0;
+ while (any(valid1)) {
+ const int j = int(bsf(movemask(valid1)));
+ const int i = vi[j];
+ const vbool valid2 = valid1 & (i == vi);
+ valid1 = andn(valid1, valid2);
+ closure(valid2, i);
+ }
+ }
+
+ /* returns the next unique value i in vi and the corresponding valid_i mask */
+ template<typename vbool, typename vint>
+ __forceinline int next_unique(vbool& valid, const vint& vi, /*out*/ vbool& valid_i)
+ {
+ assert(any(valid));
+ const int j = int(bsf(movemask(valid)));
+ const int i = vi[j];
+ valid_i = valid & (i == vi);
+ valid = andn(valid, valid_i);
+ return i;
+ }
+
+ /* foreach unique index */
+ template<typename vbool, typename vint, typename Closure>
+ __forceinline void foreach_unique_index(const vbool& valid0, const vint& vi, const Closure& closure)
+ {
+ vbool valid1 = valid0;
+ while (any(valid1)) {
+ const int j = int(bsf(movemask(valid1)));
+ const int i = vi[j];
+ const vbool valid2 = valid1 & (i == vi);
+ valid1 = andn(valid1, valid2);
+ closure(valid2, i, j);
+ }
+ }
+
+ /* returns the index of the next unique value i in vi and the corresponding valid_i mask */
+ template<typename vbool, typename vint>
+ __forceinline int next_unique_index(vbool& valid, const vint& vi, /*out*/ vbool& valid_i)
+ {
+ assert(any(valid));
+ const int j = int(bsf(movemask(valid)));
+ const int i = vi[j];
+ valid_i = valid & (i == vi);
+ valid = andn(valid, valid_i);
+ return j;
+ }
+
+ template<typename Closure>
+ __forceinline void foreach2(int x0, int x1, int y0, int y1, const Closure& closure)
+ {
+ __aligned(64) int U[2*VSIZEX];
+ __aligned(64) int V[2*VSIZEX];
+ int index = 0;
+ for (int y=y0; y<y1; y++) {
+ const bool lasty = y+1>=y1;
+ const vintx vy = y;
+ for (int x=x0; x<x1; ) { //x+=VSIZEX) {
+ const bool lastx = x+VSIZEX >= x1;
+ vintx vx = x+vintx(step);
+ vintx::storeu(&U[index], vx);
+ vintx::storeu(&V[index], vy);
+ const int dx = min(x1-x,VSIZEX);
+ index += dx;
+ x += dx;
+ if (index >= VSIZEX || (lastx && lasty)) {
+ const vboolx valid = vintx(step) < vintx(index);
+ closure(valid, vintx::load(U), vintx::load(V));
+ x-= max(0, index-VSIZEX);
+ index = 0;
+ }
+ }
+ }
+ }
+}
diff --git a/thirdparty/embree/common/simd/sse.cpp b/thirdparty/embree/common/simd/sse.cpp
new file mode 100644
index 0000000000..535d6943d8
--- /dev/null
+++ b/thirdparty/embree/common/simd/sse.cpp
@@ -0,0 +1,34 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "sse.h"
+
+namespace embree
+{
+ const __m128 mm_lookupmask_ps[16] = {
+ _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0, 0)),
+ _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0,-1)),
+ _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1, 0)),
+ _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1,-1)),
+ _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0, 0)),
+ _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0,-1)),
+ _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1, 0)),
+ _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1,-1)),
+ _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)),
+ _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0,-1)),
+ _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1, 0)),
+ _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1,-1)),
+ _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0, 0)),
+ _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0,-1)),
+ _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1, 0)),
+ _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1,-1))
+ };
+
+ const __m128d mm_lookupmask_pd[4] = {
+ _mm_castsi128_pd(_mm_set_epi32( 0, 0, 0, 0)),
+ _mm_castsi128_pd(_mm_set_epi32( 0, 0,-1,-1)),
+ _mm_castsi128_pd(_mm_set_epi32(-1,-1, 0, 0)),
+ _mm_castsi128_pd(_mm_set_epi32(-1,-1,-1,-1))
+ };
+
+}
diff --git a/thirdparty/embree/common/simd/sse.h b/thirdparty/embree/common/simd/sse.h
new file mode 100644
index 0000000000..1465fb4fb0
--- /dev/null
+++ b/thirdparty/embree/common/simd/sse.h
@@ -0,0 +1,35 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/intrinsics.h"
+#include "../sys/alloc.h"
+#include "../math/constants.h"
+#include "varying.h"
+
+namespace embree
+{
+#if defined(__SSE4_1__)
+ __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) {
+ return _mm_blendv_ps(f,t,mask);
+ }
+#else
+ __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) {
+ return _mm_or_ps(_mm_and_ps(mask, t), _mm_andnot_ps(mask, f));
+ }
+#endif
+
+ extern const __m128 mm_lookupmask_ps[16];
+ extern const __m128d mm_lookupmask_pd[4];
+}
+
+#if defined(__AVX512VL__)
+#include "vboolf4_avx512.h"
+#else
+#include "vboolf4_sse2.h"
+#endif
+#include "vint4_sse2.h"
+#include "vuint4_sse2.h"
+#include "vfloat4_sse2.h"
diff --git a/thirdparty/embree/common/simd/varying.h b/thirdparty/embree/common/simd/varying.h
new file mode 100644
index 0000000000..9b98d326be
--- /dev/null
+++ b/thirdparty/embree/common/simd/varying.h
@@ -0,0 +1,145 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+
+namespace embree
+{
+ /* Varying numeric types */
+ template<int N>
+ struct vfloat_impl
+ {
+ union { float f[N]; int i[N]; };
+ __forceinline const float& operator [](size_t index) const { assert(index < N); return f[index]; }
+ __forceinline float& operator [](size_t index) { assert(index < N); return f[index]; }
+ };
+
+ template<int N>
+ struct vdouble_impl
+ {
+ union { double f[N]; long long i[N]; };
+ __forceinline const double& operator [](size_t index) const { assert(index < N); return f[index]; }
+ __forceinline double& operator [](size_t index) { assert(index < N); return f[index]; }
+ };
+
+ template<int N>
+ struct vint_impl
+ {
+ int i[N];
+ __forceinline const int& operator [](size_t index) const { assert(index < N); return i[index]; }
+ __forceinline int& operator [](size_t index) { assert(index < N); return i[index]; }
+ };
+
+ template<int N>
+ struct vuint_impl
+ {
+ unsigned int i[N];
+ __forceinline const unsigned int& operator [](size_t index) const { assert(index < N); return i[index]; }
+ __forceinline unsigned int& operator [](size_t index) { assert(index < N); return i[index]; }
+ };
+
+ template<int N>
+ struct vllong_impl
+ {
+ long long i[N];
+ __forceinline const long long& operator [](size_t index) const { assert(index < N); return i[index]; }
+ __forceinline long long& operator [](size_t index) { assert(index < N); return i[index]; }
+ };
+
+ /* Varying bool types */
+ template<int N> struct vboolf_impl { int i[N]; }; // for float/int
+ template<int N> struct vboold_impl { long long i[N]; }; // for double/long long
+
+ /* Varying size constants */
+#if defined(__AVX512VL__) // SKX
+ const int VSIZEX = 8; // default size
+ const int VSIZEL = 16; // large size
+#elif defined(__AVX__)
+ const int VSIZEX = 8;
+ const int VSIZEL = 8;
+#else
+ const int VSIZEX = 4;
+ const int VSIZEL = 4;
+#endif
+
+ template<int N>
+ struct vtypes {
+ using vbool = vboolf_impl<N>;
+ using vboolf = vboolf_impl<N>;
+ using vboold = vboold_impl<N>;
+ using vint = vint_impl<N>;
+ using vuint = vuint_impl<N>;
+ using vllong = vllong_impl<N>;
+ using vfloat = vfloat_impl<N>;
+ using vdouble = vdouble_impl<N>;
+ };
+
+ template<>
+ struct vtypes<1> {
+ using vbool = bool;
+ using vboolf = bool;
+ using vboold = bool;
+ using vint = int;
+ using vuint = unsigned int;
+ using vllong = long long;
+ using vfloat = float;
+ using vdouble = double;
+ };
+
+ /* Aliases to default types */
+ template<int N> using vbool = typename vtypes<N>::vbool;
+ template<int N> using vboolf = typename vtypes<N>::vboolf;
+ template<int N> using vboold = typename vtypes<N>::vboold;
+ template<int N> using vint = typename vtypes<N>::vint;
+ template<int N> using vuint = typename vtypes<N>::vuint;
+ template<int N> using vllong = typename vtypes<N>::vllong;
+ template<int N> using vreal = typename vtypes<N>::vfloat;
+ template<int N> using vfloat = typename vtypes<N>::vfloat;
+ template<int N> using vdouble = typename vtypes<N>::vdouble;
+
+ /* 4-wide shortcuts */
+ typedef vfloat<4> vfloat4;
+ typedef vdouble<4> vdouble4;
+ typedef vreal<4> vreal4;
+ typedef vint<4> vint4;
+ typedef vuint<4> vuint4;
+ typedef vllong<4> vllong4;
+ typedef vbool<4> vbool4;
+ typedef vboolf<4> vboolf4;
+ typedef vboold<4> vboold4;
+
+ /* 8-wide shortcuts */
+ typedef vfloat<8> vfloat8;
+ typedef vdouble<8> vdouble8;
+ typedef vreal<8> vreal8;
+ typedef vint<8> vint8;
+ typedef vuint<8> vuint8;
+ typedef vllong<8> vllong8;
+ typedef vbool<8> vbool8;
+ typedef vboolf<8> vboolf8;
+ typedef vboold<8> vboold8;
+
+ /* 16-wide shortcuts */
+ typedef vfloat<16> vfloat16;
+ typedef vdouble<16> vdouble16;
+ typedef vreal<16> vreal16;
+ typedef vint<16> vint16;
+ typedef vuint<16> vuint16;
+ typedef vllong<16> vllong16;
+ typedef vbool<16> vbool16;
+ typedef vboolf<16> vboolf16;
+ typedef vboold<16> vboold16;
+
+ /* Default shortcuts */
+ typedef vfloat<VSIZEX> vfloatx;
+ typedef vdouble<VSIZEX> vdoublex;
+ typedef vreal<VSIZEX> vrealx;
+ typedef vint<VSIZEX> vintx;
+ typedef vuint<VSIZEX> vuintx;
+ typedef vllong<VSIZEX> vllongx;
+ typedef vbool<VSIZEX> vboolx;
+ typedef vboolf<VSIZEX> vboolfx;
+ typedef vboold<VSIZEX> vbooldx;
+}
diff --git a/thirdparty/embree/common/simd/vboold4_avx.h b/thirdparty/embree/common/simd/vboold4_avx.h
new file mode 100644
index 0000000000..7db0d1c5c1
--- /dev/null
+++ b/thirdparty/embree/common/simd/vboold4_avx.h
@@ -0,0 +1,169 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 4-wide AVX bool type for 64bit data types*/
+ template<>
+ struct vboold<4>
+ {
+ ALIGNED_STRUCT_(32);
+
+ typedef vboold4 Bool;
+
+ enum { size = 4 }; // number of SIMD elements
+ union { // data
+ __m256d v;
+ struct { __m128d vl,vh; };
+ long long i[4];
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold() {}
+ __forceinline vboold(const vboold4& a) { v = a.v; }
+ __forceinline vboold4& operator =(const vboold4& a) { v = a.v; return *this; }
+
+ __forceinline vboold(__m256d a) : v(a) {}
+ __forceinline vboold(__m256i a) : v(_mm256_castsi256_pd(a)) {}
+
+ __forceinline operator const __m256() const { return _mm256_castpd_ps(v); }
+ __forceinline operator const __m256i() const { return _mm256_castpd_si256(v); }
+ __forceinline operator const __m256d() const { return v; }
+
+ __forceinline vboold(int a)
+ {
+ assert(a >= 0 && a <= 255);
+#if defined (__AVX2__)
+ const __m256i mask = _mm256_set_epi64x(0x8, 0x4, 0x2, 0x1);
+ const __m256i b = _mm256_set1_epi64x(a);
+ const __m256i c = _mm256_and_si256(b,mask);
+ v = _mm256_castsi256_pd(_mm256_cmpeq_epi64(c,mask));
+#else
+ vl = mm_lookupmask_pd[a & 0x3];
+ vh = mm_lookupmask_pd[a >> 2];
+#endif
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold(FalseTy) : v(_mm256_setzero_pd()) {}
+ __forceinline vboold(TrueTy) : v(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ)) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm256_movemask_pd(v) >> index) & 1; }
+ __forceinline long long& operator [](size_t index) { assert(index < 4); return i[index]; }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold4 operator !(const vboold4& a) { return _mm256_xor_pd(a, vboold4(embree::True)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm256_and_pd(a, b); }
+ __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm256_or_pd (a, b); }
+ __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); }
+
+ __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm256_andnot_pd(b, a); }
+
+ __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; }
+ __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; }
+ __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); }
+ __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(_mm256_xor_pd(a,b),vboold4(embree::True)); }
+
+ __forceinline vboold4 select(const vboold4& mask, const vboold4& t, const vboold4& f) {
+ return _mm256_blendv_pd(f, t, mask);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Movement/Shifting/Shuffling Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a, b); }
+ __forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a, b); }
+
+
+#if defined(__AVX2__)
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vboold4 shuffle(const vboold4& v) {
+ return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i3, i2, i1, i0));
+ }
+
+ template<int i>
+ __forceinline vboold4 shuffle(const vboold4& v) {
+ return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i, i, i, i));
+ }
+#endif
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reduction Operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool reduce_and(const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; }
+ __forceinline bool reduce_or (const vboold4& a) { return !_mm256_testz_pd(a,a); }
+
+ __forceinline bool all (const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; }
+ __forceinline bool any (const vboold4& a) { return !_mm256_testz_pd(a,a); }
+ __forceinline bool none(const vboold4& a) { return _mm256_testz_pd(a,a) != 0; }
+
+ __forceinline bool all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); }
+ __forceinline bool any (const vboold4& valid, const vboold4& b) { return any(valid & b); }
+ __forceinline bool none(const vboold4& valid, const vboold4& b) { return none(valid & b); }
+
+ __forceinline unsigned int movemask(const vboold4& a) { return _mm256_movemask_pd(a); }
+ __forceinline size_t popcnt (const vboold4& a) { return popcnt((size_t)_mm256_movemask_pd(a)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Get/Set Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool get(const vboold4& a, size_t index) { return a[index]; }
+ __forceinline void set (vboold4& a, size_t index) { a[index] = -1; }
+ __forceinline void clear(vboold4& a, size_t index) { a[index] = 0; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a) {
+ return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", "
+ << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vboold4_avx512.h b/thirdparty/embree/common/simd/vboold4_avx512.h
new file mode 100644
index 0000000000..ceaad7bba5
--- /dev/null
+++ b/thirdparty/embree/common/simd/vboold4_avx512.h
@@ -0,0 +1,156 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 4-wide AVX-512 bool type */
+ template<>
+ struct vboold<4>
+ {
+ typedef vboold4 Bool;
+ typedef vint4 Int;
+
+ enum { size = 4 }; // number of SIMD elements
+ __mmask8 v; // data
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold() {}
+ __forceinline vboold(const vboold4& t) { v = t.v; }
+ __forceinline vboold4& operator =(const vboold4& f) { v = f.v; return *this; }
+
+ __forceinline vboold(const __mmask8 &t) { v = t; }
+ __forceinline operator __mmask8() const { return v; }
+
+ __forceinline vboold(bool b) { v = b ? 0xf : 0x0; }
+ __forceinline vboold(int t) { v = (__mmask8)t; }
+ __forceinline vboold(unsigned int t) { v = (__mmask8)t; }
+
+ /* return int8 mask */
+ __forceinline __m128i mask8() const {
+ return _mm_movm_epi8(v);
+ }
+
+ /* return int32 mask */
+ __forceinline __m128i mask32() const {
+ return _mm_movm_epi32(v);
+ }
+
+ /* return int64 mask */
+ __forceinline __m256i mask64() const {
+ return _mm256_movm_epi64(v);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold(FalseTy) : v(0x0) {}
+ __forceinline vboold(TrueTy) : v(0xf) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool operator [](size_t index) const {
+ assert(index < 4); return (mm512_mask2int(v) >> index) & 1;
+ }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold4 operator !(const vboold4& a) { return _mm512_kandn(a, 0xf); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm512_kand(a, b); }
+ __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm512_kor(a, b); }
+ __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); }
+
+ __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm512_kandn(b, a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; }
+ __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; }
+ __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); }
+ __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); }
+
+ __forceinline vboold4 select(const vboold4& s, const vboold4& a, const vboold4& b) {
+ return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reduction Operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline int all (const vboold4& a) { return a.v == 0xf; }
+ __forceinline int any (const vboold4& a) { return _mm512_kortestz(a, a) == 0; }
+ __forceinline int none(const vboold4& a) { return _mm512_kortestz(a, a) != 0; }
+
+ __forceinline int all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); }
+ __forceinline int any (const vboold4& valid, const vboold4& b) { return any(valid & b); }
+ __forceinline int none(const vboold4& valid, const vboold4& b) { return none(valid & b); }
+
+ __forceinline size_t movemask(const vboold4& a) { return _mm512_kmov(a); }
+ __forceinline size_t popcnt (const vboold4& a) { return popcnt(a.v); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Conversion Operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline unsigned int toInt(const vboold4& a) { return mm512_mask2int(a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Get/Set Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool get(const vboold4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; }
+ __forceinline void set(vboold4& a, size_t index) { assert(index < 4); a |= 1 << index; }
+ __forceinline void clear(vboold4& a, size_t index) { assert(index < 4); a = andn(a, 1 << index); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a)
+ {
+ cout << "<";
+ for (size_t i=0; i<4; i++) {
+ if ((a.v >> i) & 1) cout << "1"; else cout << "0";
+ }
+ return cout << ">";
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vboold8_avx512.h b/thirdparty/embree/common/simd/vboold8_avx512.h
new file mode 100644
index 0000000000..66d2054872
--- /dev/null
+++ b/thirdparty/embree/common/simd/vboold8_avx512.h
@@ -0,0 +1,151 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 8-wide AVX-512 bool type */
+ template<>
+ struct vboold<8>
+ {
+ typedef vboold8 Bool;
+ typedef vint8 Int;
+
+ enum { size = 8 }; // number of SIMD elements
+ __mmask8 v; // data
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold() {}
+ __forceinline vboold(const vboold8& t) { v = t.v; }
+ __forceinline vboold8& operator =(const vboold8& f) { v = f.v; return *this; }
+
+ __forceinline vboold(const __mmask8& t) { v = t; }
+ __forceinline operator __mmask8() const { return v; }
+
+ __forceinline vboold(bool b) { v = b ? 0xff : 0x00; }
+ __forceinline vboold(int t) { v = (__mmask8)t; }
+ __forceinline vboold(unsigned int t) { v = (__mmask8)t; }
+
+ /* return int8 mask */
+ __forceinline __m128i mask8() const {
+ return _mm_movm_epi8(v);
+ }
+
+ /* return int64 mask */
+ __forceinline __m512i mask64() const {
+ return _mm512_movm_epi64(v);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold(FalseTy) : v(0x00) {}
+ __forceinline vboold(TrueTy) : v(0xff) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool operator [](size_t index) const {
+ assert(index < 8); return (mm512_mask2int(v) >> index) & 1;
+ }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold8 operator !(const vboold8& a) { return _mm512_knot(a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold8 operator &(const vboold8& a, const vboold8& b) { return _mm512_kand(a, b); }
+ __forceinline vboold8 operator |(const vboold8& a, const vboold8& b) { return _mm512_kor(a, b); }
+ __forceinline vboold8 operator ^(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); }
+
+ __forceinline vboold8 andn(const vboold8& a, const vboold8& b) { return _mm512_kandn(b, a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold8& operator &=(vboold8& a, const vboold8& b) { return a = a & b; }
+ __forceinline vboold8& operator |=(vboold8& a, const vboold8& b) { return a = a | b; }
+ __forceinline vboold8& operator ^=(vboold8& a, const vboold8& b) { return a = a ^ b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold8 operator !=(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); }
+ __forceinline vboold8 operator ==(const vboold8& a, const vboold8& b) { return _mm512_kxnor(a, b); }
+
+ __forceinline vboold8 select(const vboold8& s, const vboold8& a, const vboold8& b) {
+ return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reduction Operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline int all (const vboold8& a) { return a.v == 0xff; }
+ __forceinline int any (const vboold8& a) { return _mm512_kortestz(a, a) == 0; }
+ __forceinline int none(const vboold8& a) { return _mm512_kortestz(a, a) != 0; }
+
+ __forceinline int all (const vboold8& valid, const vboold8& b) { return all((!valid) | b); }
+ __forceinline int any (const vboold8& valid, const vboold8& b) { return any(valid & b); }
+ __forceinline int none(const vboold8& valid, const vboold8& b) { return none(valid & b); }
+
+ __forceinline size_t movemask(const vboold8& a) { return _mm512_kmov(a); }
+ __forceinline size_t popcnt (const vboold8& a) { return popcnt(a.v); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Conversion Operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline unsigned int toInt(const vboold8& a) { return mm512_mask2int(a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Get/Set Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool get(const vboold8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; }
+ __forceinline void set(vboold8& a, size_t index) { assert(index < 8); a |= 1 << index; }
+ __forceinline void clear(vboold8& a, size_t index) { assert(index < 8); a = andn(a, 1 << index); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vboold8& a)
+ {
+ cout << "<";
+ for (size_t i=0; i<8; i++) {
+ if ((a.v >> i) & 1) cout << "1"; else cout << "0";
+ }
+ return cout << ">";
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vboolf16_avx512.h b/thirdparty/embree/common/simd/vboolf16_avx512.h
new file mode 100644
index 0000000000..19841dcea8
--- /dev/null
+++ b/thirdparty/embree/common/simd/vboolf16_avx512.h
@@ -0,0 +1,153 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 16-wide AVX-512 bool type */
+ template<>
+ struct vboolf<16>
+ {
+ typedef vboolf16 Bool;
+ typedef vint16 Int;
+ typedef vfloat16 Float;
+
+ enum { size = 16 }; // number of SIMD elements
+ __mmask16 v; // data
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf() {}
+ __forceinline vboolf(const vboolf16& t) { v = t.v; }
+ __forceinline vboolf16& operator =(const vboolf16& f) { v = f.v; return *this; }
+
+ __forceinline vboolf(const __mmask16& t) { v = t; }
+ __forceinline operator __mmask16() const { return v; }
+
+ __forceinline vboolf(bool b) { v = b ? 0xFFFF : 0x0000; }
+ __forceinline vboolf(int t) { v = (__mmask16)t; }
+ __forceinline vboolf(unsigned int t) { v = (__mmask16)t; }
+
+ /* return int8 mask */
+ __forceinline __m128i mask8() const {
+ return _mm_movm_epi8(v);
+ }
+
+ /* return int32 mask */
+ __forceinline __m512i mask32() const {
+ return _mm512_movm_epi32(v);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf(FalseTy) : v(0x0000) {}
+ __forceinline vboolf(TrueTy) : v(0xffff) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool operator [](size_t index) const {
+ assert(index < 16); return (mm512_mask2int(v) >> index) & 1;
+ }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf16 operator !(const vboolf16& a) { return _mm512_knot(a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf16 operator &(const vboolf16& a, const vboolf16& b) { return _mm512_kand(a,b); }
+ __forceinline vboolf16 operator |(const vboolf16& a, const vboolf16& b) { return _mm512_kor(a,b); }
+ __forceinline vboolf16 operator ^(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a,b); }
+
+ __forceinline vboolf16 andn(const vboolf16& a, const vboolf16& b) { return _mm512_kandn(b,a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf16& operator &=(vboolf16& a, const vboolf16& b) { return a = a & b; }
+ __forceinline vboolf16& operator |=(vboolf16& a, const vboolf16& b) { return a = a | b; }
+ __forceinline vboolf16& operator ^=(vboolf16& a, const vboolf16& b) { return a = a ^ b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf16 operator !=(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a, b); }
+ __forceinline vboolf16 operator ==(const vboolf16& a, const vboolf16& b) { return _mm512_kxnor(a, b); }
+
+ __forceinline vboolf16 select(const vboolf16& s, const vboolf16& a, const vboolf16& b) {
+ return _mm512_kor(_mm512_kand(s,a),_mm512_kandn(s,b));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reduction Operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline int all (const vboolf16& a) { return _mm512_kortestc(a,a) != 0; }
+ __forceinline int any (const vboolf16& a) { return _mm512_kortestz(a,a) == 0; }
+ __forceinline int none(const vboolf16& a) { return _mm512_kortestz(a,a) != 0; }
+
+ __forceinline int all (const vboolf16& valid, const vboolf16& b) { return all((!valid) | b); }
+ __forceinline int any (const vboolf16& valid, const vboolf16& b) { return any(valid & b); }
+ __forceinline int none(const vboolf16& valid, const vboolf16& b) { return none(valid & b); }
+
+ __forceinline size_t movemask(const vboolf16& a) { return _mm512_kmov(a); }
+ __forceinline size_t popcnt (const vboolf16& a) { return popcnt(a.v); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Convertion Operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline unsigned int toInt (const vboolf16& a) { return mm512_mask2int(a); }
+ __forceinline vboolf16 toMask(const int& a) { return mm512_int2mask(a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Get/Set Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool get(const vboolf16& a, size_t index) { assert(index < 16); return (toInt(a) >> index) & 1; }
+ __forceinline void set(vboolf16& a, size_t index) { assert(index < 16); a |= 1 << index; }
+ __forceinline void clear(vboolf16& a, size_t index) { assert(index < 16); a = andn(a, 1 << index); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf16& a)
+ {
+ cout << "<";
+ for (size_t i=0; i<16; i++) {
+ if ((a.v >> i) & 1) cout << "1"; else cout << "0";
+ }
+ return cout << ">";
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vboolf4_avx512.h b/thirdparty/embree/common/simd/vboolf4_avx512.h
new file mode 100644
index 0000000000..e65f66b025
--- /dev/null
+++ b/thirdparty/embree/common/simd/vboolf4_avx512.h
@@ -0,0 +1,159 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 4-wide AVX-512 bool type */
+ template<>
+ struct vboolf<4>
+ {
+ typedef vboolf4 Bool;
+ typedef vint4 Int;
+
+ enum { size = 4 }; // number of SIMD elements
+ __mmask8 v; // data
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf() {}
+ __forceinline vboolf(const vboolf4& t) { v = t.v; }
+ __forceinline vboolf4& operator =(const vboolf4& f) { v = f.v; return *this; }
+
+ __forceinline vboolf(const __mmask8 &t) { v = t; }
+ __forceinline operator __mmask8() const { return v; }
+
+ __forceinline vboolf(bool b) { v = b ? 0xf : 0x0; }
+ __forceinline vboolf(int t) { v = (__mmask8)t; }
+ __forceinline vboolf(unsigned int t) { v = (__mmask8)t; }
+
+ __forceinline vboolf(bool a, bool b, bool c, bool d)
+ : v((__mmask8)((int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {}
+
+ /* return int8 mask */
+ __forceinline __m128i mask8() const {
+ return _mm_movm_epi8(v);
+ }
+
+ /* return int32 mask */
+ __forceinline __m128i mask32() const {
+ return _mm_movm_epi32(v);
+ }
+
+ /* return int64 mask */
+ __forceinline __m256i mask64() const {
+ return _mm256_movm_epi64(v);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf(FalseTy) : v(0x0) {}
+ __forceinline vboolf(TrueTy) : v(0xf) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool operator [](size_t index) const {
+ assert(index < 4); return (mm512_mask2int(v) >> index) & 1;
+ }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf4 operator !(const vboolf4& a) { return _mm512_kandn(a, 0xf); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm512_kand(a, b); }
+ __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm512_kor(a, b); }
+ __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); }
+
+ __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm512_kandn(b, a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; }
+ __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; }
+ __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); }
+ __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); }
+
+ __forceinline vboolf4 select(const vboolf4& s, const vboolf4& a, const vboolf4& b) {
+ return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reduction Operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline int all (const vboolf4& a) { return a.v == 0xf; }
+ __forceinline int any (const vboolf4& a) { return _mm512_kortestz(a, a) == 0; }
+ __forceinline int none(const vboolf4& a) { return _mm512_kortestz(a, a) != 0; }
+
+ __forceinline int all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); }
+ __forceinline int any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); }
+ __forceinline int none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); }
+
+ __forceinline size_t movemask(const vboolf4& a) { return _mm512_kmov(a); }
+ __forceinline size_t popcnt (const vboolf4& a) { return popcnt(a.v); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Conversion Operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline unsigned int toInt(const vboolf4& a) { return mm512_mask2int(a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Get/Set Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool get(const vboolf4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; }
+ __forceinline void set(vboolf4& a, size_t index) { assert(index < 4); a |= 1 << index; }
+ __forceinline void clear(vboolf4& a, size_t index) { assert(index < 4); a = andn(a, 1 << index); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a)
+ {
+ cout << "<";
+ for (size_t i=0; i<4; i++) {
+ if ((a.v >> i) & 1) cout << "1"; else cout << "0";
+ }
+ return cout << ">";
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vboolf4_sse2.h b/thirdparty/embree/common/simd/vboolf4_sse2.h
new file mode 100644
index 0000000000..fa84b1b6ee
--- /dev/null
+++ b/thirdparty/embree/common/simd/vboolf4_sse2.h
@@ -0,0 +1,189 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 4-wide SSE bool type */
+ template<>
+ struct vboolf<4>
+ {
+ ALIGNED_STRUCT_(16);
+
+ typedef vboolf4 Bool;
+ typedef vint4 Int;
+ typedef vfloat4 Float;
+
+ enum { size = 4 }; // number of SIMD elements
+ union { __m128 v; int i[4]; }; // data
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf() {}
+ __forceinline vboolf(const vboolf4& other) { v = other.v; }
+ __forceinline vboolf4& operator =(const vboolf4& other) { v = other.v; return *this; }
+
+ __forceinline vboolf(__m128 input) : v(input) {}
+ __forceinline operator const __m128&() const { return v; }
+ __forceinline operator const __m128i() const { return _mm_castps_si128(v); }
+ __forceinline operator const __m128d() const { return _mm_castps_pd(v); }
+
+ __forceinline vboolf(bool a)
+ : v(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
+ __forceinline vboolf(bool a, bool b)
+ : v(mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+ __forceinline vboolf(bool a, bool b, bool c, bool d)
+ : v(mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+ __forceinline vboolf(int mask) { assert(mask >= 0 && mask < 16); v = mm_lookupmask_ps[mask]; }
+ __forceinline vboolf(unsigned int mask) { assert(mask < 16); v = mm_lookupmask_ps[mask]; }
+
+ /* return int32 mask */
+ __forceinline __m128i mask32() const {
+ return _mm_castps_si128(v);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf(FalseTy) : v(_mm_setzero_ps()) {}
+ __forceinline vboolf(TrueTy) : v(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm_movemask_ps(v) >> index) & 1; }
+ __forceinline int& operator [](size_t index) { assert(index < 4); return i[index]; }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf4 operator !(const vboolf4& a) { return _mm_xor_ps(a, vboolf4(embree::True)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm_and_ps(a, b); }
+ __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm_or_ps (a, b); }
+ __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); }
+
+ __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm_andnot_ps(b, a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; }
+ __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; }
+ __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); }
+ __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); }
+
+ __forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) {
+#if defined(__SSE4_1__)
+ return _mm_blendv_ps(f, t, m);
+#else
+ return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
+#endif
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Movement/Shifting/Shuffling Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a, b); }
+ __forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a, b); }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vboolf4 shuffle(const vboolf4& v) {
+ return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) {
+ return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+ }
+
+ template<int i0>
+ __forceinline vboolf4 shuffle(const vboolf4& v) {
+ return shuffle<i0,i0,i0,i0>(v);
+ }
+
+#if defined(__SSE3__)
+ template<> __forceinline vboolf4 shuffle<0, 0, 2, 2>(const vboolf4& v) { return _mm_moveldup_ps(v); }
+ template<> __forceinline vboolf4 shuffle<1, 1, 3, 3>(const vboolf4& v) { return _mm_movehdup_ps(v); }
+ template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v)); }
+#endif
+
+#if defined(__SSE4_1__)
+ template<int dst, int src, int clr> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
+ template<int dst, int src> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return insert<dst, src, 0>(a, b); }
+ template<int dst> __forceinline vboolf4 insert(const vboolf4& a, const bool b) { return insert<dst, 0>(a, vboolf4(b)); }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reduction Operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool reduce_and(const vboolf4& a) { return _mm_movemask_ps(a) == 0xf; }
+ __forceinline bool reduce_or (const vboolf4& a) { return _mm_movemask_ps(a) != 0x0; }
+
+ __forceinline bool all (const vboolf4& b) { return _mm_movemask_ps(b) == 0xf; }
+ __forceinline bool any (const vboolf4& b) { return _mm_movemask_ps(b) != 0x0; }
+ __forceinline bool none(const vboolf4& b) { return _mm_movemask_ps(b) == 0x0; }
+
+ __forceinline bool all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); }
+ __forceinline bool any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); }
+ __forceinline bool none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); }
+
+ __forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a); }
+#if defined(__SSE4_2__)
+ __forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a)); }
+#else
+ __forceinline size_t popcnt(const vboolf4& a) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Get/Set Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool get(const vboolf4& a, size_t index) { return a[index]; }
+ __forceinline void set(vboolf4& a, size_t index) { a[index] = -1; }
+ __forceinline void clear(vboolf4& a, size_t index) { a[index] = 0; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) {
+ return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vboolf8_avx.h b/thirdparty/embree/common/simd/vboolf8_avx.h
new file mode 100644
index 0000000000..ba77cc3c5e
--- /dev/null
+++ b/thirdparty/embree/common/simd/vboolf8_avx.h
@@ -0,0 +1,202 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 8-wide AVX bool type */
+ template<>
+ struct vboolf<8>
+ {
+ ALIGNED_STRUCT_(32);
+
+ typedef vboolf8 Bool;
+ typedef vint8 Int;
+ typedef vfloat8 Float;
+
+ enum { size = 8 }; // number of SIMD elements
+ union { // data
+ __m256 v;
+ struct { __m128 vl,vh; };
+ int i[8];
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf() {}
+ __forceinline vboolf(const vboolf8& a) { v = a.v; }
+ __forceinline vboolf8& operator =(const vboolf8& a) { v = a.v; return *this; }
+
+ __forceinline vboolf(__m256 a) : v(a) {}
+ __forceinline operator const __m256&() const { return v; }
+ __forceinline operator const __m256i() const { return _mm256_castps_si256(v); }
+ __forceinline operator const __m256d() const { return _mm256_castps_pd(v); }
+
+ __forceinline vboolf(int a)
+ {
+ assert(a >= 0 && a <= 255);
+#if defined (__AVX2__)
+ const __m256i mask = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1);
+ const __m256i b = _mm256_set1_epi32(a);
+ const __m256i c = _mm256_and_si256(b,mask);
+ v = _mm256_castsi256_ps(_mm256_cmpeq_epi32(c,mask));
+#else
+ vl = mm_lookupmask_ps[a & 0xF];
+ vh = mm_lookupmask_ps[a >> 4];
+#endif
+ }
+
+ __forceinline vboolf(const vboolf4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {}
+ __forceinline vboolf(const vboolf4& a, const vboolf4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {}
+ __forceinline vboolf(__m128 a, __m128 b) : vl(a), vh(b) {}
+
+ __forceinline vboolf(bool a) : v(vboolf8(vboolf4(a), vboolf4(a))) {}
+ __forceinline vboolf(bool a, bool b) : v(vboolf8(vboolf4(a), vboolf4(b))) {}
+ __forceinline vboolf(bool a, bool b, bool c, bool d) : v(vboolf8(vboolf4(a,b), vboolf4(c,d))) {}
+ __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) : v(vboolf8(vboolf4(a,b,c,d), vboolf4(e,f,g,h))) {}
+
+ /* return int32 mask */
+ __forceinline __m256i mask32() const {
+ return _mm256_castps_si256(v);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf(FalseTy) : v(_mm256_setzero_ps()) {}
+ __forceinline vboolf(TrueTy) : v(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_OQ)) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool operator [](size_t index) const { assert(index < 8); return (_mm256_movemask_ps(v) >> index) & 1; }
+ __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf8 operator !(const vboolf8& a) { return _mm256_xor_ps(a, vboolf8(embree::True)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm256_and_ps(a, b); }
+ __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm256_or_ps (a, b); }
+ __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); }
+
+ __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm256_andnot_ps(b, a); }
+
+ __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; }
+ __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; }
+ __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); }
+ __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(_mm256_xor_ps(a,b),vboolf8(embree::True)); }
+
+ __forceinline vboolf8 select(const vboolf8& mask, const vboolf8& t, const vboolf8& f) {
+ return _mm256_blendv_ps(f, t, mask);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Movement/Shifting/Shuffling Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf8 unpacklo(const vboolf8& a, const vboolf8& b) { return _mm256_unpacklo_ps(a, b); }
+ __forceinline vboolf8 unpackhi(const vboolf8& a, const vboolf8& b) { return _mm256_unpackhi_ps(a, b); }
+
+ template<int i>
+ __forceinline vboolf8 shuffle(const vboolf8& v) {
+ return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i));
+ }
+
+ template<int i0, int i1>
+ __forceinline vboolf8 shuffle4(const vboolf8& v) {
+ return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0));
+ }
+
+ template<int i0, int i1>
+ __forceinline vboolf8 shuffle4(const vboolf8& a, const vboolf8& b) {
+ return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vboolf8 shuffle(const vboolf8& v) {
+ return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vboolf8 shuffle(const vboolf8& a, const vboolf8& b) {
+ return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+ }
+
+ template<> __forceinline vboolf8 shuffle<0, 0, 2, 2>(const vboolf8& v) { return _mm256_moveldup_ps(v); }
+ template<> __forceinline vboolf8 shuffle<1, 1, 3, 3>(const vboolf8& v) { return _mm256_movehdup_ps(v); }
+ template<> __forceinline vboolf8 shuffle<0, 1, 0, 1>(const vboolf8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); }
+
+ template<int i> __forceinline vboolf8 insert4(const vboolf8& a, const vboolf4& b) { return _mm256_insertf128_ps(a, b, i); }
+ template<int i> __forceinline vboolf4 extract4 (const vboolf8& a) { return _mm256_extractf128_ps(a, i); }
+ template<> __forceinline vboolf4 extract4<0>(const vboolf8& a) { return _mm256_castps256_ps128(a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reduction Operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool reduce_and(const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; }
+ __forceinline bool reduce_or (const vboolf8& a) { return !_mm256_testz_ps(a,a); }
+
+ __forceinline bool all (const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; }
+ __forceinline bool any (const vboolf8& a) { return !_mm256_testz_ps(a,a); }
+ __forceinline bool none(const vboolf8& a) { return _mm256_testz_ps(a,a) != 0; }
+
+ __forceinline bool all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); }
+ __forceinline bool any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); }
+ __forceinline bool none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); }
+
+ __forceinline unsigned int movemask(const vboolf8& a) { return _mm256_movemask_ps(a); }
+ __forceinline size_t popcnt (const vboolf8& a) { return popcnt((size_t)_mm256_movemask_ps(a)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Get/Set Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool get(const vboolf8& a, size_t index) { return a[index]; }
+ __forceinline void set(vboolf8& a, size_t index) { a[index] = -1; }
+ __forceinline void clear(vboolf8& a, size_t index) { a[index] = 0; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) {
+ return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", "
+ << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vboolf8_avx512.h b/thirdparty/embree/common/simd/vboolf8_avx512.h
new file mode 100644
index 0000000000..73ff5666e1
--- /dev/null
+++ b/thirdparty/embree/common/simd/vboolf8_avx512.h
@@ -0,0 +1,159 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 8-wide AVX-512 bool type */
+ template<>
+ struct vboolf<8>
+ {
+ typedef vboolf8 Bool;
+ typedef vint8 Int;
+
+ enum { size = 8 }; // number of SIMD elements
+ __mmask8 v; // data
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf() {}
+ __forceinline vboolf(const vboolf8& t) { v = t.v; }
+ __forceinline vboolf8& operator =(const vboolf8& f) { v = f.v; return *this; }
+
+ __forceinline vboolf(const __mmask8 &t) { v = t; }
+ __forceinline operator __mmask8() const { return v; }
+
+ __forceinline vboolf(bool b) { v = b ? 0xff : 0x00; }
+ __forceinline vboolf(int t) { v = (__mmask8)t; }
+ __forceinline vboolf(unsigned int t) { v = (__mmask8)t; }
+
+ __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h)
+ : v((__mmask8)((int(h) << 7) | (int(g) << 6) | (int(f) << 5) | (int(e) << 4) | (int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {}
+
+ /* return int8 mask */
+ __forceinline __m128i mask8() const {
+ return _mm_movm_epi8(v);
+ }
+
+ /* return int32 mask */
+ __forceinline __m256i mask32() const {
+ return _mm256_movm_epi32(v);
+ }
+
+ /* return int64 mask */
+ __forceinline __m512i mask64() const {
+ return _mm512_movm_epi64(v);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf(FalseTy) : v(0x00) {}
+ __forceinline vboolf(TrueTy) : v(0xff) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool operator [](size_t index) const {
+ assert(index < 8); return (mm512_mask2int(v) >> index) & 1;
+ }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf8 operator !(const vboolf8& a) { return _mm512_knot(a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm512_kand(a, b); }
+ __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm512_kor(a, b); }
+ __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); }
+
+ __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm512_kandn(b, a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; }
+ __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; }
+ __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); }
+ __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm512_kxnor(a, b); }
+
+ __forceinline vboolf8 select(const vboolf8& s, const vboolf8& a, const vboolf8& b) {
+ return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reduction Operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline int all (const vboolf8& a) { return a.v == 0xff; }
+ __forceinline int any (const vboolf8& a) { return _mm512_kortestz(a, a) == 0; }
+ __forceinline int none(const vboolf8& a) { return _mm512_kortestz(a, a) != 0; }
+
+ __forceinline int all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); }
+ __forceinline int any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); }
+ __forceinline int none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); }
+
+ __forceinline size_t movemask(const vboolf8& a) { return _mm512_kmov(a); }
+ __forceinline size_t popcnt (const vboolf8& a) { return popcnt(a.v); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Conversion Operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline unsigned int toInt(const vboolf8& a) { return mm512_mask2int(a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Get/Set Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool get(const vboolf8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; }
+ __forceinline void set(vboolf8& a, size_t index) { assert(index < 8); a |= 1 << index; }
+ __forceinline void clear(vboolf8& a, size_t index) { assert(index < 8); a = andn(a, 1 << index); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a)
+ {
+ cout << "<";
+ for (size_t i=0; i<8; i++) {
+ if ((a.v >> i) & 1) cout << "1"; else cout << "0";
+ }
+ return cout << ">";
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vdouble4_avx.h b/thirdparty/embree/common/simd/vdouble4_avx.h
new file mode 100644
index 0000000000..55326de7dd
--- /dev/null
+++ b/thirdparty/embree/common/simd/vdouble4_avx.h
@@ -0,0 +1,321 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 4-wide AVX 64-bit double type */
+ template<>
+ struct vdouble<4>
+ {
+ ALIGNED_STRUCT_(32);
+
+ typedef vboold4 Bool;
+
+ enum { size = 4 }; // number of SIMD elements
+ union { // data
+ __m256d v;
+ double i[4];
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vdouble() {}
+ __forceinline vdouble(const vdouble4& t) { v = t.v; }
+ __forceinline vdouble4& operator =(const vdouble4& f) { v = f.v; return *this; }
+
+ __forceinline vdouble(const __m256d& t) { v = t; }
+ __forceinline operator __m256d() const { return v; }
+
+ __forceinline vdouble(double i) {
+ v = _mm256_set1_pd(i);
+ }
+
+ __forceinline vdouble(double a, double b, double c, double d) {
+ v = _mm256_set_pd(d,c,b,a);
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vdouble(ZeroTy) : v(_mm256_setzero_pd()) {}
+ __forceinline vdouble(OneTy) : v(_mm256_set1_pd(1)) {}
+ __forceinline vdouble(StepTy) : v(_mm256_set_pd(3.0,2.0,1.0,0.0)) {}
+ __forceinline vdouble(ReverseStepTy) : v(_mm256_setr_pd(3.0,2.0,1.0,0.0)) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Loads and Stores
+ ////////////////////////////////////////////////////////////////////////////////
+
+ static __forceinline void store_nt(double *__restrict__ ptr, const vdouble4& a) {
+ _mm256_stream_pd(ptr, a);
+ }
+
+ static __forceinline vdouble4 loadu(const double* addr) {
+ return _mm256_loadu_pd(addr);
+ }
+
+ static __forceinline vdouble4 load(const vdouble4* addr) {
+ return _mm256_load_pd((double*)addr);
+ }
+
+ static __forceinline vdouble4 load(const double* addr) {
+ return _mm256_load_pd(addr);
+ }
+
+ static __forceinline void store(double* ptr, const vdouble4& v) {
+ _mm256_store_pd(ptr, v);
+ }
+
+ static __forceinline void storeu(double* ptr, const vdouble4& v) {
+ _mm256_storeu_pd(ptr, v);
+ }
+
+ static __forceinline vdouble4 broadcast(const void* a) { return _mm256_set1_pd(*(double*)a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline double& operator [](size_t index) { assert(index < 4); return i[index]; }
+ __forceinline const double& operator [](size_t index) const { assert(index < 4); return i[index]; }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__)
+ __forceinline vdouble4 asDouble(const vllong4& a) { return _mm256_castsi256_pd(a); }
+ __forceinline vllong4 asLLong (const vdouble4& a) { return _mm256_castpd_si256(a); }
+#endif
+
+ __forceinline vdouble4 operator +(const vdouble4& a) { return a; }
+ __forceinline vdouble4 operator -(const vdouble4& a) { return _mm256_sub_pd(_mm256_setzero_pd(), a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vdouble4 operator +(const vdouble4& a, const vdouble4& b) { return _mm256_add_pd(a, b); }
+ __forceinline vdouble4 operator +(const vdouble4& a, double b) { return a + vdouble4(b); }
+ __forceinline vdouble4 operator +(double a, const vdouble4& b) { return vdouble4(a) + b; }
+
+ __forceinline vdouble4 operator -(const vdouble4& a, const vdouble4& b) { return _mm256_sub_pd(a, b); }
+ __forceinline vdouble4 operator -(const vdouble4& a, double b) { return a - vdouble4(b); }
+ __forceinline vdouble4 operator -(double a, const vdouble4& b) { return vdouble4(a) - b; }
+
+ __forceinline vdouble4 operator *(const vdouble4& a, const vdouble4& b) { return _mm256_mul_pd(a, b); }
+ __forceinline vdouble4 operator *(const vdouble4& a, double b) { return a * vdouble4(b); }
+ __forceinline vdouble4 operator *(double a, const vdouble4& b) { return vdouble4(a) * b; }
+
+ __forceinline vdouble4 operator &(const vdouble4& a, const vdouble4& b) { return _mm256_and_pd(a, b); }
+ __forceinline vdouble4 operator &(const vdouble4& a, double b) { return a & vdouble4(b); }
+ __forceinline vdouble4 operator &(double a, const vdouble4& b) { return vdouble4(a) & b; }
+
+ __forceinline vdouble4 operator |(const vdouble4& a, const vdouble4& b) { return _mm256_or_pd(a, b); }
+ __forceinline vdouble4 operator |(const vdouble4& a, double b) { return a | vdouble4(b); }
+ __forceinline vdouble4 operator |(double a, const vdouble4& b) { return vdouble4(a) | b; }
+
+ __forceinline vdouble4 operator ^(const vdouble4& a, const vdouble4& b) { return _mm256_xor_pd(a, b); }
+ __forceinline vdouble4 operator ^(const vdouble4& a, double b) { return a ^ vdouble4(b); }
+ __forceinline vdouble4 operator ^(double a, const vdouble4& b) { return vdouble4(a) ^ b; }
+
+ __forceinline vdouble4 min(const vdouble4& a, const vdouble4& b) { return _mm256_min_pd(a, b); }
+ __forceinline vdouble4 min(const vdouble4& a, double b) { return min(a,vdouble4(b)); }
+ __forceinline vdouble4 min(double a, const vdouble4& b) { return min(vdouble4(a),b); }
+
+ __forceinline vdouble4 max(const vdouble4& a, const vdouble4& b) { return _mm256_max_pd(a, b); }
+ __forceinline vdouble4 max(const vdouble4& a, double b) { return max(a,vdouble4(b)); }
+ __forceinline vdouble4 max(double a, const vdouble4& b) { return max(vdouble4(a),b); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Ternary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__FMA__)
+ __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmadd_pd(a,b,c); }
+ __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmsub_pd(a,b,c); }
+ __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmadd_pd(a,b,c); }
+ __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmsub_pd(a,b,c); }
+#else
+ __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b+c; }
+ __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b-c; }
+ __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b+c;}
+ __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b-c; }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vdouble4& operator +=(vdouble4& a, const vdouble4& b) { return a = a + b; }
+ __forceinline vdouble4& operator +=(vdouble4& a, double b) { return a = a + b; }
+
+ __forceinline vdouble4& operator -=(vdouble4& a, const vdouble4& b) { return a = a - b; }
+ __forceinline vdouble4& operator -=(vdouble4& a, double b) { return a = a - b; }
+
+ __forceinline vdouble4& operator *=(vdouble4& a, const vdouble4& b) { return a = a * b; }
+ __forceinline vdouble4& operator *=(vdouble4& a, double b) { return a = a * b; }
+
+ __forceinline vdouble4& operator &=(vdouble4& a, const vdouble4& b) { return a = a & b; }
+ __forceinline vdouble4& operator &=(vdouble4& a, double b) { return a = a & b; }
+
+ __forceinline vdouble4& operator |=(vdouble4& a, const vdouble4& b) { return a = a | b; }
+ __forceinline vdouble4& operator |=(vdouble4& a, double b) { return a = a | b; }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+ __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_EQ); }
+ __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_NE); }
+ __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LT); }
+ __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GE); }
+ __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GT); }
+ __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LE); }
+#else
+ __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); }
+ __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); }
+ __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }
+ __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); }
+ __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); }
+ __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); }
+#endif
+
+ __forceinline vboold4 operator ==(const vdouble4& a, double b) { return a == vdouble4(b); }
+ __forceinline vboold4 operator ==(double a, const vdouble4& b) { return vdouble4(a) == b; }
+
+ __forceinline vboold4 operator !=(const vdouble4& a, double b) { return a != vdouble4(b); }
+ __forceinline vboold4 operator !=(double a, const vdouble4& b) { return vdouble4(a) != b; }
+
+ __forceinline vboold4 operator < (const vdouble4& a, double b) { return a < vdouble4(b); }
+ __forceinline vboold4 operator < (double a, const vdouble4& b) { return vdouble4(a) < b; }
+
+ __forceinline vboold4 operator >=(const vdouble4& a, double b) { return a >= vdouble4(b); }
+ __forceinline vboold4 operator >=(double a, const vdouble4& b) { return vdouble4(a) >= b; }
+
+ __forceinline vboold4 operator > (const vdouble4& a, double b) { return a > vdouble4(b); }
+ __forceinline vboold4 operator > (double a, const vdouble4& b) { return vdouble4(a) > b; }
+
+ __forceinline vboold4 operator <=(const vdouble4& a, double b) { return a <= vdouble4(b); }
+ __forceinline vboold4 operator <=(double a, const vdouble4& b) { return vdouble4(a) <= b; }
+
+ __forceinline vboold4 eq(const vdouble4& a, const vdouble4& b) { return a == b; }
+ __forceinline vboold4 ne(const vdouble4& a, const vdouble4& b) { return a != b; }
+ __forceinline vboold4 lt(const vdouble4& a, const vdouble4& b) { return a < b; }
+ __forceinline vboold4 ge(const vdouble4& a, const vdouble4& b) { return a >= b; }
+ __forceinline vboold4 gt(const vdouble4& a, const vdouble4& b) { return a > b; }
+ __forceinline vboold4 le(const vdouble4& a, const vdouble4& b) { return a <= b; }
+
+#if defined(__AVX512VL__)
+ __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_EQ); }
+ __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_NE); }
+ __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LT); }
+ __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GE); }
+ __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GT); }
+ __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+ __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a == b); }
+ __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a != b); }
+ __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a < b); }
+ __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a >= b); }
+ __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a > b); }
+ __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a <= b); }
+#endif
+
+ __forceinline vdouble4 select(const vboold4& m, const vdouble4& t, const vdouble4& f) {
+#if defined(__AVX512VL__)
+ return _mm256_mask_blend_pd(m, f, t);
+#else
+ return _mm256_blendv_pd(f, t, m);
+#endif
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Movement/Shifting/Shuffling Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<int i0, int i1>
+ __forceinline vdouble4 shuffle(const vdouble4& v) {
+ return _mm256_permute_pd(v, (i1 << 3) | (i0 << 2) | (i1 << 1) | i0);
+ }
+
+ template<int i>
+ __forceinline vdouble4 shuffle(const vdouble4& v) {
+ return shuffle<i, i>(v);
+ }
+
+ template<int i0, int i1>
+ __forceinline vdouble4 shuffle2(const vdouble4& v) {
+ return _mm256_permute2f128_pd(v, v, (i1 << 4) | i0);
+ }
+
+ __forceinline double toScalar(const vdouble4& v) {
+ return _mm_cvtsd_f64(_mm256_castpd256_pd128(v));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vdouble4 vreduce_min2(const vdouble4& x) { return min(x, shuffle<1,0>(x)); }
+ __forceinline vdouble4 vreduce_min (const vdouble4& y) { const vdouble4 x = vreduce_min2(y); return min(x, shuffle2<1,0>(x)); }
+
+ __forceinline vdouble4 vreduce_max2(const vdouble4& x) { return max(x,shuffle<1,0>(x)); }
+ __forceinline vdouble4 vreduce_max (const vdouble4& y) { const vdouble4 x = vreduce_max2(y); return max(x, shuffle2<1,0>(x)); }
+
+ __forceinline vdouble4 vreduce_and2(const vdouble4& x) { return x & shuffle<1,0>(x); }
+ __forceinline vdouble4 vreduce_and (const vdouble4& y) { const vdouble4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); }
+
+ __forceinline vdouble4 vreduce_or2(const vdouble4& x) { return x | shuffle<1,0>(x); }
+ __forceinline vdouble4 vreduce_or (const vdouble4& y) { const vdouble4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); }
+
+ __forceinline vdouble4 vreduce_add2(const vdouble4& x) { return x + shuffle<1,0>(x); }
+ __forceinline vdouble4 vreduce_add (const vdouble4& y) { const vdouble4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); }
+
+ __forceinline double reduce_add(const vdouble4& a) { return toScalar(vreduce_add(a)); }
+ __forceinline double reduce_min(const vdouble4& a) { return toScalar(vreduce_min(a)); }
+ __forceinline double reduce_max(const vdouble4& a) { return toScalar(vreduce_max(a)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Memory load and store operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble4& v)
+ {
+ cout << "<" << v[0];
+ for (size_t i=1; i<4; i++) cout << ", " << v[i];
+ cout << ">";
+ return cout;
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vdouble8_avx512.h b/thirdparty/embree/common/simd/vdouble8_avx512.h
new file mode 100644
index 0000000000..98d21bfe4a
--- /dev/null
+++ b/thirdparty/embree/common/simd/vdouble8_avx512.h
@@ -0,0 +1,351 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 8-wide AVX-512 64-bit double type */
+ template<>
+ struct vdouble<8>
+ {
+ ALIGNED_STRUCT_(64);
+
+ typedef vboold8 Bool;
+
+ enum { size = 8 }; // number of SIMD elements
+ union { // data
+ __m512d v;
+ double i[8];
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vdouble() {}
+ __forceinline vdouble(const vdouble8& t) { v = t.v; }
+ __forceinline vdouble8& operator =(const vdouble8& f) { v = f.v; return *this; }
+
+ __forceinline vdouble(const __m512d& t) { v = t; }
+ __forceinline operator __m512d() const { return v; }
+ __forceinline operator __m256d() const { return _mm512_castpd512_pd256(v); }
+
+ __forceinline vdouble(double i) {
+ v = _mm512_set1_pd(i);
+ }
+
+ __forceinline vdouble(double a, double b, double c, double d) {
+ v = _mm512_set4_pd(d,c,b,a);
+ }
+
+ __forceinline vdouble(double a0, double a1, double a2, double a3,
+ double a4, double a5, double a6, double a7)
+ {
+ v = _mm512_set_pd(a7,a6,a5,a4,a3,a2,a1,a0);
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vdouble(ZeroTy) : v(_mm512_setzero_pd()) {}
+ __forceinline vdouble(OneTy) : v(_mm512_set1_pd(1)) {}
+ __forceinline vdouble(StepTy) : v(_mm512_set_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {}
+ __forceinline vdouble(ReverseStepTy) : v(_mm512_setr_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Loads and Stores
+ ////////////////////////////////////////////////////////////////////////////////
+
+ static __forceinline void store_nt(void *__restrict__ ptr, const vdouble8& a) {
+ _mm512_stream_pd((double*)ptr, a);
+ }
+
+ static __forceinline vdouble8 loadu(const void* addr) {
+ return _mm512_loadu_pd((double*)addr);
+ }
+
+ static __forceinline vdouble8 load(const vdouble8* addr) {
+ return _mm512_load_pd((double*)addr);
+ }
+
+ static __forceinline vdouble8 load(const double* addr) {
+ return _mm512_load_pd(addr);
+ }
+
+ static __forceinline void store(void* ptr, const vdouble8& v) {
+ _mm512_store_pd(ptr, v);
+ }
+
+ static __forceinline void storeu(void* ptr, const vdouble8& v) {
+ _mm512_storeu_pd(ptr, v);
+ }
+
+ static __forceinline void storeu(const vboold8& mask, double* ptr, const vdouble8& f) {
+ _mm512_mask_storeu_pd(ptr, mask, f);
+ }
+
+ static __forceinline void store(const vboold8& mask, void* addr, const vdouble8& v2) {
+ _mm512_mask_store_pd(addr, mask, v2);
+ }
+
+ static __forceinline vdouble8 compact(const vboold8& mask, vdouble8& v) {
+ return _mm512_mask_compress_pd(v, mask, v);
+ }
+
+ static __forceinline vdouble8 compact(const vboold8& mask, const vdouble8& a, vdouble8& b) {
+ return _mm512_mask_compress_pd(a, mask, b);
+ }
+
+ static __forceinline vdouble8 broadcast(const void* a) { return _mm512_set1_pd(*(double*)a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline double& operator [](size_t index) { assert(index < 8); return i[index]; }
+ __forceinline const double& operator [](size_t index) const { assert(index < 8); return i[index]; }
+
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vdouble8 asDouble(const vllong8& a) { return _mm512_castsi512_pd(a); }
+ __forceinline vllong8 asLLong (const vdouble8& a) { return _mm512_castpd_si512(a); }
+
+ __forceinline vdouble8 operator +(const vdouble8& a) { return a; }
+ __forceinline vdouble8 operator -(const vdouble8& a) { return _mm512_sub_pd(_mm512_setzero_pd(), a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vdouble8 operator +(const vdouble8& a, const vdouble8& b) { return _mm512_add_pd(a, b); }
+ __forceinline vdouble8 operator +(const vdouble8& a, double b) { return a + vdouble8(b); }
+ __forceinline vdouble8 operator +(double a, const vdouble8& b) { return vdouble8(a) + b; }
+
+ __forceinline vdouble8 operator -(const vdouble8& a, const vdouble8& b) { return _mm512_sub_pd(a, b); }
+ __forceinline vdouble8 operator -(const vdouble8& a, double b) { return a - vdouble8(b); }
+ __forceinline vdouble8 operator -(double a, const vdouble8& b) { return vdouble8(a) - b; }
+
+ __forceinline vdouble8 operator *(const vdouble8& a, const vdouble8& b) { return _mm512_mul_pd(a, b); }
+ __forceinline vdouble8 operator *(const vdouble8& a, double b) { return a * vdouble8(b); }
+ __forceinline vdouble8 operator *(double a, const vdouble8& b) { return vdouble8(a) * b; }
+
+ __forceinline vdouble8 operator &(const vdouble8& a, const vdouble8& b) { return _mm512_and_pd(a, b); }
+ __forceinline vdouble8 operator &(const vdouble8& a, double b) { return a & vdouble8(b); }
+ __forceinline vdouble8 operator &(double a, const vdouble8& b) { return vdouble8(a) & b; }
+
+ __forceinline vdouble8 operator |(const vdouble8& a, const vdouble8& b) { return _mm512_or_pd(a, b); }
+ __forceinline vdouble8 operator |(const vdouble8& a, double b) { return a | vdouble8(b); }
+ __forceinline vdouble8 operator |(double a, const vdouble8& b) { return vdouble8(a) | b; }
+
+ __forceinline vdouble8 operator ^(const vdouble8& a, const vdouble8& b) { return _mm512_xor_pd(a, b); }
+ __forceinline vdouble8 operator ^(const vdouble8& a, double b) { return a ^ vdouble8(b); }
+ __forceinline vdouble8 operator ^(double a, const vdouble8& b) { return vdouble8(a) ^ b; }
+
+ __forceinline vdouble8 operator <<(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), n)); }
+ __forceinline vdouble8 operator >>(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), n)); }
+
+ __forceinline vdouble8 operator <<(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_sllv_epi64(_mm512_castpd_si512(a), n)); }
+ __forceinline vdouble8 operator >>(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_srav_epi64(_mm512_castpd_si512(a), n)); }
+
+ __forceinline vdouble8 sll (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), b)); }
+ __forceinline vdouble8 sra (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), b)); }
+ __forceinline vdouble8 srl (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srli_epi64(_mm512_castpd_si512(a), b)); }
+
+ __forceinline vdouble8 min(const vdouble8& a, const vdouble8& b) { return _mm512_min_pd(a, b); }
+ __forceinline vdouble8 min(const vdouble8& a, double b) { return min(a,vdouble8(b)); }
+ __forceinline vdouble8 min(double a, const vdouble8& b) { return min(vdouble8(a),b); }
+
+ __forceinline vdouble8 max(const vdouble8& a, const vdouble8& b) { return _mm512_max_pd(a, b); }
+ __forceinline vdouble8 max(const vdouble8& a, double b) { return max(a,vdouble8(b)); }
+ __forceinline vdouble8 max(double a, const vdouble8& b) { return max(vdouble8(a),b); }
+
+ __forceinline vdouble8 mask_add(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_add_pd(c,mask,a,b); }
+ __forceinline vdouble8 mask_sub(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_sub_pd(c,mask,a,b); }
+
+ __forceinline vdouble8 mask_and(const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_and_pd(c,m,a,b); }
+ __forceinline vdouble8 mask_or (const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_or_pd(c,m,a,b); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Ternary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vdouble8 madd (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmadd_pd(a,b,c); }
+ __forceinline vdouble8 msub (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmsub_pd(a,b,c); }
+ __forceinline vdouble8 nmadd(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmadd_pd(a,b,c); }
+ __forceinline vdouble8 nmsub(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmsub_pd(a,b,c); }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vdouble8& operator +=(vdouble8& a, const vdouble8& b) { return a = a + b; }
+ __forceinline vdouble8& operator +=(vdouble8& a, double b) { return a = a + b; }
+
+ __forceinline vdouble8& operator -=(vdouble8& a, const vdouble8& b) { return a = a - b; }
+ __forceinline vdouble8& operator -=(vdouble8& a, double b) { return a = a - b; }
+
+ __forceinline vdouble8& operator *=(vdouble8& a, const vdouble8& b) { return a = a * b; }
+ __forceinline vdouble8& operator *=(vdouble8& a, double b) { return a = a * b; }
+
+ __forceinline vdouble8& operator &=(vdouble8& a, const vdouble8& b) { return a = a & b; }
+ __forceinline vdouble8& operator &=(vdouble8& a, double b) { return a = a & b; }
+
+ __forceinline vdouble8& operator |=(vdouble8& a, const vdouble8& b) { return a = a | b; }
+ __forceinline vdouble8& operator |=(vdouble8& a, double b) { return a = a | b; }
+
+ __forceinline vdouble8& operator <<=(vdouble8& a, const double b) { return a = a << b; }
+ __forceinline vdouble8& operator >>=(vdouble8& a, const double b) { return a = a >> b; }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold8 operator ==(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); }
+ __forceinline vboold8 operator ==(const vdouble8& a, double b) { return a == vdouble8(b); }
+ __forceinline vboold8 operator ==(double a, const vdouble8& b) { return vdouble8(a) == b; }
+
+ __forceinline vboold8 operator !=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); }
+ __forceinline vboold8 operator !=(const vdouble8& a, double b) { return a != vdouble8(b); }
+ __forceinline vboold8 operator !=(double a, const vdouble8& b) { return vdouble8(a) != b; }
+
+ __forceinline vboold8 operator < (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); }
+ __forceinline vboold8 operator < (const vdouble8& a, double b) { return a < vdouble8(b); }
+ __forceinline vboold8 operator < (double a, const vdouble8& b) { return vdouble8(a) < b; }
+
+ __forceinline vboold8 operator >=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); }
+ __forceinline vboold8 operator >=(const vdouble8& a, double b) { return a >= vdouble8(b); }
+ __forceinline vboold8 operator >=(double a, const vdouble8& b) { return vdouble8(a) >= b; }
+
+ __forceinline vboold8 operator > (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); }
+ __forceinline vboold8 operator > (const vdouble8& a, double b) { return a > vdouble8(b); }
+ __forceinline vboold8 operator > (double a, const vdouble8& b) { return vdouble8(a) > b; }
+
+ __forceinline vboold8 operator <=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); }
+ __forceinline vboold8 operator <=(const vdouble8& a, double b) { return a <= vdouble8(b); }
+ __forceinline vboold8 operator <=(double a, const vdouble8& b) { return vdouble8(a) <= b; }
+
+ __forceinline vboold8 eq(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); }
+ __forceinline vboold8 ne(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); }
+ __forceinline vboold8 lt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); }
+ __forceinline vboold8 ge(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); }
+ __forceinline vboold8 gt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); }
+ __forceinline vboold8 le(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); }
+
+ __forceinline vboold8 eq(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_EQ); }
+ __forceinline vboold8 ne(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_NE); }
+ __forceinline vboold8 lt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LT); }
+ __forceinline vboold8 ge(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GE); }
+ __forceinline vboold8 gt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GT); }
+ __forceinline vboold8 le(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LE); }
+
+ __forceinline vdouble8 select(const vboold8& m, const vdouble8& t, const vdouble8& f) {
+ return _mm512_mask_or_pd(f,m,t,t);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Movement/Shifting/Shuffling Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<int i0, int i1>
+ __forceinline vdouble8 shuffle(const vdouble8& v) {
+ return _mm512_permute_pd(v, (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0);
+ }
+
+ template<int i>
+ __forceinline vdouble8 shuffle(const vdouble8& v) {
+ return shuffle<i, i>(v);
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vdouble8 shuffle(const vdouble8& v) {
+ return _mm512_permutex_pd(v, _MM_SHUFFLE(i3, i2, i1, i0));
+ }
+
+ template<int i0, int i1>
+ __forceinline vdouble8 shuffle4(const vdouble8& v) {
+ return _mm512_shuffle_f64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2));
+ }
+
+ template<int i>
+ __forceinline vdouble8 shuffle4(const vdouble8& v) {
+ return shuffle4<i, i>(v);
+ }
+
+ template<int i>
+ __forceinline vdouble8 align_shift_right(const vdouble8& a, const vdouble8& b) {
+ return _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(a), _mm512_castpd_si512(b), i));
+ }
+
+ __forceinline double toScalar(const vdouble8& v) {
+ return _mm_cvtsd_f64(_mm512_castpd512_pd128(v));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vdouble8 vreduce_add2(vdouble8 x) { return x + shuffle<1,0,3,2>(x); }
+ __forceinline vdouble8 vreduce_add4(vdouble8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }
+ __forceinline vdouble8 vreduce_add (vdouble8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); }
+
+ __forceinline vdouble8 vreduce_min2(vdouble8 x) { return min(x, shuffle<1,0,3,2>(x)); }
+ __forceinline vdouble8 vreduce_min4(vdouble8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }
+ __forceinline vdouble8 vreduce_min (vdouble8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); }
+
+ __forceinline vdouble8 vreduce_max2(vdouble8 x) { return max(x, shuffle<1,0,3,2>(x)); }
+ __forceinline vdouble8 vreduce_max4(vdouble8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
+ __forceinline vdouble8 vreduce_max (vdouble8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); }
+
+ __forceinline double reduce_add(const vdouble8& v) { return toScalar(vreduce_add(v)); }
+ __forceinline double reduce_min(const vdouble8& v) { return toScalar(vreduce_min(v)); }
+ __forceinline double reduce_max(const vdouble8& v) { return toScalar(vreduce_max(v)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Memory load and store operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vdouble8 permute(const vdouble8& v, const vllong8& index) {
+ return _mm512_permutexvar_pd(index, v);
+ }
+
+ __forceinline vdouble8 reverse(const vdouble8& a) {
+ return permute(a, vllong8(reverse_step));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble8& v)
+ {
+ cout << "<" << v[0];
+ for (size_t i=1; i<8; i++) cout << ", " << v[i];
+ cout << ">";
+ return cout;
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vfloat16_avx512.h b/thirdparty/embree/common/simd/vfloat16_avx512.h
new file mode 100644
index 0000000000..9f1e2459c4
--- /dev/null
+++ b/thirdparty/embree/common/simd/vfloat16_avx512.h
@@ -0,0 +1,615 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 16-wide AVX-512 float type */
+ template<>
+ struct vfloat<16>
+ {
+ ALIGNED_STRUCT_(64);
+
+ typedef vboolf16 Bool;
+ typedef vint16 Int;
+ typedef vfloat16 Float;
+
+ enum { size = 16 }; // number of SIMD elements
+ union { // data
+ __m512 v;
+ float f[16];
+ int i[16];
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat() {}
+ __forceinline vfloat(const vfloat16& t) { v = t; }
+ __forceinline vfloat16& operator =(const vfloat16& f) { v = f.v; return *this; }
+
+ __forceinline vfloat(const __m512& t) { v = t; }
+ __forceinline operator __m512() const { return v; }
+ __forceinline operator __m256() const { return _mm512_castps512_ps256(v); }
+ __forceinline operator __m128() const { return _mm512_castps512_ps128(v); }
+
+ __forceinline vfloat(float f) {
+ v = _mm512_set1_ps(f);
+ }
+
+ __forceinline vfloat(float a, float b, float c, float d) {
+ v = _mm512_set4_ps(a, b, c, d);
+ }
+
+ __forceinline vfloat(const vfloat4& i) {
+ v = _mm512_broadcast_f32x4(i);
+ }
+
+ __forceinline vfloat(const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d) {
+ v = _mm512_castps128_ps512(a);
+ v = _mm512_insertf32x4(v, b, 1);
+ v = _mm512_insertf32x4(v, c, 2);
+ v = _mm512_insertf32x4(v, d, 3);
+ }
+
+ __forceinline vfloat(const vboolf16& mask, const vfloat4& a, const vfloat4& b) {
+ v = _mm512_broadcast_f32x4(a);
+ v = _mm512_mask_broadcast_f32x4(v,mask,b);
+ }
+
+ __forceinline vfloat(const vfloat8& i) {
+ v = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(i)));
+ }
+
+ __forceinline vfloat(const vfloat8& a, const vfloat8& b) {
+ v = _mm512_castps256_ps512(a);
+#if defined(__AVX512DQ__)
+ v = _mm512_insertf32x8(v, b, 1);
+#else
+ v = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(v), _mm256_castps_pd(b), 1));
+#endif
+ }
+
+ /* WARNING: due to f64x4 the mask is considered as an 8bit mask */
+ /*__forceinline vfloat(const vboolf16& mask, const vfloat8& a, const vfloat8& b) {
+ __m512d aa = _mm512_broadcast_f64x4(_mm256_castps_pd(a));
+ aa = _mm512_mask_broadcast_f64x4(aa,mask,_mm256_castps_pd(b));
+ v = _mm512_castpd_ps(aa);
+ }*/
+
+ __forceinline explicit vfloat(const vint16& a) {
+ v = _mm512_cvtepi32_ps(a);
+ }
+
+ __forceinline explicit vfloat(const vuint16& a) {
+ v = _mm512_cvtepu32_ps(a);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat(ZeroTy) : v(_mm512_setzero_ps()) {}
+ __forceinline vfloat(OneTy) : v(_mm512_set1_ps(1.0f)) {}
+ __forceinline vfloat(PosInfTy) : v(_mm512_set1_ps(pos_inf)) {}
+ __forceinline vfloat(NegInfTy) : v(_mm512_set1_ps(neg_inf)) {}
+ __forceinline vfloat(StepTy) : v(_mm512_set_ps(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {}
+ __forceinline vfloat(NaNTy) : v(_mm512_set1_ps(nan)) {}
+ __forceinline vfloat(UndefinedTy) : v(_mm512_undefined_ps()) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Loads and Stores
+ ////////////////////////////////////////////////////////////////////////////////
+
+ static __forceinline vfloat16 load (const void* ptr) { return _mm512_load_ps((float*)ptr); }
+ static __forceinline vfloat16 loadu(const void* ptr) { return _mm512_loadu_ps((float*)ptr); }
+
+ static __forceinline vfloat16 load (const vboolf16& mask, const void* ptr) { return _mm512_mask_load_ps (_mm512_setzero_ps(),mask,(float*)ptr); }
+ static __forceinline vfloat16 loadu(const vboolf16& mask, const void* ptr) { return _mm512_mask_loadu_ps(_mm512_setzero_ps(),mask,(float*)ptr); }
+
+ static __forceinline void store (void* ptr, const vfloat16& v) { _mm512_store_ps ((float*)ptr,v); }
+ static __forceinline void storeu(void* ptr, const vfloat16& v) { _mm512_storeu_ps((float*)ptr,v); }
+
+ static __forceinline void store (const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_store_ps ((float*)ptr,mask,v); }
+ static __forceinline void storeu(const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_storeu_ps((float*)ptr,mask,v); }
+
+ static __forceinline void store_nt(void* __restrict__ ptr, const vfloat16& a) {
+ _mm512_stream_ps((float*)ptr,a);
+ }
+
+ static __forceinline vfloat16 broadcast(const float* f) {
+ return _mm512_set1_ps(*f);
+ }
+
+ template<int scale = 4>
+ static __forceinline vfloat16 gather(const float* ptr, const vint16& index) {
+ return _mm512_i32gather_ps(index, ptr, scale);
+ }
+
+ template<int scale = 4>
+ static __forceinline vfloat16 gather(const vboolf16& mask, const float* ptr, const vint16& index) {
+ vfloat16 r = zero;
+ return _mm512_mask_i32gather_ps(r, mask, index, ptr, scale);
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(float* ptr, const vint16& index, const vfloat16& v) {
+ _mm512_i32scatter_ps(ptr, index, v, scale);
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(const vboolf16& mask, float* ptr, const vint16& index, const vfloat16& v) {
+ _mm512_mask_i32scatter_ps(ptr, mask, index, v, scale);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline float& operator [](size_t index) { assert(index < 16); return f[index]; }
+ __forceinline const float& operator [](size_t index) const { assert(index < 16); return f[index]; }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat16 asFloat(const vint16& a) { return _mm512_castsi512_ps(a); }
+ __forceinline vint16 asInt (const vfloat16& a) { return _mm512_castps_si512(a); }
+ __forceinline vuint16 asUInt (const vfloat16& a) { return _mm512_castps_si512(a); }
+
+ __forceinline vint16 toInt (const vfloat16& a) { return vint16(a); }
+ __forceinline vfloat16 toFloat(const vint16& a) { return vfloat16(a); }
+
+ __forceinline vfloat16 operator +(const vfloat16& a) { return a; }
+ __forceinline vfloat16 operator -(const vfloat16& a) { return _mm512_mul_ps(a,vfloat16(-1)); }
+
+ __forceinline vfloat16 abs (const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x7FFFFFFF))); }
+ __forceinline vfloat16 signmsk(const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x80000000))); }
+
+ __forceinline vfloat16 rcp(const vfloat16& a) {
+ const vfloat16 r = _mm512_rcp14_ps(a);
+ return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vfloat16(2.0f)));
+ }
+
+ __forceinline vfloat16 sqr (const vfloat16& a) { return _mm512_mul_ps(a,a); }
+ __forceinline vfloat16 sqrt(const vfloat16& a) { return _mm512_sqrt_ps(a); }
+
+ __forceinline vfloat16 rsqrt(const vfloat16& a)
+ {
+ const vfloat16 r = _mm512_rsqrt14_ps(a);
+ return _mm512_fmadd_ps(_mm512_set1_ps(1.5f), r,
+ _mm512_mul_ps(_mm512_mul_ps(_mm512_mul_ps(a, _mm512_set1_ps(-0.5f)), r), _mm512_mul_ps(r, r)));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat16 operator +(const vfloat16& a, const vfloat16& b) { return _mm512_add_ps(a, b); }
+ __forceinline vfloat16 operator +(const vfloat16& a, float b) { return a + vfloat16(b); }
+ __forceinline vfloat16 operator +(float a, const vfloat16& b) { return vfloat16(a) + b; }
+
+ __forceinline vfloat16 operator -(const vfloat16& a, const vfloat16& b) { return _mm512_sub_ps(a, b); }
+ __forceinline vfloat16 operator -(const vfloat16& a, float b) { return a - vfloat16(b); }
+ __forceinline vfloat16 operator -(float a, const vfloat16& b) { return vfloat16(a) - b; }
+
+ __forceinline vfloat16 operator *(const vfloat16& a, const vfloat16& b) { return _mm512_mul_ps(a, b); }
+ __forceinline vfloat16 operator *(const vfloat16& a, float b) { return a * vfloat16(b); }
+ __forceinline vfloat16 operator *(float a, const vfloat16& b) { return vfloat16(a) * b; }
+
+ __forceinline vfloat16 operator /(const vfloat16& a, const vfloat16& b) { return _mm512_div_ps(a,b); }
+ __forceinline vfloat16 operator /(const vfloat16& a, float b) { return a/vfloat16(b); }
+ __forceinline vfloat16 operator /(float a, const vfloat16& b) { return vfloat16(a)/b; }
+
+ __forceinline vfloat16 operator &(const vfloat16& a, const vfloat16& b) { return _mm512_and_ps(a,b); }
+ __forceinline vfloat16 operator |(const vfloat16& a, const vfloat16& b) { return _mm512_or_ps(a,b); }
+ __forceinline vfloat16 operator ^(const vfloat16& a, const vfloat16& b) {
+ return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b)));
+ }
+
+ __forceinline vfloat16 min(const vfloat16& a, const vfloat16& b) { return _mm512_min_ps(a,b); }
+ __forceinline vfloat16 min(const vfloat16& a, float b) { return _mm512_min_ps(a,vfloat16(b)); }
+ __forceinline vfloat16 min(const float& a, const vfloat16& b) { return _mm512_min_ps(vfloat16(a),b); }
+
+ __forceinline vfloat16 max(const vfloat16& a, const vfloat16& b) { return _mm512_max_ps(a,b); }
+ __forceinline vfloat16 max(const vfloat16& a, float b) { return _mm512_max_ps(a,vfloat16(b)); }
+ __forceinline vfloat16 max(const float& a, const vfloat16& b) { return _mm512_max_ps(vfloat16(a),b); }
+
+ __forceinline vfloat16 mini(const vfloat16& a, const vfloat16& b) {
+ const vint16 ai = _mm512_castps_si512(a);
+ const vint16 bi = _mm512_castps_si512(b);
+ const vint16 ci = _mm512_min_epi32(ai,bi);
+ return _mm512_castsi512_ps(ci);
+ }
+
+ __forceinline vfloat16 maxi(const vfloat16& a, const vfloat16& b) {
+ const vint16 ai = _mm512_castps_si512(a);
+ const vint16 bi = _mm512_castps_si512(b);
+ const vint16 ci = _mm512_max_epi32(ai,bi);
+ return _mm512_castsi512_ps(ci);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Ternary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(a,b,c); }
+ __forceinline vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a,b,c); }
+ __forceinline vfloat16 nmadd(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(a,b,c); }
+ __forceinline vfloat16 nmsub(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmsub_ps(a,b,c); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat16& operator +=(vfloat16& a, const vfloat16& b) { return a = a + b; }
+ __forceinline vfloat16& operator +=(vfloat16& a, float b) { return a = a + b; }
+
+ __forceinline vfloat16& operator -=(vfloat16& a, const vfloat16& b) { return a = a - b; }
+ __forceinline vfloat16& operator -=(vfloat16& a, float b) { return a = a - b; }
+
+ __forceinline vfloat16& operator *=(vfloat16& a, const vfloat16& b) { return a = a * b; }
+ __forceinline vfloat16& operator *=(vfloat16& a, float b) { return a = a * b; }
+
+ __forceinline vfloat16& operator /=(vfloat16& a, const vfloat16& b) { return a = a / b; }
+ __forceinline vfloat16& operator /=(vfloat16& a, float b) { return a = a / b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf16 operator ==(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); }
+ __forceinline vboolf16 operator ==(const vfloat16& a, float b) { return a == vfloat16(b); }
+ __forceinline vboolf16 operator ==(float a, const vfloat16& b) { return vfloat16(a) == b; }
+
+ __forceinline vboolf16 operator !=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); }
+ __forceinline vboolf16 operator !=(const vfloat16& a, float b) { return a != vfloat16(b); }
+ __forceinline vboolf16 operator !=(float a, const vfloat16& b) { return vfloat16(a) != b; }
+
+ __forceinline vboolf16 operator < (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); }
+ __forceinline vboolf16 operator < (const vfloat16& a, float b) { return a < vfloat16(b); }
+ __forceinline vboolf16 operator < (float a, const vfloat16& b) { return vfloat16(a) < b; }
+
+ __forceinline vboolf16 operator >=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); }
+ __forceinline vboolf16 operator >=(const vfloat16& a, float b) { return a >= vfloat16(b); }
+ __forceinline vboolf16 operator >=(float a, const vfloat16& b) { return vfloat16(a) >= b; }
+
+ __forceinline vboolf16 operator > (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); }
+ __forceinline vboolf16 operator > (const vfloat16& a, float b) { return a > vfloat16(b); }
+ __forceinline vboolf16 operator > (float a, const vfloat16& b) { return vfloat16(a) > b; }
+
+ __forceinline vboolf16 operator <=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); }
+ __forceinline vboolf16 operator <=(const vfloat16& a, float b) { return a <= vfloat16(b); }
+ __forceinline vboolf16 operator <=(float a, const vfloat16& b) { return vfloat16(a) <= b; }
+
+ __forceinline vboolf16 eq(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); }
+ __forceinline vboolf16 ne(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); }
+ __forceinline vboolf16 lt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); }
+ __forceinline vboolf16 ge(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); }
+ __forceinline vboolf16 gt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); }
+ __forceinline vboolf16 le(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); }
+
+ __forceinline vboolf16 eq(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_EQ); }
+ __forceinline vboolf16 ne(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_NE); }
+ __forceinline vboolf16 lt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LT); }
+ __forceinline vboolf16 ge(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GE); }
+ __forceinline vboolf16 gt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GT); }
+ __forceinline vboolf16 le(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LE); }
+
+ __forceinline vfloat16 select(const vboolf16& s, const vfloat16& t, const vfloat16& f) {
+ return _mm512_mask_blend_ps(s, f, t);
+ }
+
+ __forceinline vfloat16 lerp(const vfloat16& a, const vfloat16& b, const vfloat16& t) {
+ return madd(t,b-a,a);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Rounding Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat16 floor(const vfloat16& a) {
+ return _mm512_floor_ps(a);
+ }
+ __forceinline vfloat16 ceil (const vfloat16& a) {
+ return _mm512_ceil_ps(a);
+ }
+ __forceinline vfloat16 round (const vfloat16& a) {
+ return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ }
+ __forceinline vint16 floori (const vfloat16& a) {
+ return _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Movement/Shifting/Shuffling Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat16 unpacklo(const vfloat16& a, const vfloat16& b) { return _mm512_unpacklo_ps(a, b); }
+ __forceinline vfloat16 unpackhi(const vfloat16& a, const vfloat16& b) { return _mm512_unpackhi_ps(a, b); }
+
+ template<int i>
+ __forceinline vfloat16 shuffle(const vfloat16& v) {
+ return _mm512_permute_ps(v, _MM_SHUFFLE(i, i, i, i));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vfloat16 shuffle(const vfloat16& v) {
+ return _mm512_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0));
+ }
+
+ template<int i>
+ __forceinline vfloat16 shuffle4(const vfloat16& v) {
+ return _mm512_shuffle_f32x4(v, v ,_MM_SHUFFLE(i, i, i, i));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vfloat16 shuffle4(const vfloat16& v) {
+ return _mm512_shuffle_f32x4(v, v, _MM_SHUFFLE(i3, i2, i1, i0));
+ }
+
+ __forceinline vfloat16 interleave4_even(const vfloat16& a, const vfloat16& b) {
+ return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a), mm512_int2mask(0xcc), _mm512_castps_si512(b), (_MM_PERM_ENUM)0x4e));
+ }
+
+ __forceinline vfloat16 interleave4_odd(const vfloat16& a, const vfloat16& b) {
+ return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b), mm512_int2mask(0x33), _mm512_castps_si512(a), (_MM_PERM_ENUM)0x4e));
+ }
+
+ __forceinline vfloat16 permute(vfloat16 v, __m512i index) {
+ return _mm512_castsi512_ps(_mm512_permutexvar_epi32(index, _mm512_castps_si512(v)));
+ }
+
+ __forceinline vfloat16 reverse(const vfloat16& v) {
+ return permute(v,_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0));
+ }
+
+ template<int i>
+ __forceinline vfloat16 align_shift_right(const vfloat16& a, const vfloat16& b) {
+ return _mm512_castsi512_ps(_mm512_alignr_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b),i));
+ };
+
+ template<int i>
+ __forceinline vfloat16 mask_align_shift_right(const vboolf16& mask, vfloat16& c, const vfloat16& a, const vfloat16& b) {
+ return _mm512_castsi512_ps(_mm512_mask_alignr_epi32(_mm512_castps_si512(c),mask,_mm512_castps_si512(a),_mm512_castps_si512(b),i));
+ };
+
+ __forceinline vfloat16 shift_left_1(const vfloat16& a) {
+ vfloat16 z = zero;
+ return mask_align_shift_right<15>(0xfffe,z,a,a);
+ }
+
+ __forceinline vfloat16 shift_right_1(const vfloat16& x) {
+ return align_shift_right<1>(zero,x);
+ }
+
+ __forceinline float toScalar(const vfloat16& v) { return mm512_cvtss_f32(v); }
+
+
+ template<int i> __forceinline vfloat16 insert4(const vfloat16& a, const vfloat4& b) { return _mm512_insertf32x4(a, b, i); }
+
+ template<int N, int i>
+ vfloat<N> extractN(const vfloat16& v);
+
+ template<> __forceinline vfloat4 extractN<4,0>(const vfloat16& v) { return _mm512_castps512_ps128(v); }
+ template<> __forceinline vfloat4 extractN<4,1>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 1); }
+ template<> __forceinline vfloat4 extractN<4,2>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 2); }
+ template<> __forceinline vfloat4 extractN<4,3>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 3); }
+
+ template<> __forceinline vfloat8 extractN<8,0>(const vfloat16& v) { return _mm512_castps512_ps256(v); }
+ template<> __forceinline vfloat8 extractN<8,1>(const vfloat16& v) { return _mm512_extractf32x8_ps(v, 1); }
+
+ template<int i> __forceinline vfloat4 extract4 (const vfloat16& v) { return _mm512_extractf32x4_ps(v, i); }
+ template<> __forceinline vfloat4 extract4<0>(const vfloat16& v) { return _mm512_castps512_ps128(v); }
+
+ template<int i> __forceinline vfloat8 extract8 (const vfloat16& v) { return _mm512_extractf32x8_ps(v, i); }
+ template<> __forceinline vfloat8 extract8<0>(const vfloat16& v) { return _mm512_castps512_ps256(v); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Transpose
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3,
+ vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3)
+ {
+ vfloat16 a0a2_b0b2 = unpacklo(r0, r2);
+ vfloat16 c0c2_d0d2 = unpackhi(r0, r2);
+ vfloat16 a1a3_b1b3 = unpacklo(r1, r3);
+ vfloat16 c1c3_d1d3 = unpackhi(r1, r3);
+
+ c0 = unpacklo(a0a2_b0b2, a1a3_b1b3);
+ c1 = unpackhi(a0a2_b0b2, a1a3_b1b3);
+ c2 = unpacklo(c0c2_d0d2, c1c3_d1d3);
+ c3 = unpackhi(c0c2_d0d2, c1c3_d1d3);
+ }
+
+ __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3,
+ const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7,
+ const vfloat4& r8, const vfloat4& r9, const vfloat4& r10, const vfloat4& r11,
+ const vfloat4& r12, const vfloat4& r13, const vfloat4& r14, const vfloat4& r15,
+ vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3)
+ {
+ return transpose(vfloat16(r0, r4, r8, r12), vfloat16(r1, r5, r9, r13), vfloat16(r2, r6, r10, r14), vfloat16(r3, r7, r11, r15),
+ c0, c1, c2, c3);
+ }
+
+ __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3,
+ const vfloat16& r4, const vfloat16& r5, const vfloat16& r6, const vfloat16& r7,
+ vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3,
+ vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7)
+ {
+ vfloat16 a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3;
+ transpose(r0, r1, r2, r3, a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3);
+
+ vfloat16 a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7;
+ transpose(r4, r5, r6, r7, a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7);
+
+ c0 = interleave4_even(a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7);
+ c1 = interleave4_even(b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7);
+ c2 = interleave4_even(c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7);
+ c3 = interleave4_even(d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7);
+ c4 = interleave4_odd (a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7);
+ c5 = interleave4_odd (b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7);
+ c6 = interleave4_odd (c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7);
+ c7 = interleave4_odd (d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7);
+ }
+
+ __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3,
+ const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7,
+ const vfloat8& r8, const vfloat8& r9, const vfloat8& r10, const vfloat8& r11,
+ const vfloat8& r12, const vfloat8& r13, const vfloat8& r14, const vfloat8& r15,
+ vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3,
+ vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7)
+ {
+ return transpose(vfloat16(r0, r8), vfloat16(r1, r9), vfloat16(r2, r10), vfloat16(r3, r11),
+ vfloat16(r4, r12), vfloat16(r5, r13), vfloat16(r6, r14), vfloat16(r7, r15),
+ c0, c1, c2, c3, c4, c5, c6, c7);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat16 vreduce_add2(vfloat16 x) { return x + shuffle<1,0,3,2>(x); }
+ __forceinline vfloat16 vreduce_add4(vfloat16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }
+ __forceinline vfloat16 vreduce_add8(vfloat16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); }
+ __forceinline vfloat16 vreduce_add (vfloat16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); }
+
+ __forceinline vfloat16 vreduce_min2(vfloat16 x) { return min(x, shuffle<1,0,3,2>(x)); }
+ __forceinline vfloat16 vreduce_min4(vfloat16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }
+ __forceinline vfloat16 vreduce_min8(vfloat16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); }
+ __forceinline vfloat16 vreduce_min (vfloat16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); }
+
+ __forceinline vfloat16 vreduce_max2(vfloat16 x) { return max(x, shuffle<1,0,3,2>(x)); }
+ __forceinline vfloat16 vreduce_max4(vfloat16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
+ __forceinline vfloat16 vreduce_max8(vfloat16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); }
+ __forceinline vfloat16 vreduce_max (vfloat16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); }
+
+ __forceinline float reduce_add(const vfloat16& v) { return toScalar(vreduce_add(v)); }
+ __forceinline float reduce_min(const vfloat16& v) { return toScalar(vreduce_min(v)); }
+ __forceinline float reduce_max(const vfloat16& v) { return toScalar(vreduce_max(v)); }
+
+ __forceinline size_t select_min(const vfloat16& v) {
+ return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_min(v)),_MM_CMPINT_EQ)));
+ }
+
+ __forceinline size_t select_max(const vfloat16& v) {
+ return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_max(v)),_MM_CMPINT_EQ)));
+ }
+
+ __forceinline size_t select_min(const vboolf16& valid, const vfloat16& v)
+ {
+ const vfloat16 a = select(valid,v,vfloat16(pos_inf));
+ const vbool16 valid_min = valid & (a == vreduce_min(a));
+ return bsf(movemask(any(valid_min) ? valid_min : valid));
+ }
+
+ __forceinline size_t select_max(const vboolf16& valid, const vfloat16& v)
+ {
+ const vfloat16 a = select(valid,v,vfloat16(neg_inf));
+ const vbool16 valid_max = valid & (a == vreduce_max(a));
+ return bsf(movemask(any(valid_max) ? valid_max : valid));
+ }
+
+ __forceinline vfloat16 prefix_sum(const vfloat16& a)
+ {
+ const vfloat16 z(zero);
+ vfloat16 v = a;
+ v = v + align_shift_right<16-1>(v,z);
+ v = v + align_shift_right<16-2>(v,z);
+ v = v + align_shift_right<16-4>(v,z);
+ v = v + align_shift_right<16-8>(v,z);
+ return v;
+ }
+
+ __forceinline vfloat16 reverse_prefix_sum(const vfloat16& a)
+ {
+ const vfloat16 z(zero);
+ vfloat16 v = a;
+ v = v + align_shift_right<1>(z,v);
+ v = v + align_shift_right<2>(z,v);
+ v = v + align_shift_right<4>(z,v);
+ v = v + align_shift_right<8>(z,v);
+ return v;
+ }
+
+ __forceinline vfloat16 prefix_min(const vfloat16& a)
+ {
+ const vfloat16 z(pos_inf);
+ vfloat16 v = a;
+ v = min(v,align_shift_right<16-1>(v,z));
+ v = min(v,align_shift_right<16-2>(v,z));
+ v = min(v,align_shift_right<16-4>(v,z));
+ v = min(v,align_shift_right<16-8>(v,z));
+ return v;
+ }
+
+ __forceinline vfloat16 prefix_max(const vfloat16& a)
+ {
+ const vfloat16 z(neg_inf);
+ vfloat16 v = a;
+ v = max(v,align_shift_right<16-1>(v,z));
+ v = max(v,align_shift_right<16-2>(v,z));
+ v = max(v,align_shift_right<16-4>(v,z));
+ v = max(v,align_shift_right<16-8>(v,z));
+ return v;
+ }
+
+
+ __forceinline vfloat16 reverse_prefix_min(const vfloat16& a)
+ {
+ const vfloat16 z(pos_inf);
+ vfloat16 v = a;
+ v = min(v,align_shift_right<1>(z,v));
+ v = min(v,align_shift_right<2>(z,v));
+ v = min(v,align_shift_right<4>(z,v));
+ v = min(v,align_shift_right<8>(z,v));
+ return v;
+ }
+
+ __forceinline vfloat16 reverse_prefix_max(const vfloat16& a)
+ {
+ const vfloat16 z(neg_inf);
+ vfloat16 v = a;
+ v = max(v,align_shift_right<1>(z,v));
+ v = max(v,align_shift_right<2>(z,v));
+ v = max(v,align_shift_right<4>(z,v));
+ v = max(v,align_shift_right<8>(z,v));
+ return v;
+ }
+
+ __forceinline vfloat16 rcp_safe(const vfloat16& a) {
+ return rcp(select(a != vfloat16(zero), a, vfloat16(min_rcp_input)));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat16& v)
+ {
+ cout << "<" << v[0];
+ for (int i=1; i<16; i++) cout << ", " << v[i];
+ cout << ">";
+ return cout;
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vfloat4_sse2.h b/thirdparty/embree/common/simd/vfloat4_sse2.h
new file mode 100644
index 0000000000..5215bf9730
--- /dev/null
+++ b/thirdparty/embree/common/simd/vfloat4_sse2.h
@@ -0,0 +1,722 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 4-wide SSE float type */
+ template<>
+ struct vfloat<4>
+ {
+ ALIGNED_STRUCT_(16);
+
+ typedef vboolf4 Bool;
+ typedef vint4 Int;
+ typedef vfloat4 Float;
+
+ enum { size = 4 }; // number of SIMD elements
+ union { __m128 v; float f[4]; int i[4]; }; // data
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat() {}
+ __forceinline vfloat(const vfloat4& other) { v = other.v; }
+ __forceinline vfloat4& operator =(const vfloat4& other) { v = other.v; return *this; }
+
+ __forceinline vfloat(__m128 a) : v(a) {}
+ __forceinline operator const __m128&() const { return v; }
+ __forceinline operator __m128&() { return v; }
+
+ __forceinline vfloat(float a) : v(_mm_set1_ps(a)) {}
+ __forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {}
+
+ __forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {}
+ __forceinline explicit vfloat(const vuint4& x) {
+ const __m128i a = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF));
+ const __m128i b = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31
+ const __m128 af = _mm_cvtepi32_ps(a);
+ const __m128 bf = _mm_castsi128_ps(b);
+ v = _mm_add_ps(af,bf);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat(ZeroTy) : v(_mm_setzero_ps()) {}
+ __forceinline vfloat(OneTy) : v(_mm_set1_ps(1.0f)) {}
+ __forceinline vfloat(PosInfTy) : v(_mm_set1_ps(pos_inf)) {}
+ __forceinline vfloat(NegInfTy) : v(_mm_set1_ps(neg_inf)) {}
+ __forceinline vfloat(StepTy) : v(_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)) {}
+ __forceinline vfloat(NaNTy) : v(_mm_set1_ps(nan)) {}
+ __forceinline vfloat(UndefinedTy) : v(_mm_undefined_ps()) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Loads and Stores
+ ////////////////////////////////////////////////////////////////////////////////
+
+ static __forceinline vfloat4 load (const void* a) { return _mm_load_ps((float*)a); }
+ static __forceinline vfloat4 loadu(const void* a) { return _mm_loadu_ps((float*)a); }
+
+ static __forceinline void store (void* ptr, const vfloat4& v) { _mm_store_ps((float*)ptr,v); }
+ static __forceinline void storeu(void* ptr, const vfloat4& v) { _mm_storeu_ps((float*)ptr,v); }
+
+#if defined(__AVX512VL__)
+
+ static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_ps (_mm_setzero_ps(),mask,(float*)ptr); }
+ static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_ps(_mm_setzero_ps(),mask,(float*)ptr); }
+
+ static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_store_ps ((float*)ptr,mask,v); }
+ static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_storeu_ps((float*)ptr,mask,v); }
+#elif defined(__AVX__)
+ static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); }
+ static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); }
+
+ static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); }
+ static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); }
+#else
+ static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_load_ps ((float*)ptr),mask); }
+ static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_loadu_ps((float*)ptr),mask); }
+
+ static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { store (ptr,select(mask,v,load (ptr))); }
+ static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { storeu(ptr,select(mask,v,loadu(ptr))); }
+#endif
+
+#if defined(__AVX__)
+ static __forceinline vfloat4 broadcast(const void* a) { return _mm_broadcast_ss((float*)a); }
+#else
+ static __forceinline vfloat4 broadcast(const void* a) { return _mm_set1_ps(*(float*)a); }
+#endif
+
+ static __forceinline vfloat4 load_nt (const float* ptr) {
+#if defined (__SSE4_1__)
+ return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)ptr));
+#else
+ return _mm_load_ps(ptr);
+#endif
+ }
+
+#if defined(__SSE4_1__)
+ static __forceinline vfloat4 load(const char* ptr) {
+ return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr)));
+ }
+#else
+ static __forceinline vfloat4 load(const char* ptr) {
+ return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]);
+ }
+#endif
+
+#if defined(__SSE4_1__)
+ static __forceinline vfloat4 load(const unsigned char* ptr) {
+ return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)));
+ }
+#else
+ static __forceinline vfloat4 load(const unsigned char* ptr) {
+ //return _mm_cvtpu8_ps(*(__m64*)ptr); // don't enable, will use MMX instructions
+ return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]);
+ }
+#endif
+
+#if defined(__SSE4_1__)
+ static __forceinline vfloat4 load(const short* ptr) {
+ return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr)));
+ }
+#else
+ static __forceinline vfloat4 load(const short* ptr) {
+ return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]);
+ }
+#endif
+
+ static __forceinline vfloat4 load(const unsigned short* ptr) {
+ return _mm_mul_ps(vfloat4(vint4::load(ptr)),vfloat4(1.0f/65535.0f));
+ }
+
+ static __forceinline void store_nt(void* ptr, const vfloat4& v)
+ {
+#if defined (__SSE4_1__)
+ _mm_stream_ps((float*)ptr,v);
+#else
+ _mm_store_ps((float*)ptr,v);
+#endif
+ }
+
+ template<int scale = 4>
+ static __forceinline vfloat4 gather(const float* ptr, const vint4& index) {
+#if defined(__AVX2__)
+ return _mm_i32gather_ps(ptr, index, scale);
+#else
+ return vfloat4(
+ *(float*)(((char*)ptr)+scale*index[0]),
+ *(float*)(((char*)ptr)+scale*index[1]),
+ *(float*)(((char*)ptr)+scale*index[2]),
+ *(float*)(((char*)ptr)+scale*index[3]));
+#endif
+ }
+
+ template<int scale = 4>
+ static __forceinline vfloat4 gather(const vboolf4& mask, const float* ptr, const vint4& index) {
+ vfloat4 r = zero;
+#if defined(__AVX512VL__)
+ return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale);
+#elif defined(__AVX2__)
+ return _mm_mask_i32gather_ps(r, ptr, index, mask, scale);
+#else
+ if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]);
+ if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]);
+ if (likely(mask[2])) r[2] = *(float*)(((char*)ptr)+scale*index[2]);
+ if (likely(mask[3])) r[3] = *(float*)(((char*)ptr)+scale*index[3]);
+ return r;
+#endif
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(void* ptr, const vint4& index, const vfloat4& v)
+ {
+#if defined(__AVX512VL__)
+ _mm_i32scatter_ps((float*)ptr, index, v, scale);
+#else
+ *(float*)(((char*)ptr)+scale*index[0]) = v[0];
+ *(float*)(((char*)ptr)+scale*index[1]) = v[1];
+ *(float*)(((char*)ptr)+scale*index[2]) = v[2];
+ *(float*)(((char*)ptr)+scale*index[3]) = v[3];
+#endif
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vfloat4& v)
+ {
+#if defined(__AVX512VL__)
+ _mm_mask_i32scatter_ps((float*)ptr ,mask, index, v, scale);
+#else
+ if (likely(mask[0])) *(float*)(((char*)ptr)+scale*index[0]) = v[0];
+ if (likely(mask[1])) *(float*)(((char*)ptr)+scale*index[1]) = v[1];
+ if (likely(mask[2])) *(float*)(((char*)ptr)+scale*index[2]) = v[2];
+ if (likely(mask[3])) *(float*)(((char*)ptr)+scale*index[3]) = v[3];
+#endif
+ }
+
+ static __forceinline void store(const vboolf4& mask, char* ptr, const vint4& ofs, const vfloat4& v) {
+ scatter<1>(mask,ptr,ofs,v);
+ }
+ static __forceinline void store(const vboolf4& mask, float* ptr, const vint4& ofs, const vfloat4& v) {
+ scatter<4>(mask,ptr,ofs,v);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const float& operator [](size_t index) const { assert(index < 4); return f[index]; }
+ __forceinline float& operator [](size_t index) { assert(index < 4); return f[index]; }
+
+ friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) {
+#if defined(__AVX512VL__)
+ return _mm_mask_blend_ps(m, f, t);
+#elif defined(__SSE4_1__)
+ return _mm_blendv_ps(f, t, m);
+#else
+ return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
+#endif
+ }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Load/Store
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<> struct mem<vfloat4>
+ {
+ static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return vfloat4::load (mask,ptr); }
+ static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return vfloat4::loadu(mask,ptr); }
+
+ static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { vfloat4::store (mask,ptr,v); }
+ static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { vfloat4::storeu(mask,ptr,v); }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat4 asFloat(const vint4& a) { return _mm_castsi128_ps(a); }
+ __forceinline vint4 asInt (const vfloat4& a) { return _mm_castps_si128(a); }
+ __forceinline vuint4 asUInt (const vfloat4& a) { return _mm_castps_si128(a); }
+
+ __forceinline vint4 toInt (const vfloat4& a) { return vint4(a); }
+ __forceinline vfloat4 toFloat(const vint4& a) { return vfloat4(a); }
+
+ __forceinline vfloat4 operator +(const vfloat4& a) { return a; }
+ __forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
+
+ __forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); }
+#if defined(__AVX512VL__)
+ __forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); }
+#else
+ __forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); }
+#endif
+ __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
+
+ __forceinline vfloat4 rcp(const vfloat4& a)
+ {
+#if defined(__AVX512VL__)
+ const vfloat4 r = _mm_rcp14_ps(a);
+#else
+ const vfloat4 r = _mm_rcp_ps(a);
+#endif
+
+#if defined(__AVX2__)
+ return _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f)));
+#else
+ return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a)));
+#endif
+ }
+ __forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); }
+ __forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); }
+
+ __forceinline vfloat4 rsqrt(const vfloat4& a)
+ {
+#if defined(__AVX512VL__)
+ vfloat4 r = _mm_rsqrt14_ps(a);
+#else
+ vfloat4 r = _mm_rsqrt_ps(a);
+#endif
+
+#if defined(__ARM_NEON)
+ r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+ r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#elif defined(__AVX2__)
+ r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#else
+ r = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#endif
+ return r;
+ }
+
+ __forceinline vboolf4 isnan(const vfloat4& a) {
+ const vfloat4 b = _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
+#if defined(__AVX512VL__)
+ return _mm_cmp_epi32_mask(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000), _MM_CMPINT_GT);
+#else
+ return _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000)));
+#endif
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat4 operator +(const vfloat4& a, const vfloat4& b) { return _mm_add_ps(a, b); }
+ __forceinline vfloat4 operator +(const vfloat4& a, float b) { return a + vfloat4(b); }
+ __forceinline vfloat4 operator +(float a, const vfloat4& b) { return vfloat4(a) + b; }
+
+ __forceinline vfloat4 operator -(const vfloat4& a, const vfloat4& b) { return _mm_sub_ps(a, b); }
+ __forceinline vfloat4 operator -(const vfloat4& a, float b) { return a - vfloat4(b); }
+ __forceinline vfloat4 operator -(float a, const vfloat4& b) { return vfloat4(a) - b; }
+
+ __forceinline vfloat4 operator *(const vfloat4& a, const vfloat4& b) { return _mm_mul_ps(a, b); }
+ __forceinline vfloat4 operator *(const vfloat4& a, float b) { return a * vfloat4(b); }
+ __forceinline vfloat4 operator *(float a, const vfloat4& b) { return vfloat4(a) * b; }
+
+ __forceinline vfloat4 operator /(const vfloat4& a, const vfloat4& b) { return _mm_div_ps(a,b); }
+ __forceinline vfloat4 operator /(const vfloat4& a, float b) { return a/vfloat4(b); }
+ __forceinline vfloat4 operator /(float a, const vfloat4& b) { return vfloat4(a)/b; }
+
+ __forceinline vfloat4 operator &(const vfloat4& a, const vfloat4& b) { return _mm_and_ps(a,b); }
+ __forceinline vfloat4 operator |(const vfloat4& a, const vfloat4& b) { return _mm_or_ps(a,b); }
+ __forceinline vfloat4 operator ^(const vfloat4& a, const vfloat4& b) { return _mm_xor_ps(a,b); }
+ __forceinline vfloat4 operator ^(const vfloat4& a, const vint4& b) { return _mm_xor_ps(a,_mm_castsi128_ps(b)); }
+
+ __forceinline vfloat4 min(const vfloat4& a, const vfloat4& b) { return _mm_min_ps(a,b); }
+ __forceinline vfloat4 min(const vfloat4& a, float b) { return _mm_min_ps(a,vfloat4(b)); }
+ __forceinline vfloat4 min(float a, const vfloat4& b) { return _mm_min_ps(vfloat4(a),b); }
+
+ __forceinline vfloat4 max(const vfloat4& a, const vfloat4& b) { return _mm_max_ps(a,b); }
+ __forceinline vfloat4 max(const vfloat4& a, float b) { return _mm_max_ps(a,vfloat4(b)); }
+ __forceinline vfloat4 max(float a, const vfloat4& b) { return _mm_max_ps(vfloat4(a),b); }
+
+#if defined(__SSE4_1__)
+ __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) {
+ const vint4 ai = _mm_castps_si128(a);
+ const vint4 bi = _mm_castps_si128(b);
+ const vint4 ci = _mm_min_epi32(ai,bi);
+ return _mm_castsi128_ps(ci);
+ }
+
+ __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) {
+ const vint4 ai = _mm_castps_si128(a);
+ const vint4 bi = _mm_castps_si128(b);
+ const vint4 ci = _mm_max_epi32(ai,bi);
+ return _mm_castsi128_ps(ci);
+ }
+
+ __forceinline vfloat4 minui(const vfloat4& a, const vfloat4& b) {
+ const vint4 ai = _mm_castps_si128(a);
+ const vint4 bi = _mm_castps_si128(b);
+ const vint4 ci = _mm_min_epu32(ai,bi);
+ return _mm_castsi128_ps(ci);
+ }
+
+ __forceinline vfloat4 maxui(const vfloat4& a, const vfloat4& b) {
+ const vint4 ai = _mm_castps_si128(a);
+ const vint4 bi = _mm_castps_si128(b);
+ const vint4 ci = _mm_max_epu32(ai,bi);
+ return _mm_castsi128_ps(ci);
+ }
+#else
+ __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) {
+ return min(a,b);
+ }
+
+ __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) {
+ return max(a,b);
+ }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Ternary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__) || defined(__ARM_NEON)
+ __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmadd_ps(a,b,c); }
+ __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmsub_ps(a,b,c); }
+ __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmadd_ps(a,b,c); }
+ __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); }
+#else
+ __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; }
+ __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; }
+ __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;}
+ __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b-c; }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat4& operator +=(vfloat4& a, const vfloat4& b) { return a = a + b; }
+ __forceinline vfloat4& operator +=(vfloat4& a, float b) { return a = a + b; }
+
+ __forceinline vfloat4& operator -=(vfloat4& a, const vfloat4& b) { return a = a - b; }
+ __forceinline vfloat4& operator -=(vfloat4& a, float b) { return a = a - b; }
+
+ __forceinline vfloat4& operator *=(vfloat4& a, const vfloat4& b) { return a = a * b; }
+ __forceinline vfloat4& operator *=(vfloat4& a, float b) { return a = a * b; }
+
+ __forceinline vfloat4& operator /=(vfloat4& a, const vfloat4& b) { return a = a / b; }
+ __forceinline vfloat4& operator /=(vfloat4& a, float b) { return a = a / b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+ __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_EQ); }
+ __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_NE); }
+ __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LT); }
+ __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GE); }
+ __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GT); }
+ __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LE); }
+#else
+ __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); }
+ __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a, b); }
+ __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); }
+ __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); }
+ __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); }
+ __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); }
+#endif
+
+ __forceinline vboolf4 operator ==(const vfloat4& a, float b) { return a == vfloat4(b); }
+ __forceinline vboolf4 operator ==(float a, const vfloat4& b) { return vfloat4(a) == b; }
+
+ __forceinline vboolf4 operator !=(const vfloat4& a, float b) { return a != vfloat4(b); }
+ __forceinline vboolf4 operator !=(float a, const vfloat4& b) { return vfloat4(a) != b; }
+
+ __forceinline vboolf4 operator < (const vfloat4& a, float b) { return a < vfloat4(b); }
+ __forceinline vboolf4 operator < (float a, const vfloat4& b) { return vfloat4(a) < b; }
+
+ __forceinline vboolf4 operator >=(const vfloat4& a, float b) { return a >= vfloat4(b); }
+ __forceinline vboolf4 operator >=(float a, const vfloat4& b) { return vfloat4(a) >= b; }
+
+ __forceinline vboolf4 operator > (const vfloat4& a, float b) { return a > vfloat4(b); }
+ __forceinline vboolf4 operator > (float a, const vfloat4& b) { return vfloat4(a) > b; }
+
+ __forceinline vboolf4 operator <=(const vfloat4& a, float b) { return a <= vfloat4(b); }
+ __forceinline vboolf4 operator <=(float a, const vfloat4& b) { return vfloat4(a) <= b; }
+
+ __forceinline vboolf4 eq(const vfloat4& a, const vfloat4& b) { return a == b; }
+ __forceinline vboolf4 ne(const vfloat4& a, const vfloat4& b) { return a != b; }
+ __forceinline vboolf4 lt(const vfloat4& a, const vfloat4& b) { return a < b; }
+ __forceinline vboolf4 ge(const vfloat4& a, const vfloat4& b) { return a >= b; }
+ __forceinline vboolf4 gt(const vfloat4& a, const vfloat4& b) { return a > b; }
+ __forceinline vboolf4 le(const vfloat4& a, const vfloat4& b) { return a <= b; }
+
+#if defined(__AVX512VL__)
+ __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); }
+ __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); }
+ __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); }
+ __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); }
+ __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); }
+ __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+ __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a == b); }
+ __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a != b); }
+ __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a < b); }
+ __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a >= b); }
+ __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a > b); }
+ __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a <= b); }
+#endif
+
+ template<int mask>
+ __forceinline vfloat4 select(const vfloat4& t, const vfloat4& f)
+ {
+#if defined(__SSE4_1__)
+ return _mm_blend_ps(f, t, mask);
+#else
+ return select(vboolf4(mask), t, f);
+#endif
+ }
+
+ __forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t) {
+ return madd(t,b-a,a);
+ }
+
+ __forceinline bool isvalid(const vfloat4& v) {
+ return all((v > vfloat4(-FLT_LARGE)) & (v < vfloat4(+FLT_LARGE)));
+ }
+
+ __forceinline bool is_finite(const vfloat4& a) {
+ return all((a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX)));
+ }
+
+ __forceinline bool is_finite(const vboolf4& valid, const vfloat4& a) {
+ return all(valid, (a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX)));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Rounding Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__)
+ __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); }
+ __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); }
+ __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); }
+ __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); }
+#elif defined (__SSE4_1__)
+ __forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
+ __forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
+ __forceinline vfloat4 trunc(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO ); }
+ __forceinline vfloat4 round(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
+#else
+ __forceinline vfloat4 floor(const vfloat4& a) { return vfloat4(floorf(a[0]),floorf(a[1]),floorf(a[2]),floorf(a[3])); }
+ __forceinline vfloat4 ceil (const vfloat4& a) { return vfloat4(ceilf (a[0]),ceilf (a[1]),ceilf (a[2]),ceilf (a[3])); }
+ __forceinline vfloat4 trunc(const vfloat4& a) { return vfloat4(truncf(a[0]),truncf(a[1]),truncf(a[2]),truncf(a[3])); }
+ __forceinline vfloat4 round(const vfloat4& a) { return vfloat4(roundf(a[0]),roundf(a[1]),roundf(a[2]),roundf(a[3])); }
+#endif
+ __forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); }
+
+ __forceinline vint4 floori(const vfloat4& a) {
+#if defined(__SSE4_1__)
+ return vint4(floor(a));
+#else
+ return vint4(a-vfloat4(0.5f));
+#endif
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Movement/Shifting/Shuffling Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a, b); }
+ __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vfloat4 shuffle(const vfloat4& v) {
+ return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) {
+ return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+ }
+
+#if defined(__SSE3__)
+ template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); }
+ template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); }
+ template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); }
+#endif
+
+ template<int i>
+ __forceinline vfloat4 shuffle(const vfloat4& v) {
+ return shuffle<i,i,i,i>(v);
+ }
+
+ template<int i> __forceinline float extract (const vfloat4& a) { return _mm_cvtss_f32(shuffle<i>(a)); }
+ template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); }
+
+#if defined (__SSE4_1__)
+ template<int dst, int src, int clr> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
+ template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return insert<dst, src, 0>(a, b); }
+ template<int dst> __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert<dst, 0>(a, _mm_set_ss(b)); }
+#else
+ template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { vfloat4 c = a; c[dst&3] = b[src&3]; return c; }
+ template<int dst> __forceinline vfloat4 insert(const vfloat4& a, float b) { vfloat4 c = a; c[dst&3] = b; return c; }
+#endif
+
+ __forceinline float toScalar(const vfloat4& v) { return _mm_cvtss_f32(v); }
+
+ __forceinline vfloat4 shift_right_1(const vfloat4& x) {
+ return _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(x), 4));
+ }
+
+#if defined (__AVX2__)
+ __forceinline vfloat4 permute(const vfloat4 &a, const __m128i &index) {
+ return _mm_permutevar_ps(a,index);
+ }
+
+ __forceinline vfloat4 broadcast1f(const void* a) { return _mm_broadcast_ss((float*)a); }
+
+#endif
+
+#if defined(__AVX512VL__)
+ template<int i>
+ __forceinline vfloat4 align_shift_right(const vfloat4& a, const vfloat4& b) {
+ return _mm_castsi128_ps(_mm_alignr_epi32(_mm_castps_si128(a), _mm_castps_si128(b), i));
+ }
+#endif
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Sorting Network
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat4 sort_ascending(const vfloat4& v)
+ {
+ const vfloat4 a0 = v;
+ const vfloat4 b0 = shuffle<1,0,3,2>(a0);
+ const vfloat4 c0 = min(a0,b0);
+ const vfloat4 d0 = max(a0,b0);
+ const vfloat4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
+ const vfloat4 b1 = shuffle<2,3,0,1>(a1);
+ const vfloat4 c1 = min(a1,b1);
+ const vfloat4 d1 = max(a1,b1);
+ const vfloat4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
+ const vfloat4 b2 = shuffle<0,2,1,3>(a2);
+ const vfloat4 c2 = min(a2,b2);
+ const vfloat4 d2 = max(a2,b2);
+ const vfloat4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
+ return a3;
+ }
+
+ __forceinline vfloat4 sort_descending(const vfloat4& v)
+ {
+ const vfloat4 a0 = v;
+ const vfloat4 b0 = shuffle<1,0,3,2>(a0);
+ const vfloat4 c0 = max(a0,b0);
+ const vfloat4 d0 = min(a0,b0);
+ const vfloat4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
+ const vfloat4 b1 = shuffle<2,3,0,1>(a1);
+ const vfloat4 c1 = max(a1,b1);
+ const vfloat4 d1 = min(a1,b1);
+ const vfloat4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
+ const vfloat4 b2 = shuffle<0,2,1,3>(a2);
+ const vfloat4 c2 = max(a2,b2);
+ const vfloat4 d2 = min(a2,b2);
+ const vfloat4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
+ return a3;
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Transpose
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2, vfloat4& c3)
+ {
+ vfloat4 l02 = unpacklo(r0,r2);
+ vfloat4 h02 = unpackhi(r0,r2);
+ vfloat4 l13 = unpacklo(r1,r3);
+ vfloat4 h13 = unpackhi(r1,r3);
+ c0 = unpacklo(l02,l13);
+ c1 = unpackhi(l02,l13);
+ c2 = unpacklo(h02,h13);
+ c3 = unpackhi(h02,h13);
+ }
+
+ __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2)
+ {
+ vfloat4 l02 = unpacklo(r0,r2);
+ vfloat4 h02 = unpackhi(r0,r2);
+ vfloat4 l13 = unpacklo(r1,r3);
+ vfloat4 h13 = unpackhi(r1,r3);
+ c0 = unpacklo(l02,l13);
+ c1 = unpackhi(l02,l13);
+ c2 = unpacklo(h02,h13);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
+ __forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
+ __forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; }
+
+ __forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); }
+ __forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); }
+ __forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); }
+
+ __forceinline size_t select_min(const vboolf4& valid, const vfloat4& v)
+ {
+ const vfloat4 a = select(valid,v,vfloat4(pos_inf));
+ const vbool4 valid_min = valid & (a == vreduce_min(a));
+ return bsf(movemask(any(valid_min) ? valid_min : valid));
+ }
+ __forceinline size_t select_max(const vboolf4& valid, const vfloat4& v)
+ {
+ const vfloat4 a = select(valid,v,vfloat4(neg_inf));
+ const vbool4 valid_max = valid & (a == vreduce_max(a));
+ return bsf(movemask(any(valid_max) ? valid_max : valid));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Euclidian Space Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline float dot(const vfloat4& a, const vfloat4& b) {
+ return reduce_add(a*b);
+ }
+
+ __forceinline vfloat4 cross(const vfloat4& a, const vfloat4& b)
+ {
+ const vfloat4 a0 = a;
+ const vfloat4 b0 = shuffle<1,2,0,3>(b);
+ const vfloat4 a1 = shuffle<1,2,0,3>(a);
+ const vfloat4 b1 = b;
+ return shuffle<1,2,0,3>(msub(a0,b0,a1*b1));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat4& a) {
+ return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
+ }
+
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vfloat8_avx.h b/thirdparty/embree/common/simd/vfloat8_avx.h
new file mode 100644
index 0000000000..13446454e8
--- /dev/null
+++ b/thirdparty/embree/common/simd/vfloat8_avx.h
@@ -0,0 +1,758 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 8-wide AVX float type */
+ template<>
+ struct vfloat<8>
+ {
+ ALIGNED_STRUCT_(32);
+
+ typedef vboolf8 Bool;
+ typedef vint8 Int;
+ typedef vfloat8 Float;
+
+ enum { size = 8 }; // number of SIMD elements
+ union { __m256 v; float f[8]; int i[8]; }; // data
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat() {}
+ __forceinline vfloat(const vfloat8& other) { v = other.v; }
+ __forceinline vfloat8& operator =(const vfloat8& other) { v = other.v; return *this; }
+
+ __forceinline vfloat(__m256 a) : v(a) {}
+ __forceinline operator const __m256&() const { return v; }
+ __forceinline operator __m256&() { return v; }
+
+ __forceinline explicit vfloat(const vfloat4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {}
+ __forceinline vfloat(const vfloat4& a, const vfloat4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {}
+
+ __forceinline explicit vfloat(const char* a) : v(_mm256_loadu_ps((const float*)a)) {}
+ __forceinline vfloat(float a) : v(_mm256_set1_ps(a)) {}
+ __forceinline vfloat(float a, float b) : v(_mm256_set_ps(b, a, b, a, b, a, b, a)) {}
+ __forceinline vfloat(float a, float b, float c, float d) : v(_mm256_set_ps(d, c, b, a, d, c, b, a)) {}
+ __forceinline vfloat(float a, float b, float c, float d, float e, float f, float g, float h) : v(_mm256_set_ps(h, g, f, e, d, c, b, a)) {}
+
+ __forceinline explicit vfloat(__m256i a) : v(_mm256_cvtepi32_ps(a)) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat(ZeroTy) : v(_mm256_setzero_ps()) {}
+ __forceinline vfloat(OneTy) : v(_mm256_set1_ps(1.0f)) {}
+ __forceinline vfloat(PosInfTy) : v(_mm256_set1_ps(pos_inf)) {}
+ __forceinline vfloat(NegInfTy) : v(_mm256_set1_ps(neg_inf)) {}
+ __forceinline vfloat(StepTy) : v(_mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)) {}
+ __forceinline vfloat(NaNTy) : v(_mm256_set1_ps(nan)) {}
+ __forceinline vfloat(UndefinedTy) : v(_mm256_undefined_ps()) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Loads and Stores
+ ////////////////////////////////////////////////////////////////////////////////
+
+ static __forceinline vfloat8 broadcast(const void* a) {
+ return _mm256_broadcast_ss((float*)a);
+ }
+
+ static __forceinline vfloat8 load(const char* ptr) {
+#if defined(__AVX2__)
+ return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr)));
+#else
+ return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4));
+#endif
+ }
+
+ static __forceinline vfloat8 load(const unsigned char* ptr) {
+#if defined(__AVX2__)
+ return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)));
+#else
+ return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4));
+#endif
+ }
+
+ static __forceinline vfloat8 load(const short* ptr) {
+#if defined(__AVX2__)
+ return _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr)));
+#else
+ return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4));
+#endif
+ }
+
+ static __forceinline vfloat8 load (const void* ptr) { return _mm256_load_ps((float*)ptr); }
+ static __forceinline vfloat8 loadu(const void* ptr) { return _mm256_loadu_ps((float*)ptr); }
+
+ static __forceinline void store (void* ptr, const vfloat8& v) { return _mm256_store_ps((float*)ptr,v); }
+ static __forceinline void storeu(void* ptr, const vfloat8& v) { return _mm256_storeu_ps((float*)ptr,v); }
+
+#if defined(__AVX512VL__)
+
+ static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_ps (_mm256_setzero_ps(),mask,(float*)ptr); }
+ static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_ps(_mm256_setzero_ps(),mask,(float*)ptr); }
+
+ static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask,v); }
+ static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask,v); }
+#else
+ static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); }
+ static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); }
+
+ static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); }
+ static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); }
+#endif
+
+#if defined(__AVX2__)
+ static __forceinline vfloat8 load_nt(void* ptr) {
+ return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i*)ptr));
+ }
+#endif
+
+ static __forceinline void store_nt(void* ptr, const vfloat8& v) {
+ _mm256_stream_ps((float*)ptr,v);
+ }
+
+ template<int scale = 4>
+ static __forceinline vfloat8 gather(const float* ptr, const vint8& index) {
+#if defined(__AVX2__)
+ return _mm256_i32gather_ps(ptr, index ,scale);
+#else
+ return vfloat8(
+ *(float*)(((char*)ptr)+scale*index[0]),
+ *(float*)(((char*)ptr)+scale*index[1]),
+ *(float*)(((char*)ptr)+scale*index[2]),
+ *(float*)(((char*)ptr)+scale*index[3]),
+ *(float*)(((char*)ptr)+scale*index[4]),
+ *(float*)(((char*)ptr)+scale*index[5]),
+ *(float*)(((char*)ptr)+scale*index[6]),
+ *(float*)(((char*)ptr)+scale*index[7]));
+#endif
+ }
+
+ template<int scale = 4>
+ static __forceinline vfloat8 gather(const vboolf8& mask, const float* ptr, const vint8& index) {
+ vfloat8 r = zero;
+#if defined(__AVX512VL__)
+ return _mm256_mmask_i32gather_ps(r, mask, index, ptr, scale);
+#elif defined(__AVX2__)
+ return _mm256_mask_i32gather_ps(r, ptr, index, mask, scale);
+#else
+ if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]);
+ if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]);
+ if (likely(mask[2])) r[2] = *(float*)(((char*)ptr)+scale*index[2]);
+ if (likely(mask[3])) r[3] = *(float*)(((char*)ptr)+scale*index[3]);
+ if (likely(mask[4])) r[4] = *(float*)(((char*)ptr)+scale*index[4]);
+ if (likely(mask[5])) r[5] = *(float*)(((char*)ptr)+scale*index[5]);
+ if (likely(mask[6])) r[6] = *(float*)(((char*)ptr)+scale*index[6]);
+ if (likely(mask[7])) r[7] = *(float*)(((char*)ptr)+scale*index[7]);
+ return r;
+ #endif
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(void* ptr, const vint8& ofs, const vfloat8& v)
+ {
+#if defined(__AVX512VL__)
+ _mm256_i32scatter_ps((float*)ptr, ofs, v, scale);
+#else
+ *(float*)(((char*)ptr)+scale*ofs[0]) = v[0];
+ *(float*)(((char*)ptr)+scale*ofs[1]) = v[1];
+ *(float*)(((char*)ptr)+scale*ofs[2]) = v[2];
+ *(float*)(((char*)ptr)+scale*ofs[3]) = v[3];
+ *(float*)(((char*)ptr)+scale*ofs[4]) = v[4];
+ *(float*)(((char*)ptr)+scale*ofs[5]) = v[5];
+ *(float*)(((char*)ptr)+scale*ofs[6]) = v[6];
+ *(float*)(((char*)ptr)+scale*ofs[7]) = v[7];
+#endif
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vfloat8& v)
+ {
+#if defined(__AVX512VL__)
+ _mm256_mask_i32scatter_ps((float*)ptr, mask, ofs, v, scale);
+#else
+ if (likely(mask[0])) *(float*)(((char*)ptr)+scale*ofs[0]) = v[0];
+ if (likely(mask[1])) *(float*)(((char*)ptr)+scale*ofs[1]) = v[1];
+ if (likely(mask[2])) *(float*)(((char*)ptr)+scale*ofs[2]) = v[2];
+ if (likely(mask[3])) *(float*)(((char*)ptr)+scale*ofs[3]) = v[3];
+ if (likely(mask[4])) *(float*)(((char*)ptr)+scale*ofs[4]) = v[4];
+ if (likely(mask[5])) *(float*)(((char*)ptr)+scale*ofs[5]) = v[5];
+ if (likely(mask[6])) *(float*)(((char*)ptr)+scale*ofs[6]) = v[6];
+ if (likely(mask[7])) *(float*)(((char*)ptr)+scale*ofs[7]) = v[7];
+#endif
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const float& operator [](size_t index) const { assert(index < 8); return f[index]; }
+ __forceinline float& operator [](size_t index) { assert(index < 8); return f[index]; }
+ };
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat8 asFloat(const vint8& a) { return _mm256_castsi256_ps(a); }
+ __forceinline vint8 asInt (const vfloat8& a) { return _mm256_castps_si256(a); }
+
+ __forceinline vint8 toInt (const vfloat8& a) { return vint8(a); }
+ __forceinline vfloat8 toFloat(const vint8& a) { return vfloat8(a); }
+
+ __forceinline vfloat8 operator +(const vfloat8& a) { return a; }
+ __forceinline vfloat8 operator -(const vfloat8& a) {
+ const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
+ return _mm256_xor_ps(a, mask);
+ }
+ __forceinline vfloat8 abs(const vfloat8& a) {
+ const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
+ return _mm256_and_ps(a, mask);
+ }
+ __forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmp_ps(a, vfloat8(zero), _CMP_NGE_UQ)); }
+ __forceinline vfloat8 signmsk(const vfloat8& a) { return _mm256_and_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); }
+
+
+ static __forceinline vfloat8 rcp(const vfloat8& a)
+ {
+#if defined(__AVX512VL__)
+ const vfloat8 r = _mm256_rcp14_ps(a);
+#else
+ const vfloat8 r = _mm256_rcp_ps(a);
+#endif
+
+#if defined(__AVX2__)
+ return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vfloat8(2.0f)));
+#else
+ return _mm256_mul_ps(r, _mm256_sub_ps(vfloat8(2.0f), _mm256_mul_ps(r, a)));
+#endif
+ }
+ __forceinline vfloat8 sqr (const vfloat8& a) { return _mm256_mul_ps(a,a); }
+ __forceinline vfloat8 sqrt(const vfloat8& a) { return _mm256_sqrt_ps(a); }
+
+ static __forceinline vfloat8 rsqrt(const vfloat8& a)
+ {
+#if defined(__AVX512VL__)
+ const vfloat8 r = _mm256_rsqrt14_ps(a);
+#else
+ const vfloat8 r = _mm256_rsqrt_ps(a);
+#endif
+
+#if defined(__AVX2__)
+ return _mm256_fmadd_ps(_mm256_set1_ps(1.5f), r,
+ _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r)));
+#else
+ return _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(1.5f), r),
+ _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r)));
+#endif
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat8 operator +(const vfloat8& a, const vfloat8& b) { return _mm256_add_ps(a, b); }
+ __forceinline vfloat8 operator +(const vfloat8& a, float b) { return a + vfloat8(b); }
+ __forceinline vfloat8 operator +(float a, const vfloat8& b) { return vfloat8(a) + b; }
+
+ __forceinline vfloat8 operator -(const vfloat8& a, const vfloat8& b) { return _mm256_sub_ps(a, b); }
+ __forceinline vfloat8 operator -(const vfloat8& a, float b) { return a - vfloat8(b); }
+ __forceinline vfloat8 operator -(float a, const vfloat8& b) { return vfloat8(a) - b; }
+
+ __forceinline vfloat8 operator *(const vfloat8& a, const vfloat8& b) { return _mm256_mul_ps(a, b); }
+ __forceinline vfloat8 operator *(const vfloat8& a, float b) { return a * vfloat8(b); }
+ __forceinline vfloat8 operator *(float a, const vfloat8& b) { return vfloat8(a) * b; }
+
+ __forceinline vfloat8 operator /(const vfloat8& a, const vfloat8& b) { return _mm256_div_ps(a, b); }
+ __forceinline vfloat8 operator /(const vfloat8& a, float b) { return a / vfloat8(b); }
+ __forceinline vfloat8 operator /(float a, const vfloat8& b) { return vfloat8(a) / b; }
+
+ __forceinline vfloat8 operator &(const vfloat8& a, const vfloat8& b) { return _mm256_and_ps(a,b); }
+ __forceinline vfloat8 operator |(const vfloat8& a, const vfloat8& b) { return _mm256_or_ps(a,b); }
+ __forceinline vfloat8 operator ^(const vfloat8& a, const vfloat8& b) { return _mm256_xor_ps(a,b); }
+ __forceinline vfloat8 operator ^(const vfloat8& a, const vint8& b) { return _mm256_xor_ps(a,_mm256_castsi256_ps(b)); }
+
+ __forceinline vfloat8 min(const vfloat8& a, const vfloat8& b) { return _mm256_min_ps(a, b); }
+ __forceinline vfloat8 min(const vfloat8& a, float b) { return _mm256_min_ps(a, vfloat8(b)); }
+ __forceinline vfloat8 min(float a, const vfloat8& b) { return _mm256_min_ps(vfloat8(a), b); }
+
+ __forceinline vfloat8 max(const vfloat8& a, const vfloat8& b) { return _mm256_max_ps(a, b); }
+ __forceinline vfloat8 max(const vfloat8& a, float b) { return _mm256_max_ps(a, vfloat8(b)); }
+ __forceinline vfloat8 max(float a, const vfloat8& b) { return _mm256_max_ps(vfloat8(a), b); }
+
+ /* need "static __forceinline for MSVC, otherwise we'll link the wrong version in debug mode */
+#if defined(__AVX2__)
+
+ static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) {
+ const vint8 ai = _mm256_castps_si256(a);
+ const vint8 bi = _mm256_castps_si256(b);
+ const vint8 ci = _mm256_min_epi32(ai,bi);
+ return _mm256_castsi256_ps(ci);
+ }
+
+ static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) {
+ const vint8 ai = _mm256_castps_si256(a);
+ const vint8 bi = _mm256_castps_si256(b);
+ const vint8 ci = _mm256_max_epi32(ai,bi);
+ return _mm256_castsi256_ps(ci);
+ }
+
+ static __forceinline vfloat8 minui(const vfloat8& a, const vfloat8& b) {
+ const vint8 ai = _mm256_castps_si256(a);
+ const vint8 bi = _mm256_castps_si256(b);
+ const vint8 ci = _mm256_min_epu32(ai,bi);
+ return _mm256_castsi256_ps(ci);
+ }
+
+ static __forceinline vfloat8 maxui(const vfloat8& a, const vfloat8& b) {
+ const vint8 ai = _mm256_castps_si256(a);
+ const vint8 bi = _mm256_castps_si256(b);
+ const vint8 ci = _mm256_max_epu32(ai,bi);
+ return _mm256_castsi256_ps(ci);
+ }
+
+#else
+
+ static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) {
+ return asFloat(min(asInt(a),asInt(b)));
+ }
+
+ static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) {
+ return asFloat(max(asInt(a),asInt(b)));
+ }
+
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Ternary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__)
+ static __forceinline vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmadd_ps(a,b,c); }
+ static __forceinline vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmsub_ps(a,b,c); }
+ static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmadd_ps(a,b,c); }
+ static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmsub_ps(a,b,c); }
+#else
+ static __forceinline vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b+c; }
+ static __forceinline vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b-c; }
+ static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b+c;}
+ static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b-c; }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat8& operator +=(vfloat8& a, const vfloat8& b) { return a = a + b; }
+ __forceinline vfloat8& operator +=(vfloat8& a, float b) { return a = a + b; }
+
+ __forceinline vfloat8& operator -=(vfloat8& a, const vfloat8& b) { return a = a - b; }
+ __forceinline vfloat8& operator -=(vfloat8& a, float b) { return a = a - b; }
+
+ __forceinline vfloat8& operator *=(vfloat8& a, const vfloat8& b) { return a = a * b; }
+ __forceinline vfloat8& operator *=(vfloat8& a, float b) { return a = a * b; }
+
+ __forceinline vfloat8& operator /=(vfloat8& a, const vfloat8& b) { return a = a / b; }
+ __forceinline vfloat8& operator /=(vfloat8& a, float b) { return a = a / b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+ static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_EQ); }
+ static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_NE); }
+ static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LT); }
+ static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GE); }
+ static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GT); }
+ static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LE); }
+
+ static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
+ return _mm256_mask_blend_ps(m, f, t);
+ }
+#else
+ static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
+ static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
+ static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); }
+ static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); }
+ static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); }
+ static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); }
+
+ static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
+ return _mm256_blendv_ps(f, t, m);
+ }
+#endif
+
+ template<int mask>
+ __forceinline vfloat8 select(const vfloat8& t, const vfloat8& f) {
+ return _mm256_blend_ps(f, t, mask);
+ }
+
+ __forceinline vboolf8 operator ==(const vfloat8& a, const float& b) { return a == vfloat8(b); }
+ __forceinline vboolf8 operator ==(const float& a, const vfloat8& b) { return vfloat8(a) == b; }
+
+ __forceinline vboolf8 operator !=(const vfloat8& a, const float& b) { return a != vfloat8(b); }
+ __forceinline vboolf8 operator !=(const float& a, const vfloat8& b) { return vfloat8(a) != b; }
+
+ __forceinline vboolf8 operator < (const vfloat8& a, const float& b) { return a < vfloat8(b); }
+ __forceinline vboolf8 operator < (const float& a, const vfloat8& b) { return vfloat8(a) < b; }
+
+ __forceinline vboolf8 operator >=(const vfloat8& a, const float& b) { return a >= vfloat8(b); }
+ __forceinline vboolf8 operator >=(const float& a, const vfloat8& b) { return vfloat8(a) >= b; }
+
+ __forceinline vboolf8 operator > (const vfloat8& a, const float& b) { return a > vfloat8(b); }
+ __forceinline vboolf8 operator > (const float& a, const vfloat8& b) { return vfloat8(a) > b; }
+
+ __forceinline vboolf8 operator <=(const vfloat8& a, const float& b) { return a <= vfloat8(b); }
+ __forceinline vboolf8 operator <=(const float& a, const vfloat8& b) { return vfloat8(a) <= b; }
+
+ __forceinline vboolf8 eq(const vfloat8& a, const vfloat8& b) { return a == b; }
+ __forceinline vboolf8 ne(const vfloat8& a, const vfloat8& b) { return a != b; }
+ __forceinline vboolf8 lt(const vfloat8& a, const vfloat8& b) { return a < b; }
+ __forceinline vboolf8 ge(const vfloat8& a, const vfloat8& b) { return a >= b; }
+ __forceinline vboolf8 gt(const vfloat8& a, const vfloat8& b) { return a > b; }
+ __forceinline vboolf8 le(const vfloat8& a, const vfloat8& b) { return a <= b; }
+
+#if defined(__AVX512VL__)
+ static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); }
+ static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); }
+ static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); }
+ static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); }
+ static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); }
+ static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+ static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a == b); }
+ static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a != b); }
+ static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a < b); }
+ static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a >= b); }
+ static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a > b); }
+ static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a <= b); }
+#endif
+
+ __forceinline vfloat8 lerp(const vfloat8& a, const vfloat8& b, const vfloat8& t) {
+ return madd(t,b-a,a);
+ }
+
+ __forceinline bool isvalid (const vfloat8& v) {
+ return all((v > vfloat8(-FLT_LARGE)) & (v < vfloat8(+FLT_LARGE)));
+ }
+
+ __forceinline bool is_finite (const vfloat8& a) {
+ return all((a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX)));
+ }
+
+ __forceinline bool is_finite (const vboolf8& valid, const vfloat8& a) {
+ return all(valid, (a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX)));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Rounding Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
+ __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF ); }
+ __forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO ); }
+ __forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
+ __forceinline vfloat8 frac (const vfloat8& a) { return a-floor(a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Movement/Shifting/Shuffling Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat8 unpacklo(const vfloat8& a, const vfloat8& b) { return _mm256_unpacklo_ps(a, b); }
+ __forceinline vfloat8 unpackhi(const vfloat8& a, const vfloat8& b) { return _mm256_unpackhi_ps(a, b); }
+
+ template<int i>
+ __forceinline vfloat8 shuffle(const vfloat8& v) {
+ return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i));
+ }
+
+ template<int i0, int i1>
+ __forceinline vfloat8 shuffle4(const vfloat8& v) {
+ return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0));
+ }
+
+ template<int i0, int i1>
+ __forceinline vfloat8 shuffle4(const vfloat8& a, const vfloat8& b) {
+ return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vfloat8 shuffle(const vfloat8& v) {
+ return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vfloat8 shuffle(const vfloat8& a, const vfloat8& b) {
+ return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+ }
+
+ template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v); }
+ template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v); }
+ template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); }
+
+ __forceinline vfloat8 broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); }
+ template<size_t i> __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a, b, i); }
+ template<size_t i> __forceinline vfloat4 extract4 (const vfloat8& a) { return _mm256_extractf128_ps(a, i); }
+ template<> __forceinline vfloat4 extract4<0>(const vfloat8& a) { return _mm256_castps256_ps128(a); }
+
+ __forceinline float toScalar(const vfloat8& v) { return _mm_cvtss_f32(_mm256_castps256_ps128(v)); }
+
+#if defined (__AVX2__)
+ static __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) {
+ return _mm256_permutevar8x32_ps(a, index);
+ }
+#endif
+
+#if defined(__AVX512VL__)
+ template<int i>
+ static __forceinline vfloat8 align_shift_right(const vfloat8& a, const vfloat8& b) {
+ return _mm256_castsi256_ps(_mm256_alignr_epi32(_mm256_castps_si256(a), _mm256_castps_si256(b), i));
+ }
+#endif
+
+#if defined (__AVX_I__)
+ template<const int mode>
+ static __forceinline vint4 convert_to_hf16(const vfloat8& a) {
+ return _mm256_cvtps_ph(a, mode);
+ }
+
+ static __forceinline vfloat8 convert_from_hf16(const vint4& a) {
+ return _mm256_cvtph_ps(a);
+ }
+#endif
+
+#if defined(__AVX512VL__)
+ static __forceinline vfloat8 shift_right_1(const vfloat8& x) {
+ return align_shift_right<1>(zero,x);
+ }
+#else
+ static __forceinline vfloat8 shift_right_1(const vfloat8& x) {
+ const vfloat8 t0 = shuffle<1,2,3,0>(x);
+ const vfloat8 t1 = shuffle4<1,0>(t0);
+ return _mm256_blend_ps(t0,t1,0x88);
+ }
+#endif
+
+ __forceinline vint8 floori(const vfloat8& a) {
+ return vint8(floor(a));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Transpose
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3)
+ {
+ vfloat8 l02 = unpacklo(r0,r2);
+ vfloat8 h02 = unpackhi(r0,r2);
+ vfloat8 l13 = unpacklo(r1,r3);
+ vfloat8 h13 = unpackhi(r1,r3);
+ c0 = unpacklo(l02,l13);
+ c1 = unpackhi(l02,l13);
+ c2 = unpacklo(h02,h13);
+ c3 = unpackhi(h02,h13);
+ }
+
+ __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2)
+ {
+ vfloat8 l02 = unpacklo(r0,r2);
+ vfloat8 h02 = unpackhi(r0,r2);
+ vfloat8 l13 = unpacklo(r1,r3);
+ vfloat8 h13 = unpackhi(r1,r3);
+ c0 = unpacklo(l02,l13);
+ c1 = unpackhi(l02,l13);
+ c2 = unpacklo(h02,h13);
+ }
+
+ __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7,
+ vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3, vfloat8& c4, vfloat8& c5, vfloat8& c6, vfloat8& c7)
+ {
+ vfloat8 h0,h1,h2,h3; transpose(r0,r1,r2,r3,h0,h1,h2,h3);
+ vfloat8 h4,h5,h6,h7; transpose(r4,r5,r6,r7,h4,h5,h6,h7);
+ c0 = shuffle4<0,2>(h0,h4);
+ c1 = shuffle4<0,2>(h1,h5);
+ c2 = shuffle4<0,2>(h2,h6);
+ c3 = shuffle4<0,2>(h3,h7);
+ c4 = shuffle4<1,3>(h0,h4);
+ c5 = shuffle4<1,3>(h1,h5);
+ c6 = shuffle4<1,3>(h2,h6);
+ c7 = shuffle4<1,3>(h3,h7);
+ }
+
+ __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7,
+ vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3)
+ {
+ transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2, c3);
+ }
+
+ __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7,
+ vfloat8& c0, vfloat8& c1, vfloat8& c2)
+ {
+ transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat8 vreduce_min2(const vfloat8& v) { return min(v,shuffle<1,0,3,2>(v)); }
+ __forceinline vfloat8 vreduce_min4(const vfloat8& v) { vfloat8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
+ __forceinline vfloat8 vreduce_min (const vfloat8& v) { vfloat8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
+
+ __forceinline vfloat8 vreduce_max2(const vfloat8& v) { return max(v,shuffle<1,0,3,2>(v)); }
+ __forceinline vfloat8 vreduce_max4(const vfloat8& v) { vfloat8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+ __forceinline vfloat8 vreduce_max (const vfloat8& v) { vfloat8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); }
+
+ __forceinline vfloat8 vreduce_add2(const vfloat8& v) { return v + shuffle<1,0,3,2>(v); }
+ __forceinline vfloat8 vreduce_add4(const vfloat8& v) { vfloat8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); }
+ __forceinline vfloat8 vreduce_add (const vfloat8& v) { vfloat8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); }
+
+ __forceinline float reduce_min(const vfloat8& v) { return toScalar(vreduce_min(v)); }
+ __forceinline float reduce_max(const vfloat8& v) { return toScalar(vreduce_max(v)); }
+ __forceinline float reduce_add(const vfloat8& v) { return toScalar(vreduce_add(v)); }
+
+ __forceinline size_t select_min(const vboolf8& valid, const vfloat8& v)
+ {
+ const vfloat8 a = select(valid,v,vfloat8(pos_inf));
+ const vbool8 valid_min = valid & (a == vreduce_min(a));
+ return bsf(movemask(any(valid_min) ? valid_min : valid));
+ }
+
+ __forceinline size_t select_max(const vboolf8& valid, const vfloat8& v)
+ {
+ const vfloat8 a = select(valid,v,vfloat8(neg_inf));
+ const vbool8 valid_max = valid & (a == vreduce_max(a));
+ return bsf(movemask(any(valid_max) ? valid_max : valid));
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Euclidian Space Operators (pairs of Vec3fa's)
+ ////////////////////////////////////////////////////////////////////////////////
+
+ //__forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) {
+ // return vreduce_add4(a*b);
+ //}
+
+ __forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) {
+ return _mm256_dp_ps(a,b,0x7F);
+ }
+
+ __forceinline vfloat8 cross(const vfloat8& a, const vfloat8& b)
+ {
+ const vfloat8 a0 = a;
+ const vfloat8 b0 = shuffle<1,2,0,3>(b);
+ const vfloat8 a1 = shuffle<1,2,0,3>(a);
+ const vfloat8 b1 = b;
+ return shuffle<1,2,0,3>(msub(a0,b0,a1*b1));
+ }
+
+ //__forceinline float sqr_length (const vfloat<8>& a) { return dot(a,a); }
+ //__forceinline float rcp_length (const vfloat<8>& a) { return rsqrt(dot(a,a)); }
+ //__forceinline float rcp_length2(const vfloat<8>& a) { return rcp(dot(a,a)); }
+ //__forceinline float length (const vfloat<8>& a) { return sqrt(dot(a,a)); }
+ __forceinline vfloat<8> normalize(const vfloat<8>& a) { return a*rsqrt(dot(a,a)); }
+ //__forceinline float distance(const vfloat<8>& a, const vfloat<8>& b) { return length(a-b); }
+ //__forceinline float halfArea(const vfloat<8>& d) { return madd(d.x,(d.y+d.z),d.y*d.z); }
+ //__forceinline float area (const vfloat<8>& d) { return 2.0f*halfArea(d); }
+ //__forceinline vfloat<8> reflect(const vfloat<8>& V, const vfloat<8>& N) { return 2.0f*dot(V,N)*N-V; }
+
+ //__forceinline vfloat<8> normalize_safe(const vfloat<8>& a) {
+ // const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
+ //}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// In Register Sorting
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vfloat8 sort_ascending(const vfloat8& v)
+ {
+ const vfloat8 a0 = v;
+ const vfloat8 b0 = shuffle<1,0,3,2>(a0);
+ const vfloat8 c0 = min(a0,b0);
+ const vfloat8 d0 = max(a0,b0);
+ const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0);
+ const vfloat8 b1 = shuffle<2,3,0,1>(a1);
+ const vfloat8 c1 = min(a1,b1);
+ const vfloat8 d1 = max(a1,b1);
+ const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1);
+ const vfloat8 b2 = shuffle<1,0,3,2>(a2);
+ const vfloat8 c2 = min(a2,b2);
+ const vfloat8 d2 = max(a2,b2);
+ const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2);
+ const vfloat8 b3 = shuffle4<1,0>(a3);
+ const vfloat8 c3 = min(a3,b3);
+ const vfloat8 d3 = max(a3,b3);
+ const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3);
+ const vfloat8 b4 = shuffle<2,3,0,1>(a4);
+ const vfloat8 c4 = min(a4,b4);
+ const vfloat8 d4 = max(a4,b4);
+ const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4);
+ const vfloat8 b5 = shuffle<1,0,3,2>(a5);
+ const vfloat8 c5 = min(a5,b5);
+ const vfloat8 d5 = max(a5,b5);
+ const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5);
+ return a6;
+ }
+
+ __forceinline vfloat8 sort_descending(const vfloat8& v)
+ {
+ const vfloat8 a0 = v;
+ const vfloat8 b0 = shuffle<1,0,3,2>(a0);
+ const vfloat8 c0 = max(a0,b0);
+ const vfloat8 d0 = min(a0,b0);
+ const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0);
+ const vfloat8 b1 = shuffle<2,3,0,1>(a1);
+ const vfloat8 c1 = max(a1,b1);
+ const vfloat8 d1 = min(a1,b1);
+ const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1);
+ const vfloat8 b2 = shuffle<1,0,3,2>(a2);
+ const vfloat8 c2 = max(a2,b2);
+ const vfloat8 d2 = min(a2,b2);
+ const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2);
+ const vfloat8 b3 = shuffle4<1,0>(a3);
+ const vfloat8 c3 = max(a3,b3);
+ const vfloat8 d3 = min(a3,b3);
+ const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3);
+ const vfloat8 b4 = shuffle<2,3,0,1>(a4);
+ const vfloat8 c4 = max(a4,b4);
+ const vfloat8 d4 = min(a4,b4);
+ const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4);
+ const vfloat8 b5 = shuffle<1,0,3,2>(a5);
+ const vfloat8 c5 = max(a5,b5);
+ const vfloat8 d5 = min(a5,b5);
+ const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5);
+ return a6;
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat8& a) {
+ return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vint16_avx512.h b/thirdparty/embree/common/simd/vint16_avx512.h
new file mode 100644
index 0000000000..3720c3c9d6
--- /dev/null
+++ b/thirdparty/embree/common/simd/vint16_avx512.h
@@ -0,0 +1,472 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 16-wide AVX-512 integer type */
+ template<>
+ struct vint<16>
+ {
+ ALIGNED_STRUCT_(64);
+
+ typedef vboolf16 Bool;
+ typedef vint16 Int;
+ typedef vfloat16 Float;
+
+ enum { size = 16 }; // number of SIMD elements
+ union { // data
+ __m512i v;
+ int i[16];
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint() {}
+ __forceinline vint(const vint16& t) { v = t.v; }
+ __forceinline vint16& operator =(const vint16& f) { v = f.v; return *this; }
+
+ __forceinline vint(const __m512i& t) { v = t; }
+ __forceinline operator __m512i() const { return v; }
+ __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); }
+
+ __forceinline vint(int i) {
+ v = _mm512_set1_epi32(i);
+ }
+
+ __forceinline vint(int a, int b, int c, int d) {
+ v = _mm512_set4_epi32(d,c,b,a);
+ }
+
+ __forceinline vint(int a0 , int a1 , int a2 , int a3,
+ int a4 , int a5 , int a6 , int a7,
+ int a8 , int a9 , int a10, int a11,
+ int a12, int a13, int a14, int a15)
+ {
+ v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0);
+ }
+
+ __forceinline vint(const vint4& i) {
+ v = _mm512_broadcast_i32x4(i);
+ }
+
+ __forceinline vint(const vint4& a, const vint4& b, const vint4& c, const vint4& d) {
+ v = _mm512_castsi128_si512(a);
+ v = _mm512_inserti32x4(v, b, 1);
+ v = _mm512_inserti32x4(v, c, 2);
+ v = _mm512_inserti32x4(v, d, 3);
+ }
+
+ __forceinline vint(const vint8& i) {
+ v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i))));
+ }
+
+ __forceinline vint(const vint8& a, const vint8& b) {
+ v = _mm512_castsi256_si512(a);
+ v = _mm512_inserti64x4(v, b, 1);
+ }
+
+ __forceinline explicit vint(const __m512& f) {
+ v = _mm512_cvtps_epi32(f);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint(ZeroTy) : v(_mm512_setzero_epi32()) {}
+ __forceinline vint(OneTy) : v(_mm512_set1_epi32(1)) {}
+ __forceinline vint(PosInfTy) : v(_mm512_set1_epi32(pos_inf)) {}
+ __forceinline vint(NegInfTy) : v(_mm512_set1_epi32(neg_inf)) {}
+ __forceinline vint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {}
+ __forceinline vint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Loads and Stores
+ ////////////////////////////////////////////////////////////////////////////////
+
+ static __forceinline vint16 load (const void* addr) { return _mm512_load_si512((int*)addr); }
+
+ static __forceinline vint16 load(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_load_si128((__m128i*)ptr)); }
+ static __forceinline vint16 load(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_load_si256((__m256i*)ptr)); }
+
+ static __forceinline vint16 loadu(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); }
+ static __forceinline vint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); }
+
+ static __forceinline vint16 loadu(const void* addr) { return _mm512_loadu_si512(addr); }
+
+ static __forceinline vint16 load (const vboolf16& mask, const void* addr) { return _mm512_mask_load_epi32 (_mm512_setzero_epi32(),mask,addr); }
+ static __forceinline vint16 loadu(const vboolf16& mask, const void* addr) { return _mm512_mask_loadu_epi32(_mm512_setzero_epi32(),mask,addr); }
+
+ static __forceinline void store (void* ptr, const vint16& v) { _mm512_store_si512 (ptr,v); }
+ static __forceinline void storeu(void* ptr, const vint16& v) { _mm512_storeu_si512(ptr,v); }
+
+ static __forceinline void store (const vboolf16& mask, void* addr, const vint16& v2) { _mm512_mask_store_epi32(addr,mask,v2); }
+ static __forceinline void storeu(const vboolf16& mask, void* ptr, const vint16& f) { _mm512_mask_storeu_epi32((int*)ptr,mask,f); }
+
+ static __forceinline void store_nt(void* __restrict__ ptr, const vint16& a) { _mm512_stream_si512((__m512i*)ptr,a); }
+
+ static __forceinline vint16 compact(const vboolf16& mask, vint16 &v) {
+ return _mm512_mask_compress_epi32(v,mask,v);
+ }
+
+ static __forceinline vint16 compact(const vboolf16& mask, const vint16 &a, vint16 &b) {
+ return _mm512_mask_compress_epi32(a,mask,b);
+ }
+
+ static __forceinline vint16 expand(const vboolf16& mask, const vint16& a, vint16& b) {
+ return _mm512_mask_expand_epi32(b,mask,a);
+ }
+
+ template<int scale = 4>
+ static __forceinline vint16 gather(const int* ptr, const vint16& index) {
+ return _mm512_i32gather_epi32(index,ptr,scale);
+ }
+
+ template<int scale = 4>
+ static __forceinline vint16 gather(const vboolf16& mask, const int* ptr, const vint16& index) {
+ return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale);
+ }
+
+ template<int scale = 4>
+ static __forceinline vint16 gather(const vboolf16& mask, vint16& dest, const int* ptr, const vint16& index) {
+ return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale);
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(int* ptr, const vint16& index, const vint16& v) {
+ _mm512_i32scatter_epi32((int*)ptr,index,v,scale);
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(const vboolf16& mask, int* ptr, const vint16& index, const vint16& v) {
+ _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline int& operator [](size_t index) { assert(index < 16); return i[index]; }
+ __forceinline const int& operator [](size_t index) const { assert(index < 16); return i[index]; }
+
+ __forceinline unsigned int uint (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; }
+ __forceinline size_t& uint64_t(size_t index) const { assert(index < 8); return ((size_t*)i)[index]; }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf16 asBool(const vint16& a) { return _mm512_movepi32_mask(a); }
+
+ __forceinline vint16 operator +(const vint16& a) { return a; }
+ __forceinline vint16 operator -(const vint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint16 operator +(const vint16& a, const vint16& b) { return _mm512_add_epi32(a, b); }
+ __forceinline vint16 operator +(const vint16& a, int b) { return a + vint16(b); }
+ __forceinline vint16 operator +(int a, const vint16& b) { return vint16(a) + b; }
+
+ __forceinline vint16 operator -(const vint16& a, const vint16& b) { return _mm512_sub_epi32(a, b); }
+ __forceinline vint16 operator -(const vint16& a, int b) { return a - vint16(b); }
+ __forceinline vint16 operator -(int a, const vint16& b) { return vint16(a) - b; }
+
+ __forceinline vint16 operator *(const vint16& a, const vint16& b) { return _mm512_mullo_epi32(a, b); }
+ __forceinline vint16 operator *(const vint16& a, int b) { return a * vint16(b); }
+ __forceinline vint16 operator *(int a, const vint16& b) { return vint16(a) * b; }
+
+ __forceinline vint16 operator &(const vint16& a, const vint16& b) { return _mm512_and_epi32(a, b); }
+ __forceinline vint16 operator &(const vint16& a, int b) { return a & vint16(b); }
+ __forceinline vint16 operator &(int a, const vint16& b) { return vint16(a) & b; }
+
+ __forceinline vint16 operator |(const vint16& a, const vint16& b) { return _mm512_or_epi32(a, b); }
+ __forceinline vint16 operator |(const vint16& a, int b) { return a | vint16(b); }
+ __forceinline vint16 operator |(int a, const vint16& b) { return vint16(a) | b; }
+
+ __forceinline vint16 operator ^(const vint16& a, const vint16& b) { return _mm512_xor_epi32(a, b); }
+ __forceinline vint16 operator ^(const vint16& a, int b) { return a ^ vint16(b); }
+ __forceinline vint16 operator ^(int a, const vint16& b) { return vint16(a) ^ b; }
+
+ __forceinline vint16 operator <<(const vint16& a, int n) { return _mm512_slli_epi32(a, n); }
+ __forceinline vint16 operator >>(const vint16& a, int n) { return _mm512_srai_epi32(a, n); }
+
+ __forceinline vint16 operator <<(const vint16& a, const vint16& n) { return _mm512_sllv_epi32(a, n); }
+ __forceinline vint16 operator >>(const vint16& a, const vint16& n) { return _mm512_srav_epi32(a, n); }
+
+ __forceinline vint16 sll (const vint16& a, int b) { return _mm512_slli_epi32(a, b); }
+ __forceinline vint16 sra (const vint16& a, int b) { return _mm512_srai_epi32(a, b); }
+ __forceinline vint16 srl (const vint16& a, int b) { return _mm512_srli_epi32(a, b); }
+
+ __forceinline vint16 min(const vint16& a, const vint16& b) { return _mm512_min_epi32(a, b); }
+ __forceinline vint16 min(const vint16& a, int b) { return min(a,vint16(b)); }
+ __forceinline vint16 min(int a, const vint16& b) { return min(vint16(a),b); }
+
+ __forceinline vint16 max(const vint16& a, const vint16& b) { return _mm512_max_epi32(a, b); }
+ __forceinline vint16 max(const vint16& a, int b) { return max(a,vint16(b)); }
+ __forceinline vint16 max(int a, const vint16& b) { return max(vint16(a),b); }
+
+ __forceinline vint16 umin(const vint16& a, const vint16& b) { return _mm512_min_epu32(a, b); }
+ __forceinline vint16 umax(const vint16& a, const vint16& b) { return _mm512_max_epu32(a, b); }
+
+ __forceinline vint16 mask_add(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); }
+ __forceinline vint16 mask_sub(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); }
+
+ __forceinline vint16 mask_and(const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_and_epi32(c,m,a,b); }
+ __forceinline vint16 mask_or (const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_or_epi32(c,m,a,b); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint16& operator +=(vint16& a, const vint16& b) { return a = a + b; }
+ __forceinline vint16& operator +=(vint16& a, int b) { return a = a + b; }
+
+ __forceinline vint16& operator -=(vint16& a, const vint16& b) { return a = a - b; }
+ __forceinline vint16& operator -=(vint16& a, int b) { return a = a - b; }
+
+ __forceinline vint16& operator *=(vint16& a, const vint16& b) { return a = a * b; }
+ __forceinline vint16& operator *=(vint16& a, int b) { return a = a * b; }
+
+ __forceinline vint16& operator &=(vint16& a, const vint16& b) { return a = a & b; }
+ __forceinline vint16& operator &=(vint16& a, int b) { return a = a & b; }
+
+ __forceinline vint16& operator |=(vint16& a, const vint16& b) { return a = a | b; }
+ __forceinline vint16& operator |=(vint16& a, int b) { return a = a | b; }
+
+ __forceinline vint16& operator <<=(vint16& a, int b) { return a = a << b; }
+ __forceinline vint16& operator >>=(vint16& a, int b) { return a = a >> b; }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf16 operator ==(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); }
+ __forceinline vboolf16 operator ==(const vint16& a, int b) { return a == vint16(b); }
+ __forceinline vboolf16 operator ==(int a, const vint16& b) { return vint16(a) == b; }
+
+ __forceinline vboolf16 operator !=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
+ __forceinline vboolf16 operator !=(const vint16& a, int b) { return a != vint16(b); }
+ __forceinline vboolf16 operator !=(int a, const vint16& b) { return vint16(a) != b; }
+
+ __forceinline vboolf16 operator < (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
+ __forceinline vboolf16 operator < (const vint16& a, int b) { return a < vint16(b); }
+ __forceinline vboolf16 operator < (int a, const vint16& b) { return vint16(a) < b; }
+
+ __forceinline vboolf16 operator >=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
+ __forceinline vboolf16 operator >=(const vint16& a, int b) { return a >= vint16(b); }
+ __forceinline vboolf16 operator >=(int a, const vint16& b) { return vint16(a) >= b; }
+
+ __forceinline vboolf16 operator > (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
+ __forceinline vboolf16 operator > (const vint16& a, int b) { return a > vint16(b); }
+ __forceinline vboolf16 operator > (int a, const vint16& b) { return vint16(a) > b; }
+
+ __forceinline vboolf16 operator <=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
+ __forceinline vboolf16 operator <=(const vint16& a, int b) { return a <= vint16(b); }
+ __forceinline vboolf16 operator <=(int a, const vint16& b) { return vint16(a) <= b; }
+
+ __forceinline vboolf16 eq(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); }
+ __forceinline vboolf16 ne(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
+ __forceinline vboolf16 lt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
+ __forceinline vboolf16 ge(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
+ __forceinline vboolf16 gt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
+ __forceinline vboolf16 le(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
+ __forceinline vboolf16 uint_le(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); }
+ __forceinline vboolf16 uint_gt(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); }
+
+ __forceinline vboolf16 eq(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_EQ); }
+ __forceinline vboolf16 ne(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_NE); }
+ __forceinline vboolf16 lt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LT); }
+ __forceinline vboolf16 ge(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GE); }
+ __forceinline vboolf16 gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GT); }
+ __forceinline vboolf16 le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LE); }
+ __forceinline vboolf16 uint_le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); }
+ __forceinline vboolf16 uint_gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); }
+
+
+ __forceinline vint16 select(const vboolf16& m, const vint16& t, const vint16& f) {
+ return _mm512_mask_or_epi32(f,m,t,t);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Movement/Shifting/Shuffling Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint16 unpacklo(const vint16& a, const vint16& b) { return _mm512_unpacklo_epi32(a, b); }
+ __forceinline vint16 unpackhi(const vint16& a, const vint16& b) { return _mm512_unpackhi_epi32(a, b); }
+
+ template<int i>
+ __forceinline vint16 shuffle(const vint16& v) {
+ return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i)));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vint16 shuffle(const vint16& v) {
+ return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+ }
+
+ template<int i>
+ __forceinline vint16 shuffle4(const vint16& v) {
+ return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i)));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vint16 shuffle4(const vint16& v) {
+ return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+ }
+
+ template<int i>
+ __forceinline vint16 align_shift_right(const vint16& a, const vint16& b) {
+ return _mm512_alignr_epi32(a, b, i);
+ };
+
+ __forceinline int toScalar(const vint16& v) {
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(v));
+ }
+
+ template<int i> __forceinline vint16 insert4(const vint16& a, const vint4& b) { return _mm512_inserti32x4(a, b, i); }
+
+ template<int N, int i>
+ vint<N> extractN(const vint16& v);
+
+ template<> __forceinline vint4 extractN<4,0>(const vint16& v) { return _mm512_castsi512_si128(v); }
+ template<> __forceinline vint4 extractN<4,1>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 1); }
+ template<> __forceinline vint4 extractN<4,2>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 2); }
+ template<> __forceinline vint4 extractN<4,3>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 3); }
+
+ template<> __forceinline vint8 extractN<8,0>(const vint16& v) { return _mm512_castsi512_si256(v); }
+ template<> __forceinline vint8 extractN<8,1>(const vint16& v) { return _mm512_extracti32x8_epi32(v, 1); }
+
+ template<int i> __forceinline vint4 extract4 (const vint16& v) { return _mm512_extracti32x4_epi32(v, i); }
+ template<> __forceinline vint4 extract4<0>(const vint16& v) { return _mm512_castsi512_si128(v); }
+
+ template<int i> __forceinline vint8 extract8 (const vint16& v) { return _mm512_extracti32x8_epi32(v, i); }
+ template<> __forceinline vint8 extract8<0>(const vint16& v) { return _mm512_castsi512_si256(v); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint16 vreduce_min2(vint16 x) { return min(x, shuffle<1,0,3,2>(x)); }
+ __forceinline vint16 vreduce_min4(vint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }
+ __forceinline vint16 vreduce_min8(vint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); }
+ __forceinline vint16 vreduce_min (vint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); }
+
+ __forceinline vint16 vreduce_max2(vint16 x) { return max(x, shuffle<1,0,3,2>(x)); }
+ __forceinline vint16 vreduce_max4(vint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
+ __forceinline vint16 vreduce_max8(vint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); }
+ __forceinline vint16 vreduce_max (vint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); }
+
+ __forceinline vint16 vreduce_and2(vint16 x) { return x & shuffle<1,0,3,2>(x); }
+ __forceinline vint16 vreduce_and4(vint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); }
+ __forceinline vint16 vreduce_and8(vint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); }
+ __forceinline vint16 vreduce_and (vint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); }
+
+ __forceinline vint16 vreduce_or2(vint16 x) { return x | shuffle<1,0,3,2>(x); }
+ __forceinline vint16 vreduce_or4(vint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); }
+ __forceinline vint16 vreduce_or8(vint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); }
+ __forceinline vint16 vreduce_or (vint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); }
+
+ __forceinline vint16 vreduce_add2(vint16 x) { return x + shuffle<1,0,3,2>(x); }
+ __forceinline vint16 vreduce_add4(vint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }
+ __forceinline vint16 vreduce_add8(vint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); }
+ __forceinline vint16 vreduce_add (vint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); }
+
+ __forceinline int reduce_min(const vint16& v) { return toScalar(vreduce_min(v)); }
+ __forceinline int reduce_max(const vint16& v) { return toScalar(vreduce_max(v)); }
+ __forceinline int reduce_and(const vint16& v) { return toScalar(vreduce_and(v)); }
+ __forceinline int reduce_or (const vint16& v) { return toScalar(vreduce_or (v)); }
+ __forceinline int reduce_add(const vint16& v) { return toScalar(vreduce_add(v)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Memory load and store operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint16 conflict(const vint16& index)
+ {
+ return _mm512_conflict_epi32(index);
+ }
+
+ __forceinline vint16 conflict(const vboolf16& mask, vint16& dest, const vint16& index)
+ {
+ return _mm512_mask_conflict_epi32(dest,mask,index);
+ }
+
+ __forceinline vint16 convert_uint32_t(const __m512& f) {
+ return _mm512_cvtps_epu32(f);
+ }
+
+ __forceinline vint16 permute(vint16 v, vint16 index) {
+ return _mm512_permutexvar_epi32(index,v);
+ }
+
+ __forceinline vint16 reverse(const vint16 &a) {
+ return permute(a,vint16(reverse_step));
+ }
+
+ __forceinline vint16 prefix_sum(const vint16& a)
+ {
+ const vint16 z(zero);
+ vint16 v = a;
+ v = v + align_shift_right<16-1>(v,z);
+ v = v + align_shift_right<16-2>(v,z);
+ v = v + align_shift_right<16-4>(v,z);
+ v = v + align_shift_right<16-8>(v,z);
+ return v;
+ }
+
+ __forceinline vint16 reverse_prefix_sum(const vint16& a)
+ {
+ const vint16 z(zero);
+ vint16 v = a;
+ v = v + align_shift_right<1>(z,v);
+ v = v + align_shift_right<2>(z,v);
+ v = v + align_shift_right<4>(z,v);
+ v = v + align_shift_right<8>(z,v);
+ return v;
+ }
+
+ /* this should use a vbool8 and a vint8_64...*/
+ template<int scale = 1, int hint = _MM_HINT_T0>
+ __forceinline void gather_prefetch64(const void* base_addr, const vbool16& mask, const vint16& offset)
+ {
+#if defined(__AVX512PF__)
+ _mm512_mask_prefetch_i64gather_pd(offset, mask, base_addr, scale, hint);
+#endif
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vint16& v)
+ {
+ cout << "<" << v[0];
+ for (int i=1; i<16; i++) cout << ", " << v[i];
+ cout << ">";
+ return cout;
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vint4_sse2.h b/thirdparty/embree/common/simd/vint4_sse2.h
new file mode 100644
index 0000000000..9814d5c71c
--- /dev/null
+++ b/thirdparty/embree/common/simd/vint4_sse2.h
@@ -0,0 +1,598 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../math/math.h"
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 4-wide SSE integer type */
+ template<>
+ struct vint<4>
+ {
+ ALIGNED_STRUCT_(16);
+
+ typedef vboolf4 Bool;
+ typedef vint4 Int;
+ typedef vfloat4 Float;
+
+ enum { size = 4 }; // number of SIMD elements
+ union { __m128i v; int i[4]; }; // data
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint() {}
+ __forceinline vint(const vint4& a) { v = a.v; }
+ __forceinline vint4& operator =(const vint4& a) { v = a.v; return *this; }
+
+ __forceinline vint(__m128i a) : v(a) {}
+ __forceinline operator const __m128i&() const { return v; }
+ __forceinline operator __m128i&() { return v; }
+
+ __forceinline vint(int a) : v(_mm_set1_epi32(a)) {}
+ __forceinline vint(int a, int b, int c, int d) : v(_mm_set_epi32(d, c, b, a)) {}
+
+ __forceinline explicit vint(__m128 a) : v(_mm_cvtps_epi32(a)) {}
+#if defined(__AVX512VL__)
+ __forceinline explicit vint(const vboolf4& a) : v(_mm_movm_epi32(a)) {}
+#else
+ __forceinline explicit vint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {}
+#endif
+
+ __forceinline vint(long long a, long long b) : v(_mm_set_epi64x(b,a)) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint(ZeroTy) : v(_mm_setzero_si128()) {}
+ __forceinline vint(OneTy) : v(_mm_set_epi32(1, 1, 1, 1)) {}
+ __forceinline vint(PosInfTy) : v(_mm_set_epi32(pos_inf, pos_inf, pos_inf, pos_inf)) {}
+ __forceinline vint(NegInfTy) : v(_mm_set_epi32(neg_inf, neg_inf, neg_inf, neg_inf)) {}
+ __forceinline vint(StepTy) : v(_mm_set_epi32(3, 2, 1, 0)) {}
+ __forceinline vint(ReverseStepTy) : v(_mm_set_epi32(0, 1, 2, 3)) {}
+
+ __forceinline vint(TrueTy) { v = _mm_cmpeq_epi32(v,v); }
+ __forceinline vint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {}
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Loads and Stores
+ ////////////////////////////////////////////////////////////////////////////////
+
+ static __forceinline vint4 load (const void* a) { return _mm_load_si128((__m128i*)a); }
+ static __forceinline vint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); }
+
+ static __forceinline void store (void* ptr, const vint4& v) { _mm_store_si128((__m128i*)ptr,v); }
+ static __forceinline void storeu(void* ptr, const vint4& v) { _mm_storeu_si128((__m128i*)ptr,v); }
+
+#if defined(__AVX512VL__)
+
+ static __forceinline vint4 compact(const vboolf4& mask, vint4 &v) {
+ return _mm_mask_compress_epi32(v, mask, v);
+ }
+ static __forceinline vint4 compact(const vboolf4& mask, vint4 &a, const vint4& b) {
+ return _mm_mask_compress_epi32(a, mask, b);
+ }
+
+ static __forceinline vint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); }
+ static __forceinline vint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); }
+
+ static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_store_epi32 (ptr,mask,v); }
+ static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); }
+#elif defined(__AVX__)
+ static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+ static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+
+ static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+ static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+#else
+ static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); }
+ static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); }
+
+ static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { store (ptr,select(mask,i,load (ptr))); }
+ static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); }
+#endif
+
+
+#if defined(__SSE4_1__)
+ static __forceinline vint4 load(const unsigned char* ptr) {
+ return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
+ }
+
+ static __forceinline vint4 loadu(const unsigned char* ptr) {
+ return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
+ }
+#else
+
+ static __forceinline vint4 load(const unsigned char* ptr) {
+ return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
+ }
+
+ static __forceinline vint4 loadu(const unsigned char* ptr) {
+ return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
+ }
+
+#endif
+
+ static __forceinline vint4 load(const unsigned short* ptr) {
+#if defined (__SSE4_1__)
+ return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr));
+#else
+ return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
+#endif
+ }
+
+ static __forceinline void store(unsigned char* ptr, const vint4& v) {
+#if defined(__SSE4_1__)
+ __m128i x = v;
+ x = _mm_packus_epi32(x, x);
+ x = _mm_packus_epi16(x, x);
+ *(int*)ptr = _mm_cvtsi128_si32(x);
+#else
+ for (size_t i=0;i<4;i++)
+ ptr[i] = (unsigned char)v[i];
+#endif
+ }
+
+ static __forceinline void store(unsigned short* ptr, const vint4& v) {
+ for (size_t i=0;i<4;i++)
+ ptr[i] = (unsigned short)v[i];
+ }
+
+ static __forceinline vint4 load_nt(void* ptr) {
+#if defined(__SSE4_1__)
+ return _mm_stream_load_si128((__m128i*)ptr);
+#else
+ return _mm_load_si128((__m128i*)ptr);
+#endif
+ }
+
+ static __forceinline void store_nt(void* ptr, const vint4& v) {
+#if defined(__SSE4_1__)
+ _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v));
+#else
+ _mm_store_si128((__m128i*)ptr,v);
+#endif
+ }
+
+ template<int scale = 4>
+ static __forceinline vint4 gather(const int* ptr, const vint4& index) {
+#if defined(__AVX2__)
+ return _mm_i32gather_epi32(ptr, index, scale);
+#else
+ return vint4(
+ *(int*)(((char*)ptr)+scale*index[0]),
+ *(int*)(((char*)ptr)+scale*index[1]),
+ *(int*)(((char*)ptr)+scale*index[2]),
+ *(int*)(((char*)ptr)+scale*index[3]));
+#endif
+ }
+
+ template<int scale = 4>
+ static __forceinline vint4 gather(const vboolf4& mask, const int* ptr, const vint4& index) {
+ vint4 r = zero;
+#if defined(__AVX512VL__)
+ return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
+#elif defined(__AVX2__)
+ return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale);
+#else
+ if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]);
+ if (likely(mask[1])) r[1] = *(int*)(((char*)ptr)+scale*index[1]);
+ if (likely(mask[2])) r[2] = *(int*)(((char*)ptr)+scale*index[2]);
+ if (likely(mask[3])) r[3] = *(int*)(((char*)ptr)+scale*index[3]);
+ return r;
+#endif
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(void* ptr, const vint4& index, const vint4& v)
+ {
+#if defined(__AVX512VL__)
+ _mm_i32scatter_epi32((int*)ptr, index, v, scale);
+#else
+ *(int*)(((char*)ptr)+scale*index[0]) = v[0];
+ *(int*)(((char*)ptr)+scale*index[1]) = v[1];
+ *(int*)(((char*)ptr)+scale*index[2]) = v[2];
+ *(int*)(((char*)ptr)+scale*index[3]) = v[3];
+#endif
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vint4& v)
+ {
+#if defined(__AVX512VL__)
+ _mm_mask_i32scatter_epi32((int*)ptr, mask, index, v, scale);
+#else
+ if (likely(mask[0])) *(int*)(((char*)ptr)+scale*index[0]) = v[0];
+ if (likely(mask[1])) *(int*)(((char*)ptr)+scale*index[1]) = v[1];
+ if (likely(mask[2])) *(int*)(((char*)ptr)+scale*index[2]) = v[2];
+ if (likely(mask[3])) *(int*)(((char*)ptr)+scale*index[3]) = v[3];
+#endif
+ }
+
+#if defined(__x86_64__)
+ static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const int& operator [](size_t index) const { assert(index < 4); return i[index]; }
+ __forceinline int& operator [](size_t index) { assert(index < 4); return i[index]; }
+
+ friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) {
+#if defined(__AVX512VL__)
+ return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t);
+#elif defined(__SSE4_1__)
+ return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
+#else
+ return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f));
+#endif
+ }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+ __forceinline vboolf4 asBool(const vint4& a) { return _mm_movepi32_mask(a); }
+#else
+ __forceinline vboolf4 asBool(const vint4& a) { return _mm_castsi128_ps(a); }
+#endif
+
+ __forceinline vint4 operator +(const vint4& a) { return a; }
+ __forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); }
+#if defined(__SSSE3__)
+ __forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint4 operator +(const vint4& a, const vint4& b) { return _mm_add_epi32(a, b); }
+ __forceinline vint4 operator +(const vint4& a, int b) { return a + vint4(b); }
+ __forceinline vint4 operator +(int a, const vint4& b) { return vint4(a) + b; }
+
+ __forceinline vint4 operator -(const vint4& a, const vint4& b) { return _mm_sub_epi32(a, b); }
+ __forceinline vint4 operator -(const vint4& a, int b) { return a - vint4(b); }
+ __forceinline vint4 operator -(int a, const vint4& b) { return vint4(a) - b; }
+
+#if defined(__SSE4_1__)
+ __forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); }
+#else
+ __forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); }
+#endif
+ __forceinline vint4 operator *(const vint4& a, int b) { return a * vint4(b); }
+ __forceinline vint4 operator *(int a, const vint4& b) { return vint4(a) * b; }
+
+ __forceinline vint4 operator &(const vint4& a, const vint4& b) { return _mm_and_si128(a, b); }
+ __forceinline vint4 operator &(const vint4& a, int b) { return a & vint4(b); }
+ __forceinline vint4 operator &(int a, const vint4& b) { return vint4(a) & b; }
+
+ __forceinline vint4 operator |(const vint4& a, const vint4& b) { return _mm_or_si128(a, b); }
+ __forceinline vint4 operator |(const vint4& a, int b) { return a | vint4(b); }
+ __forceinline vint4 operator |(int a, const vint4& b) { return vint4(a) | b; }
+
+ __forceinline vint4 operator ^(const vint4& a, const vint4& b) { return _mm_xor_si128(a, b); }
+ __forceinline vint4 operator ^(const vint4& a, int b) { return a ^ vint4(b); }
+ __forceinline vint4 operator ^(int a, const vint4& b) { return vint4(a) ^ b; }
+
+ __forceinline vint4 operator <<(const vint4& a, int n) { return _mm_slli_epi32(a, n); }
+ __forceinline vint4 operator >>(const vint4& a, int n) { return _mm_srai_epi32(a, n); }
+
+ __forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); }
+ __forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); }
+ __forceinline vint4 srl (const vint4& a, int b) { return _mm_srli_epi32(a, b); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint4& operator +=(vint4& a, const vint4& b) { return a = a + b; }
+ __forceinline vint4& operator +=(vint4& a, int b) { return a = a + b; }
+
+ __forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; }
+ __forceinline vint4& operator -=(vint4& a, int b) { return a = a - b; }
+
+#if defined(__SSE4_1__)
+ __forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; }
+ __forceinline vint4& operator *=(vint4& a, int b) { return a = a * b; }
+#endif
+
+ __forceinline vint4& operator &=(vint4& a, const vint4& b) { return a = a & b; }
+ __forceinline vint4& operator &=(vint4& a, int b) { return a = a & b; }
+
+ __forceinline vint4& operator |=(vint4& a, const vint4& b) { return a = a | b; }
+ __forceinline vint4& operator |=(vint4& a, int b) { return a = a | b; }
+
+ __forceinline vint4& operator <<=(vint4& a, int b) { return a = a << b; }
+ __forceinline vint4& operator >>=(vint4& a, int b) { return a = a >> b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+ __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); }
+ __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
+ __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
+ __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
+ __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
+ __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
+#else
+ __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); }
+ __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return !(a == b); }
+ __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmplt_epi32(a, b)); }
+ __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return !(a < b); }
+ __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epi32(a, b)); }
+ __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return !(a > b); }
+#endif
+
+ __forceinline vboolf4 operator ==(const vint4& a, int b) { return a == vint4(b); }
+ __forceinline vboolf4 operator ==(int a, const vint4& b) { return vint4(a) == b; }
+
+ __forceinline vboolf4 operator !=(const vint4& a, int b) { return a != vint4(b); }
+ __forceinline vboolf4 operator !=(int a, const vint4& b) { return vint4(a) != b; }
+
+ __forceinline vboolf4 operator < (const vint4& a, int b) { return a < vint4(b); }
+ __forceinline vboolf4 operator < (int a, const vint4& b) { return vint4(a) < b; }
+
+ __forceinline vboolf4 operator >=(const vint4& a, int b) { return a >= vint4(b); }
+ __forceinline vboolf4 operator >=(int a, const vint4& b) { return vint4(a) >= b; }
+
+ __forceinline vboolf4 operator > (const vint4& a, int b) { return a > vint4(b); }
+ __forceinline vboolf4 operator > (int a, const vint4& b) { return vint4(a) > b; }
+
+ __forceinline vboolf4 operator <=(const vint4& a, int b) { return a <= vint4(b); }
+ __forceinline vboolf4 operator <=(int a, const vint4& b) { return vint4(a) <= b; }
+
+ __forceinline vboolf4 eq(const vint4& a, const vint4& b) { return a == b; }
+ __forceinline vboolf4 ne(const vint4& a, const vint4& b) { return a != b; }
+ __forceinline vboolf4 lt(const vint4& a, const vint4& b) { return a < b; }
+ __forceinline vboolf4 ge(const vint4& a, const vint4& b) { return a >= b; }
+ __forceinline vboolf4 gt(const vint4& a, const vint4& b) { return a > b; }
+ __forceinline vboolf4 le(const vint4& a, const vint4& b) { return a <= b; }
+
+#if defined(__AVX512VL__)
+ __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); }
+ __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); }
+ __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); }
+ __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); }
+ __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); }
+ __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+ __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a == b); }
+ __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a != b); }
+ __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a < b); }
+ __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a >= b); }
+ __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a > b); }
+ __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a <= b); }
+#endif
+
+ template<int mask>
+ __forceinline vint4 select(const vint4& t, const vint4& f) {
+#if defined(__SSE4_1__)
+ return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask));
+#else
+ return select(vboolf4(mask), t, f);
+#endif
+ }
+
+#if defined(__SSE4_1__)
+ __forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); }
+ __forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); }
+
+ __forceinline vint4 umin(const vint4& a, const vint4& b) { return _mm_min_epu32(a, b); }
+ __forceinline vint4 umax(const vint4& a, const vint4& b) { return _mm_max_epu32(a, b); }
+
+#else
+ __forceinline vint4 min(const vint4& a, const vint4& b) { return select(a < b,a,b); }
+ __forceinline vint4 max(const vint4& a, const vint4& b) { return select(a < b,b,a); }
+#endif
+
+ __forceinline vint4 min(const vint4& a, int b) { return min(a,vint4(b)); }
+ __forceinline vint4 min(int a, const vint4& b) { return min(vint4(a),b); }
+ __forceinline vint4 max(const vint4& a, int b) { return max(a,vint4(b)); }
+ __forceinline vint4 max(int a, const vint4& b) { return max(vint4(a),b); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Movement/Shifting/Shuffling Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
+ __forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vint4 shuffle(const vint4& v) {
+ return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vint4 shuffle(const vint4& a, const vint4& b) {
+ return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+ }
+
+#if defined(__SSE3__)
+ template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
+ template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
+ template<> __forceinline vint4 shuffle<0, 1, 0, 1>(const vint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); }
+#endif
+
+ template<int i>
+ __forceinline vint4 shuffle(const vint4& v) {
+ return shuffle<i,i,i,i>(v);
+ }
+
+#if defined(__SSE4_1__)
+ template<int src> __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); }
+ template<int dst> __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); }
+#else
+ template<int src> __forceinline int extract(const vint4& b) { return b[src&3]; }
+ template<int dst> __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; }
+#endif
+
+
+ template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); }
+
+ __forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); }
+
+ __forceinline size_t toSizeT(const vint4& v) {
+#if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround
+ return toScalar(v);
+#else
+ return _mm_cvtsi128_si64(v);
+#endif
+ }
+
+#if defined(__AVX512VL__)
+
+ __forceinline vint4 permute(const vint4 &a, const vint4 &index) {
+ return _mm_castps_si128(_mm_permutevar_ps(_mm_castsi128_ps(a),index));
+ }
+
+ template<int i>
+ __forceinline vint4 align_shift_right(const vint4& a, const vint4& b) {
+ return _mm_alignr_epi32(a, b, i);
+ }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE4_1__)
+ __forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
+ __forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
+ __forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; }
+
+ __forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); }
+ __forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); }
+ __forceinline int reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); }
+
+ __forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); }
+ __forceinline size_t select_max(const vint4& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+ __forceinline size_t select_min(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+ __forceinline size_t select_max(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+#else
+
+ __forceinline int reduce_min(const vint4& v) { return min(v[0],v[1],v[2],v[3]); }
+ __forceinline int reduce_max(const vint4& v) { return max(v[0],v[1],v[2],v[3]); }
+ __forceinline int reduce_add(const vint4& v) { return v[0]+v[1]+v[2]+v[3]; }
+
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Sorting networks
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE4_1__)
+
+ __forceinline vint4 usort_ascending(const vint4& v)
+ {
+ const vint4 a0 = v;
+ const vint4 b0 = shuffle<1,0,3,2>(a0);
+ const vint4 c0 = umin(a0,b0);
+ const vint4 d0 = umax(a0,b0);
+ const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
+ const vint4 b1 = shuffle<2,3,0,1>(a1);
+ const vint4 c1 = umin(a1,b1);
+ const vint4 d1 = umax(a1,b1);
+ const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
+ const vint4 b2 = shuffle<0,2,1,3>(a2);
+ const vint4 c2 = umin(a2,b2);
+ const vint4 d2 = umax(a2,b2);
+ const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
+ return a3;
+ }
+
+ __forceinline vint4 usort_descending(const vint4& v)
+ {
+ const vint4 a0 = v;
+ const vint4 b0 = shuffle<1,0,3,2>(a0);
+ const vint4 c0 = umax(a0,b0);
+ const vint4 d0 = umin(a0,b0);
+ const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
+ const vint4 b1 = shuffle<2,3,0,1>(a1);
+ const vint4 c1 = umax(a1,b1);
+ const vint4 d1 = umin(a1,b1);
+ const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
+ const vint4 b2 = shuffle<0,2,1,3>(a2);
+ const vint4 c2 = umax(a2,b2);
+ const vint4 d2 = umin(a2,b2);
+ const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
+ return a3;
+ }
+
+#else
+
+ __forceinline vint4 usort_ascending(const vint4& v)
+ {
+ const vint4 a0 = v-vint4(0x80000000);
+ const vint4 b0 = shuffle<1,0,3,2>(a0);
+ const vint4 c0 = min(a0,b0);
+ const vint4 d0 = max(a0,b0);
+ const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
+ const vint4 b1 = shuffle<2,3,0,1>(a1);
+ const vint4 c1 = min(a1,b1);
+ const vint4 d1 = max(a1,b1);
+ const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
+ const vint4 b2 = shuffle<0,2,1,3>(a2);
+ const vint4 c2 = min(a2,b2);
+ const vint4 d2 = max(a2,b2);
+ const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
+ return a3+vint4(0x80000000);
+ }
+
+ __forceinline vint4 usort_descending(const vint4& v)
+ {
+ const vint4 a0 = v-vint4(0x80000000);
+ const vint4 b0 = shuffle<1,0,3,2>(a0);
+ const vint4 c0 = max(a0,b0);
+ const vint4 d0 = min(a0,b0);
+ const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
+ const vint4 b1 = shuffle<2,3,0,1>(a1);
+ const vint4 c1 = max(a1,b1);
+ const vint4 d1 = min(a1,b1);
+ const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
+ const vint4 b2 = shuffle<0,2,1,3>(a2);
+ const vint4 c2 = max(a2,b2);
+ const vint4 d2 = min(a2,b2);
+ const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
+ return a3+vint4(0x80000000);
+ }
+
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vint4& a) {
+ return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vint8_avx.h b/thirdparty/embree/common/simd/vint8_avx.h
new file mode 100644
index 0000000000..f43e9a8c22
--- /dev/null
+++ b/thirdparty/embree/common/simd/vint8_avx.h
@@ -0,0 +1,470 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 8-wide AVX integer type */
+ template<>
+ struct vint<8>
+ {
+ ALIGNED_STRUCT_(32);
+
+ typedef vboolf8 Bool;
+ typedef vint8 Int;
+ typedef vfloat8 Float;
+
+ enum { size = 8 }; // number of SIMD elements
+ union { // data
+ __m256i v;
+ struct { __m128i vl,vh; };
+ int i[8];
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint() {}
+ __forceinline vint(const vint8& a) { v = a.v; }
+ __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; }
+
+ __forceinline vint(__m256i a) : v(a) {}
+ __forceinline operator const __m256i&() const { return v; }
+ __forceinline operator __m256i&() { return v; }
+
+ __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {}
+ __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+ __forceinline vint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {}
+
+ __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {}
+ __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {}
+ __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {}
+ __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {}
+ __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int vh) : v(_mm256_set_epi32(vh, g, f, e, d, c, b, a)) {}
+
+ __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint(ZeroTy) : v(_mm256_setzero_si256()) {}
+ __forceinline vint(OneTy) : v(_mm256_set_epi32(1,1,1,1,1,1,1,1)) {}
+ __forceinline vint(PosInfTy) : v(_mm256_set_epi32(pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf)) {}
+ __forceinline vint(NegInfTy) : v(_mm256_set_epi32(neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf)) {}
+ __forceinline vint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {}
+ __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {}
+ __forceinline vint(UndefinedTy) : v(_mm256_undefined_si256()) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Loads and Stores
+ ////////////////////////////////////////////////////////////////////////////////
+
+ static __forceinline vint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); }
+ static __forceinline vint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); }
+
+ static __forceinline vint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); }
+ static __forceinline vint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); }
+
+ static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); }
+ static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); }
+
+ static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
+ static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
+
+ static __forceinline void store_nt(void* ptr, const vint8& v) {
+ _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+ }
+
+ static __forceinline vint8 load(const unsigned char* ptr) {
+ vint4 il = vint4::load(ptr+0);
+ vint4 ih = vint4::load(ptr+4);
+ return vint8(il,ih);
+ }
+
+ static __forceinline vint8 loadu(const unsigned char* ptr) {
+ vint4 il = vint4::loadu(ptr+0);
+ vint4 ih = vint4::loadu(ptr+4);
+ return vint8(il,ih);
+ }
+
+ static __forceinline vint8 load(const unsigned short* ptr) {
+ vint4 il = vint4::load(ptr+0);
+ vint4 ih = vint4::load(ptr+4);
+ return vint8(il,ih);
+ }
+
+ static __forceinline vint8 loadu(const unsigned short* ptr) {
+ vint4 il = vint4::loadu(ptr+0);
+ vint4 ih = vint4::loadu(ptr+4);
+ return vint8(il,ih);
+ }
+
+ static __forceinline void store(unsigned char* ptr, const vint8& i) {
+ vint4 il(i.vl);
+ vint4 ih(i.vh);
+ vint4::store(ptr + 0,il);
+ vint4::store(ptr + 4,ih);
+ }
+
+ static __forceinline void store(unsigned short* ptr, const vint8& v) {
+ for (size_t i=0;i<8;i++)
+ ptr[i] = (unsigned short)v[i];
+ }
+
+ template<int scale = 4>
+ static __forceinline vint8 gather(const int* ptr, const vint8& index) {
+ return vint8(
+ *(int*)(((char*)ptr)+scale*index[0]),
+ *(int*)(((char*)ptr)+scale*index[1]),
+ *(int*)(((char*)ptr)+scale*index[2]),
+ *(int*)(((char*)ptr)+scale*index[3]),
+ *(int*)(((char*)ptr)+scale*index[4]),
+ *(int*)(((char*)ptr)+scale*index[5]),
+ *(int*)(((char*)ptr)+scale*index[6]),
+ *(int*)(((char*)ptr)+scale*index[7]));
+ }
+
+ template<int scale = 4>
+ static __forceinline vint8 gather(const vboolf8& mask, const int* ptr, const vint8& index) {
+ vint8 r = zero;
+ if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]);
+ if (likely(mask[1])) r[1] = *(int*)(((char*)ptr)+scale*index[1]);
+ if (likely(mask[2])) r[2] = *(int*)(((char*)ptr)+scale*index[2]);
+ if (likely(mask[3])) r[3] = *(int*)(((char*)ptr)+scale*index[3]);
+ if (likely(mask[4])) r[4] = *(int*)(((char*)ptr)+scale*index[4]);
+ if (likely(mask[5])) r[5] = *(int*)(((char*)ptr)+scale*index[5]);
+ if (likely(mask[6])) r[6] = *(int*)(((char*)ptr)+scale*index[6]);
+ if (likely(mask[7])) r[7] = *(int*)(((char*)ptr)+scale*index[7]);
+ return r;
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v)
+ {
+ *(int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+ *(int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+ *(int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+ *(int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+ *(int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+ *(int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+ *(int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+ *(int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v)
+ {
+ if (likely(mask[0])) *(int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+ if (likely(mask[1])) *(int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+ if (likely(mask[2])) *(int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+ if (likely(mask[3])) *(int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+ if (likely(mask[4])) *(int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+ if (likely(mask[5])) *(int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+ if (likely(mask[6])) *(int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+ if (likely(mask[7])) *(int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+ }
+
+
+ static __forceinline vint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; }
+ __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); }
+
+ __forceinline vint8 operator +(const vint8& a) { return a; }
+ __forceinline vint8 operator -(const vint8& a) { return vint8(_mm_sub_epi32(_mm_setzero_si128(), a.vl), _mm_sub_epi32(_mm_setzero_si128(), a.vh)); }
+ __forceinline vint8 abs (const vint8& a) { return vint8(_mm_abs_epi32(a.vl), _mm_abs_epi32(a.vh)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint8 operator +(const vint8& a, const vint8& b) { return vint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); }
+ __forceinline vint8 operator +(const vint8& a, int b) { return a + vint8(b); }
+ __forceinline vint8 operator +(int a, const vint8& b) { return vint8(a) + b; }
+
+ __forceinline vint8 operator -(const vint8& a, const vint8& b) { return vint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); }
+ __forceinline vint8 operator -(const vint8& a, int b) { return a - vint8(b); }
+ __forceinline vint8 operator -(int a, const vint8& b) { return vint8(a) - b; }
+
+ __forceinline vint8 operator *(const vint8& a, const vint8& b) { return vint8(_mm_mullo_epi32(a.vl, b.vl), _mm_mullo_epi32(a.vh, b.vh)); }
+ __forceinline vint8 operator *(const vint8& a, int b) { return a * vint8(b); }
+ __forceinline vint8 operator *(int a, const vint8& b) { return vint8(a) * b; }
+
+ __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+ __forceinline vint8 operator &(const vint8& a, int b) { return a & vint8(b); }
+ __forceinline vint8 operator &(int a, const vint8& b) { return vint8(a) & b; }
+
+ __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+ __forceinline vint8 operator |(const vint8& a, int b) { return a | vint8(b); }
+ __forceinline vint8 operator |(int a, const vint8& b) { return vint8(a) | b; }
+
+ __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+ __forceinline vint8 operator ^(const vint8& a, int b) { return a ^ vint8(b); }
+ __forceinline vint8 operator ^(int a, const vint8& b) { return vint8(a) ^ b; }
+
+ __forceinline vint8 operator <<(const vint8& a, int n) { return vint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); }
+ __forceinline vint8 operator >>(const vint8& a, int n) { return vint8(_mm_srai_epi32(a.vl, n), _mm_srai_epi32(a.vh, n)); }
+
+ __forceinline vint8 sll (const vint8& a, int b) { return vint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); }
+ __forceinline vint8 sra (const vint8& a, int b) { return vint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); }
+ __forceinline vint8 srl (const vint8& a, int b) { return vint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); }
+
+ __forceinline vint8 min(const vint8& a, const vint8& b) { return vint8(_mm_min_epi32(a.vl, b.vl), _mm_min_epi32(a.vh, b.vh)); }
+ __forceinline vint8 min(const vint8& a, int b) { return min(a,vint8(b)); }
+ __forceinline vint8 min(int a, const vint8& b) { return min(vint8(a),b); }
+
+ __forceinline vint8 max(const vint8& a, const vint8& b) { return vint8(_mm_max_epi32(a.vl, b.vl), _mm_max_epi32(a.vh, b.vh)); }
+ __forceinline vint8 max(const vint8& a, int b) { return max(a,vint8(b)); }
+ __forceinline vint8 max(int a, const vint8& b) { return max(vint8(a),b); }
+
+ __forceinline vint8 umin(const vint8& a, const vint8& b) { return vint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); }
+ __forceinline vint8 umin(const vint8& a, int b) { return umin(a,vint8(b)); }
+ __forceinline vint8 umin(int a, const vint8& b) { return umin(vint8(a),b); }
+
+ __forceinline vint8 umax(const vint8& a, const vint8& b) { return vint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); }
+ __forceinline vint8 umax(const vint8& a, int b) { return umax(a,vint8(b)); }
+ __forceinline vint8 umax(int a, const vint8& b) { return umax(vint8(a),b); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint8& operator +=(vint8& a, const vint8& b) { return a = a + b; }
+ __forceinline vint8& operator +=(vint8& a, int b) { return a = a + b; }
+
+ __forceinline vint8& operator -=(vint8& a, const vint8& b) { return a = a - b; }
+ __forceinline vint8& operator -=(vint8& a, int b) { return a = a - b; }
+
+ __forceinline vint8& operator *=(vint8& a, const vint8& b) { return a = a * b; }
+ __forceinline vint8& operator *=(vint8& a, int b) { return a = a * b; }
+
+ __forceinline vint8& operator &=(vint8& a, const vint8& b) { return a = a & b; }
+ __forceinline vint8& operator &=(vint8& a, int b) { return a = a & b; }
+
+ __forceinline vint8& operator |=(vint8& a, const vint8& b) { return a = a | b; }
+ __forceinline vint8& operator |=(vint8& a, int b) { return a = a | b; }
+
+ __forceinline vint8& operator <<=(vint8& a, int b) { return a = a << b; }
+ __forceinline vint8& operator >>=(vint8& a, int b) { return a = a >> b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)),
+ _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); }
+ __forceinline vboolf8 operator ==(const vint8& a, int b) { return a == vint8(b); }
+ __forceinline vboolf8 operator ==(int a, const vint8& b) { return vint8(a) == b; }
+
+ __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); }
+ __forceinline vboolf8 operator !=(const vint8& a, int b) { return a != vint8(b); }
+ __forceinline vboolf8 operator !=(int a, const vint8& b) { return vint8(a) != b; }
+
+ __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epi32 (a.vl, b.vl)),
+ _mm_castsi128_ps(_mm_cmplt_epi32 (a.vh, b.vh))); }
+ __forceinline vboolf8 operator < (const vint8& a, int b) { return a < vint8(b); }
+ __forceinline vboolf8 operator < (int a, const vint8& b) { return vint8(a) < b; }
+
+ __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a < b); }
+ __forceinline vboolf8 operator >=(const vint8& a, int b) { return a >= vint8(b); }
+ __forceinline vboolf8 operator >=(int a, const vint8& b) { return vint8(a) >= b; }
+
+ __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epi32 (a.vl, b.vl)),
+ _mm_castsi128_ps(_mm_cmpgt_epi32 (a.vh, b.vh))); }
+ __forceinline vboolf8 operator > (const vint8& a, int b) { return a > vint8(b); }
+ __forceinline vboolf8 operator > (int a, const vint8& b) { return vint8(a) > b; }
+
+ __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a > b); }
+ __forceinline vboolf8 operator <=(const vint8& a, int b) { return a <= vint8(b); }
+ __forceinline vboolf8 operator <=(int a, const vint8& b) { return vint8(a) <= b; }
+
+ __forceinline vboolf8 eq(const vint8& a, const vint8& b) { return a == b; }
+ __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; }
+ __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a < b; }
+ __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; }
+ __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a > b; }
+ __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; }
+
+ __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); }
+ __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); }
+ __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a < b); }
+ __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); }
+ __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a > b); }
+ __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); }
+
+ __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) {
+ return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Movement/Shifting/Shuffling Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+ __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+
+ template<int i>
+ __forceinline vint8 shuffle(const vint8& v) {
+ return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+ }
+
+ template<int i0, int i1>
+ __forceinline vint8 shuffle4(const vint8& v) {
+ return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
+ }
+
+ template<int i0, int i1>
+ __forceinline vint8 shuffle4(const vint8& a, const vint8& b) {
+ return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vint8 shuffle(const vint8& v) {
+ return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vint8 shuffle(const vint8& a, const vint8& b) {
+ return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+ }
+
+ template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); }
+ template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); }
+ template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); }
+
+ __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); }
+ template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); }
+ template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); }
+ template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); }
+
+ __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint8 vreduce_min2(const vint8& v) { return min(v,shuffle<1,0,3,2>(v)); }
+ __forceinline vint8 vreduce_min4(const vint8& v) { vint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
+ __forceinline vint8 vreduce_min (const vint8& v) { vint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
+
+ __forceinline vint8 vreduce_max2(const vint8& v) { return max(v,shuffle<1,0,3,2>(v)); }
+ __forceinline vint8 vreduce_max4(const vint8& v) { vint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+ __forceinline vint8 vreduce_max (const vint8& v) { vint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); }
+
+ __forceinline vint8 vreduce_add2(const vint8& v) { return v + shuffle<1,0,3,2>(v); }
+ __forceinline vint8 vreduce_add4(const vint8& v) { vint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); }
+ __forceinline vint8 vreduce_add (const vint8& v) { vint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); }
+
+ __forceinline int reduce_min(const vint8& v) { return toScalar(vreduce_min(v)); }
+ __forceinline int reduce_max(const vint8& v) { return toScalar(vreduce_max(v)); }
+ __forceinline int reduce_add(const vint8& v) { return toScalar(vreduce_add(v)); }
+
+ __forceinline size_t select_min(const vint8& v) { return bsf(movemask(v == vreduce_min(v))); }
+ __forceinline size_t select_max(const vint8& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+ __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+ __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Sorting networks
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint8 usort_ascending(const vint8& v)
+ {
+ const vint8 a0 = v;
+ const vint8 b0 = shuffle<1,0,3,2>(a0);
+ const vint8 c0 = umin(a0,b0);
+ const vint8 d0 = umax(a0,b0);
+ const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0);
+ const vint8 b1 = shuffle<2,3,0,1>(a1);
+ const vint8 c1 = umin(a1,b1);
+ const vint8 d1 = umax(a1,b1);
+ const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1);
+ const vint8 b2 = shuffle<1,0,3,2>(a2);
+ const vint8 c2 = umin(a2,b2);
+ const vint8 d2 = umax(a2,b2);
+ const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2);
+ const vint8 b3 = shuffle4<1,0>(a3);
+ const vint8 c3 = umin(a3,b3);
+ const vint8 d3 = umax(a3,b3);
+ const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3);
+ const vint8 b4 = shuffle<2,3,0,1>(a4);
+ const vint8 c4 = umin(a4,b4);
+ const vint8 d4 = umax(a4,b4);
+ const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4);
+ const vint8 b5 = shuffle<1,0,3,2>(a5);
+ const vint8 c5 = umin(a5,b5);
+ const vint8 d5 = umax(a5,b5);
+ const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5);
+ return a6;
+ }
+
+ __forceinline vint8 usort_descending(const vint8& v)
+ {
+ const vint8 a0 = v;
+ const vint8 b0 = shuffle<1,0,3,2>(a0);
+ const vint8 c0 = umax(a0,b0);
+ const vint8 d0 = umin(a0,b0);
+ const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0);
+ const vint8 b1 = shuffle<2,3,0,1>(a1);
+ const vint8 c1 = umax(a1,b1);
+ const vint8 d1 = umin(a1,b1);
+ const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1);
+ const vint8 b2 = shuffle<1,0,3,2>(a2);
+ const vint8 c2 = umax(a2,b2);
+ const vint8 d2 = umin(a2,b2);
+ const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2);
+ const vint8 b3 = shuffle4<1,0>(a3);
+ const vint8 c3 = umax(a3,b3);
+ const vint8 d3 = umin(a3,b3);
+ const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3);
+ const vint8 b4 = shuffle<2,3,0,1>(a4);
+ const vint8 c4 = umax(a4,b4);
+ const vint8 d4 = umin(a4,b4);
+ const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4);
+ const vint8 b5 = shuffle<1,0,3,2>(a5);
+ const vint8 c5 = umax(a5,b5);
+ const vint8 d5 = umin(a5,b5);
+ const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5);
+ return a6;
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) {
+ return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vint8_avx2.h b/thirdparty/embree/common/simd/vint8_avx2.h
new file mode 100644
index 0000000000..e04737ffbe
--- /dev/null
+++ b/thirdparty/embree/common/simd/vint8_avx2.h
@@ -0,0 +1,518 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 8-wide AVX integer type */
+ template<>
+ struct vint<8>
+ {
+ ALIGNED_STRUCT_(32);
+
+ typedef vboolf8 Bool;
+ typedef vint8 Int;
+ typedef vfloat8 Float;
+
+ enum { size = 8 }; // number of SIMD elements
+ union { // data
+ __m256i v;
+ int i[8];
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint() {}
+ __forceinline vint(const vint8& a) { v = a.v; }
+ __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; }
+
+ __forceinline vint(__m256i a) : v(a) {}
+ __forceinline operator const __m256i&() const { return v; }
+ __forceinline operator __m256i&() { return v; }
+
+ __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {}
+ __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+ __forceinline vint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+
+ __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {}
+ __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {}
+ __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {}
+ __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {}
+ __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {}
+
+ __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {}
+
+#if defined(__AVX512VL__)
+ __forceinline explicit vint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {}
+#else
+ __forceinline explicit vint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {}
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint(ZeroTy) : v(_mm256_setzero_si256()) {}
+ __forceinline vint(OneTy) : v(_mm256_set1_epi32(1)) {}
+ __forceinline vint(PosInfTy) : v(_mm256_set1_epi32(pos_inf)) {}
+ __forceinline vint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {}
+ __forceinline vint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {}
+ __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {}
+ __forceinline vint(UndefinedTy) : v(_mm256_undefined_si256()) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Loads and Stores
+ ////////////////////////////////////////////////////////////////////////////////
+
+ static __forceinline vint8 load(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
+ static __forceinline vint8 loadu(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
+ static __forceinline vint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); }
+ static __forceinline vint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); }
+
+ static __forceinline vint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); }
+ static __forceinline vint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); }
+
+ static __forceinline void store (void* ptr, const vint8& v) { _mm256_store_si256((__m256i*)ptr,v); }
+ static __forceinline void storeu(void* ptr, const vint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); }
+
+#if defined(__AVX512VL__)
+
+ static __forceinline vint8 compact(const vboolf8& mask, vint8 &v) {
+ return _mm256_mask_compress_epi32(v, mask, v);
+ }
+ static __forceinline vint8 compact(const vboolf8& mask, vint8 &a, const vint8& b) {
+ return _mm256_mask_compress_epi32(a, mask, b);
+ }
+
+ static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); }
+ static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); }
+
+ static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); }
+ static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); }
+#else
+ static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); }
+ static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); }
+
+ static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); }
+ static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); }
+#endif
+
+ static __forceinline vint8 load_nt(void* ptr) {
+ return _mm256_stream_load_si256((__m256i*)ptr);
+ }
+
+ static __forceinline void store_nt(void* ptr, const vint8& v) {
+ _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+ }
+
+ static __forceinline void store(unsigned char* ptr, const vint8& i)
+ {
+ for (size_t j=0; j<8; j++)
+ ptr[j] = i[j];
+ }
+
+ static __forceinline void store(unsigned short* ptr, const vint8& v) {
+ for (size_t i=0;i<8;i++)
+ ptr[i] = (unsigned short)v[i];
+ }
+
+ template<int scale = 4>
+ static __forceinline vint8 gather(const int *const ptr, const vint8& index) {
+ return _mm256_i32gather_epi32(ptr, index, scale);
+ }
+
+ template<int scale = 4>
+ static __forceinline vint8 gather(const vboolf8& mask, const int *const ptr, const vint8& index) {
+ vint8 r = zero;
+#if defined(__AVX512VL__)
+ return _mm256_mmask_i32gather_epi32(r, mask, index, ptr, scale);
+#else
+ return _mm256_mask_i32gather_epi32(r, ptr, index, mask, scale);
+#endif
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v)
+ {
+#if defined(__AVX512VL__)
+ _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale);
+#else
+ *(int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+ *(int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+ *(int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+ *(int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+ *(int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+ *(int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+ *(int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+ *(int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+#endif
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v)
+ {
+#if defined(__AVX512VL__)
+ _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale);
+#else
+ if (likely(mask[0])) *(int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+ if (likely(mask[1])) *(int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+ if (likely(mask[2])) *(int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+ if (likely(mask[3])) *(int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+ if (likely(mask[4])) *(int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+ if (likely(mask[5])) *(int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+ if (likely(mask[6])) *(int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+ if (likely(mask[7])) *(int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+#endif
+ }
+
+ static __forceinline vint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; }
+ __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+ static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_movepi32_mask(a); }
+#else
+ static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); }
+#endif
+
+ __forceinline vint8 operator +(const vint8& a) { return a; }
+ __forceinline vint8 operator -(const vint8& a) { return _mm256_sub_epi32(_mm256_setzero_si256(), a); }
+ __forceinline vint8 abs (const vint8& a) { return _mm256_abs_epi32(a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint8 operator +(const vint8& a, const vint8& b) { return _mm256_add_epi32(a, b); }
+ __forceinline vint8 operator +(const vint8& a, int b) { return a + vint8(b); }
+ __forceinline vint8 operator +(int a, const vint8& b) { return vint8(a) + b; }
+
+ __forceinline vint8 operator -(const vint8& a, const vint8& b) { return _mm256_sub_epi32(a, b); }
+ __forceinline vint8 operator -(const vint8& a, int b) { return a - vint8(b); }
+ __forceinline vint8 operator -(int a, const vint8& b) { return vint8(a) - b; }
+
+ __forceinline vint8 operator *(const vint8& a, const vint8& b) { return _mm256_mullo_epi32(a, b); }
+ __forceinline vint8 operator *(const vint8& a, int b) { return a * vint8(b); }
+ __forceinline vint8 operator *(int a, const vint8& b) { return vint8(a) * b; }
+
+ __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_and_si256(a, b); }
+ __forceinline vint8 operator &(const vint8& a, int b) { return a & vint8(b); }
+ __forceinline vint8 operator &(int a, const vint8& b) { return vint8(a) & b; }
+
+ __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_or_si256(a, b); }
+ __forceinline vint8 operator |(const vint8& a, int b) { return a | vint8(b); }
+ __forceinline vint8 operator |(int a, const vint8& b) { return vint8(a) | b; }
+
+ __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_xor_si256(a, b); }
+ __forceinline vint8 operator ^(const vint8& a, int b) { return a ^ vint8(b); }
+ __forceinline vint8 operator ^(int a, const vint8& b) { return vint8(a) ^ b; }
+
+ __forceinline vint8 operator <<(const vint8& a, int n) { return _mm256_slli_epi32(a, n); }
+ __forceinline vint8 operator >>(const vint8& a, int n) { return _mm256_srai_epi32(a, n); }
+
+ __forceinline vint8 operator <<(const vint8& a, const vint8& n) { return _mm256_sllv_epi32(a, n); }
+ __forceinline vint8 operator >>(const vint8& a, const vint8& n) { return _mm256_srav_epi32(a, n); }
+
+ __forceinline vint8 sll(const vint8& a, int b) { return _mm256_slli_epi32(a, b); }
+ __forceinline vint8 sra(const vint8& a, int b) { return _mm256_srai_epi32(a, b); }
+ __forceinline vint8 srl(const vint8& a, int b) { return _mm256_srli_epi32(a, b); }
+
+ __forceinline vint8 sll(const vint8& a, const vint8& b) { return _mm256_sllv_epi32(a, b); }
+ __forceinline vint8 sra(const vint8& a, const vint8& b) { return _mm256_srav_epi32(a, b); }
+ __forceinline vint8 srl(const vint8& a, const vint8& b) { return _mm256_srlv_epi32(a, b); }
+
+ __forceinline vint8 min(const vint8& a, const vint8& b) { return _mm256_min_epi32(a, b); }
+ __forceinline vint8 min(const vint8& a, int b) { return min(a,vint8(b)); }
+ __forceinline vint8 min(int a, const vint8& b) { return min(vint8(a),b); }
+
+ __forceinline vint8 max(const vint8& a, const vint8& b) { return _mm256_max_epi32(a, b); }
+ __forceinline vint8 max(const vint8& a, int b) { return max(a,vint8(b)); }
+ __forceinline vint8 max(int a, const vint8& b) { return max(vint8(a),b); }
+
+ __forceinline vint8 umin(const vint8& a, const vint8& b) { return _mm256_min_epu32(a, b); }
+ __forceinline vint8 umax(const vint8& a, const vint8& b) { return _mm256_max_epu32(a, b); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint8& operator +=(vint8& a, const vint8& b) { return a = a + b; }
+ __forceinline vint8& operator +=(vint8& a, int b) { return a = a + b; }
+
+ __forceinline vint8& operator -=(vint8& a, const vint8& b) { return a = a - b; }
+ __forceinline vint8& operator -=(vint8& a, int b) { return a = a - b; }
+
+ __forceinline vint8& operator *=(vint8& a, const vint8& b) { return a = a * b; }
+ __forceinline vint8& operator *=(vint8& a, int b) { return a = a * b; }
+
+ __forceinline vint8& operator &=(vint8& a, const vint8& b) { return a = a & b; }
+ __forceinline vint8& operator &=(vint8& a, int b) { return a = a & b; }
+
+ __forceinline vint8& operator |=(vint8& a, const vint8& b) { return a = a | b; }
+ __forceinline vint8& operator |=(vint8& a, int b) { return a = a | b; }
+
+ __forceinline vint8& operator <<=(vint8& a, const int b) { return a = a << b; }
+ __forceinline vint8& operator >>=(vint8& a, const int b) { return a = a >> b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+ static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); }
+ static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
+ static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
+ static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
+ static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
+ static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
+
+ static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) {
+ return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t);
+ }
+#else
+ static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); }
+ static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); }
+ static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b, a)); }
+ static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a < b); }
+ static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, b)); }
+ static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a > b); }
+
+ static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) {
+ return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
+ }
+#endif
+
+ template<int mask>
+ __forceinline vint8 select(const vint8& t, const vint8& f) {
+ return _mm256_blend_epi32(f, t, mask);
+ }
+
+ __forceinline vboolf8 operator ==(const vint8& a, int b) { return a == vint8(b); }
+ __forceinline vboolf8 operator ==(int a, const vint8& b) { return vint8(a) == b; }
+
+ __forceinline vboolf8 operator !=(const vint8& a, int b) { return a != vint8(b); }
+ __forceinline vboolf8 operator !=(int a, const vint8& b) { return vint8(a) != b; }
+
+ __forceinline vboolf8 operator < (const vint8& a, int b) { return a < vint8(b); }
+ __forceinline vboolf8 operator < (int a, const vint8& b) { return vint8(a) < b; }
+
+ __forceinline vboolf8 operator >=(const vint8& a, int b) { return a >= vint8(b); }
+ __forceinline vboolf8 operator >=(int a, const vint8& b) { return vint8(a) >= b; }
+
+ __forceinline vboolf8 operator > (const vint8& a, int b) { return a > vint8(b); }
+ __forceinline vboolf8 operator > (int a, const vint8& b) { return vint8(a) > b; }
+
+ __forceinline vboolf8 operator <=(const vint8& a, int b) { return a <= vint8(b); }
+ __forceinline vboolf8 operator <=(int a, const vint8& b) { return vint8(a) <= b; }
+
+ __forceinline vboolf8 eq(const vint8& a, const vint8& b) { return a == b; }
+ __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; }
+ __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a < b; }
+ __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; }
+ __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a > b; }
+ __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; }
+
+#if defined(__AVX512VL__)
+ static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); }
+ static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); }
+ static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); }
+ static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); }
+ static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); }
+ static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+ static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); }
+ static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); }
+ static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a < b); }
+ static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); }
+ static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a > b); }
+ static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Movement/Shifting/Shuffling Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_unpacklo_epi32(a, b); }
+ __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_unpackhi_epi32(a, b); }
+
+ template<int i>
+ __forceinline vint8 shuffle(const vint8& v) {
+ return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+ }
+
+ template<int i0, int i1>
+ __forceinline vint8 shuffle4(const vint8& v) {
+ return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
+ }
+
+ template<int i0, int i1>
+ __forceinline vint8 shuffle4(const vint8& a, const vint8& b) {
+ return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vint8 shuffle(const vint8& v) {
+ return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vint8 shuffle(const vint8& a, const vint8& b) {
+ return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+ }
+
+ template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); }
+ template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); }
+ template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); }
+
+ __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); }
+
+ template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); }
+ template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); }
+ template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); }
+
+ __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
+
+ __forceinline vint8 permute(const vint8& v, const __m256i& index) {
+ return _mm256_permutevar8x32_epi32(v, index);
+ }
+
+ __forceinline vint8 shuffle(const vint8& v, const __m256i& index) {
+ return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index));
+ }
+
+ template<int i>
+ static __forceinline vint8 align_shift_right(const vint8& a, const vint8& b) {
+#if defined(__AVX512VL__)
+ return _mm256_alignr_epi32(a, b, i);
+#else
+ return _mm256_alignr_epi8(a, b, 4*i);
+#endif
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint8 vreduce_min2(const vint8& v) { return min(v,shuffle<1,0,3,2>(v)); }
+ __forceinline vint8 vreduce_min4(const vint8& v) { vint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
+ __forceinline vint8 vreduce_min (const vint8& v) { vint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
+
+ __forceinline vint8 vreduce_max2(const vint8& v) { return max(v,shuffle<1,0,3,2>(v)); }
+ __forceinline vint8 vreduce_max4(const vint8& v) { vint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+ __forceinline vint8 vreduce_max (const vint8& v) { vint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); }
+
+ __forceinline vint8 vreduce_add2(const vint8& v) { return v + shuffle<1,0,3,2>(v); }
+ __forceinline vint8 vreduce_add4(const vint8& v) { vint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); }
+ __forceinline vint8 vreduce_add (const vint8& v) { vint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); }
+
+ __forceinline int reduce_min(const vint8& v) { return toScalar(vreduce_min(v)); }
+ __forceinline int reduce_max(const vint8& v) { return toScalar(vreduce_max(v)); }
+ __forceinline int reduce_add(const vint8& v) { return toScalar(vreduce_add(v)); }
+
+ __forceinline size_t select_min(const vint8& v) { return bsf(movemask(v == vreduce_min(v))); }
+ __forceinline size_t select_max(const vint8& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+ __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+ __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Sorting networks
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vint8 usort_ascending(const vint8& v)
+ {
+ const vint8 a0 = v;
+ const vint8 b0 = shuffle<1,0,3,2>(a0);
+ const vint8 c0 = umin(a0,b0);
+ const vint8 d0 = umax(a0,b0);
+ const vint8 a1 = select<0x99 /* 0b10011001 */>(c0,d0);
+ const vint8 b1 = shuffle<2,3,0,1>(a1);
+ const vint8 c1 = umin(a1,b1);
+ const vint8 d1 = umax(a1,b1);
+ const vint8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1);
+ const vint8 b2 = shuffle<1,0,3,2>(a2);
+ const vint8 c2 = umin(a2,b2);
+ const vint8 d2 = umax(a2,b2);
+ const vint8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2);
+ const vint8 b3 = shuffle4<1,0>(a3);
+ const vint8 c3 = umin(a3,b3);
+ const vint8 d3 = umax(a3,b3);
+ const vint8 a4 = select<0xf /* 0b00001111 */>(c3,d3);
+ const vint8 b4 = shuffle<2,3,0,1>(a4);
+ const vint8 c4 = umin(a4,b4);
+ const vint8 d4 = umax(a4,b4);
+ const vint8 a5 = select<0x33 /* 0b00110011 */>(c4,d4);
+ const vint8 b5 = shuffle<1,0,3,2>(a5);
+ const vint8 c5 = umin(a5,b5);
+ const vint8 d5 = umax(a5,b5);
+ const vint8 a6 = select<0x55 /* 0b01010101 */>(c5,d5);
+ return a6;
+ }
+
+ __forceinline vint8 usort_descending(const vint8& v)
+ {
+ const vint8 a0 = v;
+ const vint8 b0 = shuffle<1,0,3,2>(a0);
+ const vint8 c0 = umax(a0,b0);
+ const vint8 d0 = umin(a0,b0);
+ const vint8 a1 = select<0x99 /* 0b10011001 */>(c0,d0);
+ const vint8 b1 = shuffle<2,3,0,1>(a1);
+ const vint8 c1 = umax(a1,b1);
+ const vint8 d1 = umin(a1,b1);
+ const vint8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1);
+ const vint8 b2 = shuffle<1,0,3,2>(a2);
+ const vint8 c2 = umax(a2,b2);
+ const vint8 d2 = umin(a2,b2);
+ const vint8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2);
+ const vint8 b3 = shuffle4<1,0>(a3);
+ const vint8 c3 = umax(a3,b3);
+ const vint8 d3 = umin(a3,b3);
+ const vint8 a4 = select<0xf /* 0b00001111 */>(c3,d3);
+ const vint8 b4 = shuffle<2,3,0,1>(a4);
+ const vint8 c4 = umax(a4,b4);
+ const vint8 d4 = umin(a4,b4);
+ const vint8 a5 = select<0x33 /* 0b00110011 */>(c4,d4);
+ const vint8 b5 = shuffle<1,0,3,2>(a5);
+ const vint8 c5 = umax(a5,b5);
+ const vint8 d5 = umin(a5,b5);
+ const vint8 a6 = select<0x55 /* 0b01010101 */>(c5,d5);
+ return a6;
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) {
+ return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vllong4_avx2.h b/thirdparty/embree/common/simd/vllong4_avx2.h
new file mode 100644
index 0000000000..6c86845877
--- /dev/null
+++ b/thirdparty/embree/common/simd/vllong4_avx2.h
@@ -0,0 +1,352 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 4-wide AVX2 64-bit long long type */
+ template<>
+ struct vllong<4>
+ {
+ ALIGNED_STRUCT_(32);
+
+ typedef vboold4 Bool;
+
+ enum { size = 4 }; // number of SIMD elements
+ union { // data
+ __m256i v;
+ long long i[4];
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vllong() {}
+ __forceinline vllong(const vllong4& t) { v = t.v; }
+ __forceinline vllong4& operator =(const vllong4& f) { v = f.v; return *this; }
+
+ __forceinline vllong(const __m256i& t) { v = t; }
+ __forceinline operator __m256i() const { return v; }
+ __forceinline operator __m256d() const { return _mm256_castsi256_pd(v); }
+
+
+ __forceinline vllong(long long i) {
+ v = _mm256_set1_epi64x(i);
+ }
+
+ __forceinline vllong(long long a, long long b, long long c, long long d) {
+ v = _mm256_set_epi64x(d,c,b,a);
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vllong(ZeroTy) : v(_mm256_setzero_si256()) {}
+ __forceinline vllong(OneTy) : v(_mm256_set1_epi64x(1)) {}
+ __forceinline vllong(StepTy) : v(_mm256_set_epi64x(3,2,1,0)) {}
+ __forceinline vllong(ReverseStepTy) : v(_mm256_set_epi64x(0,1,2,3)) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Loads and Stores
+ ////////////////////////////////////////////////////////////////////////////////
+
+ static __forceinline void store_nt(void* __restrict__ ptr, const vllong4& a) {
+ _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(a));
+ }
+
+ static __forceinline vllong4 loadu(const void* addr)
+ {
+ return _mm256_loadu_si256((__m256i*)addr);
+ }
+
+ static __forceinline vllong4 load(const vllong4* addr) {
+ return _mm256_load_si256((__m256i*)addr);
+ }
+
+ static __forceinline vllong4 load(const long long* addr) {
+ return _mm256_load_si256((__m256i*)addr);
+ }
+
+ static __forceinline void store(void* ptr, const vllong4& v) {
+ _mm256_store_si256((__m256i*)ptr,v);
+ }
+
+ static __forceinline void storeu(void* ptr, const vllong4& v) {
+ _mm256_storeu_si256((__m256i*)ptr,v);
+ }
+
+ static __forceinline void storeu(const vboold4& mask, long long* ptr, const vllong4& f) {
+#if defined(__AVX512VL__)
+ _mm256_mask_storeu_epi64(ptr,mask,f);
+#else
+ _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f));
+#endif
+ }
+
+ static __forceinline void store(const vboold4& mask, void* ptr, const vllong4& f) {
+#if defined(__AVX512VL__)
+ _mm256_mask_store_epi64(ptr,mask,f);
+#else
+ _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f));
+#endif
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline long long& operator [](size_t index) { assert(index < 4); return i[index]; }
+ __forceinline const long long& operator [](size_t index) const { assert(index < 4); return i[index]; }
+
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vllong4 select(const vboold4& m, const vllong4& t, const vllong4& f) {
+ #if defined(__AVX512VL__)
+ return _mm256_mask_blend_epi64(m, f, t);
+ #else
+ return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(f), _mm256_castsi256_pd(t), m));
+ #endif
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+ __forceinline vboold4 asBool(const vllong4& a) { return _mm256_movepi64_mask(a); }
+#else
+ __forceinline vboold4 asBool(const vllong4& a) { return _mm256_castsi256_pd(a); }
+#endif
+
+ __forceinline vllong4 operator +(const vllong4& a) { return a; }
+ __forceinline vllong4 operator -(const vllong4& a) { return _mm256_sub_epi64(_mm256_setzero_si256(), a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vllong4 operator +(const vllong4& a, const vllong4& b) { return _mm256_add_epi64(a, b); }
+ __forceinline vllong4 operator +(const vllong4& a, long long b) { return a + vllong4(b); }
+ __forceinline vllong4 operator +(long long a, const vllong4& b) { return vllong4(a) + b; }
+
+ __forceinline vllong4 operator -(const vllong4& a, const vllong4& b) { return _mm256_sub_epi64(a, b); }
+ __forceinline vllong4 operator -(const vllong4& a, long long b) { return a - vllong4(b); }
+ __forceinline vllong4 operator -(long long a, const vllong4& b) { return vllong4(a) - b; }
+
+ /* only low 32bit part */
+ __forceinline vllong4 operator *(const vllong4& a, const vllong4& b) { return _mm256_mul_epi32(a, b); }
+ __forceinline vllong4 operator *(const vllong4& a, long long b) { return a * vllong4(b); }
+ __forceinline vllong4 operator *(long long a, const vllong4& b) { return vllong4(a) * b; }
+
+ __forceinline vllong4 operator &(const vllong4& a, const vllong4& b) { return _mm256_and_si256(a, b); }
+ __forceinline vllong4 operator &(const vllong4& a, long long b) { return a & vllong4(b); }
+ __forceinline vllong4 operator &(long long a, const vllong4& b) { return vllong4(a) & b; }
+
+ __forceinline vllong4 operator |(const vllong4& a, const vllong4& b) { return _mm256_or_si256(a, b); }
+ __forceinline vllong4 operator |(const vllong4& a, long long b) { return a | vllong4(b); }
+ __forceinline vllong4 operator |(long long a, const vllong4& b) { return vllong4(a) | b; }
+
+ __forceinline vllong4 operator ^(const vllong4& a, const vllong4& b) { return _mm256_xor_si256(a, b); }
+ __forceinline vllong4 operator ^(const vllong4& a, long long b) { return a ^ vllong4(b); }
+ __forceinline vllong4 operator ^(long long a, const vllong4& b) { return vllong4(a) ^ b; }
+
+ __forceinline vllong4 operator <<(const vllong4& a, long long n) { return _mm256_slli_epi64(a, (int)n); }
+ //__forceinline vllong4 operator >>(const vllong4& a, long long n) { return _mm256_srai_epi64(a, n); }
+
+ __forceinline vllong4 operator <<(const vllong4& a, const vllong4& n) { return _mm256_sllv_epi64(a, n); }
+ //__forceinline vllong4 operator >>(const vllong4& a, const vllong4& n) { return _mm256_srav_epi64(a, n); }
+ //__forceinline vllong4 sra(const vllong4& a, long long b) { return _mm256_srai_epi64(a, b); }
+
+ __forceinline vllong4 srl(const vllong4& a, long long b) { return _mm256_srli_epi64(a, (int)b); }
+
+ //__forceinline vllong4 min(const vllong4& a, const vllong4& b) { return _mm256_min_epi64(a, b); }
+ //__forceinline vllong4 min(const vllong4& a, long long b) { return min(a,vllong4(b)); }
+ //__forceinline vllong4 min(long long a, const vllong4& b) { return min(vllong4(a),b); }
+
+ //__forceinline vllong4 max(const vllong4& a, const vllong4& b) { return _mm256_max_epi64(a, b); }
+ //__forceinline vllong4 max(const vllong4& a, long long b) { return max(a,vllong4(b)); }
+ //__forceinline vllong4 max(long long a, const vllong4& b) { return max(vllong4(a),b); }
+
+#if defined(__AVX512VL__)
+ __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_and_epi64(c,m,a,b); }
+ __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_or_epi64(c,m,a,b); }
+#else
+ __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a & b, c); }
+ __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a | b, c); }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vllong4& operator +=(vllong4& a, const vllong4& b) { return a = a + b; }
+ __forceinline vllong4& operator +=(vllong4& a, long long b) { return a = a + b; }
+
+ __forceinline vllong4& operator -=(vllong4& a, const vllong4& b) { return a = a - b; }
+ __forceinline vllong4& operator -=(vllong4& a, long long b) { return a = a - b; }
+
+ __forceinline vllong4& operator *=(vllong4& a, const vllong4& b) { return a = a * b; }
+ __forceinline vllong4& operator *=(vllong4& a, long long b) { return a = a * b; }
+
+ __forceinline vllong4& operator &=(vllong4& a, const vllong4& b) { return a = a & b; }
+ __forceinline vllong4& operator &=(vllong4& a, long long b) { return a = a & b; }
+
+ __forceinline vllong4& operator |=(vllong4& a, const vllong4& b) { return a = a | b; }
+ __forceinline vllong4& operator |=(vllong4& a, long long b) { return a = a | b; }
+
+ __forceinline vllong4& operator <<=(vllong4& a, long long b) { return a = a << b; }
+ //__forceinline vllong4& operator >>=(vllong4& a, long long b) { return a = a >> b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+ __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); }
+ __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
+ __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
+ __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
+ __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
+ __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }
+#else
+ __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmpeq_epi64(a,b); }
+ __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return !(a == b); }
+ __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(a,b); }
+ __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(b,a); }
+ __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return !(a < b); }
+ __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return !(a > b); }
+#endif
+
+ __forceinline vboold4 operator ==(const vllong4& a, long long b) { return a == vllong4(b); }
+ __forceinline vboold4 operator ==(long long a, const vllong4& b) { return vllong4(a) == b; }
+
+ __forceinline vboold4 operator !=(const vllong4& a, long long b) { return a != vllong4(b); }
+ __forceinline vboold4 operator !=(long long a, const vllong4& b) { return vllong4(a) != b; }
+
+ __forceinline vboold4 operator > (const vllong4& a, long long b) { return a > vllong4(b); }
+ __forceinline vboold4 operator > (long long a, const vllong4& b) { return vllong4(a) > b; }
+
+ __forceinline vboold4 operator < (const vllong4& a, long long b) { return a < vllong4(b); }
+ __forceinline vboold4 operator < (long long a, const vllong4& b) { return vllong4(a) < b; }
+
+ __forceinline vboold4 operator >=(const vllong4& a, long long b) { return a >= vllong4(b); }
+ __forceinline vboold4 operator >=(long long a, const vllong4& b) { return vllong4(a) >= b; }
+
+ __forceinline vboold4 operator <=(const vllong4& a, long long b) { return a <= vllong4(b); }
+ __forceinline vboold4 operator <=(long long a, const vllong4& b) { return vllong4(a) <= b; }
+
+ __forceinline vboold4 eq(const vllong4& a, const vllong4& b) { return a == b; }
+ __forceinline vboold4 ne(const vllong4& a, const vllong4& b) { return a != b; }
+ __forceinline vboold4 lt(const vllong4& a, const vllong4& b) { return a < b; }
+ __forceinline vboold4 ge(const vllong4& a, const vllong4& b) { return a >= b; }
+ __forceinline vboold4 gt(const vllong4& a, const vllong4& b) { return a > b; }
+ __forceinline vboold4 le(const vllong4& a, const vllong4& b) { return a <= b; }
+
+#if defined(__AVX512VL__)
+ __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_EQ); }
+ __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_NE); }
+ __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LT); }
+ __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GE); }
+ __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GT); }
+ __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+ __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a == b); }
+ __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a != b); }
+ __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a < b); }
+ __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a >= b); }
+ __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a > b); }
+ __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a <= b); }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Movement/Shifting/Shuffling Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<int i0, int i1>
+ __forceinline vllong4 shuffle(const vllong4& v) {
+ return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(v), (i1 << 3) | (i0 << 2) | (i1 << 1) | i0));
+ }
+
+ template<int i>
+ __forceinline vllong4 shuffle(const vllong4& v) {
+ return shuffle<i, i>(v);
+ }
+
+ template<int i0, int i1>
+ __forceinline vllong4 shuffle2(const vllong4& v) {
+ return _mm256_castpd_si256(_mm256_permute2f128_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(v), (i1 << 4) | i0));
+ }
+
+ __forceinline long long toScalar(const vllong4& v) {
+ return _mm_cvtsi128_si64(_mm256_castsi256_si128(v));
+ }
+
+#if defined(__AVX512VL__)
+ __forceinline vllong4 permute(const vllong4& a, const __m256i& index) {
+ // workaround for GCC 7.x
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
+ return _mm256_permutex2var_epi64(a,index,a);
+#else
+ return _mm256_permutexvar_epi64(index,a);
+#endif
+ }
+
+ __forceinline vllong4 permutex2var(const vllong4& index, const vllong4& a, const vllong4& b) {
+ return _mm256_permutex2var_epi64(a,index,b);
+ }
+
+#endif
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+
+ __forceinline vllong4 vreduce_and2(const vllong4& x) { return x & shuffle<1,0>(x); }
+ __forceinline vllong4 vreduce_and (const vllong4& y) { const vllong4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); }
+
+ __forceinline vllong4 vreduce_or2(const vllong4& x) { return x | shuffle<1,0>(x); }
+ __forceinline vllong4 vreduce_or (const vllong4& y) { const vllong4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); }
+
+ __forceinline vllong4 vreduce_add2(const vllong4& x) { return x + shuffle<1,0>(x); }
+ __forceinline vllong4 vreduce_add (const vllong4& y) { const vllong4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); }
+
+ __forceinline long long reduce_add(const vllong4& a) { return toScalar(vreduce_add(a)); }
+ __forceinline long long reduce_or (const vllong4& a) { return toScalar(vreduce_or(a)); }
+ __forceinline long long reduce_and(const vllong4& a) { return toScalar(vreduce_and(a)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vllong4& v)
+ {
+ cout << "<" << v[0];
+ for (size_t i=1; i<4; i++) cout << ", " << v[i];
+ cout << ">";
+ return cout;
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vllong8_avx512.h b/thirdparty/embree/common/simd/vllong8_avx512.h
new file mode 100644
index 0000000000..ee69411637
--- /dev/null
+++ b/thirdparty/embree/common/simd/vllong8_avx512.h
@@ -0,0 +1,358 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 8-wide AVX-512 64-bit long long type */
+ template<>
+ struct vllong<8>
+ {
+ ALIGNED_STRUCT_(64);
+
+ typedef vboold8 Bool;
+
+ enum { size = 8 }; // number of SIMD elements
+ union { // data
+ __m512i v;
+ long long i[8];
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vllong() {}
+ __forceinline vllong(const vllong8& t) { v = t.v; }
+ __forceinline vllong8& operator =(const vllong8& f) { v = f.v; return *this; }
+
+ __forceinline vllong(const __m512i& t) { v = t; }
+ __forceinline operator __m512i() const { return v; }
+ __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); }
+
+ __forceinline vllong(long long i) {
+ v = _mm512_set1_epi64(i);
+ }
+
+ __forceinline vllong(long long a, long long b, long long c, long long d) {
+ v = _mm512_set4_epi64(d,c,b,a);
+ }
+
+ __forceinline vllong(long long a0, long long a1, long long a2, long long a3,
+ long long a4, long long a5, long long a6, long long a7)
+ {
+ v = _mm512_set_epi64(a7,a6,a5,a4,a3,a2,a1,a0);
+ }
+
+ __forceinline vllong(const vllong<4>& i) {
+ v = _mm512_broadcast_i64x4(i);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vllong(ZeroTy) : v(_mm512_setzero_epi32()) {}
+ __forceinline vllong(OneTy) : v(_mm512_set1_epi64(1)) {}
+ __forceinline vllong(StepTy) : v(_mm512_set_epi64(7,6,5,4,3,2,1,0)) {}
+ __forceinline vllong(ReverseStepTy) : v(_mm512_setr_epi64(7,6,5,4,3,2,1,0)) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Loads and Stores
+ ////////////////////////////////////////////////////////////////////////////////
+
+ static __forceinline void store_nt(void* __restrict__ ptr, const vllong8& a) {
+ _mm512_stream_si512((__m512i*)ptr,a);
+ }
+
+ static __forceinline vllong8 loadu(const void* addr) {
+ return _mm512_loadu_si512(addr);
+ }
+
+ static __forceinline vllong8 load(const vllong8* addr) {
+ return _mm512_load_si512(addr);
+ }
+
+ static __forceinline vllong8 load(const long long* addr) {
+ return _mm512_load_si512(addr);
+ }
+
+ static __forceinline vllong8 load(const unsigned char* ptr) {
+ return _mm512_cvtepu8_epi64(*(__m128i*)ptr);
+ }
+
+ static __forceinline void store(void* ptr, const vllong8& v) {
+ _mm512_store_si512(ptr,v);
+ }
+
+ static __forceinline void storeu(void* ptr, const vllong8& v) {
+ _mm512_storeu_si512(ptr,v);
+ }
+
+ static __forceinline void storeu(const vboold8& mask, long long* ptr, const vllong8& f) {
+ _mm512_mask_storeu_epi64(ptr,mask,f);
+ }
+
+ static __forceinline void store(const vboold8& mask, void* addr, const vllong8& v2) {
+ _mm512_mask_store_epi64(addr,mask,v2);
+ }
+
+ static __forceinline vllong8 compact(const vboold8& mask, vllong8& v) {
+ return _mm512_mask_compress_epi64(v,mask,v);
+ }
+
+ static __forceinline vllong8 compact(const vboold8& mask, const vllong8& a, vllong8& b) {
+ return _mm512_mask_compress_epi64(a,mask,b);
+ }
+
+ static __forceinline vllong8 expand(const vboold8& mask, const vllong8& a, vllong8& b) {
+ return _mm512_mask_expand_epi64(b,mask,a);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline long long& operator [](size_t index) { assert(index < 8); return i[index]; }
+ __forceinline const long long& operator [](size_t index) const { assert(index < 8); return i[index]; }
+
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold8 asBool(const vllong8& a) { return _mm512_movepi64_mask(a); }
+
+ __forceinline vllong8 operator +(const vllong8& a) { return a; }
+ __forceinline vllong8 operator -(const vllong8& a) { return _mm512_sub_epi64(_mm512_setzero_epi32(), a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vllong8 operator +(const vllong8& a, const vllong8& b) { return _mm512_add_epi64(a, b); }
+ __forceinline vllong8 operator +(const vllong8& a, long long b) { return a + vllong8(b); }
+ __forceinline vllong8 operator +(long long a, const vllong8& b) { return vllong8(a) + b; }
+
+ __forceinline vllong8 operator -(const vllong8& a, const vllong8& b) { return _mm512_sub_epi64(a, b); }
+ __forceinline vllong8 operator -(const vllong8& a, long long b) { return a - vllong8(b); }
+ __forceinline vllong8 operator -(long long a, const vllong8& b) { return vllong8(a) - b; }
+
+ __forceinline vllong8 operator *(const vllong8& a, const vllong8& b) { return _mm512_mullo_epi64(a, b); }
+ __forceinline vllong8 operator *(const vllong8& a, long long b) { return a * vllong8(b); }
+ __forceinline vllong8 operator *(long long a, const vllong8& b) { return vllong8(a) * b; }
+
+ __forceinline vllong8 operator &(const vllong8& a, const vllong8& b) { return _mm512_and_epi64(a, b); }
+ __forceinline vllong8 operator &(const vllong8& a, long long b) { return a & vllong8(b); }
+ __forceinline vllong8 operator &(long long a, const vllong8& b) { return vllong8(a) & b; }
+
+ __forceinline vllong8 operator |(const vllong8& a, const vllong8& b) { return _mm512_or_epi64(a, b); }
+ __forceinline vllong8 operator |(const vllong8& a, long long b) { return a | vllong8(b); }
+ __forceinline vllong8 operator |(long long a, const vllong8& b) { return vllong8(a) | b; }
+
+ __forceinline vllong8 operator ^(const vllong8& a, const vllong8& b) { return _mm512_xor_epi64(a, b); }
+ __forceinline vllong8 operator ^(const vllong8& a, long long b) { return a ^ vllong8(b); }
+ __forceinline vllong8 operator ^(long long a, const vllong8& b) { return vllong8(a) ^ b; }
+
+ __forceinline vllong8 operator <<(const vllong8& a, long long n) { return _mm512_slli_epi64(a, n); }
+ __forceinline vllong8 operator >>(const vllong8& a, long long n) { return _mm512_srai_epi64(a, n); }
+
+ __forceinline vllong8 operator <<(const vllong8& a, const vllong8& n) { return _mm512_sllv_epi64(a, n); }
+ __forceinline vllong8 operator >>(const vllong8& a, const vllong8& n) { return _mm512_srav_epi64(a, n); }
+
+ __forceinline vllong8 sll (const vllong8& a, long long b) { return _mm512_slli_epi64(a, b); }
+ __forceinline vllong8 sra (const vllong8& a, long long b) { return _mm512_srai_epi64(a, b); }
+ __forceinline vllong8 srl (const vllong8& a, long long b) { return _mm512_srli_epi64(a, b); }
+
+ __forceinline vllong8 min(const vllong8& a, const vllong8& b) { return _mm512_min_epi64(a, b); }
+ __forceinline vllong8 min(const vllong8& a, long long b) { return min(a,vllong8(b)); }
+ __forceinline vllong8 min(long long a, const vllong8& b) { return min(vllong8(a),b); }
+
+ __forceinline vllong8 max(const vllong8& a, const vllong8& b) { return _mm512_max_epi64(a, b); }
+ __forceinline vllong8 max(const vllong8& a, long long b) { return max(a,vllong8(b)); }
+ __forceinline vllong8 max(long long a, const vllong8& b) { return max(vllong8(a),b); }
+
+ __forceinline vllong8 mask_add(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_add_epi64(c,m,a,b); }
+ __forceinline vllong8 mask_sub(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_sub_epi64(c,m,a,b); }
+
+ __forceinline vllong8 mask_and(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_and_epi64(c,m,a,b); }
+ __forceinline vllong8 mask_or (const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_or_epi64(c,m,a,b); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vllong8& operator +=(vllong8& a, const vllong8& b) { return a = a + b; }
+ __forceinline vllong8& operator +=(vllong8& a, long long b) { return a = a + b; }
+
+ __forceinline vllong8& operator -=(vllong8& a, const vllong8& b) { return a = a - b; }
+ __forceinline vllong8& operator -=(vllong8& a, long long b) { return a = a - b; }
+
+ __forceinline vllong8& operator *=(vllong8& a, const vllong8& b) { return a = a * b; }
+ __forceinline vllong8& operator *=(vllong8& a, long long b) { return a = a * b; }
+
+ __forceinline vllong8& operator &=(vllong8& a, const vllong8& b) { return a = a & b; }
+ __forceinline vllong8& operator &=(vllong8& a, long long b) { return a = a & b; }
+
+ __forceinline vllong8& operator |=(vllong8& a, const vllong8& b) { return a = a | b; }
+ __forceinline vllong8& operator |=(vllong8& a, long long b) { return a = a | b; }
+
+ __forceinline vllong8& operator <<=(vllong8& a, long long b) { return a = a << b; }
+ __forceinline vllong8& operator >>=(vllong8& a, long long b) { return a = a >> b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboold8 operator ==(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); }
+ __forceinline vboold8 operator ==(const vllong8& a, long long b) { return a == vllong8(b); }
+ __forceinline vboold8 operator ==(long long a, const vllong8& b) { return vllong8(a) == b; }
+
+ __forceinline vboold8 operator !=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
+ __forceinline vboold8 operator !=(const vllong8& a, long long b) { return a != vllong8(b); }
+ __forceinline vboold8 operator !=(long long a, const vllong8& b) { return vllong8(a) != b; }
+
+ __forceinline vboold8 operator < (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
+ __forceinline vboold8 operator < (const vllong8& a, long long b) { return a < vllong8(b); }
+ __forceinline vboold8 operator < (long long a, const vllong8& b) { return vllong8(a) < b; }
+
+ __forceinline vboold8 operator >=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
+ __forceinline vboold8 operator >=(const vllong8& a, long long b) { return a >= vllong8(b); }
+ __forceinline vboold8 operator >=(long long a, const vllong8& b) { return vllong8(a) >= b; }
+
+ __forceinline vboold8 operator > (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
+ __forceinline vboold8 operator > (const vllong8& a, long long b) { return a > vllong8(b); }
+ __forceinline vboold8 operator > (long long a, const vllong8& b) { return vllong8(a) > b; }
+
+ __forceinline vboold8 operator <=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }
+ __forceinline vboold8 operator <=(const vllong8& a, long long b) { return a <= vllong8(b); }
+ __forceinline vboold8 operator <=(long long a, const vllong8& b) { return vllong8(a) <= b; }
+
+ __forceinline vboold8 eq(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); }
+ __forceinline vboold8 ne(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
+ __forceinline vboold8 lt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
+ __forceinline vboold8 ge(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
+ __forceinline vboold8 gt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
+ __forceinline vboold8 le(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }
+
+ __forceinline vboold8 eq(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_EQ); }
+ __forceinline vboold8 ne(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_NE); }
+ __forceinline vboold8 lt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LT); }
+ __forceinline vboold8 ge(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GE); }
+ __forceinline vboold8 gt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GT); }
+ __forceinline vboold8 le(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LE); }
+
+ __forceinline vllong8 select(const vboold8& m, const vllong8& t, const vllong8& f) {
+ return _mm512_mask_or_epi64(f,m,t,t);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Movement/Shifting/Shuffling Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<int i0, int i1>
+ __forceinline vllong8 shuffle(const vllong8& v) {
+ return _mm512_castpd_si512(_mm512_permute_pd(_mm512_castsi512_pd(v), (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0));
+ }
+
+ template<int i>
+ __forceinline vllong8 shuffle(const vllong8& v) {
+ return shuffle<i, i>(v);
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vllong8 shuffle(const vllong8& v) {
+ return _mm512_permutex_epi64(v, _MM_SHUFFLE(i3, i2, i1, i0));
+ }
+
+ template<int i0, int i1>
+ __forceinline vllong8 shuffle4(const vllong8& v) {
+ return _mm512_shuffle_i64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2));
+ }
+
+ template<int i>
+ __forceinline vllong8 shuffle4(const vllong8& v) {
+ return shuffle4<i, i>(v);
+ }
+
+ template<int i>
+ __forceinline vllong8 align_shift_right(const vllong8& a, const vllong8& b) {
+ return _mm512_alignr_epi64(a, b, i);
+ };
+
+ __forceinline long long toScalar(const vllong8& v) {
+ return _mm_cvtsi128_si64(_mm512_castsi512_si128(v));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vllong8 vreduce_min2(vllong8 x) { return min(x, shuffle<1,0,3,2>(x)); }
+ __forceinline vllong8 vreduce_min4(vllong8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }
+ __forceinline vllong8 vreduce_min (vllong8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); }
+
+ __forceinline vllong8 vreduce_max2(vllong8 x) { return max(x, shuffle<1,0,3,2>(x)); }
+ __forceinline vllong8 vreduce_max4(vllong8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
+ __forceinline vllong8 vreduce_max (vllong8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); }
+
+ __forceinline vllong8 vreduce_and2(vllong8 x) { return x & shuffle<1,0,3,2>(x); }
+ __forceinline vllong8 vreduce_and4(vllong8 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); }
+ __forceinline vllong8 vreduce_and (vllong8 x) { x = vreduce_and4(x); return x & shuffle4<1,0>(x); }
+
+ __forceinline vllong8 vreduce_or2(vllong8 x) { return x | shuffle<1,0,3,2>(x); }
+ __forceinline vllong8 vreduce_or4(vllong8 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); }
+ __forceinline vllong8 vreduce_or (vllong8 x) { x = vreduce_or4(x); return x | shuffle4<1,0>(x); }
+
+ __forceinline vllong8 vreduce_add2(vllong8 x) { return x + shuffle<1,0,3,2>(x); }
+ __forceinline vllong8 vreduce_add4(vllong8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }
+ __forceinline vllong8 vreduce_add (vllong8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); }
+
+ __forceinline long long reduce_min(const vllong8& v) { return toScalar(vreduce_min(v)); }
+ __forceinline long long reduce_max(const vllong8& v) { return toScalar(vreduce_max(v)); }
+ __forceinline long long reduce_and(const vllong8& v) { return toScalar(vreduce_and(v)); }
+ __forceinline long long reduce_or (const vllong8& v) { return toScalar(vreduce_or (v)); }
+ __forceinline long long reduce_add(const vllong8& v) { return toScalar(vreduce_add(v)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Memory load and store operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vllong8 permute(const vllong8& v, const vllong8& index) {
+ return _mm512_permutexvar_epi64(index,v);
+ }
+
+ __forceinline vllong8 reverse(const vllong8& a) {
+ return permute(a,vllong8(reverse_step));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vllong8& v)
+ {
+ cout << "<" << v[0];
+ for (size_t i=1; i<8; i++) cout << ", " << v[i];
+ cout << ">";
+ return cout;
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vuint16_avx512.h b/thirdparty/embree/common/simd/vuint16_avx512.h
new file mode 100644
index 0000000000..c9eb6682ff
--- /dev/null
+++ b/thirdparty/embree/common/simd/vuint16_avx512.h
@@ -0,0 +1,424 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 16-wide AVX-512 unsigned integer type */
+ template<>
+ struct vuint<16>
+ {
+ ALIGNED_STRUCT_(64);
+
+ typedef vboolf16 Bool;
+ typedef vuint16 UInt;
+ typedef vfloat16 Float;
+
+ enum { size = 16 }; // number of SIMD elements
+ union { // data
+ __m512i v;
+ unsigned int i[16];
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint() {}
+ __forceinline vuint(const vuint16& t) { v = t.v; }
+ __forceinline vuint16& operator =(const vuint16& f) { v = f.v; return *this; }
+
+ __forceinline vuint(const __m512i& t) { v = t; }
+ __forceinline operator __m512i() const { return v; }
+ __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); }
+
+ __forceinline vuint(unsigned int i) {
+ v = _mm512_set1_epi32(i);
+ }
+
+ __forceinline vuint(const vuint4& i) {
+ v = _mm512_broadcast_i32x4(i);
+ }
+
+ __forceinline vuint(const vuint8& i) {
+ v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i))));
+ }
+
+ __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) {
+ v = _mm512_set4_epi32(d,c,b,a);
+ }
+
+ __forceinline vuint(unsigned int a0 , unsigned int a1 , unsigned int a2 , unsigned int a3,
+ unsigned int a4 , unsigned int a5 , unsigned int a6 , unsigned int a7,
+ unsigned int a8 , unsigned int a9 , unsigned int a10, unsigned int a11,
+ unsigned int a12, unsigned int a13, unsigned int a14, unsigned int a15)
+ {
+ v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0);
+ }
+
+ __forceinline explicit vuint(const __m512& f) {
+ v = _mm512_cvtps_epu32(f);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint(ZeroTy) : v(_mm512_setzero_epi32()) {}
+ __forceinline vuint(OneTy) : v(_mm512_set1_epi32(1)) {}
+ __forceinline vuint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {}
+ __forceinline vuint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Loads and Stores
+ ////////////////////////////////////////////////////////////////////////////////
+
+ static __forceinline void store_nt(void* __restrict__ ptr, const vuint16& a) {
+ _mm512_stream_si512((__m512i*)ptr,a);
+ }
+
+ static __forceinline vuint16 loadu(const void* addr)
+ {
+ return _mm512_loadu_si512(addr);
+ }
+
+ static __forceinline vuint16 loadu(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); }
+ static __forceinline vuint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); }
+
+ static __forceinline vuint16 load(const vuint16* addr) {
+ return _mm512_load_si512(addr);
+ }
+
+ static __forceinline vuint16 load(const unsigned int* addr) {
+ return _mm512_load_si512(addr);
+ }
+
+ static __forceinline vuint16 load(unsigned short* ptr) { return _mm512_cvtepu16_epi32(*(__m256i*)ptr); }
+
+
+ static __forceinline void store(void* ptr, const vuint16& v) {
+ _mm512_store_si512(ptr,v);
+ }
+
+ static __forceinline void storeu(void* ptr, const vuint16& v) {
+ _mm512_storeu_si512(ptr,v);
+ }
+
+ static __forceinline void storeu(const vboolf16& mask, void* ptr, const vuint16& f) {
+ _mm512_mask_storeu_epi32(ptr,mask,f);
+ }
+
+ static __forceinline void store(const vboolf16& mask, void* addr, const vuint16& v2) {
+ _mm512_mask_store_epi32(addr,mask,v2);
+ }
+
+ static __forceinline vuint16 compact(const vboolf16& mask, vuint16& v) {
+ return _mm512_mask_compress_epi32(v,mask,v);
+ }
+
+ static __forceinline vuint16 compact(const vboolf16& mask, const vuint16& a, vuint16& b) {
+ return _mm512_mask_compress_epi32(a,mask,b);
+ }
+
+ static __forceinline vuint16 expand(const vboolf16& mask, const vuint16& a, vuint16& b) {
+ return _mm512_mask_expand_epi32(b,mask,a);
+ }
+
+ template<int scale = 4>
+ static __forceinline vuint16 gather(const unsigned int* ptr, const vint16& index) {
+ return _mm512_i32gather_epi32(index,ptr,scale);
+ }
+
+ template<int scale = 4>
+ static __forceinline vuint16 gather(const vboolf16& mask, const unsigned int* ptr, const vint16& index) {
+ return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale);
+ }
+
+ template<int scale = 4>
+ static __forceinline vuint16 gather(const vboolf16& mask, vuint16& dest, const unsigned int* ptr, const vint16& index) {
+ return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale);
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(unsigned int* ptr, const vint16& index, const vuint16& v) {
+ _mm512_i32scatter_epi32((int*)ptr,index,v,scale);
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(const vboolf16& mask, unsigned int* ptr, const vint16& index, const vuint16& v) {
+ _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline unsigned int& operator [](size_t index) { assert(index < 16); return i[index]; }
+ __forceinline const unsigned int& operator [](size_t index) const { assert(index < 16); return i[index]; }
+
+ __forceinline unsigned int uint (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; }
+ __forceinline size_t& uint64_t(size_t index) const { assert(index < 8); return ((size_t*)i)[index]; }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf16 asBool(const vuint16& a) { return _mm512_movepi32_mask(a); }
+
+ __forceinline vuint16 operator +(const vuint16& a) { return a; }
+ __forceinline vuint16 operator -(const vuint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint16 operator +(const vuint16& a, const vuint16& b) { return _mm512_add_epi32(a, b); }
+ __forceinline vuint16 operator +(const vuint16& a, unsigned int b) { return a + vuint16(b); }
+ __forceinline vuint16 operator +(unsigned int a, const vuint16& b) { return vuint16(a) + b; }
+
+ __forceinline vuint16 operator -(const vuint16& a, const vuint16& b) { return _mm512_sub_epi32(a, b); }
+ __forceinline vuint16 operator -(const vuint16& a, unsigned int b) { return a - vuint16(b); }
+ __forceinline vuint16 operator -(unsigned int a, const vuint16& b) { return vuint16(a) - b; }
+
+ __forceinline vuint16 operator *(const vuint16& a, const vuint16& b) { return _mm512_mul_epu32(a, b); }
+ __forceinline vuint16 operator *(const vuint16& a, unsigned int b) { return a * vuint16(b); }
+ __forceinline vuint16 operator *(unsigned int a, const vuint16& b) { return vuint16(a) * b; }
+
+ __forceinline vuint16 operator &(const vuint16& a, const vuint16& b) { return _mm512_and_epi32(a, b); }
+ __forceinline vuint16 operator &(const vuint16& a, unsigned int b) { return a & vuint16(b); }
+ __forceinline vuint16 operator &(unsigned int a, const vuint16& b) { return vuint16(a) & b; }
+
+ __forceinline vuint16 operator |(const vuint16& a, const vuint16& b) { return _mm512_or_epi32(a, b); }
+ __forceinline vuint16 operator |(const vuint16& a, unsigned int b) { return a | vuint16(b); }
+ __forceinline vuint16 operator |(unsigned int a, const vuint16& b) { return vuint16(a) | b; }
+
+ __forceinline vuint16 operator ^(const vuint16& a, const vuint16& b) { return _mm512_xor_epi32(a, b); }
+ __forceinline vuint16 operator ^(const vuint16& a, unsigned int b) { return a ^ vuint16(b); }
+ __forceinline vuint16 operator ^(unsigned int a, const vuint16& b) { return vuint16(a) ^ b; }
+
+ __forceinline vuint16 operator <<(const vuint16& a, unsigned int n) { return _mm512_slli_epi32(a, n); }
+ __forceinline vuint16 operator >>(const vuint16& a, unsigned int n) { return _mm512_srli_epi32(a, n); }
+
+ __forceinline vuint16 operator <<(const vuint16& a, const vuint16& n) { return _mm512_sllv_epi32(a, n); }
+ __forceinline vuint16 operator >>(const vuint16& a, const vuint16& n) { return _mm512_srlv_epi32(a, n); }
+
+ __forceinline vuint16 sll (const vuint16& a, unsigned int b) { return _mm512_slli_epi32(a, b); }
+ __forceinline vuint16 sra (const vuint16& a, unsigned int b) { return _mm512_srai_epi32(a, b); }
+ __forceinline vuint16 srl (const vuint16& a, unsigned int b) { return _mm512_srli_epi32(a, b); }
+
+ __forceinline vuint16 min(const vuint16& a, const vuint16& b) { return _mm512_min_epu32(a, b); }
+ __forceinline vuint16 min(const vuint16& a, unsigned int b) { return min(a,vuint16(b)); }
+ __forceinline vuint16 min(unsigned int a, const vuint16& b) { return min(vuint16(a),b); }
+
+ __forceinline vuint16 max(const vuint16& a, const vuint16& b) { return _mm512_max_epu32(a, b); }
+ __forceinline vuint16 max(const vuint16& a, unsigned int b) { return max(a,vuint16(b)); }
+ __forceinline vuint16 max(unsigned int a, const vuint16& b) { return max(vuint16(a),b); }
+
+ __forceinline vuint16 mask_add(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); }
+ __forceinline vuint16 mask_sub(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); }
+
+ __forceinline vuint16 mask_and(const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_and_epi32(c,m,a,b); }
+ __forceinline vuint16 mask_or (const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_or_epi32(c,m,a,b); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint16& operator +=(vuint16& a, const vuint16& b) { return a = a + b; }
+ __forceinline vuint16& operator +=(vuint16& a, unsigned int b) { return a = a + b; }
+
+ __forceinline vuint16& operator -=(vuint16& a, const vuint16& b) { return a = a - b; }
+ __forceinline vuint16& operator -=(vuint16& a, unsigned int b) { return a = a - b; }
+
+ __forceinline vuint16& operator *=(vuint16& a, const vuint16& b) { return a = a * b; }
+ __forceinline vuint16& operator *=(vuint16& a, unsigned int b) { return a = a * b; }
+
+ __forceinline vuint16& operator &=(vuint16& a, const vuint16& b) { return a = a & b; }
+ __forceinline vuint16& operator &=(vuint16& a, unsigned int b) { return a = a & b; }
+
+ __forceinline vuint16& operator |=(vuint16& a, const vuint16& b) { return a = a | b; }
+ __forceinline vuint16& operator |=(vuint16& a, unsigned int b) { return a = a | b; }
+
+ __forceinline vuint16& operator <<=(vuint16& a, unsigned int b) { return a = a << b; }
+ __forceinline vuint16& operator >>=(vuint16& a, unsigned int b) { return a = a >> b; }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf16 operator ==(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); }
+ __forceinline vboolf16 operator ==(const vuint16& a, unsigned int b) { return a == vuint16(b); }
+ __forceinline vboolf16 operator ==(unsigned int a, const vuint16& b) { return vuint16(a) == b; }
+
+ __forceinline vboolf16 operator !=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); }
+ __forceinline vboolf16 operator !=(const vuint16& a, unsigned int b) { return a != vuint16(b); }
+ __forceinline vboolf16 operator !=(unsigned int a, const vuint16& b) { return vuint16(a) != b; }
+
+ __forceinline vboolf16 operator < (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); }
+ __forceinline vboolf16 operator < (const vuint16& a, unsigned int b) { return a < vuint16(b); }
+ __forceinline vboolf16 operator < (unsigned int a, const vuint16& b) { return vuint16(a) < b; }
+
+ __forceinline vboolf16 operator >=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); }
+ __forceinline vboolf16 operator >=(const vuint16& a, unsigned int b) { return a >= vuint16(b); }
+ __forceinline vboolf16 operator >=(unsigned int a, const vuint16& b) { return vuint16(a) >= b; }
+
+ __forceinline vboolf16 operator > (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); }
+ __forceinline vboolf16 operator > (const vuint16& a, unsigned int b) { return a > vuint16(b); }
+ __forceinline vboolf16 operator > (unsigned int a, const vuint16& b) { return vuint16(a) > b; }
+
+ __forceinline vboolf16 operator <=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); }
+ __forceinline vboolf16 operator <=(const vuint16& a, unsigned int b) { return a <= vuint16(b); }
+ __forceinline vboolf16 operator <=(unsigned int a, const vuint16& b) { return vuint16(a) <= b; }
+
+ __forceinline vboolf16 eq(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); }
+ __forceinline vboolf16 ne(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); }
+ __forceinline vboolf16 lt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); }
+ __forceinline vboolf16 ge(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); }
+ __forceinline vboolf16 gt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); }
+ __forceinline vboolf16 le(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); }
+
+ __forceinline vboolf16 eq(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_EQ); }
+ __forceinline vboolf16 ne(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_NE); }
+ __forceinline vboolf16 lt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LT); }
+ __forceinline vboolf16 ge(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GE); }
+ __forceinline vboolf16 gt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); }
+ __forceinline vboolf16 le(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); }
+
+
+ __forceinline vuint16 select(const vboolf16& m, const vuint16& t, const vuint16& f) {
+ return _mm512_mask_or_epi32(f,m,t,t);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Movement/Shifting/Shuffling Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<int i>
+ __forceinline vuint16 shuffle(const vuint16& v) {
+ return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i)));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vuint16 shuffle(const vuint16& v) {
+ return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+ }
+
+ template<int i>
+ __forceinline vuint16 shuffle4(const vuint16& v) {
+ return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v) ,_MM_SHUFFLE(i, i, i, i)));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vuint16 shuffle4(const vuint16& v) {
+ return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+ }
+
+ template<int i>
+ __forceinline vuint16 align_shift_right(const vuint16& a, const vuint16& b) {
+ return _mm512_alignr_epi32(a, b, i);
+ };
+
+ __forceinline unsigned int toScalar(const vuint16& v) {
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(v));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint16 vreduce_min2(vuint16 x) { return min(x, shuffle<1,0,3,2>(x)); }
+ __forceinline vuint16 vreduce_min4(vuint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }
+ __forceinline vuint16 vreduce_min8(vuint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); }
+ __forceinline vuint16 vreduce_min (vuint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); }
+
+ __forceinline vuint16 vreduce_max2(vuint16 x) { return max(x, shuffle<1,0,3,2>(x)); }
+ __forceinline vuint16 vreduce_max4(vuint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
+ __forceinline vuint16 vreduce_max8(vuint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); }
+ __forceinline vuint16 vreduce_max (vuint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); }
+
+ __forceinline vuint16 vreduce_and2(vuint16 x) { return x & shuffle<1,0,3,2>(x); }
+ __forceinline vuint16 vreduce_and4(vuint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); }
+ __forceinline vuint16 vreduce_and8(vuint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); }
+ __forceinline vuint16 vreduce_and (vuint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); }
+
+ __forceinline vuint16 vreduce_or2(vuint16 x) { return x | shuffle<1,0,3,2>(x); }
+ __forceinline vuint16 vreduce_or4(vuint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); }
+ __forceinline vuint16 vreduce_or8(vuint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); }
+ __forceinline vuint16 vreduce_or (vuint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); }
+
+ __forceinline vuint16 vreduce_add2(vuint16 x) { return x + shuffle<1,0,3,2>(x); }
+ __forceinline vuint16 vreduce_add4(vuint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }
+ __forceinline vuint16 vreduce_add8(vuint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); }
+ __forceinline vuint16 vreduce_add (vuint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); }
+
+ __forceinline unsigned int reduce_min(const vuint16& v) { return toScalar(vreduce_min(v)); }
+ __forceinline unsigned int reduce_max(const vuint16& v) { return toScalar(vreduce_max(v)); }
+ __forceinline unsigned int reduce_and(const vuint16& v) { return toScalar(vreduce_and(v)); }
+ __forceinline unsigned int reduce_or (const vuint16& v) { return toScalar(vreduce_or (v)); }
+ __forceinline unsigned int reduce_add(const vuint16& v) { return toScalar(vreduce_add(v)); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Memory load and store operations
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint16 permute(vuint16 v, vuint16 index) {
+ return _mm512_permutexvar_epi32(index,v);
+ }
+
+ __forceinline vuint16 reverse(const vuint16& a) {
+ return permute(a,vuint16(reverse_step));
+ }
+
+ __forceinline vuint16 prefix_sum(const vuint16& a)
+ {
+ const vuint16 z(zero);
+ vuint16 v = a;
+ v = v + align_shift_right<16-1>(v,z);
+ v = v + align_shift_right<16-2>(v,z);
+ v = v + align_shift_right<16-4>(v,z);
+ v = v + align_shift_right<16-8>(v,z);
+ return v;
+ }
+
+ __forceinline vuint16 reverse_prefix_sum(const vuint16& a)
+ {
+ const vuint16 z(zero);
+ vuint16 v = a;
+ v = v + align_shift_right<1>(z,v);
+ v = v + align_shift_right<2>(z,v);
+ v = v + align_shift_right<4>(z,v);
+ v = v + align_shift_right<8>(z,v);
+ return v;
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vuint16& v)
+ {
+ cout << "<" << v[0];
+ for (int i=1; i<16; i++) cout << ", " << v[i];
+ cout << ">";
+ return cout;
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vuint4_sse2.h b/thirdparty/embree/common/simd/vuint4_sse2.h
new file mode 100644
index 0000000000..0601b9ab80
--- /dev/null
+++ b/thirdparty/embree/common/simd/vuint4_sse2.h
@@ -0,0 +1,426 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../math/math.h"
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 4-wide SSE integer type */
+ template<>
+ struct vuint<4>
+ {
+ ALIGNED_STRUCT_(16);
+
+ typedef vboolf4 Bool;
+ typedef vuint4 Int;
+ typedef vfloat4 Float;
+
+ enum { size = 4 }; // number of SIMD elements
+ union { __m128i v; unsigned int i[4]; }; // data
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint() {}
+ __forceinline vuint(const vuint4& a) { v = a.v; }
+ __forceinline vuint4& operator =(const vuint4& a) { v = a.v; return *this; }
+
+ __forceinline vuint(const __m128i a) : v(a) {}
+ __forceinline operator const __m128i&() const { return v; }
+ __forceinline operator __m128i&() { return v; }
+
+
+ __forceinline vuint(unsigned int a) : v(_mm_set1_epi32(a)) {}
+ __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm_set_epi32(d, c, b, a)) {}
+
+#if defined(__AVX512VL__)
+ __forceinline explicit vuint(__m128 a) : v(_mm_cvtps_epu32(a)) {}
+#endif
+
+#if defined(__AVX512VL__)
+ __forceinline explicit vuint(const vboolf4& a) : v(_mm_movm_epi32(a)) {}
+#else
+ __forceinline explicit vuint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {}
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint(ZeroTy) : v(_mm_setzero_si128()) {}
+ __forceinline vuint(OneTy) : v(_mm_set1_epi32(1)) {}
+ __forceinline vuint(PosInfTy) : v(_mm_set1_epi32(unsigned(pos_inf))) {}
+ __forceinline vuint(StepTy) : v(_mm_set_epi32(3, 2, 1, 0)) {}
+ __forceinline vuint(TrueTy) { v = _mm_cmpeq_epi32(v,v); }
+ __forceinline vuint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Loads and Stores
+ ////////////////////////////////////////////////////////////////////////////////
+
+ static __forceinline vuint4 load (const void* a) { return _mm_load_si128((__m128i*)a); }
+ static __forceinline vuint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); }
+
+ static __forceinline void store (void* ptr, const vuint4& v) { _mm_store_si128((__m128i*)ptr,v); }
+ static __forceinline void storeu(void* ptr, const vuint4& v) { _mm_storeu_si128((__m128i*)ptr,v); }
+
+#if defined(__AVX512VL__)
+ static __forceinline vuint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); }
+ static __forceinline vuint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); }
+
+ static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_store_epi32 (ptr,mask,v); }
+ static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); }
+#elif defined(__AVX__)
+ static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+ static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+
+ static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+ static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+#else
+ static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); }
+ static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); }
+
+ static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { store (ptr,select(mask,i,load (ptr))); }
+ static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); }
+#endif
+
+#if defined(__SSE4_1__)
+ static __forceinline vuint4 load(const unsigned char* ptr) {
+ return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
+ }
+
+ static __forceinline vuint4 loadu(const unsigned char* ptr) {
+ return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
+ }
+
+#endif
+
+ static __forceinline vuint4 load(const unsigned short* ptr) {
+#if defined (__SSE4_1__)
+ return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr));
+#else
+ return vuint4(ptr[0],ptr[1],ptr[2],ptr[3]);
+#endif
+ }
+
+ static __forceinline vuint4 load_nt(void* ptr) {
+#if defined(__SSE4_1__)
+ return _mm_stream_load_si128((__m128i*)ptr);
+#else
+ return _mm_load_si128((__m128i*)ptr);
+#endif
+ }
+
+ static __forceinline void store_nt(void* ptr, const vuint4& v) {
+#if defined(__SSE4_1__)
+ _mm_stream_ps((float*)ptr,_mm_castsi128_ps(v));
+#else
+ _mm_store_si128((__m128i*)ptr,v);
+#endif
+ }
+
+ template<int scale = 4>
+ static __forceinline vuint4 gather(const unsigned int* ptr, const vint4& index) {
+#if defined(__AVX2__)
+ return _mm_i32gather_epi32((const int*)ptr, index, scale);
+#else
+ return vuint4(
+ *(unsigned int*)(((char*)ptr)+scale*index[0]),
+ *(unsigned int*)(((char*)ptr)+scale*index[1]),
+ *(unsigned int*)(((char*)ptr)+scale*index[2]),
+ *(unsigned int*)(((char*)ptr)+scale*index[3]));
+#endif
+ }
+
+ template<int scale = 4>
+ static __forceinline vuint4 gather(const vboolf4& mask, const unsigned int* ptr, const vint4& index) {
+ vuint4 r = zero;
+#if defined(__AVX512VL__)
+ return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
+#elif defined(__AVX2__)
+ return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale);
+#else
+ if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]);
+ if (likely(mask[1])) r[1] = *(unsigned int*)(((char*)ptr)+scale*index[1]);
+ if (likely(mask[2])) r[2] = *(unsigned int*)(((char*)ptr)+scale*index[2]);
+ if (likely(mask[3])) r[3] = *(unsigned int*)(((char*)ptr)+scale*index[3]);
+ return r;
+#endif
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const unsigned int& operator [](size_t index) const { assert(index < 4); return i[index]; }
+ __forceinline unsigned int& operator [](size_t index) { assert(index < 4); return i[index]; }
+
+ friend __forceinline vuint4 select(const vboolf4& m, const vuint4& t, const vuint4& f) {
+#if defined(__AVX512VL__)
+ return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t);
+#elif defined(__SSE4_1__)
+ return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
+#else
+ return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f));
+#endif
+ }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+ __forceinline vboolf4 asBool(const vuint4& a) { return _mm_movepi32_mask(a); }
+#else
+ __forceinline vboolf4 asBool(const vuint4& a) { return _mm_castsi128_ps(a); }
+#endif
+
+ __forceinline vuint4 operator +(const vuint4& a) { return a; }
+ __forceinline vuint4 operator -(const vuint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint4 operator +(const vuint4& a, const vuint4& b) { return _mm_add_epi32(a, b); }
+ __forceinline vuint4 operator +(const vuint4& a, unsigned int b) { return a + vuint4(b); }
+ __forceinline vuint4 operator +(unsigned int a, const vuint4& b) { return vuint4(a) + b; }
+
+ __forceinline vuint4 operator -(const vuint4& a, const vuint4& b) { return _mm_sub_epi32(a, b); }
+ __forceinline vuint4 operator -(const vuint4& a, unsigned int b) { return a - vuint4(b); }
+ __forceinline vuint4 operator -(unsigned int a, const vuint4& b) { return vuint4(a) - b; }
+
+//#if defined(__SSE4_1__)
+// __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return _mm_mullo_epu32(a, b); }
+//#else
+// __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return vuint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); }
+//#endif
+// __forceinline vuint4 operator *(const vuint4& a, unsigned int b) { return a * vuint4(b); }
+// __forceinline vuint4 operator *(unsigned int a, const vuint4& b) { return vuint4(a) * b; }
+
+ __forceinline vuint4 operator &(const vuint4& a, const vuint4& b) { return _mm_and_si128(a, b); }
+ __forceinline vuint4 operator &(const vuint4& a, unsigned int b) { return a & vuint4(b); }
+ __forceinline vuint4 operator &(unsigned int a, const vuint4& b) { return vuint4(a) & b; }
+
+ __forceinline vuint4 operator |(const vuint4& a, const vuint4& b) { return _mm_or_si128(a, b); }
+ __forceinline vuint4 operator |(const vuint4& a, unsigned int b) { return a | vuint4(b); }
+ __forceinline vuint4 operator |(unsigned int a, const vuint4& b) { return vuint4(a) | b; }
+
+ __forceinline vuint4 operator ^(const vuint4& a, const vuint4& b) { return _mm_xor_si128(a, b); }
+ __forceinline vuint4 operator ^(const vuint4& a, unsigned int b) { return a ^ vuint4(b); }
+ __forceinline vuint4 operator ^(unsigned int a, const vuint4& b) { return vuint4(a) ^ b; }
+
+ __forceinline vuint4 operator <<(const vuint4& a, unsigned int n) { return _mm_slli_epi32(a, n); }
+ __forceinline vuint4 operator >>(const vuint4& a, unsigned int n) { return _mm_srli_epi32(a, n); }
+
+ __forceinline vuint4 sll (const vuint4& a, unsigned int b) { return _mm_slli_epi32(a, b); }
+ __forceinline vuint4 sra (const vuint4& a, unsigned int b) { return _mm_srai_epi32(a, b); }
+ __forceinline vuint4 srl (const vuint4& a, unsigned int b) { return _mm_srli_epi32(a, b); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint4& operator +=(vuint4& a, const vuint4& b) { return a = a + b; }
+ __forceinline vuint4& operator +=(vuint4& a, unsigned int b) { return a = a + b; }
+
+ __forceinline vuint4& operator -=(vuint4& a, const vuint4& b) { return a = a - b; }
+ __forceinline vuint4& operator -=(vuint4& a, unsigned int b) { return a = a - b; }
+
+//#if defined(__SSE4_1__)
+// __forceinline vuint4& operator *=(vuint4& a, const vuint4& b) { return a = a * b; }
+// __forceinline vuint4& operator *=(vuint4& a, unsigned int b) { return a = a * b; }
+//#endif
+
+ __forceinline vuint4& operator &=(vuint4& a, const vuint4& b) { return a = a & b; }
+ __forceinline vuint4& operator &=(vuint4& a, unsigned int b) { return a = a & b; }
+
+ __forceinline vuint4& operator |=(vuint4& a, const vuint4& b) { return a = a | b; }
+ __forceinline vuint4& operator |=(vuint4& a, unsigned int b) { return a = a | b; }
+
+ __forceinline vuint4& operator <<=(vuint4& a, unsigned int b) { return a = a << b; }
+ __forceinline vuint4& operator >>=(vuint4& a, unsigned int b) { return a = a >> b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+ __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); }
+ __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_NE); }
+ //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LT); }
+ //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GE); }
+ //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GT); }
+ //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LE); }
+#else
+ __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); }
+ __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return !(a == b); }
+ //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmplt_epu32(a, b)); }
+ //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return !(a < b); }
+ //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epu32(a, b)); }
+ //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return !(a > b); }
+#endif
+
+ __forceinline vboolf4 operator ==(const vuint4& a, unsigned int b) { return a == vuint4(b); }
+ __forceinline vboolf4 operator ==(unsigned int a, const vuint4& b) { return vuint4(a) == b; }
+
+ __forceinline vboolf4 operator !=(const vuint4& a, unsigned int b) { return a != vuint4(b); }
+ __forceinline vboolf4 operator !=(unsigned int a, const vuint4& b) { return vuint4(a) != b; }
+
+ //__forceinline vboolf4 operator < (const vuint4& a, unsigned int b) { return a < vuint4(b); }
+ //__forceinline vboolf4 operator < (unsigned int a, const vuint4& b) { return vuint4(a) < b; }
+
+ //__forceinline vboolf4 operator >=(const vuint4& a, unsigned int b) { return a >= vuint4(b); }
+ //__forceinline vboolf4 operator >=(unsigned int a, const vuint4& b) { return vuint4(a) >= b; }
+
+ //__forceinline vboolf4 operator > (const vuint4& a, unsigned int b) { return a > vuint4(b); }
+ //__forceinline vboolf4 operator > (unsigned int a, const vuint4& b) { return vuint4(a) > b; }
+
+ //__forceinline vboolf4 operator <=(const vuint4& a, unsigned int b) { return a <= vuint4(b); }
+ //__forceinline vboolf4 operator <=(unsigned int a, const vuint4& b) { return vuint4(a) <= b; }
+
+ __forceinline vboolf4 eq(const vuint4& a, const vuint4& b) { return a == b; }
+ __forceinline vboolf4 ne(const vuint4& a, const vuint4& b) { return a != b; }
+ //__forceinline vboolf4 lt(const vuint4& a, const vuint4& b) { return a < b; }
+ //__forceinline vboolf4 ge(const vuint4& a, const vuint4& b) { return a >= b; }
+ //__forceinline vboolf4 gt(const vuint4& a, const vuint4& b) { return a > b; }
+ //__forceinline vboolf4 le(const vuint4& a, const vuint4& b) { return a <= b; }
+
+#if defined(__AVX512VL__)
+ __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); }
+ __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); }
+ //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); }
+ //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); }
+ //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); }
+ //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+ __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a == b); }
+ __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a != b); }
+ //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a < b); }
+ //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a >= b); }
+ //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a > b); }
+ //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a <= b); }
+#endif
+
+ template<int mask>
+ __forceinline vuint4 select(const vuint4& t, const vuint4& f) {
+#if defined(__SSE4_1__)
+ return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask));
+#else
+ return select(vboolf4(mask), t, f);
+#endif
+ }
+
+/*#if defined(__SSE4_1__)
+ __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return _mm_min_epu32(a, b); }
+ __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return _mm_max_epu32(a, b); }
+
+#else
+ __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return select(a < b,a,b); }
+ __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return select(a < b,b,a); }
+#endif
+
+ __forceinline vuint4 min(const vuint4& a, unsigned int b) { return min(a,vuint4(b)); }
+ __forceinline vuint4 min(unsigned int a, const vuint4& b) { return min(vuint4(a),b); }
+ __forceinline vuint4 max(const vuint4& a, unsigned int b) { return max(a,vuint4(b)); }
+ __forceinline vuint4 max(unsigned int a, const vuint4& b) { return max(vuint4(a),b); }*/
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Movement/Shifting/Shuffling Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
+ __forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vuint4 shuffle(const vuint4& v) {
+ return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) {
+ return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+ }
+
+#if defined(__SSE3__)
+ template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
+ template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
+ template<> __forceinline vuint4 shuffle<0, 1, 0, 1>(const vuint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); }
+#endif
+
+ template<int i>
+ __forceinline vuint4 shuffle(const vuint4& v) {
+ return shuffle<i,i,i,i>(v);
+ }
+
+#if defined(__SSE4_1__)
+ template<int src> __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b, src); }
+ template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a, b, dst); }
+#else
+ template<int src> __forceinline unsigned int extract(const vuint4& b) { return b[src&3]; }
+ template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { vuint4 c = a; c[dst&3] = b; return c; }
+#endif
+
+
+ template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b); }
+
+ __forceinline unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if 0
+#if defined(__SSE4_1__)
+
+ __forceinline vuint4 vreduce_min(const vuint4& v) { vuint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
+ __forceinline vuint4 vreduce_max(const vuint4& v) { vuint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
+ __forceinline vuint4 vreduce_add(const vuint4& v) { vuint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; }
+
+ __forceinline unsigned int reduce_min(const vuint4& v) { return toScalar(vreduce_min(v)); }
+ __forceinline unsigned int reduce_max(const vuint4& v) { return toScalar(vreduce_max(v)); }
+ __forceinline unsigned int reduce_add(const vuint4& v) { return toScalar(vreduce_add(v)); }
+
+ __forceinline size_t select_min(const vuint4& v) { return bsf(movemask(v == vreduce_min(v))); }
+ __forceinline size_t select_max(const vuint4& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+ //__forceinline size_t select_min(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+ //__forceinline size_t select_max(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+#else
+
+ __forceinline unsigned int reduce_min(const vuint4& v) { return min(v[0],v[1],v[2],v[3]); }
+ __forceinline unsigned int reduce_max(const vuint4& v) { return max(v[0],v[1],v[2],v[3]); }
+ __forceinline unsigned int reduce_add(const vuint4& v) { return v[0]+v[1]+v[2]+v[3]; }
+
+#endif
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vuint4& a) {
+ return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vuint8_avx.h b/thirdparty/embree/common/simd/vuint8_avx.h
new file mode 100644
index 0000000000..589cd9d731
--- /dev/null
+++ b/thirdparty/embree/common/simd/vuint8_avx.h
@@ -0,0 +1,386 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 8-wide AVX integer type */
+ template<>
+ struct vuint<8>
+ {
+ ALIGNED_STRUCT_(32);
+
+ typedef vboolf8 Bool;
+ typedef vuint8 Int;
+ typedef vfloat8 Float;
+
+ enum { size = 8 }; // number of SIMD elements
+ union { // data
+ __m256i v;
+ struct { __m128i vl,vh; };
+ unsigned int i[8];
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint() {}
+ __forceinline vuint(const vuint8& a) { v = a.v; }
+ __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; }
+
+ __forceinline vuint(__m256i a) : v(a) {}
+ __forceinline operator const __m256i&() const { return v; }
+ __forceinline operator __m256i&() { return v; }
+
+ __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {}
+ __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+ __forceinline vuint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {}
+
+ __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {}
+ __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {}
+ __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {}
+ __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {}
+ __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int vh) : v(_mm256_set_epi32(vh, g, f, e, d, c, b, a)) {}
+
+ __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint(ZeroTy) : v(_mm256_setzero_si256()) {}
+ __forceinline vuint(OneTy) : v(_mm256_set1_epi32(1)) {}
+ __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(0xFFFFFFFF)) {}
+ __forceinline vuint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {}
+ __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Loads and Stores
+ ////////////////////////////////////////////////////////////////////////////////
+
+ static __forceinline vuint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); }
+ static __forceinline vuint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); }
+
+ static __forceinline vuint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); }
+ static __forceinline vuint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); }
+
+ static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); }
+ static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); }
+
+ static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
+ static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
+
+ static __forceinline void store_nt(void* ptr, const vuint8& v) {
+ _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+ }
+
+ static __forceinline vuint8 load(const unsigned char* ptr) {
+ vuint4 il = vuint4::load(ptr+0);
+ vuint4 ih = vuint4::load(ptr+4);
+ return vuint8(il,ih);
+ }
+
+ static __forceinline vuint8 loadu(const unsigned char* ptr) {
+ vuint4 il = vuint4::loadu(ptr+0);
+ vuint4 ih = vuint4::loadu(ptr+4);
+ return vuint8(il,ih);
+ }
+
+ static __forceinline vuint8 load(const unsigned short* ptr) {
+ vuint4 il = vuint4::load(ptr+0);
+ vuint4 ih = vuint4::load(ptr+4);
+ return vuint8(il,ih);
+ }
+
+ static __forceinline vuint8 loadu(const unsigned short* ptr) {
+ vuint4 il = vuint4::loadu(ptr+0);
+ vuint4 ih = vuint4::loadu(ptr+4);
+ return vuint8(il,ih);
+ }
+
+ static __forceinline void store(unsigned char* ptr, const vuint8& i) {
+ vuint4 il(i.vl);
+ vuint4 ih(i.vh);
+ vuint4::store(ptr + 0,il);
+ vuint4::store(ptr + 4,ih);
+ }
+
+ static __forceinline void store(unsigned short* ptr, const vuint8& v) {
+ for (size_t i=0;i<8;i++)
+ ptr[i] = (unsigned short)v[i];
+ }
+
+ template<int scale = 4>
+ static __forceinline vuint8 gather(const unsigned int* ptr, const vint8& index) {
+ return vuint8(
+ *(unsigned int*)(((char*)ptr)+scale*index[0]),
+ *(unsigned int*)(((char*)ptr)+scale*index[1]),
+ *(unsigned int*)(((char*)ptr)+scale*index[2]),
+ *(unsigned int*)(((char*)ptr)+scale*index[3]),
+ *(unsigned int*)(((char*)ptr)+scale*index[4]),
+ *(unsigned int*)(((char*)ptr)+scale*index[5]),
+ *(unsigned int*)(((char*)ptr)+scale*index[6]),
+ *(unsigned int*)(((char*)ptr)+scale*index[7]));
+ }
+
+ template<int scale = 4>
+ static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int* ptr, const vint8& index) {
+ vuint8 r = zero;
+ if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]);
+ if (likely(mask[1])) r[1] = *(unsigned int*)(((char*)ptr)+scale*index[1]);
+ if (likely(mask[2])) r[2] = *(unsigned int*)(((char*)ptr)+scale*index[2]);
+ if (likely(mask[3])) r[3] = *(unsigned int*)(((char*)ptr)+scale*index[3]);
+ if (likely(mask[4])) r[4] = *(unsigned int*)(((char*)ptr)+scale*index[4]);
+ if (likely(mask[5])) r[5] = *(unsigned int*)(((char*)ptr)+scale*index[5]);
+ if (likely(mask[6])) r[6] = *(unsigned int*)(((char*)ptr)+scale*index[6]);
+ if (likely(mask[7])) r[7] = *(unsigned int*)(((char*)ptr)+scale*index[7]);
+ return r;
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v)
+ {
+ *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+ *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+ *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+ *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+ *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+ *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+ *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+ *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v)
+ {
+ if (likely(mask[0])) *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+ if (likely(mask[1])) *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+ if (likely(mask[2])) *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+ if (likely(mask[3])) *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+ if (likely(mask[4])) *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+ if (likely(mask[5])) *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+ if (likely(mask[6])) *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+ if (likely(mask[7])) *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+ }
+
+
+ static __forceinline vuint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; }
+ __forceinline unsigned int& operator [](size_t index) { assert(index < 8); return i[index]; }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); }
+
+ __forceinline vuint8 operator +(const vuint8& a) { return a; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return vuint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); }
+ __forceinline vuint8 operator +(const vuint8& a, unsigned int b) { return a + vuint8(b); }
+ __forceinline vuint8 operator +(unsigned int a, const vuint8& b) { return vuint8(a) + b; }
+
+ __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return vuint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); }
+ __forceinline vuint8 operator -(const vuint8& a, unsigned int b) { return a - vuint8(b); }
+ __forceinline vuint8 operator -(unsigned int a, const vuint8& b) { return vuint8(a) - b; }
+
+ //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return vuint8(_mm_mullo_epu32(a.vl, b.vl), _mm_mullo_epu32(a.vh, b.vh)); }
+ //__forceinline vuint8 operator *(const vuint8& a, unsigned int b) { return a * vuint8(b); }
+ //__forceinline vuint8 operator *(unsigned int a, const vuint8& b) { return vuint8(a) * b; }
+
+ __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+ __forceinline vuint8 operator &(const vuint8& a, unsigned int b) { return a & vuint8(b); }
+ __forceinline vuint8 operator &(unsigned int a, const vuint8& b) { return vuint8(a) & b; }
+
+ __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+ __forceinline vuint8 operator |(const vuint8& a, unsigned int b) { return a | vuint8(b); }
+ __forceinline vuint8 operator |(unsigned int a, const vuint8& b) { return vuint8(a) | b; }
+
+ __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+ __forceinline vuint8 operator ^(const vuint8& a, unsigned int b) { return a ^ vuint8(b); }
+ __forceinline vuint8 operator ^(unsigned int a, const vuint8& b) { return vuint8(a) ^ b; }
+
+ __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return vuint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); }
+ __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return vuint8(_mm_srai_epi32(a.vl, n), _mm_srli_epi32(a.vh, n)); }
+
+ __forceinline vuint8 sll (const vuint8& a, unsigned int b) { return vuint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); }
+ __forceinline vuint8 sra (const vuint8& a, unsigned int b) { return vuint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); }
+ __forceinline vuint8 srl (const vuint8& a, unsigned int b) { return vuint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); }
+
+ __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return vuint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); }
+ __forceinline vuint8 min(const vuint8& a, unsigned int b) { return min(a,vuint8(b)); }
+ __forceinline vuint8 min(unsigned int a, const vuint8& b) { return min(vuint8(a),b); }
+
+ __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return vuint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); }
+ __forceinline vuint8 max(const vuint8& a, unsigned int b) { return max(a,vuint8(b)); }
+ __forceinline vuint8 max(unsigned int a, const vuint8& b) { return max(vuint8(a),b); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; }
+ __forceinline vuint8& operator +=(vuint8& a, unsigned int b) { return a = a + b; }
+
+ __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; }
+ __forceinline vuint8& operator -=(vuint8& a, unsigned int b) { return a = a - b; }
+
+ //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; }
+ //__forceinline vuint8& operator *=(vuint8& a, unsigned int b) { return a = a * b; }
+
+ __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; }
+ __forceinline vuint8& operator &=(vuint8& a, unsigned int b) { return a = a & b; }
+
+ __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; }
+ __forceinline vuint8& operator |=(vuint8& a, unsigned int b) { return a = a | b; }
+
+ __forceinline vuint8& operator <<=(vuint8& a, unsigned int b) { return a = a << b; }
+ __forceinline vuint8& operator >>=(vuint8& a, unsigned int b) { return a = a >> b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)),
+ _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); }
+ __forceinline vboolf8 operator ==(const vuint8& a, unsigned int b) { return a == vuint8(b); }
+ __forceinline vboolf8 operator ==(unsigned int a, const vuint8& b) { return vuint8(a) == b; }
+
+ __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); }
+ __forceinline vboolf8 operator !=(const vuint8& a, unsigned int b) { return a != vuint8(b); }
+ __forceinline vboolf8 operator !=(unsigned int a, const vuint8& b) { return vuint8(a) != b; }
+
+ //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epu32 (a.vl, b.vl)),
+ // _mm_castsi128_ps(_mm_cmplt_epu32 (a.vh, b.vh))); }
+ //__forceinline vboolf8 operator < (const vuint8& a, unsigned int b) { return a < vuint8(b); }
+ //__forceinline vboolf8 operator < (unsigned int a, const vuint8& b) { return vuint8(a) < b; }
+
+ //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a < b); }
+ //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int b) { return a >= vuint8(b); }
+ //__forceinline vboolf8 operator >=(unsigned int a, const vuint8& b) { return vuint8(a) >= b; }
+
+ //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epu32 (a.vl, b.vl)),
+ // _mm_castsi128_ps(_mm_cmpgt_epu32 (a.vh, b.vh))); }
+ //__forceinline vboolf8 operator > (const vuint8& a, unsigned int b) { return a > vuint8(b); }
+ //__forceinline vboolf8 operator > (unsigned int a, const vuint8& b) { return vuint8(a) > b; }
+
+ //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a > b); }
+ //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int b) { return a <= vuint8(b); }
+ //__forceinline vboolf8 operator <=(unsigned int a, const vuint8& b) { return vuint8(a) <= b; }
+
+ __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; }
+ __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; }
+
+ __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); }
+ __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); }
+
+ __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) {
+ return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Movement/Shifting/Shuffling Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+ __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+
+ template<int i>
+ __forceinline vuint8 shuffle(const vuint8& v) {
+ return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+ }
+
+ template<int i0, int i1>
+ __forceinline vuint8 shuffle4(const vuint8& v) {
+ return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
+ }
+
+ template<int i0, int i1>
+ __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) {
+ return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vuint8 shuffle(const vuint8& v) {
+ return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) {
+ return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+ }
+
+ template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); }
+ template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); }
+ template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); }
+
+ template<int i> __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); }
+ template<int i> __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); }
+ template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); }
+
+ __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); }
+ //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
+ //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
+
+ //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); }
+ //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+ //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); }
+
+ __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + shuffle<1,0,3,2>(v); }
+ __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); }
+ __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); }
+
+ //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); }
+ //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); }
+ __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); }
+
+ //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); }
+ //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+ //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+ //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) {
+ return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vuint8_avx2.h b/thirdparty/embree/common/simd/vuint8_avx2.h
new file mode 100644
index 0000000000..17b994522f
--- /dev/null
+++ b/thirdparty/embree/common/simd/vuint8_avx2.h
@@ -0,0 +1,446 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+ /* 8-wide AVX integer type */
+ template<>
+ struct vuint<8>
+ {
+ ALIGNED_STRUCT_(32);
+
+ typedef vboolf8 Bool;
+ typedef vuint8 Int;
+ typedef vfloat8 Float;
+
+ enum { size = 8 }; // number of SIMD elements
+ union { // data
+ __m256i v;
+ unsigned int i[8];
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint() {}
+ __forceinline vuint(const vuint8& a) { v = a.v; }
+ __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; }
+
+ __forceinline vuint(__m256i a) : v(a) {}
+ __forceinline operator const __m256i&() const { return v; }
+ __forceinline operator __m256i&() { return v; }
+
+ __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {}
+ __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+ __forceinline vuint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+
+ __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {}
+ __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {}
+ __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {}
+ __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {}
+ __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {}
+
+ __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {}
+
+#if defined(__AVX512VL__)
+ __forceinline explicit vuint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {}
+#else
+ __forceinline explicit vuint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {}
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint(ZeroTy) : v(_mm256_setzero_si256()) {}
+ __forceinline vuint(OneTy) : v(_mm256_set1_epi32(1)) {}
+ __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(pos_inf)) {}
+ __forceinline vuint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {}
+ __forceinline vuint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {}
+ __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Loads and Stores
+ ////////////////////////////////////////////////////////////////////////////////
+
+ static __forceinline vuint8 load(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
+ static __forceinline vuint8 loadu(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
+ static __forceinline vuint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); }
+ static __forceinline vuint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); }
+
+ static __forceinline vuint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); }
+ static __forceinline vuint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); }
+
+ static __forceinline void store (void* ptr, const vuint8& v) { _mm256_store_si256((__m256i*)ptr,v); }
+ static __forceinline void storeu(void* ptr, const vuint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); }
+
+#if defined(__AVX512VL__)
+
+ static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &v) {
+ return _mm256_mask_compress_epi32(v, mask, v);
+ }
+ static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &a, const vuint8& b) {
+ return _mm256_mask_compress_epi32(a, mask, b);
+ }
+
+ static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); }
+ static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); }
+
+ static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); }
+ static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); }
+#else
+ static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); }
+ static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); }
+
+ static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); }
+ static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); }
+#endif
+
+ static __forceinline vuint8 load_nt(void* ptr) {
+ return _mm256_stream_load_si256((__m256i*)ptr);
+ }
+
+ static __forceinline void store_nt(void* ptr, const vuint8& v) {
+ _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+ }
+
+ static __forceinline void store(unsigned char* ptr, const vuint8& i)
+ {
+ for (size_t j=0; j<8; j++)
+ ptr[j] = i[j];
+ }
+
+ static __forceinline void store(unsigned short* ptr, const vuint8& v) {
+ for (size_t i=0;i<8;i++)
+ ptr[i] = (unsigned short)v[i];
+ }
+
+ template<int scale = 4>
+ static __forceinline vuint8 gather(const unsigned int *const ptr, const vint8& index) {
+ return _mm256_i32gather_epi32((const int*) ptr, index, scale);
+ }
+
+ template<int scale = 4>
+ static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int *const ptr, const vint8& index) {
+ vuint8 r = zero;
+#if defined(__AVX512VL__)
+ return _mm256_mmask_i32gather_epi32(r, mask, index, (const int*) ptr, scale);
+#else
+ return _mm256_mask_i32gather_epi32(r, (const int*) ptr, index, mask, scale);
+#endif
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v)
+ {
+#if defined(__AVX512VL__)
+ _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale);
+#else
+ *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+ *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+ *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+ *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+ *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+ *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+ *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+ *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+#endif
+ }
+
+ template<int scale = 4>
+ static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v)
+ {
+#if defined(__AVX512VL__)
+ _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale);
+#else
+ if (likely(mask[0])) *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+ if (likely(mask[1])) *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+ if (likely(mask[2])) *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+ if (likely(mask[3])) *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+ if (likely(mask[4])) *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+ if (likely(mask[5])) *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+ if (likely(mask[6])) *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+ if (likely(mask[7])) *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+#endif
+ }
+
+ static __forceinline vuint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; }
+ __forceinline unsigned int& operator [](size_t index) { assert(index < 8); return i[index]; }
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Unary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+ __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_movepi32_mask(a); }
+#else
+ __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); }
+#endif
+
+ __forceinline vuint8 operator +(const vuint8& a) { return a; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Binary Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return _mm256_add_epi32(a, b); }
+ __forceinline vuint8 operator +(const vuint8& a, unsigned int b) { return a + vuint8(b); }
+ __forceinline vuint8 operator +(unsigned int a, const vuint8& b) { return vuint8(a) + b; }
+
+ __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return _mm256_sub_epi32(a, b); }
+ __forceinline vuint8 operator -(const vuint8& a, unsigned int b) { return a - vuint8(b); }
+ __forceinline vuint8 operator -(unsigned int a, const vuint8& b) { return vuint8(a) - b; }
+
+ //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return _mm256_mullo_epu32(a, b); }
+ //__forceinline vuint8 operator *(const vuint8& a, unsigned int b) { return a * vuint8(b); }
+ //__forceinline vuint8 operator *(unsigned int a, const vuint8& b) { return vuint8(a) * b; }
+
+ __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_and_si256(a, b); }
+ __forceinline vuint8 operator &(const vuint8& a, unsigned int b) { return a & vuint8(b); }
+ __forceinline vuint8 operator &(unsigned int a, const vuint8& b) { return vuint8(a) & b; }
+
+ __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_or_si256(a, b); }
+ __forceinline vuint8 operator |(const vuint8& a, unsigned int b) { return a | vuint8(b); }
+ __forceinline vuint8 operator |(unsigned int a, const vuint8& b) { return vuint8(a) | b; }
+
+ __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_xor_si256(a, b); }
+ __forceinline vuint8 operator ^(const vuint8& a, unsigned int b) { return a ^ vuint8(b); }
+ __forceinline vuint8 operator ^(unsigned int a, const vuint8& b) { return vuint8(a) ^ b; }
+
+ __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return _mm256_slli_epi32(a, n); }
+ __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return _mm256_srli_epi32(a, n); }
+
+ __forceinline vuint8 operator <<(const vuint8& a, const vuint8& n) { return _mm256_sllv_epi32(a, n); }
+ __forceinline vuint8 operator >>(const vuint8& a, const vuint8& n) { return _mm256_srlv_epi32(a, n); }
+
+ __forceinline vuint8 sll(const vuint8& a, unsigned int b) { return _mm256_slli_epi32(a, b); }
+ __forceinline vuint8 sra(const vuint8& a, unsigned int b) { return _mm256_srai_epi32(a, b); }
+ __forceinline vuint8 srl(const vuint8& a, unsigned int b) { return _mm256_srli_epi32(a, b); }
+
+ __forceinline vuint8 sll(const vuint8& a, const vuint8& b) { return _mm256_sllv_epi32(a, b); }
+ __forceinline vuint8 sra(const vuint8& a, const vuint8& b) { return _mm256_srav_epi32(a, b); }
+ __forceinline vuint8 srl(const vuint8& a, const vuint8& b) { return _mm256_srlv_epi32(a, b); }
+
+ __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return _mm256_min_epu32(a, b); }
+ __forceinline vuint8 min(const vuint8& a, unsigned int b) { return min(a,vuint8(b)); }
+ __forceinline vuint8 min(unsigned int a, const vuint8& b) { return min(vuint8(a),b); }
+
+ __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return _mm256_max_epu32(a, b); }
+ __forceinline vuint8 max(const vuint8& a, unsigned int b) { return max(a,vuint8(b)); }
+ __forceinline vuint8 max(unsigned int a, const vuint8& b) { return max(vuint8(a),b); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Assignment Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; }
+ __forceinline vuint8& operator +=(vuint8& a, unsigned int b) { return a = a + b; }
+
+ __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; }
+ __forceinline vuint8& operator -=(vuint8& a, unsigned int b) { return a = a - b; }
+
+ //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; }
+ //__forceinline vuint8& operator *=(vuint8& a, unsigned int b) { return a = a * b; }
+
+ __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; }
+ __forceinline vuint8& operator &=(vuint8& a, unsigned int b) { return a = a & b; }
+
+ __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; }
+ __forceinline vuint8& operator |=(vuint8& a, unsigned int b) { return a = a | b; }
+
+ __forceinline vuint8& operator <<=(vuint8& a, const unsigned int b) { return a = a << b; }
+ __forceinline vuint8& operator >>=(vuint8& a, const unsigned int b) { return a = a >> b; }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Comparison Operators + Select
+ ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+ __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); }
+ __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_NE); }
+ __forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LT); }
+ __forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GE); }
+ __forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GT); }
+ __forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LE); }
+
+ __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) {
+ return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t);
+ }
+#else
+ __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); }
+ __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); }
+ //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(b, a)); }
+ //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a < b); }
+ //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(a, b)); }
+ //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a > b); }
+
+ __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) {
+ return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
+ }
+#endif
+
+ template<int mask>
+ __forceinline vuint8 select(const vuint8& t, const vuint8& f) {
+ return _mm256_blend_epi32(f, t, mask);
+ }
+
+ __forceinline vboolf8 operator ==(const vuint8& a, unsigned int b) { return a == vuint8(b); }
+ __forceinline vboolf8 operator ==(unsigned int a, const vuint8& b) { return vuint8(a) == b; }
+
+ __forceinline vboolf8 operator !=(const vuint8& a, unsigned int b) { return a != vuint8(b); }
+ __forceinline vboolf8 operator !=(unsigned int a, const vuint8& b) { return vuint8(a) != b; }
+
+ //__forceinline vboolf8 operator < (const vuint8& a, unsigned int b) { return a < vuint8(b); }
+ //__forceinline vboolf8 operator < (unsigned int a, const vuint8& b) { return vuint8(a) < b; }
+
+ //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int b) { return a >= vuint8(b); }
+ //__forceinline vboolf8 operator >=(unsigned int a, const vuint8& b) { return vuint8(a) >= b; }
+
+ //__forceinline vboolf8 operator > (const vuint8& a, unsigned int b) { return a > vuint8(b); }
+ //__forceinline vboolf8 operator > (unsigned int a, const vuint8& b) { return vuint8(a) > b; }
+
+ //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int b) { return a <= vuint8(b); }
+ //__forceinline vboolf8 operator <=(unsigned int a, const vuint8& b) { return vuint8(a) <= b; }
+
+ __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; }
+ __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; }
+ //__forceinline vboolf8 lt(const vuint8& a, const vuint8& b) { return a < b; }
+ //__forceinline vboolf8 ge(const vuint8& a, const vuint8& b) { return a >= b; }
+ //__forceinline vboolf8 gt(const vuint8& a, const vuint8& b) { return a > b; }
+ //__forceinline vboolf8 le(const vuint8& a, const vuint8& b) { return a <= b; }
+
+#if defined(__AVX512VL__)
+ __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); }
+ __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); }
+ __forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); }
+ __forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); }
+ __forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); }
+ __forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+ __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); }
+ __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); }
+ //__forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a < b); }
+ //__forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a >= b); }
+ //__forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a > b); }
+ //__forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a <= b); }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Movement/Shifting/Shuffling Functions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_unpacklo_epi32(a, b); }
+ __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_unpackhi_epi32(a, b); }
+
+ template<int i>
+ __forceinline vuint8 shuffle(const vuint8& v) {
+ return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+ }
+
+ template<int i0, int i1>
+ __forceinline vuint8 shuffle4(const vuint8& v) {
+ return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
+ }
+
+ template<int i0, int i1>
+ __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) {
+ return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vuint8 shuffle(const vuint8& v) {
+ return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+ }
+
+ template<int i0, int i1, int i2, int i3>
+ __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) {
+ return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+ }
+
+ template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); }
+ template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); }
+ template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); }
+
+ template<int i> __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); }
+ template<int i> __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); }
+ template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); }
+
+ __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
+
+ __forceinline vuint8 permute(const vuint8& v, const __m256i& index) {
+ return _mm256_permutevar8x32_epi32(v, index);
+ }
+
+ __forceinline vuint8 shuffle(const vuint8& v, const __m256i& index) {
+ return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index));
+ }
+
+ template<int i>
+ __forceinline vuint8 align_shift_right(const vuint8& a, const vuint8& b) {
+#if defined(__AVX512VL__)
+ return _mm256_alignr_epi32(a, b, i);
+#else
+ return _mm256_alignr_epi8(a, b, 4*i);
+#endif
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); }
+ //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
+ //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
+
+ //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); }
+ //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+ //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); }
+
+ __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + shuffle<1,0,3,2>(v); }
+ __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); }
+ __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); }
+
+ //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); }
+ //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); }
+ __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); }
+
+ //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); }
+ //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+ //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+ //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Output Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) {
+ return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+ }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/sys/alloc.cpp b/thirdparty/embree/common/sys/alloc.cpp
new file mode 100644
index 0000000000..abdd269069
--- /dev/null
+++ b/thirdparty/embree/common/sys/alloc.cpp
@@ -0,0 +1,327 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "alloc.h"
+#include "intrinsics.h"
+#include "sysinfo.h"
+#include "mutex.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// All Platforms
+////////////////////////////////////////////////////////////////////////////////
+
+namespace embree
+{
+ void* alignedMalloc(size_t size, size_t align)
+ {
+ if (size == 0)
+ return nullptr;
+
+ assert((align & (align-1)) == 0);
+ void* ptr = _mm_malloc(size,align);
+
+ if (size != 0 && ptr == nullptr)
+ // -- GODOT start --
+ // throw std::bad_alloc();
+ abort();
+ // -- GODOT end --
+
+ return ptr;
+ }
+
+ void alignedFree(void* ptr)
+ {
+ if (ptr)
+ _mm_free(ptr);
+ }
+
+ static bool huge_pages_enabled = false;
+ static MutexSys os_init_mutex;
+
+ __forceinline bool isHugePageCandidate(const size_t bytes)
+ {
+ if (!huge_pages_enabled)
+ return false;
+
+ /* use huge pages only when memory overhead is low */
+ const size_t hbytes = (bytes+PAGE_SIZE_2M-1) & ~size_t(PAGE_SIZE_2M-1);
+ return 66*(hbytes-bytes) < bytes; // at most 1.5% overhead
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef _WIN32
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <malloc.h>
+
+namespace embree
+{
+ bool win_enable_selockmemoryprivilege (bool verbose)
+ {
+ HANDLE hToken;
+ if (!OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY | TOKEN_ADJUST_PRIVILEGES, &hToken)) {
+ if (verbose) std::cout << "WARNING: OpenProcessToken failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl;
+ return false;
+ }
+
+ TOKEN_PRIVILEGES tp;
+ tp.PrivilegeCount = 1;
+ tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+
+ if (!LookupPrivilegeValueW(nullptr, L"SeLockMemoryPrivilege", &tp.Privileges[0].Luid)) {
+ if (verbose) std::cout << "WARNING: LookupPrivilegeValue failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl;
+ return false;
+ }
+
+ SetLastError(ERROR_SUCCESS);
+ if (!AdjustTokenPrivileges(hToken, FALSE, &tp, sizeof(tp), nullptr, 0)) {
+ if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed while trying to enable SeLockMemoryPrivilege" << std::endl;
+ return false;
+ }
+
+ if (GetLastError() == ERROR_NOT_ALL_ASSIGNED) {
+ if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed to enable SeLockMemoryPrivilege: Add SeLockMemoryPrivilege for current user and run process in elevated mode (Run as administrator)." << std::endl;
+ return false;
+ }
+
+ return true;
+ }
+
+ bool os_init(bool hugepages, bool verbose)
+ {
+ Lock<MutexSys> lock(os_init_mutex);
+
+ if (!hugepages) {
+ huge_pages_enabled = false;
+ return true;
+ }
+
+ if (GetLargePageMinimum() != PAGE_SIZE_2M) {
+ huge_pages_enabled = false;
+ return false;
+ }
+
+ huge_pages_enabled = true;
+ return true;
+ }
+
+ void* os_malloc(size_t bytes, bool& hugepages)
+ {
+ if (bytes == 0) {
+ hugepages = false;
+ return nullptr;
+ }
+
+ /* try direct huge page allocation first */
+ if (isHugePageCandidate(bytes))
+ {
+ int flags = MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES;
+ char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE);
+ if (ptr != nullptr) {
+ hugepages = true;
+ return ptr;
+ }
+ }
+
+ /* fall back to 4k pages */
+ int flags = MEM_COMMIT | MEM_RESERVE;
+ char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE);
+ // -- GODOT start --
+ // if (ptr == nullptr) throw std::bad_alloc();
+ if (ptr == nullptr) abort();
+ // -- GODOT end --
+ hugepages = false;
+ return ptr;
+ }
+
+ size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages)
+ {
+ if (hugepages) // decommitting huge pages seems not to work under Windows
+ return bytesOld;
+
+ const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K;
+ bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1);
+ bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1);
+ if (bytesNew >= bytesOld)
+ return bytesOld;
+
+ if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT))
+ // -- GODOT start --
+ // throw std::bad_alloc();
+ abort();
+ // -- GODOT end --
+
+ return bytesNew;
+ }
+
+ void os_free(void* ptr, size_t bytes, bool hugepages)
+ {
+ if (bytes == 0)
+ return;
+
+ if (!VirtualFree(ptr,0,MEM_RELEASE))
+ // -- GODOT start --
+ // throw std::bad_alloc();
+ abort();
+ // -- GODOT end --
+ }
+
+ void os_advise(void *ptr, size_t bytes)
+ {
+ }
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__)
+
+#include <sys/mman.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sstream>
+
+#if defined(__MACOSX__)
+#include <mach/vm_statistics.h>
+#endif
+
+namespace embree
+{
+ bool os_init(bool hugepages, bool verbose)
+ {
+ Lock<MutexSys> lock(os_init_mutex);
+
+ if (!hugepages) {
+ huge_pages_enabled = false;
+ return true;
+ }
+
+#if defined(__LINUX__)
+
+ int hugepagesize = 0;
+
+ std::ifstream file;
+ file.open("/proc/meminfo",std::ios::in);
+ if (!file.is_open()) {
+ if (verbose) std::cout << "WARNING: Could not open /proc/meminfo. Huge page support cannot get enabled!" << std::endl;
+ huge_pages_enabled = false;
+ return false;
+ }
+
+ std::string line;
+ while (getline(file,line))
+ {
+ std::stringstream sline(line);
+ while (!sline.eof() && sline.peek() == ' ') sline.ignore();
+ std::string tag; getline(sline,tag,' ');
+ while (!sline.eof() && sline.peek() == ' ') sline.ignore();
+ std::string val; getline(sline,val,' ');
+ while (!sline.eof() && sline.peek() == ' ') sline.ignore();
+ std::string unit; getline(sline,unit,' ');
+ if (tag == "Hugepagesize:" && unit == "kB") {
+ hugepagesize = std::stoi(val)*1024;
+ break;
+ }
+ }
+
+ if (hugepagesize != PAGE_SIZE_2M)
+ {
+ if (verbose) std::cout << "WARNING: Only 2MB huge pages supported. Huge page support cannot get enabled!" << std::endl;
+ huge_pages_enabled = false;
+ return false;
+ }
+#endif
+
+ huge_pages_enabled = true;
+ return true;
+ }
+
+ void* os_malloc(size_t bytes, bool& hugepages)
+ {
+ if (bytes == 0) {
+ hugepages = false;
+ return nullptr;
+ }
+
+ /* try direct huge page allocation first */
+ if (isHugePageCandidate(bytes))
+ {
+#if defined(__MACOSX__)
+ void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0);
+ if (ptr != MAP_FAILED) {
+ hugepages = true;
+ return ptr;
+ }
+#elif defined(MAP_HUGETLB)
+ void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_HUGETLB, -1, 0);
+ if (ptr != MAP_FAILED) {
+ hugepages = true;
+ return ptr;
+ }
+#endif
+ }
+
+ /* fallback to 4k pages */
+ void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ // -- GODOT start --
+ // if (ptr == MAP_FAILED) throw std::bad_alloc();
+ if (ptr == MAP_FAILED) abort();
+ // -- GODOT end --
+ hugepages = false;
+
+ /* advise huge page hint for THP */
+ os_advise(ptr,bytes);
+ return ptr;
+ }
+
+ size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages)
+ {
+ const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K;
+ bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1);
+ bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1);
+ if (bytesNew >= bytesOld)
+ return bytesOld;
+
+ if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1)
+ // -- GODOT start --
+ // throw std::bad_alloc();
+ abort();
+ // -- GODOT end --
+
+ return bytesNew;
+ }
+
+ void os_free(void* ptr, size_t bytes, bool hugepages)
+ {
+ if (bytes == 0)
+ return;
+
+ /* for hugepages we need to also align the size */
+ const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K;
+ bytes = (bytes+pageSize-1) & ~(pageSize-1);
+ if (munmap(ptr,bytes) == -1)
+ // -- GODOT start --
+ // throw std::bad_alloc();
+ abort();
+ // -- GODOT end --
+ }
+
+ /* hint for transparent huge pages (THP) */
+ void os_advise(void* pptr, size_t bytes)
+ {
+#if defined(MADV_HUGEPAGE)
+ madvise(pptr,bytes,MADV_HUGEPAGE);
+#endif
+ }
+}
+
+#endif
diff --git a/thirdparty/embree/common/sys/alloc.h b/thirdparty/embree/common/sys/alloc.h
new file mode 100644
index 0000000000..4fa474ec1d
--- /dev/null
+++ b/thirdparty/embree/common/sys/alloc.h
@@ -0,0 +1,164 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include <vector>
+#include <set>
+
+namespace embree
+{
+#define ALIGNED_STRUCT_(align) \
+ void* operator new(size_t size) { return alignedMalloc(size,align); } \
+ void operator delete(void* ptr) { alignedFree(ptr); } \
+ void* operator new[](size_t size) { return alignedMalloc(size,align); } \
+ void operator delete[](void* ptr) { alignedFree(ptr); }
+
+#define ALIGNED_CLASS_(align) \
+ public: \
+ ALIGNED_STRUCT_(align) \
+ private:
+
+ /*! aligned allocation */
+ void* alignedMalloc(size_t size, size_t align);
+ void alignedFree(void* ptr);
+
+ /*! allocator that performs aligned allocations */
+ template<typename T, size_t alignment>
+ struct aligned_allocator
+ {
+ typedef T value_type;
+ typedef T* pointer;
+ typedef const T* const_pointer;
+ typedef T& reference;
+ typedef const T& const_reference;
+ typedef std::size_t size_type;
+ typedef std::ptrdiff_t difference_type;
+
+ __forceinline pointer allocate( size_type n ) {
+ return (pointer) alignedMalloc(n*sizeof(value_type),alignment);
+ }
+
+ __forceinline void deallocate( pointer p, size_type n ) {
+ return alignedFree(p);
+ }
+
+ __forceinline void construct( pointer p, const_reference val ) {
+ new (p) T(val);
+ }
+
+ __forceinline void destroy( pointer p ) {
+ p->~T();
+ }
+ };
+
+ /*! allocates pages directly from OS */
+ bool win_enable_selockmemoryprivilege(bool verbose);
+ bool os_init(bool hugepages, bool verbose);
+ void* os_malloc (size_t bytes, bool& hugepages);
+ size_t os_shrink (void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages);
+ void os_free (void* ptr, size_t bytes, bool hugepages);
+ void os_advise (void* ptr, size_t bytes);
+
+ /*! allocator that performs OS allocations */
+ template<typename T>
+ struct os_allocator
+ {
+ typedef T value_type;
+ typedef T* pointer;
+ typedef const T* const_pointer;
+ typedef T& reference;
+ typedef const T& const_reference;
+ typedef std::size_t size_type;
+ typedef std::ptrdiff_t difference_type;
+
+ __forceinline os_allocator ()
+ : hugepages(false) {}
+
+ __forceinline pointer allocate( size_type n ) {
+ return (pointer) os_malloc(n*sizeof(value_type),hugepages);
+ }
+
+ __forceinline void deallocate( pointer p, size_type n ) {
+ return os_free(p,n*sizeof(value_type),hugepages);
+ }
+
+ __forceinline void construct( pointer p, const_reference val ) {
+ new (p) T(val);
+ }
+
+ __forceinline void destroy( pointer p ) {
+ p->~T();
+ }
+
+ bool hugepages;
+ };
+
+ /*! allocator for IDs */
+ template<typename T, size_t max_id>
+ struct IDPool
+ {
+ typedef T value_type;
+
+ IDPool ()
+ : nextID(0) {}
+
+ T allocate()
+ {
+ /* return ID from list */
+ if (!IDs.empty())
+ {
+ T id = *IDs.begin();
+ IDs.erase(IDs.begin());
+ return id;
+ }
+
+ /* allocate new ID */
+ else
+ {
+ if (size_t(nextID)+1 > max_id)
+ return -1;
+
+ return nextID++;
+ }
+ }
+
+ /* adds an ID provided by the user */
+ bool add(T id)
+ {
+ if (id > max_id)
+ return false;
+
+ /* check if ID should be in IDs set */
+ if (id < nextID) {
+ auto p = IDs.find(id);
+ if (p == IDs.end()) return false;
+ IDs.erase(p);
+ return true;
+ }
+
+ /* otherwise increase ID set */
+ else
+ {
+ for (T i=nextID; i<id; i++) {
+ IDs.insert(i);
+ }
+ nextID = id+1;
+ return true;
+ }
+ }
+
+ void deallocate( T id )
+ {
+ assert(id < nextID);
+ MAYBE_UNUSED auto done = IDs.insert(id).second;
+ assert(done);
+ }
+
+ private:
+ std::set<T> IDs; //!< stores deallocated IDs to be reused
+ T nextID; //!< next ID to use when IDs vector is empty
+ };
+}
+
diff --git a/thirdparty/embree/common/sys/array.h b/thirdparty/embree/common/sys/array.h
new file mode 100644
index 0000000000..dd9190c52a
--- /dev/null
+++ b/thirdparty/embree/common/sys/array.h
@@ -0,0 +1,222 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include "alloc.h"
+
+namespace embree
+{
+ /*! static array with static size */
+ template<typename T, size_t N>
+ class array_t
+ {
+ public:
+
+ /********************** Iterators ****************************/
+
+ __forceinline T* begin() const { return items; };
+ __forceinline T* end () const { return items+N; };
+
+
+ /********************** Capacity ****************************/
+
+ __forceinline bool empty () const { return N == 0; }
+ __forceinline size_t size () const { return N; }
+ __forceinline size_t max_size () const { return N; }
+
+
+ /******************** Element access **************************/
+
+ __forceinline T& operator[](size_t i) { assert(i < N); return items[i]; }
+ __forceinline const T& operator[](size_t i) const { assert(i < N); return items[i]; }
+
+ __forceinline T& at(size_t i) { assert(i < N); return items[i]; }
+ __forceinline const T& at(size_t i) const { assert(i < N); return items[i]; }
+
+ __forceinline T& front() const { assert(N > 0); return items[0]; };
+ __forceinline T& back () const { assert(N > 0); return items[N-1]; };
+
+ __forceinline T* data() { return items; };
+ __forceinline const T* data() const { return items; };
+
+ private:
+ T items[N];
+ };
+
+ /*! static array with dynamic size */
+ template<typename T, size_t N>
+ class darray_t
+ {
+ public:
+
+ __forceinline darray_t () : M(0) {}
+
+ __forceinline darray_t (const T& v) : M(0) {
+ for (size_t i=0; i<N; i++) items[i] = v;
+ }
+
+ /********************** Iterators ****************************/
+
+ __forceinline T* begin() const { return items; };
+ __forceinline T* end () const { return items+M; };
+
+
+ /********************** Capacity ****************************/
+
+ __forceinline bool empty () const { return M == 0; }
+ __forceinline size_t size () const { return M; }
+ __forceinline size_t capacity () const { return N; }
+ __forceinline size_t max_size () const { return N; }
+
+ void resize(size_t new_size) {
+ assert(new_size < max_size());
+ M = new_size;
+ }
+
+ /******************** Modifiers **************************/
+
+ __forceinline void push_back(const T& v)
+ {
+ assert(M+1 < max_size());
+ items[M++] = v;
+ }
+
+ __forceinline void pop_back()
+ {
+ assert(!empty());
+ M--;
+ }
+
+ __forceinline void clear() {
+ M = 0;
+ }
+
+ /******************** Element access **************************/
+
+ __forceinline T& operator[](size_t i) { assert(i < M); return items[i]; }
+ __forceinline const T& operator[](size_t i) const { assert(i < M); return items[i]; }
+
+ __forceinline T& at(size_t i) { assert(i < M); return items[i]; }
+ __forceinline const T& at(size_t i) const { assert(i < M); return items[i]; }
+
+ __forceinline T& front() const { assert(M > 0); return items[0]; };
+ __forceinline T& back () const { assert(M > 0); return items[M-1]; };
+
+ __forceinline T* data() { return items; };
+ __forceinline const T* data() const { return items; };
+
+ private:
+ size_t M;
+ T items[N];
+ };
+
+ /*! dynamic sized array that is allocated on the stack */
+#define dynamic_large_stack_array(Ty,Name,N,max_stack_bytes) StackArray<Ty,max_stack_bytes> Name(N)
+ template<typename Ty, size_t max_stack_bytes>
+ struct __aligned(64) StackArray
+ {
+ __forceinline StackArray (const size_t N)
+ : N(N)
+ {
+ if (N*sizeof(Ty) <= max_stack_bytes)
+ data = &arr[0];
+ else
+ data = (Ty*) alignedMalloc(N*sizeof(Ty),64);
+ }
+
+ __forceinline ~StackArray () {
+ if (data != &arr[0]) alignedFree(data);
+ }
+
+ __forceinline operator Ty* () { return data; }
+ __forceinline operator const Ty* () const { return data; }
+
+ __forceinline Ty& operator[](const int i) { assert(i>=0 && i<N); return data[i]; }
+ __forceinline const Ty& operator[](const int i) const { assert(i>=0 && i<N); return data[i]; }
+
+ __forceinline Ty& operator[](const unsigned i) { assert(i<N); return data[i]; }
+ __forceinline const Ty& operator[](const unsigned i) const { assert(i<N); return data[i]; }
+
+#if defined(__64BIT__)
+ __forceinline Ty& operator[](const size_t i) { assert(i<N); return data[i]; }
+ __forceinline const Ty& operator[](const size_t i) const { assert(i<N); return data[i]; }
+#endif
+
+ private:
+ Ty arr[max_stack_bytes/sizeof(Ty)];
+ Ty* data;
+ size_t N;
+
+ private:
+ StackArray (const StackArray& other) DELETED; // do not implement
+ StackArray& operator= (const StackArray& other) DELETED; // do not implement
+
+ };
+
+ /*! dynamic sized array that is allocated on the stack */
+ template<typename Ty, size_t max_stack_elements, size_t max_total_elements>
+ struct __aligned(64) DynamicStackArray
+ {
+ __forceinline DynamicStackArray ()
+ : data(&arr[0]) {}
+
+ __forceinline ~DynamicStackArray ()
+ {
+ if (!isStackAllocated())
+ delete[] data;
+ }
+
+ __forceinline bool isStackAllocated() const {
+ return data == &arr[0];
+ }
+
+ __forceinline size_t size() const
+ {
+ if (isStackAllocated()) return max_stack_elements;
+ else return max_total_elements;
+ }
+
+ __forceinline void resize(size_t M)
+ {
+ assert(M <= max_total_elements);
+ if (likely(M <= max_stack_elements)) return;
+ if (likely(!isStackAllocated())) return;
+
+ data = new Ty[max_total_elements];
+
+ for (size_t i=0; i<max_stack_elements; i++)
+ data[i] = arr[i];
+ }
+
+ __forceinline operator Ty* () { return data; }
+ __forceinline operator const Ty* () const { return data; }
+
+ __forceinline Ty& operator[](const int i) { assert(i>=0 && i<max_total_elements); resize(i+1); return data[i]; }
+ __forceinline Ty& operator[](const unsigned i) { assert(i<max_total_elements); resize(i+1); return data[i]; }
+
+#if defined(__64BIT__)
+ __forceinline Ty& operator[](const size_t i) { assert(i<max_total_elements); resize(i+1); return data[i]; }
+#endif
+
+ __forceinline DynamicStackArray (const DynamicStackArray& other)
+ : data(&arr[0])
+ {
+ for (size_t i=0; i<other.size(); i++)
+ this->operator[] (i) = other[i];
+ }
+
+ DynamicStackArray& operator= (const DynamicStackArray& other)
+ {
+ for (size_t i=0; i<other.size(); i++)
+ this->operator[] (i) = other[i];
+
+ return *this;
+ }
+
+ private:
+ Ty arr[max_stack_elements];
+ Ty* data;
+ };
+}
diff --git a/thirdparty/embree/common/sys/atomic.h b/thirdparty/embree/common/sys/atomic.h
new file mode 100644
index 0000000000..67af254f36
--- /dev/null
+++ b/thirdparty/embree/common/sys/atomic.h
@@ -0,0 +1,59 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <atomic>
+#include "intrinsics.h"
+
+namespace embree
+{
+/* compiler memory barriers */
+#if defined(__INTEL_COMPILER)
+//#define __memory_barrier() __memory_barrier()
+#elif defined(__GNUC__) || defined(__clang__)
+# define __memory_barrier() asm volatile("" ::: "memory")
+#elif defined(_MSC_VER)
+# define __memory_barrier() _ReadWriteBarrier()
+#endif
+
+ template <typename T>
+ struct atomic : public std::atomic<T>
+ {
+ atomic () {}
+
+ atomic (const T& a)
+ : std::atomic<T>(a) {}
+
+ atomic (const atomic<T>& a) {
+ this->store(a.load());
+ }
+
+ atomic& operator=(const atomic<T>& other) {
+ this->store(other.load());
+ return *this;
+ }
+ };
+
+ template<typename T>
+ __forceinline void atomic_min(std::atomic<T>& aref, const T& bref)
+ {
+ const T b = bref.load();
+ while (true) {
+ T a = aref.load();
+ if (a <= b) break;
+ if (aref.compare_exchange_strong(a,b)) break;
+ }
+ }
+
+ template<typename T>
+ __forceinline void atomic_max(std::atomic<T>& aref, const T& bref)
+ {
+ const T b = bref.load();
+ while (true) {
+ T a = aref.load();
+ if (a >= b) break;
+ if (aref.compare_exchange_strong(a,b)) break;
+ }
+ }
+}
diff --git a/thirdparty/embree/common/sys/barrier.cpp b/thirdparty/embree/common/sys/barrier.cpp
new file mode 100644
index 0000000000..0c0e39d92d
--- /dev/null
+++ b/thirdparty/embree/common/sys/barrier.cpp
@@ -0,0 +1,289 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "barrier.h"
+#include "condition.h"
+#include "regression.h"
+#include "thread.h"
+
+#if defined (__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+ struct BarrierSysImplementation
+ {
+ __forceinline BarrierSysImplementation (size_t N)
+ : i(0), enterCount(0), exitCount(0), barrierSize(0)
+ {
+ events[0] = CreateEvent(nullptr, TRUE, FALSE, nullptr);
+ events[1] = CreateEvent(nullptr, TRUE, FALSE, nullptr);
+ init(N);
+ }
+
+ __forceinline ~BarrierSysImplementation ()
+ {
+ CloseHandle(events[0]);
+ CloseHandle(events[1]);
+ }
+
+ __forceinline void init(size_t N)
+ {
+ barrierSize = N;
+ enterCount.store(N);
+ exitCount.store(N);
+ }
+
+ __forceinline void wait()
+ {
+ /* every thread entering the barrier decrements this count */
+ size_t i0 = i;
+ size_t cnt0 = enterCount--;
+
+ /* all threads except the last one are wait in the barrier */
+ if (cnt0 > 1)
+ {
+ if (WaitForSingleObject(events[i0], INFINITE) != WAIT_OBJECT_0)
+ THROW_RUNTIME_ERROR("WaitForSingleObjects failed");
+ }
+
+ /* the last thread starts all threads waiting at the barrier */
+ else
+ {
+ i = 1-i;
+ enterCount.store(barrierSize);
+ if (SetEvent(events[i0]) == 0)
+ THROW_RUNTIME_ERROR("SetEvent failed");
+ }
+
+ /* every thread leaving the barrier decrements this count */
+ size_t cnt1 = exitCount--;
+
+ /* the last thread that left the barrier resets the event again */
+ if (cnt1 == 1)
+ {
+ exitCount.store(barrierSize);
+ if (ResetEvent(events[i0]) == 0)
+ THROW_RUNTIME_ERROR("ResetEvent failed");
+ }
+ }
+
+ public:
+ HANDLE events[2];
+ atomic<size_t> i;
+ atomic<size_t> enterCount;
+ atomic<size_t> exitCount;
+ size_t barrierSize;
+ };
+}
+
+#else
+
+namespace embree
+{
+ struct BarrierSysImplementation
+ {
+ __forceinline BarrierSysImplementation (size_t N)
+ : count(0), barrierSize(0)
+ {
+ init(N);
+ }
+
+ __forceinline void init(size_t N)
+ {
+ assert(count == 0);
+ count = 0;
+ barrierSize = N;
+ }
+
+ __forceinline void wait()
+ {
+ mutex.lock();
+ count++;
+
+ if (count == barrierSize) {
+ count = 0;
+ cond.notify_all();
+ mutex.unlock();
+ return;
+ }
+
+ cond.wait(mutex);
+ mutex.unlock();
+ return;
+ }
+
+ public:
+ MutexSys mutex;
+ ConditionSys cond;
+ volatile size_t count;
+ volatile size_t barrierSize;
+ };
+}
+
+#endif
+
+namespace embree
+{
+ BarrierSys::BarrierSys (size_t N) {
+ opaque = new BarrierSysImplementation(N);
+ }
+
+ BarrierSys::~BarrierSys () {
+ delete (BarrierSysImplementation*) opaque;
+ }
+
+ void BarrierSys::init(size_t count) {
+ ((BarrierSysImplementation*) opaque)->init(count);
+ }
+
+ void BarrierSys::wait() {
+ ((BarrierSysImplementation*) opaque)->wait();
+ }
+
+ LinearBarrierActive::LinearBarrierActive (size_t N)
+ : count0(nullptr), count1(nullptr), mode(0), flag0(0), flag1(0), threadCount(0)
+ {
+ if (N == 0) N = getNumberOfLogicalThreads();
+ init(N);
+ }
+
+ LinearBarrierActive::~LinearBarrierActive()
+ {
+ delete[] count0;
+ delete[] count1;
+ }
+
+ void LinearBarrierActive::init(size_t N)
+ {
+ if (threadCount != N) {
+ threadCount = N;
+ if (count0) delete[] count0; count0 = new unsigned char[N];
+ if (count1) delete[] count1; count1 = new unsigned char[N];
+ }
+ mode = 0;
+ flag0 = 0;
+ flag1 = 0;
+ for (size_t i=0; i<N; i++) count0[i] = 0;
+ for (size_t i=0; i<N; i++) count1[i] = 0;
+ }
+
+ void LinearBarrierActive::wait (const size_t threadIndex)
+ {
+ if (mode == 0)
+ {
+ if (threadIndex == 0)
+ {
+ for (size_t i=0; i<threadCount; i++)
+ count1[i] = 0;
+
+ for (size_t i=1; i<threadCount; i++)
+ {
+ while (likely(count0[i] == 0))
+ pause_cpu();
+ }
+ mode = 1;
+ flag1 = 0;
+ __memory_barrier();
+ flag0 = 1;
+ }
+ else
+ {
+ count0[threadIndex] = 1;
+ {
+ while (likely(flag0 == 0))
+ pause_cpu();
+ }
+
+ }
+ }
+ else
+ {
+ if (threadIndex == 0)
+ {
+ for (size_t i=0; i<threadCount; i++)
+ count0[i] = 0;
+
+ for (size_t i=1; i<threadCount; i++)
+ {
+ while (likely(count1[i] == 0))
+ pause_cpu();
+ }
+
+ mode = 0;
+ flag0 = 0;
+ __memory_barrier();
+ flag1 = 1;
+ }
+ else
+ {
+ count1[threadIndex] = 1;
+ {
+ while (likely(flag1 == 0))
+ pause_cpu();
+ }
+ }
+ }
+ }
+
+ struct barrier_sys_regression_test : public RegressionTest
+ {
+ BarrierSys barrier;
+ std::atomic<size_t> threadID;
+ std::atomic<size_t> numFailed;
+ std::vector<size_t> threadResults;
+
+ barrier_sys_regression_test()
+ : RegressionTest("barrier_sys_regression_test"), threadID(0), numFailed(0)
+ {
+ registerRegressionTest(this);
+ }
+
+ static void thread_alloc(barrier_sys_regression_test* This)
+ {
+ size_t tid = This->threadID++;
+ for (size_t j=0; j<1000; j++)
+ {
+ This->barrier.wait();
+ This->threadResults[tid] = tid;
+ This->barrier.wait();
+ }
+ }
+
+ bool run ()
+ {
+ threadID.store(0);
+ numFailed.store(0);
+
+ size_t numThreads = getNumberOfLogicalThreads();
+ threadResults.resize(numThreads);
+ barrier.init(numThreads+1);
+
+ /* create threads */
+ std::vector<thread_t> threads;
+ for (size_t i=0; i<numThreads; i++)
+ threads.push_back(createThread((thread_func)thread_alloc,this));
+
+ /* run test */
+ for (size_t i=0; i<1000; i++)
+ {
+ for (size_t i=0; i<numThreads; i++) threadResults[i] = 0;
+ barrier.wait();
+ barrier.wait();
+ for (size_t i=0; i<numThreads; i++) numFailed += threadResults[i] != i;
+ }
+
+ /* destroy threads */
+ for (size_t i=0; i<numThreads; i++)
+ join(threads[i]);
+
+ return numFailed == 0;
+ }
+ };
+
+ barrier_sys_regression_test barrier_sys_regression_test;
+}
+
+
diff --git a/thirdparty/embree/common/sys/barrier.h b/thirdparty/embree/common/sys/barrier.h
new file mode 100644
index 0000000000..37fc036291
--- /dev/null
+++ b/thirdparty/embree/common/sys/barrier.h
@@ -0,0 +1,112 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "intrinsics.h"
+#include "sysinfo.h"
+#include "atomic.h"
+
+namespace embree
+{
+ /*! system barrier using operating system */
+ class BarrierSys
+ {
+ public:
+
+ /*! construction / destruction */
+ BarrierSys (size_t N = 0);
+ ~BarrierSys ();
+
+ private:
+ /*! class in non-copyable */
+ BarrierSys (const BarrierSys& other) DELETED; // do not implement
+ BarrierSys& operator= (const BarrierSys& other) DELETED; // do not implement
+
+ public:
+ /*! intializes the barrier with some number of threads */
+ void init(size_t count);
+
+ /*! lets calling thread wait in barrier */
+ void wait();
+
+ private:
+ void* opaque;
+ };
+
+ /*! fast active barrier using atomitc counter */
+ struct BarrierActive
+ {
+ public:
+ BarrierActive ()
+ : cntr(0) {}
+
+ void reset() {
+ cntr.store(0);
+ }
+
+ void wait (size_t numThreads)
+ {
+ cntr++;
+ while (cntr.load() != numThreads)
+ pause_cpu();
+ }
+
+ private:
+ std::atomic<size_t> cntr;
+ };
+
+ /*! fast active barrier that does not require initialization to some number of threads */
+ struct BarrierActiveAutoReset
+ {
+ public:
+ BarrierActiveAutoReset ()
+ : cntr0(0), cntr1(0) {}
+
+ void wait (size_t threadCount)
+ {
+ cntr0.fetch_add(1);
+ while (cntr0 != threadCount) pause_cpu();
+ cntr1.fetch_add(1);
+ while (cntr1 != threadCount) pause_cpu();
+ cntr0.fetch_add(-1);
+ while (cntr0 != 0) pause_cpu();
+ cntr1.fetch_add(-1);
+ while (cntr1 != 0) pause_cpu();
+ }
+
+ private:
+ std::atomic<size_t> cntr0;
+ std::atomic<size_t> cntr1;
+ };
+
+ class LinearBarrierActive
+ {
+ public:
+
+ /*! construction and destruction */
+ LinearBarrierActive (size_t threadCount = 0);
+ ~LinearBarrierActive();
+
+ private:
+ /*! class in non-copyable */
+ LinearBarrierActive (const LinearBarrierActive& other) DELETED; // do not implement
+ LinearBarrierActive& operator= (const LinearBarrierActive& other) DELETED; // do not implement
+
+ public:
+ /*! intializes the barrier with some number of threads */
+ void init(size_t threadCount);
+
+ /*! thread with threadIndex waits in the barrier */
+ void wait (const size_t threadIndex);
+
+ private:
+ volatile unsigned char* count0;
+ volatile unsigned char* count1;
+ volatile unsigned int mode;
+ volatile unsigned int flag0;
+ volatile unsigned int flag1;
+ volatile size_t threadCount;
+ };
+}
+
diff --git a/thirdparty/embree/common/sys/condition.cpp b/thirdparty/embree/common/sys/condition.cpp
new file mode 100644
index 0000000000..606a1d0b04
--- /dev/null
+++ b/thirdparty/embree/common/sys/condition.cpp
@@ -0,0 +1,85 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "condition.h"
+
+#if defined(__WIN32__) && !defined(PTHREADS_WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+ struct ConditionImplementation
+ {
+ __forceinline ConditionImplementation () {
+ InitializeConditionVariable(&cond);
+ }
+
+ __forceinline ~ConditionImplementation () {
+ }
+
+ __forceinline void wait(MutexSys& mutex_in) {
+ SleepConditionVariableCS(&cond, (LPCRITICAL_SECTION)mutex_in.mutex, INFINITE);
+ }
+
+ __forceinline void notify_all() {
+ WakeAllConditionVariable(&cond);
+ }
+
+ public:
+ CONDITION_VARIABLE cond;
+ };
+}
+#endif
+
+#if defined(__UNIX__) || defined(PTHREADS_WIN32)
+#include <pthread.h>
+namespace embree
+{
+ struct ConditionImplementation
+ {
+ __forceinline ConditionImplementation () {
+ if (pthread_cond_init(&cond,nullptr) != 0)
+ THROW_RUNTIME_ERROR("pthread_cond_init failed");
+ }
+
+ __forceinline ~ConditionImplementation() {
+ MAYBE_UNUSED bool ok = pthread_cond_destroy(&cond) == 0;
+ assert(ok);
+ }
+
+ __forceinline void wait(MutexSys& mutex) {
+ if (pthread_cond_wait(&cond, (pthread_mutex_t*)mutex.mutex) != 0)
+ THROW_RUNTIME_ERROR("pthread_cond_wait failed");
+ }
+
+ __forceinline void notify_all() {
+ if (pthread_cond_broadcast(&cond) != 0)
+ THROW_RUNTIME_ERROR("pthread_cond_broadcast failed");
+ }
+
+ public:
+ pthread_cond_t cond;
+ };
+}
+#endif
+
+namespace embree
+{
+ ConditionSys::ConditionSys () {
+ cond = new ConditionImplementation;
+ }
+
+ ConditionSys::~ConditionSys() {
+ delete (ConditionImplementation*) cond;
+ }
+
+ void ConditionSys::wait(MutexSys& mutex) {
+ ((ConditionImplementation*) cond)->wait(mutex);
+ }
+
+ void ConditionSys::notify_all() {
+ ((ConditionImplementation*) cond)->notify_all();
+ }
+}
diff --git a/thirdparty/embree/common/sys/condition.h b/thirdparty/embree/common/sys/condition.h
new file mode 100644
index 0000000000..557c6e3482
--- /dev/null
+++ b/thirdparty/embree/common/sys/condition.h
@@ -0,0 +1,31 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "mutex.h"
+
+namespace embree
+{
+ class ConditionSys
+ {
+ public:
+ ConditionSys();
+ ~ConditionSys();
+ void wait( class MutexSys& mutex );
+ void notify_all();
+
+ template<typename Predicate>
+ __forceinline void wait( class MutexSys& mutex, const Predicate& pred )
+ {
+ while (!pred()) wait(mutex);
+ }
+
+ private:
+ ConditionSys (const ConditionSys& other) DELETED; // do not implement
+ ConditionSys& operator= (const ConditionSys& other) DELETED; // do not implement
+
+ protected:
+ void* cond;
+ };
+}
diff --git a/thirdparty/embree/common/sys/filename.cpp b/thirdparty/embree/common/sys/filename.cpp
new file mode 100644
index 0000000000..f55b224302
--- /dev/null
+++ b/thirdparty/embree/common/sys/filename.cpp
@@ -0,0 +1,138 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "filename.h"
+#include "sysinfo.h"
+
+namespace embree
+{
+#ifdef __WIN32__
+ const char path_sep = '\\';
+#else
+ const char path_sep = '/';
+#endif
+
+ /*! create an empty filename */
+ FileName::FileName () {}
+
+ /*! create a valid filename from a string */
+ FileName::FileName (const char* in) {
+ filename = in;
+ for (size_t i=0; i<filename.size(); i++)
+ if (filename[i] == '\\' || filename[i] == '/')
+ filename[i] = path_sep;
+ while (!filename.empty() && filename[filename.size()-1] == path_sep)
+ filename.resize(filename.size()-1);
+ }
+
+ /*! create a valid filename from a string */
+ FileName::FileName (const std::string& in) {
+ filename = in;
+ for (size_t i=0; i<filename.size(); i++)
+ if (filename[i] == '\\' || filename[i] == '/')
+ filename[i] = path_sep;
+ while (!filename.empty() && filename[filename.size()-1] == path_sep)
+ filename.resize(filename.size()-1);
+ }
+
+ /*! returns path to home folder */
+ FileName FileName::homeFolder()
+ {
+#ifdef __WIN32__
+ const char* home = getenv("UserProfile");
+#else
+ const char* home = getenv("HOME");
+#endif
+ if (home) return home;
+ return "";
+ }
+
+ /*! returns path to executable */
+ FileName FileName::executableFolder() {
+ return FileName(getExecutableFileName()).path();
+ }
+
+ /*! returns the path */
+ FileName FileName::path() const {
+ size_t pos = filename.find_last_of(path_sep);
+ if (pos == std::string::npos) return FileName();
+ return filename.substr(0,pos);
+ }
+
+ /*! returns the basename */
+ std::string FileName::base() const {
+ size_t pos = filename.find_last_of(path_sep);
+ if (pos == std::string::npos) return filename;
+ return filename.substr(pos+1);
+ }
+
+ /*! returns the extension */
+ std::string FileName::ext() const {
+ size_t pos = filename.find_last_of('.');
+ if (pos == std::string::npos) return "";
+ return filename.substr(pos+1);
+ }
+
+ /*! returns the extension */
+ FileName FileName::dropExt() const {
+ size_t pos = filename.find_last_of('.');
+ if (pos == std::string::npos) return filename;
+ return filename.substr(0,pos);
+ }
+
+ /*! returns the basename without extension */
+ std::string FileName::name() const {
+ size_t start = filename.find_last_of(path_sep);
+ if (start == std::string::npos) start = 0; else start++;
+ size_t end = filename.find_last_of('.');
+ if (end == std::string::npos || end < start) end = filename.size();
+ return filename.substr(start, end - start);
+ }
+
+ /*! replaces the extension */
+ FileName FileName::setExt(const std::string& ext) const {
+ size_t start = filename.find_last_of(path_sep);
+ if (start == std::string::npos) start = 0; else start++;
+ size_t end = filename.find_last_of('.');
+ if (end == std::string::npos || end < start) return FileName(filename+ext);
+ return FileName(filename.substr(0,end)+ext);
+ }
+
+ /*! adds the extension */
+ FileName FileName::addExt(const std::string& ext) const {
+ return FileName(filename+ext);
+ }
+
+ /*! concatenates two filenames to this/other */
+ FileName FileName::operator +( const FileName& other ) const {
+ if (filename == "") return FileName(other);
+ else return FileName(filename + path_sep + other.filename);
+ }
+
+ /*! concatenates two filenames to this/other */
+ FileName FileName::operator +( const std::string& other ) const {
+ return operator+(FileName(other));
+ }
+
+ /*! removes the base from a filename (if possible) */
+ FileName FileName::operator -( const FileName& base ) const {
+ size_t pos = filename.find_first_of(base);
+ if (pos == std::string::npos) return *this;
+ return FileName(filename.substr(pos+1));
+ }
+
+ /*! == operator */
+ bool operator== (const FileName& a, const FileName& b) {
+ return a.filename == b.filename;
+ }
+
+ /*! != operator */
+ bool operator!= (const FileName& a, const FileName& b) {
+ return a.filename != b.filename;
+ }
+
+ /*! output operator */
+ std::ostream& operator<<(std::ostream& cout, const FileName& filename) {
+ return cout << filename.filename;
+ }
+}
diff --git a/thirdparty/embree/common/sys/filename.h b/thirdparty/embree/common/sys/filename.h
new file mode 100644
index 0000000000..d5929cd836
--- /dev/null
+++ b/thirdparty/embree/common/sys/filename.h
@@ -0,0 +1,81 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+namespace embree
+{
+ /*! Convenience class for handling file names and paths. */
+ class FileName
+ {
+ public:
+
+ /*! create an empty filename */
+ FileName ();
+
+ /*! create a valid filename from a string */
+ FileName (const char* filename);
+
+ /*! create a valid filename from a string */
+ FileName (const std::string& filename);
+
+ /*! returns path to home folder */
+ static FileName homeFolder();
+
+ /*! returns path to executable */
+ static FileName executableFolder();
+
+ /*! auto convert into a string */
+ operator std::string() const { return filename; }
+
+ /*! returns a string of the filename */
+ const std::string str() const { return filename; }
+
+ /*! returns a c-string of the filename */
+ const char* c_str() const { return filename.c_str(); }
+
+ /*! returns the path of a filename */
+ FileName path() const;
+
+ /*! returns the file of a filename */
+ std::string base() const;
+
+ /*! returns the base of a filename without extension */
+ std::string name() const;
+
+ /*! returns the file extension */
+ std::string ext() const;
+
+ /*! drops the file extension */
+ FileName dropExt() const;
+
+ /*! replaces the file extension */
+ FileName setExt(const std::string& ext = "") const;
+
+ /*! adds file extension */
+ FileName addExt(const std::string& ext = "") const;
+
+ /*! concatenates two filenames to this/other */
+ FileName operator +( const FileName& other ) const;
+
+ /*! concatenates two filenames to this/other */
+ FileName operator +( const std::string& other ) const;
+
+ /*! removes the base from a filename (if possible) */
+ FileName operator -( const FileName& base ) const;
+
+ /*! == operator */
+ friend bool operator==(const FileName& a, const FileName& b);
+
+ /*! != operator */
+ friend bool operator!=(const FileName& a, const FileName& b);
+
+ /*! output operator */
+ friend std::ostream& operator<<(std::ostream& cout, const FileName& filename);
+
+ private:
+ std::string filename;
+ };
+}
diff --git a/thirdparty/embree/common/sys/intrinsics.h b/thirdparty/embree/common/sys/intrinsics.h
new file mode 100644
index 0000000000..ed8dd7d40a
--- /dev/null
+++ b/thirdparty/embree/common/sys/intrinsics.h
@@ -0,0 +1,525 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+#if defined(__WIN32__)
+#include <intrin.h>
+#endif
+
+#if defined(__ARM_NEON)
+#include "../simd/arm/emulation.h"
+#else
+#include <immintrin.h>
+#endif
+
+#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
+ #if !defined(_tzcnt_u32)
+ #define _tzcnt_u32 __tzcnt_u32
+ #endif
+ #if !defined(_tzcnt_u64)
+ #define _tzcnt_u64 __tzcnt_u64
+ #endif
+#endif
+
+#if defined(__LZCNT__)
+ #if !defined(_lzcnt_u32)
+ #define _lzcnt_u32 __lzcnt32
+ #endif
+ #if !defined(_lzcnt_u64)
+ #define _lzcnt_u64 __lzcnt64
+ #endif
+#endif
+
+#if defined(__WIN32__)
+// -- GODOT start --
+#if !defined(NOMINMAX)
+// -- GODOT end --
+#define NOMINMAX
+// -- GODOT start --
+#endif
+#include "windows.h"
+// -- GODOT end --
+#endif
+
+/* normally defined in pmmintrin.h, but we always need this */
+#if !defined(_MM_SET_DENORMALS_ZERO_MODE)
+#define _MM_DENORMALS_ZERO_ON (0x0040)
+#define _MM_DENORMALS_ZERO_OFF (0x0000)
+#define _MM_DENORMALS_ZERO_MASK (0x0040)
+#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
+#endif
+
+namespace embree
+{
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__WIN32__)
+
+ __forceinline size_t read_tsc()
+ {
+ LARGE_INTEGER li;
+ QueryPerformanceCounter(&li);
+ return (size_t)li.QuadPart;
+ }
+
+ __forceinline int bsf(int v) {
+#if defined(__AVX2__)
+ return _tzcnt_u32(v);
+#else
+ unsigned long r = 0; _BitScanForward(&r,v); return r;
+#endif
+ }
+
+ __forceinline unsigned bsf(unsigned v) {
+#if defined(__AVX2__)
+ return _tzcnt_u32(v);
+#else
+ unsigned long r = 0; _BitScanForward(&r,v); return r;
+#endif
+ }
+
+#if defined(__X86_64__)
+ __forceinline size_t bsf(size_t v) {
+#if defined(__AVX2__)
+ return _tzcnt_u64(v);
+#else
+ unsigned long r = 0; _BitScanForward64(&r,v); return r;
+#endif
+ }
+#endif
+
+ __forceinline int bscf(int& v)
+ {
+ int i = bsf(v);
+ v &= v-1;
+ return i;
+ }
+
+ __forceinline unsigned bscf(unsigned& v)
+ {
+ unsigned i = bsf(v);
+ v &= v-1;
+ return i;
+ }
+
+#if defined(__X86_64__)
+ __forceinline size_t bscf(size_t& v)
+ {
+ size_t i = bsf(v);
+ v &= v-1;
+ return i;
+ }
+#endif
+
+ __forceinline int bsr(int v) {
+#if defined(__AVX2__)
+ return 31 - _lzcnt_u32(v);
+#else
+ unsigned long r = 0; _BitScanReverse(&r,v); return r;
+#endif
+ }
+
+ __forceinline unsigned bsr(unsigned v) {
+#if defined(__AVX2__)
+ return 31 - _lzcnt_u32(v);
+#else
+ unsigned long r = 0; _BitScanReverse(&r,v); return r;
+#endif
+ }
+
+#if defined(__X86_64__)
+ __forceinline size_t bsr(size_t v) {
+#if defined(__AVX2__)
+ return 63 -_lzcnt_u64(v);
+#else
+ unsigned long r = 0; _BitScanReverse64(&r, v); return r;
+#endif
+ }
+#endif
+
+ __forceinline int lzcnt(const int x)
+ {
+#if defined(__AVX2__)
+ return _lzcnt_u32(x);
+#else
+ if (unlikely(x == 0)) return 32;
+ return 31 - bsr(x);
+#endif
+ }
+
+ __forceinline int btc(int v, int i) {
+ long r = v; _bittestandcomplement(&r,i); return r;
+ }
+
+ __forceinline int bts(int v, int i) {
+ long r = v; _bittestandset(&r,i); return r;
+ }
+
+ __forceinline int btr(int v, int i) {
+ long r = v; _bittestandreset(&r,i); return r;
+ }
+
+#if defined(__X86_64__)
+
+ __forceinline size_t btc(size_t v, size_t i) {
+ size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
+ }
+
+ __forceinline size_t bts(size_t v, size_t i) {
+ __int64 r = v; _bittestandset64(&r,i); return r;
+ }
+
+ __forceinline size_t btr(size_t v, size_t i) {
+ __int64 r = v; _bittestandreset64(&r,i); return r;
+ }
+
+#endif
+
+ __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) {
+ return _InterlockedCompareExchange((volatile long*)p,v,c);
+ }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#else
+
+#if defined(__i386__) && defined(__PIC__)
+
+ __forceinline void __cpuid(int out[4], int op)
+ {
+ asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
+ "cpuid\n\t"
+ "xchg{l}\t{%%}ebx, %1\n\t"
+ : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
+ : "0"(op));
+ }
+
+ __forceinline void __cpuid_count(int out[4], int op1, int op2)
+ {
+ asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
+ "cpuid\n\t"
+ "xchg{l}\t{%%}ebx, %1\n\t"
+ : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
+ : "0" (op1), "2" (op2));
+ }
+
+#elif defined(__X86_ASM__)
+
+ __forceinline void __cpuid(int out[4], int op) {
+ asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
+ }
+
+ __forceinline void __cpuid_count(int out[4], int op1, int op2) {
+ asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2));
+ }
+
+#endif
+
+ __forceinline uint64_t read_tsc() {
+#if defined(__X86_ASM__)
+ uint32_t high,low;
+ asm volatile ("rdtsc" : "=d"(high), "=a"(low));
+ return (((uint64_t)high) << 32) + (uint64_t)low;
+#else
+ /* Not supported yet, meaning measuring traversal cost per pixel does not work. */
+ return 0;
+#endif
+ }
+
+ __forceinline int bsf(int v) {
+#if defined(__AVX2__)
+ return _tzcnt_u32(v);
+#elif defined(__X86_ASM__)
+ int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+#else
+ return __builtin_ctz(v);
+#endif
+ }
+
+#if defined(__64BIT__)
+ __forceinline unsigned bsf(unsigned v)
+ {
+#if defined(__AVX2__)
+ return _tzcnt_u32(v);
+#elif defined(__X86_ASM__)
+ unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+#else
+ return __builtin_ctz(v);
+#endif
+ }
+#endif
+
+ __forceinline size_t bsf(size_t v) {
+#if defined(__AVX2__)
+#if defined(__X86_64__)
+ return _tzcnt_u64(v);
+#else
+ return _tzcnt_u32(v);
+#endif
+#elif defined(__X86_ASM__)
+ size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+#else
+ return __builtin_ctzl(v);
+#endif
+ }
+
+ __forceinline int bscf(int& v)
+ {
+ int i = bsf(v);
+ v &= v-1;
+ return i;
+ }
+
+#if defined(__64BIT__)
+ __forceinline unsigned int bscf(unsigned int& v)
+ {
+ unsigned int i = bsf(v);
+ v &= v-1;
+ return i;
+ }
+#endif
+
+ __forceinline size_t bscf(size_t& v)
+ {
+ size_t i = bsf(v);
+ v &= v-1;
+ return i;
+ }
+
+ __forceinline int bsr(int v) {
+#if defined(__AVX2__)
+ return 31 - _lzcnt_u32(v);
+#elif defined(__X86_ASM__)
+ int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+#else
+ return __builtin_clz(v) ^ 31;
+#endif
+ }
+
+#if defined(__64BIT__)
+ __forceinline unsigned bsr(unsigned v) {
+#if defined(__AVX2__)
+ return 31 - _lzcnt_u32(v);
+#elif defined(__X86_ASM__)
+ unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+#else
+ return __builtin_clz(v) ^ 31;
+#endif
+ }
+#endif
+
+ __forceinline size_t bsr(size_t v) {
+#if defined(__AVX2__)
+#if defined(__X86_64__)
+ return 63 - _lzcnt_u64(v);
+#else
+ return 31 - _lzcnt_u32(v);
+#endif
+#elif defined(__X86_ASM__)
+ size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+#else
+ return (sizeof(v) * 8 - 1) - __builtin_clzl(v);
+#endif
+ }
+
+ __forceinline int lzcnt(const int x)
+ {
+#if defined(__AVX2__)
+ return _lzcnt_u32(x);
+#else
+ if (unlikely(x == 0)) return 32;
+ return 31 - bsr(x);
+#endif
+ }
+
+ __forceinline size_t blsr(size_t v) {
+#if defined(__AVX2__)
+#if defined(__INTEL_COMPILER)
+ return _blsr_u64(v);
+#else
+#if defined(__X86_64__)
+ return __blsr_u64(v);
+#else
+ return __blsr_u32(v);
+#endif
+#endif
+#else
+ return v & (v-1);
+#endif
+ }
+
+ __forceinline int btc(int v, int i) {
+#if defined(__X86_ASM__)
+ int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
+#else
+ return (v ^ (1 << i));
+#endif
+ }
+
+ __forceinline int bts(int v, int i) {
+#if defined(__X86_ASM__)
+ int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+#else
+ return (v | (v << i));
+#endif
+ }
+
+ __forceinline int btr(int v, int i) {
+#if defined(__X86_ASM__)
+ int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+#else
+ return (v & ~(v << i));
+#endif
+ }
+
+ __forceinline size_t btc(size_t v, size_t i) {
+#if defined(__X86_ASM__)
+ size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
+#else
+ return (v ^ (1 << i));
+#endif
+ }
+
+ __forceinline size_t bts(size_t v, size_t i) {
+#if defined(__X86_ASM__)
+ size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+#else
+ return (v | (v << i));
+#endif
+ }
+
+ __forceinline size_t btr(size_t v, size_t i) {
+#if defined(__X86_ASM__)
+ size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+#else
+ return (v & ~(v << i));
+#endif
+ }
+
+ __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) {
+ return __sync_val_compare_and_swap(value, comparand, input);
+ }
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// All Platforms
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__clang__) || defined(__GNUC__)
+#if !defined(_mm_undefined_ps)
+ __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); }
+#endif
+#if !defined(_mm_undefined_si128)
+ __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); }
+#endif
+#if !defined(_mm256_undefined_ps) && defined(__AVX__)
+ __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); }
+#endif
+#if !defined(_mm256_undefined_si256) && defined(__AVX__)
+ __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); }
+#endif
+#if !defined(_mm512_undefined_ps) && defined(__AVX512F__)
+ __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); }
+#endif
+#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__)
+ __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); }
+#endif
+#endif
+
+#if defined(__SSE4_2__)
+
+ __forceinline int popcnt(int in) {
+ return _mm_popcnt_u32(in);
+ }
+
+ __forceinline unsigned popcnt(unsigned in) {
+ return _mm_popcnt_u32(in);
+ }
+
+#if defined(__64BIT__)
+ __forceinline size_t popcnt(size_t in) {
+ return _mm_popcnt_u64(in);
+ }
+#endif
+
+#endif
+
+#if defined(__X86_ASM__)
+ __forceinline uint64_t rdtsc()
+ {
+ int dummy[4];
+ __cpuid(dummy,0);
+ uint64_t clock = read_tsc();
+ __cpuid(dummy,0);
+ return clock;
+ }
+#endif
+
+ __forceinline void pause_cpu(const size_t N = 8)
+ {
+ for (size_t i=0; i<N; i++)
+ _mm_pause();
+ }
+
+ /* prefetches */
+ __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); }
+ __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); }
+ __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); }
+ __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); }
+ __forceinline void prefetchEX (const void* ptr) {
+#if defined(__INTEL_COMPILER)
+ _mm_prefetch((const char*)ptr,_MM_HINT_ET0);
+#else
+ _mm_prefetch((const char*)ptr,_MM_HINT_T0);
+#endif
+ }
+
+ __forceinline void prefetchL1EX(const void* ptr) {
+ prefetchEX(ptr);
+ }
+
+ __forceinline void prefetchL2EX(const void* ptr) {
+ prefetchEX(ptr);
+ }
+#if defined(__AVX2__)
+ __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
+ __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
+#if defined(__X86_64__)
+ __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); }
+ __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); }
+#endif
+#endif
+
+#if defined(__AVX512F__)
+#if defined(__INTEL_COMPILER)
+ __forceinline float mm512_cvtss_f32(__m512 v) {
+ return _mm512_cvtss_f32(v);
+ }
+ __forceinline int mm512_mask2int(__mmask16 k1) {
+ return _mm512_mask2int(k1);
+ }
+ __forceinline __mmask16 mm512_int2mask(int mask) {
+ return _mm512_int2mask(mask);
+ }
+#else
+ __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3
+ return _mm_cvtss_f32(_mm512_castps512_ps128(v));
+ }
+ __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3
+ return (int)k1;
+ }
+ __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3
+ return (__mmask16)mask;
+ }
+#endif
+#endif
+}
diff --git a/thirdparty/embree/common/sys/library.cpp b/thirdparty/embree/common/sys/library.cpp
new file mode 100644
index 0000000000..fc983dffd5
--- /dev/null
+++ b/thirdparty/embree/common/sys/library.cpp
@@ -0,0 +1,83 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "library.h"
+#include "sysinfo.h"
+#include "filename.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+ /* opens a shared library */
+ lib_t openLibrary(const std::string& file)
+ {
+ std::string fullName = file+".dll";
+ FileName executable = getExecutableFileName();
+ HANDLE handle = LoadLibrary((executable.path() + fullName).c_str());
+ return lib_t(handle);
+ }
+
+ /* returns address of a symbol from the library */
+ void* getSymbol(lib_t lib, const std::string& sym) {
+ return (void*)GetProcAddress(HMODULE(lib),sym.c_str());
+ }
+
+ /* closes the shared library */
+ void closeLibrary(lib_t lib) {
+ FreeLibrary(HMODULE(lib));
+ }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__)
+
+#include <dlfcn.h>
+
+namespace embree
+{
+ /* opens a shared library */
+ lib_t openLibrary(const std::string& file)
+ {
+#if defined(__MACOSX__)
+ std::string fullName = "lib"+file+".dylib";
+#else
+ std::string fullName = "lib"+file+".so";
+#endif
+ void* lib = dlopen(fullName.c_str(), RTLD_NOW);
+ if (lib) return lib_t(lib);
+ FileName executable = getExecutableFileName();
+ lib = dlopen((executable.path() + fullName).c_str(),RTLD_NOW);
+ if (lib == nullptr) {
+ const char* error = dlerror();
+ if (error) {
+ THROW_RUNTIME_ERROR(error);
+ } else {
+ THROW_RUNTIME_ERROR("could not load library "+executable.str());
+ }
+ }
+ return lib_t(lib);
+ }
+
+ /* returns address of a symbol from the library */
+ void* getSymbol(lib_t lib, const std::string& sym) {
+ return dlsym(lib,sym.c_str());
+ }
+
+ /* closes the shared library */
+ void closeLibrary(lib_t lib) {
+ dlclose(lib);
+ }
+}
+#endif
diff --git a/thirdparty/embree/common/sys/library.h b/thirdparty/embree/common/sys/library.h
new file mode 100644
index 0000000000..67e14d2420
--- /dev/null
+++ b/thirdparty/embree/common/sys/library.h
@@ -0,0 +1,21 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+namespace embree
+{
+ /*! type for shared library */
+ typedef struct opaque_lib_t* lib_t;
+
+ /*! loads a shared library */
+ lib_t openLibrary(const std::string& file);
+
+ /*! returns address of a symbol from the library */
+ void* getSymbol(lib_t lib, const std::string& sym);
+
+ /*! unloads a shared library */
+ void closeLibrary(lib_t lib);
+}
diff --git a/thirdparty/embree/common/sys/mutex.cpp b/thirdparty/embree/common/sys/mutex.cpp
new file mode 100644
index 0000000000..789feaf2d8
--- /dev/null
+++ b/thirdparty/embree/common/sys/mutex.cpp
@@ -0,0 +1,57 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "mutex.h"
+#include "regression.h"
+
+#if defined(__WIN32__) && !defined(PTHREADS_WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+ MutexSys::MutexSys() { mutex = new CRITICAL_SECTION; InitializeCriticalSection((CRITICAL_SECTION*)mutex); }
+ MutexSys::~MutexSys() { DeleteCriticalSection((CRITICAL_SECTION*)mutex); delete (CRITICAL_SECTION*)mutex; }
+ void MutexSys::lock() { EnterCriticalSection((CRITICAL_SECTION*)mutex); }
+ bool MutexSys::try_lock() { return TryEnterCriticalSection((CRITICAL_SECTION*)mutex) != 0; }
+ void MutexSys::unlock() { LeaveCriticalSection((CRITICAL_SECTION*)mutex); }
+}
+#endif
+
+#if defined(__UNIX__) || defined(PTHREADS_WIN32)
+#include <pthread.h>
+namespace embree
+{
+ /*! system mutex using pthreads */
+ MutexSys::MutexSys()
+ {
+ mutex = new pthread_mutex_t;
+ if (pthread_mutex_init((pthread_mutex_t*)mutex, nullptr) != 0)
+ THROW_RUNTIME_ERROR("pthread_mutex_init failed");
+ }
+
+ MutexSys::~MutexSys()
+ {
+ MAYBE_UNUSED bool ok = pthread_mutex_destroy((pthread_mutex_t*)mutex) == 0;
+ assert(ok);
+ delete (pthread_mutex_t*)mutex;
+ }
+
+ void MutexSys::lock()
+ {
+ if (pthread_mutex_lock((pthread_mutex_t*)mutex) != 0)
+ THROW_RUNTIME_ERROR("pthread_mutex_lock failed");
+ }
+
+ bool MutexSys::try_lock() {
+ return pthread_mutex_trylock((pthread_mutex_t*)mutex) == 0;
+ }
+
+ void MutexSys::unlock()
+ {
+ if (pthread_mutex_unlock((pthread_mutex_t*)mutex) != 0)
+ THROW_RUNTIME_ERROR("pthread_mutex_unlock failed");
+ }
+};
+#endif
diff --git a/thirdparty/embree/common/sys/mutex.h b/thirdparty/embree/common/sys/mutex.h
new file mode 100644
index 0000000000..4cb3626d92
--- /dev/null
+++ b/thirdparty/embree/common/sys/mutex.h
@@ -0,0 +1,98 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include "intrinsics.h"
+#include "atomic.h"
+
+namespace embree
+{
+ /*! system mutex */
+ class MutexSys {
+ friend struct ConditionImplementation;
+ public:
+ MutexSys();
+ ~MutexSys();
+
+ private:
+ MutexSys (const MutexSys& other) DELETED; // do not implement
+ MutexSys& operator= (const MutexSys& other) DELETED; // do not implement
+
+ public:
+ void lock();
+ bool try_lock();
+ void unlock();
+
+ protected:
+ void* mutex;
+ };
+
+ /*! spinning mutex */
+ class SpinLock
+ {
+ public:
+
+ SpinLock ()
+ : flag(false) {}
+
+ __forceinline bool isLocked() {
+ return flag.load();
+ }
+
+ __forceinline void lock()
+ {
+ while (true)
+ {
+ while (flag.load())
+ {
+ _mm_pause();
+ _mm_pause();
+ }
+
+ bool expected = false;
+ if (flag.compare_exchange_strong(expected,true,std::memory_order_acquire))
+ break;
+ }
+ }
+
+ __forceinline bool try_lock()
+ {
+ bool expected = false;
+ if (flag.load() != expected) {
+ return false;
+ }
+ return flag.compare_exchange_strong(expected,true,std::memory_order_acquire);
+ }
+
+ __forceinline void unlock() {
+ flag.store(false,std::memory_order_release);
+ }
+
+ __forceinline void wait_until_unlocked()
+ {
+ while(flag.load())
+ {
+ _mm_pause();
+ _mm_pause();
+ }
+ }
+
+ public:
+ atomic<bool> flag;
+ };
+
+ /*! safe mutex lock and unlock helper */
+ template<typename Mutex> class Lock {
+ public:
+ Lock (Mutex& mutex) : mutex(mutex), locked(true) { mutex.lock(); }
+ Lock (Mutex& mutex, bool locked) : mutex(mutex), locked(locked) {}
+ ~Lock() { if (locked) mutex.unlock(); }
+ __forceinline void lock() { assert(!locked); locked = true; mutex.lock(); }
+ __forceinline bool isLocked() const { return locked; }
+ protected:
+ Mutex& mutex;
+ bool locked;
+ };
+}
diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h
new file mode 100644
index 0000000000..697e07bb86
--- /dev/null
+++ b/thirdparty/embree/common/sys/platform.h
@@ -0,0 +1,392 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define _CRT_SECURE_NO_WARNINGS
+
+#include <cstddef>
+#include <cassert>
+#include <cstdlib>
+#include <cstdio>
+#include <memory>
+#include <stdexcept>
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <string>
+#include <cstring>
+#include <stdint.h>
+#include <functional>
+
+////////////////////////////////////////////////////////////////////////////////
+/// detect platform
+////////////////////////////////////////////////////////////////////////////////
+
+/* detect 32 or 64 Intel platform */
+#if defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
+#define __X86_64__
+#define __X86_ASM__
+#elif defined(__i386__) || defined(_M_IX86)
+#define __X86_ASM__
+#endif
+
+/* detect 64 bit platform */
+#if defined(__X86_64__) || defined(__aarch64__)
+#define __64BIT__
+#endif
+
+/* detect Linux platform */
+#if defined(linux) || defined(__linux__) || defined(__LINUX__)
+# if !defined(__LINUX__)
+# define __LINUX__
+# endif
+# if !defined(__UNIX__)
+# define __UNIX__
+# endif
+#endif
+
+/* detect FreeBSD platform */
+#if defined(__FreeBSD__) || defined(__FREEBSD__)
+# if !defined(__FREEBSD__)
+# define __FREEBSD__
+# endif
+# if !defined(__UNIX__)
+# define __UNIX__
+# endif
+#endif
+
+/* detect Windows 95/98/NT/2000/XP/Vista/7/8/10 platform */
+#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)) && !defined(__CYGWIN__)
+# if !defined(__WIN32__)
+# define __WIN32__
+# endif
+#endif
+
+/* detect Cygwin platform */
+#if defined(__CYGWIN__)
+# if !defined(__UNIX__)
+# define __UNIX__
+# endif
+#endif
+
+/* detect MAC OS X platform */
+#if defined(__APPLE__) || defined(MACOSX) || defined(__MACOSX__)
+# if !defined(__MACOSX__)
+# define __MACOSX__
+# endif
+# if !defined(__UNIX__)
+# define __UNIX__
+# endif
+#endif
+
+/* try to detect other Unix systems */
+#if defined(__unix__) || defined (unix) || defined(__unix) || defined(_unix)
+# if !defined(__UNIX__)
+# define __UNIX__
+# endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Macros
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef __WIN32__
+#define dll_export __declspec(dllexport)
+#define dll_import __declspec(dllimport)
+#else
+#define dll_export __attribute__ ((visibility ("default")))
+#define dll_import
+#endif
+
+// -- GODOT start --
+#if defined(__WIN32__) && !defined(__MINGW32__)
+// -- GODOT end --
+#if !defined(__noinline)
+#define __noinline __declspec(noinline)
+#endif
+//#define __forceinline __forceinline
+//#define __restrict __restrict
+#if defined(__INTEL_COMPILER)
+#define __restrict__ __restrict
+#else
+#define __restrict__ //__restrict // causes issues with MSVC
+#endif
+#if !defined(__thread)
+#define __thread __declspec(thread)
+#endif
+#if !defined(__aligned)
+#define __aligned(...) __declspec(align(__VA_ARGS__))
+#endif
+//#define __FUNCTION__ __FUNCTION__
+#define debugbreak() __debugbreak()
+
+#else
+#if !defined(__noinline)
+#define __noinline __attribute__((noinline))
+#endif
+#if !defined(__forceinline)
+#define __forceinline inline __attribute__((always_inline))
+#endif
+//#define __restrict __restrict
+//#define __thread __thread
+#if !defined(__aligned)
+#define __aligned(...) __attribute__((aligned(__VA_ARGS__)))
+#endif
+#if !defined(__FUNCTION__)
+#define __FUNCTION__ __PRETTY_FUNCTION__
+#endif
+#define debugbreak() asm ("int $3")
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+ #define MAYBE_UNUSED __attribute__((unused))
+#else
+ #define MAYBE_UNUSED
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER < 1900) // before VS2015 deleted functions are not supported properly
+ #define DELETED
+#else
+ #define DELETED = delete
+#endif
+
+// -- GODOT start --
+#if !defined(likely)
+// -- GODOT end --
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#define likely(expr) (expr)
+#define unlikely(expr) (expr)
+#else
+#define likely(expr) __builtin_expect((bool)(expr),true )
+#define unlikely(expr) __builtin_expect((bool)(expr),false)
+#endif
+// -- GODOT start --
+#endif
+// -- GODOT end --
+
+////////////////////////////////////////////////////////////////////////////////
+/// Error handling and debugging
+////////////////////////////////////////////////////////////////////////////////
+
+/* debug printing macros */
+#define STRING(x) #x
+#define TOSTRING(x) STRING(x)
+#define PING embree_cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << embree_endl
+#define PRINT(x) embree_cout << STRING(x) << " = " << (x) << embree_endl
+#define PRINT2(x,y) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << embree_endl
+#define PRINT3(x,y,z) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << embree_endl
+#define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl
+
+#if defined(DEBUG) // only report file and line in debug mode
+ // -- GODOT start --
+ // #define THROW_RUNTIME_ERROR(str)
+ // throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+ #define THROW_RUNTIME_ERROR(str) \
+ printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort();
+ // -- GODOT end --
+#else
+ // -- GODOT start --
+ // #define THROW_RUNTIME_ERROR(str)
+ // throw std::runtime_error(str);
+ #define THROW_RUNTIME_ERROR(str) \
+ abort();
+ // -- GODOT end --
+#endif
+
+#define FATAL(x) THROW_RUNTIME_ERROR(x)
+#define WARNING(x) { std::cerr << "Warning: " << x << embree_endl << std::flush; }
+
+#define NOT_IMPLEMENTED FATAL(std::string(__FUNCTION__) + " not implemented")
+
+////////////////////////////////////////////////////////////////////////////////
+/// Basic types
+////////////////////////////////////////////////////////////////////////////////
+
+/* default floating-point type */
+namespace embree {
+ typedef float real;
+}
+
+/* windows does not have ssize_t */
+#if defined(__WIN32__)
+#if defined(__64BIT__)
+typedef int64_t ssize_t;
+#else
+typedef int32_t ssize_t;
+#endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Basic utility functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline std::string toString(long long value) {
+ return std::to_string(value);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Disable some compiler warnings
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__INTEL_COMPILER)
+//#pragma warning(disable:265 ) // floating-point operation result is out of range
+//#pragma warning(disable:383 ) // value copied to temporary, reference to temporary used
+//#pragma warning(disable:869 ) // parameter was never referenced
+//#pragma warning(disable:981 ) // operands are evaluated in unspecified order
+//#pragma warning(disable:1418) // external function definition with no prior declaration
+//#pragma warning(disable:1419) // external declaration in primary source file
+//#pragma warning(disable:1572) // floating-point equality and inequality comparisons are unreliable
+//#pragma warning(disable:94 ) // the size of an array must be greater than zero
+//#pragma warning(disable:1599) // declaration hides parameter
+//#pragma warning(disable:424 ) // extra ";" ignored
+#pragma warning(disable:2196) // routine is both "inline" and "noinline"
+//#pragma warning(disable:177 ) // label was declared but never referenced
+//#pragma warning(disable:114 ) // function was referenced but not defined
+//#pragma warning(disable:819 ) // template nesting depth does not match the previous declaration of function
+#pragma warning(disable:15335) // was not vectorized: vectorization possible but seems inefficient
+#endif
+
+#if defined(_MSC_VER)
+//#pragma warning(disable:4200) // nonstandard extension used : zero-sized array in struct/union
+#pragma warning(disable:4800) // forcing value to bool 'true' or 'false' (performance warning)
+//#pragma warning(disable:4267) // '=' : conversion from 'size_t' to 'unsigned long', possible loss of data
+#pragma warning(disable:4244) // 'argument' : conversion from 'ssize_t' to 'unsigned int', possible loss of data
+//#pragma warning(disable:4355) // 'this' : used in base member initializer list
+//#pragma warning(disable:391 ) // '<=' : signed / unsigned mismatch
+//#pragma warning(disable:4018) // '<' : signed / unsigned mismatch
+//#pragma warning(disable:4305) // 'initializing' : truncation from 'double' to 'float'
+//#pragma warning(disable:4068) // unknown pragma
+//#pragma warning(disable:4146) // unary minus operator applied to unsigned type, result still unsigned
+//#pragma warning(disable:4838) // conversion from 'unsigned int' to 'const int' requires a narrowing conversion)
+//#pragma warning(disable:4227) // anachronism used : qualifiers on reference are ignored
+#pragma warning(disable:4503) // decorated name length exceeded, name was truncated
+#pragma warning(disable:4180) // qualifier applied to function type has no meaning; ignored
+#pragma warning(disable:4258) // definition from the for loop is ignored; the definition from the enclosing scope is used
+
+# if _MSC_VER < 1910 // prior to Visual studio 2017 (V141)
+# pragma warning(disable:4101) // warning C4101: 'x': unreferenced local variable // a compiler bug issues wrong warnings
+# pragma warning(disable:4789) // buffer '' of size 8 bytes will be overrun; 32 bytes will be written starting at offset 0
+# endif
+
+#endif
+
+#if defined(__clang__) && !defined(__INTEL_COMPILER)
+//#pragma clang diagnostic ignored "-Wunknown-pragmas"
+//#pragma clang diagnostic ignored "-Wunused-variable"
+//#pragma clang diagnostic ignored "-Wreorder"
+//#pragma clang diagnostic ignored "-Wmicrosoft"
+//#pragma clang diagnostic ignored "-Wunused-private-field"
+//#pragma clang diagnostic ignored "-Wunused-local-typedef"
+//#pragma clang diagnostic ignored "-Wunused-function"
+//#pragma clang diagnostic ignored "-Wnarrowing"
+//#pragma clang diagnostic ignored "-Wc++11-narrowing"
+//#pragma clang diagnostic ignored "-Wdeprecated-register"
+//#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
+#pragma GCC diagnostic ignored "-Wpragmas"
+//#pragma GCC diagnostic ignored "-Wnarrowing"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+//#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+//#pragma GCC diagnostic ignored "-Warray-bounds"
+#pragma GCC diagnostic ignored "-Wattributes"
+#pragma GCC diagnostic ignored "-Wmisleading-indentation"
+#pragma GCC diagnostic ignored "-Wsign-compare"
+#pragma GCC diagnostic ignored "-Wparentheses"
+#endif
+
+#if defined(__clang__) && defined(__WIN32__)
+#pragma clang diagnostic ignored "-Wunused-parameter"
+#pragma clang diagnostic ignored "-Wmicrosoft-cast"
+#pragma clang diagnostic ignored "-Wmicrosoft-enum-value"
+#pragma clang diagnostic ignored "-Wmicrosoft-include"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wunknown-pragmas"
+#endif
+
+/* disabling deprecated warning, please use only where use of deprecated Embree API functions is desired */
+#if defined(__WIN32__) && defined(__INTEL_COMPILER)
+#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 1478)) // warning: function was declared deprecated
+#define ENABLE_DEPRECATED_WARNING __pragma(warning (enable: 1478)) // warning: function was declared deprecated
+#elif defined(__INTEL_COMPILER)
+#define DISABLE_DEPRECATED_WARNING _Pragma("warning (disable: 1478)") // warning: function was declared deprecated
+#define ENABLE_DEPRECATED_WARNING _Pragma("warning (enable : 1478)") // warning: function was declared deprecated
+#elif defined(__clang__)
+#define DISABLE_DEPRECATED_WARNING _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#define ENABLE_DEPRECATED_WARNING _Pragma("clang diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#elif defined(__GNUC__)
+#define DISABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#define ENABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#elif defined(_MSC_VER)
+#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 4996)) // warning: function was declared deprecated
+#define ENABLE_DEPRECATED_WARNING __pragma(warning (enable : 4996)) // warning: function was declared deprecated
+#endif
+
+/* embree output stream */
+#define embree_ostream std::ostream&
+#define embree_cout std::cout
+#define embree_cout_uniform std::cout
+#define embree_endl std::endl
+
+////////////////////////////////////////////////////////////////////////////////
+/// Some macros for static profiling
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined (__GNUC__)
+#define IACA_SSC_MARK( MARK_ID ) \
+__asm__ __volatile__ ( \
+ "\n\t movl $"#MARK_ID", %%ebx" \
+ "\n\t .byte 0x64, 0x67, 0x90" \
+ : : : "memory" );
+
+#define IACA_UD_BYTES __asm__ __volatile__ ("\n\t .byte 0x0F, 0x0B");
+
+#else
+#define IACA_UD_BYTES {__asm _emit 0x0F \
+ __asm _emit 0x0B}
+
+#define IACA_SSC_MARK(x) {__asm mov ebx, x\
+ __asm _emit 0x64 \
+ __asm _emit 0x67 \
+ __asm _emit 0x90 }
+
+#define IACA_VC64_START __writegsbyte(111, 111);
+#define IACA_VC64_END __writegsbyte(222, 222);
+
+#endif
+
+#define IACA_START {IACA_UD_BYTES \
+ IACA_SSC_MARK(111)}
+#define IACA_END {IACA_SSC_MARK(222) \
+ IACA_UD_BYTES}
+
+namespace embree
+{
+ template<typename Closure>
+ struct OnScopeExitHelper
+ {
+ OnScopeExitHelper (const Closure f) : active(true), f(f) {}
+ ~OnScopeExitHelper() { if (active) f(); }
+ void deactivate() { active = false; }
+ bool active;
+ const Closure f;
+ };
+
+ template <typename Closure>
+ OnScopeExitHelper<Closure> OnScopeExit(const Closure f) {
+ return OnScopeExitHelper<Closure>(f);
+ }
+
+#define STRING_JOIN2(arg1, arg2) DO_STRING_JOIN2(arg1, arg2)
+#define DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2
+#define ON_SCOPE_EXIT(code) \
+ auto STRING_JOIN2(on_scope_exit_, __LINE__) = OnScopeExit([&](){code;})
+
+ template<typename Ty>
+ std::unique_ptr<Ty> make_unique(Ty* ptr) {
+ return std::unique_ptr<Ty>(ptr);
+ }
+
+}
diff --git a/thirdparty/embree/common/sys/ref.h b/thirdparty/embree/common/sys/ref.h
new file mode 100644
index 0000000000..c2b56c1908
--- /dev/null
+++ b/thirdparty/embree/common/sys/ref.h
@@ -0,0 +1,122 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "atomic.h"
+
+namespace embree
+{
+ struct NullTy {
+ };
+
+ extern MAYBE_UNUSED NullTy null;
+
+ class RefCount
+ {
+ public:
+ RefCount(int val = 0) : refCounter(val) {}
+ virtual ~RefCount() {};
+
+ virtual RefCount* refInc() { refCounter.fetch_add(1); return this; }
+ virtual void refDec() { if (refCounter.fetch_add(-1) == 1) delete this; }
+ private:
+ std::atomic<size_t> refCounter;
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reference to single object
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename Type>
+ class Ref
+ {
+ public:
+ Type* ptr;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline Ref() : ptr(nullptr) {}
+ __forceinline Ref(NullTy) : ptr(nullptr) {}
+ __forceinline Ref(const Ref& input) : ptr(input.ptr) { if (ptr) ptr->refInc(); }
+ __forceinline Ref(Ref&& input) : ptr(input.ptr) { input.ptr = nullptr; }
+
+ __forceinline Ref(Type* const input) : ptr(input)
+ {
+ if (ptr)
+ ptr->refInc();
+ }
+
+ __forceinline ~Ref()
+ {
+ if (ptr)
+ ptr->refDec();
+ }
+
+ __forceinline Ref& operator =(const Ref& input)
+ {
+ if (input.ptr)
+ input.ptr->refInc();
+ if (ptr)
+ ptr->refDec();
+ ptr = input.ptr;
+ return *this;
+ }
+
+ __forceinline Ref& operator =(Ref&& input)
+ {
+ if (ptr)
+ ptr->refDec();
+ ptr = input.ptr;
+ input.ptr = nullptr;
+ return *this;
+ }
+
+ __forceinline Ref& operator =(Type* const input)
+ {
+ if (input)
+ input->refInc();
+ if (ptr)
+ ptr->refDec();
+ ptr = input;
+ return *this;
+ }
+
+ __forceinline Ref& operator =(NullTy)
+ {
+ if (ptr)
+ ptr->refDec();
+ ptr = nullptr;
+ return *this;
+ }
+
+ __forceinline operator bool() const { return ptr != nullptr; }
+
+ __forceinline const Type& operator *() const { return *ptr; }
+ __forceinline Type& operator *() { return *ptr; }
+ __forceinline const Type* operator ->() const { return ptr; }
+ __forceinline Type* operator ->() { return ptr; }
+
+ template<typename TypeOut>
+ __forceinline Ref<TypeOut> cast() { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); }
+ template<typename TypeOut>
+ __forceinline const Ref<TypeOut> cast() const { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); }
+
+ template<typename TypeOut>
+ __forceinline Ref<TypeOut> dynamicCast() { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); }
+ template<typename TypeOut>
+ __forceinline const Ref<TypeOut> dynamicCast() const { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); }
+ };
+
+ template<typename Type> __forceinline bool operator < (const Ref<Type>& a, const Ref<Type>& b) { return a.ptr < b.ptr; }
+
+ template<typename Type> __forceinline bool operator ==(const Ref<Type>& a, NullTy ) { return a.ptr == nullptr; }
+ template<typename Type> __forceinline bool operator ==(NullTy , const Ref<Type>& b) { return nullptr == b.ptr; }
+ template<typename Type> __forceinline bool operator ==(const Ref<Type>& a, const Ref<Type>& b) { return a.ptr == b.ptr; }
+
+ template<typename Type> __forceinline bool operator !=(const Ref<Type>& a, NullTy ) { return a.ptr != nullptr; }
+ template<typename Type> __forceinline bool operator !=(NullTy , const Ref<Type>& b) { return nullptr != b.ptr; }
+ template<typename Type> __forceinline bool operator !=(const Ref<Type>& a, const Ref<Type>& b) { return a.ptr != b.ptr; }
+}
diff --git a/thirdparty/embree/common/sys/regression.cpp b/thirdparty/embree/common/sys/regression.cpp
new file mode 100644
index 0000000000..45315b1105
--- /dev/null
+++ b/thirdparty/embree/common/sys/regression.cpp
@@ -0,0 +1,30 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "regression.h"
+
+namespace embree
+{
+ /* registerRegressionTest is invoked from static initializers, thus
+ * we cannot have the regression_tests variable as global static
+ * variable due to issues with static variable initialization
+ * order. */
+ std::vector<RegressionTest*>& get_regression_tests()
+ {
+ static std::vector<RegressionTest*> regression_tests;
+ return regression_tests;
+ }
+
+ void registerRegressionTest(RegressionTest* test)
+ {
+ get_regression_tests().push_back(test);
+ }
+
+ RegressionTest* getRegressionTest(size_t index)
+ {
+ if (index >= get_regression_tests().size())
+ return nullptr;
+
+ return get_regression_tests()[index];
+ }
+}
diff --git a/thirdparty/embree/common/sys/regression.h b/thirdparty/embree/common/sys/regression.h
new file mode 100644
index 0000000000..bb0bb94006
--- /dev/null
+++ b/thirdparty/embree/common/sys/regression.h
@@ -0,0 +1,25 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+#include <vector>
+
+namespace embree
+{
+ /*! virtual interface for all regression tests */
+ struct RegressionTest
+ {
+ RegressionTest (std::string name) : name(name) {}
+ virtual bool run() = 0;
+ std::string name;
+ };
+
+ /*! registers a regression test */
+ void registerRegressionTest(RegressionTest* test);
+
+ /*! run all regression tests */
+ RegressionTest* getRegressionTest(size_t index);
+}
diff --git a/thirdparty/embree/common/sys/string.cpp b/thirdparty/embree/common/sys/string.cpp
new file mode 100644
index 0000000000..f42fdc8536
--- /dev/null
+++ b/thirdparty/embree/common/sys/string.cpp
@@ -0,0 +1,42 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "string.h"
+
+#include <algorithm>
+#include <ctype.h>
+
+namespace embree
+{
+ char to_lower(char c) { return char(tolower(int(c))); }
+ char to_upper(char c) { return char(toupper(int(c))); }
+ std::string toLowerCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_lower); return dst; }
+ std::string toUpperCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_upper); return dst; }
+
+ Vec2f string_to_Vec2f ( std::string str )
+ {
+ size_t next = 0;
+ const float x = std::stof(str,&next); str = str.substr(next+1);
+ const float y = std::stof(str,&next);
+ return Vec2f(x,y);
+ }
+
+ Vec3f string_to_Vec3f ( std::string str )
+ {
+ size_t next = 0;
+ const float x = std::stof(str,&next); str = str.substr(next+1);
+ const float y = std::stof(str,&next); str = str.substr(next+1);
+ const float z = std::stof(str,&next);
+ return Vec3f(x,y,z);
+ }
+
+ Vec4f string_to_Vec4f ( std::string str )
+ {
+ size_t next = 0;
+ const float x = std::stof(str,&next); str = str.substr(next+1);
+ const float y = std::stof(str,&next); str = str.substr(next+1);
+ const float z = std::stof(str,&next); str = str.substr(next+1);
+ const float w = std::stof(str,&next);
+ return Vec4f(x,y,z,w);
+ }
+}
diff --git a/thirdparty/embree/common/sys/string.h b/thirdparty/embree/common/sys/string.h
new file mode 100644
index 0000000000..820076b21c
--- /dev/null
+++ b/thirdparty/embree/common/sys/string.h
@@ -0,0 +1,37 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include "../math/vec2.h"
+#include "../math/vec3.h"
+#include "../math/vec4.h"
+
+namespace embree
+{
+ class IOStreamStateRestorer
+ {
+ public:
+ IOStreamStateRestorer(std::ostream& iostream)
+ : iostream(iostream), flags(iostream.flags()), precision(iostream.precision()) {
+ }
+
+ ~IOStreamStateRestorer() {
+ iostream.flags(flags);
+ iostream.precision(precision);
+ }
+
+ private:
+ std::ostream& iostream;
+ std::ios::fmtflags flags;
+ std::streamsize precision;
+ };
+
+ std::string toLowerCase(const std::string& s);
+ std::string toUpperCase(const std::string& s);
+
+ Vec2f string_to_Vec2f ( std::string str );
+ Vec3f string_to_Vec3f ( std::string str );
+ Vec4f string_to_Vec4f ( std::string str );
+}
diff --git a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp
new file mode 100644
index 0000000000..f1a59e511e
--- /dev/null
+++ b/thirdparty/embree/common/sys/sysinfo.cpp
@@ -0,0 +1,656 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "sysinfo.h"
+#include "intrinsics.h"
+#include "string.h"
+#include "ref.h"
+#if defined(__FREEBSD__)
+#include <sys/cpuset.h>
+#include <pthread_np.h>
+typedef cpuset_t cpu_set_t;
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// All Platforms
+////////////////////////////////////////////////////////////////////////////////
+
+namespace embree
+{
+ NullTy null;
+
+ std::string getPlatformName()
+ {
+#if defined(__LINUX__) && !defined(__64BIT__)
+ return "Linux (32bit)";
+#elif defined(__LINUX__) && defined(__64BIT__)
+ return "Linux (64bit)";
+#elif defined(__FREEBSD__) && !defined(__64BIT__)
+ return "FreeBSD (32bit)";
+#elif defined(__FREEBSD__) && defined(__64BIT__)
+ return "FreeBSD (64bit)";
+#elif defined(__CYGWIN__) && !defined(__64BIT__)
+ return "Cygwin (32bit)";
+#elif defined(__CYGWIN__) && defined(__64BIT__)
+ return "Cygwin (64bit)";
+#elif defined(__WIN32__) && !defined(__64BIT__)
+ return "Windows (32bit)";
+#elif defined(__WIN32__) && defined(__64BIT__)
+ return "Windows (64bit)";
+#elif defined(__MACOSX__) && !defined(__64BIT__)
+ return "Mac OS X (32bit)";
+#elif defined(__MACOSX__) && defined(__64BIT__)
+ return "Mac OS X (64bit)";
+#elif defined(__UNIX__) && !defined(__64BIT__)
+ return "Unix (32bit)";
+#elif defined(__UNIX__) && defined(__64BIT__)
+ return "Unix (64bit)";
+#else
+ return "Unknown";
+#endif
+ }
+
+ std::string getCompilerName()
+ {
+#if defined(__INTEL_COMPILER)
+ int icc_mayor = __INTEL_COMPILER / 100 % 100;
+ int icc_minor = __INTEL_COMPILER % 100;
+ std::string version = "Intel Compiler ";
+ version += toString(icc_mayor);
+ version += "." + toString(icc_minor);
+#if defined(__INTEL_COMPILER_UPDATE)
+ version += "." + toString(__INTEL_COMPILER_UPDATE);
+#endif
+ return version;
+#elif defined(__clang__)
+ return "CLANG " __clang_version__;
+#elif defined (__GNUC__)
+ return "GCC " __VERSION__;
+#elif defined(_MSC_VER)
+ std::string version = toString(_MSC_FULL_VER);
+ version.insert(4,".");
+ version.insert(9,".");
+ version.insert(2,".");
+ return "Visual C++ Compiler " + version;
+#else
+ return "Unknown Compiler";
+#endif
+ }
+
+ std::string getCPUVendor()
+ {
+#if defined(__X86_ASM__)
+ int cpuinfo[4];
+ __cpuid (cpuinfo, 0);
+ int name[4];
+ name[0] = cpuinfo[1];
+ name[1] = cpuinfo[3];
+ name[2] = cpuinfo[2];
+ name[3] = 0;
+ return (char*)name;
+#elif defined(__ARM_NEON)
+ return "ARM";
+#else
+ return "Unknown";
+#endif
+ }
+
+ CPU getCPUModel()
+ {
+#if defined(__X86_ASM__)
+ if (getCPUVendor() != "GenuineIntel")
+ return CPU::UNKNOWN;
+
+ int out[4];
+ __cpuid(out, 0);
+ if (out[0] < 1) return CPU::UNKNOWN;
+ __cpuid(out, 1);
+
+ /* please see CPUID documentation for these formulas */
+ uint32_t family_ID = (out[0] >> 8) & 0x0F;
+ uint32_t extended_family_ID = (out[0] >> 20) & 0xFF;
+
+ uint32_t model_ID = (out[0] >> 4) & 0x0F;
+ uint32_t extended_model_ID = (out[0] >> 16) & 0x0F;
+
+ uint32_t DisplayFamily = family_ID;
+ if (family_ID == 0x0F)
+ DisplayFamily += extended_family_ID;
+
+ uint32_t DisplayModel = model_ID;
+ if (family_ID == 0x06 || family_ID == 0x0F)
+ DisplayModel += extended_model_ID << 4;
+
+ uint32_t DisplayFamily_DisplayModel = (DisplayFamily << 8) + (DisplayModel << 0);
+
+ // Data from Intel® 64 and IA-32 Architectures, Volume 4, Chapter 2, Table 2-1 (CPUID Signature Values of DisplayFamily_DisplayModel)
+ if (DisplayFamily_DisplayModel == 0x067D) return CPU::CORE_ICE_LAKE;
+ if (DisplayFamily_DisplayModel == 0x067E) return CPU::CORE_ICE_LAKE;
+ if (DisplayFamily_DisplayModel == 0x068C) return CPU::CORE_TIGER_LAKE;
+ if (DisplayFamily_DisplayModel == 0x06A5) return CPU::CORE_COMET_LAKE;
+ if (DisplayFamily_DisplayModel == 0x06A6) return CPU::CORE_COMET_LAKE;
+ if (DisplayFamily_DisplayModel == 0x0666) return CPU::CORE_CANNON_LAKE;
+ if (DisplayFamily_DisplayModel == 0x068E) return CPU::CORE_KABY_LAKE;
+ if (DisplayFamily_DisplayModel == 0x069E) return CPU::CORE_KABY_LAKE;
+ if (DisplayFamily_DisplayModel == 0x066A) return CPU::XEON_ICE_LAKE;
+ if (DisplayFamily_DisplayModel == 0x066C) return CPU::XEON_ICE_LAKE;
+ if (DisplayFamily_DisplayModel == 0x0655) return CPU::XEON_SKY_LAKE;
+ if (DisplayFamily_DisplayModel == 0x064E) return CPU::CORE_SKY_LAKE;
+ if (DisplayFamily_DisplayModel == 0x065E) return CPU::CORE_SKY_LAKE;
+ if (DisplayFamily_DisplayModel == 0x0656) return CPU::XEON_BROADWELL;
+ if (DisplayFamily_DisplayModel == 0x064F) return CPU::XEON_BROADWELL;
+ if (DisplayFamily_DisplayModel == 0x0647) return CPU::CORE_BROADWELL;
+ if (DisplayFamily_DisplayModel == 0x063D) return CPU::CORE_BROADWELL;
+ if (DisplayFamily_DisplayModel == 0x063F) return CPU::XEON_HASWELL;
+ if (DisplayFamily_DisplayModel == 0x063C) return CPU::CORE_HASWELL;
+ if (DisplayFamily_DisplayModel == 0x0645) return CPU::CORE_HASWELL;
+ if (DisplayFamily_DisplayModel == 0x0646) return CPU::CORE_HASWELL;
+ if (DisplayFamily_DisplayModel == 0x063E) return CPU::XEON_IVY_BRIDGE;
+ if (DisplayFamily_DisplayModel == 0x063A) return CPU::CORE_IVY_BRIDGE;
+ if (DisplayFamily_DisplayModel == 0x062D) return CPU::SANDY_BRIDGE;
+ if (DisplayFamily_DisplayModel == 0x062F) return CPU::SANDY_BRIDGE;
+ if (DisplayFamily_DisplayModel == 0x062A) return CPU::SANDY_BRIDGE;
+ if (DisplayFamily_DisplayModel == 0x062E) return CPU::NEHALEM;
+ if (DisplayFamily_DisplayModel == 0x0625) return CPU::NEHALEM;
+ if (DisplayFamily_DisplayModel == 0x062C) return CPU::NEHALEM;
+ if (DisplayFamily_DisplayModel == 0x061E) return CPU::NEHALEM;
+ if (DisplayFamily_DisplayModel == 0x061F) return CPU::NEHALEM;
+ if (DisplayFamily_DisplayModel == 0x061A) return CPU::NEHALEM;
+ if (DisplayFamily_DisplayModel == 0x061D) return CPU::NEHALEM;
+ if (DisplayFamily_DisplayModel == 0x0617) return CPU::CORE2;
+ if (DisplayFamily_DisplayModel == 0x060F) return CPU::CORE2;
+ if (DisplayFamily_DisplayModel == 0x060E) return CPU::CORE1;
+
+ if (DisplayFamily_DisplayModel == 0x0685) return CPU::XEON_PHI_KNIGHTS_MILL;
+ if (DisplayFamily_DisplayModel == 0x0657) return CPU::XEON_PHI_KNIGHTS_LANDING;
+
+#elif defined(__ARM_NEON)
+ return CPU::ARM;
+#endif
+
+ return CPU::UNKNOWN;
+ }
+
+ std::string stringOfCPUModel(CPU model)
+ {
+ switch (model) {
+ case CPU::XEON_ICE_LAKE : return "Xeon Ice Lake";
+ case CPU::CORE_ICE_LAKE : return "Core Ice Lake";
+ case CPU::CORE_TIGER_LAKE : return "Core Tiger Lake";
+ case CPU::CORE_COMET_LAKE : return "Core Comet Lake";
+ case CPU::CORE_CANNON_LAKE : return "Core Cannon Lake";
+ case CPU::CORE_KABY_LAKE : return "Core Kaby Lake";
+ case CPU::XEON_SKY_LAKE : return "Xeon Sky Lake";
+ case CPU::CORE_SKY_LAKE : return "Core Sky Lake";
+ case CPU::XEON_PHI_KNIGHTS_MILL : return "Xeon Phi Knights Mill";
+ case CPU::XEON_PHI_KNIGHTS_LANDING: return "Xeon Phi Knights Landing";
+ case CPU::XEON_BROADWELL : return "Xeon Broadwell";
+ case CPU::CORE_BROADWELL : return "Core Broadwell";
+ case CPU::XEON_HASWELL : return "Xeon Haswell";
+ case CPU::CORE_HASWELL : return "Core Haswell";
+ case CPU::XEON_IVY_BRIDGE : return "Xeon Ivy Bridge";
+ case CPU::CORE_IVY_BRIDGE : return "Core Ivy Bridge";
+ case CPU::SANDY_BRIDGE : return "Sandy Bridge";
+ case CPU::NEHALEM : return "Nehalem";
+ case CPU::CORE2 : return "Core2";
+ case CPU::CORE1 : return "Core";
+ case CPU::ARM : return "ARM";
+ case CPU::UNKNOWN : return "Unknown CPU";
+ }
+ return "Unknown CPU (error)";
+ }
+
+#if defined(__X86_ASM__)
+ /* constants to access destination registers of CPUID instruction */
+ static const int EAX = 0;
+ static const int EBX = 1;
+ static const int ECX = 2;
+ static const int EDX = 3;
+
+ /* cpuid[eax=1].ecx */
+ static const int CPU_FEATURE_BIT_SSE3 = 1 << 0;
+ static const int CPU_FEATURE_BIT_SSSE3 = 1 << 9;
+ static const int CPU_FEATURE_BIT_FMA3 = 1 << 12;
+ static const int CPU_FEATURE_BIT_SSE4_1 = 1 << 19;
+ static const int CPU_FEATURE_BIT_SSE4_2 = 1 << 20;
+ //static const int CPU_FEATURE_BIT_MOVBE = 1 << 22;
+ static const int CPU_FEATURE_BIT_POPCNT = 1 << 23;
+ //static const int CPU_FEATURE_BIT_XSAVE = 1 << 26;
+ static const int CPU_FEATURE_BIT_OXSAVE = 1 << 27;
+ static const int CPU_FEATURE_BIT_AVX = 1 << 28;
+ static const int CPU_FEATURE_BIT_F16C = 1 << 29;
+ static const int CPU_FEATURE_BIT_RDRAND = 1 << 30;
+
+ /* cpuid[eax=1].edx */
+ static const int CPU_FEATURE_BIT_SSE = 1 << 25;
+ static const int CPU_FEATURE_BIT_SSE2 = 1 << 26;
+
+ /* cpuid[eax=0x80000001].ecx */
+ static const int CPU_FEATURE_BIT_LZCNT = 1 << 5;
+
+ /* cpuid[eax=7,ecx=0].ebx */
+ static const int CPU_FEATURE_BIT_BMI1 = 1 << 3;
+ static const int CPU_FEATURE_BIT_AVX2 = 1 << 5;
+ static const int CPU_FEATURE_BIT_BMI2 = 1 << 8;
+ static const int CPU_FEATURE_BIT_AVX512F = 1 << 16; // AVX512F (foundation)
+ static const int CPU_FEATURE_BIT_AVX512DQ = 1 << 17; // AVX512DQ (doubleword and quadword instructions)
+ static const int CPU_FEATURE_BIT_AVX512PF = 1 << 26; // AVX512PF (prefetch gather/scatter instructions)
+ static const int CPU_FEATURE_BIT_AVX512ER = 1 << 27; // AVX512ER (exponential and reciprocal instructions)
+ static const int CPU_FEATURE_BIT_AVX512CD = 1 << 28; // AVX512CD (conflict detection instructions)
+ static const int CPU_FEATURE_BIT_AVX512BW = 1 << 30; // AVX512BW (byte and word instructions)
+ static const int CPU_FEATURE_BIT_AVX512VL = 1 << 31; // AVX512VL (vector length extensions)
+ static const int CPU_FEATURE_BIT_AVX512IFMA = 1 << 21; // AVX512IFMA (integer fused multiple-add instructions)
+
+ /* cpuid[eax=7,ecx=0].ecx */
+ static const int CPU_FEATURE_BIT_AVX512VBMI = 1 << 1; // AVX512VBMI (vector bit manipulation instructions)
+#endif
+
+#if defined(__X86_ASM__)
+ __noinline int64_t get_xcr0()
+ {
+// -- GODOT start --
+#if defined (__WIN32__) && !defined (__MINGW32__)
+// -- GODOT end --
+ int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32
+ xcr0 = _xgetbv(0);
+ return xcr0;
+#else
+ int xcr0 = 0;
+ __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" );
+ return xcr0;
+#endif
+ }
+#endif
+
+ int getCPUFeatures()
+ {
+#if defined(__X86_ASM__)
+ /* cache CPU features access */
+ static int cpu_features = 0;
+ if (cpu_features)
+ return cpu_features;
+
+ /* get number of CPUID leaves */
+ int cpuid_leaf0[4];
+ __cpuid(cpuid_leaf0, 0x00000000);
+ unsigned nIds = cpuid_leaf0[EAX];
+
+ /* get number of extended CPUID leaves */
+ int cpuid_leafe[4];
+ __cpuid(cpuid_leafe, 0x80000000);
+ unsigned nExIds = cpuid_leafe[EAX];
+
+ /* get CPUID leaves for EAX = 1,7, and 0x80000001 */
+ int cpuid_leaf_1[4] = { 0,0,0,0 };
+ int cpuid_leaf_7[4] = { 0,0,0,0 };
+ int cpuid_leaf_e1[4] = { 0,0,0,0 };
+ if (nIds >= 1) __cpuid (cpuid_leaf_1,0x00000001);
+#if _WIN32
+#if _MSC_VER && (_MSC_FULL_VER < 160040219)
+#else
+ if (nIds >= 7) __cpuidex(cpuid_leaf_7,0x00000007,0);
+#endif
+#else
+ if (nIds >= 7) __cpuid_count(cpuid_leaf_7,0x00000007,0);
+#endif
+ if (nExIds >= 0x80000001) __cpuid(cpuid_leaf_e1,0x80000001);
+
+ /* detect if OS saves XMM, YMM, and ZMM states */
+ bool xmm_enabled = true;
+ bool ymm_enabled = false;
+ bool zmm_enabled = false;
+ if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_OXSAVE) {
+ int64_t xcr0 = get_xcr0();
+ xmm_enabled = ((xcr0 & 0x02) == 0x02); /* checks if xmm are enabled in XCR0 */
+ ymm_enabled = xmm_enabled && ((xcr0 & 0x04) == 0x04); /* checks if ymm state are enabled in XCR0 */
+ zmm_enabled = ymm_enabled && ((xcr0 & 0xE0) == 0xE0); /* checks if OPMASK state, upper 256-bit of ZMM0-ZMM15 and ZMM16-ZMM31 state are enabled in XCR0 */
+ }
+ if (xmm_enabled) cpu_features |= CPU_FEATURE_XMM_ENABLED;
+ if (ymm_enabled) cpu_features |= CPU_FEATURE_YMM_ENABLED;
+ if (zmm_enabled) cpu_features |= CPU_FEATURE_ZMM_ENABLED;
+
+ if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE ) cpu_features |= CPU_FEATURE_SSE;
+ if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE2 ) cpu_features |= CPU_FEATURE_SSE2;
+ if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE3 ) cpu_features |= CPU_FEATURE_SSE3;
+ if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSSE3 ) cpu_features |= CPU_FEATURE_SSSE3;
+ if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_1) cpu_features |= CPU_FEATURE_SSE41;
+ if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_2) cpu_features |= CPU_FEATURE_SSE42;
+ if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_POPCNT) cpu_features |= CPU_FEATURE_POPCNT;
+
+ if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_AVX ) cpu_features |= CPU_FEATURE_AVX;
+ if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_F16C ) cpu_features |= CPU_FEATURE_F16C;
+ if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_RDRAND) cpu_features |= CPU_FEATURE_RDRAND;
+ if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX2 ) cpu_features |= CPU_FEATURE_AVX2;
+ if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_FMA3 ) cpu_features |= CPU_FEATURE_FMA3;
+ if (cpuid_leaf_e1[ECX] & CPU_FEATURE_BIT_LZCNT) cpu_features |= CPU_FEATURE_LZCNT;
+ if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI1 ) cpu_features |= CPU_FEATURE_BMI1;
+ if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI2 ) cpu_features |= CPU_FEATURE_BMI2;
+
+ if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512F ) cpu_features |= CPU_FEATURE_AVX512F;
+ if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512DQ ) cpu_features |= CPU_FEATURE_AVX512DQ;
+ if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512PF ) cpu_features |= CPU_FEATURE_AVX512PF;
+ if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512ER ) cpu_features |= CPU_FEATURE_AVX512ER;
+ if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512CD ) cpu_features |= CPU_FEATURE_AVX512CD;
+ if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512BW ) cpu_features |= CPU_FEATURE_AVX512BW;
+ if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512IFMA) cpu_features |= CPU_FEATURE_AVX512IFMA;
+ if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512VL ) cpu_features |= CPU_FEATURE_AVX512VL;
+ if (cpuid_leaf_7[ECX] & CPU_FEATURE_BIT_AVX512VBMI) cpu_features |= CPU_FEATURE_AVX512VBMI;
+
+ return cpu_features;
+#elif defined(__ARM_NEON)
+ /* emulated features with sse2neon */
+ return CPU_FEATURE_SSE|CPU_FEATURE_SSE2|CPU_FEATURE_XMM_ENABLED;
+#else
+ /* Unknown CPU. */
+ return 0;
+#endif
+ }
+
+ std::string stringOfCPUFeatures(int features)
+ {
+ std::string str;
+ if (features & CPU_FEATURE_XMM_ENABLED) str += "XMM ";
+ if (features & CPU_FEATURE_YMM_ENABLED) str += "YMM ";
+ if (features & CPU_FEATURE_ZMM_ENABLED) str += "ZMM ";
+ if (features & CPU_FEATURE_SSE ) str += "SSE ";
+ if (features & CPU_FEATURE_SSE2 ) str += "SSE2 ";
+ if (features & CPU_FEATURE_SSE3 ) str += "SSE3 ";
+ if (features & CPU_FEATURE_SSSE3 ) str += "SSSE3 ";
+ if (features & CPU_FEATURE_SSE41 ) str += "SSE4.1 ";
+ if (features & CPU_FEATURE_SSE42 ) str += "SSE4.2 ";
+ if (features & CPU_FEATURE_POPCNT) str += "POPCNT ";
+ if (features & CPU_FEATURE_AVX ) str += "AVX ";
+ if (features & CPU_FEATURE_F16C ) str += "F16C ";
+ if (features & CPU_FEATURE_RDRAND) str += "RDRAND ";
+ if (features & CPU_FEATURE_AVX2 ) str += "AVX2 ";
+ if (features & CPU_FEATURE_FMA3 ) str += "FMA3 ";
+ if (features & CPU_FEATURE_LZCNT ) str += "LZCNT ";
+ if (features & CPU_FEATURE_BMI1 ) str += "BMI1 ";
+ if (features & CPU_FEATURE_BMI2 ) str += "BMI2 ";
+ if (features & CPU_FEATURE_AVX512F) str += "AVX512F ";
+ if (features & CPU_FEATURE_AVX512DQ) str += "AVX512DQ ";
+ if (features & CPU_FEATURE_AVX512PF) str += "AVX512PF ";
+ if (features & CPU_FEATURE_AVX512ER) str += "AVX512ER ";
+ if (features & CPU_FEATURE_AVX512CD) str += "AVX512CD ";
+ if (features & CPU_FEATURE_AVX512BW) str += "AVX512BW ";
+ if (features & CPU_FEATURE_AVX512VL) str += "AVX512VL ";
+ if (features & CPU_FEATURE_AVX512IFMA) str += "AVX512IFMA ";
+ if (features & CPU_FEATURE_AVX512VBMI) str += "AVX512VBMI ";
+ return str;
+ }
+
+ std::string stringOfISA (int isa)
+ {
+ if (isa == SSE) return "SSE";
+ if (isa == SSE2) return "SSE2";
+ if (isa == SSE3) return "SSE3";
+ if (isa == SSSE3) return "SSSE3";
+ if (isa == SSE41) return "SSE4.1";
+ if (isa == SSE42) return "SSE4.2";
+ if (isa == AVX) return "AVX";
+ if (isa == AVX2) return "AVX2";
+ if (isa == AVX512) return "AVX512";
+ return "UNKNOWN";
+ }
+
+ bool hasISA(int features, int isa) {
+ return (features & isa) == isa;
+ }
+
+ std::string supportedTargetList (int features)
+ {
+ std::string v;
+ if (hasISA(features,SSE)) v += "SSE ";
+ if (hasISA(features,SSE2)) v += "SSE2 ";
+ if (hasISA(features,SSE3)) v += "SSE3 ";
+ if (hasISA(features,SSSE3)) v += "SSSE3 ";
+ if (hasISA(features,SSE41)) v += "SSE4.1 ";
+ if (hasISA(features,SSE42)) v += "SSE4.2 ";
+ if (hasISA(features,AVX)) v += "AVX ";
+ if (hasISA(features,AVXI)) v += "AVXI ";
+ if (hasISA(features,AVX2)) v += "AVX2 ";
+ if (hasISA(features,AVX512)) v += "AVX512 ";
+ return v;
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <psapi.h>
+
+namespace embree
+{
+ std::string getExecutableFileName() {
+ char filename[1024];
+ if (!GetModuleFileName(nullptr, filename, sizeof(filename)))
+ return std::string();
+ return std::string(filename);
+ }
+
+ unsigned int getNumberOfLogicalThreads()
+ {
+ static int nThreads = -1;
+ if (nThreads != -1) return nThreads;
+
+ typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)();
+ typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD);
+ HMODULE hlib = LoadLibrary("Kernel32");
+ GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount");
+ GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc) GetProcAddress(hlib, "GetActiveProcessorCount");
+
+ if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount)
+ {
+ int groups = pGetActiveProcessorGroupCount();
+ int totalProcessors = 0;
+ for (int i = 0; i < groups; i++)
+ totalProcessors += pGetActiveProcessorCount(i);
+ nThreads = totalProcessors;
+ }
+ else
+ {
+ SYSTEM_INFO sysinfo;
+ GetSystemInfo(&sysinfo);
+ nThreads = sysinfo.dwNumberOfProcessors;
+ }
+ assert(nThreads);
+ return nThreads;
+ }
+
+ int getTerminalWidth()
+ {
+ HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE);
+ if (handle == INVALID_HANDLE_VALUE) return 80;
+ CONSOLE_SCREEN_BUFFER_INFO info;
+ memset(&info,0,sizeof(info));
+ GetConsoleScreenBufferInfo(handle, &info);
+ return info.dwSize.X;
+ }
+
+ double getSeconds()
+ {
+ LARGE_INTEGER freq, val;
+ QueryPerformanceFrequency(&freq);
+ QueryPerformanceCounter(&val);
+ return (double)val.QuadPart / (double)freq.QuadPart;
+ }
+
+ void sleepSeconds(double t) {
+ Sleep(DWORD(1000.0*t));
+ }
+
+ size_t getVirtualMemoryBytes()
+ {
+ PROCESS_MEMORY_COUNTERS info;
+ GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) );
+ return (size_t)info.QuotaPeakPagedPoolUsage;
+ }
+
+ size_t getResidentMemoryBytes()
+ {
+ PROCESS_MEMORY_COUNTERS info;
+ GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) );
+ return (size_t)info.WorkingSetSize;
+ }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Linux Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__LINUX__)
+
+#include <stdio.h>
+#include <unistd.h>
+
+namespace embree
+{
+ std::string getExecutableFileName()
+ {
+ std::string pid = "/proc/" + toString(getpid()) + "/exe";
+ char buf[4096];
+ memset(buf,0,sizeof(buf));
+ if (readlink(pid.c_str(), buf, sizeof(buf)-1) == -1)
+ return std::string();
+ return std::string(buf);
+ }
+
+ size_t getVirtualMemoryBytes()
+ {
+ size_t virt, resident, shared;
+ std::ifstream buffer("/proc/self/statm");
+ buffer >> virt >> resident >> shared;
+ return virt*sysconf(_SC_PAGE_SIZE);
+ }
+
+ size_t getResidentMemoryBytes()
+ {
+ size_t virt, resident, shared;
+ std::ifstream buffer("/proc/self/statm");
+ buffer >> virt >> resident >> shared;
+ return resident*sysconf(_SC_PAGE_SIZE);
+ }
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// FreeBSD Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined (__FreeBSD__)
+
+#include <sys/sysctl.h>
+
+namespace embree
+{
+ std::string getExecutableFileName()
+ {
+ const int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1 };
+ char buf[4096];
+ memset(buf,0,sizeof(buf));
+ size_t len = sizeof(buf)-1;
+ if (sysctl(mib, 4, buf, &len, 0x0, 0) == -1)
+ return std::string();
+ return std::string(buf);
+ }
+
+ size_t getVirtualMemoryBytes() {
+ return 0;
+ }
+
+ size_t getResidentMemoryBytes() {
+ return 0;
+ }
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Mac OS X Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__MACOSX__)
+
+#include <mach-o/dyld.h>
+
+namespace embree
+{
+ std::string getExecutableFileName()
+ {
+ char buf[4096];
+ uint32_t size = sizeof(buf);
+ if (_NSGetExecutablePath(buf, &size) != 0)
+ return std::string();
+ return std::string(buf);
+ }
+
+ size_t getVirtualMemoryBytes() {
+ return 0;
+ }
+
+ size_t getResidentMemoryBytes() {
+ return 0;
+ }
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__)
+
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <pthread.h>
+
+namespace embree
+{
+ unsigned int getNumberOfLogicalThreads()
+ {
+ static int nThreads = -1;
+ if (nThreads != -1) return nThreads;
+
+// -- GODOT start --
+// #if defined(__MACOSX__)
+#if defined(__MACOSX__) || defined(__ANDROID__)
+// -- GODOT end --
+ nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container
+ assert(nThreads);
+#else
+ cpu_set_t set;
+ if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0)
+ nThreads = CPU_COUNT(&set);
+#endif
+
+ assert(nThreads);
+ return nThreads;
+ }
+
+ int getTerminalWidth()
+ {
+ struct winsize info;
+ if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &info) < 0) return 80;
+ return info.ws_col;
+ }
+
+ double getSeconds() {
+ struct timeval tp; gettimeofday(&tp,nullptr);
+ return double(tp.tv_sec) + double(tp.tv_usec)/1E6;
+ }
+
+ void sleepSeconds(double t) {
+ usleep(1000000.0*t);
+ }
+}
+#endif
+
diff --git a/thirdparty/embree/common/sys/sysinfo.h b/thirdparty/embree/common/sys/sysinfo.h
new file mode 100644
index 0000000000..72351d12e4
--- /dev/null
+++ b/thirdparty/embree/common/sys/sysinfo.h
@@ -0,0 +1,178 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define CACHELINE_SIZE 64
+
+#if !defined(PAGE_SIZE)
+ #define PAGE_SIZE 4096
+#endif
+
+#define PAGE_SIZE_2M (2*1024*1024)
+#define PAGE_SIZE_4K (4*1024)
+
+#include "platform.h"
+
+/* define isa namespace and ISA bitvector */
+#if defined (__AVX512VL__)
+# define isa avx512
+# define ISA AVX512
+# define ISA_STR "AVX512"
+#elif defined (__AVX2__)
+# define isa avx2
+# define ISA AVX2
+# define ISA_STR "AVX2"
+#elif defined(__AVXI__)
+# define isa avxi
+# define ISA AVXI
+# define ISA_STR "AVXI"
+#elif defined(__AVX__)
+# define isa avx
+# define ISA AVX
+# define ISA_STR "AVX"
+#elif defined (__SSE4_2__)
+# define isa sse42
+# define ISA SSE42
+# define ISA_STR "SSE4.2"
+//#elif defined (__SSE4_1__) // we demote this to SSE2, MacOSX code compiles with SSE41 by default with XCode 11
+//# define isa sse41
+//# define ISA SSE41
+//# define ISA_STR "SSE4.1"
+//#elif defined(__SSSE3__) // we demote this to SSE2, MacOSX code compiles with SSSE3 by default with ICC
+//# define isa ssse3
+//# define ISA SSSE3
+//# define ISA_STR "SSSE3"
+//#elif defined(__SSE3__) // we demote this to SSE2, MacOSX code compiles with SSE3 by default with clang
+//# define isa sse3
+//# define ISA SSE3
+//# define ISA_STR "SSE3"
+#elif defined(__SSE2__) || defined(__SSE3__) || defined(__SSSE3__)
+# define isa sse2
+# define ISA SSE2
+# define ISA_STR "SSE2"
+#elif defined(__SSE__)
+# define isa sse
+# define ISA SSE
+# define ISA_STR "SSE"
+#else
+#error Unknown ISA
+#endif
+
+namespace embree
+{
+ enum class CPU
+ {
+ XEON_ICE_LAKE,
+ CORE_ICE_LAKE,
+ CORE_TIGER_LAKE,
+ CORE_COMET_LAKE,
+ CORE_CANNON_LAKE,
+ CORE_KABY_LAKE,
+ XEON_SKY_LAKE,
+ CORE_SKY_LAKE,
+ XEON_PHI_KNIGHTS_MILL,
+ XEON_PHI_KNIGHTS_LANDING,
+ XEON_BROADWELL,
+ CORE_BROADWELL,
+ XEON_HASWELL,
+ CORE_HASWELL,
+ XEON_IVY_BRIDGE,
+ CORE_IVY_BRIDGE,
+ SANDY_BRIDGE,
+ NEHALEM,
+ CORE2,
+ CORE1,
+ ARM,
+ UNKNOWN,
+ };
+
+ /*! get the full path to the running executable */
+ std::string getExecutableFileName();
+
+ /*! return platform name */
+ std::string getPlatformName();
+
+ /*! get the full name of the compiler */
+ std::string getCompilerName();
+
+ /*! return the name of the CPU */
+ std::string getCPUVendor();
+
+ /*! get microprocessor model */
+ CPU getCPUModel();
+
+ /*! converts CPU model into string */
+ std::string stringOfCPUModel(CPU model);
+
+ /*! CPU features */
+ static const int CPU_FEATURE_SSE = 1 << 0;
+ static const int CPU_FEATURE_SSE2 = 1 << 1;
+ static const int CPU_FEATURE_SSE3 = 1 << 2;
+ static const int CPU_FEATURE_SSSE3 = 1 << 3;
+ static const int CPU_FEATURE_SSE41 = 1 << 4;
+ static const int CPU_FEATURE_SSE42 = 1 << 5;
+ static const int CPU_FEATURE_POPCNT = 1 << 6;
+ static const int CPU_FEATURE_AVX = 1 << 7;
+ static const int CPU_FEATURE_F16C = 1 << 8;
+ static const int CPU_FEATURE_RDRAND = 1 << 9;
+ static const int CPU_FEATURE_AVX2 = 1 << 10;
+ static const int CPU_FEATURE_FMA3 = 1 << 11;
+ static const int CPU_FEATURE_LZCNT = 1 << 12;
+ static const int CPU_FEATURE_BMI1 = 1 << 13;
+ static const int CPU_FEATURE_BMI2 = 1 << 14;
+ static const int CPU_FEATURE_AVX512F = 1 << 16;
+ static const int CPU_FEATURE_AVX512DQ = 1 << 17;
+ static const int CPU_FEATURE_AVX512PF = 1 << 18;
+ static const int CPU_FEATURE_AVX512ER = 1 << 19;
+ static const int CPU_FEATURE_AVX512CD = 1 << 20;
+ static const int CPU_FEATURE_AVX512BW = 1 << 21;
+ static const int CPU_FEATURE_AVX512VL = 1 << 22;
+ static const int CPU_FEATURE_AVX512IFMA = 1 << 23;
+ static const int CPU_FEATURE_AVX512VBMI = 1 << 24;
+ static const int CPU_FEATURE_XMM_ENABLED = 1 << 25;
+ static const int CPU_FEATURE_YMM_ENABLED = 1 << 26;
+ static const int CPU_FEATURE_ZMM_ENABLED = 1 << 27;
+
+ /*! get CPU features */
+ int getCPUFeatures();
+
+ /*! convert CPU features into a string */
+ std::string stringOfCPUFeatures(int features);
+
+ /*! creates a string of all supported targets that are supported */
+ std::string supportedTargetList (int isa);
+
+ /*! ISAs */
+ static const int SSE = CPU_FEATURE_SSE | CPU_FEATURE_XMM_ENABLED;
+ static const int SSE2 = SSE | CPU_FEATURE_SSE2;
+ static const int SSE3 = SSE2 | CPU_FEATURE_SSE3;
+ static const int SSSE3 = SSE3 | CPU_FEATURE_SSSE3;
+ static const int SSE41 = SSSE3 | CPU_FEATURE_SSE41;
+ static const int SSE42 = SSE41 | CPU_FEATURE_SSE42 | CPU_FEATURE_POPCNT;
+ static const int AVX = SSE42 | CPU_FEATURE_AVX | CPU_FEATURE_YMM_ENABLED;
+ static const int AVXI = AVX | CPU_FEATURE_F16C | CPU_FEATURE_RDRAND;
+ static const int AVX2 = AVXI | CPU_FEATURE_AVX2 | CPU_FEATURE_FMA3 | CPU_FEATURE_BMI1 | CPU_FEATURE_BMI2 | CPU_FEATURE_LZCNT;
+ static const int AVX512 = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512DQ | CPU_FEATURE_AVX512CD | CPU_FEATURE_AVX512BW | CPU_FEATURE_AVX512VL | CPU_FEATURE_ZMM_ENABLED;
+
+ /*! converts ISA bitvector into a string */
+ std::string stringOfISA(int features);
+
+ /*! return the number of logical threads of the system */
+ unsigned int getNumberOfLogicalThreads();
+
+ /*! returns the size of the terminal window in characters */
+ int getTerminalWidth();
+
+ /*! returns performance counter in seconds */
+ double getSeconds();
+
+ /*! sleeps the specified number of seconds */
+ void sleepSeconds(double t);
+
+ /*! returns virtual address space occupied by process */
+ size_t getVirtualMemoryBytes();
+
+ /*! returns resident memory required by process */
+ size_t getResidentMemoryBytes();
+}
diff --git a/thirdparty/embree/common/sys/thread.cpp b/thirdparty/embree/common/sys/thread.cpp
new file mode 100644
index 0000000000..f4014be89b
--- /dev/null
+++ b/thirdparty/embree/common/sys/thread.cpp
@@ -0,0 +1,474 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "thread.h"
+#include "sysinfo.h"
+#include "string.h"
+
+#include <iostream>
+#if defined(__ARM_NEON)
+#include "../simd/arm/emulation.h"
+#else
+#include <xmmintrin.h>
+#endif
+
+#if defined(PTHREADS_WIN32)
+#pragma comment (lib, "pthreadVC.lib")
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+ /*! set the affinity of a given thread */
+ void setAffinity(HANDLE thread, ssize_t affinity)
+ {
+ typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)();
+ typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD);
+ typedef BOOL (WINAPI *SetThreadGroupAffinityFunc)(HANDLE, const GROUP_AFFINITY *, PGROUP_AFFINITY);
+ typedef BOOL (WINAPI *SetThreadIdealProcessorExFunc)(HANDLE, PPROCESSOR_NUMBER, PPROCESSOR_NUMBER);
+ HMODULE hlib = LoadLibrary("Kernel32");
+ GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount");
+ GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc)GetProcAddress(hlib, "GetActiveProcessorCount");
+ SetThreadGroupAffinityFunc pSetThreadGroupAffinity = (SetThreadGroupAffinityFunc)GetProcAddress(hlib, "SetThreadGroupAffinity");
+ SetThreadIdealProcessorExFunc pSetThreadIdealProcessorEx = (SetThreadIdealProcessorExFunc)GetProcAddress(hlib, "SetThreadIdealProcessorEx");
+ if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount && pSetThreadGroupAffinity && pSetThreadIdealProcessorEx)
+ {
+ int groups = pGetActiveProcessorGroupCount();
+ int totalProcessors = 0, group = 0, number = 0;
+ for (int i = 0; i<groups; i++) {
+ int processors = pGetActiveProcessorCount(i);
+ if (totalProcessors + processors > affinity) {
+ group = i;
+ number = (int)affinity - totalProcessors;
+ break;
+ }
+ totalProcessors += processors;
+ }
+
+ GROUP_AFFINITY groupAffinity;
+ groupAffinity.Group = (WORD)group;
+ groupAffinity.Mask = (KAFFINITY)(uint64_t(1) << number);
+ groupAffinity.Reserved[0] = 0;
+ groupAffinity.Reserved[1] = 0;
+ groupAffinity.Reserved[2] = 0;
+ if (!pSetThreadGroupAffinity(thread, &groupAffinity, nullptr))
+ WARNING("SetThreadGroupAffinity failed"); // on purpose only a warning
+
+ PROCESSOR_NUMBER processorNumber;
+ processorNumber.Group = group;
+ processorNumber.Number = number;
+ processorNumber.Reserved = 0;
+ if (!pSetThreadIdealProcessorEx(thread, &processorNumber, nullptr))
+ WARNING("SetThreadIdealProcessorEx failed"); // on purpose only a warning
+ }
+ else
+ {
+ if (!SetThreadAffinityMask(thread, DWORD_PTR(uint64_t(1) << affinity)))
+ WARNING("SetThreadAffinityMask failed"); // on purpose only a warning
+ if (SetThreadIdealProcessor(thread, (DWORD)affinity) == (DWORD)-1)
+ WARNING("SetThreadIdealProcessor failed"); // on purpose only a warning
+ }
+ }
+
+ /*! set affinity of the calling thread */
+ void setAffinity(ssize_t affinity) {
+ setAffinity(GetCurrentThread(), affinity);
+ }
+
+ struct ThreadStartupData
+ {
+ public:
+ ThreadStartupData (thread_func f, void* arg)
+ : f(f), arg(arg) {}
+ public:
+ thread_func f;
+ void* arg;
+ };
+
+ DWORD WINAPI threadStartup(LPVOID ptr)
+ {
+ ThreadStartupData* parg = (ThreadStartupData*) ptr;
+ _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6));
+ parg->f(parg->arg);
+ delete parg;
+ return 0;
+ }
+
+#if !defined(PTHREADS_WIN32)
+
+ /*! creates a hardware thread running on specific core */
+ thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID)
+ {
+ HANDLE thread = CreateThread(nullptr, stack_size, threadStartup, new ThreadStartupData(f,arg), 0, nullptr);
+ if (thread == nullptr) FATAL("CreateThread failed");
+ if (threadID >= 0) setAffinity(thread, threadID);
+ return thread_t(thread);
+ }
+
+ /*! the thread calling this function gets yielded */
+ void yield() {
+ SwitchToThread();
+ }
+
+ /*! waits until the given thread has terminated */
+ void join(thread_t tid) {
+ WaitForSingleObject(HANDLE(tid), INFINITE);
+ CloseHandle(HANDLE(tid));
+ }
+
+ /*! destroy a hardware thread by its handle */
+ void destroyThread(thread_t tid) {
+ TerminateThread(HANDLE(tid),0);
+ CloseHandle(HANDLE(tid));
+ }
+
+ /*! creates thread local storage */
+ tls_t createTls() {
+ return tls_t(size_t(TlsAlloc()));
+ }
+
+ /*! set the thread local storage pointer */
+ void setTls(tls_t tls, void* const ptr) {
+ TlsSetValue(DWORD(size_t(tls)), ptr);
+ }
+
+ /*! return the thread local storage pointer */
+ void* getTls(tls_t tls) {
+ return TlsGetValue(DWORD(size_t(tls)));
+ }
+
+ /*! destroys thread local storage identifier */
+ void destroyTls(tls_t tls) {
+ TlsFree(DWORD(size_t(tls)));
+ }
+#endif
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Linux Platform
+////////////////////////////////////////////////////////////////////////////////
+
+// -- GODOT start --
+#if defined(__LINUX__) && !defined(__ANDROID__)
+// -- GODOT end --
+
+#include <fstream>
+#include <sstream>
+#include <algorithm>
+
+namespace embree
+{
+ static MutexSys mutex;
+ static std::vector<size_t> threadIDs;
+
+ /* changes thread ID mapping such that we first fill up all thread on one core */
+ size_t mapThreadID(size_t threadID)
+ {
+ Lock<MutexSys> lock(mutex);
+
+ if (threadIDs.size() == 0)
+ {
+ /* parse thread/CPU topology */
+ for (size_t cpuID=0;;cpuID++)
+ {
+ std::fstream fs;
+ std::string cpu = std::string("/sys/devices/system/cpu/cpu") + std::to_string((long long)cpuID) + std::string("/topology/thread_siblings_list");
+ fs.open (cpu.c_str(), std::fstream::in);
+ if (fs.fail()) break;
+
+ int i;
+ while (fs >> i)
+ {
+ if (std::none_of(threadIDs.begin(),threadIDs.end(),[&] (int id) { return id == i; }))
+ threadIDs.push_back(i);
+ if (fs.peek() == ',')
+ fs.ignore();
+ }
+ fs.close();
+ }
+
+#if 0
+ for (size_t i=0;i<threadIDs.size();i++)
+ std::cout << i << " -> " << threadIDs[i] << std::endl;
+#endif
+
+ /* verify the mapping and do not use it if the mapping has errors */
+ for (size_t i=0;i<threadIDs.size();i++) {
+ for (size_t j=0;j<threadIDs.size();j++) {
+ if (i != j && threadIDs[i] == threadIDs[j]) {
+ threadIDs.clear();
+ }
+ }
+ }
+ }
+
+ /* re-map threadIDs if mapping is available */
+ size_t ID = threadID;
+ if (threadID < threadIDs.size())
+ ID = threadIDs[threadID];
+
+ /* find correct thread to affinitize to */
+ cpu_set_t set;
+ if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0)
+ {
+ for (int i=0, j=0; i<CPU_SETSIZE; i++)
+ {
+ if (!CPU_ISSET(i,&set)) continue;
+
+ if (j == ID) {
+ ID = i;
+ break;
+ }
+ j++;
+ }
+ }
+
+ return ID;
+ }
+
+ /*! set affinity of the calling thread */
+ void setAffinity(ssize_t affinity)
+ {
+ cpu_set_t cset;
+ CPU_ZERO(&cset);
+ size_t threadID = mapThreadID(affinity);
+ CPU_SET(threadID, &cset);
+
+ pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset);
+ }
+}
+#endif
+
+// -- GODOT start --
+////////////////////////////////////////////////////////////////////////////////
+/// Android Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__ANDROID__)
+
+namespace embree
+{
+ /*! set affinity of the calling thread */
+ void setAffinity(ssize_t affinity)
+ {
+ cpu_set_t cset;
+ CPU_ZERO(&cset);
+ CPU_SET(affinity, &cset);
+
+ sched_setaffinity(0, sizeof(cset), &cset);
+ }
+}
+#endif
+// -- GODOT end --
+
+////////////////////////////////////////////////////////////////////////////////
+/// FreeBSD Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__FreeBSD__)
+
+#include <pthread_np.h>
+
+namespace embree
+{
+ /*! set affinity of the calling thread */
+ void setAffinity(ssize_t affinity)
+ {
+ cpuset_t cset;
+ CPU_ZERO(&cset);
+ CPU_SET(affinity, &cset);
+
+ pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset);
+ }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// MacOSX Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__MACOSX__)
+
+#include <mach/thread_act.h>
+#include <mach/thread_policy.h>
+#include <mach/mach_init.h>
+
+namespace embree
+{
+ /*! set affinity of the calling thread */
+ void setAffinity(ssize_t affinity)
+ {
+#if !defined(__ARM_NEON) // affinity seems not supported on M1 chip
+
+ thread_affinity_policy ap;
+ ap.affinity_tag = affinity;
+ if (thread_policy_set(mach_thread_self(),THREAD_AFFINITY_POLICY,(thread_policy_t)&ap,THREAD_AFFINITY_POLICY_COUNT) != KERN_SUCCESS)
+ WARNING("setting thread affinity failed"); // on purpose only a warning
+
+#endif
+ }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__) || defined(PTHREADS_WIN32)
+
+#include <pthread.h>
+#include <sched.h>
+
+#if defined(__USE_NUMA__)
+#include <numa.h>
+#endif
+
+namespace embree
+{
+ struct ThreadStartupData
+ {
+ public:
+ ThreadStartupData (thread_func f, void* arg, int affinity)
+ : f(f), arg(arg), affinity(affinity) {}
+ public:
+ thread_func f;
+ void* arg;
+ ssize_t affinity;
+ };
+
+ static void* threadStartup(ThreadStartupData* parg)
+ {
+ _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6));
+
+ /*! Mac OS X does not support setting affinity at thread creation time */
+#if defined(__MACOSX__)
+ if (parg->affinity >= 0)
+ setAffinity(parg->affinity);
+#endif
+
+ parg->f(parg->arg);
+ delete parg;
+ return nullptr;
+ }
+
+ /*! creates a hardware thread running on specific core */
+ thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID)
+ {
+ /* set stack size */
+ pthread_attr_t attr;
+ pthread_attr_init(&attr);
+ if (stack_size > 0) pthread_attr_setstacksize (&attr, stack_size);
+
+ /* create thread */
+ pthread_t* tid = new pthread_t;
+ if (pthread_create(tid,&attr,(void*(*)(void*))threadStartup,new ThreadStartupData(f,arg,threadID)) != 0) {
+ pthread_attr_destroy(&attr);
+ delete tid;
+ FATAL("pthread_create failed");
+ }
+ pthread_attr_destroy(&attr);
+
+ /* set affinity */
+// -- GODOT start --
+#if defined(__LINUX__) && !defined(__ANDROID__)
+// -- GODOT end --
+ if (threadID >= 0) {
+ cpu_set_t cset;
+ CPU_ZERO(&cset);
+ threadID = mapThreadID(threadID);
+ CPU_SET(threadID, &cset);
+ pthread_setaffinity_np(*tid, sizeof(cset), &cset);
+ }
+#elif defined(__FreeBSD__)
+ if (threadID >= 0) {
+ cpuset_t cset;
+ CPU_ZERO(&cset);
+ CPU_SET(threadID, &cset);
+ pthread_setaffinity_np(*tid, sizeof(cset), &cset);
+ }
+// -- GODOT start --
+#elif defined(__ANDROID__)
+ if (threadID >= 0) {
+ cpu_set_t cset;
+ CPU_ZERO(&cset);
+ CPU_SET(threadID, &cset);
+ sched_setaffinity(pthread_gettid_np(*tid), sizeof(cset), &cset);
+ }
+#endif
+// -- GODOT end --
+
+ return thread_t(tid);
+ }
+
+ /*! the thread calling this function gets yielded */
+ void yield() {
+ sched_yield();
+ }
+
+ /*! waits until the given thread has terminated */
+ void join(thread_t tid) {
+ if (pthread_join(*(pthread_t*)tid, nullptr) != 0)
+ FATAL("pthread_join failed");
+ delete (pthread_t*)tid;
+ }
+
+ /*! destroy a hardware thread by its handle */
+ void destroyThread(thread_t tid) {
+// -- GODOT start --
+#if defined(__ANDROID__)
+ FATAL("Can't destroy threads on Android.");
+#else
+ pthread_cancel(*(pthread_t*)tid);
+ delete (pthread_t*)tid;
+#endif
+// -- GODOT end --
+ }
+
+ /*! creates thread local storage */
+ tls_t createTls()
+ {
+ pthread_key_t* key = new pthread_key_t;
+ if (pthread_key_create(key,nullptr) != 0) {
+ delete key;
+ FATAL("pthread_key_create failed");
+ }
+
+ return tls_t(key);
+ }
+
+ /*! return the thread local storage pointer */
+ void* getTls(tls_t tls)
+ {
+ assert(tls);
+ return pthread_getspecific(*(pthread_key_t*)tls);
+ }
+
+ /*! set the thread local storage pointer */
+ void setTls(tls_t tls, void* const ptr)
+ {
+ assert(tls);
+ if (pthread_setspecific(*(pthread_key_t*)tls, ptr) != 0)
+ FATAL("pthread_setspecific failed");
+ }
+
+ /*! destroys thread local storage identifier */
+ void destroyTls(tls_t tls)
+ {
+ assert(tls);
+ if (pthread_key_delete(*(pthread_key_t*)tls) != 0)
+ FATAL("pthread_key_delete failed");
+ delete (pthread_key_t*)tls;
+ }
+}
+
+#endif
diff --git a/thirdparty/embree/common/sys/thread.h b/thirdparty/embree/common/sys/thread.h
new file mode 100644
index 0000000000..92a10d5c5d
--- /dev/null
+++ b/thirdparty/embree/common/sys/thread.h
@@ -0,0 +1,49 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include "mutex.h"
+#include "alloc.h"
+#include "vector.h"
+#include <vector>
+
+namespace embree
+{
+ /*! type for thread */
+ typedef struct opaque_thread_t* thread_t;
+
+ /*! signature of thread start function */
+ typedef void (*thread_func)(void*);
+
+ /*! creates a hardware thread running on specific logical thread */
+ thread_t createThread(thread_func f, void* arg, size_t stack_size = 0, ssize_t threadID = -1);
+
+ /*! set affinity of the calling thread */
+ void setAffinity(ssize_t affinity);
+
+ /*! the thread calling this function gets yielded */
+ void yield();
+
+ /*! waits until the given thread has terminated */
+ void join(thread_t tid);
+
+ /*! destroy handle of a thread */
+ void destroyThread(thread_t tid);
+
+ /*! type for handle to thread local storage */
+ typedef struct opaque_tls_t* tls_t;
+
+ /*! creates thread local storage */
+ tls_t createTls();
+
+ /*! set the thread local storage pointer */
+ void setTls(tls_t tls, void* const ptr);
+
+ /*! return the thread local storage pointer */
+ void* getTls(tls_t tls);
+
+ /*! destroys thread local storage identifier */
+ void destroyTls(tls_t tls);
+}
diff --git a/thirdparty/embree/common/sys/vector.h b/thirdparty/embree/common/sys/vector.h
new file mode 100644
index 0000000000..f832626789
--- /dev/null
+++ b/thirdparty/embree/common/sys/vector.h
@@ -0,0 +1,242 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "alloc.h"
+#include <algorithm>
+
+namespace embree
+{
+ template<typename T, typename allocator>
+ class vector_t
+ {
+ public:
+ typedef T value_type;
+ typedef T* iterator;
+ typedef const T* const_iterator;
+
+ __forceinline vector_t ()
+ : size_active(0), size_alloced(0), items(nullptr) {}
+
+ __forceinline explicit vector_t (size_t sz)
+ : size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); }
+
+ template<typename M>
+ __forceinline explicit vector_t (M alloc, size_t sz)
+ : alloc(alloc), size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); }
+
+ __forceinline ~vector_t() {
+ clear();
+ }
+
+ __forceinline vector_t (const vector_t& other)
+ {
+ size_active = other.size_active;
+ size_alloced = other.size_alloced;
+ items = alloc.allocate(size_alloced);
+ for (size_t i=0; i<size_active; i++)
+ ::new (&items[i]) value_type(other.items[i]);
+ }
+
+ __forceinline vector_t (vector_t&& other)
+ : alloc(std::move(other.alloc))
+ {
+ size_active = other.size_active; other.size_active = 0;
+ size_alloced = other.size_alloced; other.size_alloced = 0;
+ items = other.items; other.items = nullptr;
+ }
+
+ __forceinline vector_t& operator=(const vector_t& other)
+ {
+ resize(other.size_active);
+ for (size_t i=0; i<size_active; i++)
+ items[i] = value_type(other.items[i]);
+ return *this;
+ }
+
+ __forceinline vector_t& operator=(vector_t&& other)
+ {
+ clear();
+ alloc = std::move(other.alloc);
+ size_active = other.size_active; other.size_active = 0;
+ size_alloced = other.size_alloced; other.size_alloced = 0;
+ items = other.items; other.items = nullptr;
+ return *this;
+ }
+
+ /********************** Iterators ****************************/
+
+ __forceinline iterator begin() { return items; };
+ __forceinline const_iterator begin() const { return items; };
+
+ __forceinline iterator end () { return items+size_active; };
+ __forceinline const_iterator end () const { return items+size_active; };
+
+
+ /********************** Capacity ****************************/
+
+ __forceinline bool empty () const { return size_active == 0; }
+ __forceinline size_t size () const { return size_active; }
+ __forceinline size_t capacity () const { return size_alloced; }
+
+
+ __forceinline void resize(size_t new_size) {
+ internal_resize(new_size,internal_grow_size(new_size));
+ }
+
+ __forceinline void reserve(size_t new_alloced)
+ {
+ /* do nothing if container already large enough */
+ if (new_alloced <= size_alloced)
+ return;
+
+ /* resize exact otherwise */
+ internal_resize(size_active,new_alloced);
+ }
+
+ __forceinline void shrink_to_fit() {
+ internal_resize(size_active,size_active);
+ }
+
+ /******************** Element access **************************/
+
+ __forceinline T& operator[](size_t i) { assert(i < size_active); return items[i]; }
+ __forceinline const T& operator[](size_t i) const { assert(i < size_active); return items[i]; }
+
+ __forceinline T& at(size_t i) { assert(i < size_active); return items[i]; }
+ __forceinline const T& at(size_t i) const { assert(i < size_active); return items[i]; }
+
+ __forceinline T& front() const { assert(size_active > 0); return items[0]; };
+ __forceinline T& back () const { assert(size_active > 0); return items[size_active-1]; };
+
+ __forceinline T* data() { return items; };
+ __forceinline const T* data() const { return items; };
+
+
+ /******************** Modifiers **************************/
+
+ __forceinline void push_back(const T& nt)
+ {
+ const T v = nt; // need local copy as input reference could point to this vector
+ internal_resize(size_active,internal_grow_size(size_active+1));
+ ::new (&items[size_active++]) T(v);
+ }
+
+ __forceinline void pop_back()
+ {
+ assert(!empty());
+ size_active--;
+ alloc.destroy(&items[size_active]);
+ }
+
+ __forceinline void clear()
+ {
+ /* destroy elements */
+ for (size_t i=0; i<size_active; i++)
+ alloc.destroy(&items[i]);
+
+ /* free memory */
+ alloc.deallocate(items,size_alloced);
+ items = nullptr;
+ size_active = size_alloced = 0;
+ }
+
+ /******************** Comparisons **************************/
+
+ friend bool operator== (const vector_t& a, const vector_t& b)
+ {
+ if (a.size() != b.size()) return false;
+ for (size_t i=0; i<a.size(); i++)
+ if (a[i] != b[i])
+ return false;
+ return true;
+ }
+
+ friend bool operator!= (const vector_t& a, const vector_t& b) {
+ return !(a==b);
+ }
+
+ private:
+
+ __forceinline void internal_resize_init(size_t new_active)
+ {
+ assert(size_active == 0);
+ assert(size_alloced == 0);
+ assert(items == nullptr);
+ if (new_active == 0) return;
+ items = alloc.allocate(new_active);
+ for (size_t i=0; i<new_active; i++) ::new (&items[i]) T();
+ size_active = new_active;
+ size_alloced = new_active;
+ }
+
+ __forceinline void internal_resize(size_t new_active, size_t new_alloced)
+ {
+ assert(new_active <= new_alloced);
+
+ /* destroy elements */
+ if (new_active < size_active)
+ {
+ for (size_t i=new_active; i<size_active; i++)
+ alloc.destroy(&items[i]);
+ size_active = new_active;
+ }
+
+ /* only reallocate if necessary */
+ if (new_alloced == size_alloced) {
+ for (size_t i=size_active; i<new_active; i++) ::new (&items[i]) T;
+ size_active = new_active;
+ return;
+ }
+
+ /* reallocate and copy items */
+ T* old_items = items;
+ items = alloc.allocate(new_alloced);
+ for (size_t i=0; i<size_active; i++) {
+ ::new (&items[i]) T(std::move(old_items[i]));
+ alloc.destroy(&old_items[i]);
+ }
+
+ for (size_t i=size_active; i<new_active; i++) {
+ ::new (&items[i]) T;
+ }
+
+ alloc.deallocate(old_items,size_alloced);
+ size_active = new_active;
+ size_alloced = new_alloced;
+ }
+
+ __forceinline size_t internal_grow_size(size_t new_alloced)
+ {
+ /* do nothing if container already large enough */
+ if (new_alloced <= size_alloced)
+ return size_alloced;
+
+ /* resize to next power of 2 otherwise */
+ size_t new_size_alloced = size_alloced;
+ while (new_size_alloced < new_alloced) {
+ new_size_alloced = std::max(size_t(1),2*new_size_alloced);
+ }
+ return new_size_alloced;
+ }
+
+ private:
+ allocator alloc;
+ size_t size_active; // number of valid items
+ size_t size_alloced; // number of items allocated
+ T* items; // data array
+ };
+
+ /*! vector class that performs standard allocations */
+ template<typename T>
+ using vector = vector_t<T,std::allocator<T>>;
+
+ /*! vector class that performs aligned allocations */
+ template<typename T>
+ using avector = vector_t<T,aligned_allocator<T,std::alignment_of<T>::value> >;
+
+ /*! vector class that performs OS allocations */
+ template<typename T>
+ using ovector = vector_t<T,os_allocator<T> >;
+}
diff --git a/thirdparty/embree/common/tasking/taskscheduler.h b/thirdparty/embree/common/tasking/taskscheduler.h
new file mode 100644
index 0000000000..8f3dd87689
--- /dev/null
+++ b/thirdparty/embree/common/tasking/taskscheduler.h
@@ -0,0 +1,15 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#if defined(TASKING_INTERNAL)
+# include "taskschedulerinternal.h"
+#elif defined(TASKING_TBB)
+# include "taskschedulertbb.h"
+#elif defined(TASKING_PPL)
+# include "taskschedulerppl.h"
+#else
+# error "no tasking system enabled"
+#endif
+
diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp
new file mode 100644
index 0000000000..ad438588a3
--- /dev/null
+++ b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp
@@ -0,0 +1,420 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "taskschedulerinternal.h"
+#include "../math/math.h"
+#include "../sys/sysinfo.h"
+#include <algorithm>
+
+namespace embree
+{
+ RTC_NAMESPACE_BEGIN
+
+ static MutexSys g_mutex;
+ size_t TaskScheduler::g_numThreads = 0;
+ __thread TaskScheduler* TaskScheduler::g_instance = nullptr;
+ std::vector<Ref<TaskScheduler>> g_instance_vector;
+ __thread TaskScheduler::Thread* TaskScheduler::thread_local_thread = nullptr;
+ TaskScheduler::ThreadPool* TaskScheduler::threadPool = nullptr;
+
+ template<typename Predicate, typename Body>
+ __forceinline void TaskScheduler::steal_loop(Thread& thread, const Predicate& pred, const Body& body)
+ {
+ while (true)
+ {
+ /*! some rounds that yield */
+ for (size_t i=0; i<32; i++)
+ {
+ /*! some spinning rounds */
+ const size_t threadCount = thread.threadCount();
+ for (size_t j=0; j<1024; j+=threadCount)
+ {
+ if (!pred()) return;
+ if (thread.scheduler->steal_from_other_threads(thread)) {
+ i=j=0;
+ body();
+ }
+ }
+ yield();
+ }
+ }
+ }
+
+ /*! run this task */
+ void TaskScheduler::Task::run_internal (Thread& thread) // FIXME: avoid as many dll_exports as possible
+ {
+ /* try to run if not already stolen */
+ if (try_switch_state(INITIALIZED,DONE))
+ {
+ Task* prevTask = thread.task;
+ thread.task = this;
+ // -- GODOT start --
+ // try {
+ // if (thread.scheduler->cancellingException == nullptr)
+ closure->execute();
+ // } catch (...) {
+ // if (thread.scheduler->cancellingException == nullptr)
+ // thread.scheduler->cancellingException = std::current_exception();
+ // }
+ // -- GODOT end --
+ thread.task = prevTask;
+ add_dependencies(-1);
+ }
+
+ /* steal until all dependencies have completed */
+ steal_loop(thread,
+ [&] () { return dependencies>0; },
+ [&] () { while (thread.tasks.execute_local_internal(thread,this)); });
+
+ /* now signal our parent task that we are finished */
+ if (parent)
+ parent->add_dependencies(-1);
+ }
+
+ /*! run this task */
+ dll_export void TaskScheduler::Task::run (Thread& thread) {
+ run_internal(thread);
+ }
+
+ bool TaskScheduler::TaskQueue::execute_local_internal(Thread& thread, Task* parent)
+ {
+ /* stop if we run out of local tasks or reach the waiting task */
+ if (right == 0 || &tasks[right-1] == parent)
+ return false;
+
+ /* execute task */
+ size_t oldRight = right;
+ tasks[right-1].run_internal(thread);
+ if (right != oldRight) {
+ THROW_RUNTIME_ERROR("you have to wait for spawned subtasks");
+ }
+
+ /* pop task and closure from stack */
+ right--;
+ if (tasks[right].stackPtr != size_t(-1))
+ stackPtr = tasks[right].stackPtr;
+
+ /* also move left pointer */
+ if (left >= right) left.store(right.load());
+
+ return right != 0;
+ }
+
+ dll_export bool TaskScheduler::TaskQueue::execute_local(Thread& thread, Task* parent) {
+ return execute_local_internal(thread,parent);
+ }
+
+ bool TaskScheduler::TaskQueue::steal(Thread& thread)
+ {
+ size_t l = left;
+ size_t r = right;
+ if (l < r)
+ {
+ l = left++;
+ if (l >= r)
+ return false;
+ }
+ else
+ return false;
+
+ if (!tasks[l].try_steal(thread.tasks.tasks[thread.tasks.right]))
+ return false;
+
+ thread.tasks.right++;
+ return true;
+ }
+
+ /* we steal from the left */
+ size_t TaskScheduler::TaskQueue::getTaskSizeAtLeft()
+ {
+ if (left >= right) return 0;
+ return tasks[left].N;
+ }
+
+ void threadPoolFunction(std::pair<TaskScheduler::ThreadPool*,size_t>* pair)
+ {
+ TaskScheduler::ThreadPool* pool = pair->first;
+ size_t threadIndex = pair->second;
+ delete pair;
+ pool->thread_loop(threadIndex);
+ }
+
+ TaskScheduler::ThreadPool::ThreadPool(bool set_affinity)
+ : numThreads(0), numThreadsRunning(0), set_affinity(set_affinity), running(false) {}
+
+ dll_export void TaskScheduler::ThreadPool::startThreads()
+ {
+ if (running) return;
+ setNumThreads(numThreads,true);
+ }
+
+ void TaskScheduler::ThreadPool::setNumThreads(size_t newNumThreads, bool startThreads)
+ {
+ Lock<MutexSys> lock(g_mutex);
+ assert(newNumThreads);
+ newNumThreads = min(newNumThreads, (size_t) getNumberOfLogicalThreads());
+
+ numThreads = newNumThreads;
+ if (!startThreads && !running) return;
+ running = true;
+ size_t numThreadsActive = numThreadsRunning;
+
+ mutex.lock();
+ numThreadsRunning = newNumThreads;
+ mutex.unlock();
+ condition.notify_all();
+
+ /* start new threads */
+ for (size_t t=numThreadsActive; t<numThreads; t++)
+ {
+ if (t == 0) continue;
+ auto pair = new std::pair<TaskScheduler::ThreadPool*,size_t>(this,t);
+ threads.push_back(createThread((thread_func)threadPoolFunction,pair,4*1024*1024,set_affinity ? t : -1));
+ }
+
+ /* stop some threads if we reduce the number of threads */
+ for (ssize_t t=numThreadsActive-1; t>=ssize_t(numThreadsRunning); t--) {
+ if (t == 0) continue;
+ embree::join(threads.back());
+ threads.pop_back();
+ }
+ }
+
+ TaskScheduler::ThreadPool::~ThreadPool()
+ {
+ /* leave all taskschedulers */
+ mutex.lock();
+ numThreadsRunning = 0;
+ mutex.unlock();
+ condition.notify_all();
+
+ /* wait for threads to terminate */
+ for (size_t i=0; i<threads.size(); i++)
+ embree::join(threads[i]);
+ }
+
+ dll_export void TaskScheduler::ThreadPool::add(const Ref<TaskScheduler>& scheduler)
+ {
+ mutex.lock();
+ schedulers.push_back(scheduler);
+ mutex.unlock();
+ condition.notify_all();
+ }
+
+ dll_export void TaskScheduler::ThreadPool::remove(const Ref<TaskScheduler>& scheduler)
+ {
+ Lock<MutexSys> lock(mutex);
+ for (std::list<Ref<TaskScheduler> >::iterator it = schedulers.begin(); it != schedulers.end(); it++) {
+ if (scheduler == *it) {
+ schedulers.erase(it);
+ return;
+ }
+ }
+ }
+
+ void TaskScheduler::ThreadPool::thread_loop(size_t globalThreadIndex)
+ {
+ while (globalThreadIndex < numThreadsRunning)
+ {
+ Ref<TaskScheduler> scheduler = NULL;
+ ssize_t threadIndex = -1;
+ {
+ Lock<MutexSys> lock(mutex);
+ condition.wait(mutex, [&] () { return globalThreadIndex >= numThreadsRunning || !schedulers.empty(); });
+ if (globalThreadIndex >= numThreadsRunning) break;
+ scheduler = schedulers.front();
+ threadIndex = scheduler->allocThreadIndex();
+ }
+ scheduler->thread_loop(threadIndex);
+ }
+ }
+
+ TaskScheduler::TaskScheduler()
+ : threadCounter(0), anyTasksRunning(0), hasRootTask(false)
+ {
+ threadLocal.resize(2*getNumberOfLogicalThreads()); // FIXME: this has to be 2x as in the compatibility join mode with rtcCommitScene the worker threads also join. When disallowing rtcCommitScene to join a build we can remove the 2x.
+ for (size_t i=0; i<threadLocal.size(); i++)
+ threadLocal[i].store(nullptr);
+ }
+
+ TaskScheduler::~TaskScheduler()
+ {
+ assert(threadCounter == 0);
+ }
+
+ dll_export size_t TaskScheduler::threadID()
+ {
+ Thread* thread = TaskScheduler::thread();
+ if (thread) return thread->threadIndex;
+ else return 0;
+ }
+
+ dll_export size_t TaskScheduler::threadIndex()
+ {
+ Thread* thread = TaskScheduler::thread();
+ if (thread) return thread->threadIndex;
+ else return 0;
+ }
+
+ dll_export size_t TaskScheduler::threadCount() {
+ return threadPool->size();
+ }
+
+ dll_export TaskScheduler* TaskScheduler::instance()
+ {
+ if (g_instance == NULL) {
+ Lock<MutexSys> lock(g_mutex);
+ g_instance = new TaskScheduler;
+ g_instance_vector.push_back(g_instance);
+ }
+ return g_instance;
+ }
+
+ void TaskScheduler::create(size_t numThreads, bool set_affinity, bool start_threads)
+ {
+ if (!threadPool) threadPool = new TaskScheduler::ThreadPool(set_affinity);
+ threadPool->setNumThreads(numThreads,start_threads);
+ }
+
+ void TaskScheduler::destroy() {
+ delete threadPool; threadPool = nullptr;
+ }
+
+ dll_export ssize_t TaskScheduler::allocThreadIndex()
+ {
+ size_t threadIndex = threadCounter++;
+ assert(threadIndex < threadLocal.size());
+ return threadIndex;
+ }
+
+ void TaskScheduler::join()
+ {
+ mutex.lock();
+ size_t threadIndex = allocThreadIndex();
+ condition.wait(mutex, [&] () { return hasRootTask.load(); });
+ mutex.unlock();
+ // -- GODOT start --
+ // std::exception_ptr except = thread_loop(threadIndex);
+ // if (except != nullptr) std::rethrow_exception(except);
+ thread_loop(threadIndex);
+ // -- GODOT end --
+ }
+
+ void TaskScheduler::reset() {
+ hasRootTask = false;
+ }
+
+ void TaskScheduler::wait_for_threads(size_t threadCount)
+ {
+ while (threadCounter < threadCount-1)
+ pause_cpu();
+ }
+
+ dll_export TaskScheduler::Thread* TaskScheduler::thread() {
+ return thread_local_thread;
+ }
+
+ dll_export TaskScheduler::Thread* TaskScheduler::swapThread(Thread* thread)
+ {
+ Thread* old = thread_local_thread;
+ thread_local_thread = thread;
+ return old;
+ }
+
+ dll_export bool TaskScheduler::wait()
+ {
+ Thread* thread = TaskScheduler::thread();
+ if (thread == nullptr) return true;
+ while (thread->tasks.execute_local_internal(*thread,thread->task)) {};
+ return thread->scheduler->cancellingException == nullptr;
+ }
+
+// -- GODOT start --
+// std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
+ void TaskScheduler::thread_loop(size_t threadIndex)
+// -- GODOT end --
+ {
+ /* allocate thread structure */
+ std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
+ Thread& thread = *mthread;
+ threadLocal[threadIndex].store(&thread);
+ Thread* oldThread = swapThread(&thread);
+
+ /* main thread loop */
+ while (anyTasksRunning)
+ {
+ steal_loop(thread,
+ [&] () { return anyTasksRunning > 0; },
+ [&] () {
+ anyTasksRunning++;
+ while (thread.tasks.execute_local_internal(thread,nullptr));
+ anyTasksRunning--;
+ });
+ }
+ threadLocal[threadIndex].store(nullptr);
+ swapThread(oldThread);
+
+ /* remember exception to throw */
+ // -- GODOT start --
+ // std::exception_ptr except = nullptr;
+ // if (cancellingException != nullptr) except = cancellingException;
+ // -- GODOT end --
+ /* wait for all threads to terminate */
+ threadCounter--;
+#if defined(__WIN32__)
+ size_t loopIndex = 1;
+#endif
+#define LOOP_YIELD_THRESHOLD (4096)
+ while (threadCounter > 0) {
+#if defined(__WIN32__)
+ if ((loopIndex % LOOP_YIELD_THRESHOLD) == 0)
+ yield();
+ else
+ _mm_pause();
+ loopIndex++;
+#else
+ yield();
+#endif
+ }
+ // -- GODOT start --
+ // return except;
+ return;
+ // -- GODOT end --
+ }
+
+ bool TaskScheduler::steal_from_other_threads(Thread& thread)
+ {
+ const size_t threadIndex = thread.threadIndex;
+ const size_t threadCount = this->threadCounter;
+
+ for (size_t i=1; i<threadCount; i++)
+ {
+ pause_cpu(32);
+ size_t otherThreadIndex = threadIndex+i;
+ if (otherThreadIndex >= threadCount) otherThreadIndex -= threadCount;
+
+ Thread* othread = threadLocal[otherThreadIndex].load();
+ if (!othread)
+ continue;
+
+ if (othread->tasks.steal(thread))
+ return true;
+ }
+
+ return false;
+ }
+
+ dll_export void TaskScheduler::startThreads() {
+ threadPool->startThreads();
+ }
+
+ dll_export void TaskScheduler::addScheduler(const Ref<TaskScheduler>& scheduler) {
+ threadPool->add(scheduler);
+ }
+
+ dll_export void TaskScheduler::removeScheduler(const Ref<TaskScheduler>& scheduler) {
+ threadPool->remove(scheduler);
+ }
+
+ RTC_NAMESPACE_END
+}
diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.h b/thirdparty/embree/common/tasking/taskschedulerinternal.h
new file mode 100644
index 0000000000..8fa6bb12fa
--- /dev/null
+++ b/thirdparty/embree/common/tasking/taskschedulerinternal.h
@@ -0,0 +1,385 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/alloc.h"
+#include "../sys/barrier.h"
+#include "../sys/thread.h"
+#include "../sys/mutex.h"
+#include "../sys/condition.h"
+#include "../sys/ref.h"
+#include "../sys/atomic.h"
+#include "../math/range.h"
+#include "../../include/embree3/rtcore.h"
+
+#include <list>
+
+namespace embree
+{
+
+ /* The tasking system exports some symbols to be used by the tutorials. Thus we
+ hide is also in the API namespace when requested. */
+ RTC_NAMESPACE_BEGIN
+
+ struct TaskScheduler : public RefCount
+ {
+ ALIGNED_STRUCT_(64);
+ friend class Device;
+
+ static const size_t TASK_STACK_SIZE = 4*1024; //!< task structure stack
+ static const size_t CLOSURE_STACK_SIZE = 512*1024; //!< stack for task closures
+
+ struct Thread;
+
+ /*! virtual interface for all tasks */
+ struct TaskFunction {
+ virtual void execute() = 0;
+ };
+
+ /*! builds a task interface from a closure */
+ template<typename Closure>
+ struct ClosureTaskFunction : public TaskFunction
+ {
+ Closure closure;
+ __forceinline ClosureTaskFunction (const Closure& closure) : closure(closure) {}
+ void execute() { closure(); };
+ };
+
+ struct __aligned(64) Task
+ {
+ /*! states a task can be in */
+ enum { DONE, INITIALIZED };
+
+ /*! switch from one state to another */
+ __forceinline void switch_state(int from, int to)
+ {
+ __memory_barrier();
+ MAYBE_UNUSED bool success = state.compare_exchange_strong(from,to);
+ assert(success);
+ }
+
+ /*! try to switch from one state to another */
+ __forceinline bool try_switch_state(int from, int to) {
+ __memory_barrier();
+ return state.compare_exchange_strong(from,to);
+ }
+
+ /*! increment/decrement dependency counter */
+ void add_dependencies(int n) {
+ dependencies+=n;
+ }
+
+ /*! initialize all tasks to DONE state by default */
+ __forceinline Task()
+ : state(DONE) {}
+
+ /*! construction of new task */
+ __forceinline Task (TaskFunction* closure, Task* parent, size_t stackPtr, size_t N)
+ : dependencies(1), stealable(true), closure(closure), parent(parent), stackPtr(stackPtr), N(N)
+ {
+ if (parent) parent->add_dependencies(+1);
+ switch_state(DONE,INITIALIZED);
+ }
+
+ /*! construction of stolen task, stealing thread will decrement initial dependency */
+ __forceinline Task (TaskFunction* closure, Task* parent)
+ : dependencies(1), stealable(false), closure(closure), parent(parent), stackPtr(-1), N(1)
+ {
+ switch_state(DONE,INITIALIZED);
+ }
+
+ /*! try to steal this task */
+ bool try_steal(Task& child)
+ {
+ if (!stealable) return false;
+ if (!try_switch_state(INITIALIZED,DONE)) return false;
+ new (&child) Task(closure, this);
+ return true;
+ }
+
+ /*! run this task */
+ dll_export void run(Thread& thread);
+
+ void run_internal(Thread& thread);
+
+ public:
+ std::atomic<int> state; //!< state this task is in
+ std::atomic<int> dependencies; //!< dependencies to wait for
+ std::atomic<bool> stealable; //!< true if task can be stolen
+ TaskFunction* closure; //!< the closure to execute
+ Task* parent; //!< parent task to signal when we are finished
+ size_t stackPtr; //!< stack location where closure is stored
+ size_t N; //!< approximative size of task
+ };
+
+ struct TaskQueue
+ {
+ TaskQueue ()
+ : left(0), right(0), stackPtr(0) {}
+
+ __forceinline void* alloc(size_t bytes, size_t align = 64)
+ {
+ size_t ofs = bytes + ((align - stackPtr) & (align-1));
+ if (stackPtr + ofs > CLOSURE_STACK_SIZE)
+ // -- GODOT start --
+ // throw std::runtime_error("closure stack overflow");
+ abort();
+ // -- GODOT end --
+ stackPtr += ofs;
+ return &stack[stackPtr-bytes];
+ }
+
+ template<typename Closure>
+ __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure)
+ {
+ if (right >= TASK_STACK_SIZE)
+ // -- GODOT start --
+ // throw std::runtime_error("task stack overflow");
+ abort();
+ // -- GODOT end --
+
+ /* allocate new task on right side of stack */
+ size_t oldStackPtr = stackPtr;
+ TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction<Closure>))) ClosureTaskFunction<Closure>(closure);
+ new (&tasks[right]) Task(func,thread.task,oldStackPtr,size);
+ right++;
+
+ /* also move left pointer */
+ if (left >= right-1) left = right-1;
+ }
+
+ dll_export bool execute_local(Thread& thread, Task* parent);
+ bool execute_local_internal(Thread& thread, Task* parent);
+ bool steal(Thread& thread);
+ size_t getTaskSizeAtLeft();
+
+ bool empty() { return right == 0; }
+
+ public:
+
+ /* task stack */
+ Task tasks[TASK_STACK_SIZE];
+ __aligned(64) std::atomic<size_t> left; //!< threads steal from left
+ __aligned(64) std::atomic<size_t> right; //!< new tasks are added to the right
+
+ /* closure stack */
+ __aligned(64) char stack[CLOSURE_STACK_SIZE];
+ size_t stackPtr;
+ };
+
+ /*! thread local structure for each thread */
+ struct Thread
+ {
+ ALIGNED_STRUCT_(64);
+
+ Thread (size_t threadIndex, const Ref<TaskScheduler>& scheduler)
+ : threadIndex(threadIndex), task(nullptr), scheduler(scheduler) {}
+
+ __forceinline size_t threadCount() {
+ return scheduler->threadCounter;
+ }
+
+ size_t threadIndex; //!< ID of this thread
+ TaskQueue tasks; //!< local task queue
+ Task* task; //!< current active task
+ Ref<TaskScheduler> scheduler; //!< pointer to task scheduler
+ };
+
+ /*! pool of worker threads */
+ struct ThreadPool
+ {
+ ThreadPool (bool set_affinity);
+ ~ThreadPool ();
+
+ /*! starts the threads */
+ dll_export void startThreads();
+
+ /*! sets number of threads to use */
+ void setNumThreads(size_t numThreads, bool startThreads = false);
+
+ /*! adds a task scheduler object for scheduling */
+ dll_export void add(const Ref<TaskScheduler>& scheduler);
+
+ /*! remove the task scheduler object again */
+ dll_export void remove(const Ref<TaskScheduler>& scheduler);
+
+ /*! returns number of threads of the thread pool */
+ size_t size() const { return numThreads; }
+
+ /*! main loop for all threads */
+ void thread_loop(size_t threadIndex);
+
+ private:
+ std::atomic<size_t> numThreads;
+ std::atomic<size_t> numThreadsRunning;
+ bool set_affinity;
+ std::atomic<bool> running;
+ std::vector<thread_t> threads;
+
+ private:
+ MutexSys mutex;
+ ConditionSys condition;
+ std::list<Ref<TaskScheduler> > schedulers;
+ };
+
+ TaskScheduler ();
+ ~TaskScheduler ();
+
+ /*! initializes the task scheduler */
+ static void create(size_t numThreads, bool set_affinity, bool start_threads);
+
+ /*! destroys the task scheduler again */
+ static void destroy();
+
+ /*! lets new worker threads join the tasking system */
+ void join();
+ void reset();
+
+ /*! let a worker thread allocate a thread index */
+ dll_export ssize_t allocThreadIndex();
+
+ /*! wait for some number of threads available (threadCount includes main thread) */
+ void wait_for_threads(size_t threadCount);
+
+ /*! thread loop for all worker threads */
+ // -- GODOT start --
+ // std::exception_ptr thread_loop(size_t threadIndex);
+ void thread_loop(size_t threadIndex);
+ // -- GODOT end --
+
+ /*! steals a task from a different thread */
+ bool steal_from_other_threads(Thread& thread);
+
+ template<typename Predicate, typename Body>
+ static void steal_loop(Thread& thread, const Predicate& pred, const Body& body);
+
+ /* spawn a new task at the top of the threads task stack */
+ template<typename Closure>
+ void spawn_root(const Closure& closure, size_t size = 1, bool useThreadPool = true)
+ {
+ if (useThreadPool) startThreads();
+
+ size_t threadIndex = allocThreadIndex();
+ std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
+ Thread& thread = *mthread;
+ assert(threadLocal[threadIndex].load() == nullptr);
+ threadLocal[threadIndex] = &thread;
+ Thread* oldThread = swapThread(&thread);
+ thread.tasks.push_right(thread,size,closure);
+ {
+ Lock<MutexSys> lock(mutex);
+ anyTasksRunning++;
+ hasRootTask = true;
+ condition.notify_all();
+ }
+
+ if (useThreadPool) addScheduler(this);
+
+ while (thread.tasks.execute_local(thread,nullptr));
+ anyTasksRunning--;
+ if (useThreadPool) removeScheduler(this);
+
+ threadLocal[threadIndex] = nullptr;
+ swapThread(oldThread);
+
+ /* remember exception to throw */
+ std::exception_ptr except = nullptr;
+ if (cancellingException != nullptr) except = cancellingException;
+
+ /* wait for all threads to terminate */
+ threadCounter--;
+ while (threadCounter > 0) yield();
+ cancellingException = nullptr;
+
+ /* re-throw proper exception */
+ if (except != nullptr)
+ std::rethrow_exception(except);
+ }
+
+ /* spawn a new task at the top of the threads task stack */
+ template<typename Closure>
+ static __forceinline void spawn(size_t size, const Closure& closure)
+ {
+ Thread* thread = TaskScheduler::thread();
+ if (likely(thread != nullptr)) thread->tasks.push_right(*thread,size,closure);
+ else instance()->spawn_root(closure,size);
+ }
+
+ /* spawn a new task at the top of the threads task stack */
+ template<typename Closure>
+ static __forceinline void spawn(const Closure& closure) {
+ spawn(1,closure);
+ }
+
+ /* spawn a new task set */
+ template<typename Index, typename Closure>
+ static void spawn(const Index begin, const Index end, const Index blockSize, const Closure& closure)
+ {
+ spawn(end-begin, [=]()
+ {
+ if (end-begin <= blockSize) {
+ return closure(range<Index>(begin,end));
+ }
+ const Index center = (begin+end)/2;
+ spawn(begin,center,blockSize,closure);
+ spawn(center,end ,blockSize,closure);
+ wait();
+ });
+ }
+
+ /* work on spawned subtasks and wait until all have finished */
+ dll_export static bool wait();
+
+ /* returns the ID of the current thread */
+ dll_export static size_t threadID();
+
+ /* returns the index (0..threadCount-1) of the current thread */
+ dll_export static size_t threadIndex();
+
+ /* returns the total number of threads */
+ dll_export static size_t threadCount();
+
+ private:
+
+ /* returns the thread local task list of this worker thread */
+ dll_export static Thread* thread();
+
+ /* sets the thread local task list of this worker thread */
+ dll_export static Thread* swapThread(Thread* thread);
+
+ /*! returns the taskscheduler object to be used by the master thread */
+ dll_export static TaskScheduler* instance();
+
+ /*! starts the threads */
+ dll_export static void startThreads();
+
+ /*! adds a task scheduler object for scheduling */
+ dll_export static void addScheduler(const Ref<TaskScheduler>& scheduler);
+
+ /*! remove the task scheduler object again */
+ dll_export static void removeScheduler(const Ref<TaskScheduler>& scheduler);
+
+ private:
+ std::vector<atomic<Thread*>> threadLocal;
+ std::atomic<size_t> threadCounter;
+ std::atomic<size_t> anyTasksRunning;
+ std::atomic<bool> hasRootTask;
+ std::exception_ptr cancellingException;
+ MutexSys mutex;
+ ConditionSys condition;
+
+ private:
+ static size_t g_numThreads;
+ static __thread TaskScheduler* g_instance;
+ static __thread Thread* thread_local_thread;
+ static ThreadPool* threadPool;
+ };
+
+ RTC_NAMESPACE_END
+
+#if defined(RTC_NAMESPACE)
+ using RTC_NAMESPACE::TaskScheduler;
+#endif
+}
diff --git a/thirdparty/embree/common/tasking/taskschedulerppl.h b/thirdparty/embree/common/tasking/taskschedulerppl.h
new file mode 100644
index 0000000000..cbc2ecdbb8
--- /dev/null
+++ b/thirdparty/embree/common/tasking/taskschedulerppl.h
@@ -0,0 +1,46 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/alloc.h"
+#include "../sys/barrier.h"
+#include "../sys/thread.h"
+#include "../sys/mutex.h"
+#include "../sys/condition.h"
+#include "../sys/ref.h"
+
+#if !defined(__WIN32__)
+#error PPL tasking system only available under windows
+#endif
+
+#include <ppl.h>
+
+namespace embree
+{
+ struct TaskScheduler
+ {
+ /*! initializes the task scheduler */
+ static void create(size_t numThreads, bool set_affinity, bool start_threads);
+
+ /*! destroys the task scheduler again */
+ static void destroy();
+
+ /* returns the ID of the current thread */
+ static __forceinline size_t threadID() {
+ return GetCurrentThreadId();
+ }
+
+ /* returns the index (0..threadCount-1) of the current thread */
+ /* FIXME: threadIndex is NOT supported by PPL! */
+ static __forceinline size_t threadIndex() {
+ return 0;
+ }
+
+ /* returns the total number of threads */
+ static __forceinline size_t threadCount() {
+ return GetMaximumProcessorCount(ALL_PROCESSOR_GROUPS) + 1;
+ }
+ };
+};
diff --git a/thirdparty/embree/common/tasking/taskschedulertbb.h b/thirdparty/embree/common/tasking/taskschedulertbb.h
new file mode 100644
index 0000000000..35bd49849f
--- /dev/null
+++ b/thirdparty/embree/common/tasking/taskschedulertbb.h
@@ -0,0 +1,73 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/alloc.h"
+#include "../sys/barrier.h"
+#include "../sys/thread.h"
+#include "../sys/mutex.h"
+#include "../sys/condition.h"
+#include "../sys/ref.h"
+
+#if defined(__WIN32__)
+// -- GODOT start --
+#if !defined(NOMINMAX)
+// -- GODOT end --
+# define NOMINMAX
+// -- GODOT start --
+#endif
+// -- GODOT end --
+#endif
+
+// We need to define these to avoid implicit linkage against
+// tbb_debug.lib under Windows. When removing these lines debug build
+// under Windows fails.
+#define __TBB_NO_IMPLICIT_LINKAGE 1
+#define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1
+#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1
+#define TBB_PREVIEW_ISOLATED_TASK_GROUP 1
+#include "tbb/tbb.h"
+#include "tbb/parallel_sort.h"
+
+namespace embree
+{
+ struct TaskScheduler
+ {
+ /*! initializes the task scheduler */
+ static void create(size_t numThreads, bool set_affinity, bool start_threads);
+
+ /*! destroys the task scheduler again */
+ static void destroy();
+
+ /* returns the ID of the current thread */
+ static __forceinline size_t threadID()
+ {
+ return threadIndex();
+ }
+
+ /* returns the index (0..threadCount-1) of the current thread */
+ static __forceinline size_t threadIndex()
+ {
+#if TBB_INTERFACE_VERSION >= 9100
+ return tbb::this_task_arena::current_thread_index();
+#elif TBB_INTERFACE_VERSION >= 9000
+ return tbb::task_arena::current_thread_index();
+#else
+ return 0;
+#endif
+ }
+
+ /* returns the total number of threads */
+ static __forceinline size_t threadCount() {
+#if TBB_INTERFACE_VERSION >= 9100
+ return tbb::this_task_arena::max_concurrency();
+#else
+ return tbb::task_scheduler_init::default_num_threads();
+#endif
+ }
+
+ };
+
+};