From 34b3e8f9e2ae076990ecf3b2827eff759ba2abf9 Mon Sep 17 00:00:00 2001 From: jfons Date: Tue, 20 Apr 2021 18:38:09 +0200 Subject: Add Embree-aarch64 thirdparty library --- .../common/algorithms/parallel_any_of.h | 55 + .../common/algorithms/parallel_filter.cpp | 56 + .../common/algorithms/parallel_filter.h | 93 ++ .../common/algorithms/parallel_for.cpp | 48 + .../common/algorithms/parallel_for.h | 229 +++ .../common/algorithms/parallel_for_for.cpp | 63 + .../common/algorithms/parallel_for_for.h | 149 ++ .../algorithms/parallel_for_for_prefix_sum.cpp | 85 + .../algorithms/parallel_for_for_prefix_sum.h | 112 ++ .../common/algorithms/parallel_map.cpp | 47 + .../common/algorithms/parallel_map.h | 85 + .../common/algorithms/parallel_partition.cpp | 53 + .../common/algorithms/parallel_partition.h | 283 ++++ .../common/algorithms/parallel_prefix_sum.cpp | 48 + .../common/algorithms/parallel_prefix_sum.h | 85 + .../common/algorithms/parallel_reduce.cpp | 49 + .../common/algorithms/parallel_reduce.h | 150 ++ .../common/algorithms/parallel_set.cpp | 43 + .../common/algorithms/parallel_set.h | 52 + .../common/algorithms/parallel_sort.cpp | 50 + .../common/algorithms/parallel_sort.h | 457 +++++ .../embree-aarch64/common/lexers/parsestream.h | 101 ++ thirdparty/embree-aarch64/common/lexers/stream.h | 215 +++ .../embree-aarch64/common/lexers/streamfilters.h | 39 + .../embree-aarch64/common/lexers/stringstream.cpp | 51 + .../embree-aarch64/common/lexers/stringstream.h | 29 + .../embree-aarch64/common/lexers/tokenstream.cpp | 181 ++ .../embree-aarch64/common/lexers/tokenstream.h | 164 ++ thirdparty/embree-aarch64/common/math/AVX2NEON.h | 986 +++++++++++ thirdparty/embree-aarch64/common/math/SSE2NEON.h | 1753 ++++++++++++++++++++ .../embree-aarch64/common/math/affinespace.h | 361 ++++ thirdparty/embree-aarch64/common/math/bbox.h | 331 ++++ thirdparty/embree-aarch64/common/math/col3.h | 47 + thirdparty/embree-aarch64/common/math/col4.h | 47 + thirdparty/embree-aarch64/common/math/color.h | 257 +++ .../embree-aarch64/common/math/constants.cpp | 61 + thirdparty/embree-aarch64/common/math/constants.h | 239 +++ thirdparty/embree-aarch64/common/math/interval.h | 161 ++ thirdparty/embree-aarch64/common/math/lbbox.h | 289 ++++ .../embree-aarch64/common/math/linearspace2.h | 148 ++ .../embree-aarch64/common/math/linearspace3.h | 213 +++ thirdparty/embree-aarch64/common/math/math.h | 451 +++++ thirdparty/embree-aarch64/common/math/obbox.h | 39 + thirdparty/embree-aarch64/common/math/quaternion.h | 254 +++ thirdparty/embree-aarch64/common/math/range.h | 137 ++ .../embree-aarch64/common/math/transcendental.h | 525 ++++++ thirdparty/embree-aarch64/common/math/vec2.h | 235 +++ thirdparty/embree-aarch64/common/math/vec2fa.h | 317 ++++ thirdparty/embree-aarch64/common/math/vec3.h | 349 ++++ thirdparty/embree-aarch64/common/math/vec3ba.h | 120 ++ thirdparty/embree-aarch64/common/math/vec3fa.h | 810 +++++++++ thirdparty/embree-aarch64/common/math/vec3ia.h | 210 +++ thirdparty/embree-aarch64/common/math/vec4.h | 258 +++ thirdparty/embree-aarch64/common/simd/avx.h | 34 + thirdparty/embree-aarch64/common/simd/avx512.h | 41 + thirdparty/embree-aarch64/common/simd/simd.h | 110 ++ thirdparty/embree-aarch64/common/simd/sse.cpp | 34 + thirdparty/embree-aarch64/common/simd/sse.h | 35 + thirdparty/embree-aarch64/common/simd/varying.h | 132 ++ .../embree-aarch64/common/simd/vboold4_avx.h | 160 ++ .../embree-aarch64/common/simd/vboold4_avx512.h | 140 ++ .../embree-aarch64/common/simd/vboold8_avx512.h | 148 ++ .../embree-aarch64/common/simd/vboolf16_avx512.h | 150 ++ .../embree-aarch64/common/simd/vboolf4_avx512.h | 143 ++ .../embree-aarch64/common/simd/vboolf4_sse2.h | 198 +++ .../embree-aarch64/common/simd/vboolf8_avx.h | 189 +++ .../embree-aarch64/common/simd/vboolf8_avx512.h | 143 ++ .../embree-aarch64/common/simd/vdouble4_avx.h | 324 ++++ .../embree-aarch64/common/simd/vdouble8_avx512.h | 356 ++++ .../embree-aarch64/common/simd/vfloat16_avx512.h | 771 +++++++++ .../embree-aarch64/common/simd/vfloat4_sse2.h | 925 +++++++++++ .../embree-aarch64/common/simd/vfloat8_avx.h | 847 ++++++++++ .../embree-aarch64/common/simd/vint16_avx512.h | 490 ++++++ thirdparty/embree-aarch64/common/simd/vint4_sse2.h | 681 ++++++++ thirdparty/embree-aarch64/common/simd/vint8_avx.h | 464 ++++++ thirdparty/embree-aarch64/common/simd/vint8_avx2.h | 512 ++++++ .../embree-aarch64/common/simd/vllong4_avx2.h | 358 ++++ .../embree-aarch64/common/simd/vllong8_avx512.h | 381 +++++ .../embree-aarch64/common/simd/vuint16_avx512.h | 443 +++++ .../embree-aarch64/common/simd/vuint4_sse2.h | 499 ++++++ thirdparty/embree-aarch64/common/simd/vuint8_avx.h | 379 +++++ .../embree-aarch64/common/simd/vuint8_avx2.h | 439 +++++ thirdparty/embree-aarch64/common/sys/alloc.cpp | 327 ++++ thirdparty/embree-aarch64/common/sys/alloc.h | 164 ++ thirdparty/embree-aarch64/common/sys/array.h | 222 +++ thirdparty/embree-aarch64/common/sys/atomic.h | 59 + thirdparty/embree-aarch64/common/sys/barrier.cpp | 289 ++++ thirdparty/embree-aarch64/common/sys/barrier.h | 112 ++ thirdparty/embree-aarch64/common/sys/condition.cpp | 81 + thirdparty/embree-aarch64/common/sys/condition.h | 31 + thirdparty/embree-aarch64/common/sys/filename.cpp | 138 ++ thirdparty/embree-aarch64/common/sys/filename.h | 81 + thirdparty/embree-aarch64/common/sys/intrinsics.h | 559 +++++++ thirdparty/embree-aarch64/common/sys/library.cpp | 83 + thirdparty/embree-aarch64/common/sys/library.h | 21 + thirdparty/embree-aarch64/common/sys/mutex.cpp | 58 + thirdparty/embree-aarch64/common/sys/mutex.h | 98 ++ thirdparty/embree-aarch64/common/sys/platform.h | 387 +++++ thirdparty/embree-aarch64/common/sys/ref.h | 122 ++ .../embree-aarch64/common/sys/regression.cpp | 30 + thirdparty/embree-aarch64/common/sys/regression.h | 25 + thirdparty/embree-aarch64/common/sys/string.cpp | 42 + thirdparty/embree-aarch64/common/sys/string.h | 37 + thirdparty/embree-aarch64/common/sys/sysinfo.cpp | 676 ++++++++ thirdparty/embree-aarch64/common/sys/sysinfo.h | 192 +++ thirdparty/embree-aarch64/common/sys/thread.cpp | 429 +++++ thirdparty/embree-aarch64/common/sys/thread.h | 46 + thirdparty/embree-aarch64/common/sys/vector.h | 242 +++ .../embree-aarch64/common/tasking/taskscheduler.h | 17 + .../common/tasking/taskschedulergcd.h | 49 + .../common/tasking/taskschedulerinternal.cpp | 426 +++++ .../common/tasking/taskschedulerinternal.h | 386 +++++ .../common/tasking/taskschedulerppl.h | 46 + .../common/tasking/taskschedulertbb.h | 67 + 114 files changed, 26738 insertions(+) create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_filter.h create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_for.h create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_map.h create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_partition.h create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_set.h create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp create mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_sort.h create mode 100644 thirdparty/embree-aarch64/common/lexers/parsestream.h create mode 100644 thirdparty/embree-aarch64/common/lexers/stream.h create mode 100644 thirdparty/embree-aarch64/common/lexers/streamfilters.h create mode 100644 thirdparty/embree-aarch64/common/lexers/stringstream.cpp create mode 100644 thirdparty/embree-aarch64/common/lexers/stringstream.h create mode 100644 thirdparty/embree-aarch64/common/lexers/tokenstream.cpp create mode 100644 thirdparty/embree-aarch64/common/lexers/tokenstream.h create mode 100644 thirdparty/embree-aarch64/common/math/AVX2NEON.h create mode 100644 thirdparty/embree-aarch64/common/math/SSE2NEON.h create mode 100644 thirdparty/embree-aarch64/common/math/affinespace.h create mode 100644 thirdparty/embree-aarch64/common/math/bbox.h create mode 100644 thirdparty/embree-aarch64/common/math/col3.h create mode 100644 thirdparty/embree-aarch64/common/math/col4.h create mode 100644 thirdparty/embree-aarch64/common/math/color.h create mode 100644 thirdparty/embree-aarch64/common/math/constants.cpp create mode 100644 thirdparty/embree-aarch64/common/math/constants.h create mode 100644 thirdparty/embree-aarch64/common/math/interval.h create mode 100644 thirdparty/embree-aarch64/common/math/lbbox.h create mode 100644 thirdparty/embree-aarch64/common/math/linearspace2.h create mode 100644 thirdparty/embree-aarch64/common/math/linearspace3.h create mode 100644 thirdparty/embree-aarch64/common/math/math.h create mode 100644 thirdparty/embree-aarch64/common/math/obbox.h create mode 100644 thirdparty/embree-aarch64/common/math/quaternion.h create mode 100644 thirdparty/embree-aarch64/common/math/range.h create mode 100644 thirdparty/embree-aarch64/common/math/transcendental.h create mode 100644 thirdparty/embree-aarch64/common/math/vec2.h create mode 100644 thirdparty/embree-aarch64/common/math/vec2fa.h create mode 100644 thirdparty/embree-aarch64/common/math/vec3.h create mode 100644 thirdparty/embree-aarch64/common/math/vec3ba.h create mode 100644 thirdparty/embree-aarch64/common/math/vec3fa.h create mode 100644 thirdparty/embree-aarch64/common/math/vec3ia.h create mode 100644 thirdparty/embree-aarch64/common/math/vec4.h create mode 100644 thirdparty/embree-aarch64/common/simd/avx.h create mode 100644 thirdparty/embree-aarch64/common/simd/avx512.h create mode 100644 thirdparty/embree-aarch64/common/simd/simd.h create mode 100644 thirdparty/embree-aarch64/common/simd/sse.cpp create mode 100644 thirdparty/embree-aarch64/common/simd/sse.h create mode 100644 thirdparty/embree-aarch64/common/simd/varying.h create mode 100644 thirdparty/embree-aarch64/common/simd/vboold4_avx.h create mode 100644 thirdparty/embree-aarch64/common/simd/vboold4_avx512.h create mode 100644 thirdparty/embree-aarch64/common/simd/vboold8_avx512.h create mode 100644 thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h create mode 100644 thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h create mode 100644 thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h create mode 100644 thirdparty/embree-aarch64/common/simd/vboolf8_avx.h create mode 100644 thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h create mode 100644 thirdparty/embree-aarch64/common/simd/vdouble4_avx.h create mode 100644 thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h create mode 100644 thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h create mode 100644 thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h create mode 100644 thirdparty/embree-aarch64/common/simd/vfloat8_avx.h create mode 100644 thirdparty/embree-aarch64/common/simd/vint16_avx512.h create mode 100644 thirdparty/embree-aarch64/common/simd/vint4_sse2.h create mode 100644 thirdparty/embree-aarch64/common/simd/vint8_avx.h create mode 100644 thirdparty/embree-aarch64/common/simd/vint8_avx2.h create mode 100644 thirdparty/embree-aarch64/common/simd/vllong4_avx2.h create mode 100644 thirdparty/embree-aarch64/common/simd/vllong8_avx512.h create mode 100644 thirdparty/embree-aarch64/common/simd/vuint16_avx512.h create mode 100644 thirdparty/embree-aarch64/common/simd/vuint4_sse2.h create mode 100644 thirdparty/embree-aarch64/common/simd/vuint8_avx.h create mode 100644 thirdparty/embree-aarch64/common/simd/vuint8_avx2.h create mode 100644 thirdparty/embree-aarch64/common/sys/alloc.cpp create mode 100644 thirdparty/embree-aarch64/common/sys/alloc.h create mode 100644 thirdparty/embree-aarch64/common/sys/array.h create mode 100644 thirdparty/embree-aarch64/common/sys/atomic.h create mode 100644 thirdparty/embree-aarch64/common/sys/barrier.cpp create mode 100644 thirdparty/embree-aarch64/common/sys/barrier.h create mode 100644 thirdparty/embree-aarch64/common/sys/condition.cpp create mode 100644 thirdparty/embree-aarch64/common/sys/condition.h create mode 100644 thirdparty/embree-aarch64/common/sys/filename.cpp create mode 100644 thirdparty/embree-aarch64/common/sys/filename.h create mode 100644 thirdparty/embree-aarch64/common/sys/intrinsics.h create mode 100644 thirdparty/embree-aarch64/common/sys/library.cpp create mode 100644 thirdparty/embree-aarch64/common/sys/library.h create mode 100644 thirdparty/embree-aarch64/common/sys/mutex.cpp create mode 100644 thirdparty/embree-aarch64/common/sys/mutex.h create mode 100644 thirdparty/embree-aarch64/common/sys/platform.h create mode 100644 thirdparty/embree-aarch64/common/sys/ref.h create mode 100644 thirdparty/embree-aarch64/common/sys/regression.cpp create mode 100644 thirdparty/embree-aarch64/common/sys/regression.h create mode 100644 thirdparty/embree-aarch64/common/sys/string.cpp create mode 100644 thirdparty/embree-aarch64/common/sys/string.h create mode 100644 thirdparty/embree-aarch64/common/sys/sysinfo.cpp create mode 100644 thirdparty/embree-aarch64/common/sys/sysinfo.h create mode 100644 thirdparty/embree-aarch64/common/sys/thread.cpp create mode 100644 thirdparty/embree-aarch64/common/sys/thread.h create mode 100644 thirdparty/embree-aarch64/common/sys/vector.h create mode 100644 thirdparty/embree-aarch64/common/tasking/taskscheduler.h create mode 100644 thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h create mode 100644 thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp create mode 100644 thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h create mode 100644 thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h create mode 100644 thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h (limited to 'thirdparty/embree-aarch64/common') diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h b/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h new file mode 100644 index 0000000000..01f1f80f6c --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h @@ -0,0 +1,55 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include "parallel_reduce.h" + +namespace embree +{ + + template + __forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred) + { + bool ret = false; + +#if defined(TASKING_TBB) +#if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(tbb::blocked_range{first, last}, [&ret,pred,&context](const tbb::blocked_range& r) { + if (context.is_group_execution_cancelled()) return; + for (size_t i = r.begin(); i != r.end(); ++i) { + if (pred(i)) { + ret = true; + context.cancel_group_execution(); + } + } + }); +#else + tbb::parallel_for(tbb::blocked_range{first, last}, [&ret,pred](const tbb::blocked_range& r) { + if (tbb::task::self().is_cancelled()) return; + for (size_t i = r.begin(); i != r.end(); ++i) { + if (pred(i)) { + ret = true; + tbb::task::self().cancel_group_execution(); + } + } + }); +#endif +#else + ret = parallel_reduce (first, last, false, [pred](const range& r)->bool { + bool localret = false; + for (auto i=r.begin(); i() + ); +#endif + + return ret; + } + +} // end namespace diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp new file mode 100644 index 0000000000..acddc0ff81 --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp @@ -0,0 +1,56 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "parallel_filter.h" +#include "../sys/regression.h" +#include + +namespace embree +{ + struct parallel_filter_regression_test : public RegressionTest + { + parallel_filter_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + auto pred = [&]( uint32_t v ) { return (v & 0x3) == 0; }; + + for (size_t N=10; N<1000000; N=size_t(2.1*N)) + { + size_t N0 = rand() % N; + + /* initialize array with random numbers */ + std::vector src(N); + std::map m; + for (size_t i=0; i + inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate) + { + Index j = first; + for (Index i=first; i + inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate) + { + /* sequential fallback */ + if (end-begin <= minStepSize) + return sequential_filter(data,begin,end,predicate); + + /* calculate number of tasks to use */ + enum { MAX_TASKS = 64 }; + const Index numThreads = TaskScheduler::threadCount(); + const Index numBlocks = (end-begin+minStepSize-1)/minStepSize; + const Index taskCount = min(numThreads,numBlocks,(Index)MAX_TASKS); + + /* filter blocks */ + Index nused[MAX_TASKS]; + Index nfree[MAX_TASKS]; + parallel_for(taskCount, [&](const Index taskIndex) + { + const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount; + const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount; + const Index i2 = sequential_filter(data,i0,i1,predicate); + nused[taskIndex] = i2-i0; + nfree[taskIndex] = i1-i2; + }); + + /* calculate offsets */ + Index sused=0; + Index sfree=0; + Index pfree[MAX_TASKS]; + for (Index i=0; i0; i--) + { + if (k0 > r1) break; + Index k1 = k0+nused[i]; + Index src = begin+(i+0)*(end-begin)/taskCount+nused[i]; + for (Index i=max(r0,k0); i= begin && dst < end); + assert(isrc >= begin && isrc < end); + data[dst++] = data[isrc]; + } + k0 = k1; + } + }); + + return begin+sused; + } +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp new file mode 100644 index 0000000000..ef070ebc4d --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp @@ -0,0 +1,48 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "parallel_for.h" +#include "../sys/regression.h" + +namespace embree +{ + struct parallel_for_regression_test : public RegressionTest + { + parallel_for_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + + const size_t M = 10; + for (size_t N=10; N<10000000; N=size_t(2.1*N)) + { + /* sequentially calculate sum of squares */ + size_t sum0 = 0; + for (size_t i=0; i sum1(0); + parallel_for( size_t(0), size_t(N), size_t(1024), [&](const range& r) + { + size_t s = 0; + for (size_t i=r.begin(); i +#include +#include +#endif + +namespace embree +{ + /* parallel_for without range */ + template + __forceinline void parallel_for( const Index N, const Func& func) + { +#if defined(TASKING_INTERNAL) + if (N) { + TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range& r) { + assert(r.size() == 1); + func(r.begin()); + }); + if (!TaskScheduler::wait()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + } +#elif defined(TASKING_GCD) && defined(BUILD_IOS) + + const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? TaskScheduler::threadCount() : 1; + const size_t length = N; + const size_t blockSize = (length + baselineNumBlocks-1) / baselineNumBlocks; + const size_t numBlocks = (length + blockSize-1) / blockSize; + + dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) { + + const size_t start = (currentBlock * blockSize); + const size_t blockLength = std::min(length - start, blockSize); + const size_t end = start + blockLength; + + for(size_t i=start; i < end; i++) + { + func(i); + } + }); + +#elif defined(TASKING_TBB) + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + }); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + +#elif defined(TASKING_PPL) + concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + }); +#else +# error "no tasking system enabled" +#endif + } + + /* parallel for with range and granulatity */ + template + __forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func) + { + assert(first <= last); +#if defined(TASKING_INTERNAL) + TaskScheduler::spawn(first,last,minStepSize,func); + if (!TaskScheduler::wait()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + +#elif defined(TASKING_GCD) && defined(BUILD_IOS) + + const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? 4*TaskScheduler::threadCount() : 1; + const size_t length = last - first; + const size_t blockSizeByThreads = (length + baselineNumBlocks-1) / baselineNumBlocks; + size_t blockSize = std::max(minStepSize,blockSizeByThreads); + blockSize += blockSize % 4; + + const size_t numBlocks = (length + blockSize-1) / blockSize; + + dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) { + + const size_t start = first + (currentBlock * blockSize); + const size_t end = std::min(last, start + blockSize); + + func( embree::range(start,end) ); + }); + + +#elif defined(TASKING_TBB) + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(tbb::blocked_range(first,last,minStepSize),[&](const tbb::blocked_range& r) { + func(range(r.begin(),r.end())); + },context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(tbb::blocked_range(first,last,minStepSize),[&](const tbb::blocked_range& r) { + func(range(r.begin(),r.end())); + }); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + +#elif defined(TASKING_PPL) + concurrency::parallel_for(first, last, Index(1) /*minStepSize*/, [&](Index i) { + func(range(i,i+1)); + }); + +#else +# error "no tasking system enabled" +#endif + } + + /* parallel for with range */ + template + __forceinline void parallel_for( const Index first, const Index last, const Func& func) + { + assert(first <= last); + parallel_for(first,last,(Index)1,func); + } + +#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001) + + template + __forceinline void parallel_for_static( const Index N, const Func& func) + { + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },tbb::simple_partitioner(),context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },tbb::simple_partitioner()); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + } + + typedef tbb::affinity_partitioner affinity_partitioner; + + template + __forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap) + { + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },ap,context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },ap); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + } + +#else + + template + __forceinline void parallel_for_static( const Index N, const Func& func) + { + parallel_for(N,func); + } + + struct affinity_partitioner { + }; + + template + __forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap) + { + parallel_for(N,func); + } + +#endif +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp new file mode 100644 index 0000000000..0337611b35 --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp @@ -0,0 +1,63 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "parallel_for_for.h" +#include "../sys/regression.h" + +namespace embree +{ + struct parallel_for_for_regression_test : public RegressionTest + { + parallel_for_for_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + + /* create vector with random numbers */ + size_t sum0 = 0; + size_t K = 0; + const size_t M = 1000; + std::vector* > array2(M); + for (size_t i=0; i(N); + for (size_t j=0; j> verify_k(K); + for (size_t i=0; i sum1(0); + parallel_for_for( array2, size_t(1), [&](std::vector* v, const range& r, size_t k) -> size_t + { + size_t s = 0; + for (size_t i=r.begin(); i + __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) + { + size_t k=0; + for (size_t i=0; i!=array2.size(); ++i) { + const size_t N = array2[i]->size(); + if (N) func(array2[i],range(0,N),k); + k+=N; + } + } + + class ParallelForForState + { + public: + + enum { MAX_TASKS = 64 }; + + __forceinline ParallelForForState () + : taskCount(0) {} + + template + __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) { + init(array2,minStepSize); + } + + template + __forceinline void init ( ArrayArray& array2, const size_t minStepSize ) + { + /* first calculate total number of elements */ + size_t N = 0; + for (size_t i=0; isize() : 0; + } + this->N = N; + + /* calculate number of tasks to use */ + const size_t numThreads = TaskScheduler::threadCount(); + const size_t numBlocks = (N+minStepSize-1)/minStepSize; + taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS))); + + /* calculate start (i,j) for each task */ + size_t taskIndex = 0; + i0[taskIndex] = 0; + j0[taskIndex] = 0; + size_t k0 = (++taskIndex)*N/taskCount; + for (size_t i=0, k=0; taskIndex < taskCount; i++) + { + assert(isize() : 0; + while (j= k0 && taskIndex < taskCount) { + assert(taskIndex + __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) + { + ParallelForForState state(array2,minStepSize); + + parallel_for(state.taskCount, [&](const size_t taskIndex) + { + /* calculate range */ + const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; + const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + for (size_t i=i0; ksize() : 0; + const size_t r0 = j0, r1 = min(N,r0+k1-k); + if (r1 > r0) func(array2[i],range(r0,r1),k); + k+=r1-r0; j0 = 0; + } + }); + } + + template + __forceinline void parallel_for_for( ArrayArray& array2, const Func& func ) + { + parallel_for_for(array2,1,func); + } + + template + __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { + ParallelForForState state(array2,minStepSize); + Value temp[ParallelForForState::MAX_TASKS]; + + for (size_t i=0; isize() : 0; + const size_t r0 = j0, r1 = min(N,r0+k1-k); + if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range(r0,r1),k)); + k+=r1-r0; j0 = 0; + } + }); + + Value ret = identity; + for (size_t i=0; i + __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_reduce(array2,1,identity,func,reduction); + } +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp new file mode 100644 index 0000000000..0169d8e481 --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp @@ -0,0 +1,85 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "parallel_for_for_prefix_sum.h" +#include "../sys/regression.h" + +namespace embree +{ + struct parallel_for_for_prefix_sum_regression_test : public RegressionTest + { + parallel_for_for_prefix_sum_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + + /* create vector with random numbers */ + const size_t M = 10; + std::vector> flattened; + typedef std::vector* > ArrayArray; + ArrayArray array2(M); + size_t K = 0; + for (size_t i=0; i(N); + for (size_t j=0; j> verify_k(K); + for (size_t i=0; i state(array2,size_t(1)); + + /* dry run only counts */ + size_t S = parallel_for_for_prefix_sum0( state, array2, size_t(0), [&](std::vector* v, const range& r, size_t k, size_t i) -> size_t + { + size_t s = 0; + for (size_t i=r.begin(); i* v, const range& r, size_t k, size_t i, const size_t base) -> size_t + { + size_t s = 0; + for (size_t i=r.begin(); i + struct ParallelForForPrefixSumState : public ParallelForForState + { + __forceinline ParallelForForPrefixSumState () {} + + template + __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize) + : ParallelForForState(array2,minStepSize) {} + + ParallelPrefixSumState prefix_state; + }; + + template + __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState& state, ArrayArray& array2, Index minStepSize, + const Value& identity, const Func& func, const Reduction& reduction) + { + /* calculate number of tasks to use */ + const size_t taskCount = state.taskCount; + /* perform parallel prefix sum */ + parallel_for(taskCount, [&](const size_t taskIndex) + { + const size_t k0 = (taskIndex+0)*state.size()/taskCount; + const size_t k1 = (taskIndex+1)*state.size()/taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + Value N=identity; + for (size_t i=i0; ksize() : 0; + const size_t r0 = j0, r1 = min(size,r0+k1-k); + if (r1 > r0) N = reduction(N, func(array2[i],range((Index)r0,(Index)r1),(Index)k,(Index)i)); + k+=r1-r0; j0 = 0; + } + state.prefix_state.counts[taskIndex] = N; + }); + + /* calculate prefix sum */ + Value sum=identity; + for (size_t i=0; i + __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState& state, ArrayArray& array2, Index minStepSize, + const Value& identity, const Func& func, const Reduction& reduction) + { + /* calculate number of tasks to use */ + const size_t taskCount = state.taskCount; + /* perform parallel prefix sum */ + parallel_for(taskCount, [&](const size_t taskIndex) + { + const size_t k0 = (taskIndex+0)*state.size()/taskCount; + const size_t k1 = (taskIndex+1)*state.size()/taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + Value N=identity; + for (size_t i=i0; ksize() : 0; + const size_t r0 = j0, r1 = min(size,r0+k1-k); + if (r1 > r0) N = reduction(N, func(array2[i],range((Index)r0,(Index)r1),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N))); + k+=r1-r0; j0 = 0; + } + state.prefix_state.counts[taskIndex] = N; + }); + + /* calculate prefix sum */ + Value sum=identity; + for (size_t i=0; i + __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState& state, ArrayArray& array2, + const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction); + } + + template + __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState& state, ArrayArray& array2, + const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction); + } +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp new file mode 100644 index 0000000000..09dc303f81 --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp @@ -0,0 +1,47 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "parallel_map.h" +#include "../sys/regression.h" + +namespace embree +{ + struct parallel_map_regression_test : public RegressionTest + { + parallel_map_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + + /* create key/value vectors with random numbers */ + const size_t N = 10000; + std::vector keys(N); + std::vector vals(N); + for (size_t i=0; i map; + map.init(keys,vals); + + /* check that all keys are properly mapped */ + for (size_t i=0; i + class parallel_map + { + /* key/value pair to build the map */ + struct KeyValue + { + __forceinline KeyValue () {} + + __forceinline KeyValue (const Key key, const Val val) + : key(key), val(val) {} + + __forceinline operator Key() const { + return key; + } + + public: + Key key; + Val val; + }; + + public: + + /*! parallel map constructors */ + parallel_map () {} + + /*! construction from pair of vectors */ + template + parallel_map (const KeyVector& keys, const ValVector& values) { init(keys,values); } + + /*! initialized the parallel map from a vector with keys and values */ + template + void init(const KeyVector& keys, const ValVector& values) + { + /* reserve sufficient space for all data */ + assert(keys.size() == values.size()); + vec.resize(keys.size()); + + /* generate key/value pairs */ + parallel_for( size_t(0), keys.size(), size_t(4*4096), [&](const range& r) { + for (size_t i=r.begin(); i temp(keys.size()); + radix_sort(vec.data(),temp.data(),keys.size()); + } + + /*! Returns a pointer to the value associated with the specified key. The pointer will be nullptr of the key is not contained in the map. */ + __forceinline const Val* lookup(const Key& key) const + { + typename std::vector::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key); + if (i == vec.end()) return nullptr; + if (i->key != key) return nullptr; + return &i->val; + } + + /*! If the key is in the map, the function returns the value associated with the key, otherwise it returns the default value. */ + __forceinline Val lookup(const Key& key, const Val& def) const + { + typename std::vector::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key); + if (i == vec.end()) return def; + if (i->key != key) return def; + return i->val; + } + + /*! clears all state */ + void clear() { + vec.clear(); + } + + private: + std::vector vec; //!< vector containing sorted elements + }; +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp new file mode 100644 index 0000000000..eb20c4465d --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp @@ -0,0 +1,53 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "parallel_partition.h" +#include "../sys/regression.h" + +namespace embree +{ + struct parallel_partition_regression_test : public RegressionTest + { + parallel_partition_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + + for (size_t i=0; i<100; i++) + { + /* create random permutation */ + size_t N = std::rand() % 1000000; + std::vector array(N); + for (unsigned i=0; i= split; + } + + return passed; + } + }; + + parallel_partition_regression_test parallel_partition_regression("parallel_partition_regression_test"); +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h new file mode 100644 index 0000000000..3b3ad7c854 --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h @@ -0,0 +1,283 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for.h" +#include "../math/range.h" + +namespace embree +{ + /* serial partitioning */ + template + __forceinline size_t serial_partitioning(T* array, + const size_t begin, + const size_t end, + V& leftReduction, + V& rightReduction, + const IsLeft& is_left, + const Reduction_T& reduction_t) + { + T* l = array + begin; + T* r = array + end - 1; + + while(1) + { + /* *l < pivot */ + while (likely(l <= r && is_left(*l) )) + { + //prefetchw(l+4); // FIXME: enable? + reduction_t(leftReduction,*l); + ++l; + } + /* *r >= pivot) */ + while (likely(l <= r && !is_left(*r))) + { + //prefetchw(r-4); FIXME: enable? + reduction_t(rightReduction,*r); + --r; + } + if (r + class __aligned(64) parallel_partition_task + { + ALIGNED_CLASS_(64); + private: + + static const size_t MAX_TASKS = 64; + + T* array; + size_t N; + const IsLeft& is_left; + const Reduction_T& reduction_t; + const Reduction_V& reduction_v; + const Vi& identity; + + size_t numTasks; + __aligned(64) size_t counter_start[MAX_TASKS+1]; + __aligned(64) size_t counter_left[MAX_TASKS+1]; + __aligned(64) range leftMisplacedRanges[MAX_TASKS]; + __aligned(64) range rightMisplacedRanges[MAX_TASKS]; + __aligned(64) V leftReductions[MAX_TASKS]; + __aligned(64) V rightReductions[MAX_TASKS]; + + public: + + __forceinline parallel_partition_task(T* array, + const size_t N, + const Vi& identity, + const IsLeft& is_left, + const Reduction_T& reduction_t, + const Reduction_V& reduction_v, + const size_t BLOCK_SIZE) + + : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity), + numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {} + + __forceinline const range* findStartRange(size_t& index, const range* const r, const size_t numRanges) + { + size_t i = 0; + while(index >= (size_t)r[i].size()) + { + assert(i < numRanges); + index -= (size_t)r[i].size(); + i++; + } + return &r[i]; + } + + __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges, + const size_t numRightMisplacedRanges, + const size_t startID, + const size_t endID) + { + size_t leftLocalIndex = startID; + size_t rightLocalIndex = startID; + const range* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges); + const range* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges); + + size_t l_left = l_range->size() - leftLocalIndex; + size_t r_left = r_range->size() - rightLocalIndex; + T *__restrict__ l = &array[l_range->begin() + leftLocalIndex]; + T *__restrict__ r = &array[r_range->begin() + rightLocalIndex]; + size_t size = endID - startID; + size_t items = min(size,min(l_left,r_left)); + + while (size) + { + if (unlikely(l_left == 0)) + { + l_range++; + l_left = l_range->size(); + l = &array[l_range->begin()]; + items = min(size,min(l_left,r_left)); + } + + if (unlikely(r_left == 0)) + { + r_range++; + r_left = r_range->size(); + r = &array[r_range->begin()]; + items = min(size,min(l_left,r_left)); + } + + size -= items; + l_left -= items; + r_left -= items; + + while(items) { + items--; + xchg(*l++,*r++); + } + } + } + + __forceinline size_t partition(V& leftReduction, V& rightReduction) + { + /* partition the individual ranges for each task */ + parallel_for(numTasks,[&] (const size_t taskID) { + const size_t startID = (taskID+0)*N/numTasks; + const size_t endID = (taskID+1)*N/numTasks; + V local_left(identity); + V local_right(identity); + const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t); + counter_start[taskID] = startID; + counter_left [taskID] = mid-startID; + leftReductions[taskID] = local_left; + rightReductions[taskID] = local_right; + }); + counter_start[numTasks] = N; + counter_left[numTasks] = 0; + + /* finalize the reductions */ + for (size_t i=0; i globalLeft (0,mid); + const range globalRight(mid,N); + + /* calculate all left and right ranges that are on the wrong global side */ + size_t numMisplacedRangesLeft = 0; + size_t numMisplacedRangesRight = 0; + size_t numMisplacedItemsLeft = 0; + size_t numMisplacedItemsRight = 0; + + for (size_t i=0; i left_range (counter_start[i], counter_start[i] + counter_left[i]); + const range right_range(counter_start[i] + counter_left[i], counter_start[i+1]); + const range left_misplaced = globalLeft. intersect(right_range); + const range right_misplaced = globalRight.intersect(left_range); + + if (!left_misplaced.empty()) + { + numMisplacedItemsLeft += left_misplaced.size(); + leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced; + } + + if (!right_misplaced.empty()) + { + numMisplacedItemsRight += right_misplaced.size(); + rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced; + } + } + assert( numMisplacedItemsLeft == numMisplacedItemsRight ); + + /* if no items are misplaced we are done */ + if (numMisplacedItemsLeft == 0) + return mid; + + /* otherwise we copy the items to the right place in parallel */ + parallel_for(numTasks,[&] (const size_t taskID) { + const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks; + const size_t endID = (taskID+1)*numMisplacedItemsLeft/numTasks; + swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID); + }); + + return mid; + } + }; + + template + __noinline size_t parallel_partitioning(T* array, + const size_t begin, + const size_t end, + const Vi &identity, + V &leftReduction, + V &rightReduction, + const IsLeft& is_left, + const Reduction_T& reduction_t, + const Reduction_V& reduction_v, + size_t BLOCK_SIZE = 128) + { + /* fall back to single threaded partitioning for small N */ + if (unlikely(end-begin < BLOCK_SIZE)) + return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); + + /* otherwise use parallel code */ + else { + typedef parallel_partition_task partition_task; + std::unique_ptr p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); + return begin+p->partition(leftReduction,rightReduction); + } + } + + template + __noinline size_t parallel_partitioning(T* array, + const size_t begin, + const size_t end, + const Vi &identity, + V &leftReduction, + V &rightReduction, + const IsLeft& is_left, + const Reduction_T& reduction_t, + const Reduction_V& reduction_v, + size_t BLOCK_SIZE, + size_t PARALLEL_THRESHOLD) + { + /* fall back to single threaded partitioning for small N */ + if (unlikely(end-begin < PARALLEL_THRESHOLD)) + return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); + + /* otherwise use parallel code */ + else { + typedef parallel_partition_task partition_task; + std::unique_ptr p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); + return begin+p->partition(leftReduction,rightReduction); + } + } + + + template + inline size_t parallel_partitioning(T* array, + const size_t begin, + const size_t end, + const IsLeft& is_left, + size_t BLOCK_SIZE = 128) + { + size_t leftReduction = 0; + size_t rightReduction = 0; + return parallel_partitioning( + array,begin,end,0,leftReduction,rightReduction,is_left, + [] (size_t& t,const T& ref) { }, + [] (size_t& t0,size_t& t1) { }, + BLOCK_SIZE); + } + +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp new file mode 100644 index 0000000000..685952c3dc --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp @@ -0,0 +1,48 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "parallel_prefix_sum.h" +#include "../sys/regression.h" + +namespace embree +{ + struct parallel_prefix_sum_regression_test : public RegressionTest + { + parallel_prefix_sum_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + const size_t M = 10; + + for (size_t N=10; N<10000000; N=size_t(2.1*N)) + { + /* initialize array with random numbers */ + uint32_t sum0 = 0; + std::vector src(N); + for (size_t i=0; i dst(N); + for (auto& v : dst) v = 0; + + for (size_t i=0; i()); + passed &= (sum0 == sum1); + } + + /* check if prefix sum is correct */ + for (size_t i=0, sum=0; i + struct ParallelPrefixSumState + { + enum { MAX_TASKS = 64 }; + Value counts[MAX_TASKS]; + Value sums [MAX_TASKS]; + }; + + template + __forceinline Value parallel_prefix_sum( ParallelPrefixSumState& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction) + { + /* calculate number of tasks to use */ + const size_t numThreads = TaskScheduler::threadCount(); + const size_t numBlocks = (last-first+minStepSize-1)/minStepSize; + const size_t taskCount = min(numThreads,numBlocks,size_t(ParallelPrefixSumState::MAX_TASKS)); + + /* perform parallel prefix sum */ + parallel_for(taskCount, [&](const size_t taskIndex) + { + const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount; + const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount; + state.counts[taskIndex] = func(range(i0,i1),state.sums[taskIndex]); + }); + + /* calculate prefix sum */ + Value sum=identity; + for (size_t i=0; i + __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096) + { + /* perform single threaded prefix operation for small N */ + if (N < SINGLE_THREAD_THRESHOLD) + { + Value sum=identity; + for (size_t i=0; i state; + + /* initial run just sets up start values for subtasks */ + parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range& r, const Value& sum) -> Value { + + Value s = identity; + for (size_t i=r.begin(); i& r, const Value& sum) -> Value { + + Value s = identity; + for (size_t i=r.begin(); i& r) -> size_t + { + size_t s = 0; + for (size_t i=r.begin(); i + __forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) + { + return func(range(first,last)); + } + + template + __forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { + return func(range(first,last)); + } + + template + __noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { + const Index maxTasks = 512; + const Index threadCount = (Index) TaskScheduler::threadCount(); + taskCount = min(taskCount,threadCount,maxTasks); + + /* parallel invokation of all tasks */ + dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack + parallel_for(taskCount, [&](const Index taskIndex) { + const Index k0 = first+(taskIndex+0)*(last-first)/taskCount; + const Index k1 = first+(taskIndex+1)*(last-first)/taskCount; + values[taskIndex] = func(range(k0,k1)); + }); + + /* perform reduction over all tasks */ + Value v = identity; + for (Index i=0; i + __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { +#if defined(TASKING_INTERNAL) || (defined(TASKING_GCD) && defined(BUILD_IOS)) + + /* fast path for small number of iterations */ + Index taskCount = (last-first+minStepSize-1)/minStepSize; + if (likely(taskCount == 1)) { + return func(range(first,last)); + } + return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction); + +#elif defined(TASKING_TBB) + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + const Value v = tbb::parallel_reduce(tbb::blocked_range(first,last,minStepSize),identity, + [&](const tbb::blocked_range& r, const Value& start) { return reduction(start,func(range(r.begin(),r.end()))); }, + reduction,context); + // -- GODOT start -- + // if (context.is_group_execution_cancelled()) + // throw std::runtime_error("task cancelled"); + // -- GODOT end -- + return v; + #else + const Value v = tbb::parallel_reduce(tbb::blocked_range(first,last,minStepSize),identity, + [&](const tbb::blocked_range& r, const Value& start) { return reduction(start,func(range(r.begin(),r.end()))); }, + reduction); + // -- GODOT start -- + // if (tbb::task::self().is_cancelled()) + // throw std::runtime_error("task cancelled"); + // -- GODOT end -- + return v; + #endif +#else // TASKING_PPL + struct AlignedValue + { + char storage[__alignof(Value)+sizeof(Value)]; + static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); }; + Value* getValuePtr() { return reinterpret_cast(alignUp(uintptr_t(storage), __alignof(Value))); } + const Value* getValuePtr() const { return reinterpret_cast(alignUp(uintptr_t(storage), __alignof(Value))); } + AlignedValue(const Value& v) { new(getValuePtr()) Value(v); } + AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); } + AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); }; + AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; }; + AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; }; + operator Value() const { return *getValuePtr(); } + }; + + struct Iterator_Index + { + Index v; + typedef std::forward_iterator_tag iterator_category; + typedef AlignedValue value_type; + typedef Index difference_type; + typedef Index distance_type; + typedef AlignedValue* pointer; + typedef AlignedValue& reference; + __forceinline Iterator_Index() {} + __forceinline Iterator_Index(Index v) : v(v) {} + __forceinline bool operator== (Iterator_Index other) { return v == other.v; } + __forceinline bool operator!= (Iterator_Index other) { return v != other.v; } + __forceinline Iterator_Index operator++() { return Iterator_Index(++v); } + __forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); } + }; + + auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) { + assert(begin.v < end.v); + return reduction(start, func(range(begin.v, end.v))); + }; + const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction); + return v; +#endif + } + + template + __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) + { + if (likely(last-first < parallel_threshold)) { + return func(range(first,last)); + } else { + return parallel_reduce(first,last,minStepSize,identity,func,reduction); + } + } + + template + __forceinline Value parallel_reduce( const range range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) + { + return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction); + } + + template + __forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) + { + auto funcr = [&] ( const range r ) { + Value v = identity; + for (Index i=r.begin(); i + __forceinline Value parallel_reduce( const range range, const Value& identity, const Func& func, const Reduction& reduction ) + { + return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction); + } +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp new file mode 100644 index 0000000000..20b639c1c9 --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp @@ -0,0 +1,43 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "parallel_set.h" +#include "../sys/regression.h" + +namespace embree +{ + struct parallel_set_regression_test : public RegressionTest + { + parallel_set_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + + /* create vector with random numbers */ + const size_t N = 10000; + std::vector unsorted(N); + for (size_t i=0; i sorted; + sorted.init(unsorted); + + /* check that all elements are in the set */ + for (size_t i=0; i + class parallel_set + { + public: + + /*! default constructor for the parallel set */ + parallel_set () {} + + /*! construction from vector */ + template + parallel_set (const Vector& in) { init(in); } + + /*! initialized the parallel set from a vector */ + template + void init(const Vector& in) + { + /* copy data to internal vector */ + vec.resize(in.size()); + parallel_for( size_t(0), in.size(), size_t(4*4096), [&](const range& r) { + for (size_t i=r.begin(); i temp(in.size()); + radix_sort(vec.data(),temp.data(),vec.size()); + } + + /*! tests if some element is in the set */ + __forceinline bool lookup(const T& elt) const { + return std::binary_search(vec.begin(), vec.end(), elt); + } + + /*! clears all state */ + void clear() { + vec.clear(); + } + + private: + std::vector vec; //!< vector containing sorted elements + }; +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp new file mode 100644 index 0000000000..5e7ec79ac1 --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp @@ -0,0 +1,50 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "parallel_sort.h" +#include "../sys/regression.h" + +namespace embree +{ + template + struct RadixSortRegressionTest : public RegressionTest + { + RadixSortRegressionTest(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + const size_t M = 10; + + for (size_t N=10; N<1000000; N=size_t(2.1*N)) + { + std::vector src(N); memset(src.data(),0,N*sizeof(Key)); + std::vector tmp(N); memset(tmp.data(),0,N*sizeof(Key)); + for (size_t i=0; i(src.data(),tmp.data(),N); + } + + /* calculate checksum */ + Key sum1 = 0; for (size_t i=0; i test_u32("RadixSortRegressionTestU32"); + RadixSortRegressionTest test_u64("RadixSortRegressionTestU64"); +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h new file mode 100644 index 0000000000..a758227c1b --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h @@ -0,0 +1,457 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../simd/simd.h" +#include "parallel_for.h" +#if defined(TASKING_GCD) && defined(BUILD_IOS) +#include "../sys/alloc.h" +#endif +#include + +namespace embree +{ + template + __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length) + { + for(size_t i = 1;i 0 && v < array[j-1]) + { + array[j] = array[j-1]; + --j; + } + array[j] = v; + } + } + + template + __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length) + { + for(size_t i = 1;i 0 && v > array[j-1]) + { + array[j] = array[j-1]; + --j; + } + array[j] = v; + } + } + + template + void quicksort_ascending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] > pivotvalue); + while (t[++left] < pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const int pivot = right; + quicksort_ascending(t, begin, pivot); + quicksort_ascending(t, pivot + 1, end); + } + } + + template + void quicksort_decending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] < pivotvalue); + while (t[++left] > pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const int pivot = right; + quicksort_decending(t, begin, pivot); + quicksort_decending(t, pivot + 1, end); + } + } + + + template + void quicksort_insertionsort_ascending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const ssize_t size = end-begin+1; + if (likely(size <= THRESHOLD)) + { + insertionsort_ascending(&t[begin],size); + } + else + { + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] > pivotvalue); + while (t[++left] < pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const ssize_t pivot = right; + quicksort_insertionsort_ascending(t, begin, pivot); + quicksort_insertionsort_ascending(t, pivot + 1, end); + } + } + } + + + template + void quicksort_insertionsort_decending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const ssize_t size = end-begin+1; + if (likely(size <= THRESHOLD)) + { + insertionsort_decending(&t[begin],size); + } + else + { + + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] < pivotvalue); + while (t[++left] > pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const ssize_t pivot = right; + quicksort_insertionsort_decending(t, begin, pivot); + quicksort_insertionsort_decending(t, pivot + 1, end); + } + } + } + + template + static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8) + { + static const unsigned int BITS = 8; + static const unsigned int BUCKETS = (1 << BITS); + static const unsigned int CMP_SORT_THRESHOLD = 16; + + __aligned(64) unsigned int count[BUCKETS]; + + /* clear buckets */ + for (size_t i=0;i> shift) & (BUCKETS-1)]++; + + /* prefix sums */ + __aligned(64) unsigned int head[BUCKETS]; + __aligned(64) unsigned int tail[BUCKETS]; + + head[0] = 0; + for (size_t i=1; i> shift) & (BUCKETS-1); + if (b == i) break; + std::swap(v,morton[head[b]++]); + } + assert((unsigned(v) >> shift & (BUCKETS-1)) == i); + morton[head[i]++] = v; + } + } + if (shift == 0) return; + + size_t offset = 0; + for (size_t i=0;i> shift) & (BUCKETS-1)) == i); + + if (unlikely(count[i] < CMP_SORT_THRESHOLD)) + insertionsort_ascending(morton + offset, count[i]); + else + radixsort32(morton + offset, count[i], shift-BITS); + + for (size_t j=offset;j + class ParallelRadixSort + { + static const size_t MAX_TASKS = 64; + static const size_t BITS = 8; + static const size_t BUCKETS = (1 << BITS); + typedef unsigned int TyRadixCount[BUCKETS]; + + template + static bool compare(const T& v0, const T& v1) { + return (Key)v0 < (Key)v1; + } + + private: + ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement + ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement + + + public: + ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N) + : radixCount(nullptr), src(src), tmp(tmp), N(N) {} + + void sort(const size_t blockSize) + { + assert(blockSize > 0); + + /* perform single threaded sort for small N */ + if (N<=blockSize) // handles also special case of 0! + { + /* do inplace sort inside destination array */ + std::sort(src,src+N,compare); + } + + /* perform parallel sort for large N */ + else + { + const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS)); + tbbRadixSort(numThreads); + } + } + + ~ParallelRadixSort() + { + alignedFree(radixCount); + radixCount = nullptr; + } + + private: + + void tbbRadixIteration0(const Key shift, + const Ty* __restrict const src, + Ty* __restrict const dst, + const size_t threadIndex, const size_t threadCount) + { + const size_t startID = (threadIndex+0)*N/threadCount; + const size_t endID = (threadIndex+1)*N/threadCount; + + /* mask to extract some number of bits */ + const Key mask = BUCKETS-1; + + /* count how many items go into the buckets */ + for (size_t i=0; i> (size_t)shift) & (size_t)mask; +#else + const Key index = ((Key)src[i] >> shift) & mask; +#endif + count[index]++; + } + } + + void tbbRadixIteration1(const Key shift, + const Ty* __restrict const src, + Ty* __restrict const dst, + const size_t threadIndex, const size_t threadCount) + { + const size_t startID = (threadIndex+0)*N/threadCount; + const size_t endID = (threadIndex+1)*N/threadCount; + + /* mask to extract some number of bits */ + const Key mask = BUCKETS-1; + + /* calculate total number of items for each bucket */ + __aligned(64) unsigned int total[BUCKETS]; + /* + for (size_t i=0; i> (size_t)shift) & (size_t)mask; +#else + const size_t index = ((Key)src[i] >> shift) & mask; +#endif + dst[offset[index]++] = elt; + } + } + + void tbbRadixIteration(const Key shift, const bool last, + const Ty* __restrict src, Ty* __restrict dst, + const size_t numTasks) + { + affinity_partitioner ap; + parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap); + parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap); + } + + void tbbRadixSort(const size_t numTasks) + { + radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64); + + if (sizeof(Key) == sizeof(uint32_t)) { + tbbRadixIteration(0*BITS,0,src,tmp,numTasks); + tbbRadixIteration(1*BITS,0,tmp,src,numTasks); + tbbRadixIteration(2*BITS,0,src,tmp,numTasks); + tbbRadixIteration(3*BITS,1,tmp,src,numTasks); + } + else if (sizeof(Key) == sizeof(uint64_t)) + { + tbbRadixIteration(0*BITS,0,src,tmp,numTasks); + tbbRadixIteration(1*BITS,0,tmp,src,numTasks); + tbbRadixIteration(2*BITS,0,src,tmp,numTasks); + tbbRadixIteration(3*BITS,0,tmp,src,numTasks); + tbbRadixIteration(4*BITS,0,src,tmp,numTasks); + tbbRadixIteration(5*BITS,0,tmp,src,numTasks); + tbbRadixIteration(6*BITS,0,src,tmp,numTasks); + tbbRadixIteration(7*BITS,1,tmp,src,numTasks); + } + } + + private: + TyRadixCount* radixCount; + Ty* const src; + Ty* const tmp; + const size_t N; + }; + + template + void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) + { + ParallelRadixSort(src,tmp,N).sort(blockSize); + } + + template + void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) + { + ParallelRadixSort(src,tmp,N).sort(blockSize); + } + + template + void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { + radix_sort(src,tmp,N,blockSize); + } + + template + void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { + radix_sort(src,tmp,N,blockSize); + } +} diff --git a/thirdparty/embree-aarch64/common/lexers/parsestream.h b/thirdparty/embree-aarch64/common/lexers/parsestream.h new file mode 100644 index 0000000000..db46dc114f --- /dev/null +++ b/thirdparty/embree-aarch64/common/lexers/parsestream.h @@ -0,0 +1,101 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stringstream.h" +#include "../sys/filename.h" +#include "../math/vec2.h" +#include "../math/vec3.h" +#include "../math/col3.h" +#include "../math/color.h" + +namespace embree +{ + /*! helper class for simple command line parsing */ + class ParseStream : public Stream + { + public: + ParseStream (const Ref >& cin) : cin(cin) {} + + ParseStream (const Ref >& cin, const std::string& seps = "\n\t\r ", + const std::string& endl = "", bool multiLine = false) + : cin(new StringStream(cin,seps,endl,multiLine)) {} + + public: + ParseLocation location() { return cin->loc(); } + std::string next() { return cin->get(); } + + void force(const std::string& next) { + std::string token = getString(); + if (token != next) + THROW_RUNTIME_ERROR("token \""+next+"\" expected but token \""+token+"\" found"); + } + + std::string getString() { + return get(); + } + + FileName getFileName() { + return FileName(get()); + } + + int getInt () { + return atoi(get().c_str()); + } + + Vec2i getVec2i() { + int x = atoi(get().c_str()); + int y = atoi(get().c_str()); + return Vec2i(x,y); + } + + Vec3ia getVec3ia() { + int x = atoi(get().c_str()); + int y = atoi(get().c_str()); + int z = atoi(get().c_str()); + return Vec3ia(x,y,z); + } + + float getFloat() { + return (float)atof(get().c_str()); + } + + Vec2f getVec2f() { + float x = (float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + return Vec2f(x,y); + } + + Vec3f getVec3f() { + float x = (float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + float z = (float)atof(get().c_str()); + return Vec3f(x,y,z); + } + + Vec3fa getVec3fa() { + float x = (float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + float z = (float)atof(get().c_str()); + return Vec3fa(x,y,z); + } + + Col3f getCol3f() { + float x = (float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + float z = (float)atof(get().c_str()); + return Col3f(x,y,z); + } + + Color getColor() { + float r = (float)atof(get().c_str()); + float g = (float)atof(get().c_str()); + float b = (float)atof(get().c_str()); + return Color(r,g,b); + } + + private: + Ref > cin; + }; +} diff --git a/thirdparty/embree-aarch64/common/lexers/stream.h b/thirdparty/embree-aarch64/common/lexers/stream.h new file mode 100644 index 0000000000..3f75677e68 --- /dev/null +++ b/thirdparty/embree-aarch64/common/lexers/stream.h @@ -0,0 +1,215 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/ref.h" +#include "../sys/filename.h" +#include "../sys/string.h" + +#include +#include +#include +#include + +namespace embree +{ + /*! stores the location of a stream element in the source */ + class ParseLocation + { + public: + ParseLocation () : lineNumber(-1), colNumber(-1) {} + ParseLocation (std::shared_ptr fileName, ssize_t lineNumber, ssize_t colNumber, ssize_t /*charNumber*/) + : fileName(fileName), lineNumber(lineNumber), colNumber(colNumber) {} + + std::string str() const + { + std::string str = "unknown"; + if (fileName) str = *fileName; + if (lineNumber >= 0) str += " line " + toString(lineNumber); + if (lineNumber >= 0 && colNumber >= 0) str += " character " + toString(colNumber); + return str; + } + + private: + std::shared_ptr fileName; /// name of the file (or stream) the token is from + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + }; + + /*! a stream class templated over the stream elements */ + template class Stream : public RefCount + { + enum { BUF_SIZE = 1024 }; + + private: + virtual T next() = 0; + virtual ParseLocation location() = 0; + __forceinline std::pair nextHelper() { + ParseLocation l = location(); + T v = next(); + return std::pair(v,l); + } + __forceinline void push_back(const std::pair& v) { + if (past+future == BUF_SIZE) pop_front(); + size_t end = (start+past+future++)%BUF_SIZE; + buffer[end] = v; + } + __forceinline void pop_front() { + if (past == 0) THROW_RUNTIME_ERROR("stream buffer empty"); + start = (start+1)%BUF_SIZE; past--; + } + public: + Stream () : start(0), past(0), future(0), buffer(BUF_SIZE) {} + virtual ~Stream() {} + + public: + + const ParseLocation& loc() { + if (future == 0) push_back(nextHelper()); + return buffer[(start+past)%BUF_SIZE].second; + } + T get() { + if (future == 0) push_back(nextHelper()); + T t = buffer[(start+past)%BUF_SIZE].first; + past++; future--; + return t; + } + const T& peek() { + if (future == 0) push_back(nextHelper()); + return buffer[(start+past)%BUF_SIZE].first; + } + const T& unget(size_t n = 1) { + if (past < n) THROW_RUNTIME_ERROR ("cannot unget that many items"); + past -= n; future += n; + return peek(); + } + void drop() { + if (future == 0) push_back(nextHelper()); + past++; future--; + } + private: + size_t start,past,future; + std::vector > buffer; + }; + + /*! warps an iostream stream */ + class StdStream : public Stream + { + public: + StdStream (std::istream& cin, const std::string& name = "std::stream") + : cin(cin), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr(new std::string(name))) {} + ~StdStream() {} + ParseLocation location() { + return ParseLocation(name,lineNumber,colNumber,charNumber); + } + int next() { + int c = cin.get(); + if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; + charNumber++; + return c; + } + private: + std::istream& cin; + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + ssize_t charNumber; /// the character in the file + std::shared_ptr name; /// name of buffer + }; + + /*! creates a stream from a file */ + class FileStream : public Stream + { + public: + + FileStream (FILE* file, const std::string& name = "file") + : file(file), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr(new std::string(name))) {} + + FileStream (const FileName& fileName) + : lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr(new std::string(fileName.str()))) + { + file = fopen(fileName.c_str(),"r"); + if (file == nullptr) THROW_RUNTIME_ERROR("cannot open file " + fileName.str()); + } + ~FileStream() { if (file) fclose(file); } + + public: + ParseLocation location() { + return ParseLocation(name,lineNumber,colNumber,charNumber); + } + + int next() { + int c = fgetc(file); + if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; + charNumber++; + return c; + } + + private: + FILE* file; + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + ssize_t charNumber; /// the character in the file + std::shared_ptr name; /// name of buffer + }; + + /*! creates a stream from a string */ + class StrStream : public Stream + { + public: + + StrStream (const char* str) + : str(str), lineNumber(1), colNumber(0), charNumber(0) {} + + public: + ParseLocation location() { + return ParseLocation(std::shared_ptr(),lineNumber,colNumber,charNumber); + } + + int next() { + int c = str[charNumber]; + if (c == 0) return EOF; + if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; + charNumber++; + return c; + } + + private: + const char* str; + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + ssize_t charNumber; /// the character in the file + }; + + /*! creates a character stream from a command line */ + class CommandLineStream : public Stream + { + public: + CommandLineStream (int argc, char** argv, const std::string& name = "command line") + : i(0), j(0), charNumber(0), name(std::shared_ptr(new std::string(name))) + { + if (argc > 0) { + for (size_t i=0; argv[0][i] && i<1024; i++) charNumber++; + charNumber++; + } + for (ssize_t k=1; k args; + ssize_t charNumber; /// the character in the file + std::shared_ptr name; /// name of buffer + }; +} diff --git a/thirdparty/embree-aarch64/common/lexers/streamfilters.h b/thirdparty/embree-aarch64/common/lexers/streamfilters.h new file mode 100644 index 0000000000..25580a77b8 --- /dev/null +++ b/thirdparty/embree-aarch64/common/lexers/streamfilters.h @@ -0,0 +1,39 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stream.h" + +namespace embree +{ + /* removes all line comments from a stream */ + class LineCommentFilter : public Stream + { + public: + LineCommentFilter (const FileName& fileName, const std::string& lineComment) + : cin(new FileStream(fileName)), lineComment(lineComment) {} + LineCommentFilter (Ref > cin, const std::string& lineComment) + : cin(cin), lineComment(lineComment) {} + + ParseLocation location() { return cin->loc(); } + + int next() + { + /* look if the line comment starts here */ + for (size_t j=0; jpeek() != lineComment[j]) { cin->unget(j); goto not_found; } + cin->get(); + } + /* eat all characters until the end of the line (or file) */ + while (cin->peek() != '\n' && cin->peek() != EOF) cin->get(); + + not_found: + return cin->get(); + } + + private: + Ref > cin; + std::string lineComment; + }; +} diff --git a/thirdparty/embree-aarch64/common/lexers/stringstream.cpp b/thirdparty/embree-aarch64/common/lexers/stringstream.cpp new file mode 100644 index 0000000000..98dc80ad59 --- /dev/null +++ b/thirdparty/embree-aarch64/common/lexers/stringstream.cpp @@ -0,0 +1,51 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "stringstream.h" + +namespace embree +{ + static const std::string stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\"; + + /* creates map for fast categorization of characters */ + static void createCharMap(bool map[256], const std::string& chrs) { + for (size_t i=0; i<256; i++) map[i] = false; + for (size_t i=0; i >& cin, const std::string& seps, const std::string& endl, bool multiLine) + : cin(cin), endl(endl), multiLine(multiLine) + { + createCharMap(isSepMap,seps); + createCharMap(isValidCharMap,stringChars); + } + + std::string StringStream::next() + { + /* skip separators */ + while (cin->peek() != EOF) { + if (endl != "" && cin->peek() == '\n') { cin->drop(); return endl; } + if (multiLine && cin->peek() == '\\') { + cin->drop(); + if (cin->peek() == '\n') { cin->drop(); continue; } + cin->unget(); + } + if (!isSeparator(cin->peek())) break; + cin->drop(); + } + + /* parse everything until the next separator */ + std::vector str; str.reserve(64); + while (cin->peek() != EOF && !isSeparator(cin->peek())) { + int c = cin->get(); + // -- GODOT start -- + // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input"); + if (!isValidChar(c)) abort(); + // -- GODOT end -- + str.push_back((char)c); + } + str.push_back(0); + return std::string(str.data()); + } +} diff --git a/thirdparty/embree-aarch64/common/lexers/stringstream.h b/thirdparty/embree-aarch64/common/lexers/stringstream.h new file mode 100644 index 0000000000..e6dbd4aecc --- /dev/null +++ b/thirdparty/embree-aarch64/common/lexers/stringstream.h @@ -0,0 +1,29 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stream.h" + +namespace embree +{ + /*! simple tokenizer that produces a string stream */ + class StringStream : public Stream + { + public: + StringStream(const Ref >& cin, const std::string& seps = "\n\t\r ", + const std::string& endl = "", bool multiLine = false); + public: + ParseLocation location() { return cin->loc(); } + std::string next(); + private: + __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; } + __forceinline bool isValidChar(unsigned int c) const { return c<256 && isValidCharMap[c]; } + private: + Ref > cin; /*! source character stream */ + bool isSepMap[256]; /*! map for fast classification of separators */ + bool isValidCharMap[256]; /*! map for valid characters */ + std::string endl; /*! the token of the end of line */ + bool multiLine; /*! whether to parse lines wrapped with \ */ + }; +} diff --git a/thirdparty/embree-aarch64/common/lexers/tokenstream.cpp b/thirdparty/embree-aarch64/common/lexers/tokenstream.cpp new file mode 100644 index 0000000000..d05be65862 --- /dev/null +++ b/thirdparty/embree-aarch64/common/lexers/tokenstream.cpp @@ -0,0 +1,181 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "tokenstream.h" +#include "../math/math.h" + +namespace embree +{ + /* shorthands for common sets of characters */ + const std::string TokenStream::alpha = "abcdefghijklmnopqrstuvwxyz"; + const std::string TokenStream::ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + const std::string TokenStream::numbers = "0123456789"; + const std::string TokenStream::separators = "\n\t\r "; + const std::string TokenStream::stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\"; + + /* creates map for fast categorization of characters */ + static void createCharMap(bool map[256], const std::string& chrs) { + for (size_t i=0; i<256; i++) map[i] = false; + for (size_t i=0; i >& cin, //< stream to read from + const std::string& alpha, //< valid characters for identifiers + const std::string& seps, //< characters that act as separators + const std::vector& symbols) //< symbols + : cin(cin), symbols(symbols) + { + createCharMap(isAlphaMap,alpha); + createCharMap(isSepMap,seps); + createCharMap(isStringCharMap,stringChars); + } + + bool TokenStream::decDigits(std::string& str_o) + { + bool ok = false; + std::string str; + if (cin->peek() == '+' || cin->peek() == '-') str += (char)cin->get(); + while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); } + if (ok) str_o += str; + else cin->unget(str.size()); + return ok; + } + + bool TokenStream::decDigits1(std::string& str_o) + { + bool ok = false; + std::string str; + while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); } + if (ok) str_o += str; else cin->unget(str.size()); + return ok; + } + + bool TokenStream::trySymbol(const std::string& symbol) + { + size_t pos = 0; + while (pos < symbol.size()) { + if (symbol[pos] != cin->peek()) { cin->unget(pos); return false; } + cin->drop(); pos++; + } + return true; + } + + bool TokenStream::trySymbols(Token& token, const ParseLocation& loc) + { + for (size_t i=0; ipeek() == '.') { + str += (char)cin->get(); + decDigits(str); + if (cin->peek() == 'e' || cin->peek() == 'E') { + str += (char)cin->get(); + if (decDigits(str)) ok = true; // 1.[2]E2 + } + else ok = true; // 1.[2] + } + else if (cin->peek() == 'e' || cin->peek() == 'E') { + str += (char)cin->get(); + if (decDigits(str)) ok = true; // 1E2 + } + } + else + { + if (cin->peek() == '.') { + str += (char)cin->get(); + if (decDigits(str)) { + if (cin->peek() == 'e' || cin->peek() == 'E') { + str += (char)cin->get(); + if (decDigits(str)) ok = true; // .3E2 + } + else ok = true; // .3 + } + } + } + if (ok) { + token = Token((float)atof(str.c_str()),loc); + } + else cin->unget(str.size()); + return ok; + } + + bool TokenStream::tryInt(Token& token, const ParseLocation& loc) { + std::string str; + if (decDigits(str)) { + token = Token(atoi(str.c_str()),loc); + return true; + } + return false; + } + + bool TokenStream::tryString(Token& token, const ParseLocation& loc) + { + std::string str; + if (cin->peek() != '\"') return false; + cin->drop(); + while (cin->peek() != '\"') { + const int c = cin->get(); + if (!isStringChar(c)) THROW_RUNTIME_ERROR("invalid string character "+std::string(1,c)+" at "+loc.str()); + str += (char)c; + } + cin->drop(); + token = Token(str,Token::TY_STRING,loc); + return true; + } + + bool TokenStream::tryIdentifier(Token& token, const ParseLocation& loc) + { + std::string str; + if (!isAlpha(cin->peek())) return false; + str += (char)cin->get(); + while (isAlphaNum(cin->peek())) str += (char)cin->get(); + token = Token(str,Token::TY_IDENTIFIER,loc); + return true; + } + + void TokenStream::skipSeparators() + { + /* skip separators */ + while (cin->peek() != EOF && isSeparator(cin->peek())) + cin->drop(); + } + + Token TokenStream::next() + { + Token token; + skipSeparators(); + ParseLocation loc = cin->loc(); + if (trySymbols (token,loc)) return token; /**< try to parse a symbol */ + if (tryFloat (token,loc)) return token; /**< try to parse float */ + if (tryInt (token,loc)) return token; /**< try to parse integer */ + if (tryString (token,loc)) return token; /**< try to parse string */ + if (tryIdentifier(token,loc)) return token; /**< try to parse identifier */ + if (cin->peek() == EOF ) return Token(loc); /**< return EOF token */ + return Token((char)cin->get(),loc); /**< return invalid character token */ + } +} diff --git a/thirdparty/embree-aarch64/common/lexers/tokenstream.h b/thirdparty/embree-aarch64/common/lexers/tokenstream.h new file mode 100644 index 0000000000..72a7b4f2f3 --- /dev/null +++ b/thirdparty/embree-aarch64/common/lexers/tokenstream.h @@ -0,0 +1,164 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stream.h" +#include +#include + +namespace embree +{ + /*! token class */ + class Token + { + public: + + enum Type { TY_EOF, TY_CHAR, TY_INT, TY_FLOAT, TY_IDENTIFIER, TY_STRING, TY_SYMBOL }; + + Token ( const ParseLocation& loc = ParseLocation()) : ty(TY_EOF ), loc(loc) {} + Token (char c, const ParseLocation& loc = ParseLocation()) : ty(TY_CHAR ), c(c), loc(loc) {} + Token (int i, const ParseLocation& loc = ParseLocation()) : ty(TY_INT ), i(i), loc(loc) {} + Token (float f,const ParseLocation& loc = ParseLocation()) : ty(TY_FLOAT), f(f), loc(loc) {} + Token (std::string str, Type ty, const ParseLocation& loc = ParseLocation()) : ty(ty), str(str), loc(loc) {} + + static Token Eof() { return Token(); } + static Token Sym(std::string str) { return Token(str,TY_SYMBOL); } + static Token Str(std::string str) { return Token(str,TY_STRING); } + static Token Id (std::string str) { return Token(str,TY_IDENTIFIER); } + + char Char() const { + if (ty == TY_CHAR) return c; + THROW_RUNTIME_ERROR(loc.str()+": character expected"); + } + + int Int() const { + if (ty == TY_INT) return i; + THROW_RUNTIME_ERROR(loc.str()+": integer expected"); + } + + float Float(bool cast = true) const { + if (ty == TY_FLOAT) return f; + if (ty == TY_INT && cast) return (float)i; + THROW_RUNTIME_ERROR(loc.str()+": float expected"); + } + + std::string Identifier() const { + if (ty == TY_IDENTIFIER) return str; + THROW_RUNTIME_ERROR(loc.str()+": identifier expected"); + } + + std::string String() const { + if (ty == TY_STRING) return str; + THROW_RUNTIME_ERROR(loc.str()+": string expected"); + } + + std::string Symbol() const { + if (ty == TY_SYMBOL) return str; + THROW_RUNTIME_ERROR(loc.str()+": symbol expected"); + } + + const ParseLocation& Location() const { return loc; } + + friend bool operator==(const Token& a, const Token& b) + { + if (a.ty != b.ty) return false; + if (a.ty == TY_CHAR) return a.c == b.c; + if (a.ty == TY_INT) return a.i == b.i; + if (a.ty == TY_FLOAT) return a.f == b.f; + if (a.ty == TY_IDENTIFIER) return a.str == b.str; + if (a.ty == TY_STRING) return a.str == b.str; + if (a.ty == TY_SYMBOL) return a.str == b.str; + return true; + } + + friend bool operator!=(const Token& a, const Token& b) { + return !(a == b); + } + + friend bool operator <( const Token& a, const Token& b ) { + if (a.ty != b.ty) return (int)a.ty < (int)b.ty; + if (a.ty == TY_CHAR) return a.c < b.c; + if (a.ty == TY_INT) return a.i < b.i; + if (a.ty == TY_FLOAT) return a.f < b.f; + if (a.ty == TY_IDENTIFIER) return a.str < b.str; + if (a.ty == TY_STRING) return a.str < b.str; + if (a.ty == TY_SYMBOL) return a.str < b.str; + return false; + } + + friend std::ostream& operator<<(std::ostream& cout, const Token& t) + { + if (t.ty == TY_EOF) return cout << "eof"; + if (t.ty == TY_CHAR) return cout << "Char(" << t.c << ")"; + if (t.ty == TY_INT) return cout << "Int(" << t.i << ")"; + if (t.ty == TY_FLOAT) return cout << "Float(" << t.f << ")"; + if (t.ty == TY_IDENTIFIER) return cout << "Id(" << t.str << ")"; + if (t.ty == TY_STRING) return cout << "String(" << t.str << ")"; + if (t.ty == TY_SYMBOL) return cout << "Symbol(" << t.str << ")"; + return cout << "unknown"; + } + + private: + Type ty; //< the type of the token + union { + char c; //< data for char tokens + int i; //< data for int tokens + float f; //< data for float tokens + }; + std::string str; //< data for string and identifier tokens + ParseLocation loc; //< the location the token is from + }; + + /*! build full tokenizer that takes list of valid characters and keywords */ + class TokenStream : public Stream + { + public: + + /*! shorthands for common sets of characters */ + static const std::string alpha; + static const std::string ALPHA; + static const std::string numbers; + static const std::string separators; + static const std::string stringChars; + + public: + TokenStream(const Ref >& cin, + const std::string& alpha, //< valid characters for identifiers + const std::string& seps, //< characters that act as separators + const std::vector& symbols = std::vector()); //< symbols + public: + ParseLocation location() { return cin->loc(); } + Token next(); + bool trySymbol(const std::string& symbol); + + private: + void skipSeparators(); + bool decDigits(std::string& str); + bool decDigits1(std::string& str); + bool trySymbols(Token& token, const ParseLocation& loc); + bool tryFloat(Token& token, const ParseLocation& loc); + bool tryInt(Token& token, const ParseLocation& loc); + bool tryString(Token& token, const ParseLocation& loc); + bool tryIdentifier(Token& token, const ParseLocation& loc); + + Ref > cin; + bool isSepMap[256]; + bool isAlphaMap[256]; + bool isStringCharMap[256]; + std::vector symbols; + + /*! checks if a character is a separator */ + __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; } + + /*! checks if a character is a number */ + __forceinline bool isDigit(unsigned int c) const { return c >= '0' && c <= '9'; } + + /*! checks if a character is valid inside a string */ + __forceinline bool isStringChar(unsigned int c) const { return c<256 && isStringCharMap[c]; } + + /*! checks if a character is legal for an identifier */ + __forceinline bool isAlpha(unsigned int c) const { return c<256 && isAlphaMap[c]; } + __forceinline bool isAlphaNum(unsigned int c) const { return isAlpha(c) || isDigit(c); } + }; +} diff --git a/thirdparty/embree-aarch64/common/math/AVX2NEON.h b/thirdparty/embree-aarch64/common/math/AVX2NEON.h new file mode 100644 index 0000000000..e8698ac56d --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/AVX2NEON.h @@ -0,0 +1,986 @@ +#pragma once + +#include "SSE2NEON.h" + + +#define AVX2NEON_ABI static inline __attribute__((always_inline)) + + +struct __m256d; + +struct __m256 { + __m128 lo,hi; + __m256() {} +}; + + + + +struct __m256i { + __m128i lo,hi; + explicit __m256i(const __m256 a) : lo(__m128i(a.lo)),hi(__m128i(a.hi)) {} + operator __m256() const {__m256 res; res.lo = __m128(lo);res.hi = __m128(hi); return res;} + __m256i() {} +}; + + + + +struct __m256d { + float64x2_t lo,hi; + __m256d() {} + __m256d(const __m256& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {} + __m256d(const __m256i& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {} +}; + +#define UNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a) {type res;res.lo=basic_func(a.lo);res.hi=basic_func(a.hi);return res;} + + +#define BINARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=basic_func(a.lo,b.lo);res.hi=basic_func(a.hi,b.hi);return res;} +#define BINARY_AVX_OP_CAST(type,func,basic_func,bdst,bsrc) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=bdst(basic_func(bsrc(a.lo),bsrc(b.lo)));res.hi=bdst(basic_func(bsrc(a.hi),bsrc(b.hi)));return res;} + +#define TERNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b,const type& c) {type res;res.lo=basic_func(a.lo,b.lo,c.lo);res.hi=basic_func(a.hi,b.hi,c.hi);return res;} + + +#define CAST_SIMD_TYPE(to,name,from,basic_dst) AVX2NEON_ABI to name(const from& a) { to res; res.lo = basic_dst(a.lo); res.hi=basic_dst(a.hi); return res;} + + + +#define _mm_stream_load_si128 _mm_load_si128 +#define _mm256_stream_load_si256 _mm256_load_si256 + + +AVX2NEON_ABI +__m128 _mm_blend_ps (__m128 a, __m128 b, const int imm8) +{ + __m128 res; + for (int i=0;i<4;i++) + { + if (imm8 & (1< +AVX2NEON_ABI float32x4_t dpps_neon(const float32x4_t& a,const float32x4_t& b) +{ + float v; + v = 0; + v += (code & 0x10) ? a[0]*b[0] : 0; + v += (code & 0x20) ? a[1]*b[1] : 0; + v += (code & 0x40) ? a[2]*b[2] : 0; + v += (code & 0x80) ? a[3]*b[3] : 0; + float32x4_t res; + res[0] = (code & 0x1) ? v : 0; + res[1] = (code & 0x2) ? v : 0; + res[2] = (code & 0x4) ? v : 0; + res[3] = (code & 0x8) ? v : 0; + return res; +} + +template<> +inline float32x4_t dpps_neon<0x7f>(const float32x4_t& a,const float32x4_t& b) +{ + float v; + float32x4_t m = _mm_mul_ps(a,b); + m[3] = 0; + v = vaddvq_f32(m); + return _mm_set1_ps(v); +} + +template<> +inline float32x4_t dpps_neon<0xff>(const float32x4_t& a,const float32x4_t& b) +{ + float v; + float32x4_t m = _mm_mul_ps(a,b); + v = vaddvq_f32(m); + return _mm_set1_ps(v); +} + +#define _mm_dp_ps(a,b,c) dpps_neon((a),(b)) + + + +AVX2NEON_ABI +__m128 _mm_cmpnge_ps (__m128 a, __m128 b) +{ + return __m128(vmvnq_s32(__m128i(_mm_cmpge_ps(a,b)))); +} + + +AVX2NEON_ABI +__m128 _mm_permutevar_ps (__m128 a, __m128i b) +{ + __m128 x; + for (int i=0;i<4;i++) + { + x[i] = a[b[i&3]]; + } + return x; +} + +AVX2NEON_ABI +__m256i _mm256_setzero_si256() +{ + __m256i res; + res.lo = res.hi = vdupq_n_s32(0); + return res; +} + +AVX2NEON_ABI +__m256 _mm256_setzero_ps() +{ + __m256 res; + res.lo = res.hi = vdupq_n_f32(0.0f); + return res; +} + +AVX2NEON_ABI +__m256i _mm256_undefined_si256() +{ + return _mm256_setzero_si256(); +} + +AVX2NEON_ABI +__m256 _mm256_undefined_ps() +{ + return _mm256_setzero_ps(); +} + +CAST_SIMD_TYPE(__m256d,_mm256_castps_pd,__m256,float64x2_t) +CAST_SIMD_TYPE(__m256i,_mm256_castps_si256,__m256,__m128i) +CAST_SIMD_TYPE(__m256, _mm256_castsi256_ps, __m256i,__m128) +CAST_SIMD_TYPE(__m256, _mm256_castpd_ps ,__m256d,__m128) +CAST_SIMD_TYPE(__m256d, _mm256_castsi256_pd, __m256i,float64x2_t) +CAST_SIMD_TYPE(__m256i, _mm256_castpd_si256, __m256d,__m128i) + + + + +AVX2NEON_ABI +__m128 _mm256_castps256_ps128 (__m256 a) +{ + return a.lo; +} + +AVX2NEON_ABI +__m256i _mm256_castsi128_si256 (__m128i a) +{ + __m256i res; + res.lo = a ; + res.hi = vdupq_n_s32(0); + return res; +} + +AVX2NEON_ABI +__m128i _mm256_castsi256_si128 (__m256i a) +{ + return a.lo; +} + +AVX2NEON_ABI +__m256 _mm256_castps128_ps256 (__m128 a) +{ + __m256 res; + res.lo = a; + res.hi = vdupq_n_f32(0); + return res; +} + + +AVX2NEON_ABI +__m256 _mm256_broadcast_ss (float const * mem_addr) +{ + __m256 res; + res.lo = res.hi = vdupq_n_f32(*mem_addr); + return res; +} + + + +AVX2NEON_ABI +__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) +{ + __m128i lo = {e0,e1,e2,e3}, hi = {e4,e5,e6,e7}; + __m256i res; + res.lo = lo; res.hi = hi; + return res; + +} + +AVX2NEON_ABI +__m256i _mm256_set1_epi32 (int a) +{ + __m256i res; + res.lo = res.hi = vdupq_n_s32(a); + return res; +} + + + + +AVX2NEON_ABI +int _mm256_movemask_ps(const __m256& v) +{ + return (_mm_movemask_ps(v.hi) << 4) | _mm_movemask_ps(v.lo); +} + +template +AVX2NEON_ABI +__m256 __mm256_permute_ps (const __m256& a) +{ + __m256 res; + res.lo = _mm_shuffle_ps(a.lo,a.lo,imm8); + res.hi = _mm_shuffle_ps(a.hi,a.hi,imm8); + return res; + +} + +#define _mm256_permute_ps(a,c) __mm256_permute_ps(a) + + +template +AVX2NEON_ABI +__m256 __mm256_shuffle_ps (const __m256 a,const __m256& b) +{ + __m256 res; + res.lo = _mm_shuffle_ps(a.lo,b.lo,imm8); + res.hi = _mm_shuffle_ps(a.hi,b.hi,imm8); + return res; + +} + +#define _mm256_shuffle_ps(a,b,c) __mm256_shuffle_ps(a,b) + +AVX2NEON_ABI +__m256i _mm256_set1_epi64x (long long a) +{ + __m256i res; + int64x2_t t = vdupq_n_s64(a); + res.lo = res.hi = __m128i(t); + return res; +} + + +AVX2NEON_ABI +__m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8) +{ + __m256 res; + __m128 tmp; + switch (imm8 & 0x7) + { + case 0: tmp = a.lo; break; + case 1: tmp = a.hi; break; + case 2: tmp = b.lo; break; + case 3: tmp = b.hi; break; + } + if (imm8 & 0x8) + tmp = _mm_setzero_ps(); + + + + res.lo = tmp; + imm8 >>= 4; + + switch (imm8 & 0x7) + { + case 0: tmp = a.lo; break; + case 1: tmp = a.hi; break; + case 2: tmp = b.lo; break; + case 3: tmp = b.hi; break; + } + if (imm8 & 0x8) + tmp = _mm_setzero_ps(); + + res.hi = tmp; + + return res; +} + +AVX2NEON_ABI +__m256 _mm256_moveldup_ps (__m256 a) +{ + __m256 res; + res.lo[0] = res.lo[1] = a.lo[0]; + res.lo[2] = res.lo[3] = a.lo[2]; + res.hi[0] = res.hi[1] = a.hi[0]; + res.hi[2] = res.hi[3] = a.hi[2]; + return res; + +} + +AVX2NEON_ABI +__m256 _mm256_movehdup_ps (__m256 a) +{ + __m256 res; + res.lo[0] = res.lo[1] = a.lo[1]; + res.lo[2] = res.lo[3] = a.lo[3]; + res.hi[0] = res.hi[1] = a.hi[1]; + res.hi[2] = res.hi[3] = a.hi[3]; + return res; +} + +AVX2NEON_ABI +__m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8) +{ + __m256 res = a; + if (imm8 & 1) res.hi = b; + else res.lo = b; + return res; +} + + +AVX2NEON_ABI +__m128 _mm256_extractf128_ps (__m256 a, const int imm8) +{ + if (imm8 & 1) return a.hi; + return a.lo; +} + + +AVX2NEON_ABI +__m256d _mm256_movedup_pd (__m256d a) +{ + __m256d res; + res.hi = a.hi; + res.lo[0] = res.lo[1] = a.lo[0]; + return res; +} + +AVX2NEON_ABI +__m256i _mm256_abs_epi32(__m256i a) +{ + __m256i res; + res.lo = vabsq_s32(a.lo); + res.hi = vabsq_s32(a.hi); + return res; +} + +UNARY_AVX_OP(__m256,_mm256_sqrt_ps,_mm_sqrt_ps) +UNARY_AVX_OP(__m256,_mm256_rsqrt_ps,_mm_rsqrt_ps) +UNARY_AVX_OP(__m256,_mm256_rcp_ps,_mm_rcp_ps) +UNARY_AVX_OP(__m256,_mm256_floor_ps,vrndmq_f32) +UNARY_AVX_OP(__m256,_mm256_ceil_ps,vrndpq_f32) + + +BINARY_AVX_OP(__m256i,_mm256_add_epi32,_mm_add_epi32) +BINARY_AVX_OP(__m256i,_mm256_sub_epi32,_mm_sub_epi32) +BINARY_AVX_OP(__m256i,_mm256_mullo_epi32,_mm_mullo_epi32) + +BINARY_AVX_OP(__m256i,_mm256_min_epi32,_mm_min_epi32) +BINARY_AVX_OP(__m256i,_mm256_max_epi32,_mm_max_epi32) +BINARY_AVX_OP_CAST(__m256i,_mm256_min_epu32,vminq_u32,__m128i,uint32x4_t) +BINARY_AVX_OP_CAST(__m256i,_mm256_max_epu32,vmaxq_u32,__m128i,uint32x4_t) + +BINARY_AVX_OP(__m256,_mm256_min_ps,_mm_min_ps) +BINARY_AVX_OP(__m256,_mm256_max_ps,_mm_max_ps) + +BINARY_AVX_OP(__m256,_mm256_add_ps,_mm_add_ps) +BINARY_AVX_OP(__m256,_mm256_mul_ps,_mm_mul_ps) +BINARY_AVX_OP(__m256,_mm256_sub_ps,_mm_sub_ps) +BINARY_AVX_OP(__m256,_mm256_div_ps,_mm_div_ps) + +BINARY_AVX_OP(__m256,_mm256_and_ps,_mm_and_ps) +BINARY_AVX_OP(__m256,_mm256_andnot_ps,_mm_andnot_ps) +BINARY_AVX_OP(__m256,_mm256_or_ps,_mm_or_ps) +BINARY_AVX_OP(__m256,_mm256_xor_ps,_mm_xor_ps) + +BINARY_AVX_OP_CAST(__m256d,_mm256_and_pd,vandq_s64,float64x2_t,int64x2_t) +BINARY_AVX_OP_CAST(__m256d,_mm256_or_pd,vorrq_s64,float64x2_t,int64x2_t) +BINARY_AVX_OP_CAST(__m256d,_mm256_xor_pd,veorq_s64,float64x2_t,int64x2_t) + + + +BINARY_AVX_OP(__m256i,_mm256_and_si256,_mm_and_si128) +BINARY_AVX_OP(__m256i,_mm256_or_si256,_mm_or_si128) +BINARY_AVX_OP(__m256i,_mm256_xor_si256,_mm_xor_si128) + + +BINARY_AVX_OP(__m256,_mm256_unpackhi_ps,_mm_unpackhi_ps) +BINARY_AVX_OP(__m256,_mm256_unpacklo_ps,_mm_unpacklo_ps) +TERNARY_AVX_OP(__m256,_mm256_blendv_ps,_mm_blendv_ps) + + +TERNARY_AVX_OP(__m256,_mm256_fmadd_ps,_mm_fmadd_ps) +TERNARY_AVX_OP(__m256,_mm256_fnmadd_ps,_mm_fnmadd_ps) +TERNARY_AVX_OP(__m256,_mm256_fmsub_ps,_mm_fmsub_ps) +TERNARY_AVX_OP(__m256,_mm256_fnmsub_ps,_mm_fnmsub_ps) + + +BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi32,_mm_unpackhi_epi32) +BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi32,_mm_unpacklo_epi32) + + +BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi32,_mm_cmpeq_epi32) +BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi32,_mm_cmpgt_epi32) +BINARY_AVX_OP(__m256,_mm256_cmpeq_ps,_mm_cmpeq_ps) +BINARY_AVX_OP(__m256,_mm256_cmpneq_ps,_mm_cmpneq_ps) +BINARY_AVX_OP(__m256,_mm256_cmpnlt_ps,_mm_cmpnlt_ps) +BINARY_AVX_OP(__m256,_mm256_cmpngt_ps,_mm_cmpngt_ps) +BINARY_AVX_OP(__m256,_mm256_cmpge_ps,_mm_cmpge_ps) +BINARY_AVX_OP(__m256,_mm256_cmpnge_ps,_mm_cmpnge_ps) +BINARY_AVX_OP(__m256,_mm256_cmplt_ps,_mm_cmplt_ps) +BINARY_AVX_OP(__m256,_mm256_cmple_ps,_mm_cmple_ps) +BINARY_AVX_OP(__m256,_mm256_cmpgt_ps,_mm_cmpgt_ps) +BINARY_AVX_OP(__m256,_mm256_cmpnle_ps,_mm_cmpnle_ps) + + +AVX2NEON_ABI +__m256i _mm256_cvtps_epi32 (__m256 a) +{ + __m256i res; + res.lo = _mm_cvtps_epi32(a.lo); + res.hi = _mm_cvtps_epi32(a.hi); + return res; + +} + +AVX2NEON_ABI +__m256i _mm256_cvttps_epi32 (__m256 a) +{ + __m256i res; + res.lo = _mm_cvttps_epi32(a.lo); + res.hi = _mm_cvttps_epi32(a.hi); + return res; + +} + +AVX2NEON_ABI +__m256 _mm256_loadu_ps (float const * mem_addr) +{ + __m256 res; + res.lo = *(__m128 *)(mem_addr + 0); + res.hi = *(__m128 *)(mem_addr + 4); + return res; +} +#define _mm256_load_ps _mm256_loadu_ps + + +AVX2NEON_ABI +int _mm256_testz_ps (const __m256& a, const __m256& b) +{ + __m256 t = a; + if (&a != &b) + t = _mm256_and_ps(a,b); + + __m128i l = vshrq_n_s32(__m128i(t.lo),31); + __m128i h = vshrq_n_s32(__m128i(t.hi),31); + return vaddvq_s32(vaddq_s32(l,h)) == 0; +} + + +AVX2NEON_ABI +__m256i _mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0) +{ + __m256i res; + int64x2_t t0 = {e0,e1}; + int64x2_t t1 = {e2,e3}; + res.lo = __m128i(t0); + res.hi = __m128i(t1); + return res; +} + +AVX2NEON_ABI +__m256d _mm256_setzero_pd () +{ + __m256d res; + res.lo = res.hi = vdupq_n_f64(0); + return res; +} + +AVX2NEON_ABI +int _mm256_movemask_pd (__m256d a) +{ + int res = 0; + uint64x2_t x; + x = uint64x2_t(a.lo); + res |= (x[0] >> 63) ? 1 : 0; + res |= (x[0] >> 63) ? 2 : 0; + x = uint64x2_t(a.hi); + res |= (x[0] >> 63) ? 4 : 0; + res |= (x[0] >> 63) ? 8 : 0; + return res; +} + +AVX2NEON_ABI +__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b) +{ + __m256i res; + res.lo = __m128i(vceqq_s64(int64x2_t(a.lo),int64x2_t(b.lo))); + res.hi = __m128i(vceqq_s64(int64x2_t(a.hi),int64x2_t(b.hi))); + return res; +} + +AVX2NEON_ABI +__m256i _mm256_cmpeq_pd (__m256d a, __m256d b) +{ + __m256i res; + res.lo = __m128i(vceqq_f64(a.lo,b.lo)); + res.hi = __m128i(vceqq_f64(a.hi,b.hi)); + return res; +} + + +AVX2NEON_ABI +int _mm256_testz_pd (const __m256d& a, const __m256d& b) +{ + __m256d t = a; + + if (&a != &b) + t = _mm256_and_pd(a,b); + + return _mm256_movemask_pd(t) == 0; +} + +AVX2NEON_ABI +__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask) +{ + __m256d res; + uint64x2_t t = uint64x2_t(mask.lo); + res.lo[0] = (t[0] >> 63) ? b.lo[0] : a.lo[0]; + res.lo[1] = (t[1] >> 63) ? b.lo[1] : a.lo[1]; + t = uint64x2_t(mask.hi); + res.hi[0] = (t[0] >> 63) ? b.hi[0] : a.hi[0]; + res.hi[1] = (t[1] >> 63) ? b.hi[1] : a.hi[1]; + return res; +} + +template +__m256 __mm256_dp_ps (__m256 a, __m256 b) +{ + __m256 res; + res.lo = _mm_dp_ps(a.lo,b.lo,imm8); + res.hi = _mm_dp_ps(a.hi,b.hi,imm8); + return res; +} + +#define _mm256_dp_ps(a,b,c) __mm256_dp_ps(a,b) + +AVX2NEON_ABI +double _mm256_permute4x64_pd_select(__m256d a, const int imm8) +{ + switch (imm8 & 3) { + case 0: + return a.lo[0]; + case 1: + return a.lo[1]; + case 2: + return a.hi[0]; + case 3: + return a.hi[1]; + } + __builtin_unreachable(); + return 0; +} + +AVX2NEON_ABI +__m256d _mm256_permute4x64_pd (__m256d a, const int imm8) +{ + __m256d res; + res.lo[0] = _mm256_permute4x64_pd_select(a,imm8 >> 0); + res.lo[1] = _mm256_permute4x64_pd_select(a,imm8 >> 2); + res.hi[0] = _mm256_permute4x64_pd_select(a,imm8 >> 4); + res.hi[1] = _mm256_permute4x64_pd_select(a,imm8 >> 6); + + return res; +} + +AVX2NEON_ABI +__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8) +{ + return __m256i(_mm256_insertf128_ps((__m256)a,(__m128)b,imm8)); +} + + +AVX2NEON_ABI +__m256i _mm256_loadu_si256 (__m256i const * mem_addr) +{ + __m256i res; + res.lo = *(__m128i *)((int32_t *)mem_addr + 0); + res.hi = *(__m128i *)((int32_t *)mem_addr + 4); + return res; +} + +#define _mm256_load_si256 _mm256_loadu_si256 + +AVX2NEON_ABI +void _mm256_storeu_ps (float * mem_addr, __m256 a) +{ + *(__m128 *)(mem_addr + 0) = a.lo; + *(__m128 *)(mem_addr + 4) = a.hi; + +} + +#define _mm256_store_ps _mm256_storeu_ps +#define _mm256_stream_ps _mm256_storeu_ps + + +AVX2NEON_ABI +void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a) +{ + *(__m128i *)((int *)mem_addr + 0) = a.lo; + *(__m128i *)((int *)mem_addr + 4) = a.hi; + +} + +#define _mm256_store_si256 _mm256_storeu_si256 + + + +AVX2NEON_ABI +__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask) +{ + __m256 res; + res.lo = _mm_maskload_ps(mem_addr,mask.lo); + res.hi = _mm_maskload_ps(mem_addr + 4,mask.hi); + return res; + +} + + +AVX2NEON_ABI +__m256i _mm256_cvtepu8_epi32 (__m128i a) +{ + __m256i res; + uint8x16_t x = uint8x16_t(a); + for (int i=0;i<4;i++) + { + res.lo[i] = x[i]; + res.hi[i] = x[i+4]; + } + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_cvtepi8_epi32 (__m128i a) +{ + __m256i res; + int8x16_t x = int8x16_t(a); + for (int i=0;i<4;i++) + { + res.lo[i] = x[i]; + res.hi[i] = x[i+4]; + } + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_cvtepu16_epi32 (__m128i a) +{ + __m256i res; + uint16x8_t x = uint16x8_t(a); + for (int i=0;i<4;i++) + { + res.lo[i] = x[i]; + res.hi[i] = x[i+4]; + } + return res; +} + +AVX2NEON_ABI +__m256i _mm256_cvtepi16_epi32 (__m128i a) +{ + __m256i res; + int16x8_t x = int16x8_t(a); + for (int i=0;i<4;i++) + { + res.lo[i] = x[i]; + res.hi[i] = x[i+4]; + } + return res; +} + + + +AVX2NEON_ABI +void _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a) +{ + _mm_maskstore_epi32(mem_addr,mask.lo,a.lo); + _mm_maskstore_epi32(mem_addr + 4,mask.hi,a.hi); +} + +AVX2NEON_ABI +__m256i _mm256_slli_epi32 (__m256i a, int imm8) +{ + __m256i res; + res.lo = _mm_slli_epi32(a.lo,imm8); + res.hi = _mm_slli_epi32(a.hi,imm8); + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_srli_epi32 (__m256i a, int imm8) +{ + __m256i res; + res.lo = _mm_srli_epi32(a.lo,imm8); + res.hi = _mm_srli_epi32(a.hi,imm8); + return res; +} + +AVX2NEON_ABI +__m256i _mm256_srai_epi32 (__m256i a, int imm8) +{ + __m256i res; + res.lo = _mm_srai_epi32(a.lo,imm8); + res.hi = _mm_srai_epi32(a.hi,imm8); + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_sllv_epi32 (__m256i a, __m256i count) +{ + __m256i res; + res.lo = vshlq_s32(a.lo,count.lo); + res.hi = vshlq_s32(a.hi,count.hi); + return res; + +} + + +AVX2NEON_ABI +__m256i _mm256_srav_epi32 (__m256i a, __m256i count) +{ + __m256i res; + res.lo = vshlq_s32(a.lo,vnegq_s32(count.lo)); + res.hi = vshlq_s32(a.hi,vnegq_s32(count.hi)); + return res; + +} + +AVX2NEON_ABI +__m256i _mm256_srlv_epi32 (__m256i a, __m256i count) +{ + __m256i res; + res.lo = __m128i(vshlq_u32(uint32x4_t(a.lo),vnegq_s32(count.lo))); + res.hi = __m128i(vshlq_u32(uint32x4_t(a.hi),vnegq_s32(count.hi))); + return res; + +} + + +AVX2NEON_ABI +__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8) +{ + return __m256i(_mm256_permute2f128_ps(__m256(a),__m256(b),imm8)); +} + + +AVX2NEON_ABI +__m128i _mm256_extractf128_si256 (__m256i a, const int imm8) +{ + if (imm8 & 1) return a.hi; + return a.lo; +} + +AVX2NEON_ABI +__m256 _mm256_set1_ps(float x) +{ + __m256 res; + res.lo = res.hi = vdupq_n_f32(x); + return res; +} + +AVX2NEON_ABI +__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) +{ + __m256 res; + res.lo = _mm_set_ps(e3,e2,e1,e0); + res.hi = _mm_set_ps(e7,e6,e5,e4); + return res; +} + +AVX2NEON_ABI +__m256 _mm256_broadcast_ps (__m128 const * mem_addr) +{ + __m256 res; + res.lo = res.hi = *mem_addr; + return res; +} + +AVX2NEON_ABI +__m256 _mm256_cvtepi32_ps (__m256i a) +{ + __m256 res; + res.lo = _mm_cvtepi32_ps(a.lo); + res.hi = _mm_cvtepi32_ps(a.hi); + return res; +} +AVX2NEON_ABI +void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a) +{ + for (int i=0;i<4;i++) { + if (mask.lo[i] & 0x80000000) mem_addr[i] = a.lo[i]; + if (mask.hi[i] & 0x80000000) mem_addr[i+4] = a.hi[i]; + } +} + +AVX2NEON_ABI +__m256d _mm256_andnot_pd (__m256d a, __m256d b) +{ + __m256d res; + res.lo = float64x2_t(_mm_andnot_ps(__m128(a.lo),__m128(b.lo))); + res.hi = float64x2_t(_mm_andnot_ps(__m128(a.hi),__m128(b.hi))); + return res; +} + +AVX2NEON_ABI +__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8) +{ + __m256 res; + res.lo = _mm_blend_ps(a.lo,b.lo,imm8 & 0xf); + res.hi = _mm_blend_ps(a.hi,b.hi,imm8 >> 4); + return res; + +} + + +AVX2NEON_ABI +__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8) +{ + __m256i res; + res.lo = _mm_blend_epi32(a.lo,b.lo,imm8 & 0xf); + res.hi = _mm_blend_epi32(a.hi,b.hi,imm8 >> 4); + return res; + +} + +AVX2NEON_ABI +__m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale) +{ + __m256i res; + for (int i=0;i<4;i++) + { + res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale)); + res.hi[i] = *(int *)((char *) base_addr + (vindex.hi[i]*scale)); + } + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale) +{ + __m256i res = _mm256_setzero_si256(); + for (int i=0;i<4;i++) + { + if (mask.lo[i] >> 31) res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale)); + if (mask.hi[i] >> 31) res.hi[i] = *(int *)((char *) base_addr + (vindex.hi[i]*scale)); + } + + return res; + +} + + diff --git a/thirdparty/embree-aarch64/common/math/SSE2NEON.h b/thirdparty/embree-aarch64/common/math/SSE2NEON.h new file mode 100644 index 0000000000..2013151d31 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/SSE2NEON.h @@ -0,0 +1,1753 @@ +#ifndef SSE2NEON_H +#define SSE2NEON_H + +// This header file provides a simple API translation layer +// between SSE intrinsics to their corresponding ARM NEON versions +// +// This header file does not (yet) translate *all* of the SSE intrinsics. +// Since this is in support of a specific porting effort, I have only +// included the intrinsics I needed to get my port to work. +// +// Questions/Comments/Feedback send to: jratcliffscarab@gmail.com +// +// If you want to improve or add to this project, send me an +// email and I will probably approve your access to the depot. +// +// Project is located here: +// +// https://github.com/jratcliff63367/sse2neon +// +// Show your appreciation for open source by sending me a bitcoin tip to the following +// address. +// +// TipJar: 1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p : +// https://blockchain.info/address/1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p +// +// +// Contributors to this project are: +// +// John W. Ratcliff : jratcliffscarab@gmail.com +// Brandon Rowlett : browlett@nvidia.com +// Ken Fast : kfast@gdeb.com +// Eric van Beurden : evanbeurden@nvidia.com +// +// +// ********************************************************************************************************************* +// Release notes for January 20, 2017 version: +// +// The unit tests have been refactored. They no longer assert on an error, instead they return a pass/fail condition +// The unit-tests now test 10,000 random float and int values against each intrinsic. +// +// SSE2NEON now supports 95 SSE intrinsics. 39 of them have formal unit tests which have been implemented and +// fully tested on NEON/ARM. The remaining 56 still need unit tests implemented. +// +// A struct is now defined in this header file called 'SIMDVec' which can be used by applications which +// attempt to access the contents of an _m128 struct directly. It is important to note that accessing the __m128 +// struct directly is bad coding practice by Microsoft: @see: https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx +// +// However, some legacy source code may try to access the contents of an __m128 struct directly so the developer +// can use the SIMDVec as an alias for it. Any casting must be done manually by the developer, as you cannot +// cast or otherwise alias the base NEON data type for intrinsic operations. +// +// A bug was found with the _mm_shuffle_ps intrinsic. If the shuffle permutation was not one of the ones with +// a custom/unique implementation causing it to fall through to the default shuffle implementation it was failing +// to return the correct value. This is now fixed. +// +// A bug was found with the _mm_cvtps_epi32 intrinsic. This converts floating point values to integers. +// It was not honoring the correct rounding mode. In SSE the default rounding mode when converting from float to int +// is to use 'round to even' otherwise known as 'bankers rounding'. ARMv7 did not support this feature but ARMv8 does. +// As it stands today, this header file assumes ARMv8. If you are trying to target really old ARM devices, you may get +// a build error. +// +// Support for a number of new intrinsics was added, however, none of them yet have unit-tests to 100% confirm they are +// producing the correct results on NEON. These unit tests will be added as soon as possible. +// +// Here is the list of new instrinsics which have been added: +// +// _mm_cvtss_f32 : extracts the lower order floating point value from the parameter +// _mm_add_ss : adds the scalar single - precision floating point values of a and b +// _mm_div_ps : Divides the four single - precision, floating - point values of a and b. +// _mm_div_ss : Divides the scalar single - precision floating point value of a by b. +// _mm_sqrt_ss : Computes the approximation of the square root of the scalar single - precision floating point value of in. +// _mm_rsqrt_ps : Computes the approximations of the reciprocal square roots of the four single - precision floating point values of in. +// _mm_comilt_ss : Compares the lower single - precision floating point scalar values of a and b using a less than operation +// _mm_comigt_ss : Compares the lower single - precision floating point scalar values of a and b using a greater than operation. +// _mm_comile_ss : Compares the lower single - precision floating point scalar values of a and b using a less than or equal operation. +// _mm_comige_ss : Compares the lower single - precision floating point scalar values of a and b using a greater than or equal operation. +// _mm_comieq_ss : Compares the lower single - precision floating point scalar values of a and b using an equality operation. +// _mm_comineq_s : Compares the lower single - precision floating point scalar values of a and b using an inequality operation +// _mm_unpackhi_epi8 : Interleaves the upper 8 signed or unsigned 8 - bit integers in a with the upper 8 signed or unsigned 8 - bit integers in b. +// _mm_unpackhi_epi16: Interleaves the upper 4 signed or unsigned 16 - bit integers in a with the upper 4 signed or unsigned 16 - bit integers in b. +// +// ********************************************************************************************************************* +/* +** The MIT license: +** +** Permission is hereby granted, free of charge, to any person obtaining a copy +** of this software and associated documentation files (the "Software"), to deal +** in the Software without restriction, including without limitation the rights +** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +** copies of the Software, and to permit persons to whom the Software is furnished +** to do so, subject to the following conditions: +** +** The above copyright notice and this permission notice shall be included in all +** copies or substantial portions of the Software. + +** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +** WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#pragma once + +#define GCC 1 +#define ENABLE_CPP_VERSION 0 + +// enable precise emulation of _mm_min_ps and _mm_max_ps? +// This would slow down the computation a bit, but gives consistent result with x86 SSE2. +// (e.g. would solve a hole or NaN pixel in the rendering result) +#define USE_PRECISE_MINMAX_IMPLEMENTATION (1) + +#if GCC +#define FORCE_INLINE inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#else +#define FORCE_INLINE inline +#define ALIGN_STRUCT(x) __declspec(align(x)) +#endif + +#include +#include "arm_neon.h" +#if defined(__aarch64__) +#include "constants.h" +#endif + + +#if !defined(__has_builtin) +#define __has_builtin(x) (0) +#endif + +/*******************************************************/ +/* MACRO for shuffle parameter for _mm_shuffle_ps(). */ +/* Argument fp3 is a digit[0123] that represents the fp*/ +/* from argument "b" of mm_shuffle_ps that will be */ +/* placed in fp3 of result. fp2 is the same for fp2 in */ +/* result. fp1 is a digit[0123] that represents the fp */ +/* from argument "a" of mm_shuffle_ps that will be */ +/* places in fp1 of result. fp0 is the same for fp0 of */ +/* result */ +/*******************************************************/ +#if defined(__aarch64__) +#define _MN_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), (((fp1)*4)+0), (((fp1)*4)+1), (((fp1)*4)+2), (((fp1)*4)+3), (((fp0)*4)+0), (((fp0)*4)+1), (((fp0)*4)+2), (((fp0)*4)+3) } ) +#define _MF_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), (((fp1)*4)+16+0), (((fp1)*4)+16+1), (((fp1)*4)+16+2), (((fp1)*4)+16+3), (((fp0)*4)+16+0), (((fp0)*4)+16+1), (((fp0)*4)+16+2), (((fp0)*4)+16+3) } ) +#endif + +#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) (((fp3) << 6) | ((fp2) << 4) | \ + ((fp1) << 2) | ((fp0))) + +typedef float32x4_t __m128; +typedef int32x4_t __m128i; + +// union intended to allow direct access to an __m128 variable using the names that the MSVC +// compiler provides. This union should really only be used when trying to access the members +// of the vector as integer values. GCC/clang allow native access to the float members through +// a simple array access operator (in C since 4.6, in C++ since 4.8). +// +// Ideally direct accesses to SIMD vectors should not be used since it can cause a performance +// hit. If it really is needed however, the original __m128 variable can be aliased with a +// pointer to this union and used to access individual components. The use of this union should +// be hidden behind a macro that is used throughout the codebase to access the members instead +// of always declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec +{ + float m128_f32[4]; // as floats - do not to use this. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. + double m128_f64[2]; // as signed double +} SIMDVec; + +// ****************************************** +// CPU stuff +// ****************************************** + +typedef SIMDVec __m128d; + +#include + +#ifndef _MM_MASK_MASK +#define _MM_MASK_MASK 0x1f80 +#define _MM_MASK_DIV_ZERO 0x200 +#define _MM_FLUSH_ZERO_ON 0x8000 +#define _MM_DENORMALS_ZERO_ON 0x40 +#define _MM_MASK_DENORM 0x100 +#endif +#define _MM_SET_EXCEPTION_MASK(x) +#define _MM_SET_FLUSH_ZERO_MODE(x) +#define _MM_SET_DENORMALS_ZERO_MODE(x) + +FORCE_INLINE void _mm_pause() +{ +} + +FORCE_INLINE void _mm_mfence() +{ + __sync_synchronize(); +} + +#define _MM_HINT_T0 3 +#define _MM_HINT_T1 2 +#define _MM_HINT_T2 1 +#define _MM_HINT_NTA 0 + +FORCE_INLINE void _mm_prefetch(const void* ptr, unsigned int level) +{ + __builtin_prefetch(ptr); + +} + +FORCE_INLINE void* _mm_malloc(int size, int align) +{ + void *ptr; + // align must be multiple of sizeof(void *) for posix_memalign. + if (align < sizeof(void *)) { + align = sizeof(void *); + } + + if ((align % sizeof(void *)) != 0) { + // fallback to malloc + ptr = malloc(size); + } else { + if (posix_memalign(&ptr, align, size)) { + return 0; + } + } + + return ptr; +} + +FORCE_INLINE void _mm_free(void* ptr) +{ + free(ptr); +} + +FORCE_INLINE int _mm_getcsr() +{ + return 0; +} + +FORCE_INLINE void _mm_setcsr(int val) +{ + return; +} + +// ****************************************** +// Set/get methods +// ****************************************** + +// extracts the lower order floating point value from the parameter : https://msdn.microsoft.com/en-us/library/bb514059%28v=vs.120%29.aspx?f=255&MSPPError=-2147217396 +#if defined(__aarch64__) +FORCE_INLINE float _mm_cvtss_f32(const __m128& x) +{ + return x[0]; +} +#else +FORCE_INLINE float _mm_cvtss_f32(__m128 a) +{ + return vgetq_lane_f32(a, 0); +} +#endif + +// Sets the 128-bit value to zero https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx +FORCE_INLINE __m128i _mm_setzero_si128() +{ + return vdupq_n_s32(0); +} + +// Clears the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setzero_ps(void) +{ + return vdupq_n_f32(0); +} + +// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set1_ps(float _w) +{ + return vdupq_n_f32(_w); +} + +// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps1(float _w) +{ + return vdupq_n_f32(_w); +} + +// Sets the four single-precision, floating-point values to the four inputs. https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx +#if defined(__aarch64__) +FORCE_INLINE __m128 _mm_set_ps(const float w, const float z, const float y, const float x) +{ + float32x4_t t = { x, y, z, w }; + return t; +} + +// Sets the four single-precision, floating-point values to the four inputs in reverse order. https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setr_ps(const float w, const float z , const float y , const float x ) +{ + float32x4_t t = { w, z, y, x }; + return t; +} +#else +FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) +{ + float __attribute__((aligned(16))) data[4] = { x, y, z, w }; + return vld1q_f32(data); +} + +// Sets the four single-precision, floating-point values to the four inputs in reverse order. https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setr_ps(float w, float z , float y , float x ) +{ + float __attribute__ ((aligned (16))) data[4] = { w, z, y, x }; + return vld1q_f32(data); +} +#endif + +// Sets the 4 signed 32-bit integer values to i. https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set1_epi32(int _i) +{ + return vdupq_n_s32(_i); +} + +//Set the first lane to of 4 signed single-position, floating-point number to w +#if defined(__aarch64__) +FORCE_INLINE __m128 _mm_set_ss(float _w) +{ + float32x4_t res = {_w, 0, 0, 0}; + return res; +} + +// Sets the 4 signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) +{ + int32x4_t t = {i0,i1,i2,i3}; + return t; +} +#else +FORCE_INLINE __m128 _mm_set_ss(float _w) +{ + __m128 val = _mm_setzero_ps(); + return vsetq_lane_f32(_w, val, 0); +} + +// Sets the 4 signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) +{ + int32_t __attribute__((aligned(16))) data[4] = { i0, i1, i2, i3 }; + return vld1q_s32(data); +} +#endif + +// Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx +FORCE_INLINE void _mm_store_ps(float *p, __m128 a) +{ + vst1q_f32(p, a); +} + +// Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx +FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) +{ + vst1q_f32(p, a); +} + +FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t*) p,a); +} + +// Stores four 32-bit integer values as (as a __m128i value) at the address p. https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx +FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a ) +{ + vst1q_s32((int32_t*) p,a); +} + +// Stores the lower single - precision, floating - point value. https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx +FORCE_INLINE void _mm_store_ss(float *p, __m128 a) +{ + vst1q_lane_f32(p, a, 0); +} + +// Reads the lower 64 bits of b and stores them into the lower 64 bits of a. https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx +FORCE_INLINE void _mm_storel_epi64(__m128i* a, __m128i b) +{ + *a = (__m128i)vsetq_lane_s64((int64_t)vget_low_s32(b), *(int64x2_t*)a, 0); +} + +// Loads a single single-precision, floating-point value, copying it into all four words https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load1_ps(const float * p) +{ + return vld1q_dup_f32(p); +} + +// Loads four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load_ps(const float * p) +{ + return vld1q_f32(p); +} + +// Loads four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_loadu_ps(const float * p) +{ + // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are equivalent for neon + return vld1q_f32(p); +} + +// Loads an single - precision, floating - point value into the low word and clears the upper three words. https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_load_ss(const float * p) +{ + __m128 result = vdupq_n_f32(0); + return vsetq_lane_f32(*p, result, 0); +} + +FORCE_INLINE __m128i _mm_loadu_si128(__m128i *p) +{ + return (__m128i)vld1q_s32((const int32_t*) p); +} + + +// ****************************************** +// Logic/Binary operations +// ****************************************** + +// Compares for inequality. https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) +{ + return (__m128)vmvnq_s32((__m128i)vceqq_f32(a, b)); +} + +// Computes the bitwise AND-NOT of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx +FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) +{ + return (__m128)vbicq_s32((__m128i)b, (__m128i)a); // *NOTE* argument swap +} + +// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the 128-bit value in a. https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ + return (__m128i)vbicq_s32(b, a); // *NOTE* argument swap +} + +// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) +{ + return (__m128i)vandq_s32(a, b); +} + +// Computes the bitwise AND of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) +{ + return (__m128)vandq_s32((__m128i)a, (__m128i)b); +} + +// Computes the bitwise OR of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx +FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) +{ + return (__m128)vorrq_s32((__m128i)a, (__m128i)b); +} + +// Computes bitwise EXOR (exclusive-or) of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) +{ + return (__m128)veorq_s32((__m128i)a, (__m128i)b); +} + +// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx +FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) +{ + return (__m128i)vorrq_s32(a, b); +} + +// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx +FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return veorq_s32(a, b); +} + +// NEON does not provide this method +// Creates a 4-bit mask from the most significant bits of the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx +FORCE_INLINE int _mm_movemask_ps(__m128 a) +{ +#if ENABLE_CPP_VERSION // I am not yet convinced that the NEON version is faster than the C version of this + uint32x4_t &ia = *(uint32x4_t *)&a; + return (ia[0] >> 31) | ((ia[1] >> 30) & 2) | ((ia[2] >> 29) & 4) | ((ia[3] >> 28) & 8); +#else + +#if defined(__aarch64__) + uint32x4_t t2 = vandq_u32(vreinterpretq_u32_f32(a), embree::movemask_mask); + return vaddvq_u32(t2); +#else + static const uint32x4_t movemask = { 1, 2, 4, 8 }; + static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; + uint32x4_t t0 = vreinterpretq_u32_f32(a); + uint32x4_t t1 = vtstq_u32(t0, highbit); + uint32x4_t t2 = vandq_u32(t1, movemask); + uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2)); + return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1); +#endif + +#endif +} + +#if defined(__aarch64__) +FORCE_INLINE int _mm_movemask_popcnt_ps(__m128 a) +{ + uint32x4_t t2 = vandq_u32(vreinterpretq_u32_f32(a), embree::movemask_mask); + t2 = vreinterpretq_u32_u8(vcntq_u8(vreinterpretq_u8_u32(t2))); + return vaddvq_u32(t2); + +} +#endif + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of b and places it into the high end of the result. +FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) +{ + return vcombine_f32(vget_high_f32(a), vget_low_f32(b)); +} + +// takes the lower two 32-bit values from a and swaps them and places in high end of result +// takes the higher two 32 bit values from b and swaps them and places in low end of result. +FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) +{ + return vcombine_f32(vrev64_f32(vget_low_f32(a)), vrev64_f32(vget_high_f32(b))); +} + +// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the high +FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) +{ + return vcombine_f32(vget_low_f32(a), vget_high_f32(b)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) +{ + return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 1)), vdup_n_f32(vgetq_lane_f32(b, 0))); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) +{ + return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 2)), vdup_n_f32(vgetq_lane_f32(b, 0))); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) +{ + return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 0)), vdup_n_f32(vgetq_lane_f32(b, 2))); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) +{ + float32_t a0 = vgetq_lane_f32(a, 0); + float32_t a2 = vgetq_lane_f32(a, 2); + float32x2_t aVal = vdup_n_f32(a2); + aVal = vset_lane_f32(a0, aVal, 1); + return vcombine_f32(aVal, vget_high_f32(b)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) +{ + return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 3)), vdup_n_f32(vgetq_lane_f32(b, 1))); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) +{ + float32_t b0 = vgetq_lane_f32(b, 0); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t bVal = vdup_n_f32(b0); + bVal = vset_lane_f32(b2, bVal, 1); + return vcombine_f32(vget_low_f32(a), bVal); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) +{ + float32_t b0 = vgetq_lane_f32(b, 0); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t bVal = vdup_n_f32(b0); + bVal = vset_lane_f32(b2, bVal, 1); + return vcombine_f32(vrev64_f32(vget_low_f32(a)), bVal); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) +{ + float32_t b0 = vgetq_lane_f32(b, 0); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t bVal = vdup_n_f32(b0); + bVal = vset_lane_f32(b2, bVal, 1); + return vcombine_f32(vget_high_f32(a), bVal); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) +{ + float32x2_t a21 = vget_high_f32(vextq_f32(a, a, 3)); + float32x2_t b03 = vget_low_f32(vextq_f32(b, b, 3)); + return vcombine_f32(a21, b03); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) +{ + float32x2_t a03 = vget_low_f32(vextq_f32(a, a, 3)); + float32x2_t b21 = vget_high_f32(vextq_f32(b, b, 3)); + return vcombine_f32(a03, b21); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(a); + float32x2_t b10 = vget_low_f32(b); + return vcombine_f32(a10, b10); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(a)); + float32x2_t b10 = vget_low_f32(b); + return vcombine_f32(a01, b10); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(a)); + float32x2_t b01 = vrev64_f32(vget_low_f32(b)); + return vcombine_f32(a01, b01); +} + +// NEON does not support a general purpose permute intrinsic +// Currently I am not sure whether the C implementation is faster or slower than the NEON version. +// Note, this has to be expanded as a template because the shuffle value must be an immediate value. +// The same is true on SSE as well. +// Selects four specific single-precision, floating-point values from a and b, based on the mask i. https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx +template +FORCE_INLINE __m128 _mm_shuffle_ps_default(const __m128& a, const __m128& b) +{ +#if ENABLE_CPP_VERSION // I am not convinced that the NEON version is faster than the C version yet. + __m128 ret; + ret[0] = a[i & 0x3]; + ret[1] = a[(i >> 2) & 0x3]; + ret[2] = b[(i >> 4) & 0x03]; + ret[3] = b[(i >> 6) & 0x03]; + return ret; +#else +# if __has_builtin(__builtin_shufflevector) + return __builtin_shufflevector( \ + a, b, (i) & (0x3), ((i) >> 2) & 0x3, + (((i) >> 4) & 0x3) + 4, (((i) >> 6) & 0x3) + 4); +# else + const int i0 = (i >> 0)&0x3; + const int i1 = (i >> 2)&0x3; + const int i2 = (i >> 4)&0x3; + const int i3 = (i >> 6)&0x3; + + if (&a == &b) + { + if (i0 == i1 && i0 == i2 && i0 == i3) + { + return (float32x4_t)vdupq_laneq_f32(a,i0); + } + static const uint8_t tbl[16] = { + (i0*4) + 0,(i0*4) + 1,(i0*4) + 2,(i0*4) + 3, + (i1*4) + 0,(i1*4) + 1,(i1*4) + 2,(i1*4) + 3, + (i2*4) + 0,(i2*4) + 1,(i2*4) + 2,(i2*4) + 3, + (i3*4) + 0,(i3*4) + 1,(i3*4) + 2,(i3*4) + 3 + }; + + return (float32x4_t)vqtbl1q_s8(int8x16_t(b),*(uint8x16_t *)tbl); + + } + else + { + + static const uint8_t tbl[16] = { + (i0*4) + 0,(i0*4) + 1,(i0*4) + 2,(i0*4) + 3, + (i1*4) + 0,(i1*4) + 1,(i1*4) + 2,(i1*4) + 3, + (i2*4) + 0 + 16,(i2*4) + 1 + 16,(i2*4) + 2 + 16,(i2*4) + 3 + 16, + (i3*4) + 0 + 16,(i3*4) + 1 + 16,(i3*4) + 2 + 16,(i3*4) + 3 + 16 + }; + + return float32x4_t(vqtbl2q_s8((int8x16x2_t){int8x16_t(a),int8x16_t(b)},*(uint8x16_t *)tbl)); + } +# endif //builtin(shufflevector) +#endif +} + +template +FORCE_INLINE __m128 _mm_shuffle_ps_function(const __m128& a, const __m128& b) +{ + switch (i) + { + case _MM_SHUFFLE(1, 0, 3, 2): + return _mm_shuffle_ps_1032(a, b); + break; + case _MM_SHUFFLE(2, 3, 0, 1): + return _mm_shuffle_ps_2301(a, b); + break; + case _MM_SHUFFLE(3, 2, 1, 0): + return _mm_shuffle_ps_3210(a, b); + break; + case _MM_SHUFFLE(0, 0, 1, 1): + return _mm_shuffle_ps_0011(a, b); + break; + case _MM_SHUFFLE(0, 0, 2, 2): + return _mm_shuffle_ps_0022(a, b); + break; + case _MM_SHUFFLE(2, 2, 0, 0): + return _mm_shuffle_ps_2200(a, b); + break; + case _MM_SHUFFLE(3, 2, 0, 2): + return _mm_shuffle_ps_3202(a, b); + break; + case _MM_SHUFFLE(1, 1, 3, 3): + return _mm_shuffle_ps_1133(a, b); + break; + case _MM_SHUFFLE(2, 0, 1, 0): + return _mm_shuffle_ps_2010(a, b); + break; + case _MM_SHUFFLE(2, 0, 0, 1): + return _mm_shuffle_ps_2001(a, b); + break; + case _MM_SHUFFLE(2, 0, 3, 2): + return _mm_shuffle_ps_2032(a, b); + break; + case _MM_SHUFFLE(0, 3, 2, 1): + return _mm_shuffle_ps_0321(a, b); + break; + case _MM_SHUFFLE(2, 1, 0, 3): + return _mm_shuffle_ps_2103(a, b); + break; + case _MM_SHUFFLE(1, 0, 1, 0): + return _mm_shuffle_ps_1010(a, b); + break; + case _MM_SHUFFLE(1, 0, 0, 1): + return _mm_shuffle_ps_1001(a, b); + break; + case _MM_SHUFFLE(0, 1, 0, 1): + return _mm_shuffle_ps_0101(a, b); + break; + } + return _mm_shuffle_ps_default(a, b); +} + +# if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_ps(a,b,i) _mm_shuffle_ps_default(a,b) +# else +#define _mm_shuffle_ps(a,b,i) _mm_shuffle_ps_function(a,b) +#endif + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of b and places it into the high end of the result. +FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a, __m128i b) +{ + return vcombine_s32(vget_high_s32(a), vget_low_s32(b)); +} + +// takes the lower two 32-bit values from a and swaps them and places in low end of result +// takes the higher two 32 bit values from b and swaps them and places in high end of result. +FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a, __m128i b) +{ + return vcombine_s32(vrev64_s32(vget_low_s32(a)), vrev64_s32(vget_high_s32(b))); +} + +// shift a right by 32 bits, and put the lower 32 bits of a into the upper 32 bits of b +// when a and b are the same, rotates the least significant 32 bits into the most signficant 32 bits, and shifts the rest down +FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a, __m128i b) +{ + return vextq_s32(a, b, 1); +} + +// shift a left by 32 bits, and put the upper 32 bits of b into the lower 32 bits of a +// when a and b are the same, rotates the most significant 32 bits into the least signficant 32 bits, and shifts the rest up +FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a, __m128i b) +{ + return vextq_s32(a, b, 3); +} + +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of b and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a, __m128i b) +{ + return vcombine_s32(vget_low_s32(a), vget_low_s32(a)); +} + +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of b, swaps the 0 and 1 elements, and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a, __m128i b) +{ + return vcombine_s32(vrev64_s32(vget_low_s32(a)), vget_low_s32(b)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the upper 64 bits +// gets the lower 64 bits of b, swaps the 0 and 1 elements, and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a, __m128i b) +{ + return vcombine_s32(vrev64_s32(vget_low_s32(a)), vrev64_s32(vget_low_s32(b))); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a, __m128i b) +{ + return vcombine_s32(vdup_n_s32(vgetq_lane_s32(a, 1)), vdup_n_s32(vgetq_lane_s32(b, 2))); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a, __m128i b) +{ + return vcombine_s32(vdup_n_s32(vgetq_lane_s32(a, 2)), vrev64_s32(vget_low_s32(b))); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a, __m128i b) +{ + return vcombine_s32(vget_high_s32(a), vdup_n_s32(vgetq_lane_s32(b, 3))); +} + +template +FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a, __m128i b) +{ +#if ENABLE_CPP_VERSION + __m128i ret; + ret[0] = a[i & 0x3]; + ret[1] = a[(i >> 2) & 0x3]; + ret[2] = b[(i >> 4) & 0x03]; + ret[3] = b[(i >> 6) & 0x03]; + return ret; +#else + __m128i ret = vmovq_n_s32(vgetq_lane_s32(a, i & 0x3)); + ret = vsetq_lane_s32(vgetq_lane_s32(a, (i >> 2) & 0x3), ret, 1); + ret = vsetq_lane_s32(vgetq_lane_s32(b, (i >> 4) & 0x3), ret, 2); + ret = vsetq_lane_s32(vgetq_lane_s32(b, (i >> 6) & 0x3), ret, 3); + return ret; +#endif +} + +template +FORCE_INLINE __m128i _mm_shuffle_epi32_function(__m128i a, __m128i b) +{ + switch (i) + { + case _MM_SHUFFLE(1, 0, 3, 2): return _mm_shuffle_epi_1032(a, b); break; + case _MM_SHUFFLE(2, 3, 0, 1): return _mm_shuffle_epi_2301(a, b); break; + case _MM_SHUFFLE(0, 3, 2, 1): return _mm_shuffle_epi_0321(a, b); break; + case _MM_SHUFFLE(2, 1, 0, 3): return _mm_shuffle_epi_2103(a, b); break; + case _MM_SHUFFLE(1, 0, 1, 0): return _mm_shuffle_epi_1010(a, b); break; + case _MM_SHUFFLE(1, 0, 0, 1): return _mm_shuffle_epi_1001(a, b); break; + case _MM_SHUFFLE(0, 1, 0, 1): return _mm_shuffle_epi_0101(a, b); break; + case _MM_SHUFFLE(2, 2, 1, 1): return _mm_shuffle_epi_2211(a, b); break; + case _MM_SHUFFLE(0, 1, 2, 2): return _mm_shuffle_epi_0122(a, b); break; + case _MM_SHUFFLE(3, 3, 3, 2): return _mm_shuffle_epi_3332(a, b); break; + default: return _mm_shuffle_epi32_default(a, b); + } +} + +template +FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a) +{ + return vdupq_n_s32(vgetq_lane_s32(a, i)); +} + +template +FORCE_INLINE __m128i _mm_shuffle_epi32_single(__m128i a) +{ + switch (i) + { + case _MM_SHUFFLE(0, 0, 0, 0): return _mm_shuffle_epi32_splat<0>(a); break; + case _MM_SHUFFLE(1, 1, 1, 1): return _mm_shuffle_epi32_splat<1>(a); break; + case _MM_SHUFFLE(2, 2, 2, 2): return _mm_shuffle_epi32_splat<2>(a); break; + case _MM_SHUFFLE(3, 3, 3, 3): return _mm_shuffle_epi32_splat<3>(a); break; + default: return _mm_shuffle_epi32_function(a, a); + } +} + +// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx +#define _mm_shuffle_epi32(a,i) _mm_shuffle_epi32_single(a) + +template +FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a) +{ + int16x8_t ret = (int16x8_t)a; + int16x4_t highBits = vget_high_s16(ret); + ret = vsetq_lane_s16(vget_lane_s16(highBits, i & 0x3), ret, 4); + ret = vsetq_lane_s16(vget_lane_s16(highBits, (i >> 2) & 0x3), ret, 5); + ret = vsetq_lane_s16(vget_lane_s16(highBits, (i >> 4) & 0x3), ret, 6); + ret = vsetq_lane_s16(vget_lane_s16(highBits, (i >> 6) & 0x3), ret, 7); + return (__m128i)ret; +} + +// Shuffles the upper 4 signed or unsigned 16 - bit integers in a as specified by imm. https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx +#define _mm_shufflehi_epi16(a,i) _mm_shufflehi_epi16_function(a) + +// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while shifting in zeros. : https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx +//#define _mm_slli_epi32(a, imm) (__m128i)vshlq_n_s32(a,imm) + +// Based on SIMDe +FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, const int imm8) +{ +#if defined(__aarch64__) + const int32x4_t s = vdupq_n_s32(imm8); + return vshlq_s32(a, s); +#else + int32_t __attribute__((aligned(16))) data[4]; + vst1q_s32(data, a); + const int s = (imm8 > 31) ? 0 : imm8; + data[0] = data[0] << s; + data[1] = data[1] << s; + data[2] = data[2] << s; + data[3] = data[3] << s; + + return vld1q_s32(data); +#endif +} + + +//Shifts the 4 signed or unsigned 32-bit integers in a right by count bits while shifting in zeros. https://msdn.microsoft.com/en-us/library/w486zcfa(v=vs.100).aspx +//#define _mm_srli_epi32( a, imm ) (__m128i)vshrq_n_u32((uint32x4_t)a, imm) + +// Based on SIMDe +FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, const int imm8) +{ +#if defined(__aarch64__) + const int shift = (imm8 > 31) ? 0 : imm8; // Unfortunately, we need to check for this case for embree. + const int32x4_t s = vdupq_n_s32(-shift); + return vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(a), s)); +#else + int32_t __attribute__((aligned(16))) data[4]; + vst1q_s32(data, a); + + const int s = (imm8 > 31) ? 0 : imm8; + + data[0] = data[0] >> s; + data[1] = data[1] >> s; + data[2] = data[2] >> s; + data[3] = data[3] >> s; + + return vld1q_s32(data); +#endif +} + + +// Shifts the 4 signed 32 - bit integers in a right by count bits while shifting in the sign bit. https://msdn.microsoft.com/en-us/library/z1939387(v=vs.100).aspx +//#define _mm_srai_epi32( a, imm ) vshrq_n_s32(a, imm) + +// Based on SIMDe +FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, const int imm8) +{ +#if defined(__aarch64__) + const int32x4_t s = vdupq_n_s32(-imm8); + return vshlq_s32(a, s); +#else + int32_t __attribute__((aligned(16))) data[4]; + vst1q_s32(data, a); + const uint32_t m = (uint32_t) ((~0U) << (32 - imm8)); + + for (int i = 0; i < 4; i++) { + uint32_t is_neg = ((uint32_t) (((data[i]) >> 31))); + data[i] = (data[i] >> imm8) | (m * is_neg); + } + + return vld1q_s32(data); +#endif +} + +// Shifts the 128 - bit value in a right by imm bytes while shifting in zeros.imm must be an immediate. https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx +//#define _mm_srli_si128( a, imm ) (__m128i)vmaxq_s8((int8x16_t)a, vextq_s8((int8x16_t)a, vdupq_n_s8(0), imm)) +#define _mm_srli_si128( a, imm ) (__m128i)vextq_s8((int8x16_t)a, vdupq_n_s8(0), (imm)) + +// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm must be an immediate. https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx +#define _mm_slli_si128( a, imm ) (__m128i)vextq_s8(vdupq_n_s8(0), (int8x16_t)a, 16 - (imm)) + +// NEON does not provide a version of this function, here is an article about some ways to repro the results. +// http://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon +// Creates a 16-bit mask from the most significant bits of the 16 signed or unsigned 8-bit integers in a and zero extends the upper bits. https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx +FORCE_INLINE int _mm_movemask_epi8(__m128i _a) +{ + uint8x16_t input = (uint8x16_t)_a; + const int8_t __attribute__((aligned(16))) xr[8] = { -7, -6, -5, -4, -3, -2, -1, 0 }; + uint8x8_t mask_and = vdup_n_u8(0x80); + int8x8_t mask_shift = vld1_s8(xr); + + uint8x8_t lo = vget_low_u8(input); + uint8x8_t hi = vget_high_u8(input); + + lo = vand_u8(lo, mask_and); + lo = vshl_u8(lo, mask_shift); + + hi = vand_u8(hi, mask_and); + hi = vshl_u8(hi, mask_shift); + + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + + return ((hi[0] << 8) | (lo[0] & 0xFF)); +} + + +// ****************************************** +// Math operations +// ****************************************** + +// Subtracts the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) +{ + return vsubq_f32(a, b); +} + +FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) +{ + return vsubq_f32(a, b); +} + +// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or unsigned 32-bit integers of a. https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx +FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) +{ + return vsubq_s32(a, b); +} + +// Adds the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx +FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) +{ + return vaddq_f32(a, b); +} + +// adds the scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx +FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) +{ + const float32_t b0 = vgetq_lane_f32(b, 0); + float32x4_t value = vdupq_n_f32(0); + + //the upper values in the result must be the remnants of . + value = vsetq_lane_f32(b0, value, 0); + return vaddq_f32(a, value); +} + +// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or unsigned 32-bit integers in b. https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx +FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) +{ + return vaddq_s32(a, b); +} + +// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx +FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) +{ + return (__m128i)vaddq_s16((int16x8_t)a, (int16x8_t)b); +} + +// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or unsigned 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) +{ + return (__m128i)vmulq_s16((int16x8_t)a, (int16x8_t)b); +} + +// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or unsigned 32-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi32 (__m128i a, __m128i b) +{ + return (__m128i)vmulq_s32((int32x4_t)a,(int32x4_t)b); +} + +// Multiplies the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx +FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return vmulq_f32(a, b); +} + +FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) +{ + return vmulq_f32(a, b); +} + +// Computes the approximations of reciprocals of the four single-precision, floating-point values of a. https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx +FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) +{ +#if defined(BUILD_IOS) + return vdivq_f32(vdupq_n_f32(1.0f),in); + +#endif + // Get an initial estimate of 1/in. + float32x4_t reciprocal = vrecpeq_f32(in); + + // We only return estimated 1/in. + // Newton-Raphon iteration shold be done in the outside of _mm_rcp_ps(). + + // TODO(LTE): We could delete these ifdef? + reciprocal = vmulq_f32(vrecpsq_f32(in, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(in, reciprocal), reciprocal); + return reciprocal; + +} + +FORCE_INLINE __m128 _mm_rcp_ss(__m128 in) +{ + float32x4_t value; + float32x4_t result = in; + + value = _mm_rcp_ps(in); + return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); +} + +// Divides the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx +FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) +{ +#if defined(BUILD_IOS) + return vdivq_f32(a,b); +#else + float32x4_t reciprocal = _mm_rcp_ps(b); + + reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); + + // Add one more round of newton-raphson since NEON's reciprocal estimation has less accuracy compared to SSE2's rcp. + reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); + + // Another round for safety + reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); + + + return vmulq_f32(a, reciprocal); +#endif +} + +// Divides the scalar single-precision floating point value of a by b. https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx +FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) +{ + float32x4_t value; + float32x4_t result = a; + value = _mm_div_ps(a, b); + return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); +} + +// Computes the approximations of the reciprocal square roots of the four single-precision floating point values of in. https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx +FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) +{ + + float32x4_t value = vrsqrteq_f32(in); + + // TODO: We must debug and ensure that rsqrt(0) and rsqrt(-0) yield proper values. + // Related code snippets can be found here: https://cpp.hotexamples.com/examples/-/-/vrsqrteq_f32/cpp-vrsqrteq_f32-function-examples.html + // If we adapt this function, we might be able to avoid special zero treatment in _mm_sqrt_ps + + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value)); + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value)); + + // one more round to get better precision + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value)); + + // another round for safety + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value)); + + return value; +} + +FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) +{ + float32x4_t result = in; + + __m128 value = _mm_rsqrt_ps(in); + + return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); +} + + +// Computes the approximations of square roots of the four single-precision, floating-point values of a. First computes reciprocal square roots and then reciprocals of the four values. https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) +{ +#if defined(BUILD_IOS) + return vsqrtq_f32(in); +#else + __m128 reciprocal = _mm_rsqrt_ps(in); + + // We must treat sqrt(in == 0) in a special way. At this point reciprocal contains gargabe due to vrsqrteq_f32(0) returning +inf. + // We assign 0 to reciprocal wherever required. + const float32x4_t vzero = vdupq_n_f32(0.0f); + const uint32x4_t mask = vceqq_f32(in, vzero); + reciprocal = vbslq_f32(mask, vzero, reciprocal); + + // sqrt(x) = x * (1 / sqrt(x)) + return vmulq_f32(in, reciprocal); +#endif +} + +// Computes the approximation of the square root of the scalar single-precision floating point value of in. https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) +{ + float32x4_t value; + float32x4_t result = in; + + value = _mm_sqrt_ps(in); + return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); +} + + +// Computes the maximums of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) +{ +#if USE_PRECISE_MINMAX_IMPLEMENTATION + return vbslq_f32(vcltq_f32(b,a),a,b); +#else + // Faster, but would give inconsitent rendering(e.g. holes, NaN pixels) + return vmaxq_f32(a, b); +#endif +} + +// Computes the minima of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) +{ +#if USE_PRECISE_MINMAX_IMPLEMENTATION + return vbslq_f32(vcltq_f32(a,b),a,b); +#else + // Faster, but would give inconsitent rendering(e.g. holes, NaN pixels) + return vminq_f32(a, b); +#endif +} + +// Computes the maximum of the two lower scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) +{ + float32x4_t value; + float32x4_t result = a; + + value = _mm_max_ps(a, b); + return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); +} + +// Computes the minimum of the two lower scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) +{ + float32x4_t value; + float32x4_t result = a; + + + value = _mm_min_ps(a, b); + return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); +} + +// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx +FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) +{ + return (__m128i)vminq_s16((int16x8_t)a, (int16x8_t)b); +} + +// epi versions of min/max +// Computes the pariwise maximums of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b ) +{ + return vmaxq_s32(a,b); +} + +// Computes the pariwise minima of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx +FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b ) +{ + return vminq_s32(a,b); +} + +// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) +{ + int16x8_t ret = vqdmulhq_s16((int16x8_t)a, (int16x8_t)b); + ret = vshrq_n_s16(ret, 1); + return (__m128i)ret; +} + +// Computes pairwise add of each argument as single-precision, floating-point values a and b. +//https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx +FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b ) +{ +#if defined(__aarch64__) + return vpaddq_f32(a,b); +#else +// This does not work, no vpaddq... +// return (__m128) vpaddq_f32(a,b); + // + // get two f32x2_t values from a + // do vpadd + // put result in low half of f32x4 result + // + // get two f32x2_t values from b + // do vpadd + // put result in high half of f32x4 result + // + // combine + return vcombine_f32( vpadd_f32( vget_low_f32(a), vget_high_f32(a) ), vpadd_f32( vget_low_f32(b), vget_high_f32(b) ) ); +#endif +} + +// ****************************************** +// Compare operations +// ****************************************** + +// Compares for less than https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) +{ + return (__m128)vcltq_f32(a, b); +} + +FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) +{ + return (__m128) vmvnq_s32((__m128i)_mm_cmplt_ps(a,b)); +} + +// Compares for greater than. https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) +{ + return (__m128)vcgtq_f32(a, b); +} + +FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) +{ + return (__m128) _mm_cmpgt_ps(a,b); +} + + +// Compares for greater than or equal. https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) +{ + return (__m128)vcgeq_f32(a, b); +} + +// Compares for less than or equal. https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) +{ + return (__m128)vcleq_f32(a, b); +} + +// Compares for equality. https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) +{ + return (__m128)vceqq_f32(a, b); +} + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for less than. https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) +{ + return (__m128i)vcltq_s32(a, b); +} + +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) +{ + return (__m128i) vceqq_s32(a,b); +} + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for greater than. https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) +{ + return (__m128i)vcgtq_s32(a, b); +} + +// Compares the four 32-bit floats in a and b to check if any values are NaN. Ordered compare between each value returns true for "orderable" and false for "not orderable" (NaN). https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx +// see also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b ) +{ + // Note: NEON does not have ordered compare builtin + // Need to compare a eq a and b eq b to check for NaN + // Do AND of results to get final + return (__m128) vreinterpretq_f32_u32( vandq_u32( vceqq_f32(a,a), vceqq_f32(b,b) ) ); +} + +// Compares the lower single-precision floating point scalar values of a and b using a less than operation. : https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx +FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) +{ + uint32x4_t value; + + value = vcltq_f32(a, b); + return vgetq_lane_u32(value, 0); +} + +// Compares the lower single-precision floating point scalar values of a and b using a greater than operation. : https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx +FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) +{ + uint32x4_t value; + + value = vcgtq_f32(a, b); + return vgetq_lane_u32(value, 0); +} + +// Compares the lower single-precision floating point scalar values of a and b using a less than or equal operation. : https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx +FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) +{ + uint32x4_t value; + + value = vcleq_f32(a, b); + return vgetq_lane_u32(value, 0); +} + +// Compares the lower single-precision floating point scalar values of a and b using a greater than or equal operation. : https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx +FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) +{ + uint32x4_t value; + + value = vcgeq_f32(a, b); + return vgetq_lane_u32(value, 0); +} + +// Compares the lower single-precision floating point scalar values of a and b using an equality operation. : https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx +FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) +{ + uint32x4_t value; + + value = vceqq_f32(a, b); + return vgetq_lane_u32(value, 0); +} + +// Compares the lower single-precision floating point scalar values of a and b using an inequality operation. : https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx +FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) +{ + uint32x4_t value; + + value = vceqq_f32(a, b); + return !vgetq_lane_u32(value, 0); +} + +// according to the documentation, these intrinsics behave the same as the non-'u' versions. We'll just alias them here. +#define _mm_ucomilt_ss _mm_comilt_ss +#define _mm_ucomile_ss _mm_comile_ss +#define _mm_ucomigt_ss _mm_comigt_ss +#define _mm_ucomige_ss _mm_comige_ss +#define _mm_ucomieq_ss _mm_comieq_ss +#define _mm_ucomineq_ss _mm_comineq_ss + +// ****************************************** +// Conversions +// ****************************************** + +// Converts the four single-precision, floating-point values of a to signed 32-bit integer values using truncate. https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) +{ + return vcvtq_s32_f32(a); +} + +// Converts the four signed 32-bit integer values of a to single-precision, floating-point values https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) +{ + return vcvtq_f32_s32(a); +} + +// Converts the four single-precision, floating-point values of a to signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx +// *NOTE*. The default rounding mode on SSE is 'round to even', which ArmV7 does not support! +// It is supported on ARMv8 however. +FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) +{ +#if 1 + return vcvtnq_s32_f32(a); +#else + __m128 half = vdupq_n_f32(0.5f); + const __m128 sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(a), 31))); + const __m128 aPlusHalf = vaddq_f32(a, half); + const __m128 aRound = vsubq_f32(aPlusHalf, sign); + return vcvtq_s32_f32(aRound); +#endif +} + +// Moves the least significant 32 bits of a to a 32-bit integer. https://msdn.microsoft.com/en-us/library/5z7a9642%28v=vs.90%29.aspx +FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) +{ + return vgetq_lane_s32(a, 0); +} + +// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, zero extending the upper bits. https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) +{ + __m128i result = vdupq_n_s32(0); + return vsetq_lane_s32(a, result, 0); +} + + +// Applies a type cast to reinterpret four 32-bit floating point values passed in as a 128-bit parameter as packed 32-bit integers. https://msdn.microsoft.com/en-us/library/bb514099.aspx +FORCE_INLINE __m128i _mm_castps_si128(__m128 a) +{ +#if defined(__aarch64__) + return (__m128i)a; +#else + return *(const __m128i *)&a; +#endif +} + +// Applies a type cast to reinterpret four 32-bit integers passed in as a 128-bit parameter as packed 32-bit floating point values. https://msdn.microsoft.com/en-us/library/bb514029.aspx +FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) +{ +#if defined(__aarch64__) + return (__m128)a; +#else + return *(const __m128 *)&a; +#endif +} + +// Loads 128-bit value. : https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx +FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) +{ + return vld1q_s32((int32_t *)p); +} + +FORCE_INLINE __m128d _mm_castps_pd(const __m128 a) +{ + return *(const __m128d *)&a; +} + +FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) +{ + return *(const __m128d *)&a; +} +// ****************************************** +// Miscellaneous Operations +// ****************************************** + +// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and saturates. https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) +{ + return (__m128i)vcombine_s8(vqmovn_s16((int16x8_t)a), vqmovn_s16((int16x8_t)b)); +} + +// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned integers and saturates. https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx +FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) +{ + return (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)a), vqmovun_s16((int16x8_t)b)); +} + +// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers and saturates. https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) +{ + return (__m128i)vcombine_s16(vqmovn_s32(a), vqmovn_s32(b)); +} + +// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower 8 signed or unsigned 8-bit integers in b. https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) +{ + int8x8_t a1 = (int8x8_t)vget_low_s16((int16x8_t)a); + int8x8_t b1 = (int8x8_t)vget_low_s16((int16x8_t)b); + + int8x8x2_t result = vzip_s8(a1, b1); + + return (__m128i)vcombine_s8(result.val[0], result.val[1]); +} + +// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the lower 4 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) +{ + int16x4_t a1 = vget_low_s16((int16x8_t)a); + int16x4_t b1 = vget_low_s16((int16x8_t)b); + + int16x4x2_t result = vzip_s16(a1, b1); + + return (__m128i)vcombine_s16(result.val[0], result.val[1]); +} + +// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the lower 2 signed or unsigned 32 - bit integers in b. https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) +{ + int32x2_t a1 = vget_low_s32(a); + int32x2_t b1 = vget_low_s32(b); + + int32x2x2_t result = vzip_s32(a1, b1); + + return vcombine_s32(result.val[0], result.val[1]); +} + +// Selects and interleaves the lower two single-precision, floating-point values from a and b. https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) +{ + float32x2x2_t result = vzip_f32(vget_low_f32(a), vget_low_f32(b)); + return vcombine_f32(result.val[0], result.val[1]); +} + +// Selects and interleaves the upper two single-precision, floating-point values from a and b. https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) +{ + float32x2x2_t result = vzip_f32(vget_high_f32(a), vget_high_f32(b)); + return vcombine_f32(result.val[0], result.val[1]); +} + +// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper 8 signed or unsigned 8-bit integers in b. https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) +{ + int8x8_t a1 = (int8x8_t)vget_high_s16((int16x8_t)a); + int8x8_t b1 = (int8x8_t)vget_high_s16((int16x8_t)b); + + int8x8x2_t result = vzip_s8(a1, b1); + + return (__m128i)vcombine_s8(result.val[0], result.val[1]); +} + +// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the upper 4 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) +{ + int16x4_t a1 = vget_high_s16((int16x8_t)a); + int16x4_t b1 = vget_high_s16((int16x8_t)b); + + int16x4x2_t result = vzip_s16(a1, b1); + + return (__m128i)vcombine_s16(result.val[0], result.val[1]); +} + +// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the upper 2 signed or unsigned 32-bit integers in b. https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) +{ + int32x2_t a1 = vget_high_s32(a); + int32x2_t b1 = vget_high_s32(b); + + int32x2x2_t result = vzip_s32(a1, b1); + + return vcombine_s32(result.val[0], result.val[1]); +} + +// Extracts the selected signed or unsigned 16-bit integer from a and zero extends. https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx +#define _mm_extract_epi16( a, imm ) vgetq_lane_s16((int16x8_t)a, imm) + +// ****************************************** +// Streaming Extensions +// ****************************************** + +// Guarantees that every preceding store is globally visible before any subsequent store. https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx +FORCE_INLINE void _mm_sfence(void) +{ + __sync_synchronize(); +} + +// Stores the data in a to the address p without polluting the caches. If the cache line containing address p is already in the cache, the cache will be updated.Address p must be 16 - byte aligned. https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx +FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) +{ + *p = a; +} + +// Cache line containing p is flushed and invalidated from all caches in the coherency domain. : https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx +FORCE_INLINE void _mm_clflush(void const*p) +{ + // no corollary for Neon? +} + +FORCE_INLINE __m128i _mm_set_epi64x(int64_t a, int64_t b) +{ + // Stick to the flipped behavior of x86. + int64_t __attribute__((aligned(16))) data[2] = { b, a }; + return (__m128i)vld1q_s64(data); +} + +FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) +{ + return (__m128i)vmovq_n_s64(_i); +} + +#if defined(__aarch64__) +FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 c) +{ + int32x4_t mask = vshrq_n_s32(__m128i(c),31); + return vbslq_f32( uint32x4_t(mask), b, a); +} + +FORCE_INLINE __m128i _mm_load4epu8_epi32(__m128i *ptr) +{ + uint8x8_t t0 = vld1_u8((uint8_t*)ptr); + uint16x8_t t1 = vmovl_u8(t0); + uint32x4_t t2 = vmovl_u16(vget_low_u16(t1)); + return vreinterpretq_s32_u32(t2); +} + +FORCE_INLINE __m128i _mm_load4epu16_epi32(__m128i *ptr) +{ + uint16x8_t t0 = vld1q_u16((uint16_t*)ptr); + uint32x4_t t1 = vmovl_u16(vget_low_u16(t0)); + return vreinterpretq_s32_u32(t1); +} + +FORCE_INLINE __m128i _mm_load4epi8_f32(__m128i *ptr) +{ + int8x8_t t0 = vld1_s8((int8_t*)ptr); + int16x8_t t1 = vmovl_s8(t0); + int32x4_t t2 = vmovl_s16(vget_low_s16(t1)); + float32x4_t t3 = vcvtq_f32_s32(t2); + return vreinterpretq_s32_f32(t3); +} + +FORCE_INLINE __m128i _mm_load4epu8_f32(__m128i *ptr) +{ + uint8x8_t t0 = vld1_u8((uint8_t*)ptr); + uint16x8_t t1 = vmovl_u8(t0); + uint32x4_t t2 = vmovl_u16(vget_low_u16(t1)); + return vreinterpretq_s32_u32(t2); +} + +FORCE_INLINE __m128i _mm_load4epi16_f32(__m128i *ptr) +{ + int16x8_t t0 = vld1q_s16((int16_t*)ptr); + int32x4_t t1 = vmovl_s16(vget_low_s16(t0)); + float32x4_t t2 = vcvtq_f32_s32(t1); + return vreinterpretq_s32_f32(t2); +} + +FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) +{ + return (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)a), vqmovun_s16((int16x8_t)b)); +} + +FORCE_INLINE __m128i _mm_stream_load_si128(__m128i* ptr) +{ + // No non-temporal load on a single register on ARM. + return vreinterpretq_s32_u8(vld1q_u8((uint8_t*)ptr)); +} + +FORCE_INLINE void _mm_stream_ps(float* ptr, __m128i a) +{ + // No non-temporal store on a single register on ARM. + vst1q_f32((float*)ptr, vreinterpretq_f32_s32(a)); +} + +FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_s32_u32(vminq_u32(vreinterpretq_u32_s32(a), vreinterpretq_u32_s32(b))); +} + +FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_s32_u32(vmaxq_u32(vreinterpretq_u32_s32(a), vreinterpretq_u32_s32(b))); +} + +FORCE_INLINE __m128 _mm_abs_ps(__m128 a) +{ + return vabsq_f32(a); +} + +FORCE_INLINE __m128 _mm_madd_ps(__m128 a, __m128 b, __m128 c) +{ + return vmlaq_f32(c, a, b); +} + +FORCE_INLINE __m128 _mm_msub_ps(__m128 a, __m128 b, __m128 c) +{ + return vmlsq_f32(c, a, b); +} + +FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) +{ + return vabsq_s32(a); +} +#endif //defined(__aarch64__) + +// Count the number of bits set to 1 in unsigned 32-bit integer a, and +// return that count in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 +FORCE_INLINE int _mm_popcnt_u32(unsigned int a) +{ + return (int)vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t)a))); +} + +// Count the number of bits set to 1 in unsigned 64-bit integer a, and +// return that count in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64 +FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) +{ + return (int64_t)vaddlv_u8(vcnt_u8(vcreate_u8(a))); +} + +#endif diff --git a/thirdparty/embree-aarch64/common/math/affinespace.h b/thirdparty/embree-aarch64/common/math/affinespace.h new file mode 100644 index 0000000000..32452fbe72 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/affinespace.h @@ -0,0 +1,361 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "linearspace2.h" +#include "linearspace3.h" +#include "quaternion.h" +#include "bbox.h" +#include "vec4.h" + +namespace embree +{ + #define VectorT typename L::Vector + #define ScalarT typename L::Vector::Scalar + + //////////////////////////////////////////////////////////////////////////////// + // Affine Space + //////////////////////////////////////////////////////////////////////////////// + + template + struct AffineSpaceT + { + L l; /*< linear part of affine space */ + VectorT p; /*< affine part of affine space */ + + //////////////////////////////////////////////////////////////////////////////// + // Constructors, Assignment, Cast, Copy Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline AffineSpaceT ( ) { } + __forceinline AffineSpaceT ( const AffineSpaceT& other ) { l = other.l; p = other.p; } + __forceinline AffineSpaceT ( const L & other ) { l = other ; p = VectorT(zero); } + __forceinline AffineSpaceT& operator=( const AffineSpaceT& other ) { l = other.l; p = other.p; return *this; } + + __forceinline AffineSpaceT( const VectorT& vx, const VectorT& vy, const VectorT& vz, const VectorT& p ) : l(vx,vy,vz), p(p) {} + __forceinline AffineSpaceT( const L& l, const VectorT& p ) : l(l), p(p) {} + + template __forceinline AffineSpaceT( const AffineSpaceT& s ) : l(s.l), p(s.p) {} + + //////////////////////////////////////////////////////////////////////////////// + // Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline AffineSpaceT( ZeroTy ) : l(zero), p(zero) {} + __forceinline AffineSpaceT( OneTy ) : l(one), p(zero) {} + + /*! return matrix for scaling */ + static __forceinline AffineSpaceT scale(const VectorT& s) { return L::scale(s); } + + /*! return matrix for translation */ + static __forceinline AffineSpaceT translate(const VectorT& p) { return AffineSpaceT(one,p); } + + /*! return matrix for rotation, only in 2D */ + static __forceinline AffineSpaceT rotate(const ScalarT& r) { return L::rotate(r); } + + /*! return matrix for rotation around arbitrary point (2D) or axis (3D) */ + static __forceinline AffineSpaceT rotate(const VectorT& u, const ScalarT& r) { return L::rotate(u,r); } + + /*! return matrix for rotation around arbitrary axis and point, only in 3D */ + static __forceinline AffineSpaceT rotate(const VectorT& p, const VectorT& u, const ScalarT& r) { return translate(+p) * rotate(u,r) * translate(-p); } + + /*! return matrix for looking at given point, only in 3D */ + static __forceinline AffineSpaceT lookat(const VectorT& eye, const VectorT& point, const VectorT& up) { + VectorT Z = normalize(point-eye); + VectorT U = normalize(cross(up,Z)); + VectorT V = normalize(cross(Z,U)); + return AffineSpaceT(L(U,V,Z),eye); + } + + }; + + // template specialization to get correct identity matrix for type AffineSpace3fa + template<> + __forceinline AffineSpaceT::AffineSpaceT( OneTy ) : l(one), p(0.f, 0.f, 0.f, 1.f) {} + + //////////////////////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline AffineSpaceT operator -( const AffineSpaceT& a ) { return AffineSpaceT(-a.l,-a.p); } + template __forceinline AffineSpaceT operator +( const AffineSpaceT& a ) { return AffineSpaceT(+a.l,+a.p); } + template __forceinline AffineSpaceT rcp( const AffineSpaceT& a ) { L il = rcp(a.l); return AffineSpaceT(il,-(il*a.p)); } + + //////////////////////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline const AffineSpaceT operator +( const AffineSpaceT& a, const AffineSpaceT& b ) { return AffineSpaceT(a.l+b.l,a.p+b.p); } + template __forceinline const AffineSpaceT operator -( const AffineSpaceT& a, const AffineSpaceT& b ) { return AffineSpaceT(a.l-b.l,a.p-b.p); } + + template __forceinline const AffineSpaceT operator *( const ScalarT & a, const AffineSpaceT& b ) { return AffineSpaceT(a*b.l,a*b.p); } + template __forceinline const AffineSpaceT operator *( const AffineSpaceT& a, const AffineSpaceT& b ) { return AffineSpaceT(a.l*b.l,a.l*b.p+a.p); } + template __forceinline const AffineSpaceT operator /( const AffineSpaceT& a, const AffineSpaceT& b ) { return a * rcp(b); } + template __forceinline const AffineSpaceT operator /( const AffineSpaceT& a, const ScalarT & b ) { return a * rcp(b); } + + template __forceinline AffineSpaceT& operator *=( AffineSpaceT& a, const AffineSpaceT& b ) { return a = a * b; } + template __forceinline AffineSpaceT& operator *=( AffineSpaceT& a, const ScalarT & b ) { return a = a * b; } + template __forceinline AffineSpaceT& operator /=( AffineSpaceT& a, const AffineSpaceT& b ) { return a = a / b; } + template __forceinline AffineSpaceT& operator /=( AffineSpaceT& a, const ScalarT & b ) { return a = a / b; } + + template __forceinline VectorT xfmPoint (const AffineSpaceT& m, const VectorT& p) { return madd(VectorT(p.x),m.l.vx,madd(VectorT(p.y),m.l.vy,madd(VectorT(p.z),m.l.vz,m.p))); } + template __forceinline VectorT xfmVector(const AffineSpaceT& m, const VectorT& v) { return xfmVector(m.l,v); } + template __forceinline VectorT xfmNormal(const AffineSpaceT& m, const VectorT& n) { return xfmNormal(m.l,n); } + + __forceinline const BBox xfmBounds(const AffineSpaceT >& m, const BBox& b) + { + BBox3fa dst = empty; + const Vec3fa p0(b.lower.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p0)); + const Vec3fa p1(b.lower.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p1)); + const Vec3fa p2(b.lower.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p2)); + const Vec3fa p3(b.lower.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p3)); + const Vec3fa p4(b.upper.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p4)); + const Vec3fa p5(b.upper.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p5)); + const Vec3fa p6(b.upper.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p6)); + const Vec3fa p7(b.upper.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p7)); + return dst; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline bool operator ==( const AffineSpaceT& a, const AffineSpaceT& b ) { return a.l == b.l && a.p == b.p; } + template __forceinline bool operator !=( const AffineSpaceT& a, const AffineSpaceT& b ) { return a.l != b.l || a.p != b.p; } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline AffineSpaceT select ( const typename L::Vector::Scalar::Bool& s, const AffineSpaceT& t, const AffineSpaceT& f ) { + return AffineSpaceT(select(s,t.l,f.l),select(s,t.p,f.p)); + } + + //////////////////////////////////////////////////////////////////////////////// + // Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template static embree_ostream operator<<(embree_ostream cout, const AffineSpaceT& m) { + return cout << "{ l = " << m.l << ", p = " << m.p << " }"; + } + + //////////////////////////////////////////////////////////////////////////////// + // Template Instantiations + //////////////////////////////////////////////////////////////////////////////// + + typedef AffineSpaceT AffineSpace2f; + typedef AffineSpaceT AffineSpace3f; + typedef AffineSpaceT AffineSpace3fa; + typedef AffineSpaceT AffineSpace3fx; + typedef AffineSpaceT AffineSpace3ff; + typedef AffineSpaceT OrthonormalSpace3f; + + template using AffineSpace3vf = AffineSpaceT>>>; + typedef AffineSpaceT>>> AffineSpace3vf4; + typedef AffineSpaceT>>> AffineSpace3vf8; + typedef AffineSpaceT>>> AffineSpace3vf16; + + template using AffineSpace3vff = AffineSpaceT>>>; + typedef AffineSpaceT>>> AffineSpace3vfa4; + typedef AffineSpaceT>>> AffineSpace3vfa8; + typedef AffineSpaceT>>> AffineSpace3vfa16; + + ////////////////////////////////////////////////////////////////////////////// + /// Interpolation + ////////////////////////////////////////////////////////////////////////////// + template + __forceinline AffineSpaceT lerp(const AffineSpaceT& M0, + const AffineSpaceT& M1, + const R& t) + { + return AffineSpaceT(lerp(M0.l,M1.l,t),lerp(M0.p,M1.p,t)); + } + + // slerp interprets the 16 floats of the matrix M = D * R * S as components of + // three matrizes (D, R, S) that are interpolated individually. + template __forceinline AffineSpaceT>> + slerp(const AffineSpaceT>>& M0, + const AffineSpaceT>>& M1, + const T& t) + { + QuaternionT q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w); + QuaternionT q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w); + QuaternionT q = slerp(q0, q1, t); + + AffineSpaceT>> S = lerp(M0, M1, t); + AffineSpaceT>> D(one); + D.p.x = S.l.vx.y; + D.p.y = S.l.vx.z; + D.p.z = S.l.vy.z; + S.l.vx.y = 0; + S.l.vx.z = 0; + S.l.vy.z = 0; + + AffineSpaceT>> R = LinearSpace3>(q); + return D * R * S; + } + + // this is a specialized version for Vec3fa because that does + // not play along nicely with the other templated Vec3/Vec4 types + __forceinline AffineSpace3fa slerp(const AffineSpace3ff& M0, + const AffineSpace3ff& M1, + const float& t) + { + Quaternion3f q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w); + Quaternion3f q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w); + Quaternion3f q = slerp(q0, q1, t); + + AffineSpace3fa S = lerp(M0, M1, t); + AffineSpace3fa D(one); + D.p.x = S.l.vx.y; + D.p.y = S.l.vx.z; + D.p.z = S.l.vy.z; + S.l.vx.y = 0; + S.l.vx.z = 0; + S.l.vy.z = 0; + + AffineSpace3fa R = LinearSpace3fa(q); + return D * R * S; + } + + __forceinline AffineSpace3fa quaternionDecompositionToAffineSpace(const AffineSpace3ff& qd) + { + // compute affine transform from quaternion decomposition + Quaternion3f q(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w); + AffineSpace3fa M = qd; + AffineSpace3fa D(one); + D.p.x = M.l.vx.y; + D.p.y = M.l.vx.z; + D.p.z = M.l.vy.z; + M.l.vx.y = 0; + M.l.vx.z = 0; + M.l.vy.z = 0; + AffineSpace3fa R = LinearSpace3fa(q); + return D * R * M; + } + + __forceinline void quaternionDecomposition(const AffineSpace3ff& qd, Vec3fa& T, Quaternion3f& q, AffineSpace3fa& S) + { + q = Quaternion3f(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w); + S = qd; + T.x = qd.l.vx.y; + T.y = qd.l.vx.z; + T.z = qd.l.vy.z; + S.l.vx.y = 0; + S.l.vx.z = 0; + S.l.vy.z = 0; + } + + __forceinline AffineSpace3fx quaternionDecomposition(Vec3fa const& T, Quaternion3f const& q, AffineSpace3fa const& S) + { + AffineSpace3ff M = S; + M.l.vx.w = q.i; + M.l.vy.w = q.j; + M.l.vz.w = q.k; + M.p.w = q.r; + M.l.vx.y = T.x; + M.l.vx.z = T.y; + M.l.vy.z = T.z; + return M; + } + + struct __aligned(16) QuaternionDecomposition + { + float scale_x = 1.f; + float scale_y = 1.f; + float scale_z = 1.f; + float skew_xy = 0.f; + float skew_xz = 0.f; + float skew_yz = 0.f; + float shift_x = 0.f; + float shift_y = 0.f; + float shift_z = 0.f; + float quaternion_r = 1.f; + float quaternion_i = 0.f; + float quaternion_j = 0.f; + float quaternion_k = 0.f; + float translation_x = 0.f; + float translation_y = 0.f; + float translation_z = 0.f; + }; + + __forceinline QuaternionDecomposition quaternionDecomposition(AffineSpace3ff const& M) + { + QuaternionDecomposition qd; + qd.scale_x = M.l.vx.x; + qd.scale_y = M.l.vy.y; + qd.scale_z = M.l.vz.z; + qd.shift_x = M.p.x; + qd.shift_y = M.p.y; + qd.shift_z = M.p.z; + qd.translation_x = M.l.vx.y; + qd.translation_y = M.l.vx.z; + qd.translation_z = M.l.vy.z; + qd.skew_xy = M.l.vy.x; + qd.skew_xz = M.l.vz.x; + qd.skew_yz = M.l.vz.y; + qd.quaternion_r = M.p.w; + qd.quaternion_i = M.l.vx.w; + qd.quaternion_j = M.l.vy.w; + qd.quaternion_k = M.l.vz.w; + return qd; + } + + //////////////////////////////////////////////////////////////////////////////// + /* + * ! Template Specialization for 2D: return matrix for rotation around point + * (rotation around arbitrarty vector is not meaningful in 2D) + */ + template<> __forceinline + AffineSpace2f AffineSpace2f::rotate(const Vec2f& p, const float& r) { + return translate(+p)*AffineSpace2f(LinearSpace2f::rotate(r))*translate(-p); + } + + //////////////////////////////////////////////////////////////////////////////// + // Similarity Transform + // + // checks, if M is a similarity transformation, i.e if there exists a factor D + // such that for all x,y: distance(Mx, My) = D * distance(x, y) + //////////////////////////////////////////////////////////////////////////////// + __forceinline bool similarityTransform(const AffineSpace3fa& M, float* D) + { + if (D) *D = 0.f; + if (abs(dot(M.l.vx, M.l.vy)) > 1e-5f) return false; + if (abs(dot(M.l.vx, M.l.vz)) > 1e-5f) return false; + if (abs(dot(M.l.vy, M.l.vz)) > 1e-5f) return false; + + const float D_x = dot(M.l.vx, M.l.vx); + const float D_y = dot(M.l.vy, M.l.vy); + const float D_z = dot(M.l.vz, M.l.vz); + + if (abs(D_x - D_y) > 1e-5f || + abs(D_x - D_z) > 1e-5f || + abs(D_y - D_z) > 1e-5f) + return false; + + if (D) *D = sqrtf(D_x); + return true; + } + + __forceinline void AffineSpace3fa_store_unaligned(const AffineSpace3fa &source, AffineSpace3fa* ptr) + { + Vec3fa::storeu(&ptr->l.vx, source.l.vx); + Vec3fa::storeu(&ptr->l.vy, source.l.vy); + Vec3fa::storeu(&ptr->l.vz, source.l.vz); + Vec3fa::storeu(&ptr->p, source.p); + } + + __forceinline AffineSpace3fa AffineSpace3fa_load_unaligned(AffineSpace3fa* ptr) + { + AffineSpace3fa space; + space.l.vx = Vec3fa::loadu(&ptr->l.vx); + space.l.vy = Vec3fa::loadu(&ptr->l.vy); + space.l.vz = Vec3fa::loadu(&ptr->l.vz); + space.p = Vec3fa::loadu(&ptr->p); + return space; + } + + #undef VectorT + #undef ScalarT +} diff --git a/thirdparty/embree-aarch64/common/math/bbox.h b/thirdparty/embree-aarch64/common/math/bbox.h new file mode 100644 index 0000000000..29bb13912b --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/bbox.h @@ -0,0 +1,331 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec2.h" +#include "vec3.h" + +namespace embree +{ + namespace internal { + + template __forceinline T divideByTwo(const T& v) { return v / T(2); } + template <> __forceinline float divideByTwo(const float& v) { return v * 0.5f; } + template <> __forceinline double divideByTwo(const double& v) { return v * 0.5; } + + } // namespace internal + template + struct BBox + { + T lower, upper; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline BBox ( ) { } + template + __forceinline BBox ( const BBox& other ) : lower(other.lower), upper(other.upper) {} + __forceinline BBox& operator=( const BBox& other ) { lower = other.lower; upper = other.upper; return *this; } + + __forceinline BBox ( const T& v ) : lower(v), upper(v) {} + __forceinline BBox ( const T& lower, const T& upper ) : lower(lower), upper(upper) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Extending Bounds + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const BBox& extend(const BBox& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; } + __forceinline const BBox& extend(const T & other) { lower = min(lower,other ); upper = max(upper,other ); return *this; } + + /*! tests if box is empty */ + __forceinline bool empty() const { for (int i=0; i upper[i]) return true; return false; } + + /*! computes the size of the box */ + __forceinline T size() const { return upper - lower; } + + /*! computes the center of the box */ + __forceinline T center() const { return internal::divideByTwo(lower+upper); } + + /*! computes twice the center of the box */ + __forceinline T center2() const { return lower+upper; } + + /*! merges two boxes */ + __forceinline static const BBox merge (const BBox& a, const BBox& b) { + return BBox(min(a.lower, b.lower), max(a.upper, b.upper)); + } + + /*! enlarge box by some scaling factor */ + __forceinline BBox enlarge_by(const float a) const { + return BBox(lower - T(a)*abs(lower), upper + T(a)*abs(upper)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline BBox( EmptyTy ) : lower(pos_inf), upper(neg_inf) {} + __forceinline BBox( FullTy ) : lower(neg_inf), upper(pos_inf) {} + __forceinline BBox( FalseTy ) : lower(pos_inf), upper(neg_inf) {} + __forceinline BBox( TrueTy ) : lower(neg_inf), upper(pos_inf) {} + __forceinline BBox( NegInfTy ): lower(pos_inf), upper(neg_inf) {} + __forceinline BBox( PosInfTy ): lower(neg_inf), upper(pos_inf) {} + }; + + template<> __forceinline bool BBox::empty() const { + return lower > upper; + } + +#if defined(__SSE__) || defined(__ARM_NEON) + template<> __forceinline bool BBox::empty() const { + return !all(le_mask(lower,upper)); + } + template<> __forceinline bool BBox::empty() const { + return !all(le_mask(lower,upper)); + } +#endif + + /*! tests if box is finite */ + __forceinline bool isvalid( const BBox& v ) { + return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE))); + } + + /*! tests if box is finite and non-empty*/ + __forceinline bool isvalid_non_empty( const BBox& v ) { + return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)) & le_mask(v.lower,v.upper)); + } + + /*! tests if box has finite entries */ + __forceinline bool is_finite( const BBox& b) { + return is_finite(b.lower) && is_finite(b.upper); + } + + /*! test if point contained in box */ + __forceinline bool inside ( const BBox& b, const Vec3fa& p ) { return all(ge_mask(p,b.lower) & le_mask(p,b.upper)); } + + /*! computes the center of the box */ + template __forceinline const T center2(const BBox& box) { return box.lower + box.upper; } + template __forceinline const T center (const BBox& box) { return internal::divideByTwo(center2(box)); } + + /*! computes the volume of a bounding box */ + __forceinline float volume ( const BBox& b ) { return reduce_mul(b.size()); } + __forceinline float safeVolume( const BBox& b ) { if (b.empty()) return 0.0f; else return volume(b); } + + /*! computes the volume of a bounding box */ + __forceinline float volume( const BBox& b ) { return reduce_mul(b.size()); } + + /*! computes the surface area of a bounding box */ + template __forceinline const T area( const BBox >& b ) { const Vec2 d = b.size(); return d.x*d.y; } + + template __forceinline const T halfArea( const BBox >& b ) { return halfArea(b.size()); } + template __forceinline const T area( const BBox >& b ) { return T(2)*halfArea(b); } + + __forceinline float halfArea( const BBox& b ) { return halfArea(b.size()); } + __forceinline float area( const BBox& b ) { return 2.0f*halfArea(b); } + + __forceinline float halfArea( const BBox& b ) { return halfArea(b.size()); } + __forceinline float area( const BBox& b ) { return 2.0f*halfArea(b); } + + template __forceinline float safeArea( const BBox& b ) { if (b.empty()) return 0.0f; else return area(b); } + + template __forceinline float expectedApproxHalfArea(const BBox& box) { + return halfArea(box); + } + + /*! merges bounding boxes and points */ + template __forceinline const BBox merge( const BBox& a, const T& b ) { return BBox(min(a.lower, b ), max(a.upper, b )); } + template __forceinline const BBox merge( const T& a, const BBox& b ) { return BBox(min(a , b.lower), max(a , b.upper)); } + template __forceinline const BBox merge( const BBox& a, const BBox& b ) { return BBox(min(a.lower, b.lower), max(a.upper, b.upper)); } + + /*! Merges three boxes. */ + template __forceinline const BBox merge( const BBox& a, const BBox& b, const BBox& c ) { return merge(a,merge(b,c)); } + + /*! Merges four boxes. */ + template __forceinline BBox merge(const BBox& a, const BBox& b, const BBox& c, const BBox& d) { + return merge(merge(a,b),merge(c,d)); + } + + /*! Comparison Operators */ + template __forceinline bool operator==( const BBox& a, const BBox& b ) { return a.lower == b.lower && a.upper == b.upper; } + template __forceinline bool operator!=( const BBox& a, const BBox& b ) { return a.lower != b.lower || a.upper != b.upper; } + + /*! scaling */ + template __forceinline BBox operator *( const float& a, const BBox& b ) { return BBox(a*b.lower,a*b.upper); } + template __forceinline BBox operator *( const T& a, const BBox& b ) { return BBox(a*b.lower,a*b.upper); } + + /*! translations */ + template __forceinline BBox operator +( const BBox& a, const BBox& b ) { return BBox(a.lower+b.lower,a.upper+b.upper); } + template __forceinline BBox operator -( const BBox& a, const BBox& b ) { return BBox(a.lower-b.lower,a.upper-b.upper); } + template __forceinline BBox operator +( const BBox& a, const T & b ) { return BBox(a.lower+b ,a.upper+b ); } + template __forceinline BBox operator -( const BBox& a, const T & b ) { return BBox(a.lower-b ,a.upper-b ); } + + /*! extension */ + template __forceinline BBox enlarge(const BBox& a, const T& b) { return BBox(a.lower-b, a.upper+b); } + + /*! intersect bounding boxes */ + template __forceinline const BBox intersect( const BBox& a, const BBox& b ) { return BBox(max(a.lower, b.lower), min(a.upper, b.upper)); } + template __forceinline const BBox intersect( const BBox& a, const BBox& b, const BBox& c ) { return intersect(a,intersect(b,c)); } + template __forceinline const BBox intersect( const BBox& a, const BBox& b, const BBox& c, const BBox& d ) { return intersect(intersect(a,b),intersect(c,d)); } + + /*! subtract bounds from each other */ + template __forceinline void subtract(const BBox& a, const BBox& b, BBox& c, BBox& d) + { + c.lower = a.lower; + c.upper = min(a.upper,b.lower); + d.lower = max(a.lower,b.upper); + d.upper = a.upper; + } + + /*! tests if bounding boxes (and points) are disjoint (empty intersection) */ + template __inline bool disjoint( const BBox& a, const BBox& b ) { return intersect(a,b).empty(); } + template __inline bool disjoint( const BBox& a, const T& b ) { return disjoint(a,BBox(b)); } + template __inline bool disjoint( const T& a, const BBox& b ) { return disjoint(BBox(a),b); } + + /*! tests if bounding boxes (and points) are conjoint (non-empty intersection) */ + template __inline bool conjoint( const BBox& a, const BBox& b ) { return !intersect(a,b).empty(); } + template __inline bool conjoint( const BBox& a, const T& b ) { return conjoint(a,BBox(b)); } + template __inline bool conjoint( const T& a, const BBox& b ) { return conjoint(BBox(a),b); } + + /*! subset relation */ + template __inline bool subset( const BBox& a, const BBox& b ) + { + for ( size_t i = 0; i < T::N; i++ ) if ( a.lower[i] < b.lower[i] ) return false; + for ( size_t i = 0; i < T::N; i++ ) if ( a.upper[i] > b.upper[i] ) return false; + return true; + } + + template<> __inline bool subset( const BBox& a, const BBox& b ) { + return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); + } + + template<> __inline bool subset( const BBox& a, const BBox& b ) { + return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); + } + + /*! blending */ + template + __forceinline BBox lerp(const BBox& b0, const BBox& b1, const float t) { + return BBox(lerp(b0.lower,b1.lower,t),lerp(b0.upper,b1.upper,t)); + } + + /*! output operator */ + template __forceinline embree_ostream operator<<(embree_ostream cout, const BBox& box) { + return cout << "[" << box.lower << "; " << box.upper << "]"; + } + + /*! default template instantiations */ + typedef BBox BBox1f; + typedef BBox BBox2f; + typedef BBox BBox2fa; + typedef BBox BBox3f; + typedef BBox BBox3fa; + typedef BBox BBox3fx; + typedef BBox BBox3ff; +} + +//////////////////////////////////////////////////////////////////////////////// +/// SSE / AVX / MIC specializations +//////////////////////////////////////////////////////////////////////////////// + +#if defined (__SSE__) || defined(__ARM_NEON) +#include "../simd/sse.h" +#endif + +#if defined (__AVX__) +#include "../simd/avx.h" +#endif + +#if defined(__AVX512F__) +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template + __forceinline BBox>> transpose(const BBox3fa* bounds); + + template<> + __forceinline BBox> transpose<4>(const BBox3fa* bounds) + { + BBox> dest; + + transpose((vfloat4&)bounds[0].lower, + (vfloat4&)bounds[1].lower, + (vfloat4&)bounds[2].lower, + (vfloat4&)bounds[3].lower, + dest.lower.x, + dest.lower.y, + dest.lower.z); + + transpose((vfloat4&)bounds[0].upper, + (vfloat4&)bounds[1].upper, + (vfloat4&)bounds[2].upper, + (vfloat4&)bounds[3].upper, + dest.upper.x, + dest.upper.y, + dest.upper.z); + + return dest; + } + +#if defined(__AVX__) + template<> + __forceinline BBox> transpose<8>(const BBox3fa* bounds) + { + BBox> dest; + + transpose((vfloat4&)bounds[0].lower, + (vfloat4&)bounds[1].lower, + (vfloat4&)bounds[2].lower, + (vfloat4&)bounds[3].lower, + (vfloat4&)bounds[4].lower, + (vfloat4&)bounds[5].lower, + (vfloat4&)bounds[6].lower, + (vfloat4&)bounds[7].lower, + dest.lower.x, + dest.lower.y, + dest.lower.z); + + transpose((vfloat4&)bounds[0].upper, + (vfloat4&)bounds[1].upper, + (vfloat4&)bounds[2].upper, + (vfloat4&)bounds[3].upper, + (vfloat4&)bounds[4].upper, + (vfloat4&)bounds[5].upper, + (vfloat4&)bounds[6].upper, + (vfloat4&)bounds[7].upper, + dest.upper.x, + dest.upper.y, + dest.upper.z); + + return dest; + } +#endif + + template + __forceinline BBox3fa merge(const BBox3fa* bounds); + + template<> + __forceinline BBox3fa merge<4>(const BBox3fa* bounds) + { + const Vec3fa lower = min(min(bounds[0].lower,bounds[1].lower), + min(bounds[2].lower,bounds[3].lower)); + const Vec3fa upper = max(max(bounds[0].upper,bounds[1].upper), + max(bounds[2].upper,bounds[3].upper)); + return BBox3fa(lower,upper); + } + +#if defined(__AVX__) + template<> + __forceinline BBox3fa merge<8>(const BBox3fa* bounds) + { + const Vec3fa lower = min(min(min(bounds[0].lower,bounds[1].lower),min(bounds[2].lower,bounds[3].lower)), + min(min(bounds[4].lower,bounds[5].lower),min(bounds[6].lower,bounds[7].lower))); + const Vec3fa upper = max(max(max(bounds[0].upper,bounds[1].upper),max(bounds[2].upper,bounds[3].upper)), + max(max(bounds[4].upper,bounds[5].upper),max(bounds[6].upper,bounds[7].upper))); + return BBox3fa(lower,upper); + } +#endif +} + diff --git a/thirdparty/embree-aarch64/common/math/col3.h b/thirdparty/embree-aarch64/common/math/col3.h new file mode 100644 index 0000000000..f52015fb88 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/col3.h @@ -0,0 +1,47 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// RGB Color Class + //////////////////////////////////////////////////////////////////////////////// + + template struct Col3 + { + T r, g, b; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col3 ( ) { } + __forceinline Col3 ( const Col3& other ) { r = other.r; g = other.g; b = other.b; } + __forceinline Col3& operator=( const Col3& other ) { r = other.r; g = other.g; b = other.b; return *this; } + + __forceinline explicit Col3 (const T& v) : r(v), g(v), b(v) {} + __forceinline Col3 (const T& r, const T& g, const T& b) : r(r), g(g), b(b) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col3 (ZeroTy) : r(zero) , g(zero) , b(zero) {} + __forceinline Col3 (OneTy) : r(one) , g(one) , b(one) {} + __forceinline Col3 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf) {} + __forceinline Col3 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf) {} + }; + + /*! output operator */ + template __forceinline embree_ostream operator<<(embree_ostream cout, const Col3& a) { + return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")"; + } + + /*! default template instantiations */ + typedef Col3 Col3uc; + typedef Col3 Col3f; +} diff --git a/thirdparty/embree-aarch64/common/math/col4.h b/thirdparty/embree-aarch64/common/math/col4.h new file mode 100644 index 0000000000..90df293f8e --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/col4.h @@ -0,0 +1,47 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// RGBA Color Class + //////////////////////////////////////////////////////////////////////////////// + + template struct Col4 + { + T r, g, b, a; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col4 ( ) { } + __forceinline Col4 ( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; } + __forceinline Col4& operator=( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; return *this; } + + __forceinline explicit Col4 (const T& v) : r(v), g(v), b(v), a(v) {} + __forceinline Col4 (const T& r, const T& g, const T& b, const T& a) : r(r), g(g), b(b), a(a) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col4 (ZeroTy) : r(zero) , g(zero) , b(zero) , a(zero) {} + __forceinline Col4 (OneTy) : r(one) , g(one) , b(one) , a(one) {} + __forceinline Col4 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf), a(pos_inf) {} + __forceinline Col4 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf), a(neg_inf) {} + }; + + /*! output operator */ + template __forceinline embree_ostream operator<<(embree_ostream cout, const Col4& a) { + return cout << "(" << a.r << ", " << a.g << ", " << a.b << ", " << a.a << ")"; + } + + /*! default template instantiations */ + typedef Col4 Col4uc; + typedef Col4 Col4f; +} diff --git a/thirdparty/embree-aarch64/common/math/color.h b/thirdparty/embree-aarch64/common/math/color.h new file mode 100644 index 0000000000..c3083e4fc0 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/color.h @@ -0,0 +1,257 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "constants.h" +#include "col3.h" +#include "col4.h" + +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE RGBA Color Class + //////////////////////////////////////////////////////////////////////////////// + + struct Color4 + { + union { + __m128 m128; + struct { float r,g,b,a; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color4 () {} + __forceinline Color4 ( const __m128 a ) : m128(a) {} + + __forceinline explicit Color4 (const float v) : m128(_mm_set1_ps(v)) {} + __forceinline Color4 (const float r, const float g, const float b, const float a) : m128(_mm_set_ps(a,b,g,r)) {} + + __forceinline explicit Color4 ( const Col3uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(255.0f,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); } + __forceinline explicit Color4 ( const Col3f& other ) { m128 = _mm_set_ps(1.0f,other.b,other.g,other.r); } + __forceinline explicit Color4 ( const Col4uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(other.a,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); } + __forceinline explicit Color4 ( const Col4f& other ) { m128 = _mm_set_ps(other.a,other.b,other.g,other.r); } + + __forceinline Color4 ( const Color4& other ) : m128(other.m128) {} + __forceinline Color4& operator=( const Color4& other ) { m128 = other.m128; return *this; } + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Set + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; } + __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = a; } + __forceinline void set(Col3uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (uint8_t)(s[0]); + d.g = (uint8_t)(s[1]); + d.b = (uint8_t)(s[2]); + } + __forceinline void set(Col4uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (uint8_t)(s[0]); + d.g = (uint8_t)(s[1]); + d.b = (uint8_t)(s[2]); + d.a = (uint8_t)(s[3]); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color4( ZeroTy ) : m128(_mm_set1_ps(0.0f)) {} + __forceinline Color4( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Color4( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Color4( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + }; + + //////////////////////////////////////////////////////////////////////////////// + /// SSE RGB Color Class + //////////////////////////////////////////////////////////////////////////////// + + struct Color + { + union { + __m128 m128; + struct { float r,g,b; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color () {} + __forceinline Color ( const __m128 a ) : m128(a) {} + + __forceinline explicit Color (const float v) : m128(_mm_set1_ps(v)) {} + __forceinline Color (const float r, const float g, const float b) : m128(_mm_set_ps(0.0f,b,g,r)) {} + + __forceinline Color ( const Color& other ) : m128(other.m128) {} + __forceinline Color& operator=( const Color& other ) { m128 = other.m128; return *this; } + + __forceinline Color ( const Color4& other ) : m128(other.m128) {} + __forceinline Color& operator=( const Color4& other ) { m128 = other.m128; return *this; } + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Set + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; } + __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = 1.0f; } + __forceinline void set(Col3uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (uint8_t)(s[0]); + d.g = (uint8_t)(s[1]); + d.b = (uint8_t)(s[2]); + } + __forceinline void set(Col4uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (uint8_t)(s[0]); + d.g = (uint8_t)(s[1]); + d.b = (uint8_t)(s[2]); + d.a = 255; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color( ZeroTy ) : m128(_mm_set1_ps(0.0f)) {} + __forceinline Color( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Color( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Color( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color operator +( const Color& a ) { return a; } + __forceinline const Color operator -( const Color& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); + } + __forceinline const Color abs ( const Color& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); + } + __forceinline const Color rcp ( const Color& a ) + { +#if defined(__aarch64__) && defined(BUILD_IOS) + __m128 reciprocal = _mm_rcp_ps(a.m128); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + return (const Color)reciprocal; +#else +#if defined(__AVX512VL__) + const Color r = _mm_rcp14_ps(a.m128); +#else + const Color r = _mm_rcp_ps(a.m128); +#endif + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#endif //defined(__aarch64__) && defined(BUILD_IOS) + } + __forceinline const Color rsqrt( const Color& a ) + { +#if defined(__aarch64__) && defined(BUILD_IOS) + __m128 r = _mm_rsqrt_ps(a.m128); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + return r; +#else + +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = _mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + +#endif //defined(__aarch64__) && defined(BUILD_IOS) + } + __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color operator +( const Color& a, const Color& b ) { return _mm_add_ps(a.m128, b.m128); } + __forceinline const Color operator -( const Color& a, const Color& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline const Color operator *( const Color& a, const Color& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline const Color operator *( const Color& a, const float b ) { return a * Color(b); } + __forceinline const Color operator *( const float a, const Color& b ) { return Color(a) * b; } + __forceinline const Color operator /( const Color& a, const Color& b ) { return a * rcp(b); } + __forceinline const Color operator /( const Color& a, const float b ) { return a * rcp(b); } + + __forceinline const Color min( const Color& a, const Color& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline const Color max( const Color& a, const Color& b ) { return _mm_max_ps(a.m128,b.m128); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color operator+=(Color& a, const Color& b) { return a = a + b; } + __forceinline const Color operator-=(Color& a, const Color& b) { return a = a - b; } + __forceinline const Color operator*=(Color& a, const Color& b) { return a = a * b; } + __forceinline const Color operator/=(Color& a, const Color& b) { return a = a / b; } + __forceinline const Color operator*=(Color& a, const float b ) { return a = a * b; } + __forceinline const Color operator/=(Color& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float reduce_add(const Color& v) { return v.r+v.g+v.b; } + __forceinline float reduce_mul(const Color& v) { return v.r*v.g*v.b; } + __forceinline float reduce_min(const Color& v) { return min(v.r,v.g,v.b); } + __forceinline float reduce_max(const Color& v) { return max(v.r,v.g,v.b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } + __forceinline bool operator !=( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } + __forceinline bool operator < ( const Color& a, const Color& b ) { + if (a.r != b.r) return a.r < b.r; + if (a.g != b.g) return a.g < b.g; + if (a.b != b.b) return a.b < b.b; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color select( bool s, const Color& t, const Color& f ) { + __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f, t, mask); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Special Operators + //////////////////////////////////////////////////////////////////////////////// + + /*! computes luminance of a color */ + __forceinline float luminance (const Color& a) { return madd(0.212671f,a.r,madd(0.715160f,a.g,0.072169f*a.b)); } + + /*! output operator */ + __forceinline embree_ostream operator<<(embree_ostream cout, const Color& a) { + return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")"; + } +} diff --git a/thirdparty/embree-aarch64/common/math/constants.cpp b/thirdparty/embree-aarch64/common/math/constants.cpp new file mode 100644 index 0000000000..eeff131664 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/constants.cpp @@ -0,0 +1,61 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#if defined(__aarch64__) +#include +#endif + +#include "constants.h" + +namespace embree +{ + TrueTy True; + FalseTy False; + ZeroTy zero; + OneTy one; + NegInfTy neg_inf; + PosInfTy inf; + PosInfTy pos_inf; + NaNTy nan; + UlpTy ulp; + PiTy pi; + OneOverPiTy one_over_pi; + TwoPiTy two_pi; + OneOverTwoPiTy one_over_two_pi; + FourPiTy four_pi; + OneOverFourPiTy one_over_four_pi; + StepTy step; + ReverseStepTy reverse_step; + EmptyTy empty; + UndefinedTy undefined; + +#if defined(__aarch64__) +const uint32x4_t movemask_mask = { 1, 2, 4, 8 }; +const uint32x4_t vzero = { 0, 0, 0, 0 }; +const uint32x4_t v0x80000000 = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; +const uint32x4_t v0x7fffffff = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; +const uint32x4_t v000F = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }; +const uint32x4_t v00F0 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }; +const uint32x4_t v00FF = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }; +const uint32x4_t v0F00 = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }; +const uint32x4_t v0F0F = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }; +const uint32x4_t v0FF0 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; +const uint32x4_t v0FFF = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; +const uint32x4_t vF000 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }; +const uint32x4_t vF00F = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF }; +const uint32x4_t vF0F0 = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }; +const uint32x4_t vF0FF = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }; +const uint32x4_t vFF00 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }; +const uint32x4_t vFF0F = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }; +const uint32x4_t vFFF0 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; +const uint32x4_t vFFFF = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; +const uint8x16_t v0022 = {0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11}; +const uint8x16_t v1133 = {4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15}; +const uint8x16_t v0101 = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7}; +const float32x4_t vOne = { 1.0f, 1.0f, 1.0f, 1.0f }; +const float32x4_t vmOne = { -1.0f, -1.0f, -1.0f, -1.0f }; +const float32x4_t vInf = { INFINITY, INFINITY, INFINITY, INFINITY }; +const float32x4_t vmInf = { -INFINITY, -INFINITY, -INFINITY, -INFINITY }; +#endif + +} diff --git a/thirdparty/embree-aarch64/common/math/constants.h b/thirdparty/embree-aarch64/common/math/constants.h new file mode 100644 index 0000000000..e80abec80f --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/constants.h @@ -0,0 +1,239 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" + +#include + +#define _USE_MATH_DEFINES +#include // using cmath causes issues under Windows +#include +#include + +// Math constants may not be defined in libcxx + mingw + strict C++ standard +#if defined(__MINGW32__) + +// TODO(LTE): use constexpr +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif +#ifndef M_1_PI +#define M_1_PI 0.31830988618379067154 +#endif + +#endif // __MINGW32__ + +namespace embree +{ + static MAYBE_UNUSED const float one_over_255 = 1.0f/255.0f; + static MAYBE_UNUSED const float min_rcp_input = 1E-18f; // for abs(x) >= min_rcp_input the newton raphson rcp calculation does not fail + + /* we consider floating point numbers in that range as valid input numbers */ + static MAYBE_UNUSED float FLT_LARGE = 1.844E18f; + + struct TrueTy { + __forceinline operator bool( ) const { return true; } + }; + + extern MAYBE_UNUSED TrueTy True; + + struct FalseTy { + __forceinline operator bool( ) const { return false; } + }; + + extern MAYBE_UNUSED FalseTy False; + + struct ZeroTy + { + __forceinline operator double ( ) const { return 0; } + __forceinline operator float ( ) const { return 0; } + __forceinline operator long long( ) const { return 0; } + __forceinline operator unsigned long long( ) const { return 0; } + __forceinline operator long ( ) const { return 0; } + __forceinline operator unsigned long ( ) const { return 0; } + __forceinline operator int ( ) const { return 0; } + __forceinline operator unsigned int ( ) const { return 0; } + __forceinline operator short ( ) const { return 0; } + __forceinline operator unsigned short ( ) const { return 0; } + __forceinline operator int8_t ( ) const { return 0; } + __forceinline operator uint8_t ( ) const { return 0; } + }; + + extern MAYBE_UNUSED ZeroTy zero; + + struct OneTy + { + __forceinline operator double ( ) const { return 1; } + __forceinline operator float ( ) const { return 1; } + __forceinline operator long long( ) const { return 1; } + __forceinline operator unsigned long long( ) const { return 1; } + __forceinline operator long ( ) const { return 1; } + __forceinline operator unsigned long ( ) const { return 1; } + __forceinline operator int ( ) const { return 1; } + __forceinline operator unsigned int ( ) const { return 1; } + __forceinline operator short ( ) const { return 1; } + __forceinline operator unsigned short ( ) const { return 1; } + __forceinline operator int8_t ( ) const { return 1; } + __forceinline operator uint8_t ( ) const { return 1; } + }; + + extern MAYBE_UNUSED OneTy one; + + struct NegInfTy + { + __forceinline operator double ( ) const { return -std::numeric_limits::infinity(); } + __forceinline operator float ( ) const { return -std::numeric_limits::infinity(); } + __forceinline operator long long( ) const { return std::numeric_limits::min(); } + __forceinline operator unsigned long long( ) const { return std::numeric_limits::min(); } + __forceinline operator long ( ) const { return std::numeric_limits::min(); } + __forceinline operator unsigned long ( ) const { return std::numeric_limits::min(); } + __forceinline operator int ( ) const { return std::numeric_limits::min(); } + __forceinline operator unsigned int ( ) const { return std::numeric_limits::min(); } + __forceinline operator short ( ) const { return std::numeric_limits::min(); } + __forceinline operator unsigned short ( ) const { return std::numeric_limits::min(); } + __forceinline operator int8_t ( ) const { return std::numeric_limits::min(); } + __forceinline operator uint8_t ( ) const { return std::numeric_limits::min(); } + + }; + + extern MAYBE_UNUSED NegInfTy neg_inf; + + struct PosInfTy + { + __forceinline operator double ( ) const { return std::numeric_limits::infinity(); } + __forceinline operator float ( ) const { return std::numeric_limits::infinity(); } + __forceinline operator long long( ) const { return std::numeric_limits::max(); } + __forceinline operator unsigned long long( ) const { return std::numeric_limits::max(); } + __forceinline operator long ( ) const { return std::numeric_limits::max(); } + __forceinline operator unsigned long ( ) const { return std::numeric_limits::max(); } + __forceinline operator int ( ) const { return std::numeric_limits::max(); } + __forceinline operator unsigned int ( ) const { return std::numeric_limits::max(); } + __forceinline operator short ( ) const { return std::numeric_limits::max(); } + __forceinline operator unsigned short ( ) const { return std::numeric_limits::max(); } + __forceinline operator int8_t ( ) const { return std::numeric_limits::max(); } + __forceinline operator uint8_t ( ) const { return std::numeric_limits::max(); } + }; + + extern MAYBE_UNUSED PosInfTy inf; + extern MAYBE_UNUSED PosInfTy pos_inf; + + struct NaNTy + { + __forceinline operator double( ) const { return std::numeric_limits::quiet_NaN(); } + __forceinline operator float ( ) const { return std::numeric_limits::quiet_NaN(); } + }; + + extern MAYBE_UNUSED NaNTy nan; + + struct UlpTy + { + __forceinline operator double( ) const { return std::numeric_limits::epsilon(); } + __forceinline operator float ( ) const { return std::numeric_limits::epsilon(); } + }; + + extern MAYBE_UNUSED UlpTy ulp; + + struct PiTy + { + __forceinline operator double( ) const { return double(M_PI); } + __forceinline operator float ( ) const { return float(M_PI); } + }; + + extern MAYBE_UNUSED PiTy pi; + + struct OneOverPiTy + { + __forceinline operator double( ) const { return double(M_1_PI); } + __forceinline operator float ( ) const { return float(M_1_PI); } + }; + + extern MAYBE_UNUSED OneOverPiTy one_over_pi; + + struct TwoPiTy + { + __forceinline operator double( ) const { return double(2.0*M_PI); } + __forceinline operator float ( ) const { return float(2.0*M_PI); } + }; + + extern MAYBE_UNUSED TwoPiTy two_pi; + + struct OneOverTwoPiTy + { + __forceinline operator double( ) const { return double(0.5*M_1_PI); } + __forceinline operator float ( ) const { return float(0.5*M_1_PI); } + }; + + extern MAYBE_UNUSED OneOverTwoPiTy one_over_two_pi; + + struct FourPiTy + { + __forceinline operator double( ) const { return double(4.0*M_PI); } + __forceinline operator float ( ) const { return float(4.0*M_PI); } + }; + + extern MAYBE_UNUSED FourPiTy four_pi; + + struct OneOverFourPiTy + { + __forceinline operator double( ) const { return double(0.25*M_1_PI); } + __forceinline operator float ( ) const { return float(0.25*M_1_PI); } + }; + + extern MAYBE_UNUSED OneOverFourPiTy one_over_four_pi; + + struct StepTy { + }; + + extern MAYBE_UNUSED StepTy step; + + struct ReverseStepTy { + }; + + extern MAYBE_UNUSED ReverseStepTy reverse_step; + + struct EmptyTy { + }; + + extern MAYBE_UNUSED EmptyTy empty; + + struct FullTy { + }; + + extern MAYBE_UNUSED FullTy full; + + struct UndefinedTy { + }; + + extern MAYBE_UNUSED UndefinedTy undefined; + +#if defined(__aarch64__) + extern const uint32x4_t movemask_mask; + extern const uint32x4_t vzero; + extern const uint32x4_t v0x80000000; + extern const uint32x4_t v0x7fffffff; + extern const uint32x4_t v000F; + extern const uint32x4_t v00F0; + extern const uint32x4_t v00FF; + extern const uint32x4_t v0F00; + extern const uint32x4_t v0F0F; + extern const uint32x4_t v0FF0; + extern const uint32x4_t v0FFF; + extern const uint32x4_t vF000; + extern const uint32x4_t vF00F; + extern const uint32x4_t vF0F0; + extern const uint32x4_t vF0FF; + extern const uint32x4_t vFF00; + extern const uint32x4_t vFF0F; + extern const uint32x4_t vFFF0; + extern const uint32x4_t vFFFF; + extern const uint8x16_t v0022; + extern const uint8x16_t v1133; + extern const uint8x16_t v0101; + extern const float32x4_t vOne; + extern const float32x4_t vmOne; + extern const float32x4_t vInf; + extern const float32x4_t vmInf; +#endif +} diff --git a/thirdparty/embree-aarch64/common/math/interval.h b/thirdparty/embree-aarch64/common/math/interval.h new file mode 100644 index 0000000000..f06478e881 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/interval.h @@ -0,0 +1,161 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec2.h" +#include "vec3.h" +#include "bbox.h" + +namespace embree +{ + template + struct Interval + { + V lower, upper; + + __forceinline Interval() {} + __forceinline Interval ( const Interval& other ) { lower = other.lower; upper = other.upper; } + __forceinline Interval& operator=( const Interval& other ) { lower = other.lower; upper = other.upper; return *this; } + + __forceinline Interval(const V& a) : lower(a), upper(a) {} + __forceinline Interval(const V& lower, const V& upper) : lower(lower), upper(upper) {} + __forceinline Interval(const BBox& a) : lower(a.lower), upper(a.upper) {} + + /*! tests if box is empty */ + //__forceinline bool empty() const { return lower > upper; } + + /*! computes the size of the interval */ + __forceinline V size() const { return upper - lower; } + + __forceinline V center() const { return 0.5f*(lower+upper); } + + __forceinline const Interval& extend(const Interval& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; } + __forceinline const Interval& extend(const V & other) { lower = min(lower,other ); upper = max(upper,other ); return *this; } + + __forceinline friend Interval operator +( const Interval& a, const Interval& b ) { + return Interval(a.lower+b.lower,a.upper+b.upper); + } + + __forceinline friend Interval operator -( const Interval& a, const Interval& b ) { + return Interval(a.lower-b.upper,a.upper-b.lower); + } + + __forceinline friend Interval operator -( const Interval& a, const V& b ) { + return Interval(a.lower-b,a.upper-b); + } + + __forceinline friend Interval operator *( const Interval& a, const Interval& b ) + { + const V ll = a.lower*b.lower; + const V lu = a.lower*b.upper; + const V ul = a.upper*b.lower; + const V uu = a.upper*b.upper; + return Interval(min(ll,lu,ul,uu),max(ll,lu,ul,uu)); + } + + __forceinline friend Interval merge( const Interval& a, const Interval& b) { + return Interval(min(a.lower,b.lower),max(a.upper,b.upper)); + } + + __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c) { + return merge(merge(a,b),c); + } + + __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c, const Interval& d) { + return merge(merge(a,b),merge(c,d)); + } + + /*! intersect bounding boxes */ + __forceinline friend const Interval intersect( const Interval& a, const Interval& b ) { return Interval(max(a.lower, b.lower), min(a.upper, b.upper)); } + __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c ) { return intersect(a,intersect(b,c)); } + __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c, const Interval& d ) { return intersect(intersect(a,b),intersect(c,d)); } + + friend embree_ostream operator<<(embree_ostream cout, const Interval& a) { + return cout << "[" << a.lower << ", " << a.upper << "]"; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Interval( EmptyTy ) : lower(pos_inf), upper(neg_inf) {} + __forceinline Interval( FullTy ) : lower(neg_inf), upper(pos_inf) {} + }; + + __forceinline bool isEmpty(const Interval& v) { + return v.lower > v.upper; + } + + __forceinline vboolx isEmpty(const Interval& v) { + return v.lower > v.upper; + } + + /*! subset relation */ + template __forceinline bool subset( const Interval& a, const Interval& b ) { + return (a.lower > b.lower) && (a.upper < b.upper); + } + + template __forceinline bool subset( const Vec2>& a, const Vec2>& b ) { + return subset(a.x,b.x) && subset(a.y,b.y); + } + + template __forceinline const Vec2> intersect( const Vec2>& a, const Vec2>& b ) { + return Vec2>(intersect(a.x,b.x),intersect(a.y,b.y)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Interval select ( bool s, const Interval& t, const Interval& f ) { + return Interval(select(s,t.lower,f.lower),select(s,t.upper,f.upper)); + } + + template __forceinline Interval select ( const typename T::Bool& s, const Interval& t, const Interval& f ) { + return Interval(select(s,t.lower,f.lower),select(s,t.upper,f.upper)); + } + + __forceinline int numRoots(const Interval& p0, const Interval& p1) + { + float eps = 1E-4f; + bool neg0 = p0.lower < eps; bool pos0 = p0.upper > -eps; + bool neg1 = p1.lower < eps; bool pos1 = p1.upper > -eps; + return (neg0 && pos1) || (pos0 && neg1) || (neg0 && pos0) || (neg1 && pos1); + } + + typedef Interval Interval1f; + typedef Vec2> Interval2f; + typedef Vec3> Interval3f; + +inline void swap(float& a, float& b) { float tmp = a; a = b; b = tmp; } + +inline Interval1f shift(const Interval1f& v, float shift) { return Interval1f(v.lower + shift, v.upper + shift); } + +#define TWO_PI (2.0*M_PI) +inline Interval1f sin(Interval1f interval) +{ + if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); } + if (interval.upper > TWO_PI) { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); } + if (interval.lower < 0) { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); } + float sinLower = sin(interval.lower); + float sinUpper = sin(interval.upper); + if (sinLower > sinUpper) swap(sinLower, sinUpper); + if (interval.lower < M_PI / 2.0 && interval.upper > M_PI / 2.0) sinUpper = 1.0; + if (interval.lower < 3.0 * M_PI / 2.0 && interval.upper > 3.0 * M_PI / 2.0) sinLower = -1.0; + return Interval1f(sinLower, sinUpper); +} + +inline Interval1f cos(Interval1f interval) +{ + if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); } + if (interval.upper > TWO_PI) { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); } + if (interval.lower < 0) { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); } + float cosLower = cos(interval.lower); + float cosUpper = cos(interval.upper); + if (cosLower > cosUpper) swap(cosLower, cosUpper); + if (interval.lower < M_PI && interval.upper > M_PI) cosLower = -1.0; + return Interval1f(cosLower, cosUpper); +} +#undef TWO_PI +} diff --git a/thirdparty/embree-aarch64/common/math/lbbox.h b/thirdparty/embree-aarch64/common/math/lbbox.h new file mode 100644 index 0000000000..95df4a918d --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/lbbox.h @@ -0,0 +1,289 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bbox.h" +#include "range.h" + +namespace embree +{ + template + __forceinline std::pair globalLinear(const std::pair& v, const BBox1f& dt) + { + const float rcp_dt_size = float(1.0f)/dt.size(); + const T g0 = lerp(v.first,v.second,-dt.lower*rcp_dt_size); + const T g1 = lerp(v.first,v.second,(1.0f-dt.lower)*rcp_dt_size); + return std::make_pair(g0,g1); + } + + template + struct LBBox + { + public: + __forceinline LBBox () {} + + template + __forceinline LBBox ( const LBBox& other ) + : bounds0(other.bounds0), bounds1(other.bounds1) {} + + __forceinline LBBox& operator= ( const LBBox& other ) { + bounds0 = other.bounds0; bounds1 = other.bounds1; return *this; + } + + __forceinline LBBox (EmptyTy) + : bounds0(EmptyTy()), bounds1(EmptyTy()) {} + + __forceinline explicit LBBox ( const BBox& bounds) + : bounds0(bounds), bounds1(bounds) { } + + __forceinline LBBox ( const BBox& bounds0, const BBox& bounds1) + : bounds0(bounds0), bounds1(bounds1) { } + + LBBox ( const avector>& bounds ) + { + assert(bounds.size()); + BBox b0 = bounds.front(); + BBox b1 = bounds.back(); + for (size_t i=1; i bt = lerp(b0,b1,f); + const T dlower = min(bounds[i].lower-bt.lower,T(zero)); + const T dupper = max(bounds[i].upper-bt.upper,T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + bounds0 = b0; + bounds1 = b1; + } + + /*! calculates the linear bounds of a primitive for the specified time range */ + template + __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range, float numTimeSegments) + { + const float lower = time_range.lower*numTimeSegments; + const float upper = time_range.upper*numTimeSegments; + const float ilowerf = floor(lower); + const float iupperf = ceil(upper); + const int ilower = (int)ilowerf; + const int iupper = (int)iupperf; + + const BBox blower0 = bounds(ilower); + const BBox bupper1 = bounds(iupper); + + if (iupper-ilower == 1) { + bounds0 = lerp(blower0, bupper1, lower-ilowerf); + bounds1 = lerp(bupper1, blower0, iupperf-upper); + return; + } + + const BBox blower1 = bounds(ilower+1); + const BBox bupper0 = bounds(iupper-1); + BBox b0 = lerp(blower0, blower1, lower-ilowerf); + BBox b1 = lerp(bupper1, bupper0, iupperf-upper); + + for (int i = ilower+1; i < iupper; i++) + { + const float f = (float(i)/numTimeSegments - time_range.lower) / time_range.size(); + const BBox bt = lerp(b0, b1, f); + const BBox bi = bounds(i); + const T dlower = min(bi.lower-bt.lower, T(zero)); + const T dupper = max(bi.upper-bt.upper, T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + + bounds0 = b0; + bounds1 = b1; + } + + /*! calculates the linear bounds of a primitive for the specified time range */ + template + __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range_in, const BBox1f& geom_time_range, float geom_time_segments) + { + /* normalize global time_range_in to local geom_time_range */ + const BBox1f time_range((time_range_in.lower-geom_time_range.lower)/geom_time_range.size(), + (time_range_in.upper-geom_time_range.lower)/geom_time_range.size()); + + const float lower = time_range.lower*geom_time_segments; + const float upper = time_range.upper*geom_time_segments; + const float ilowerf = floor(lower); + const float iupperf = ceil(upper); + const float ilowerfc = max(0.0f,ilowerf); + const float iupperfc = min(iupperf,geom_time_segments); + const int ilowerc = (int)ilowerfc; + const int iupperc = (int)iupperfc; + assert(iupperc-ilowerc > 0); + + /* this larger iteration range guarantees that we process borders of geom_time_range is (partially) inside time_range_in */ + const int ilower_iter = max(-1,(int)ilowerf); + const int iupper_iter = min((int)iupperf,(int)geom_time_segments+1); + + const BBox blower0 = bounds(ilowerc); + const BBox bupper1 = bounds(iupperc); + if (iupper_iter-ilower_iter == 1) { + bounds0 = lerp(blower0, bupper1, max(0.0f,lower-ilowerfc)); + bounds1 = lerp(bupper1, blower0, max(0.0f,iupperfc-upper)); + return; + } + + const BBox blower1 = bounds(ilowerc+1); + const BBox bupper0 = bounds(iupperc-1); + BBox b0 = lerp(blower0, blower1, max(0.0f,lower-ilowerfc)); + BBox b1 = lerp(bupper1, bupper0, max(0.0f,iupperfc-upper)); + + for (int i = ilower_iter+1; i < iupper_iter; i++) + { + const float f = (float(i)/geom_time_segments - time_range.lower) / time_range.size(); + const BBox bt = lerp(b0, b1, f); + const BBox bi = bounds(i); + const T dlower = min(bi.lower-bt.lower, T(zero)); + const T dupper = max(bi.upper-bt.upper, T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + + bounds0 = b0; + bounds1 = b1; + } + + /*! calculates the linear bounds of a primitive for the specified time range */ + template + __forceinline LBBox(const BoundsFunc& bounds, const range& time_range, int numTimeSegments) + { + const int ilower = time_range.begin(); + const int iupper = time_range.end(); + + BBox b0 = bounds(ilower); + BBox b1 = bounds(iupper); + + if (iupper-ilower == 1) + { + bounds0 = b0; + bounds1 = b1; + return; + } + + for (int i = ilower+1; i bt = lerp(b0, b1, f); + const BBox bi = bounds(i); + const T dlower = min(bi.lower-bt.lower, T(zero)); + const T dupper = max(bi.upper-bt.upper, T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + + bounds0 = b0; + bounds1 = b1; + } + + public: + + __forceinline bool empty() const { + return bounds().empty(); + } + + __forceinline BBox bounds () const { + return merge(bounds0,bounds1); + } + + __forceinline BBox interpolate( const float t ) const { + return lerp(bounds0,bounds1,t); + } + + __forceinline LBBox interpolate( const BBox1f& dt ) const { + return LBBox(interpolate(dt.lower),interpolate(dt.upper)); + } + + __forceinline void extend( const LBBox& other ) { + bounds0.extend(other.bounds0); + bounds1.extend(other.bounds1); + } + + __forceinline float expectedHalfArea() const; + + __forceinline float expectedHalfArea(const BBox1f& dt) const { + return interpolate(dt).expectedHalfArea(); + } + + __forceinline float expectedApproxHalfArea() const { + return 0.5f*(halfArea(bounds0) + halfArea(bounds1)); + } + + /* calculates bounds for [0,1] time range from bounds in dt time range */ + __forceinline LBBox global(const BBox1f& dt) const + { + const float rcp_dt_size = 1.0f/dt.size(); + const BBox b0 = interpolate(-dt.lower*rcp_dt_size); + const BBox b1 = interpolate((1.0f-dt.lower)*rcp_dt_size); + return LBBox(b0,b1); + } + + /*! Comparison Operators */ + //template friend __forceinline bool operator==( const LBBox& a, const LBBox& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; } + //template friend __forceinline bool operator!=( const LBBox& a, const LBBox& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; } + friend __forceinline bool operator==( const LBBox& a, const LBBox& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; } + friend __forceinline bool operator!=( const LBBox& a, const LBBox& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; } + + /*! output operator */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const LBBox& box) { + return cout << "LBBox { " << box.bounds0 << "; " << box.bounds1 << " }"; + } + + public: + BBox bounds0, bounds1; + }; + + /*! tests if box is finite */ + template + __forceinline bool isvalid( const LBBox& v ) { + return isvalid(v.bounds0) && isvalid(v.bounds1); + } + + template + __forceinline bool isvalid_non_empty( const LBBox& v ) { + return isvalid_non_empty(v.bounds0) && isvalid_non_empty(v.bounds1); + } + + template + __forceinline T expectedArea(const T& a0, const T& a1, const T& b0, const T& b1) + { + const T da = a1-a0; + const T db = b1-b0; + return a0*b0+(a0*db+da*b0)*T(0.5f) + da*db*T(1.0f/3.0f); + } + + template<> __forceinline float LBBox::expectedHalfArea() const + { + const Vec3fa d0 = bounds0.size(); + const Vec3fa d1 = bounds1.size(); + return reduce_add(expectedArea(Vec3fa(d0.x,d0.y,d0.z), + Vec3fa(d1.x,d1.y,d1.z), + Vec3fa(d0.y,d0.z,d0.x), + Vec3fa(d1.y,d1.z,d1.x))); + } + + template + __forceinline float expectedApproxHalfArea(const LBBox& box) { + return box.expectedApproxHalfArea(); + } + + template + __forceinline LBBox merge(const LBBox& a, const LBBox& b) { + return LBBox(merge(a.bounds0, b.bounds0), merge(a.bounds1, b.bounds1)); + } + + /*! subset relation */ + template __inline bool subset( const LBBox& a, const LBBox& b ) { + return subset(a.bounds0,b.bounds0) && subset(a.bounds1,b.bounds1); + } + + /*! default template instantiations */ + typedef LBBox LBBox1f; + typedef LBBox LBBox2f; + typedef LBBox LBBox3f; + typedef LBBox LBBox3fa; + typedef LBBox LBBox3fx; +} diff --git a/thirdparty/embree-aarch64/common/math/linearspace2.h b/thirdparty/embree-aarch64/common/math/linearspace2.h new file mode 100644 index 0000000000..b9a382962c --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/linearspace2.h @@ -0,0 +1,148 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec2.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// 2D Linear Transform (2x2 Matrix) + //////////////////////////////////////////////////////////////////////////////// + + template struct LinearSpace2 + { + typedef T Vector; + typedef typename T::Scalar Scalar; + + /*! default matrix constructor */ + __forceinline LinearSpace2 ( ) {} + __forceinline LinearSpace2 ( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; } + __forceinline LinearSpace2& operator=( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; return *this; } + + template __forceinline LinearSpace2( const LinearSpace2& s ) : vx(s.vx), vy(s.vy) {} + + /*! matrix construction from column vectors */ + __forceinline LinearSpace2(const Vector& vx, const Vector& vy) + : vx(vx), vy(vy) {} + + /*! matrix construction from row mayor data */ + __forceinline LinearSpace2(const Scalar& m00, const Scalar& m01, + const Scalar& m10, const Scalar& m11) + : vx(m00,m10), vy(m01,m11) {} + + /*! compute the determinant of the matrix */ + __forceinline const Scalar det() const { return vx.x*vy.y - vx.y*vy.x; } + + /*! compute adjoint matrix */ + __forceinline const LinearSpace2 adjoint() const { return LinearSpace2(vy.y,-vy.x,-vx.y,vx.x); } + + /*! compute inverse matrix */ + __forceinline const LinearSpace2 inverse() const { return adjoint()/det(); } + + /*! compute transposed matrix */ + __forceinline const LinearSpace2 transposed() const { return LinearSpace2(vx.x,vx.y,vy.x,vy.y); } + + /*! returns first row of matrix */ + __forceinline Vector row0() const { return Vector(vx.x,vy.x); } + + /*! returns second row of matrix */ + __forceinline Vector row1() const { return Vector(vx.y,vy.y); } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline LinearSpace2( ZeroTy ) : vx(zero), vy(zero) {} + __forceinline LinearSpace2( OneTy ) : vx(one, zero), vy(zero, one) {} + + /*! return matrix for scaling */ + static __forceinline LinearSpace2 scale(const Vector& s) { + return LinearSpace2(s.x, 0, + 0 , s.y); + } + + /*! return matrix for rotation */ + static __forceinline LinearSpace2 rotate(const Scalar& r) { + Scalar s = sin(r), c = cos(r); + return LinearSpace2(c, -s, + s, c); + } + + /*! return closest orthogonal matrix (i.e. a general rotation including reflection) */ + LinearSpace2 orthogonal() const + { + LinearSpace2 m = *this; + + // mirrored? + Scalar mirror(one); + if (m.det() < Scalar(zero)) { + m.vx = -m.vx; + mirror = -mirror; + } + + // rotation + for (int i = 0; i < 99; i++) { + const LinearSpace2 m_next = 0.5 * (m + m.transposed().inverse()); + const LinearSpace2 d = m_next - m; + m = m_next; + // norm^2 of difference small enough? + if (max(dot(d.vx, d.vx), dot(d.vy, d.vy)) < 1e-8) + break; + } + + // rotation * mirror_x + return LinearSpace2(mirror*m.vx, m.vy); + } + + public: + + /*! the column vectors of the matrix */ + Vector vx,vy; + }; + + //////////////////////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline LinearSpace2 operator -( const LinearSpace2& a ) { return LinearSpace2(-a.vx,-a.vy); } + template __forceinline LinearSpace2 operator +( const LinearSpace2& a ) { return LinearSpace2(+a.vx,+a.vy); } + template __forceinline LinearSpace2 rcp ( const LinearSpace2& a ) { return a.inverse(); } + + //////////////////////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline LinearSpace2 operator +( const LinearSpace2& a, const LinearSpace2& b ) { return LinearSpace2(a.vx+b.vx,a.vy+b.vy); } + template __forceinline LinearSpace2 operator -( const LinearSpace2& a, const LinearSpace2& b ) { return LinearSpace2(a.vx-b.vx,a.vy-b.vy); } + + template __forceinline LinearSpace2 operator*(const typename T::Scalar & a, const LinearSpace2& b) { return LinearSpace2(a*b.vx, a*b.vy); } + template __forceinline T operator*(const LinearSpace2& a, const T & b) { return b.x*a.vx + b.y*a.vy; } + template __forceinline LinearSpace2 operator*(const LinearSpace2& a, const LinearSpace2& b) { return LinearSpace2(a*b.vx, a*b.vy); } + + template __forceinline LinearSpace2 operator/(const LinearSpace2& a, const typename T::Scalar & b) { return LinearSpace2(a.vx/b, a.vy/b); } + template __forceinline LinearSpace2 operator/(const LinearSpace2& a, const LinearSpace2& b) { return a * rcp(b); } + + template __forceinline LinearSpace2& operator *=( LinearSpace2& a, const LinearSpace2& b ) { return a = a * b; } + template __forceinline LinearSpace2& operator /=( LinearSpace2& a, const LinearSpace2& b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline bool operator ==( const LinearSpace2& a, const LinearSpace2& b ) { return a.vx == b.vx && a.vy == b.vy; } + template __forceinline bool operator !=( const LinearSpace2& a, const LinearSpace2& b ) { return a.vx != b.vx || a.vy != b.vy; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template static embree_ostream operator<<(embree_ostream cout, const LinearSpace2& m) { + return cout << "{ vx = " << m.vx << ", vy = " << m.vy << "}"; + } + + /*! Shortcuts for common linear spaces. */ + typedef LinearSpace2 LinearSpace2f; + typedef LinearSpace2 LinearSpace2fa; +} diff --git a/thirdparty/embree-aarch64/common/math/linearspace3.h b/thirdparty/embree-aarch64/common/math/linearspace3.h new file mode 100644 index 0000000000..12b5bb776b --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/linearspace3.h @@ -0,0 +1,213 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec3.h" +#include "quaternion.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// 3D Linear Transform (3x3 Matrix) + //////////////////////////////////////////////////////////////////////////////// + + template struct LinearSpace3 + { + typedef T Vector; + typedef typename T::Scalar Scalar; + + /*! default matrix constructor */ + __forceinline LinearSpace3 ( ) {} + __forceinline LinearSpace3 ( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; } + __forceinline LinearSpace3& operator=( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; return *this; } + + template __forceinline LinearSpace3( const LinearSpace3& s ) : vx(s.vx), vy(s.vy), vz(s.vz) {} + + /*! matrix construction from column vectors */ + __forceinline LinearSpace3(const Vector& vx, const Vector& vy, const Vector& vz) + : vx(vx), vy(vy), vz(vz) {} + + /*! construction from quaternion */ + __forceinline LinearSpace3( const QuaternionT& q ) + : vx((q.r*q.r + q.i*q.i - q.j*q.j - q.k*q.k), 2.0f*(q.i*q.j + q.r*q.k), 2.0f*(q.i*q.k - q.r*q.j)) + , vy(2.0f*(q.i*q.j - q.r*q.k), (q.r*q.r - q.i*q.i + q.j*q.j - q.k*q.k), 2.0f*(q.j*q.k + q.r*q.i)) + , vz(2.0f*(q.i*q.k + q.r*q.j), 2.0f*(q.j*q.k - q.r*q.i), (q.r*q.r - q.i*q.i - q.j*q.j + q.k*q.k)) {} + + /*! matrix construction from row mayor data */ + __forceinline LinearSpace3(const Scalar& m00, const Scalar& m01, const Scalar& m02, + const Scalar& m10, const Scalar& m11, const Scalar& m12, + const Scalar& m20, const Scalar& m21, const Scalar& m22) + : vx(m00,m10,m20), vy(m01,m11,m21), vz(m02,m12,m22) {} + + /*! compute the determinant of the matrix */ + __forceinline const Scalar det() const { return dot(vx,cross(vy,vz)); } + + /*! compute adjoint matrix */ + __forceinline const LinearSpace3 adjoint() const { return LinearSpace3(cross(vy,vz),cross(vz,vx),cross(vx,vy)).transposed(); } + + /*! compute inverse matrix */ + __forceinline const LinearSpace3 inverse() const { return adjoint()/det(); } + + /*! compute transposed matrix */ + __forceinline const LinearSpace3 transposed() const { return LinearSpace3(vx.x,vx.y,vx.z,vy.x,vy.y,vy.z,vz.x,vz.y,vz.z); } + + /*! returns first row of matrix */ + __forceinline Vector row0() const { return Vector(vx.x,vy.x,vz.x); } + + /*! returns second row of matrix */ + __forceinline Vector row1() const { return Vector(vx.y,vy.y,vz.y); } + + /*! returns third row of matrix */ + __forceinline Vector row2() const { return Vector(vx.z,vy.z,vz.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline LinearSpace3( ZeroTy ) : vx(zero), vy(zero), vz(zero) {} + __forceinline LinearSpace3( OneTy ) : vx(one, zero, zero), vy(zero, one, zero), vz(zero, zero, one) {} + + /*! return matrix for scaling */ + static __forceinline LinearSpace3 scale(const Vector& s) { + return LinearSpace3(s.x, 0, 0, + 0 , s.y, 0, + 0 , 0, s.z); + } + + /*! return matrix for rotation around arbitrary axis */ + static __forceinline LinearSpace3 rotate(const Vector& _u, const Scalar& r) { + Vector u = normalize(_u); + Scalar s = sin(r), c = cos(r); + return LinearSpace3(u.x*u.x+(1-u.x*u.x)*c, u.x*u.y*(1-c)-u.z*s, u.x*u.z*(1-c)+u.y*s, + u.x*u.y*(1-c)+u.z*s, u.y*u.y+(1-u.y*u.y)*c, u.y*u.z*(1-c)-u.x*s, + u.x*u.z*(1-c)-u.y*s, u.y*u.z*(1-c)+u.x*s, u.z*u.z+(1-u.z*u.z)*c); + } + + public: + + /*! the column vectors of the matrix */ + Vector vx,vy,vz; + }; + + /*! compute transposed matrix */ + template<> __forceinline const LinearSpace3 LinearSpace3::transposed() const { + vfloat4 rx,ry,rz; transpose((vfloat4&)vx,(vfloat4&)vy,(vfloat4&)vz,vfloat4(zero),rx,ry,rz); + return LinearSpace3(Vec3fa(rx),Vec3fa(ry),Vec3fa(rz)); + } + + template + __forceinline const LinearSpace3 transposed(const LinearSpace3& xfm) { + return xfm.transposed(); + } + + //////////////////////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline LinearSpace3 operator -( const LinearSpace3& a ) { return LinearSpace3(-a.vx,-a.vy,-a.vz); } + template __forceinline LinearSpace3 operator +( const LinearSpace3& a ) { return LinearSpace3(+a.vx,+a.vy,+a.vz); } + template __forceinline LinearSpace3 rcp ( const LinearSpace3& a ) { return a.inverse(); } + + /* constructs a coordinate frame form a normalized normal */ + template __forceinline LinearSpace3 frame(const T& N) + { + const T dx0(0,N.z,-N.y); + const T dx1(-N.z,0,N.x); + const T dx = normalize(select(dot(dx0,dx0) > dot(dx1,dx1),dx0,dx1)); + const T dy = normalize(cross(N,dx)); + return LinearSpace3(dx,dy,N); + } + + /* constructs a coordinate frame from a normal and approximate x-direction */ + template __forceinline LinearSpace3 frame(const T& N, const T& dxi) + { + if (abs(dot(dxi,N)) > 0.99f) return frame(N); // fallback in case N and dxi are very parallel + const T dx = normalize(cross(dxi,N)); + const T dy = normalize(cross(N,dx)); + return LinearSpace3(dx,dy,N); + } + + /* clamps linear space to range -1 to +1 */ + template __forceinline LinearSpace3 clamp(const LinearSpace3& space) { + return LinearSpace3(clamp(space.vx,T(-1.0f),T(1.0f)), + clamp(space.vy,T(-1.0f),T(1.0f)), + clamp(space.vz,T(-1.0f),T(1.0f))); + } + + //////////////////////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline LinearSpace3 operator +( const LinearSpace3& a, const LinearSpace3& b ) { return LinearSpace3(a.vx+b.vx,a.vy+b.vy,a.vz+b.vz); } + template __forceinline LinearSpace3 operator -( const LinearSpace3& a, const LinearSpace3& b ) { return LinearSpace3(a.vx-b.vx,a.vy-b.vy,a.vz-b.vz); } + + template __forceinline LinearSpace3 operator*(const typename T::Scalar & a, const LinearSpace3& b) { return LinearSpace3(a*b.vx, a*b.vy, a*b.vz); } + template __forceinline T operator*(const LinearSpace3& a, const T & b) { return madd(T(b.x),a.vx,madd(T(b.y),a.vy,T(b.z)*a.vz)); } + template __forceinline LinearSpace3 operator*(const LinearSpace3& a, const LinearSpace3& b) { return LinearSpace3(a*b.vx, a*b.vy, a*b.vz); } + + template __forceinline LinearSpace3 operator/(const LinearSpace3& a, const typename T::Scalar & b) { return LinearSpace3(a.vx/b, a.vy/b, a.vz/b); } + template __forceinline LinearSpace3 operator/(const LinearSpace3& a, const LinearSpace3& b) { return a * rcp(b); } + + template __forceinline LinearSpace3& operator *=( LinearSpace3& a, const LinearSpace3& b ) { return a = a * b; } + template __forceinline LinearSpace3& operator /=( LinearSpace3& a, const LinearSpace3& b ) { return a = a / b; } + + template __forceinline T xfmPoint (const LinearSpace3& s, const T & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } + template __forceinline T xfmVector(const LinearSpace3& s, const T & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } + template __forceinline T xfmNormal(const LinearSpace3& s, const T & a) { return xfmVector(s.inverse().transposed(),a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline bool operator ==( const LinearSpace3& a, const LinearSpace3& b ) { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; } + template __forceinline bool operator !=( const LinearSpace3& a, const LinearSpace3& b ) { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline LinearSpace3 select ( const typename T::Scalar::Bool& s, const LinearSpace3& t, const LinearSpace3& f ) { + return LinearSpace3(select(s,t.vx,f.vx),select(s,t.vy,f.vy),select(s,t.vz,f.vz)); + } + + /*! blending */ + template + __forceinline LinearSpace3 lerp(const LinearSpace3& l0, const LinearSpace3& l1, const float t) + { + return LinearSpace3(lerp(l0.vx,l1.vx,t), + lerp(l0.vy,l1.vy,t), + lerp(l0.vz,l1.vz,t)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template static embree_ostream operator<<(embree_ostream cout, const LinearSpace3& m) { + return cout << "{ vx = " << m.vx << ", vy = " << m.vy << ", vz = " << m.vz << "}"; + } + + /*! Shortcuts for common linear spaces. */ + typedef LinearSpace3 LinearSpace3f; + typedef LinearSpace3 LinearSpace3fa; + typedef LinearSpace3 LinearSpace3fx; + typedef LinearSpace3 LinearSpace3ff; + + template using LinearSpace3vf = LinearSpace3>>; + typedef LinearSpace3>> LinearSpace3vf4; + typedef LinearSpace3>> LinearSpace3vf8; + typedef LinearSpace3>> LinearSpace3vf16; + + /*! blending */ + template + __forceinline LinearSpace3 lerp(const LinearSpace3& l0, + const LinearSpace3& l1, + const S& t) + { + return LinearSpace3(lerp(l0.vx,l1.vx,t), + lerp(l0.vy,l1.vy,t), + lerp(l0.vz,l1.vz,t)); + } + +} diff --git a/thirdparty/embree-aarch64/common/math/math.h b/thirdparty/embree-aarch64/common/math/math.h new file mode 100644 index 0000000000..6d54abd44d --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/math.h @@ -0,0 +1,451 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/intrinsics.h" +#include "constants.h" +#include + +#if defined(__ARM_NEON) +#include "SSE2NEON.h" +#if defined(NEON_AVX2_EMULATION) +#include "AVX2NEON.h" +#endif +#else +#include +#include +#include +#endif + +#if defined(__WIN32__) && !defined(__MINGW32__) +#if (__MSV_VER <= 1700) +namespace std +{ + __forceinline bool isinf ( const float x ) { return _finite(x) == 0; } + __forceinline bool isnan ( const float x ) { return _isnan(x) != 0; } + __forceinline bool isfinite (const float x) { return _finite(x) != 0; } +} +#endif +#endif + +namespace embree +{ + __forceinline bool isvalid ( const float& v ) { + return (v > -FLT_LARGE) & (v < +FLT_LARGE); + } + + __forceinline int cast_f2i(float f) { + union { float f; int i; } v; v.f = f; return v.i; + } + + __forceinline float cast_i2f(int i) { + union { float f; int i; } v; v.i = i; return v.f; + } + + __forceinline int toInt (const float& a) { return int(a); } + __forceinline float toFloat(const int& a) { return float(a); } + +#if defined(__WIN32__) && !defined(__MINGW32__) + __forceinline bool finite ( const float x ) { return _finite(x) != 0; } +#endif + + __forceinline float sign ( const float x ) { return x<0?-1.0f:1.0f; } + __forceinline float sqr ( const float x ) { return x*x; } + + __forceinline float rcp ( const float x ) + { +#if defined(__aarch64__) + // Move scalar to vector register and do rcp. + __m128 a; + a[0] = x; + float32x4_t reciprocal = vrecpeq_f32(a); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + return reciprocal[0]; +#else + + const __m128 a = _mm_set_ss(x); + +#if defined(__AVX512VL__) + const __m128 r = _mm_rcp14_ss(_mm_set_ss(0.0f),a); +#else + const __m128 r = _mm_rcp_ss(a); +#endif + +#if defined(__AVX2__) + return _mm_cvtss_f32(_mm_mul_ss(r,_mm_fnmadd_ss(r, a, _mm_set_ss(2.0f)))); +#else + return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a)))); +#endif + +#endif //defined(__aarch64__) + } + + __forceinline float signmsk ( const float x ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + __m128i b; + a[0] = x; + b[0] = 0x80000000; + a = _mm_and_ps(a, vreinterpretq_f32_s32(b)); + return a[0]; +#else + return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); +#endif + } + __forceinline float xorf( const float x, const float y ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + __m128 b; + a[0] = x; + b[0] = y; + a = _mm_xor_ps(a, b); + return a[0]; +#else + return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y))); +#endif + } + __forceinline float andf( const float x, const unsigned y ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + __m128i b; + a[0] = x; + b[0] = y; + a = _mm_and_ps(a, vreinterpretq_f32_s32(b)); + return a[0]; +#else + return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y)))); +#endif + } + __forceinline float rsqrt( const float x ) + { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + a[0] = x; + __m128 value = _mm_rsqrt_ps(a); + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); + return value[0]; +#else + + const __m128 a = _mm_set_ss(x); +#if defined(__AVX512VL__) + const __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a); +#else + const __m128 r = _mm_rsqrt_ss(a); +#endif + const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), + _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); + return _mm_cvtss_f32(c); +#endif + } + +#if defined(__WIN32__) && (__MSC_VER <= 1700) && !defined(__MINGW32__) + __forceinline float nextafter(float x, float y) { if ((x0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); } + __forceinline double nextafter(double x, double y) { return _nextafter(x, y); } + __forceinline int roundf(float f) { return (int)(f + 0.5f); } +#else + __forceinline float nextafter(float x, float y) { return ::nextafterf(x, y); } + __forceinline double nextafter(double x, double y) { return ::nextafter(x, y); } +#endif + + __forceinline float abs ( const float x ) { return ::fabsf(x); } + __forceinline float acos ( const float x ) { return ::acosf (x); } + __forceinline float asin ( const float x ) { return ::asinf (x); } + __forceinline float atan ( const float x ) { return ::atanf (x); } + __forceinline float atan2( const float y, const float x ) { return ::atan2f(y, x); } + __forceinline float cos ( const float x ) { return ::cosf (x); } + __forceinline float cosh ( const float x ) { return ::coshf (x); } + __forceinline float exp ( const float x ) { return ::expf (x); } + __forceinline float fmod ( const float x, const float y ) { return ::fmodf (x, y); } + __forceinline float log ( const float x ) { return ::logf (x); } + __forceinline float log10( const float x ) { return ::log10f(x); } + __forceinline float pow ( const float x, const float y ) { return ::powf (x, y); } + __forceinline float sin ( const float x ) { return ::sinf (x); } + __forceinline float sinh ( const float x ) { return ::sinhf (x); } + __forceinline float sqrt ( const float x ) { return ::sqrtf (x); } + __forceinline float tan ( const float x ) { return ::tanf (x); } + __forceinline float tanh ( const float x ) { return ::tanhf (x); } + __forceinline float floor( const float x ) { return ::floorf (x); } + __forceinline float ceil ( const float x ) { return ::ceilf (x); } + __forceinline float frac ( const float x ) { return x-floor(x); } + + __forceinline double abs ( const double x ) { return ::fabs(x); } + __forceinline double sign ( const double x ) { return x<0?-1.0:1.0; } + __forceinline double acos ( const double x ) { return ::acos (x); } + __forceinline double asin ( const double x ) { return ::asin (x); } + __forceinline double atan ( const double x ) { return ::atan (x); } + __forceinline double atan2( const double y, const double x ) { return ::atan2(y, x); } + __forceinline double cos ( const double x ) { return ::cos (x); } + __forceinline double cosh ( const double x ) { return ::cosh (x); } + __forceinline double exp ( const double x ) { return ::exp (x); } + __forceinline double fmod ( const double x, const double y ) { return ::fmod (x, y); } + __forceinline double log ( const double x ) { return ::log (x); } + __forceinline double log10( const double x ) { return ::log10(x); } + __forceinline double pow ( const double x, const double y ) { return ::pow (x, y); } + __forceinline double rcp ( const double x ) { return 1.0/x; } + __forceinline double rsqrt( const double x ) { return 1.0/::sqrt(x); } + __forceinline double sin ( const double x ) { return ::sin (x); } + __forceinline double sinh ( const double x ) { return ::sinh (x); } + __forceinline double sqr ( const double x ) { return x*x; } + __forceinline double sqrt ( const double x ) { return ::sqrt (x); } + __forceinline double tan ( const double x ) { return ::tan (x); } + __forceinline double tanh ( const double x ) { return ::tanh (x); } + __forceinline double floor( const double x ) { return ::floor (x); } + __forceinline double ceil ( const double x ) { return ::ceil (x); } + +#if defined(__aarch64__) + __forceinline float mini(float a, float b) { + // FP and Neon shares same vector register in arm64 + __m128 x; + __m128 y; + x[0] = a; + y[0] = b; + x = _mm_min_ps(x, y); + return x[0]; + } +#elif defined(__SSE4_1__) + __forceinline float mini(float a, float b) { + const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); + const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); + const __m128i ci = _mm_min_epi32(ai,bi); + return _mm_cvtss_f32(_mm_castsi128_ps(ci)); + } +#endif + +#if defined(__aarch64__) + __forceinline float maxi(float a, float b) { + // FP and Neon shares same vector register in arm64 + __m128 x; + __m128 y; + x[0] = a; + y[0] = b; + x = _mm_max_ps(x, y); + return x[0]; + } +#elif defined(__SSE4_1__) + __forceinline float maxi(float a, float b) { + const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); + const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); + const __m128i ci = _mm_max_epi32(ai,bi); + return _mm_cvtss_f32(_mm_castsi128_ps(ci)); + } +#endif + + template + __forceinline T twice(const T& a) { return a+a; } + + __forceinline int min(int a, int b) { return a __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); } + template __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); } + template __forceinline T min(const T& a, const T& b, const T& c, const T& d, const T& e) { return min(min(min(a,b),min(c,d)),e); } + + template __forceinline T mini(const T& a, const T& b, const T& c) { return mini(mini(a,b),c); } + template __forceinline T mini(const T& a, const T& b, const T& c, const T& d) { return mini(mini(a,b),mini(c,d)); } + template __forceinline T mini(const T& a, const T& b, const T& c, const T& d, const T& e) { return mini(mini(mini(a,b),mini(c,d)),e); } + + __forceinline int max(int a, int b) { return a __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); } + template __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); } + template __forceinline T max(const T& a, const T& b, const T& c, const T& d, const T& e) { return max(max(max(a,b),max(c,d)),e); } + + template __forceinline T maxi(const T& a, const T& b, const T& c) { return maxi(maxi(a,b),c); } + template __forceinline T maxi(const T& a, const T& b, const T& c, const T& d) { return maxi(maxi(a,b),maxi(c,d)); } + template __forceinline T maxi(const T& a, const T& b, const T& c, const T& d, const T& e) { return maxi(maxi(maxi(a,b),maxi(c,d)),e); } + +#if defined(__MACOSX__) + __forceinline ssize_t min(ssize_t a, ssize_t b) { return a __forceinline T clamp(const T& x, const T& lower = T(zero), const T& upper = T(one)) { return max(min(x,upper),lower); } + template __forceinline T clampz(const T& x, const T& upper) { return max(T(zero), min(x,upper)); } + + template __forceinline T deg2rad ( const T& x ) { return x * T(1.74532925199432957692e-2f); } + template __forceinline T rad2deg ( const T& x ) { return x * T(5.72957795130823208768e1f); } + template __forceinline T sin2cos ( const T& x ) { return sqrt(max(T(zero),T(one)-x*x)); } + template __forceinline T cos2sin ( const T& x ) { return sin2cos(x); } + +#if defined(__AVX2__) + __forceinline float madd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } + __forceinline float msub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } + __forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } + __forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } +#elif defined (__aarch64__) && defined(__clang__) +#pragma clang fp contract(fast) + + +__forceinline float madd ( const float a, const float b, const float c) { return a*b + c; } +__forceinline float msub ( const float a, const float b, const float c) { return a*b - c; } +__forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; } +__forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); } + +#pragma clang fp contract(on) +#else + __forceinline float madd ( const float a, const float b, const float c) { return a*b+c; } + __forceinline float msub ( const float a, const float b, const float c) { return a*b-c; } + __forceinline float nmadd ( const float a, const float b, const float c) { return -a*b+c;} + __forceinline float nmsub ( const float a, const float b, const float c) { return -a*b-c; } +#endif + + /*! random functions */ + template T random() { return T(0); } +#if defined(_WIN32) + template<> __forceinline int random() { return int(rand()) ^ (int(rand()) << 8) ^ (int(rand()) << 16); } + template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 8) ^ (uint32_t(rand()) << 16); } +#else + template<> __forceinline int random() { return int(rand()); } + template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 16); } +#endif + template<> __forceinline float random() { return rand()/float(RAND_MAX); } + template<> __forceinline double random() { return rand()/double(RAND_MAX); } + +#if _WIN32 + __forceinline double drand48() { + return double(rand())/double(RAND_MAX); + } + + __forceinline void srand48(long seed) { + return srand(seed); + } +#endif + + /*! selects */ + __forceinline bool select(bool s, bool t , bool f) { return s ? t : f; } + __forceinline int select(bool s, int t, int f) { return s ? t : f; } + __forceinline float select(bool s, float t, float f) { return s ? t : f; } + + __forceinline bool all(bool s) { return s; } + + __forceinline float lerp(const float v0, const float v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + template + __forceinline T lerp2(const float x0, const float x1, const float x2, const float x3, const T& u, const T& v) { + return madd((1.0f-u),madd((1.0f-v),T(x0),v*T(x2)),u*madd((1.0f-v),T(x1),v*T(x3))); + } + + /*! exchange */ + template __forceinline void xchg ( T& a, T& b ) { const T tmp = a; a = b; b = tmp; } + + + template __forceinline T prod_diff(const T& a,const T& b,const T& c,const T& d) { +#if 1//!defined(__aarch64__) + return msub(a,b,c*d); +#else + return nmadd(c,d,a*b); +#endif + } + + /*! bit reverse operation */ + template + __forceinline T bitReverse(const T& vin) + { + T v = vin; + v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); + v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); + v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); + v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); + v = ( v >> 16 ) | ( v << 16); + return v; + } + + /*! bit interleave operation */ + template + __forceinline T bitInterleave(const T& xin, const T& yin, const T& zin) + { + T x = xin, y = yin, z = zin; + x = (x | (x << 16)) & 0x030000FF; + x = (x | (x << 8)) & 0x0300F00F; + x = (x | (x << 4)) & 0x030C30C3; + x = (x | (x << 2)) & 0x09249249; + + y = (y | (y << 16)) & 0x030000FF; + y = (y | (y << 8)) & 0x0300F00F; + y = (y | (y << 4)) & 0x030C30C3; + y = (y | (y << 2)) & 0x09249249; + + z = (z | (z << 16)) & 0x030000FF; + z = (z | (z << 8)) & 0x0300F00F; + z = (z | (z << 4)) & 0x030C30C3; + z = (z | (z << 2)) & 0x09249249; + + return x | (y << 1) | (z << 2); + } + +#if defined(__AVX2__) && !defined(__aarch64__) + + template<> + __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi) + { + const unsigned int xx = pdep(xi,0x49249249 /* 0b01001001001001001001001001001001 */ ); + const unsigned int yy = pdep(yi,0x92492492 /* 0b10010010010010010010010010010010 */); + const unsigned int zz = pdep(zi,0x24924924 /* 0b00100100100100100100100100100100 */); + return xx | yy | zz; + } + +#endif + + /*! bit interleave operation for 64bit data types*/ + template + __forceinline T bitInterleave64(const T& xin, const T& yin, const T& zin){ + T x = xin & 0x1fffff; + T y = yin & 0x1fffff; + T z = zin & 0x1fffff; + + x = (x | x << 32) & 0x1f00000000ffff; + x = (x | x << 16) & 0x1f0000ff0000ff; + x = (x | x << 8) & 0x100f00f00f00f00f; + x = (x | x << 4) & 0x10c30c30c30c30c3; + x = (x | x << 2) & 0x1249249249249249; + + y = (y | y << 32) & 0x1f00000000ffff; + y = (y | y << 16) & 0x1f0000ff0000ff; + y = (y | y << 8) & 0x100f00f00f00f00f; + y = (y | y << 4) & 0x10c30c30c30c30c3; + y = (y | y << 2) & 0x1249249249249249; + + z = (z | z << 32) & 0x1f00000000ffff; + z = (z | z << 16) & 0x1f0000ff0000ff; + z = (z | z << 8) & 0x100f00f00f00f00f; + z = (z | z << 4) & 0x10c30c30c30c30c3; + z = (z | z << 2) & 0x1249249249249249; + + return x | (y << 1) | (z << 2); + } +} diff --git a/thirdparty/embree-aarch64/common/math/obbox.h b/thirdparty/embree-aarch64/common/math/obbox.h new file mode 100644 index 0000000000..032b56904e --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/obbox.h @@ -0,0 +1,39 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bbox.h" +#include "linearspace3.h" + +namespace embree +{ + /*! Oriented bounding box */ + template + struct OBBox + { + public: + + __forceinline OBBox () {} + + __forceinline OBBox (EmptyTy) + : space(one), bounds(empty) {} + + __forceinline OBBox (const BBox& bounds) + : space(one), bounds(bounds) {} + + __forceinline OBBox (const LinearSpace3& space, const BBox& bounds) + : space(space), bounds(bounds) {} + + friend embree_ostream operator<<(embree_ostream cout, const OBBox& p) { + return cout << "{ space = " << p.space << ", bounds = " << p.bounds << "}"; + } + + public: + LinearSpace3 space; //!< orthonormal transformation + BBox bounds; //!< bounds in transformed space + }; + + typedef OBBox OBBox3f; + typedef OBBox OBBox3fa; +} diff --git a/thirdparty/embree-aarch64/common/math/quaternion.h b/thirdparty/embree-aarch64/common/math/quaternion.h new file mode 100644 index 0000000000..20c69bc62f --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/quaternion.h @@ -0,0 +1,254 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec3.h" +#include "vec4.h" + +#include "transcendental.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////// + // Quaternion Struct + //////////////////////////////////////////////////////////////// + + template + struct QuaternionT + { + typedef Vec3 Vector; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline QuaternionT () { } + __forceinline QuaternionT ( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; } + __forceinline QuaternionT& operator=( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; return *this; } + + __forceinline QuaternionT( const T& r ) : r(r), i(zero), j(zero), k(zero) {} + __forceinline explicit QuaternionT( const Vec3& v ) : r(zero), i(v.x), j(v.y), k(v.z) {} + __forceinline explicit QuaternionT( const Vec4& v ) : r(v.x), i(v.y), j(v.z), k(v.w) {} + __forceinline QuaternionT( const T& r, const T& i, const T& j, const T& k ) : r(r), i(i), j(j), k(k) {} + __forceinline QuaternionT( const T& r, const Vec3& v ) : r(r), i(v.x), j(v.y), k(v.z) {} + + __inline QuaternionT( const Vec3& vx, const Vec3& vy, const Vec3& vz ); + __inline QuaternionT( const T& yaw, const T& pitch, const T& roll ); + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline QuaternionT( ZeroTy ) : r(zero), i(zero), j(zero), k(zero) {} + __forceinline QuaternionT( OneTy ) : r( one), i(zero), j(zero), k(zero) {} + + /*! return quaternion for rotation around arbitrary axis */ + static __forceinline QuaternionT rotate(const Vec3& u, const T& r) { + return QuaternionT(cos(T(0.5)*r),sin(T(0.5)*r)*normalize(u)); + } + + /*! returns the rotation axis of the quaternion as a vector */ + __forceinline Vec3 v( ) const { return Vec3(i, j, k); } + + public: + T r, i, j, k; + }; + + template __forceinline QuaternionT operator *( const T & a, const QuaternionT& b ) { return QuaternionT(a * b.r, a * b.i, a * b.j, a * b.k); } + template __forceinline QuaternionT operator *( const QuaternionT& a, const T & b ) { return QuaternionT(a.r * b, a.i * b, a.j * b, a.k * b); } + + //////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////// + + template __forceinline QuaternionT operator +( const QuaternionT& a ) { return QuaternionT(+a.r, +a.i, +a.j, +a.k); } + template __forceinline QuaternionT operator -( const QuaternionT& a ) { return QuaternionT(-a.r, -a.i, -a.j, -a.k); } + template __forceinline QuaternionT conj ( const QuaternionT& a ) { return QuaternionT(a.r, -a.i, -a.j, -a.k); } + template __forceinline T abs ( const QuaternionT& a ) { return sqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } + template __forceinline QuaternionT rcp ( const QuaternionT& a ) { return conj(a)*rcp(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } + template __forceinline QuaternionT normalize ( const QuaternionT& a ) { return a*rsqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } + + // evaluates a*q-r + template __forceinline QuaternionT + msub(const T& a, const QuaternionT& q, const QuaternionT& p) + { + return QuaternionT(msub(a, q.r, p.r), + msub(a, q.i, p.i), + msub(a, q.j, p.j), + msub(a, q.k, p.k)); + } + // evaluates a*q-r + template __forceinline QuaternionT + madd (const T& a, const QuaternionT& q, const QuaternionT& p) + { + return QuaternionT(madd(a, q.r, p.r), + madd(a, q.i, p.i), + madd(a, q.j, p.j), + madd(a, q.k, p.k)); + } + + //////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////// + + template __forceinline QuaternionT operator +( const T & a, const QuaternionT& b ) { return QuaternionT(a + b.r, b.i, b.j, b.k); } + template __forceinline QuaternionT operator +( const QuaternionT& a, const T & b ) { return QuaternionT(a.r + b, a.i, a.j, a.k); } + template __forceinline QuaternionT operator +( const QuaternionT& a, const QuaternionT& b ) { return QuaternionT(a.r + b.r, a.i + b.i, a.j + b.j, a.k + b.k); } + template __forceinline QuaternionT operator -( const T & a, const QuaternionT& b ) { return QuaternionT(a - b.r, -b.i, -b.j, -b.k); } + template __forceinline QuaternionT operator -( const QuaternionT& a, const T & b ) { return QuaternionT(a.r - b, a.i, a.j, a.k); } + template __forceinline QuaternionT operator -( const QuaternionT& a, const QuaternionT& b ) { return QuaternionT(a.r - b.r, a.i - b.i, a.j - b.j, a.k - b.k); } + + template __forceinline Vec3 operator *( const QuaternionT& a, const Vec3 & b ) { return (a*QuaternionT(b)*conj(a)).v(); } + template __forceinline QuaternionT operator *( const QuaternionT& a, const QuaternionT& b ) { + return QuaternionT(a.r*b.r - a.i*b.i - a.j*b.j - a.k*b.k, + a.r*b.i + a.i*b.r + a.j*b.k - a.k*b.j, + a.r*b.j - a.i*b.k + a.j*b.r + a.k*b.i, + a.r*b.k + a.i*b.j - a.j*b.i + a.k*b.r); + } + template __forceinline QuaternionT operator /( const T & a, const QuaternionT& b ) { return a*rcp(b); } + template __forceinline QuaternionT operator /( const QuaternionT& a, const T & b ) { return a*rcp(b); } + template __forceinline QuaternionT operator /( const QuaternionT& a, const QuaternionT& b ) { return a*rcp(b); } + + template __forceinline QuaternionT& operator +=( QuaternionT& a, const T & b ) { return a = a+b; } + template __forceinline QuaternionT& operator +=( QuaternionT& a, const QuaternionT& b ) { return a = a+b; } + template __forceinline QuaternionT& operator -=( QuaternionT& a, const T & b ) { return a = a-b; } + template __forceinline QuaternionT& operator -=( QuaternionT& a, const QuaternionT& b ) { return a = a-b; } + template __forceinline QuaternionT& operator *=( QuaternionT& a, const T & b ) { return a = a*b; } + template __forceinline QuaternionT& operator *=( QuaternionT& a, const QuaternionT& b ) { return a = a*b; } + template __forceinline QuaternionT& operator /=( QuaternionT& a, const T & b ) { return a = a*rcp(b); } + template __forceinline QuaternionT& operator /=( QuaternionT& a, const QuaternionT& b ) { return a = a*rcp(b); } + + template __forceinline QuaternionT + select(const M& m, const QuaternionT& q, const QuaternionT& p) + { + return QuaternionT(select(m, q.r, p.r), + select(m, q.i, p.i), + select(m, q.j, p.j), + select(m, q.k, p.k)); + } + + + template __forceinline Vec3 xfmPoint ( const QuaternionT& a, const Vec3& b ) { return (a*QuaternionT(b)*conj(a)).v(); } + template __forceinline Vec3 xfmVector( const QuaternionT& a, const Vec3& b ) { return (a*QuaternionT(b)*conj(a)).v(); } + template __forceinline Vec3 xfmNormal( const QuaternionT& a, const Vec3& b ) { return (a*QuaternionT(b)*conj(a)).v(); } + + template __forceinline T dot(const QuaternionT& a, const QuaternionT& b) { return a.r*b.r + a.i*b.i + a.j*b.j + a.k*b.k; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline bool operator ==( const QuaternionT& a, const QuaternionT& b ) { return a.r == b.r && a.i == b.i && a.j == b.j && a.k == b.k; } + template __forceinline bool operator !=( const QuaternionT& a, const QuaternionT& b ) { return a.r != b.r || a.i != b.i || a.j != b.j || a.k != b.k; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Orientation Functions + //////////////////////////////////////////////////////////////////////////////// + + template QuaternionT::QuaternionT( const Vec3& vx, const Vec3& vy, const Vec3& vz ) + { + if ( vx.x + vy.y + vz.z >= T(zero) ) + { + const T t = T(one) + (vx.x + vy.y + vz.z); + const T s = rsqrt(t)*T(0.5f); + r = t*s; + i = (vy.z - vz.y)*s; + j = (vz.x - vx.z)*s; + k = (vx.y - vy.x)*s; + } + else if ( vx.x >= max(vy.y, vz.z) ) + { + const T t = (T(one) + vx.x) - (vy.y + vz.z); + const T s = rsqrt(t)*T(0.5f); + r = (vy.z - vz.y)*s; + i = t*s; + j = (vx.y + vy.x)*s; + k = (vz.x + vx.z)*s; + } + else if ( vy.y >= vz.z ) // if ( vy.y >= max(vz.z, vx.x) ) + { + const T t = (T(one) + vy.y) - (vz.z + vx.x); + const T s = rsqrt(t)*T(0.5f); + r = (vz.x - vx.z)*s; + i = (vx.y + vy.x)*s; + j = t*s; + k = (vy.z + vz.y)*s; + } + else //if ( vz.z >= max(vy.y, vx.x) ) + { + const T t = (T(one) + vz.z) - (vx.x + vy.y); + const T s = rsqrt(t)*T(0.5f); + r = (vx.y - vy.x)*s; + i = (vz.x + vx.z)*s; + j = (vy.z + vz.y)*s; + k = t*s; + } + } + + template QuaternionT::QuaternionT( const T& yaw, const T& pitch, const T& roll ) + { + const T cya = cos(yaw *T(0.5f)); + const T cpi = cos(pitch*T(0.5f)); + const T cro = cos(roll *T(0.5f)); + const T sya = sin(yaw *T(0.5f)); + const T spi = sin(pitch*T(0.5f)); + const T sro = sin(roll *T(0.5f)); + r = cro*cya*cpi + sro*sya*spi; + i = cro*cya*spi + sro*sya*cpi; + j = cro*sya*cpi - sro*cya*spi; + k = sro*cya*cpi - cro*sya*spi; + } + + ////////////////////////////////////////////////////////////////////////////// + /// Output Operators + ////////////////////////////////////////////////////////////////////////////// + + template static embree_ostream operator<<(embree_ostream cout, const QuaternionT& q) { + return cout << "{ r = " << q.r << ", i = " << q.i << ", j = " << q.j << ", k = " << q.k << " }"; + } + + /*! default template instantiations */ + typedef QuaternionT Quaternion3f; + typedef QuaternionT Quaternion3d; + + template using Quaternion3vf = QuaternionT>; + typedef QuaternionT> Quaternion3vf4; + typedef QuaternionT> Quaternion3vf8; + typedef QuaternionT> Quaternion3vf16; + + ////////////////////////////////////////////////////////////////////////////// + /// Interpolation + ////////////////////////////////////////////////////////////////////////////// + template + __forceinline QuaternionTlerp(const QuaternionT& q0, + const QuaternionT& q1, + const T& factor) + { + QuaternionT q; + q.r = lerp(q0.r, q1.r, factor); + q.i = lerp(q0.i, q1.i, factor); + q.j = lerp(q0.j, q1.j, factor); + q.k = lerp(q0.k, q1.k, factor); + return q; + } + + template + __forceinline QuaternionT slerp(const QuaternionT& q0, + const QuaternionT& q1_, + const T& t) + { + T cosTheta = dot(q0, q1_); + QuaternionT q1 = select(cosTheta < 0.f, -q1_, q1_); + cosTheta = select(cosTheta < 0.f, -cosTheta, cosTheta); + if (unlikely(all(cosTheta > 0.9995f))) { + return normalize(lerp(q0, q1, t)); + } + const T phi = t * fastapprox::acos(cosTheta); + T sinPhi, cosPhi; + fastapprox::sincos(phi, sinPhi, cosPhi); + QuaternionT qperp = sinPhi * normalize(msub(cosTheta, q0, q1)); + return msub(cosPhi, q0, qperp); + } +} diff --git a/thirdparty/embree-aarch64/common/math/range.h b/thirdparty/embree-aarch64/common/math/range.h new file mode 100644 index 0000000000..762d9cd9ea --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/range.h @@ -0,0 +1,137 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../math/math.h" + +namespace embree +{ + template + struct range + { + __forceinline range() {} + + __forceinline range(const Ty& begin) + : _begin(begin), _end(begin+1) {} + + __forceinline range(const Ty& begin, const Ty& end) + : _begin(begin), _end(end) {} + + __forceinline range(const range& other) + : _begin(other._begin), _end(other._end) {} + + template + __forceinline range(const range& other) + : _begin(Ty(other._begin)), _end(Ty(other._end)) {} + + template + __forceinline range& operator =(const range& other) { + _begin = other._begin; + _end = other._end; + return *this; + } + + __forceinline Ty begin() const { + return _begin; + } + + __forceinline Ty end() const { + return _end; + } + + __forceinline range intersect(const range& r) const { + return range (max(_begin,r._begin),min(_end,r._end)); + } + + __forceinline Ty size() const { + return _end - _begin; + } + + __forceinline bool empty() const { + return _end <= _begin; + } + + __forceinline Ty center() const { + return (_begin + _end)/2; + } + + __forceinline std::pair split() const + { + const Ty _center = center(); + return std::make_pair(range(_begin,_center),range(_center,_end)); + } + + __forceinline void split(range& left_o, range& right_o) const + { + const Ty _center = center(); + left_o = range(_begin,_center); + right_o = range(_center,_end); + } + + __forceinline friend bool operator< (const range& r0, const range& r1) { + return r0.size() < r1.size(); + } + + friend embree_ostream operator<<(embree_ostream cout, const range& r) { + return cout << "range [" << r.begin() << ", " << r.end() << "]"; + } + + Ty _begin, _end; + }; + + template + range make_range(const Ty& begin, const Ty& end) { + return range(begin,end); + } + + template + struct extended_range : public range + { + __forceinline extended_range () {} + + __forceinline extended_range (const Ty& begin) + : range(begin), _ext_end(begin+1) {} + + __forceinline extended_range (const Ty& begin, const Ty& end) + : range(begin,end), _ext_end(end) {} + + __forceinline extended_range (const Ty& begin, const Ty& end, const Ty& ext_end) + : range(begin,end), _ext_end(ext_end) {} + + __forceinline Ty ext_end() const { + return _ext_end; + } + + __forceinline Ty ext_size() const { + return _ext_end - range::_begin; + } + + __forceinline Ty ext_range_size() const { + return _ext_end - range::_end; + } + + __forceinline bool has_ext_range() const { + assert(_ext_end >= range::_end); + return (_ext_end - range::_end) > 0; + } + + __forceinline void set_ext_range(const size_t ext_end){ + assert(ext_end >= range::_end); + _ext_end = ext_end; + } + + __forceinline void move_right(const size_t plus){ + range::_begin += plus; + range::_end += plus; + _ext_end += plus; + } + + friend embree_ostream operator<<(embree_ostream cout, const extended_range& r) { + return cout << "extended_range [" << r.begin() << ", " << r.end() << " (" << r.ext_end() << ")]"; + } + + Ty _ext_end; + }; +} diff --git a/thirdparty/embree-aarch64/common/math/transcendental.h b/thirdparty/embree-aarch64/common/math/transcendental.h new file mode 100644 index 0000000000..6855d82b53 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/transcendental.h @@ -0,0 +1,525 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +// Transcendental functions from "ispc": https://github.com/ispc/ispc/ +// Most of the transcendental implementations in ispc code come from +// Solomon Boulos's "syrah": https://github.com/boulos/syrah/ + +#include "../simd/simd.h" + +namespace embree +{ + +namespace fastapprox +{ + +template +__forceinline T sin(const T &v) +{ + static const float piOverTwoVec = 1.57079637050628662109375; + static const float twoOverPiVec = 0.636619746685028076171875; + auto scaled = v * twoOverPiVec; + auto kReal = floor(scaled); + auto k = toInt(kReal); + + // Reduced range version of x + auto x = v - kReal * piOverTwoVec; + auto kMod4 = k & 3; + auto sinUseCos = (kMod4 == 1 | kMod4 == 3); + auto flipSign = (kMod4 > 1); + + // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2, + // 4, 6, 8, 10|], [|single...|], [0;Pi/2]); + static const float sinC2 = -0.16666667163372039794921875; + static const float sinC4 = +8.333347737789154052734375e-3; + static const float sinC6 = -1.9842604524455964565277099609375e-4; + static const float sinC8 = +2.760012648650445044040679931640625e-6; + static const float sinC10 = -2.50293279435709337121807038784027099609375e-8; + + static const float cosC2 = -0.5; + static const float cosC4 = +4.166664183139801025390625e-2; + static const float cosC6 = -1.388833043165504932403564453125e-3; + static const float cosC8 = +2.47562347794882953166961669921875e-5; + static const float cosC10 = -2.59630184018533327616751194000244140625e-7; + + auto outside = select(sinUseCos, 1., x); + auto c2 = select(sinUseCos, T(cosC2), T(sinC2)); + auto c4 = select(sinUseCos, T(cosC4), T(sinC4)); + auto c6 = select(sinUseCos, T(cosC6), T(sinC6)); + auto c8 = select(sinUseCos, T(cosC8), T(sinC8)); + auto c10 = select(sinUseCos, T(cosC10), T(sinC10)); + + auto x2 = x * x; + auto formula = x2 * c10 + c8; + formula = x2 * formula + c6; + formula = x2 * formula + c4; + formula = x2 * formula + c2; + formula = x2 * formula + 1.; + formula *= outside; + + formula = select(flipSign, -formula, formula); + return formula; +} + +template +__forceinline T cos(const T &v) +{ + static const float piOverTwoVec = 1.57079637050628662109375; + static const float twoOverPiVec = 0.636619746685028076171875; + auto scaled = v * twoOverPiVec; + auto kReal = floor(scaled); + auto k = toInt(kReal); + + // Reduced range version of x + auto x = v - kReal * piOverTwoVec; + + auto kMod4 = k & 3; + auto cosUseCos = (kMod4 == 0 | kMod4 == 2); + auto flipSign = (kMod4 == 1 | kMod4 == 2); + + const float sinC2 = -0.16666667163372039794921875; + const float sinC4 = +8.333347737789154052734375e-3; + const float sinC6 = -1.9842604524455964565277099609375e-4; + const float sinC8 = +2.760012648650445044040679931640625e-6; + const float sinC10 = -2.50293279435709337121807038784027099609375e-8; + + const float cosC2 = -0.5; + const float cosC4 = +4.166664183139801025390625e-2; + const float cosC6 = -1.388833043165504932403564453125e-3; + const float cosC8 = +2.47562347794882953166961669921875e-5; + const float cosC10 = -2.59630184018533327616751194000244140625e-7; + + auto outside = select(cosUseCos, 1., x); + auto c2 = select(cosUseCos, T(cosC2), T(sinC2)); + auto c4 = select(cosUseCos, T(cosC4), T(sinC4)); + auto c6 = select(cosUseCos, T(cosC6), T(sinC6)); + auto c8 = select(cosUseCos, T(cosC8), T(sinC8)); + auto c10 = select(cosUseCos, T(cosC10), T(sinC10)); + + auto x2 = x * x; + auto formula = x2 * c10 + c8; + formula = x2 * formula + c6; + formula = x2 * formula + c4; + formula = x2 * formula + c2; + formula = x2 * formula + 1.; + formula *= outside; + + formula = select(flipSign, -formula, formula); + return formula; +} + +template +__forceinline void sincos(const T &v, T &sinResult, T &cosResult) +{ + const float piOverTwoVec = 1.57079637050628662109375; + const float twoOverPiVec = 0.636619746685028076171875; + auto scaled = v * twoOverPiVec; + auto kReal = floor(scaled); + auto k = toInt(kReal); + + // Reduced range version of x + auto x = v - kReal * piOverTwoVec; + auto kMod4 = k & 3; + auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2)); + auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3)); + auto sinFlipSign = (kMod4 > 1); + auto cosFlipSign = ((kMod4 == 1) | (kMod4 == 2)); + + const float oneVec = +1.; + const float sinC2 = -0.16666667163372039794921875; + const float sinC4 = +8.333347737789154052734375e-3; + const float sinC6 = -1.9842604524455964565277099609375e-4; + const float sinC8 = +2.760012648650445044040679931640625e-6; + const float sinC10 = -2.50293279435709337121807038784027099609375e-8; + + const float cosC2 = -0.5; + const float cosC4 = +4.166664183139801025390625e-2; + const float cosC6 = -1.388833043165504932403564453125e-3; + const float cosC8 = +2.47562347794882953166961669921875e-5; + const float cosC10 = -2.59630184018533327616751194000244140625e-7; + + auto x2 = x * x; + + auto sinFormula = x2 * sinC10 + sinC8; + auto cosFormula = x2 * cosC10 + cosC8; + sinFormula = x2 * sinFormula + sinC6; + cosFormula = x2 * cosFormula + cosC6; + + sinFormula = x2 * sinFormula + sinC4; + cosFormula = x2 * cosFormula + cosC4; + + sinFormula = x2 * sinFormula + sinC2; + cosFormula = x2 * cosFormula + cosC2; + + sinFormula = x2 * sinFormula + oneVec; + cosFormula = x2 * cosFormula + oneVec; + + sinFormula *= x; + + sinResult = select(sinUseCos, cosFormula, sinFormula); + cosResult = select(cosUseCos, cosFormula, sinFormula); + + sinResult = select(sinFlipSign, -sinResult, sinResult); + cosResult = select(cosFlipSign, -cosResult, cosResult); +} + +template +__forceinline T tan(const T &v) +{ + const float piOverFourVec = 0.785398185253143310546875; + const float fourOverPiVec = 1.27323949337005615234375; + + auto xLt0 = v < 0.; + auto y = select(xLt0, -v, v); + auto scaled = y * fourOverPiVec; + + auto kReal = floor(scaled); + auto k = toInt(kReal); + + auto x = y - kReal * piOverFourVec; + + // If k & 1, x -= Pi/4 + auto needOffset = (k & 1) != 0; + x = select(needOffset, x - piOverFourVec, x); + + // If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To... + auto kMod4 = k & 3; + auto useCotan = (kMod4 == 1) | (kMod4 == 2); + + const float oneVec = 1.0; + + const float tanC2 = +0.33333075046539306640625; + const float tanC4 = +0.13339905440807342529296875; + const float tanC6 = +5.3348250687122344970703125e-2; + const float tanC8 = +2.46033705770969390869140625e-2; + const float tanC10 = +2.892402000725269317626953125e-3; + const float tanC12 = +9.500005282461643218994140625e-3; + + const float cotC2 = -0.3333333432674407958984375; + const float cotC4 = -2.222204394638538360595703125e-2; + const float cotC6 = -2.11752182804048061370849609375e-3; + const float cotC8 = -2.0846328698098659515380859375e-4; + const float cotC10 = -2.548247357481159269809722900390625e-5; + const float cotC12 = -3.5257363606433500535786151885986328125e-7; + + auto x2 = x * x; + T z; + if (any(useCotan)) + { + auto cotVal = x2 * cotC12 + cotC10; + cotVal = x2 * cotVal + cotC8; + cotVal = x2 * cotVal + cotC6; + cotVal = x2 * cotVal + cotC4; + cotVal = x2 * cotVal + cotC2; + cotVal = x2 * cotVal + oneVec; + // The equation is for x * cot(x) but we need -x * cot(x) for the tan part. + cotVal /= -x; + z = cotVal; + } + auto useTan = !useCotan; + if (any(useTan)) + { + auto tanVal = x2 * tanC12 + tanC10; + tanVal = x2 * tanVal + tanC8; + tanVal = x2 * tanVal + tanC6; + tanVal = x2 * tanVal + tanC4; + tanVal = x2 * tanVal + tanC2; + tanVal = x2 * tanVal + oneVec; + // Equation was for tan(x)/x + tanVal *= x; + z = select(useTan, tanVal, z); + } + return select(xLt0, -z, z); +} + +template +__forceinline T asin(const T &x0) +{ + auto isneg = (x0 < 0.f); + auto x = abs(x0); + auto isnan = (x > 1.f); + + // sollya + // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|], + // [1e-20;.9999999999999999]); + // avg error: 1.1105439e-06, max error 1.3187528e-06 + auto v = 1.57079517841339111328125f + + x * (-0.21450997889041900634765625f + + x * (8.78556668758392333984375e-2f + + x * (-4.489909112453460693359375e-2f + + x * (1.928029954433441162109375e-2f + + x * (-4.3095736764371395111083984375e-3f))))); + + v *= -sqrt(1.f - x); + v = v + 1.57079637050628662109375f; + + v = select(v < 0.f, T(0.f), v); + v = select(isneg, -v, v); + v = select(isnan, T(cast_i2f(0x7fc00000)), v); + + return v; +} + +template +__forceinline T acos(const T &v) +{ + return 1.57079637050628662109375f - asin(v); +} + +template +__forceinline T atan(const T &v) +{ + const float piOverTwoVec = 1.57079637050628662109375; + // atan(-x) = -atan(x) (so flip from negative to positive first) + // If x > 1 -> atan(x) = Pi/2 - atan(1/x) + auto xNeg = v < 0.f; + auto xFlipped = select(xNeg, -v, v); + + auto xGt1 = xFlipped > 1.; + auto x = select(xGt1, rcpSafe(xFlipped), xFlipped); + + // These coefficients approximate atan(x)/x + const float atanC0 = +0.99999988079071044921875; + const float atanC2 = -0.3333191573619842529296875; + const float atanC4 = +0.199689209461212158203125; + const float atanC6 = -0.14015688002109527587890625; + const float atanC8 = +9.905083477497100830078125e-2; + const float atanC10 = -5.93664981424808502197265625e-2; + const float atanC12 = +2.417283318936824798583984375e-2; + const float atanC14 = -4.6721356920897960662841796875e-3; + + auto x2 = x * x; + auto result = x2 * atanC14 + atanC12; + result = x2 * result + atanC10; + result = x2 * result + atanC8; + result = x2 * result + atanC6; + result = x2 * result + atanC4; + result = x2 * result + atanC2; + result = x2 * result + atanC0; + result *= x; + + result = select(xGt1, piOverTwoVec - result, result); + result = select(xNeg, -result, result); + return result; +} + +template +__forceinline T atan2(const T &y, const T &x) +{ + const float piVec = 3.1415926536; + // atan2(y, x) = + // + // atan2(y > 0, x = +-0) -> Pi/2 + // atan2(y < 0, x = +-0) -> -Pi/2 + // atan2(y = +-0, x < +0) -> +-Pi + // atan2(y = +-0, x >= +0) -> +-0 + // + // atan2(y >= 0, x < 0) -> Pi + atan(y/x) + // atan2(y < 0, x < 0) -> -Pi + atan(y/x) + // atan2(y, x > 0) -> atan(y/x) + // + // and then a bunch of code for dealing with infinities. + auto yOverX = y * rcpSafe(x); + auto atanArg = atan(yOverX); + auto xLt0 = x < 0.f; + auto yLt0 = y < 0.f; + auto offset = select(xLt0, + select(yLt0, T(-piVec), T(piVec)), 0.f); + return offset + atanArg; +} + +template +__forceinline T exp(const T &v) +{ + const float ln2Part1 = 0.6931457519; + const float ln2Part2 = 1.4286067653e-6; + const float oneOverLn2 = 1.44269502162933349609375; + + auto scaled = v * oneOverLn2; + auto kReal = floor(scaled); + auto k = toInt(kReal); + + // Reduced range version of x + auto x = v - kReal * ln2Part1; + x -= kReal * ln2Part2; + + // These coefficients are for e^x in [0, ln(2)] + const float one = 1.; + const float c2 = 0.4999999105930328369140625; + const float c3 = 0.166668415069580078125; + const float c4 = 4.16539050638675689697265625e-2; + const float c5 = 8.378830738365650177001953125e-3; + const float c6 = 1.304379315115511417388916015625e-3; + const float c7 = 2.7555381529964506626129150390625e-4; + + auto result = x * c7 + c6; + result = x * result + c5; + result = x * result + c4; + result = x * result + c3; + result = x * result + c2; + result = x * result + one; + result = x * result + one; + + // Compute 2^k (should differ for float and double, but I'll avoid + // it for now and just do floats) + const int fpbias = 127; + auto biasedN = k + fpbias; + auto overflow = kReal > fpbias; + // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0) + // we've got underflow. -127 * ln(2) -> -88.02. So the most + // negative float input that doesn't result in zero is like -88. + auto underflow = kReal <= -fpbias; + const int infBits = 0x7f800000; + biasedN <<= 23; + // Reinterpret this thing as float + auto twoToTheN = asFloat(biasedN); + // Handle both doubles and floats (hopefully eliding the copy for float) + auto elemtype2n = twoToTheN; + result *= elemtype2n; + result = select(overflow, cast_i2f(infBits), result); + result = select(underflow, 0., result); + return result; +} + +// Range reduction for logarithms takes log(x) -> log(2^n * y) -> n +// * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)). +template +__forceinline void __rangeReduceLog(const T &input, + T &reduced, + R &exponent) +{ + auto intVersion = asInt(input); + // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM + // exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000 + // 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0 + // non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111 + // = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF + + //const int exponentMask(0x7F800000) + static const int nonexponentMask = 0x807FFFFF; + + // We want the reduced version to have an exponent of -1 which is + // -1 + 127 after biasing or 126 + static const int exponentNeg1 = (126l << 23); + // NOTE(boulos): We don't need to mask anything out since we know + // the sign bit has to be 0. If it's 1, we need to return infinity/nan + // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN). + auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128] + + auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2 + exponent = offsetExponent - 127; // get the real value + + // Blend the offset_exponent with the original input (do this in + // int for now, until I decide if float can have & and ¬) + auto blended = (intVersion & nonexponentMask) | (exponentNeg1); + reduced = asFloat(blended); +} + +template struct ExponentType { }; +template struct ExponentType> { typedef vint Ty; }; +template <> struct ExponentType { typedef int Ty; }; + +template +__forceinline T log(const T &v) +{ + T reduced; + typename ExponentType::Ty exponent; + + const int nanBits = 0x7fc00000; + const int negInfBits = 0xFF800000; + const float nan = cast_i2f(nanBits); + const float negInf = cast_i2f(negInfBits); + auto useNan = v < 0.; + auto useInf = v == 0.; + auto exceptional = useNan | useInf; + const float one = 1.0; + + auto patched = select(exceptional, one, v); + __rangeReduceLog(patched, reduced, exponent); + + const float ln2 = 0.693147182464599609375; + + auto x1 = one - reduced; + const float c1 = +0.50000095367431640625; + const float c2 = +0.33326041698455810546875; + const float c3 = +0.2519190013408660888671875; + const float c4 = +0.17541764676570892333984375; + const float c5 = +0.3424419462680816650390625; + const float c6 = -0.599632322788238525390625; + const float c7 = +1.98442304134368896484375; + const float c8 = -2.4899270534515380859375; + const float c9 = +1.7491014003753662109375; + + auto result = x1 * c9 + c8; + result = x1 * result + c7; + result = x1 * result + c6; + result = x1 * result + c5; + result = x1 * result + c4; + result = x1 * result + c3; + result = x1 * result + c2; + result = x1 * result + c1; + result = x1 * result + one; + + // Equation was for -(ln(red)/(1-red)) + result *= -x1; + result += toFloat(exponent) * ln2; + + return select(exceptional, + select(useNan, T(nan), T(negInf)), + result); +} + +template +__forceinline T pow(const T &x, const T &y) +{ + auto x1 = abs(x); + auto z = exp(y * log(x1)); + + // Handle special cases + const float twoOver23 = 8388608.0f; + auto yInt = y == round(y); + auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit + + // x == 0 + z = select(x == 0.0f, + select(y < 0.0f, T(inf) | signmsk(x), + select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z); + + // x < 0 + auto xNegative = x < 0.0f; + if (any(xNegative)) + { + auto z1 = z | asFloat(yOddInt); + z1 = select(yInt, z1, std::numeric_limits::quiet_NaN()); + z = select(xNegative, z1, z); + } + + auto xFinite = isfinite(x); + auto yFinite = isfinite(y); + if (all(xFinite & yFinite)) + return z; + + // x finite and y infinite + z = select(andn(xFinite, yFinite), + select(x1 == 1.0f, 1.0f, + select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z); + + // x infinite + z = select(xFinite, z, + select(y == 0.0f, 1.0f, + select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x))); + + return z; +} + +template +__forceinline T pow(const T &x, float y) +{ + return pow(x, T(y)); +} + +} // namespace fastapprox + +} // namespace embree diff --git a/thirdparty/embree-aarch64/common/math/vec2.h b/thirdparty/embree-aarch64/common/math/vec2.h new file mode 100644 index 0000000000..a619459e9c --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/vec2.h @@ -0,0 +1,235 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + struct Vec2fa; + + //////////////////////////////////////////////////////////////////////////////// + /// Generic 2D vector Class + //////////////////////////////////////////////////////////////////////////////// + + template struct Vec2 + { + enum { N = 2 }; + union { + struct { T x, y; }; +#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler + T components[N]; +#endif + }; + + typedef T Scalar; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2( ) {} + __forceinline explicit Vec2( const T& a ) : x(a), y(a) {} + __forceinline Vec2( const T& x, const T& y ) : x(x), y(y) {} + + __forceinline Vec2( const Vec2& other ) { x = other.x; y = other.y; } + __forceinline Vec2( const Vec2fa& other ); + + template __forceinline Vec2( const Vec2& a ) : x(T(a.x)), y(T(a.y)) {} + template __forceinline Vec2& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; } + + __forceinline Vec2& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2( ZeroTy ) : x(zero), y(zero) {} + __forceinline Vec2( OneTy ) : x(one), y(one) {} + __forceinline Vec2( PosInfTy ) : x(pos_inf), y(pos_inf) {} + __forceinline Vec2( NegInfTy ) : x(neg_inf), y(neg_inf) {} + +#if defined(__WIN32__) && _MSC_VER == 1800 // workaround for older VS 2013 compiler + __forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return (&x)[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 2); return (&x)[axis]; } +#else + __forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return components[axis]; } + __forceinline T& operator [](const size_t axis ) { assert(axis < 2); return components[axis]; } +#endif + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec2 operator +( const Vec2& a ) { return Vec2(+a.x, +a.y); } + template __forceinline Vec2 operator -( const Vec2& a ) { return Vec2(-a.x, -a.y); } + template __forceinline Vec2 abs ( const Vec2& a ) { return Vec2(abs (a.x), abs (a.y)); } + template __forceinline Vec2 rcp ( const Vec2& a ) { return Vec2(rcp (a.x), rcp (a.y)); } + template __forceinline Vec2 rsqrt ( const Vec2& a ) { return Vec2(rsqrt(a.x), rsqrt(a.y)); } + template __forceinline Vec2 sqrt ( const Vec2& a ) { return Vec2(sqrt (a.x), sqrt (a.y)); } + template __forceinline Vec2 frac ( const Vec2& a ) { return Vec2(frac (a.x), frac (a.y)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec2 operator +( const Vec2& a, const Vec2& b ) { return Vec2(a.x + b.x, a.y + b.y); } + template __forceinline Vec2 operator +( const Vec2& a, const T& b ) { return Vec2(a.x + b , a.y + b ); } + template __forceinline Vec2 operator +( const T& a, const Vec2& b ) { return Vec2(a + b.x, a + b.y); } + template __forceinline Vec2 operator -( const Vec2& a, const Vec2& b ) { return Vec2(a.x - b.x, a.y - b.y); } + template __forceinline Vec2 operator -( const Vec2& a, const T& b ) { return Vec2(a.x - b , a.y - b ); } + template __forceinline Vec2 operator -( const T& a, const Vec2& b ) { return Vec2(a - b.x, a - b.y); } + template __forceinline Vec2 operator *( const Vec2& a, const Vec2& b ) { return Vec2(a.x * b.x, a.y * b.y); } + template __forceinline Vec2 operator *( const T& a, const Vec2& b ) { return Vec2(a * b.x, a * b.y); } + template __forceinline Vec2 operator *( const Vec2& a, const T& b ) { return Vec2(a.x * b , a.y * b ); } + template __forceinline Vec2 operator /( const Vec2& a, const Vec2& b ) { return Vec2(a.x / b.x, a.y / b.y); } + template __forceinline Vec2 operator /( const Vec2& a, const T& b ) { return Vec2(a.x / b , a.y / b ); } + template __forceinline Vec2 operator /( const T& a, const Vec2& b ) { return Vec2(a / b.x, a / b.y); } + + template __forceinline Vec2 min(const Vec2& a, const Vec2& b) { return Vec2(min(a.x, b.x), min(a.y, b.y)); } + template __forceinline Vec2 max(const Vec2& a, const Vec2& b) { return Vec2(max(a.x, b.x), max(a.y, b.y)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec2 madd ( const Vec2& a, const Vec2& b, const Vec2& c) { return Vec2( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y) ); } + template __forceinline Vec2 msub ( const Vec2& a, const Vec2& b, const Vec2& c) { return Vec2( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y) ); } + template __forceinline Vec2 nmadd ( const Vec2& a, const Vec2& b, const Vec2& c) { return Vec2(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y) ); } + template __forceinline Vec2 nmsub ( const Vec2& a, const Vec2& b, const Vec2& c) { return Vec2(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y) ); } + + template __forceinline Vec2 madd ( const T& a, const Vec2& b, const Vec2& c) { return Vec2( madd(a,b.x,c.x), madd(a,b.y,c.y) ); } + template __forceinline Vec2 msub ( const T& a, const Vec2& b, const Vec2& c) { return Vec2( msub(a,b.x,c.x), msub(a,b.y,c.y) ); } + template __forceinline Vec2 nmadd ( const T& a, const Vec2& b, const Vec2& c) { return Vec2(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y) ); } + template __forceinline Vec2 nmsub ( const T& a, const Vec2& b, const Vec2& c) { return Vec2(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y) ); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec2& operator +=( Vec2& a, const Vec2& b ) { a.x += b.x; a.y += b.y; return a; } + template __forceinline Vec2& operator -=( Vec2& a, const Vec2& b ) { a.x -= b.x; a.y -= b.y; return a; } + template __forceinline Vec2& operator *=( Vec2& a, const T& b ) { a.x *= b ; a.y *= b ; return a; } + template __forceinline Vec2& operator /=( Vec2& a, const T& b ) { a.x /= b ; a.y /= b ; return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline T reduce_add( const Vec2& a ) { return a.x + a.y; } + template __forceinline T reduce_mul( const Vec2& a ) { return a.x * a.y; } + template __forceinline T reduce_min( const Vec2& a ) { return min(a.x, a.y); } + template __forceinline T reduce_max( const Vec2& a ) { return max(a.x, a.y); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline bool operator ==( const Vec2& a, const Vec2& b ) { return a.x == b.x && a.y == b.y; } + template __forceinline bool operator !=( const Vec2& a, const Vec2& b ) { return a.x != b.x || a.y != b.y; } + template __forceinline bool operator < ( const Vec2& a, const Vec2& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Shift Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec2 shift_right_1( const Vec2& a ) { + return Vec2(shift_right_1(a.x),shift_right_1(a.y)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline T dot ( const Vec2& a, const Vec2& b ) { return madd(a.x,b.x,a.y*b.y); } + template __forceinline Vec2 cross ( const Vec2& a ) { return Vec2(-a.y,a.x); } + template __forceinline T length ( const Vec2& a ) { return sqrt(dot(a,a)); } + template __forceinline Vec2 normalize( const Vec2& a ) { return a*rsqrt(dot(a,a)); } + template __forceinline T distance ( const Vec2& a, const Vec2& b ) { return length(a-b); } + template __forceinline T det ( const Vec2& a, const Vec2& b ) { return a.x*b.y - a.y*b.x; } + + template __forceinline Vec2 normalize_safe( const Vec2& a ) { + const T d = dot(a,a); return select(d == T( zero ),a, a*rsqrt(d) ); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec2 select ( bool s, const Vec2& t, const Vec2& f ) { + return Vec2(select(s,t.x,f.x),select(s,t.y,f.y)); + } + + template __forceinline Vec2 select ( const Vec2& s, const Vec2& t, const Vec2& f ) { + return Vec2(select(s.x,t.x,f.x),select(s.y,t.y,f.y)); + } + + template __forceinline Vec2 select ( const typename T::Bool& s, const Vec2& t, const Vec2& f ) { + return Vec2(select(s,t.x,f.x),select(s,t.y,f.y)); + } + + template + __forceinline Vec2 lerp(const Vec2& v0, const Vec2& v1, const T& t) { + return madd(Vec2(T(1.0f)-t),v0,t*v1); + } + + template __forceinline int maxDim ( const Vec2& a ) + { + const Vec2 b = abs(a); + if (b.x > b.y) return 0; + else return 1; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2& a) { + return cout << "(" << a.x << ", " << a.y << ")"; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Default template instantiations + //////////////////////////////////////////////////////////////////////////////// + + typedef Vec2 Vec2b; + typedef Vec2 Vec2i; + typedef Vec2 Vec2f; +} + +#include "vec2fa.h" + +#if defined(__SSE__) || defined(__ARM_NEON) +#include "../simd/sse.h" +#endif + +#if defined(__AVX__) +#include "../simd/avx.h" +#endif + +#if defined(__AVX512F__) +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template<> __forceinline Vec2::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} + +#if defined(__SSE__) || defined(__ARM_NEON) + template<> __forceinline Vec2::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} +#endif + +#if defined(__AVX__) + template<> __forceinline Vec2::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} +#endif + +#if defined(__AVX512F__) + template<> __forceinline Vec2::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} +#endif +} diff --git a/thirdparty/embree-aarch64/common/math/vec2fa.h b/thirdparty/embree-aarch64/common/math/vec2fa.h new file mode 100644 index 0000000000..451ecd556c --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/vec2fa.h @@ -0,0 +1,317 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec2fa Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec2fa + { + ALIGNED_STRUCT_(16); + + typedef float Scalar; + enum { N = 2 }; + union { + __m128 m128; + struct { float x,y,az,aw; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa( ) {} + __forceinline Vec2fa( const __m128 a ) : m128(a) {} + + __forceinline Vec2fa ( const Vec2& other ) { x = other.x; y = other.y; } + __forceinline Vec2fa& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; } + + __forceinline Vec2fa ( const Vec2fa& other ) { m128 = other.m128; } + __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; } + + __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {} + __forceinline Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {} + + __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline Vec2fa load( const void* const a ) { + return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)))); + } + + static __forceinline Vec2fa loadu( const void* const a ) { + return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)))); + } + + static __forceinline void storeu ( void* ptr, const Vec2fa& v ) { + _mm_storeu_ps((float*)ptr,v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa( ZeroTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec2fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; } + __forceinline float& operator []( const size_t index ) { assert(index < 2); return (&x)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; } + __forceinline Vec2fa operator -( const Vec2fa& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); + } + __forceinline Vec2fa abs ( const Vec2fa& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); + } + __forceinline Vec2fa sign ( const Vec2fa& a ) { + return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero))); + } + + __forceinline Vec2fa rcp ( const Vec2fa& a ) + { +#if defined(__aarch64__) + __m128 reciprocal = _mm_rcp_ps(a.m128); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + return (const Vec2fa)reciprocal; +#else +#if defined(__AVX512VL__) + const Vec2fa r = _mm_rcp14_ps(a.m128); +#else + const Vec2fa r = _mm_rcp_ps(a.m128); +#endif + +#if defined(__AVX2__) + const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); +#else + const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); + //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#endif + + return res; +#endif //defined(__aarch64__) + } + + __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); } + __forceinline Vec2fa sqr ( const Vec2fa& a ) { return _mm_mul_ps(a,a); } + + __forceinline Vec2fa rsqrt( const Vec2fa& a ) + { +#if defined(__aarch64__) + __m128 r = _mm_rsqrt_ps(a.m128); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + return r; +#else + +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = _mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + +#endif + } + + __forceinline Vec2fa zero_fix(const Vec2fa& a) { + return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); + } + __forceinline Vec2fa rcp_safe(const Vec2fa& a) { + return rcp(zero_fix(a)); + } + __forceinline Vec2fa log ( const Vec2fa& a ) { + return Vec2fa(logf(a.x),logf(a.y)); + } + + __forceinline Vec2fa exp ( const Vec2fa& a ) { + return Vec2fa(expf(a.x),expf(a.y)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); } + __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); } + __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; } + __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); } + __forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } + __forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } + + __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); } + +#if defined(__aarch64__) || defined(__SSE4_1__) + __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + +#if defined(__aarch64__) || defined(__SSE4_1__) + __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + + __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) { + return Vec2fa(powf(a.x,b),powf(a.y,b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); } + __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); } + __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); } + __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); } +#else + __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; } + __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; } + __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;} + __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; } +#endif + + __forceinline Vec2fa madd ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); } + __forceinline Vec2fa msub ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); } + __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); } + __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; } + __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; } + __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; } + __forceinline Vec2fa& operator *=( Vec2fa& a, const float b ) { return a = a * b; } + __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; } + __forceinline Vec2fa& operator /=( Vec2fa& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; } + __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; } + __forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); } + __forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; } + __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { + return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F)); + } +#else + __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { + return reduce_add(a*b); + } +#endif + + __forceinline Vec2fa cross ( const Vec2fa& a ) { + return Vec2fa(-a.y,a.x); + } + + __forceinline float sqr_length ( const Vec2fa& a ) { return dot(a,a); } + __forceinline float rcp_length ( const Vec2fa& a ) { return rsqrt(dot(a,a)); } + __forceinline float rcp_length2( const Vec2fa& a ) { return rcp(dot(a,a)); } + __forceinline float length ( const Vec2fa& a ) { return sqrt(dot(a,a)); } + __forceinline Vec2fa normalize( const Vec2fa& a ) { return a*rsqrt(dot(a,a)); } + __forceinline float distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) { + __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f, t, mask); + } + + __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + __forceinline int maxDim ( const Vec2fa& a ) + { + const Vec2fa b = abs(a); + if (b.x > b.y) return 0; + else return 1; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__aarch64__) +__forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); } +__forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); } +//__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); } +#elif defined (__SSE4_1__) + //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } + __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } + __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } +#else + //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); } + __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); } + __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) { + return cout << "(" << a.x << ", " << a.y << ")"; + } + + typedef Vec2fa Vec2fa_t; +} diff --git a/thirdparty/embree-aarch64/common/math/vec3.h b/thirdparty/embree-aarch64/common/math/vec3.h new file mode 100644 index 0000000000..1870321715 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/vec3.h @@ -0,0 +1,349 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + struct Vec3fa; + + //////////////////////////////////////////////////////////////////////////////// + /// Generic 3D vector Class + //////////////////////////////////////////////////////////////////////////////// + + template struct Vec3 + { + enum { N = 3 }; + + union { + struct { + T x, y, z; + }; +#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler + T components[N]; +#endif + }; + + typedef T Scalar; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3( ) {} + __forceinline explicit Vec3( const T& a ) : x(a), y(a), z(a) {} + __forceinline Vec3( const T& x, const T& y, const T& z ) : x(x), y(y), z(z) {} + + __forceinline Vec3( const Vec3& other ) { x = other.x; y = other.y; z = other.z; } + __forceinline Vec3( const Vec3fa& other ); + + template __forceinline Vec3( const Vec3& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)) {} + template __forceinline Vec3& operator =(const Vec3& other) { x = other.x; y = other.y; z = other.z; return *this; } + + __forceinline Vec3& operator =(const Vec3& other) { x = other.x; y = other.y; z = other.z; return *this; } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3( ZeroTy ) : x(zero), y(zero), z(zero) {} + __forceinline Vec3( OneTy ) : x(one), y(one), z(one) {} + __forceinline Vec3( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf) {} + __forceinline Vec3( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf) {} + +#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler + __forceinline const T& operator []( const size_t axis ) const { assert(axis < 3); return (&x)[axis]; } + __forceinline T& operator []( const size_t axis ) { assert(axis < 3); return (&x)[axis]; } +#else + __forceinline const T& operator [](const size_t axis) const { assert(axis < 3); return components[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 3); return components[axis]; } +#endif + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec3 operator +( const Vec3& a ) { return Vec3(+a.x, +a.y, +a.z); } + template __forceinline Vec3 operator -( const Vec3& a ) { return Vec3(-a.x, -a.y, -a.z); } + template __forceinline Vec3 abs ( const Vec3& a ) { return Vec3(abs (a.x), abs (a.y), abs (a.z)); } + template __forceinline Vec3 rcp ( const Vec3& a ) { return Vec3(rcp (a.x), rcp (a.y), rcp (a.z)); } + template __forceinline Vec3 rsqrt ( const Vec3& a ) { return Vec3(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z)); } + template __forceinline Vec3 sqrt ( const Vec3& a ) { return Vec3(sqrt (a.x), sqrt (a.y), sqrt (a.z)); } + + template __forceinline Vec3 zero_fix( const Vec3& a ) + { + return Vec3(select(abs(a.x) __forceinline Vec3 rcp_safe(const Vec3& a) { return rcp(zero_fix(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec3 operator +( const Vec3& a, const Vec3& b ) { return Vec3(a.x + b.x, a.y + b.y, a.z + b.z); } + template __forceinline Vec3 operator -( const Vec3& a, const Vec3& b ) { return Vec3(a.x - b.x, a.y - b.y, a.z - b.z); } + template __forceinline Vec3 operator *( const Vec3& a, const Vec3& b ) { return Vec3(a.x * b.x, a.y * b.y, a.z * b.z); } + template __forceinline Vec3 operator *( const T& a, const Vec3& b ) { return Vec3(a * b.x, a * b.y, a * b.z); } + template __forceinline Vec3 operator *( const Vec3& a, const T& b ) { return Vec3(a.x * b , a.y * b , a.z * b ); } + template __forceinline Vec3 operator /( const Vec3& a, const T& b ) { return Vec3(a.x / b , a.y / b , a.z / b ); } + template __forceinline Vec3 operator /( const T& a, const Vec3& b ) { return Vec3(a / b.x, a / b.y, a / b.z); } + template __forceinline Vec3 operator /( const Vec3& a, const Vec3& b ) { return Vec3(a.x / b.x, a.y / b.y, a.z / b.z); } + + template __forceinline Vec3 min(const Vec3& a, const Vec3& b) { return Vec3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); } + template __forceinline Vec3 max(const Vec3& a, const Vec3& b) { return Vec3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); } + + template __forceinline Vec3 operator >>( const Vec3& a, const int b ) { return Vec3(a.x >> b, a.y >> b, a.z >> b); } + template __forceinline Vec3 operator <<( const Vec3& a, const int b ) { return Vec3(a.x << b, a.y << b, a.z << b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec3 madd ( const Vec3& a, const Vec3& b, const Vec3& c) { return Vec3( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z)); } + template __forceinline Vec3 msub ( const Vec3& a, const Vec3& b, const Vec3& c) { return Vec3( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z)); } + template __forceinline Vec3 nmadd ( const Vec3& a, const Vec3& b, const Vec3& c) { return Vec3(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z));} + template __forceinline Vec3 nmsub ( const Vec3& a, const Vec3& b, const Vec3& c) { return Vec3(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z)); } + + template __forceinline Vec3 madd ( const T& a, const Vec3& b, const Vec3& c) { return Vec3( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z)); } + template __forceinline Vec3 msub ( const T& a, const Vec3& b, const Vec3& c) { return Vec3( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z)); } + template __forceinline Vec3 nmadd ( const T& a, const Vec3& b, const Vec3& c) { return Vec3(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z));} + template __forceinline Vec3 nmsub ( const T& a, const Vec3& b, const Vec3& c) { return Vec3(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec3& operator +=( Vec3& a, const T b ) { a.x += b; a.y += b; a.z += b; return a; } + template __forceinline Vec3& operator +=( Vec3& a, const Vec3& b ) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } + template __forceinline Vec3& operator -=( Vec3& a, const Vec3& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } + template __forceinline Vec3& operator *=( Vec3& a, const T& b ) { a.x *= b ; a.y *= b ; a.z *= b ; return a; } + template __forceinline Vec3& operator /=( Vec3& a, const T& b ) { a.x /= b ; a.y /= b ; a.z /= b ; return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline T reduce_add( const Vec3& a ) { return a.x + a.y + a.z; } + template __forceinline T reduce_mul( const Vec3& a ) { return a.x * a.y * a.z; } + template __forceinline T reduce_min( const Vec3& a ) { return min(a.x, a.y, a.z); } + template __forceinline T reduce_max( const Vec3& a ) { return max(a.x, a.y, a.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline bool operator ==( const Vec3& a, const Vec3& b ) { return a.x == b.x && a.y == b.y && a.z == b.z; } + template __forceinline bool operator !=( const Vec3& a, const Vec3& b ) { return a.x != b.x || a.y != b.y || a.z != b.z; } + template __forceinline bool operator < ( const Vec3& a, const Vec3& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + if (a.z != b.z) return a.z < b.z; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Shift Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec3 shift_right_1( const Vec3& a ) { + return Vec3(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec3 select ( bool s, const Vec3& t, const Vec3& f ) { + return Vec3(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z)); + } + + template __forceinline Vec3 select ( const Vec3& s, const Vec3& t, const Vec3& f ) { + return Vec3(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z)); + } + + template __forceinline Vec3 select ( const typename T::Bool& s, const Vec3& t, const Vec3& f ) { + return Vec3(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z)); + } + + template + __forceinline Vec3 lerp(const Vec3& v0, const Vec3& v1, const T& t) { + return madd(Vec3(T(1.0f)-t),v0,t*v1); + } + + template __forceinline int maxDim ( const Vec3& a ) + { + const Vec3 b = abs(a); + if (b.x > b.y) { + if (b.x > b.z) return 0; else return 2; + } else { + if (b.y > b.z) return 1; else return 2; + } + } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec3 eq_mask( const Vec3& a, const Vec3& b ) { return Vec3(a.x==b.x,a.y==b.y,a.z==b.z); } + template __forceinline Vec3 neq_mask(const Vec3& a, const Vec3& b ) { return Vec3(a.x!=b.x,a.y!=b.y,a.z!=b.z); } + template __forceinline Vec3 lt_mask( const Vec3& a, const Vec3& b ) { return Vec3(a.x< b.x,a.y< b.y,a.z< b.z); } + template __forceinline Vec3 le_mask( const Vec3& a, const Vec3& b ) { return Vec3(a.x<=b.x,a.y<=b.y,a.z<=b.z); } + template __forceinline Vec3 gt_mask( const Vec3& a, const Vec3& b ) { return Vec3(a.x> b.x,a.y> b.y,a.z> b.z); } + template __forceinline Vec3 ge_mask( const Vec3& a, const Vec3& b ) { return Vec3(a.x>=b.x,a.y>=b.y,a.z>=b.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline T sqr ( const Vec3& a ) { return dot(a,a); } + template __forceinline T dot ( const Vec3& a, const Vec3& b ) { return madd(a.x,b.x,madd(a.y,b.y,a.z*b.z)); } + template __forceinline T length ( const Vec3& a ) { return sqrt(sqr(a)); } + template __forceinline T rcp_length( const Vec3& a ) { return rsqrt(sqr(a)); } + template __forceinline Vec3 normalize( const Vec3& a ) { return a*rsqrt(sqr(a)); } + template __forceinline T distance ( const Vec3& a, const Vec3& b ) { return length(a-b); } + template __forceinline Vec3 cross ( const Vec3& a, const Vec3& b ) { return Vec3(prod_diff(a.y,b.z,a.z,b.y), prod_diff(a.z,b.x,a.x,b.z), prod_diff(a.x,b.y,a.y,b.x)); } + template __forceinline Vec3 stable_triangle_normal( const Vec3& a, const Vec3& b, const Vec3& c ) + { + const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x; + const T bc_x = b.z*c.y, bc_y = b.x*c.z, bc_z = b.y*c.x; + const Vec3 cross_ab(msub(a.y,b.z,ab_x), msub(a.z,b.x,ab_y), msub(a.x,b.y,ab_z)); + const Vec3 cross_bc(msub(b.y,c.z,bc_x), msub(b.z,c.x,bc_y), msub(b.x,c.y,bc_z)); + const auto sx = abs(ab_x) < abs(bc_x); + const auto sy = abs(ab_y) < abs(bc_y); + const auto sz = abs(ab_z) < abs(bc_z); + return Vec3(select(sx,cross_ab.x,cross_bc.x), + select(sy,cross_ab.y,cross_bc.y), + select(sz,cross_ab.z,cross_bc.z)); + } + + template __forceinline T sum ( const Vec3& a ) { return a.x+a.y+a.z; } + + template __forceinline T halfArea ( const Vec3& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } + template __forceinline T area ( const Vec3& d ) { return 2.0f*halfArea(d); } + + template __forceinline Vec3 normalize_safe( const Vec3& a ) { + const T d = dot(a,a); return select(d == T( zero ), a , a*rsqrt(d) ); + } + + template __forceinline T sqr_point_to_line_distance(const Vec3& P, const Vec3& Q0, const Vec3& Q1) + { + const Vec3 N = cross(P-Q0,Q1-Q0); + const Vec3 D = Q1-Q0; + return dot(N,N)*rcp(dot(D,D)); + } + + template __forceinline T sqr_point_to_line_distance(const Vec3& PmQ0, const Vec3& Q1mQ0) + { + const Vec3 N = cross(PmQ0,Q1mQ0); + const Vec3 D = Q1mQ0; + return dot(N,N)*rcp(dot(D,D)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } + + typedef Vec3 Vec3b; + typedef Vec3 Vec3i; + typedef Vec3 Vec3f; +} + +#include "vec3ba.h" +#include "vec3ia.h" +#include "vec3fa.h" + +//////////////////////////////////////////////////////////////////////////////// +/// SSE / AVX / MIC specializations +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE__) || defined(__ARM_NEON) +#include "../simd/sse.h" +#endif + +#if defined(__AVX__) +#include "../simd/avx.h" +#endif + +#if defined(__AVX512F__) +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template + __forceinline Vec3 broadcast(const Vec3& a, const size_t k) { + return Vec3(Out(a.x[k]), Out(a.y[k]), Out(a.z[k])); + } + + template<> __forceinline Vec3::Vec3(const Vec3fa& a) { x = a.x; y = a.y; z = a.z; } + +#if defined(__AVX__) + template<> __forceinline Vec3::Vec3(const Vec3fa& a) { + x = a.x; y = a.y; z = a.z; + } +#elif defined(__SSE__) || defined(__ARM_NEON) + template<> + __forceinline Vec3::Vec3(const Vec3fa& a) { + const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); + } +#endif + +#if defined(__SSE__) || defined(__ARM_NEON) + __forceinline Vec3 broadcast4f(const Vec3& a, const size_t k) { + return Vec3(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k])); + } + + template<> + __forceinline Vec3 broadcast(const Vec3& a, const size_t k) { + return Vec3(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k])); + } + + template + __forceinline Vec3 shuffle(const Vec3& b) { + return Vec3(shuffle(b.x), shuffle(b.y), shuffle(b.z)); + } +#endif + +#if defined(__AVX__) + template<> + __forceinline Vec3::Vec3(const Vec3fa& a) { + x = a.x; y = a.y; z = a.z; + } + __forceinline Vec3 broadcast4f(const Vec3& a, const size_t k) { + return Vec3(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k])); + } + __forceinline Vec3 broadcast8f(const Vec3& a, const size_t k) { + return Vec3(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); + } + __forceinline Vec3 broadcast8f(const Vec3& a, const size_t k) { + return Vec3(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); + } + + template<> + __forceinline Vec3 broadcast(const Vec3& a, const size_t k) { + return Vec3(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); + } + template<> + __forceinline Vec3 broadcast(const Vec3& a, const size_t k) { + return Vec3(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); + } + + template + __forceinline Vec3 shuffle(const Vec3& b) { + return Vec3(shuffle(b.x), shuffle(b.y), shuffle(b.z)); + } +#endif + +#if defined(__AVX512F__) + template<> __forceinline Vec3::Vec3(const Vec3fa& a) : x(a.x), y(a.y), z(a.z) {} +#endif +} diff --git a/thirdparty/embree-aarch64/common/math/vec3ba.h b/thirdparty/embree-aarch64/common/math/vec3ba.h new file mode 100644 index 0000000000..90f31739c2 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/vec3ba.h @@ -0,0 +1,120 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3ba Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3ba + { + ALIGNED_STRUCT_(16); + + union { + __m128 m128; + struct { int x,y,z; }; + }; + + typedef int Scalar; + enum { N = 3 }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba( ) {} + __forceinline Vec3ba( const __m128 input ) : m128(input) {} + __forceinline Vec3ba( const Vec3ba& other ) : m128(other.m128) {} + __forceinline Vec3ba& operator =(const Vec3ba& other) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3ba( bool a ) + : m128(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} + __forceinline Vec3ba( bool a, bool b, bool c) + : m128(mm_lookupmask_ps[(size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {} + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba( FalseTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec3ba( TrueTy ) : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline int& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } + }; + + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba operator !( const Vec3ba& a ) { return _mm_xor_ps(a.m128, Vec3ba(embree::True)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba operator &( const Vec3ba& a, const Vec3ba& b ) { return _mm_and_ps(a.m128, b.m128); } + __forceinline Vec3ba operator |( const Vec3ba& a, const Vec3ba& b ) { return _mm_or_ps (a.m128, b.m128); } + __forceinline Vec3ba operator ^( const Vec3ba& a, const Vec3ba& b ) { return _mm_xor_ps(a.m128, b.m128); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba& operator &=( Vec3ba& a, const Vec3ba& b ) { return a = a & b; } + __forceinline Vec3ba& operator |=( Vec3ba& a, const Vec3ba& b ) { return a = a | b; } + __forceinline Vec3ba& operator ^=( Vec3ba& a, const Vec3ba& b ) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3ba& a, const Vec3ba& b ) { + return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) == 7; + } + __forceinline bool operator !=( const Vec3ba& a, const Vec3ba& b ) { + return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) != 7; + } + __forceinline bool operator < ( const Vec3ba& a, const Vec3ba& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + if (a.z != b.z) return a.z < b.z; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool reduce_and( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) == 0x7; } + __forceinline bool reduce_or ( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) != 0x0; } + + __forceinline bool all ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x7; } + __forceinline bool any ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) != 0x0; } + __forceinline bool none ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x0; } + + __forceinline size_t movemask(const Vec3ba& a) { return _mm_movemask_ps(a) & 0x7; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ba& a) { + return cout << "(" << (a.x ? "1" : "0") << ", " << (a.y ? "1" : "0") << ", " << (a.z ? "1" : "0") << ")"; + } +} diff --git a/thirdparty/embree-aarch64/common/math/vec3fa.h b/thirdparty/embree-aarch64/common/math/vec3fa.h new file mode 100644 index 0000000000..6163cfb596 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/vec3fa.h @@ -0,0 +1,810 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3fa Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3fa + { + ALIGNED_STRUCT_(16); + + typedef float Scalar; + enum { N = 3 }; + union { + __m128 m128; + struct { float x,y,z; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa( ) {} + __forceinline Vec3fa( const __m128 a ) : m128(a) {} + + __forceinline Vec3fa ( const Vec3& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); } + //__forceinline Vec3fa& operator =( const Vec3& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; } + + __forceinline Vec3fa ( const Vec3fa& other ) { m128 = other.m128; } + __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {} + __forceinline Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} + + __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} + + __forceinline explicit operator const vfloat4() const { return vfloat4(m128); } + __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); } + __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); } + __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } + + //__forceinline operator const __m128&() const { return m128; } + //__forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline Vec3fa load( const void* const a ) { +#if defined(__aarch64__) + __m128 t = _mm_load_ps((float*)a); + t[3] = 0.0f; + return Vec3fa(t); +#else + return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); +#endif + } + + static __forceinline Vec3fa loadu( const void* const a ) { + return Vec3fa(_mm_loadu_ps((float*)a)); + } + + static __forceinline void storeu ( void* ptr, const Vec3fa& v ) { + _mm_storeu_ps((float*)ptr,v.m128); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa( ZeroTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec3fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; } + __forceinline Vec3fa operator -( const Vec3fa& a ) { +#if defined(__aarch64__) + return vnegq_f32(a.m128); +#else + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + + return _mm_xor_ps(a.m128, mask); +#endif + } + __forceinline Vec3fa abs ( const Vec3fa& a ) { +#if defined(__aarch64__) + return _mm_abs_ps(a.m128); +#else + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); +#endif + } + __forceinline Vec3fa sign ( const Vec3fa& a ) { +#if defined(__aarch64__) + Vec3fa r = blendv_ps(vOne, vmOne, _mm_cmplt_ps (a.m128,vdupq_n_f32(0.0f))); + return r; +#else + return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128)); +#endif + } + + __forceinline Vec3fa rcp ( const Vec3fa& a ) + { +#if defined(__aarch64__) && defined(BUILD_IOS) + return vdivq_f32(vdupq_n_f32(1.0f),a.m128); +#elif defined(__aarch64__) + __m128 reciprocal = _mm_rcp_ps(a.m128); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + return (const Vec3fa)reciprocal; +#else + +#if defined(__AVX512VL__) + const Vec3fa r = _mm_rcp14_ps(a.m128); +#else + const Vec3fa r = _mm_rcp_ps(a.m128); +#endif + +#if defined(__AVX2__) + const Vec3fa res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); +#else + const Vec3fa res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); + //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#endif + + return res; +#endif //defined(__aarch64__) + } + + __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); } + __forceinline Vec3fa sqr ( const Vec3fa& a ) { return _mm_mul_ps(a.m128,a.m128); } + + __forceinline Vec3fa rsqrt( const Vec3fa& a ) + { +#if defined(__aarch64__) + __m128 r = _mm_rsqrt_ps(a.m128); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + return r; +#else + +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = _mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); +#endif + } + + __forceinline Vec3fa zero_fix(const Vec3fa& a) { + return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); + } + __forceinline Vec3fa rcp_safe(const Vec3fa& a) { + return rcp(zero_fix(a)); + } + __forceinline Vec3fa log ( const Vec3fa& a ) { + return Vec3fa(logf(a.x),logf(a.y),logf(a.z)); + } + + __forceinline Vec3fa exp ( const Vec3fa& a ) { + return Vec3fa(expf(a.x),expf(a.y),expf(a.z)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); } + __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); } + __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; } + __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); } + __forceinline Vec3fa operator /( const Vec3fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } + __forceinline Vec3fa operator /( const float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } + + __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); } + +#if defined(__aarch64__) || defined(__SSE4_1__) + __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) { + const vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + +#if defined(__aarch64__) || defined(__SSE4_1__) + __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) { + const vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + + __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) { + return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } +#else + +#if defined(__aarch64__) + __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { + return _mm_madd_ps(a.m128, b.m128, c.m128); //a*b+c; + } + __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { + return _mm_msub_ps(a.m128, b.m128, c.m128); //-a*b+c; + } + __forceinline Vec3fa nmsub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { + Vec3fa t = _mm_madd_ps(a.m128, b.m128, c.m128); + return -t; + } + __forceinline Vec3fa msub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { + return _mm_madd_ps(a.m128,b.m128,vnegq_f32(c.m128)); //a*b-c + } + +#else + __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; } + __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;} + __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; } + __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; } +#endif + +#endif + + __forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); } + __forceinline Vec3fa msub ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); } + __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); } + __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; } + __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; } + __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; } + __forceinline Vec3fa& operator *=( Vec3fa& a, const float b ) { return a = a * b; } + __forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { return a = a / b; } + __forceinline Vec3fa& operator /=( Vec3fa& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// +#if defined(__aarch64__) && defined(BUILD_IOS) + __forceinline float reduce_add(const Vec3fa& v) { + float32x4_t t = v.m128; + t[3] = 0.0f; + return vaddvq_f32(t); + } + + __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } + __forceinline float reduce_min(const Vec3fa& v) { + float32x4_t t = v.m128; + t[3] = t[2]; + return vminvq_f32(t); + } + __forceinline float reduce_max(const Vec3fa& v) { + float32x4_t t = v.m128; + t[3] = t[2]; + return vmaxvq_f32(t); + } +#else + __forceinline float reduce_add(const Vec3fa& v) { + const vfloat4 a(v.m128); + const vfloat4 b = shuffle<1>(a); + const vfloat4 c = shuffle<2>(a); + return _mm_cvtss_f32(a+b+c); + } + + __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } + __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); } + __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } + __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } + + __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); } + __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } + __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); } + __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); } + #if defined(__aarch64__) + __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); } + __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); } +#else + __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); } + __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); } +#endif + + __forceinline bool isvalid ( const Vec3fa& v ) { + return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE))); + } + + __forceinline bool is_finite ( const Vec3fa& a ) { + return all(ge_mask(a,Vec3fa(-FLT_MAX)) & le_mask(a,Vec3fa(+FLT_MAX))); + } + + __forceinline bool isvalid4 ( const Vec3fa& v ) { + return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE))); + } + + __forceinline bool is_finite4 ( const Vec3fa& a ) { + return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) { + return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); + } +#else + __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) { + return reduce_add(a*b); + } +#endif + + __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b ) + { + vfloat4 a0 = vfloat4(a.m128); + vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128)); + vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128)); + vfloat4 b1 = vfloat4(b.m128); + return Vec3fa(shuffle<1,2,0,3>(prod_diff(a0,b0,a1,b1))); + } + + __forceinline float sqr_length ( const Vec3fa& a ) { return dot(a,a); } + __forceinline float rcp_length ( const Vec3fa& a ) { return rsqrt(dot(a,a)); } + __forceinline float rcp_length2( const Vec3fa& a ) { return rcp(dot(a,a)); } + __forceinline float length ( const Vec3fa& a ) { return sqrt(dot(a,a)); } + __forceinline Vec3fa normalize( const Vec3fa& a ) { return a*rsqrt(dot(a,a)); } + __forceinline float distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a-b); } + __forceinline float halfArea ( const Vec3fa& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } + __forceinline float area ( const Vec3fa& d ) { return 2.0f*halfArea(d); } + + __forceinline Vec3fa normalize_safe( const Vec3fa& a ) { + const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); + } + + /*! differentiated normalization */ + __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp) + { + const float pp = dot(p,p); + const float pdp = dot(p,dp); + return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) { + __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f.m128, t.m128, mask); + } + + __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) { + return blendv_ps(f.m128, t.m128, s); + } + + __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + __forceinline int maxDim ( const Vec3fa& a ) + { + const Vec3fa b = abs(a); + if (b.x > b.y) { + if (b.x > b.z) return 0; else return 2; + } else { + if (b.y > b.z) return 1; else return 2; + } + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__aarch64__) + __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); } + __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); } + __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); } +#elif defined (__SSE4_1__) + __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } + __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } + __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } +#else + __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); } + __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); } + __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } + + typedef Vec3fa Vec3fa_t; + + + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3fx Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3fx + { + ALIGNED_STRUCT_(16); + + typedef float Scalar; + enum { N = 3 }; + union { + __m128 m128; + struct { float x,y,z; union { int a; unsigned u; float w; }; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx( ) {} + __forceinline Vec3fx( const __m128 a ) : m128(a) {} + + __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {} + __forceinline operator Vec3fa () const { return Vec3fa(m128); } + + __forceinline explicit Vec3fx ( const Vec3& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); } + //__forceinline Vec3fx& operator =( const Vec3& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; } + + __forceinline Vec3fx ( const Vec3fx& other ) { m128 = other.m128; } + + __forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {} + __forceinline Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} + + __forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; } + __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; } + __forceinline Vec3fx( const Vec3fa& other, const float w1) { +#if defined (__aarch64__) + m128 = other.m128; m128[3] = w1; +#elif defined (__SSE4_1__) + m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4); +#else + const vint4 mask(-1,-1,-1,0); + m128 = select(vboolf4(_mm_castsi128_ps(mask)),vfloat4(other.m128),vfloat4(w1)); +#endif + } + //__forceinline Vec3fx( const float x, const float y, const float z, const int a) : x(x), y(y), z(z), a(a) {} // not working properly! + //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly! + __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {} + + //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} + + __forceinline explicit operator const vfloat4() const { return vfloat4(m128); } + __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); } + __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); } + __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } + + //__forceinline operator const __m128&() const { return m128; } + //__forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline Vec3fx load( const void* const a ) { + return Vec3fx(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); + } + + static __forceinline Vec3fx loadu( const void* const a ) { + return Vec3fx(_mm_loadu_ps((float*)a)); + } + + static __forceinline void storeu ( void* ptr, const Vec3fx& v ) { + _mm_storeu_ps((float*)ptr,v.m128); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx( ZeroTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec3fx( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; } + __forceinline Vec3fx operator -( const Vec3fx& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); + } + __forceinline Vec3fx abs ( const Vec3fx& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); + } + __forceinline Vec3fx sign ( const Vec3fx& a ) { + return blendv_ps(Vec3fx(one).m128, (-Vec3fx(one)).m128, _mm_cmplt_ps (a.m128,Vec3fx(zero).m128)); + } + + __forceinline Vec3fx rcp ( const Vec3fx& a ) + { +#if defined(__AVX512VL__) + const Vec3fx r = _mm_rcp14_ps(a.m128); +#else + const Vec3fx r = _mm_rcp_ps(a.m128); +#endif + +#if defined(__AVX2__) + const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); +#else + const Vec3fx res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); + //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#endif + + return res; + } + + __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.m128); } + __forceinline Vec3fx sqr ( const Vec3fx& a ) { return _mm_mul_ps(a.m128,a.m128); } + + __forceinline Vec3fx rsqrt( const Vec3fx& a ) + { +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = _mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + } + + __forceinline Vec3fx zero_fix(const Vec3fx& a) { + return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); + } + __forceinline Vec3fx rcp_safe(const Vec3fx& a) { + return rcp(zero_fix(a)); + } + __forceinline Vec3fx log ( const Vec3fx& a ) { + return Vec3fx(logf(a.x),logf(a.y),logf(a.z)); + } + + __forceinline Vec3fx exp ( const Vec3fx& a ) { + return Vec3fx(expf(a.x),expf(a.y),expf(a.z)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.m128, b.m128); } + __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); } + __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; } + __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.m128,b.m128); } + __forceinline Vec3fx operator /( const Vec3fx& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } + __forceinline Vec3fx operator /( const float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } + + __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); } + +#if defined(__SSE4_1__) || defined(__aarch64__) + __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) { + const vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + +#if defined(__SSE4_1__) || defined(__aarch64__) + __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) { + const vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + + __forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) { + return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } +#else + __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; } + __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; } + __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b+c;} + __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b-c; } +#endif + + __forceinline Vec3fx madd ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); } + __forceinline Vec3fx msub ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(Vec3fx(a),b,c); } + __forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); } + __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; } + __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; } + __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; } + __forceinline Vec3fx& operator *=( Vec3fx& a, const float b ) { return a = a * b; } + __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; } + __forceinline Vec3fx& operator /=( Vec3fx& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float reduce_add(const Vec3fx& v) { + const vfloat4 a(v.m128); + const vfloat4 b = shuffle<1>(a); + const vfloat4 c = shuffle<2>(a); + return _mm_cvtss_f32(a+b+c); + } + + __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; } + __forceinline float reduce_min(const Vec3fx& v) { return min(v.x,v.y,v.z); } + __forceinline float reduce_max(const Vec3fx& v) { return max(v.x,v.y,v.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } + __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } + + __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a.m128, b.m128); } + __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } + __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.m128, b.m128); } + __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.m128, b.m128); } + __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } + __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } + + __forceinline bool isvalid ( const Vec3fx& v ) { + return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE))); + } + + __forceinline bool is_finite ( const Vec3fx& a ) { + return all(ge_mask(a,Vec3fx(-FLT_MAX)) & le_mask(a,Vec3fx(+FLT_MAX))); + } + + __forceinline bool isvalid4 ( const Vec3fx& v ) { + return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE))); + } + + __forceinline bool is_finite4 ( const Vec3fx& a ) { + return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { + return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); + } +#else + __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { + return reduce_add(a*b); + } +#endif + + __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b ) + { + vfloat4 a0 = vfloat4(a.m128); + vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128)); + vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128)); + vfloat4 b1 = vfloat4(b.m128); + return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1))); + } + + __forceinline float sqr_length ( const Vec3fx& a ) { return dot(a,a); } + __forceinline float rcp_length ( const Vec3fx& a ) { return rsqrt(dot(a,a)); } + __forceinline float rcp_length2( const Vec3fx& a ) { return rcp(dot(a,a)); } + __forceinline float length ( const Vec3fx& a ) { return sqrt(dot(a,a)); } + __forceinline Vec3fx normalize( const Vec3fx& a ) { return a*rsqrt(dot(a,a)); } + __forceinline float distance ( const Vec3fx& a, const Vec3fx& b ) { return length(a-b); } + __forceinline float halfArea ( const Vec3fx& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } + __forceinline float area ( const Vec3fx& d ) { return 2.0f*halfArea(d); } + + __forceinline Vec3fx normalize_safe( const Vec3fx& a ) { + const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); + } + + /*! differentiated normalization */ + __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp) + { + const float pp = dot(p,p); + const float pdp = dot(p,dp); + return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) { + __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f.m128, t.m128, mask); + } + + __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) { + return blendv_ps(f.m128, t.m128, s); + } + + __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + __forceinline int maxDim ( const Vec3fx& a ) + { + const Vec3fx b = abs(a); + if (b.x > b.y) { + if (b.x > b.z) return 0; else return 2; + } else { + if (b.y > b.z) return 1; else return 2; + } + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if defined (__SSE4_1__) && !defined(__aarch64__) + __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } + __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } + __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } +#else + __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); } + __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); } + __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (a.x),ceilf (a.y),ceilf (a.z)); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } + + + typedef Vec3fx Vec3ff; +} diff --git a/thirdparty/embree-aarch64/common/math/vec3ia.h b/thirdparty/embree-aarch64/common/math/vec3ia.h new file mode 100644 index 0000000000..737f67fd72 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/vec3ia.h @@ -0,0 +1,210 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3ia Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3ia + { + ALIGNED_STRUCT_(16); + + union { + __m128i m128; + struct { int x,y,z; }; + }; + + typedef int Scalar; + enum { N = 3 }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia( ) {} + __forceinline Vec3ia( const __m128i a ) : m128(a) {} + __forceinline Vec3ia( const Vec3ia& other ) : m128(other.m128) {} + __forceinline Vec3ia& operator =(const Vec3ia& other) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3ia( const int a ) : m128(_mm_set1_epi32(a)) {} + __forceinline Vec3ia( const int x, const int y, const int z) : m128(_mm_set_epi32(z, z, y, x)) {} + __forceinline explicit Vec3ia( const __m128 a ) : m128(_mm_cvtps_epi32(a)) {} + + __forceinline operator const __m128i&() const { return m128; } + __forceinline operator __m128i&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia( ZeroTy ) : m128(_mm_setzero_si128()) {} + __forceinline Vec3ia( OneTy ) : m128(_mm_set1_epi32(1)) {} + __forceinline Vec3ia( PosInfTy ) : m128(_mm_set1_epi32(pos_inf)) {} + __forceinline Vec3ia( NegInfTy ) : m128(_mm_set1_epi32(neg_inf)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline int& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } + }; + + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia operator +( const Vec3ia& a ) { return a; } + __forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); } +#if (defined(__aarch64__)) + __forceinline Vec3ia abs ( const Vec3ia& a ) { return vabsq_s32(a.m128); } +#elif defined(__SSSE3__) + __forceinline Vec3ia abs ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia operator +( const Vec3ia& a, const Vec3ia& b ) { return _mm_add_epi32(a.m128, b.m128); } + __forceinline Vec3ia operator +( const Vec3ia& a, const int b ) { return a+Vec3ia(b); } + __forceinline Vec3ia operator +( const int a, const Vec3ia& b ) { return Vec3ia(a)+b; } + + __forceinline Vec3ia operator -( const Vec3ia& a, const Vec3ia& b ) { return _mm_sub_epi32(a.m128, b.m128); } + __forceinline Vec3ia operator -( const Vec3ia& a, const int b ) { return a-Vec3ia(b); } + __forceinline Vec3ia operator -( const int a, const Vec3ia& b ) { return Vec3ia(a)-b; } + +#if defined(__aarch64__) || defined(__SSE4_1__) + __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); } + __forceinline Vec3ia operator *( const Vec3ia& a, const int b ) { return a * Vec3ia(b); } + __forceinline Vec3ia operator *( const int a, const Vec3ia& b ) { return Vec3ia(a) * b; } +#endif + + __forceinline Vec3ia operator &( const Vec3ia& a, const Vec3ia& b ) { return _mm_and_si128(a.m128, b.m128); } + __forceinline Vec3ia operator &( const Vec3ia& a, const int b ) { return a & Vec3ia(b); } + __forceinline Vec3ia operator &( const int a, const Vec3ia& b ) { return Vec3ia(a) & b; } + + __forceinline Vec3ia operator |( const Vec3ia& a, const Vec3ia& b ) { return _mm_or_si128(a.m128, b.m128); } + __forceinline Vec3ia operator |( const Vec3ia& a, const int b ) { return a | Vec3ia(b); } + __forceinline Vec3ia operator |( const int a, const Vec3ia& b ) { return Vec3ia(a) | b; } + + __forceinline Vec3ia operator ^( const Vec3ia& a, const Vec3ia& b ) { return _mm_xor_si128(a.m128, b.m128); } + __forceinline Vec3ia operator ^( const Vec3ia& a, const int b ) { return a ^ Vec3ia(b); } + __forceinline Vec3ia operator ^( const int a, const Vec3ia& b ) { return Vec3ia(a) ^ b; } + +#if !defined(__ARM_NEON) + __forceinline Vec3ia operator <<( const Vec3ia& a, const int n ) { return _mm_slli_epi32(a.m128, n); } + __forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) { return _mm_srai_epi32(a.m128, n); } + + __forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { return _mm_slli_epi32(a.m128, b); } + __forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { return _mm_srai_epi32(a.m128, b); } + __forceinline Vec3ia srl ( const Vec3ia& a, const int b ) { return _mm_srli_epi32(a.m128, b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia& operator +=( Vec3ia& a, const Vec3ia& b ) { return a = a + b; } + __forceinline Vec3ia& operator +=( Vec3ia& a, const int& b ) { return a = a + b; } + + __forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { return a = a - b; } + __forceinline Vec3ia& operator -=( Vec3ia& a, const int& b ) { return a = a - b; } + +#if defined(__aarch64__) || defined(__SSE4_1__) + __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; } + __forceinline Vec3ia& operator *=( Vec3ia& a, const int& b ) { return a = a * b; } +#endif + + __forceinline Vec3ia& operator &=( Vec3ia& a, const Vec3ia& b ) { return a = a & b; } + __forceinline Vec3ia& operator &=( Vec3ia& a, const int& b ) { return a = a & b; } + + __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; } + __forceinline Vec3ia& operator |=( Vec3ia& a, const int& b ) { return a = a | b; } + +#if !defined(__ARM_NEON) + __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; } + __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// +#if defined(__aarch64__) + __forceinline int reduce_add(const Vec3ia& v) { + int32x4_t t = v.m128; + t[3] = 0; + return vaddvq_s32(t); + + } + __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } + __forceinline int reduce_min(const Vec3ia& v) { + int32x4_t t = (__m128i)blendv_ps((__m128)v0x7fffffff, (__m128)v.m128, (__m128)vFFF0); + return vminvq_s32(t); + + } + __forceinline int reduce_max(const Vec3ia& v) { + int32x4_t t = (__m128i)blendv_ps((__m128)v0x80000000, (__m128)v.m128, (__m128)vFFF0); + return vmaxvq_s32(t); + + } +#else + __forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; } + __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } + __forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); } + __forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); } +#endif + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) == 7; } + __forceinline bool operator !=( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) != 7; } + __forceinline bool operator < ( const Vec3ia& a, const Vec3ia& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + if (a.z != b.z) return a.z < b.z; + return false; + } + + __forceinline Vec3ba eq_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpeq_epi32 (a.m128, b.m128)); } + __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.m128, b.m128)); } + __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.m128, b.m128)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) { +#if defined(__aarch64__) || defined(__SSE4_1__) + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); +#else + return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f)); +#endif + } + +#if defined(__aarch64__) || defined(__SSE4_1__) + __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); } + __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); } +#else + __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return select(lt_mask(a,b),a,b); } + __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return select(gt_mask(a,b),a,b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ia& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } +} diff --git a/thirdparty/embree-aarch64/common/math/vec4.h b/thirdparty/embree-aarch64/common/math/vec4.h new file mode 100644 index 0000000000..d16542f507 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/vec4.h @@ -0,0 +1,258 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" +#include "vec3.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// Generic 4D vector Class + //////////////////////////////////////////////////////////////////////////////// + + template struct Vec4 + { + enum { N = 4 }; + union { + struct { T x, y, z, w; }; +#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler + T components[N]; +#endif + }; + + typedef T Scalar; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec4( ) {} + __forceinline explicit Vec4( const T& a ) : x(a), y(a), z(a), w(a) {} + __forceinline Vec4( const T& x, const T& y, const T& z, const T& w ) : x(x), y(y), z(z), w(w) {} + __forceinline Vec4( const Vec3& xyz, const T& w ) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {} + + __forceinline Vec4( const Vec4& other ) { x = other.x; y = other.y; z = other.z; w = other.w; } + __forceinline Vec4( const Vec3fx& other ); + + template __forceinline Vec4( const Vec4& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)), w(T(a.w)) {} + template __forceinline Vec4& operator =(const Vec4& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; } + + __forceinline Vec4& operator =(const Vec4& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; } + + __forceinline operator Vec3 () const { return Vec3(x,y,z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec4( ZeroTy ) : x(zero), y(zero), z(zero), w(zero) {} + __forceinline Vec4( OneTy ) : x(one), y(one), z(one), w(one) {} + __forceinline Vec4( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf), w(pos_inf) {} + __forceinline Vec4( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf), w(neg_inf) {} + +#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler + __forceinline const T& operator [](const size_t axis) const { assert(axis < 4); return (&x)[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 4); return (&x)[axis]; } +#else + __forceinline const T& operator [](const size_t axis ) const { assert(axis < 4); return components[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 4); return components[axis]; } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Swizzles + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3 xyz() const { return Vec3(x, y, z); } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec4 operator +( const Vec4& a ) { return Vec4(+a.x, +a.y, +a.z, +a.w); } + template __forceinline Vec4 operator -( const Vec4& a ) { return Vec4(-a.x, -a.y, -a.z, -a.w); } + template __forceinline Vec4 abs ( const Vec4& a ) { return Vec4(abs (a.x), abs (a.y), abs (a.z), abs (a.w)); } + template __forceinline Vec4 rcp ( const Vec4& a ) { return Vec4(rcp (a.x), rcp (a.y), rcp (a.z), rcp (a.w)); } + template __forceinline Vec4 rsqrt ( const Vec4& a ) { return Vec4(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z), rsqrt(a.w)); } + template __forceinline Vec4 sqrt ( const Vec4& a ) { return Vec4(sqrt (a.x), sqrt (a.y), sqrt (a.z), sqrt (a.w)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec4 operator +( const Vec4& a, const Vec4& b ) { return Vec4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } + template __forceinline Vec4 operator -( const Vec4& a, const Vec4& b ) { return Vec4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } + template __forceinline Vec4 operator *( const Vec4& a, const Vec4& b ) { return Vec4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } + template __forceinline Vec4 operator *( const T& a, const Vec4& b ) { return Vec4(a * b.x, a * b.y, a * b.z, a * b.w); } + template __forceinline Vec4 operator *( const Vec4& a, const T& b ) { return Vec4(a.x * b , a.y * b , a.z * b , a.w * b ); } + template __forceinline Vec4 operator /( const Vec4& a, const Vec4& b ) { return Vec4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); } + template __forceinline Vec4 operator /( const Vec4& a, const T& b ) { return Vec4(a.x / b , a.y / b , a.z / b , a.w / b ); } + template __forceinline Vec4 operator /( const T& a, const Vec4& b ) { return Vec4(a / b.x, a / b.y, a / b.z, a / b.w); } + + template __forceinline Vec4 min(const Vec4& a, const Vec4& b) { return Vec4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); } + template __forceinline Vec4 max(const Vec4& a, const Vec4& b) { return Vec4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec4 madd ( const Vec4& a, const Vec4& b, const Vec4& c) { return Vec4( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z), madd(a.w,b.w,c.w)); } + template __forceinline Vec4 msub ( const Vec4& a, const Vec4& b, const Vec4& c) { return Vec4( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z), msub(a.w,b.w,c.w)); } + template __forceinline Vec4 nmadd ( const Vec4& a, const Vec4& b, const Vec4& c) { return Vec4(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z),nmadd(a.w,b.w,c.w)); } + template __forceinline Vec4 nmsub ( const Vec4& a, const Vec4& b, const Vec4& c) { return Vec4(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z),nmsub(a.w,b.w,c.w)); } + + template __forceinline Vec4 madd ( const T& a, const Vec4& b, const Vec4& c) { return Vec4( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z), madd(a,b.w,c.w)); } + template __forceinline Vec4 msub ( const T& a, const Vec4& b, const Vec4& c) { return Vec4( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z), msub(a,b.w,c.w)); } + template __forceinline Vec4 nmadd ( const T& a, const Vec4& b, const Vec4& c) { return Vec4(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z),nmadd(a,b.w,c.w)); } + template __forceinline Vec4 nmsub ( const T& a, const Vec4& b, const Vec4& c) { return Vec4(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z),nmsub(a,b.w,c.w)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec4& operator +=( Vec4& a, const Vec4& b ) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } + template __forceinline Vec4& operator -=( Vec4& a, const Vec4& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; } + template __forceinline Vec4& operator *=( Vec4& a, const T& b ) { a.x *= b ; a.y *= b ; a.z *= b ; a.w *= b ; return a; } + template __forceinline Vec4& operator /=( Vec4& a, const T& b ) { a.x /= b ; a.y /= b ; a.z /= b ; a.w /= b ; return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline T reduce_add( const Vec4& a ) { return a.x + a.y + a.z + a.w; } + template __forceinline T reduce_mul( const Vec4& a ) { return a.x * a.y * a.z * a.w; } + template __forceinline T reduce_min( const Vec4& a ) { return min(a.x, a.y, a.z, a.w); } + template __forceinline T reduce_max( const Vec4& a ) { return max(a.x, a.y, a.z, a.w); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline bool operator ==( const Vec4& a, const Vec4& b ) { return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; } + template __forceinline bool operator !=( const Vec4& a, const Vec4& b ) { return a.x != b.x || a.y != b.y || a.z != b.z || a.w != b.w; } + template __forceinline bool operator < ( const Vec4& a, const Vec4& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + if (a.z != b.z) return a.z < b.z; + if (a.w != b.w) return a.w < b.w; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Shift Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec4 shift_right_1( const Vec4& a ) { + return Vec4(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z),shift_right_1(a.w)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline T dot ( const Vec4& a, const Vec4& b ) { return madd(a.x,b.x,madd(a.y,b.y,madd(a.z,b.z,a.w*b.w))); } + + template __forceinline T length ( const Vec4& a ) { return sqrt(dot(a,a)); } + template __forceinline Vec4 normalize( const Vec4& a ) { return a*rsqrt(dot(a,a)); } + template __forceinline T distance ( const Vec4& a, const Vec4& b ) { return length(a-b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec4 select ( bool s, const Vec4& t, const Vec4& f ) { + return Vec4(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w)); + } + + template __forceinline Vec4 select ( const Vec4& s, const Vec4& t, const Vec4& f ) { + return Vec4(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z),select(s.w,t.w,f.w)); + } + + template __forceinline Vec4 select ( const typename T::Bool& s, const Vec4& t, const Vec4& f ) { + return Vec4(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w)); + } + + template + __forceinline Vec4 lerp(const Vec4& v0, const Vec4& v1, const T& t) { + return madd(Vec4(T(1.0f)-t),v0,t*v1); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline embree_ostream operator<<(embree_ostream cout, const Vec4& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ", " << a.w << ")"; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Default template instantiations + //////////////////////////////////////////////////////////////////////////////// + + typedef Vec4 Vec4b; + typedef Vec4 Vec4uc; + typedef Vec4 Vec4i; + typedef Vec4 Vec4f; +} + +#include "vec3ba.h" +#include "vec3ia.h" +#include "vec3fa.h" + +//////////////////////////////////////////////////////////////////////////////// +/// SSE / AVX / MIC specializations +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE__) || defined(__ARM_NEON) +#include "../simd/sse.h" +#endif + +#if defined __AVX__ +#include "../simd/avx.h" +#endif + +#if defined __AVX512F__ +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template<> __forceinline Vec4::Vec4( const Vec3fx& a ) { x = a.x; y = a.y; z = a.z; w = a.w; } + +#if defined(__AVX__) + template<> __forceinline Vec4::Vec4( const Vec3fx& a ) { + x = a.x; y = a.y; z = a.z; w = a.w; + } +#elif defined(__SSE__) || defined(__ARM_NEON) + template<> __forceinline Vec4::Vec4( const Vec3fx& a ) { + const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v); + } +#endif + +#if defined(__SSE__) || defined(__ARM_NEON) + __forceinline Vec4 broadcast4f( const Vec4& a, const size_t k ) { + return Vec4(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]), vfloat4::broadcast(&a.w[k])); + } +#endif + +#if defined(__AVX__) + template<> __forceinline Vec4::Vec4( const Vec3fx& a ) { + x = a.x; y = a.y; z = a.z; w = a.w; + } + __forceinline Vec4 broadcast4f( const Vec4& a, const size_t k ) { + return Vec4(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]), vfloat4::broadcast(&a.w[k])); + } + __forceinline Vec4 broadcast8f( const Vec4& a, const size_t k ) { + return Vec4(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]), vfloat8::broadcast(&a.w[k])); + } + __forceinline Vec4 broadcast8f( const Vec4& a, const size_t k ) { + return Vec4(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]), vfloat8::broadcast(&a.w[k])); + } +#endif + +#if defined(__AVX512F__) + template<> __forceinline Vec4::Vec4( const Vec3fx& a ) : x(a.x), y(a.y), z(a.z), w(a.w) {} +#endif +} diff --git a/thirdparty/embree-aarch64/common/simd/avx.h b/thirdparty/embree-aarch64/common/simd/avx.h new file mode 100644 index 0000000000..c840e41805 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/avx.h @@ -0,0 +1,34 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "sse.h" + +#if defined(__AVX512VL__) +#include "vboolf8_avx512.h" +#include "vboold4_avx512.h" +#else +#include "vboolf8_avx.h" +#include "vboold4_avx.h" +#endif + +#if defined(__AVX2__) +#include "vint8_avx2.h" +#include "vuint8_avx2.h" +#if defined(__X86_64__) +#include "vllong4_avx2.h" +#endif +#else +#include "vint8_avx.h" +#include "vuint8_avx.h" +#endif +#include "vfloat8_avx.h" +#if defined(__X86_64__) +#include "vdouble4_avx.h" +#endif + +#if defined(__AVX512F__) +#include "avx512.h" +#endif + diff --git a/thirdparty/embree-aarch64/common/simd/avx512.h b/thirdparty/embree-aarch64/common/simd/avx512.h new file mode 100644 index 0000000000..25414ab5b1 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/avx512.h @@ -0,0 +1,41 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/intrinsics.h" +#include "../math/constants.h" +#include "../sys/alloc.h" +#include "varying.h" + +#include "vboolf16_avx512.h" +#include "vint16_avx512.h" +#include "vuint16_avx512.h" +#include "vfloat16_avx512.h" + +#include "vboold8_avx512.h" +#include "vllong8_avx512.h" +#include "vdouble8_avx512.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// Prefetching + //////////////////////////////////////////////////////////////////////////////// + +#define PFHINT_L1 0 +#define PFHINT_L2 1 +#define PFHINT_NT 2 + + template + __forceinline void prefetch(const void * __restrict__ const m) + { + if (mode == PFHINT_L1) + _mm_prefetch((const char*)m,_MM_HINT_T0); + else if (mode == PFHINT_L2) + _mm_prefetch((const char*)m,_MM_HINT_T1); + else if (mode == PFHINT_NT) + _mm_prefetch((const char*)m,_MM_HINT_NTA); + } +} diff --git a/thirdparty/embree-aarch64/common/simd/simd.h b/thirdparty/embree-aarch64/common/simd/simd.h new file mode 100644 index 0000000000..647851110b --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/simd.h @@ -0,0 +1,110 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../math/math.h" + +/* include SSE wrapper classes */ +#if defined(__SSE__) || defined(__ARM_NEON) +# include "sse.h" +#endif + +/* include AVX wrapper classes */ +#if defined(__AVX__) +# include "avx.h" +#endif + +/* include AVX512 wrapper classes */ +#if defined (__AVX512F__) +# include "avx512.h" +#endif + +namespace embree +{ + template + __forceinline vbool isfinite(const vfloat& v) + { + return (v >= vfloat(-std::numeric_limits::max())) + & (v <= vfloat( std::numeric_limits::max())); + } + + /* foreach unique */ + template + __forceinline void foreach_unique(const vbool& valid0, const vint& vi, const Closure& closure) + { + vbool valid1 = valid0; + while (any(valid1)) { + const int j = int(bsf(movemask(valid1))); + const int i = vi[j]; + const vbool valid2 = valid1 & (i == vi); + valid1 = andn(valid1, valid2); + closure(valid2, i); + } + } + + /* returns the next unique value i in vi and the corresponding valid_i mask */ + template + __forceinline int next_unique(vbool& valid, const vint& vi, /*out*/ vbool& valid_i) + { + assert(any(valid)); + const int j = int(bsf(movemask(valid))); + const int i = vi[j]; + valid_i = valid & (i == vi); + valid = andn(valid, valid_i); + return i; + } + + /* foreach unique index */ + template + __forceinline void foreach_unique_index(const vbool& valid0, const vint& vi, const Closure& closure) + { + vbool valid1 = valid0; + while (any(valid1)) { + const int j = int(bsf(movemask(valid1))); + const int i = vi[j]; + const vbool valid2 = valid1 & (i == vi); + valid1 = andn(valid1, valid2); + closure(valid2, i, j); + } + } + + /* returns the index of the next unique value i in vi and the corresponding valid_i mask */ + template + __forceinline int next_unique_index(vbool& valid, const vint& vi, /*out*/ vbool& valid_i) + { + assert(any(valid)); + const int j = int(bsf(movemask(valid))); + const int i = vi[j]; + valid_i = valid & (i == vi); + valid = andn(valid, valid_i); + return j; + } + + template + __forceinline void foreach2(int x0, int x1, int y0, int y1, const Closure& closure) + { + __aligned(64) int U[2*VSIZEX]; + __aligned(64) int V[2*VSIZEX]; + int index = 0; + for (int y=y0; y=y1; + const vintx vy = y; + for (int x=x0; x= x1; + vintx vx = x+vintx(step); + vintx::storeu(&U[index], vx); + vintx::storeu(&V[index], vy); + const int dx = min(x1-x,VSIZEX); + index += dx; + x += dx; + if (index >= VSIZEX || (lastx && lasty)) { + const vboolx valid = vintx(step) < vintx(index); + closure(valid, vintx::load(U), vintx::load(V)); + x-= max(0, index-VSIZEX); + index = 0; + } + } + } + } +} diff --git a/thirdparty/embree-aarch64/common/simd/sse.cpp b/thirdparty/embree-aarch64/common/simd/sse.cpp new file mode 100644 index 0000000000..1732cfa421 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/sse.cpp @@ -0,0 +1,34 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "sse.h" + +namespace embree +{ + const __m128 mm_lookupmask_ps[16] = { + _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1,-1)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1,-1)) + }; + + const __m128d mm_lookupmask_pd[4] = { + _mm_castsi128_pd(_mm_set_epi32( 0, 0, 0, 0)), + _mm_castsi128_pd(_mm_set_epi32( 0, 0,-1,-1)), + _mm_castsi128_pd(_mm_set_epi32(-1,-1, 0, 0)), + _mm_castsi128_pd(_mm_set_epi32(-1,-1,-1,-1)) + }; + +} diff --git a/thirdparty/embree-aarch64/common/simd/sse.h b/thirdparty/embree-aarch64/common/simd/sse.h new file mode 100644 index 0000000000..6bc818b55b --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/sse.h @@ -0,0 +1,35 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/intrinsics.h" +#include "../sys/alloc.h" +#include "../math/constants.h" +#include "varying.h" + +namespace embree +{ +#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__) + __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { + return _mm_blendv_ps(f,t,mask); + } +#else + __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { + return _mm_or_ps(_mm_and_ps(mask, t), _mm_andnot_ps(mask, f)); + } +#endif + + extern const __m128 mm_lookupmask_ps[16]; + extern const __m128d mm_lookupmask_pd[4]; +} + +#if defined(__AVX512VL__) +#include "vboolf4_avx512.h" +#else +#include "vboolf4_sse2.h" +#endif +#include "vint4_sse2.h" +#include "vuint4_sse2.h" +#include "vfloat4_sse2.h" diff --git a/thirdparty/embree-aarch64/common/simd/varying.h b/thirdparty/embree-aarch64/common/simd/varying.h new file mode 100644 index 0000000000..9a46817da9 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/varying.h @@ -0,0 +1,132 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" + +namespace embree +{ + /* Varying numeric types */ + template + struct vfloat + { + union { float f[N]; int i[N]; }; + __forceinline const float& operator [](size_t index) const { assert(index < N); return f[index]; } + __forceinline float& operator [](size_t index) { assert(index < N); return f[index]; } + }; + + template + struct vdouble + { + union { double f[N]; long long i[N]; }; + __forceinline const double& operator [](size_t index) const { assert(index < N); return f[index]; } + __forceinline double& operator [](size_t index) { assert(index < N); return f[index]; } + }; + + template + struct vint + { + int i[N]; + __forceinline const int& operator [](size_t index) const { assert(index < N); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < N); return i[index]; } + }; + + template + struct vuint + { + unsigned int i[N]; + __forceinline const unsigned int& operator [](size_t index) const { assert(index < N); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < N); return i[index]; } + }; + + template + struct vllong + { + long long i[N]; + __forceinline const long long& operator [](size_t index) const { assert(index < N); return i[index]; } + __forceinline long long& operator [](size_t index) { assert(index < N); return i[index]; } + }; + + /* Varying bool types */ + template struct vboolf { int i[N]; }; // for float/int + template struct vboold { long long i[N]; }; // for double/long long + + /* Aliases to default types */ + template using vreal = vfloat; + template using vbool = vboolf; + + /* Varying size constants */ +#if defined(__AVX512VL__) // SKX + const int VSIZEX = 8; // default size + const int VSIZEL = 16; // large size +#elif defined(__AVX512F__) // KNL + const int VSIZEX = 16; + const int VSIZEL = 16; +#elif defined(__AVX__) + const int VSIZEX = 8; + const int VSIZEL = 8; +#else + const int VSIZEX = 4; + const int VSIZEL = 4; +#endif + + /* Extends varying size N to optimal or up to max(N, N2) */ + template + struct vextend + { +#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL + /* use 16-wide SIMD calculations on KNL even for 4 and 8 wide SIMD */ + static const int size = (N2 == VSIZEX) ? VSIZEX : N; + #define SIMD_MODE(N) N, 16 +#else + /* calculate with same SIMD width otherwise */ + static const int size = N; + #define SIMD_MODE(N) N, N +#endif + }; + + /* 4-wide shortcuts */ + typedef vfloat<4> vfloat4; + typedef vdouble<4> vdouble4; + typedef vreal<4> vreal4; + typedef vint<4> vint4; + typedef vuint<4> vuint4; + typedef vllong<4> vllong4; + typedef vbool<4> vbool4; + typedef vboolf<4> vboolf4; + typedef vboold<4> vboold4; + + /* 8-wide shortcuts */ + typedef vfloat<8> vfloat8; + typedef vdouble<8> vdouble8; + typedef vreal<8> vreal8; + typedef vint<8> vint8; + typedef vuint<8> vuint8; + typedef vllong<8> vllong8; + typedef vbool<8> vbool8; + typedef vboolf<8> vboolf8; + typedef vboold<8> vboold8; + + /* 16-wide shortcuts */ + typedef vfloat<16> vfloat16; + typedef vdouble<16> vdouble16; + typedef vreal<16> vreal16; + typedef vint<16> vint16; + typedef vuint<16> vuint16; + typedef vllong<16> vllong16; + typedef vbool<16> vbool16; + typedef vboolf<16> vboolf16; + typedef vboold<16> vboold16; + + /* Default shortcuts */ + typedef vfloat vfloatx; + typedef vdouble vdoublex; + typedef vreal vrealx; + typedef vint vintx; + typedef vuint vuintx; + typedef vllong vllongx; + typedef vbool vboolx; + typedef vboolf vboolfx; + typedef vboold vbooldx; +} diff --git a/thirdparty/embree-aarch64/common/simd/vboold4_avx.h b/thirdparty/embree-aarch64/common/simd/vboold4_avx.h new file mode 100644 index 0000000000..6505ee56f3 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vboold4_avx.h @@ -0,0 +1,160 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 4-wide AVX bool type for 64bit data types*/ + template<> + struct vboold<4> + { + ALIGNED_STRUCT_(32); + + typedef vboold4 Bool; + + enum { size = 4 }; // number of SIMD elements + union { // data + __m256d v; + struct { __m128d vl,vh; }; + long long i[4]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold() {} + __forceinline vboold(const vboold4& a) { v = a.v; } + __forceinline vboold4& operator =(const vboold4& a) { v = a.v; return *this; } + + __forceinline vboold(__m256d a) : v(a) {} + __forceinline vboold(__m256i a) : v(_mm256_castsi256_pd(a)) {} + + __forceinline operator const __m256() const { return _mm256_castpd_ps(v); } + __forceinline operator const __m256i() const { return _mm256_castpd_si256(v); } + __forceinline operator const __m256d() const { return v; } + + __forceinline vboold(int a) + { + assert(a >= 0 && a <= 255); +#if defined (__AVX2__) + const __m256i mask = _mm256_set_epi64x(0x8, 0x4, 0x2, 0x1); + const __m256i b = _mm256_set1_epi64x(a); + const __m256i c = _mm256_and_si256(b,mask); + v = _mm256_castsi256_pd(_mm256_cmpeq_epi64(c,mask)); +#else + vl = mm_lookupmask_pd[a & 0x3]; + vh = mm_lookupmask_pd[a >> 2]; +#endif + } + + __forceinline vboold(__m128d a, __m128d b) : vl(a), vh(b) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold(FalseTy) : v(_mm256_setzero_pd()) {} +#if !defined(__aarch64__) + __forceinline vboold(TrueTy) : v(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ)) {} +#else + __forceinline vboold(TrueTy) : v(_mm256_cmpeq_pd(_mm256_setzero_pd(), _mm256_setzero_pd())) {} +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm256_movemask_pd(v) >> index) & 1; } + __forceinline long long& operator [](size_t index) { assert(index < 4); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !(const vboold4& a) { return _mm256_xor_pd(a, vboold4(embree::True)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm256_and_pd(a, b); } + __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm256_or_pd (a, b); } + __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); } + + __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm256_andnot_pd(b, a); } + + __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; } + __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; } + __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); } + __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(_mm256_xor_pd(a,b),vboold4(embree::True)); } + + __forceinline vboold4 select(const vboold4& mask, const vboold4& t, const vboold4& f) { + return _mm256_blendv_pd(f, t, mask); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + +#if !defined(__aarch64__) + __forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a, b); } + __forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a, b); } +#endif + +#if defined(__AVX2__) + template + __forceinline vboold4 shuffle(const vboold4& v) { + return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template + __forceinline vboold4 shuffle(const vboold4& v) { + return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i, i, i, i)); + } +#endif + + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool reduce_and(const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; } + __forceinline bool reduce_or (const vboold4& a) { return !_mm256_testz_pd(a,a); } + + __forceinline bool all (const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; } + __forceinline bool any (const vboold4& a) { return !_mm256_testz_pd(a,a); } + __forceinline bool none(const vboold4& a) { return _mm256_testz_pd(a,a) != 0; } + + __forceinline bool all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); } + __forceinline bool any (const vboold4& valid, const vboold4& b) { return any(valid & b); } + __forceinline bool none(const vboold4& valid, const vboold4& b) { return none(valid & b); } + + __forceinline unsigned int movemask(const vboold4& a) { return _mm256_movemask_pd(a); } + __forceinline size_t popcnt (const vboold4& a) { return popcnt((size_t)_mm256_movemask_pd(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboold4& a, size_t index) { return a[index]; } + __forceinline void set (vboold4& a, size_t index) { a[index] = -1; } + __forceinline void clear(vboold4& a, size_t index) { a[index] = 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " + << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vboold4_avx512.h b/thirdparty/embree-aarch64/common/simd/vboold4_avx512.h new file mode 100644 index 0000000000..4fe730d713 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vboold4_avx512.h @@ -0,0 +1,140 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 4-wide AVX-512 bool type */ + template<> + struct vboold<4> + { + typedef vboold4 Bool; + typedef vint4 Int; + + enum { size = 4 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold() {} + __forceinline vboold(const vboold4& t) { v = t.v; } + __forceinline vboold4& operator =(const vboold4& f) { v = f.v; return *this; } + + __forceinline vboold(const __mmask8 &t) { v = t; } + __forceinline operator __mmask8() const { return v; } + + __forceinline vboold(bool b) { v = b ? 0xf : 0x0; } + __forceinline vboold(int t) { v = (__mmask8)t; } + __forceinline vboold(unsigned int t) { v = (__mmask8)t; } + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int32 mask */ + __forceinline __m128i mask32() const { + return _mm_movm_epi32(v); + } + + /* return int64 mask */ + __forceinline __m256i mask64() const { + return _mm256_movm_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold(FalseTy) : v(0x0) {} + __forceinline vboold(TrueTy) : v(0xf) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 4); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !(const vboold4& a) { return _mm512_kandn(a, 0xf); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm512_kand(a, b); } + __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm512_kor(a, b); } + __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); } + + __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm512_kandn(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; } + __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; } + __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); } + __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } + + __forceinline vboold4 select(const vboold4& s, const vboold4& a, const vboold4& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboold4& a) { return a.v == 0xf; } + __forceinline int any (const vboold4& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboold4& a) { return _mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); } + __forceinline int any (const vboold4& valid, const vboold4& b) { return any(valid & b); } + __forceinline int none(const vboold4& valid, const vboold4& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboold4& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboold4& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt(const vboold4& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboold4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; } + __forceinline void set(vboold4& a, size_t index) { assert(index < 4); a |= 1 << index; } + __forceinline void clear(vboold4& a, size_t index) { assert(index < 4); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a) + { + cout << "<"; + for (size_t i=0; i<4; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vboold8_avx512.h b/thirdparty/embree-aarch64/common/simd/vboold8_avx512.h new file mode 100644 index 0000000000..fdf3f00de5 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vboold8_avx512.h @@ -0,0 +1,148 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 8-wide AVX-512 bool type */ + template<> + struct vboold<8> + { + typedef vboold8 Bool; + typedef vint8 Int; + + enum { size = 8 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold() {} + __forceinline vboold(const vboold8& t) { v = t.v; } + __forceinline vboold8& operator =(const vboold8& f) { v = f.v; return *this; } + + __forceinline vboold(const __mmask8& t) { v = t; } + __forceinline operator __mmask8() const { return v; } + + __forceinline vboold(bool b) { v = b ? 0xff : 0x00; } + __forceinline vboold(int t) { v = (__mmask8)t; } + __forceinline vboold(unsigned int t) { v = (__mmask8)t; } + + /* return int8 mask */ + __forceinline __m128i mask8() const { +#if defined(__AVX512BW__) + return _mm_movm_epi8(v); +#else + const __m512i f = _mm512_set1_epi64(0); + const __m512i t = _mm512_set1_epi64(-1); + const __m512i m = _mm512_mask_or_epi64(f,v,t,t); + return _mm512_cvtepi64_epi8(m); +#endif + } + + /* return int64 mask */ + __forceinline __m512i mask64() const { +#if defined(__AVX512DQ__) + return _mm512_movm_epi64(v); +#else + const __m512i f = _mm512_set1_epi64(0); + const __m512i t = _mm512_set1_epi64(-1); + return _mm512_mask_or_epi64(f,v,t,t); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold(FalseTy) : v(0x00) {} + __forceinline vboold(TrueTy) : v(0xff) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 8); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator !(const vboold8& a) { return _mm512_knot(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator &(const vboold8& a, const vboold8& b) { return _mm512_kand(a, b); } + __forceinline vboold8 operator |(const vboold8& a, const vboold8& b) { return _mm512_kor(a, b); } + __forceinline vboold8 operator ^(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); } + + __forceinline vboold8 andn(const vboold8& a, const vboold8& b) { return _mm512_kandn(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8& operator &=(vboold8& a, const vboold8& b) { return a = a & b; } + __forceinline vboold8& operator |=(vboold8& a, const vboold8& b) { return a = a | b; } + __forceinline vboold8& operator ^=(vboold8& a, const vboold8& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator !=(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); } + __forceinline vboold8 operator ==(const vboold8& a, const vboold8& b) { return _mm512_kxnor(a, b); } + + __forceinline vboold8 select(const vboold8& s, const vboold8& a, const vboold8& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboold8& a) { return a.v == 0xff; } + __forceinline int any (const vboold8& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboold8& a) { return _mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboold8& valid, const vboold8& b) { return all((!valid) | b); } + __forceinline int any (const vboold8& valid, const vboold8& b) { return any(valid & b); } + __forceinline int none(const vboold8& valid, const vboold8& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboold8& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboold8& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt(const vboold8& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboold8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; } + __forceinline void set(vboold8& a, size_t index) { assert(index < 8); a |= 1 << index; } + __forceinline void clear(vboold8& a, size_t index) { assert(index < 8); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboold8& a) + { + cout << "<"; + for (size_t i=0; i<8; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h b/thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h new file mode 100644 index 0000000000..238cdc8eb9 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h @@ -0,0 +1,150 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 16-wide AVX-512 bool type */ + template<> + struct vboolf<16> + { + typedef vboolf16 Bool; + typedef vint16 Int; + typedef vfloat16 Float; + + enum { size = 16 }; // number of SIMD elements + __mmask16 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf16& t) { v = t.v; } + __forceinline vboolf16& operator =(const vboolf16& f) { v = f.v; return *this; } + + __forceinline vboolf(const __mmask16& t) { v = t; } + __forceinline operator __mmask16() const { return v; } + + __forceinline vboolf(bool b) { v = b ? 0xFFFF : 0x0000; } + __forceinline vboolf(int t) { v = (__mmask16)t; } + __forceinline vboolf(unsigned int t) { v = (__mmask16)t; } + + /* return int8 mask */ + __forceinline __m128i mask8() const { +#if defined(__AVX512BW__) + return _mm_movm_epi8(v); +#else + const __m512i f = _mm512_set1_epi32(0); + const __m512i t = _mm512_set1_epi32(-1); + const __m512i m = _mm512_mask_or_epi32(f,v,t,t); + return _mm512_cvtepi32_epi8(m); +#endif + } + + /* return int32 mask */ + __forceinline __m512i mask32() const { +#if defined(__AVX512DQ__) + return _mm512_movm_epi32(v); +#else + const __m512i f = _mm512_set1_epi32(0); + const __m512i t = _mm512_set1_epi32(-1); + return _mm512_mask_or_epi32(f,v,t,t); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(0x0000) {} + __forceinline vboolf(TrueTy) : v(0xffff) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 16); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator !(const vboolf16& a) { return _mm512_knot(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator &(const vboolf16& a, const vboolf16& b) { return _mm512_kand(a,b); } + __forceinline vboolf16 operator |(const vboolf16& a, const vboolf16& b) { return _mm512_kor(a,b); } + __forceinline vboolf16 operator ^(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a,b); } + + __forceinline vboolf16 andn(const vboolf16& a, const vboolf16& b) { return _mm512_kandn(b,a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16& operator &=(vboolf16& a, const vboolf16& b) { return a = a & b; } + __forceinline vboolf16& operator |=(vboolf16& a, const vboolf16& b) { return a = a | b; } + __forceinline vboolf16& operator ^=(vboolf16& a, const vboolf16& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator !=(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a, b); } + __forceinline vboolf16 operator ==(const vboolf16& a, const vboolf16& b) { return _mm512_kxnor(a, b); } + + __forceinline vboolf16 select(const vboolf16& s, const vboolf16& a, const vboolf16& b) { + return _mm512_kor(_mm512_kand(s,a),_mm512_kandn(s,b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboolf16& a) { return _mm512_kortestc(a,a) != 0; } + __forceinline int any (const vboolf16& a) { return _mm512_kortestz(a,a) == 0; } + __forceinline int none(const vboolf16& a) { return _mm512_kortestz(a,a) != 0; } + + __forceinline int all (const vboolf16& valid, const vboolf16& b) { return all((!valid) | b); } + __forceinline int any (const vboolf16& valid, const vboolf16& b) { return any(valid & b); } + __forceinline int none(const vboolf16& valid, const vboolf16& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboolf16& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboolf16& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Convertion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt (const vboolf16& a) { return mm512_mask2int(a); } + __forceinline vboolf16 toMask(const int& a) { return mm512_int2mask(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf16& a, size_t index) { assert(index < 16); return (toInt(a) >> index) & 1; } + __forceinline void set(vboolf16& a, size_t index) { assert(index < 16); a |= 1 << index; } + __forceinline void clear(vboolf16& a, size_t index) { assert(index < 16); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf16& a) + { + cout << "<"; + for (size_t i=0; i<16; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h b/thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h new file mode 100644 index 0000000000..2ae4c4470e --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h @@ -0,0 +1,143 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 4-wide AVX-512 bool type */ + template<> + struct vboolf<4> + { + typedef vboolf4 Bool; + typedef vint4 Int; + + enum { size = 4 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf4& t) { v = t.v; } + __forceinline vboolf4& operator =(const vboolf4& f) { v = f.v; return *this; } + + __forceinline vboolf(const __mmask8 &t) { v = t; } + __forceinline operator __mmask8() const { return v; } + + __forceinline vboolf(bool b) { v = b ? 0xf : 0x0; } + __forceinline vboolf(int t) { v = (__mmask8)t; } + __forceinline vboolf(unsigned int t) { v = (__mmask8)t; } + + __forceinline vboolf(bool a, bool b, bool c, bool d) + : v((__mmask8)((int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {} + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int32 mask */ + __forceinline __m128i mask32() const { + return _mm_movm_epi32(v); + } + + /* return int64 mask */ + __forceinline __m256i mask64() const { + return _mm256_movm_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(0x0) {} + __forceinline vboolf(TrueTy) : v(0xf) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 4); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator !(const vboolf4& a) { return _mm512_kandn(a, 0xf); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm512_kand(a, b); } + __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm512_kor(a, b); } + __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); } + + __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm512_kandn(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; } + __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; } + __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); } + __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } + + __forceinline vboolf4 select(const vboolf4& s, const vboolf4& a, const vboolf4& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboolf4& a) { return a.v == 0xf; } + __forceinline int any (const vboolf4& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboolf4& a) { return _mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); } + __forceinline int any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); } + __forceinline int none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboolf4& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboolf4& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt(const vboolf4& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; } + __forceinline void set(vboolf4& a, size_t index) { assert(index < 4); a |= 1 << index; } + __forceinline void clear(vboolf4& a, size_t index) { assert(index < 4); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) + { + cout << "<"; + for (size_t i=0; i<4; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h b/thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h new file mode 100644 index 0000000000..ed53b3c783 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h @@ -0,0 +1,198 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 4-wide SSE bool type */ + template<> + struct vboolf<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 Bool; + typedef vint4 Int; + typedef vfloat4 Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128 v; int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf4& other) { v = other.v; } + __forceinline vboolf4& operator =(const vboolf4& other) { v = other.v; return *this; } + + __forceinline vboolf(__m128 input) : v(input) {} + __forceinline operator const __m128&() const { return v; } + __forceinline operator const __m128i() const { return _mm_castps_si128(v); } + __forceinline operator const __m128d() const { return _mm_castps_pd(v); } + + __forceinline vboolf(bool a) + : v(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} + __forceinline vboolf(bool a, bool b) + : v(mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {} + __forceinline vboolf(bool a, bool b, bool c, bool d) + : v(mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {} +#if defined(__aarch64__) && defined(BUILD_IOS) + __forceinline vboolf(int mask) { v = mm_lookupmask_ps[mask]; } + __forceinline vboolf(unsigned int mask) { v = mm_lookupmask_ps[mask]; } +#else + __forceinline vboolf(int mask) { assert(mask >= 0 && mask < 16); v = mm_lookupmask_ps[mask]; } + __forceinline vboolf(unsigned int mask) { assert(mask < 16); v = mm_lookupmask_ps[mask]; } +#endif + /* return int32 mask */ + __forceinline __m128i mask32() const { + return _mm_castps_si128(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(_mm_setzero_ps()) {} + __forceinline vboolf(TrueTy) : v(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__aarch64__) && defined(BUILD_IOS) + __forceinline bool operator [](size_t index) const { return (_mm_movemask_ps(v) >> index) & 1; } + __forceinline int& operator [](size_t index) { return i[index]; } +#else + __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm_movemask_ps(v) >> index) & 1; } + __forceinline int& operator [](size_t index) { assert(index < 4); return i[index]; } +#endif + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator !(const vboolf4& a) { return _mm_xor_ps(a, vboolf4(embree::True)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm_and_ps(a, b); } + __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm_or_ps (a, b); } + __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); } + + __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm_andnot_ps(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; } + __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; } + __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); } + __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } + + __forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) { +#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__) + return _mm_blendv_ps(f, t, m); +#else + return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a, b); } + __forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a, b); } + +#if defined(__aarch64__) + template + __forceinline vboolf4 shuffle(const vboolf4& v) { + return vreinterpretq_f32_u8(vqtbl1q_u8( vreinterpretq_u8_s32(v), _MN_SHUFFLE(i0, i1, i2, i3))); + } + + template + __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) { + return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); + } +#else + template + __forceinline vboolf4 shuffle(const vboolf4& v) { + return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) { + return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } +#endif + + template + __forceinline vboolf4 shuffle(const vboolf4& v) { + return shuffle(v); + } + +#if defined(__SSE3__) + template<> __forceinline vboolf4 shuffle<0, 0, 2, 2>(const vboolf4& v) { return _mm_moveldup_ps(v); } + template<> __forceinline vboolf4 shuffle<1, 1, 3, 3>(const vboolf4& v) { return _mm_movehdup_ps(v); } + template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v)); } +#endif + +#if defined(__SSE4_1__) && !defined(__aarch64__) + template __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } + template __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return insert(a, b); } + template __forceinline vboolf4 insert(const vboolf4& a, const bool b) { return insert(a, vboolf4(b)); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool reduce_and(const vboolf4& a) { return _mm_movemask_ps(a) == 0xf; } + __forceinline bool reduce_or (const vboolf4& a) { return _mm_movemask_ps(a) != 0x0; } + + __forceinline bool all (const vboolf4& b) { return _mm_movemask_ps(b) == 0xf; } + __forceinline bool any (const vboolf4& b) { return _mm_movemask_ps(b) != 0x0; } + __forceinline bool none(const vboolf4& b) { return _mm_movemask_ps(b) == 0x0; } + + __forceinline bool all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); } + __forceinline bool any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); } + __forceinline bool none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a); } +#if defined(__aarch64__) && defined(BUILD_IOS) +__forceinline size_t popcnt(const vboolf4& a) { return _mm_movemask_popcnt_ps(a); } +#else +#if defined(__SSE4_2__) + __forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a)); } +#else + __forceinline size_t popcnt(const vboolf4& a) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); } +#endif +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf4& a, size_t index) { return a[index]; } + __forceinline void set(vboolf4& a, size_t index) { a[index] = -1; } + __forceinline void clear(vboolf4& a, size_t index) { a[index] = 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vboolf8_avx.h b/thirdparty/embree-aarch64/common/simd/vboolf8_avx.h new file mode 100644 index 0000000000..4f64741b55 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vboolf8_avx.h @@ -0,0 +1,189 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 8-wide AVX bool type */ + template<> + struct vboolf<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256 v; + struct { __m128 vl,vh; }; + int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf8& a) { v = a.v; } + __forceinline vboolf8& operator =(const vboolf8& a) { v = a.v; return *this; } + + __forceinline vboolf(__m256 a) : v(a) {} + __forceinline operator const __m256&() const { return v; } + __forceinline operator const __m256i() const { return _mm256_castps_si256(v); } + __forceinline operator const __m256d() const { return _mm256_castps_pd(v); } + + __forceinline vboolf(int a) + { + assert(a >= 0 && a <= 255); +#if defined (__AVX2__) + const __m256i mask = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1); + const __m256i b = _mm256_set1_epi32(a); + const __m256i c = _mm256_and_si256(b,mask); + v = _mm256_castsi256_ps(_mm256_cmpeq_epi32(c,mask)); +#else + vl = mm_lookupmask_ps[a & 0xF]; + vh = mm_lookupmask_ps[a >> 4]; +#endif + } + + __forceinline vboolf(const vboolf4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {} + __forceinline vboolf(const vboolf4& a, const vboolf4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} + __forceinline vboolf(__m128 a, __m128 b) : vl(a), vh(b) {} + + __forceinline vboolf(bool a) : v(vboolf8(vboolf4(a), vboolf4(a))) {} + __forceinline vboolf(bool a, bool b) : v(vboolf8(vboolf4(a), vboolf4(b))) {} + __forceinline vboolf(bool a, bool b, bool c, bool d) : v(vboolf8(vboolf4(a,b), vboolf4(c,d))) {} + __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) : v(vboolf8(vboolf4(a,b,c,d), vboolf4(e,f,g,h))) {} + + /* return int32 mask */ + __forceinline __m256i mask32() const { + return _mm256_castps_si256(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(_mm256_setzero_ps()) {} +#if !defined(__aarch64__) + __forceinline vboolf(TrueTy) : v(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_OQ)) {} +#else + __forceinline vboolf(TrueTy) : v(_mm256_cmpeq_ps(_mm256_setzero_ps(), _mm256_setzero_ps())) {} +#endif + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { assert(index < 8); return (_mm256_movemask_ps(v) >> index) & 1; } + __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !(const vboolf8& a) { return _mm256_xor_ps(a, vboolf8(embree::True)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm256_and_ps(a, b); } + __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm256_or_ps (a, b); } + __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); } + + __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm256_andnot_ps(b, a); } + + __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; } + __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; } + __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); } + __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(_mm256_xor_ps(a,b),vboolf8(embree::True)); } + + __forceinline vboolf8 select(const vboolf8& mask, const vboolf8& t, const vboolf8& f) { + return _mm256_blendv_ps(f, t, mask); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 unpacklo(const vboolf8& a, const vboolf8& b) { return _mm256_unpacklo_ps(a, b); } + __forceinline vboolf8 unpackhi(const vboolf8& a, const vboolf8& b) { return _mm256_unpackhi_ps(a, b); } + + template + __forceinline vboolf8 shuffle(const vboolf8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); + } + + template + __forceinline vboolf8 shuffle4(const vboolf8& v) { + return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vboolf8 shuffle4(const vboolf8& a, const vboolf8& b) { + return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vboolf8 shuffle(const vboolf8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template + __forceinline vboolf8 shuffle(const vboolf8& a, const vboolf8& b) { + return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<> __forceinline vboolf8 shuffle<0, 0, 2, 2>(const vboolf8& v) { return _mm256_moveldup_ps(v); } + template<> __forceinline vboolf8 shuffle<1, 1, 3, 3>(const vboolf8& v) { return _mm256_movehdup_ps(v); } + template<> __forceinline vboolf8 shuffle<0, 1, 0, 1>(const vboolf8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } + + template __forceinline vboolf8 insert4(const vboolf8& a, const vboolf4& b) { return _mm256_insertf128_ps(a, b, i); } + template __forceinline vboolf4 extract4 (const vboolf8& a) { return _mm256_extractf128_ps(a, i); } + template<> __forceinline vboolf4 extract4<0>(const vboolf8& a) { return _mm256_castps256_ps128(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool reduce_and(const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; } + __forceinline bool reduce_or (const vboolf8& a) { return !_mm256_testz_ps(a,a); } + + __forceinline bool all (const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; } + __forceinline bool any (const vboolf8& a) { return !_mm256_testz_ps(a,a); } + __forceinline bool none(const vboolf8& a) { return _mm256_testz_ps(a,a) != 0; } + + __forceinline bool all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); } + __forceinline bool any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); } + __forceinline bool none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); } + + __forceinline unsigned int movemask(const vboolf8& a) { return _mm256_movemask_ps(a); } + __forceinline size_t popcnt (const vboolf8& a) { return popcnt((size_t)_mm256_movemask_ps(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf8& a, size_t index) { return a[index]; } + __forceinline void set(vboolf8& a, size_t index) { a[index] = -1; } + __forceinline void clear(vboolf8& a, size_t index) { a[index] = 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " + << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h b/thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h new file mode 100644 index 0000000000..2a52b554c7 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h @@ -0,0 +1,143 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 8-wide AVX-512 bool type */ + template<> + struct vboolf<8> + { + typedef vboolf8 Bool; + typedef vint8 Int; + + enum { size = 8 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf8& t) { v = t.v; } + __forceinline vboolf8& operator =(const vboolf8& f) { v = f.v; return *this; } + + __forceinline vboolf(const __mmask8 &t) { v = t; } + __forceinline operator __mmask8() const { return v; } + + __forceinline vboolf(bool b) { v = b ? 0xff : 0x00; } + __forceinline vboolf(int t) { v = (__mmask8)t; } + __forceinline vboolf(unsigned int t) { v = (__mmask8)t; } + + __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) + : v((__mmask8)((int(h) << 7) | (int(g) << 6) | (int(f) << 5) | (int(e) << 4) | (int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {} + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int32 mask */ + __forceinline __m256i mask32() const { + return _mm256_movm_epi32(v); + } + + /* return int64 mask */ + __forceinline __m512i mask64() const { + return _mm512_movm_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(0x00) {} + __forceinline vboolf(TrueTy) : v(0xff) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 8); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !(const vboolf8& a) { return _mm512_knot(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm512_kand(a, b); } + __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm512_kor(a, b); } + __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); } + + __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm512_kandn(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; } + __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; } + __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); } + __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm512_kxnor(a, b); } + + __forceinline vboolf8 select(const vboolf8& s, const vboolf8& a, const vboolf8& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboolf8& a) { return a.v == 0xff; } + __forceinline int any (const vboolf8& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboolf8& a) { return _mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); } + __forceinline int any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); } + __forceinline int none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboolf8& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboolf8& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt(const vboolf8& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; } + __forceinline void set(vboolf8& a, size_t index) { assert(index < 8); a |= 1 << index; } + __forceinline void clear(vboolf8& a, size_t index) { assert(index < 8); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) + { + cout << "<"; + for (size_t i=0; i<8; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vdouble4_avx.h b/thirdparty/embree-aarch64/common/simd/vdouble4_avx.h new file mode 100644 index 0000000000..1f65b45d7e --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vdouble4_avx.h @@ -0,0 +1,324 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 4-wide AVX 64-bit double type */ + template<> + struct vdouble<4> + { + ALIGNED_STRUCT_(32); + + typedef vboold4 Bool; + + enum { size = 4 }; // number of SIMD elements + union { // data + __m256d v; + double i[4]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble() {} + __forceinline vdouble(const vdouble4& t) { v = t.v; } + __forceinline vdouble4& operator =(const vdouble4& f) { v = f.v; return *this; } + + __forceinline vdouble(const __m256d& t) { v = t; } + __forceinline operator __m256d() const { return v; } + + __forceinline vdouble(double i) { + v = _mm256_set1_pd(i); + } + + __forceinline vdouble(double a, double b, double c, double d) { + v = _mm256_set_pd(d,c,b,a); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble(ZeroTy) : v(_mm256_setzero_pd()) {} + __forceinline vdouble(OneTy) : v(_mm256_set1_pd(1)) {} + __forceinline vdouble(StepTy) : v(_mm256_set_pd(3.0,2.0,1.0,0.0)) {} + __forceinline vdouble(ReverseStepTy) : v(_mm256_setr_pd(3.0,2.0,1.0,0.0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(double *__restrict__ ptr, const vdouble4& a) { + _mm256_stream_pd(ptr, a); + } + + static __forceinline vdouble4 loadu(const double* addr) { + return _mm256_loadu_pd(addr); + } + + static __forceinline vdouble4 load(const vdouble4* addr) { + return _mm256_load_pd((double*)addr); + } + + static __forceinline vdouble4 load(const double* addr) { + return _mm256_load_pd(addr); + } + + static __forceinline void store(double* ptr, const vdouble4& v) { + _mm256_store_pd(ptr, v); + } + + static __forceinline void storeu(double* ptr, const vdouble4& v) { + _mm256_storeu_pd(ptr, v); + } + + static __forceinline vdouble4 broadcast(const void* a) { return _mm256_set1_pd(*(double*)a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline double& operator [](size_t index) { assert(index < 4); return i[index]; } + __forceinline const double& operator [](size_t index) const { assert(index < 4); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline vdouble4 asDouble(const vllong4& a) { return _mm256_castsi256_pd(a); } + __forceinline vllong4 asLLong (const vdouble4& a) { return _mm256_castpd_si256(a); } +#endif + + __forceinline vdouble4 operator +(const vdouble4& a) { return a; } + __forceinline vdouble4 operator -(const vdouble4& a) { return _mm256_sub_pd(_mm256_setzero_pd(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble4 operator +(const vdouble4& a, const vdouble4& b) { return _mm256_add_pd(a, b); } + __forceinline vdouble4 operator +(const vdouble4& a, double b) { return a + vdouble4(b); } + __forceinline vdouble4 operator +(double a, const vdouble4& b) { return vdouble4(a) + b; } + + __forceinline vdouble4 operator -(const vdouble4& a, const vdouble4& b) { return _mm256_sub_pd(a, b); } + __forceinline vdouble4 operator -(const vdouble4& a, double b) { return a - vdouble4(b); } + __forceinline vdouble4 operator -(double a, const vdouble4& b) { return vdouble4(a) - b; } + + __forceinline vdouble4 operator *(const vdouble4& a, const vdouble4& b) { return _mm256_mul_pd(a, b); } + __forceinline vdouble4 operator *(const vdouble4& a, double b) { return a * vdouble4(b); } + __forceinline vdouble4 operator *(double a, const vdouble4& b) { return vdouble4(a) * b; } + + __forceinline vdouble4 operator &(const vdouble4& a, const vdouble4& b) { return _mm256_and_pd(a, b); } + __forceinline vdouble4 operator &(const vdouble4& a, double b) { return a & vdouble4(b); } + __forceinline vdouble4 operator &(double a, const vdouble4& b) { return vdouble4(a) & b; } + + __forceinline vdouble4 operator |(const vdouble4& a, const vdouble4& b) { return _mm256_or_pd(a, b); } + __forceinline vdouble4 operator |(const vdouble4& a, double b) { return a | vdouble4(b); } + __forceinline vdouble4 operator |(double a, const vdouble4& b) { return vdouble4(a) | b; } + + __forceinline vdouble4 operator ^(const vdouble4& a, const vdouble4& b) { return _mm256_xor_pd(a, b); } + __forceinline vdouble4 operator ^(const vdouble4& a, double b) { return a ^ vdouble4(b); } + __forceinline vdouble4 operator ^(double a, const vdouble4& b) { return vdouble4(a) ^ b; } + + __forceinline vdouble4 min(const vdouble4& a, const vdouble4& b) { return _mm256_min_pd(a, b); } + __forceinline vdouble4 min(const vdouble4& a, double b) { return min(a,vdouble4(b)); } + __forceinline vdouble4 min(double a, const vdouble4& b) { return min(vdouble4(a),b); } + + __forceinline vdouble4 max(const vdouble4& a, const vdouble4& b) { return _mm256_max_pd(a, b); } + __forceinline vdouble4 max(const vdouble4& a, double b) { return max(a,vdouble4(b)); } + __forceinline vdouble4 max(double a, const vdouble4& b) { return max(vdouble4(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__FMA__) + __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmadd_pd(a,b,c); } + __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmsub_pd(a,b,c); } + __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmadd_pd(a,b,c); } + __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmsub_pd(a,b,c); } +#else + __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b+c; } + __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b-c; } + __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b+c;} + __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b-c; } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble4& operator +=(vdouble4& a, const vdouble4& b) { return a = a + b; } + __forceinline vdouble4& operator +=(vdouble4& a, double b) { return a = a + b; } + + __forceinline vdouble4& operator -=(vdouble4& a, const vdouble4& b) { return a = a - b; } + __forceinline vdouble4& operator -=(vdouble4& a, double b) { return a = a - b; } + + __forceinline vdouble4& operator *=(vdouble4& a, const vdouble4& b) { return a = a * b; } + __forceinline vdouble4& operator *=(vdouble4& a, double b) { return a = a * b; } + + __forceinline vdouble4& operator &=(vdouble4& a, const vdouble4& b) { return a = a & b; } + __forceinline vdouble4& operator &=(vdouble4& a, double b) { return a = a & b; } + + __forceinline vdouble4& operator |=(vdouble4& a, const vdouble4& b) { return a = a | b; } + __forceinline vdouble4& operator |=(vdouble4& a, double b) { return a = a | b; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_EQ); } + __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_NE); } + __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LT); } + __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GE); } + __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GT); } + __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LE); } +#elif !defined(__aarch64__) + __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); } + __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); } + __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); } + __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); } + __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); } + __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); } +#else + __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmpeq_pd(a, b); } + __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpneq_pd(a, b); } + __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmplt_pd(a, b); } + __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpnlt_pd(a, b); } + __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmpnle_pd(a, b); } + __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmple_pd(a, b); } +#endif + + __forceinline vboold4 operator ==(const vdouble4& a, double b) { return a == vdouble4(b); } + __forceinline vboold4 operator ==(double a, const vdouble4& b) { return vdouble4(a) == b; } + + __forceinline vboold4 operator !=(const vdouble4& a, double b) { return a != vdouble4(b); } + __forceinline vboold4 operator !=(double a, const vdouble4& b) { return vdouble4(a) != b; } + + __forceinline vboold4 operator < (const vdouble4& a, double b) { return a < vdouble4(b); } + __forceinline vboold4 operator < (double a, const vdouble4& b) { return vdouble4(a) < b; } + + __forceinline vboold4 operator >=(const vdouble4& a, double b) { return a >= vdouble4(b); } + __forceinline vboold4 operator >=(double a, const vdouble4& b) { return vdouble4(a) >= b; } + + __forceinline vboold4 operator > (const vdouble4& a, double b) { return a > vdouble4(b); } + __forceinline vboold4 operator > (double a, const vdouble4& b) { return vdouble4(a) > b; } + + __forceinline vboold4 operator <=(const vdouble4& a, double b) { return a <= vdouble4(b); } + __forceinline vboold4 operator <=(double a, const vdouble4& b) { return vdouble4(a) <= b; } + + __forceinline vboold4 eq(const vdouble4& a, const vdouble4& b) { return a == b; } + __forceinline vboold4 ne(const vdouble4& a, const vdouble4& b) { return a != b; } + __forceinline vboold4 lt(const vdouble4& a, const vdouble4& b) { return a < b; } + __forceinline vboold4 ge(const vdouble4& a, const vdouble4& b) { return a >= b; } + __forceinline vboold4 gt(const vdouble4& a, const vdouble4& b) { return a > b; } + __forceinline vboold4 le(const vdouble4& a, const vdouble4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a == b); } + __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a != b); } + __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a < b); } + __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a >= b); } + __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a > b); } + __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a <= b); } +#endif + + __forceinline vdouble4 select(const vboold4& m, const vdouble4& t, const vdouble4& f) { +#if defined(__AVX512VL__) + return _mm256_mask_blend_pd(m, f, t); +#else + return _mm256_blendv_pd(f, t, m); +#endif + } + + __forceinline void xchg(const vboold4& m, vdouble4& a, vdouble4& b) { + const vdouble4 c = a; a = select(m,b,a); b = select(m,c,b); + } + + __forceinline vboold4 test(const vdouble4& a, const vdouble4& b) { +#if defined(__AVX512VL__) + return _mm256_test_epi64_mask(_mm256_castpd_si256(a),_mm256_castpd_si256(b)); +#else + return _mm256_testz_si256(_mm256_castpd_si256(a),_mm256_castpd_si256(b)); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template + __forceinline vdouble4 shuffle(const vdouble4& v) { + return _mm256_permute_pd(v, (i1 << 3) | (i0 << 2) | (i1 << 1) | i0); + } + + template + __forceinline vdouble4 shuffle(const vdouble4& v) { + return shuffle(v); + } + + template + __forceinline vdouble4 shuffle2(const vdouble4& v) { + return _mm256_permute2f128_pd(v, v, (i1 << 4) | i0); + } + + __forceinline double toScalar(const vdouble4& v) { + return _mm_cvtsd_f64(_mm256_castpd256_pd128(v)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble4 vreduce_min2(const vdouble4& x) { return min(x, shuffle<1,0>(x)); } + __forceinline vdouble4 vreduce_min (const vdouble4& y) { const vdouble4 x = vreduce_min2(y); return min(x, shuffle2<1,0>(x)); } + + __forceinline vdouble4 vreduce_max2(const vdouble4& x) { return max(x,shuffle<1,0>(x)); } + __forceinline vdouble4 vreduce_max (const vdouble4& y) { const vdouble4 x = vreduce_max2(y); return max(x, shuffle2<1,0>(x)); } + + __forceinline vdouble4 vreduce_and2(const vdouble4& x) { return x & shuffle<1,0>(x); } + __forceinline vdouble4 vreduce_and (const vdouble4& y) { const vdouble4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); } + + __forceinline vdouble4 vreduce_or2(const vdouble4& x) { return x | shuffle<1,0>(x); } + __forceinline vdouble4 vreduce_or (const vdouble4& y) { const vdouble4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); } + + __forceinline vdouble4 vreduce_add2(const vdouble4& x) { return x + shuffle<1,0>(x); } + __forceinline vdouble4 vreduce_add (const vdouble4& y) { const vdouble4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); } + + __forceinline double reduce_add(const vdouble4& a) { return toScalar(vreduce_add(a)); } + __forceinline double reduce_min(const vdouble4& a) { return toScalar(vreduce_min(a)); } + __forceinline double reduce_max(const vdouble4& a) { return toScalar(vreduce_max(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble4& v) + { + cout << "<" << v[0]; + for (size_t i=1; i<4; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h b/thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h new file mode 100644 index 0000000000..4eec7d2f6a --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h @@ -0,0 +1,356 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 8-wide AVX-512 64-bit double type */ + template<> + struct vdouble<8> + { + ALIGNED_STRUCT_(64); + + typedef vboold8 Bool; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m512d v; + double i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble() {} + __forceinline vdouble(const vdouble8& t) { v = t.v; } + __forceinline vdouble8& operator =(const vdouble8& f) { v = f.v; return *this; } + + __forceinline vdouble(const __m512d& t) { v = t; } + __forceinline operator __m512d() const { return v; } + __forceinline operator __m256d() const { return _mm512_castpd512_pd256(v); } + + __forceinline vdouble(double i) { + v = _mm512_set1_pd(i); + } + + __forceinline vdouble(double a, double b, double c, double d) { + v = _mm512_set4_pd(d,c,b,a); + } + + __forceinline vdouble(double a0, double a1, double a2, double a3, + double a4, double a5, double a6, double a7) + { + v = _mm512_set_pd(a7,a6,a5,a4,a3,a2,a1,a0); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble(ZeroTy) : v(_mm512_setzero_pd()) {} + __forceinline vdouble(OneTy) : v(_mm512_set1_pd(1)) {} + __forceinline vdouble(StepTy) : v(_mm512_set_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {} + __forceinline vdouble(ReverseStepTy) : v(_mm512_setr_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(void *__restrict__ ptr, const vdouble8& a) { + _mm512_stream_pd((double*)ptr, a); + } + + static __forceinline vdouble8 loadu(const void* addr) { + return _mm512_loadu_pd((double*)addr); + } + + static __forceinline vdouble8 load(const vdouble8* addr) { + return _mm512_load_pd((double*)addr); + } + + static __forceinline vdouble8 load(const double* addr) { + return _mm512_load_pd(addr); + } + + static __forceinline void store(void* ptr, const vdouble8& v) { + _mm512_store_pd(ptr, v); + } + + static __forceinline void storeu(void* ptr, const vdouble8& v) { + _mm512_storeu_pd(ptr, v); + } + + static __forceinline void storeu(const vboold8& mask, double* ptr, const vdouble8& f) { + _mm512_mask_storeu_pd(ptr, mask, f); + } + + static __forceinline void store(const vboold8& mask, void* addr, const vdouble8& v2) { + _mm512_mask_store_pd(addr, mask, v2); + } + + /* pass by value to avoid compiler generating inefficient code */ + static __forceinline void storeu_compact(const vboold8 mask,void * addr, const vdouble8& reg) { + _mm512_mask_compressstoreu_pd(addr, mask, reg); + } + + static __forceinline vdouble8 compact64bit(const vboold8& mask, vdouble8& v) { + return _mm512_mask_compress_pd(v, mask, v); + } + + static __forceinline vdouble8 compact(const vboold8& mask, vdouble8& v) { + return _mm512_mask_compress_pd(v, mask, v); + } + + static __forceinline vdouble8 compact(const vboold8& mask, const vdouble8& a, vdouble8& b) { + return _mm512_mask_compress_pd(a, mask, b); + } + + static __forceinline vdouble8 broadcast(const void* a) { return _mm512_set1_pd(*(double*)a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline double& operator [](size_t index) { assert(index < 8); return i[index]; } + __forceinline const double& operator [](size_t index) const { assert(index < 8); return i[index]; } + + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 asDouble(const vllong8& a) { return _mm512_castsi512_pd(a); } + __forceinline vllong8 asLLong (const vdouble8& a) { return _mm512_castpd_si512(a); } + + __forceinline vdouble8 operator +(const vdouble8& a) { return a; } + __forceinline vdouble8 operator -(const vdouble8& a) { return _mm512_sub_pd(_mm512_setzero_pd(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 operator +(const vdouble8& a, const vdouble8& b) { return _mm512_add_pd(a, b); } + __forceinline vdouble8 operator +(const vdouble8& a, double b) { return a + vdouble8(b); } + __forceinline vdouble8 operator +(double a, const vdouble8& b) { return vdouble8(a) + b; } + + __forceinline vdouble8 operator -(const vdouble8& a, const vdouble8& b) { return _mm512_sub_pd(a, b); } + __forceinline vdouble8 operator -(const vdouble8& a, double b) { return a - vdouble8(b); } + __forceinline vdouble8 operator -(double a, const vdouble8& b) { return vdouble8(a) - b; } + + __forceinline vdouble8 operator *(const vdouble8& a, const vdouble8& b) { return _mm512_mul_pd(a, b); } + __forceinline vdouble8 operator *(const vdouble8& a, double b) { return a * vdouble8(b); } + __forceinline vdouble8 operator *(double a, const vdouble8& b) { return vdouble8(a) * b; } + + __forceinline vdouble8 operator &(const vdouble8& a, const vdouble8& b) { return _mm512_and_pd(a, b); } + __forceinline vdouble8 operator &(const vdouble8& a, double b) { return a & vdouble8(b); } + __forceinline vdouble8 operator &(double a, const vdouble8& b) { return vdouble8(a) & b; } + + __forceinline vdouble8 operator |(const vdouble8& a, const vdouble8& b) { return _mm512_or_pd(a, b); } + __forceinline vdouble8 operator |(const vdouble8& a, double b) { return a | vdouble8(b); } + __forceinline vdouble8 operator |(double a, const vdouble8& b) { return vdouble8(a) | b; } + + __forceinline vdouble8 operator ^(const vdouble8& a, const vdouble8& b) { return _mm512_xor_pd(a, b); } + __forceinline vdouble8 operator ^(const vdouble8& a, double b) { return a ^ vdouble8(b); } + __forceinline vdouble8 operator ^(double a, const vdouble8& b) { return vdouble8(a) ^ b; } + + __forceinline vdouble8 operator <<(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), n)); } + __forceinline vdouble8 operator >>(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), n)); } + + __forceinline vdouble8 operator <<(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_sllv_epi64(_mm512_castpd_si512(a), n)); } + __forceinline vdouble8 operator >>(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_srav_epi64(_mm512_castpd_si512(a), n)); } + + __forceinline vdouble8 sll (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), b)); } + __forceinline vdouble8 sra (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), b)); } + __forceinline vdouble8 srl (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srli_epi64(_mm512_castpd_si512(a), b)); } + + __forceinline vdouble8 min(const vdouble8& a, const vdouble8& b) { return _mm512_min_pd(a, b); } + __forceinline vdouble8 min(const vdouble8& a, double b) { return min(a,vdouble8(b)); } + __forceinline vdouble8 min(double a, const vdouble8& b) { return min(vdouble8(a),b); } + + __forceinline vdouble8 max(const vdouble8& a, const vdouble8& b) { return _mm512_max_pd(a, b); } + __forceinline vdouble8 max(const vdouble8& a, double b) { return max(a,vdouble8(b)); } + __forceinline vdouble8 max(double a, const vdouble8& b) { return max(vdouble8(a),b); } + + __forceinline vdouble8 mask_add(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_add_pd(c,mask,a,b); } + __forceinline vdouble8 mask_sub(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_sub_pd(c,mask,a,b); } + + __forceinline vdouble8 mask_and(const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_and_pd(c,m,a,b); } + __forceinline vdouble8 mask_or (const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_or_pd(c,m,a,b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 madd (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmadd_pd(a,b,c); } + __forceinline vdouble8 msub (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmsub_pd(a,b,c); } + __forceinline vdouble8 nmadd(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmadd_pd(a,b,c); } + __forceinline vdouble8 nmsub(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmsub_pd(a,b,c); } + + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8& operator +=(vdouble8& a, const vdouble8& b) { return a = a + b; } + __forceinline vdouble8& operator +=(vdouble8& a, double b) { return a = a + b; } + + __forceinline vdouble8& operator -=(vdouble8& a, const vdouble8& b) { return a = a - b; } + __forceinline vdouble8& operator -=(vdouble8& a, double b) { return a = a - b; } + + __forceinline vdouble8& operator *=(vdouble8& a, const vdouble8& b) { return a = a * b; } + __forceinline vdouble8& operator *=(vdouble8& a, double b) { return a = a * b; } + + __forceinline vdouble8& operator &=(vdouble8& a, const vdouble8& b) { return a = a & b; } + __forceinline vdouble8& operator &=(vdouble8& a, double b) { return a = a & b; } + + __forceinline vdouble8& operator |=(vdouble8& a, const vdouble8& b) { return a = a | b; } + __forceinline vdouble8& operator |=(vdouble8& a, double b) { return a = a | b; } + + __forceinline vdouble8& operator <<=(vdouble8& a, const double b) { return a = a << b; } + __forceinline vdouble8& operator >>=(vdouble8& a, const double b) { return a = a >> b; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator ==(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 operator ==(const vdouble8& a, double b) { return a == vdouble8(b); } + __forceinline vboold8 operator ==(double a, const vdouble8& b) { return vdouble8(a) == b; } + + __forceinline vboold8 operator !=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold8 operator !=(const vdouble8& a, double b) { return a != vdouble8(b); } + __forceinline vboold8 operator !=(double a, const vdouble8& b) { return vdouble8(a) != b; } + + __forceinline vboold8 operator < (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold8 operator < (const vdouble8& a, double b) { return a < vdouble8(b); } + __forceinline vboold8 operator < (double a, const vdouble8& b) { return vdouble8(a) < b; } + + __forceinline vboold8 operator >=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold8 operator >=(const vdouble8& a, double b) { return a >= vdouble8(b); } + __forceinline vboold8 operator >=(double a, const vdouble8& b) { return vdouble8(a) >= b; } + + __forceinline vboold8 operator > (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold8 operator > (const vdouble8& a, double b) { return a > vdouble8(b); } + __forceinline vboold8 operator > (double a, const vdouble8& b) { return vdouble8(a) > b; } + + __forceinline vboold8 operator <=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboold8 operator <=(const vdouble8& a, double b) { return a <= vdouble8(b); } + __forceinline vboold8 operator <=(double a, const vdouble8& b) { return vdouble8(a) <= b; } + + __forceinline vboold8 eq(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 ne(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold8 lt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold8 ge(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold8 gt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold8 le(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vboold8 eq(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 ne(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboold8 lt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboold8 ge(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboold8 gt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboold8 le(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LE); } + + __forceinline vdouble8 select(const vboold8& m, const vdouble8& t, const vdouble8& f) { + return _mm512_mask_or_pd(f,m,t,t); + } + + __forceinline void xchg(const vboold8& m, vdouble8& a, vdouble8& b) { + const vdouble8 c = a; a = select(m,b,a); b = select(m,c,b); + } + + __forceinline vboold8 test(const vboold8& m, const vdouble8& a, const vdouble8& b) { + return _mm512_mask_test_epi64_mask(m,_mm512_castpd_si512(a),_mm512_castpd_si512(b)); + } + + __forceinline vboold8 test(const vdouble8& a, const vdouble8& b) { + return _mm512_test_epi64_mask(_mm512_castpd_si512(a),_mm512_castpd_si512(b)); + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template + __forceinline vdouble8 shuffle(const vdouble8& v) { + return _mm512_permute_pd(v, (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0); + } + + template + __forceinline vdouble8 shuffle(const vdouble8& v) { + return shuffle(v); + } + + template + __forceinline vdouble8 shuffle(const vdouble8& v) { + return _mm512_permutex_pd(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template + __forceinline vdouble8 shuffle4(const vdouble8& v) { + return _mm512_shuffle_f64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2)); + } + + template + __forceinline vdouble8 shuffle4(const vdouble8& v) { + return shuffle4(v); + } + + template + __forceinline vdouble8 align_shift_right(const vdouble8& a, const vdouble8& b) { + return _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(a), _mm512_castpd_si512(b), i)); + } + + __forceinline double toScalar(const vdouble8& v) { + return _mm_cvtsd_f64(_mm512_castpd512_pd128(v)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 vreduce_add2(vdouble8 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vdouble8 vreduce_add4(vdouble8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vdouble8 vreduce_add (vdouble8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); } + + __forceinline vdouble8 vreduce_min2(vdouble8 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vdouble8 vreduce_min4(vdouble8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vdouble8 vreduce_min (vdouble8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); } + + __forceinline vdouble8 vreduce_max2(vdouble8 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vdouble8 vreduce_max4(vdouble8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vdouble8 vreduce_max (vdouble8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); } + + __forceinline double reduce_add(const vdouble8& v) { return toScalar(vreduce_add(v)); } + __forceinline double reduce_min(const vdouble8& v) { return toScalar(vreduce_min(v)); } + __forceinline double reduce_max(const vdouble8& v) { return toScalar(vreduce_max(v)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 permute(const vdouble8& v, const vllong8& index) { + return _mm512_permutexvar_pd(index, v); + } + + __forceinline vdouble8 reverse(const vdouble8& a) { + return permute(a, vllong8(reverse_step)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble8& v) + { + cout << "<" << v[0]; + for (size_t i=1; i<8; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h b/thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h new file mode 100644 index 0000000000..aed2419b77 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h @@ -0,0 +1,771 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 16-wide AVX-512 float type */ + template<> + struct vfloat<16> + { + ALIGNED_STRUCT_(64); + + typedef vboolf16 Bool; + typedef vint16 Int; + typedef vfloat16 Float; + + enum { size = 16 }; // number of SIMD elements + union { // data + __m512 v; + float f[16]; + int i[16]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat() {} + __forceinline vfloat(const vfloat16& t) { v = t; } + __forceinline vfloat16& operator =(const vfloat16& f) { v = f.v; return *this; } + + __forceinline vfloat(const __m512& t) { v = t; } + __forceinline operator __m512() const { return v; } + __forceinline operator __m256() const { return _mm512_castps512_ps256(v); } + __forceinline operator __m128() const { return _mm512_castps512_ps128(v); } + + __forceinline vfloat(float f) { + v = _mm512_set1_ps(f); + } + + __forceinline vfloat(float a, float b, float c, float d) { + v = _mm512_set4_ps(a, b, c, d); + } + + __forceinline vfloat(const vfloat4& i) { + v = _mm512_broadcast_f32x4(i); + } + + __forceinline vfloat(const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d) { + v = _mm512_castps128_ps512(a); + v = _mm512_insertf32x4(v, b, 1); + v = _mm512_insertf32x4(v, c, 2); + v = _mm512_insertf32x4(v, d, 3); + } + + __forceinline vfloat(const vboolf16& mask, const vfloat4& a, const vfloat4& b) { + v = _mm512_broadcast_f32x4(a); + v = _mm512_mask_broadcast_f32x4(v,mask,b); + } + + __forceinline vfloat(const vfloat8& i) { + v = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(i))); + } + + __forceinline vfloat(const vfloat8& a, const vfloat8& b) { + v = _mm512_castps256_ps512(a); +#if defined(__AVX512DQ__) + v = _mm512_insertf32x8(v, b, 1); +#else + v = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(v), _mm256_castps_pd(b), 1)); +#endif + } + + /* WARNING: due to f64x4 the mask is considered as an 8bit mask */ + __forceinline vfloat(const vboolf16& mask, const vfloat8& a, const vfloat8& b) { + __m512d aa = _mm512_broadcast_f64x4(_mm256_castps_pd(a)); + aa = _mm512_mask_broadcast_f64x4(aa,mask,_mm256_castps_pd(b)); + v = _mm512_castpd_ps(aa); + } + + __forceinline explicit vfloat(const vint16& a) { + v = _mm512_cvtepi32_ps(a); + } + + __forceinline explicit vfloat(const vuint16& a) { + v = _mm512_cvtepu32_ps(a); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat(ZeroTy) : v(_mm512_setzero_ps()) {} + __forceinline vfloat(OneTy) : v(_mm512_set1_ps(1.0f)) {} + __forceinline vfloat(PosInfTy) : v(_mm512_set1_ps(pos_inf)) {} + __forceinline vfloat(NegInfTy) : v(_mm512_set1_ps(neg_inf)) {} + __forceinline vfloat(StepTy) : v(_mm512_set_ps(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + __forceinline vfloat(NaNTy) : v(_mm512_set1_ps(nan)) {} + __forceinline vfloat(UndefinedTy) : v(_mm512_undefined_ps()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vfloat16 load (const void* ptr) { return _mm512_load_ps((float*)ptr); } + static __forceinline vfloat16 loadu(const void* ptr) { return _mm512_loadu_ps((float*)ptr); } + + static __forceinline vfloat16 load (const vboolf16& mask, const void* ptr) { return _mm512_mask_load_ps (_mm512_setzero_ps(),mask,(float*)ptr); } + static __forceinline vfloat16 loadu(const vboolf16& mask, const void* ptr) { return _mm512_mask_loadu_ps(_mm512_setzero_ps(),mask,(float*)ptr); } + + static __forceinline void store (void* ptr, const vfloat16& v) { _mm512_store_ps ((float*)ptr,v); } + static __forceinline void storeu(void* ptr, const vfloat16& v) { _mm512_storeu_ps((float*)ptr,v); } + + static __forceinline void store (const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_store_ps ((float*)ptr,mask,v); } + static __forceinline void storeu(const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_storeu_ps((float*)ptr,mask,v); } + + static __forceinline void store_nt(void* __restrict__ ptr, const vfloat16& a) { + _mm512_stream_ps((float*)ptr,a); + } + + static __forceinline vfloat16 broadcast(const float* f) { + return _mm512_set1_ps(*f); + } + + static __forceinline vfloat16 compact(const vboolf16& mask, vfloat16 &v) { + return _mm512_mask_compress_ps(v, mask, v); + } + static __forceinline vfloat16 compact(const vboolf16& mask, vfloat16 &a, const vfloat16& b) { + return _mm512_mask_compress_ps(a, mask, b); + } + + static __forceinline vfloat16 expand(const vboolf16& mask, const vfloat16& a, vfloat16& b) { + return _mm512_mask_expand_ps(b, mask, a); + } + + static __forceinline vfloat16 loadu_compact(const vboolf16& mask, const void* ptr) { + return _mm512_mask_expandloadu_ps(_mm512_setzero_ps(), mask, (float*)ptr); + } + + static __forceinline void storeu_compact(const vboolf16& mask, float *addr, const vfloat16 reg) { + _mm512_mask_compressstoreu_ps(addr, mask, reg); + } + + static __forceinline void storeu_compact_single(const vboolf16& mask, float * addr, const vfloat16& reg) { + //_mm512_mask_compressstoreu_ps(addr,mask,reg); + *addr = mm512_cvtss_f32(_mm512_mask_compress_ps(reg, mask, reg)); + } + + template + static __forceinline vfloat16 gather(const float* ptr, const vint16& index) { + return _mm512_i32gather_ps(index, ptr, scale); + } + + template + static __forceinline vfloat16 gather(const vboolf16& mask, const float* ptr, const vint16& index) { + vfloat16 r = zero; + return _mm512_mask_i32gather_ps(r, mask, index, ptr, scale); + } + + template + static __forceinline void scatter(float* ptr, const vint16& index, const vfloat16& v) { + _mm512_i32scatter_ps(ptr, index, v, scale); + } + + template + static __forceinline void scatter(const vboolf16& mask, float* ptr, const vint16& index, const vfloat16& v) { + _mm512_mask_i32scatter_ps(ptr, mask, index, v, scale); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float& operator [](size_t index) { assert(index < 16); return f[index]; } + __forceinline const float& operator [](size_t index) const { assert(index < 16); return f[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 asFloat(const vint16& a) { return _mm512_castsi512_ps(a); } + __forceinline vint16 asInt (const vfloat16& a) { return _mm512_castps_si512(a); } + __forceinline vuint16 asUInt (const vfloat16& a) { return _mm512_castps_si512(a); } + + __forceinline vint16 toInt (const vfloat16& a) { return vint16(a); } + __forceinline vfloat16 toFloat(const vint16& a) { return vfloat16(a); } + + __forceinline vfloat16 operator +(const vfloat16& a) { return a; } + __forceinline vfloat16 operator -(const vfloat16& a) { return _mm512_mul_ps(a,vfloat16(-1)); } + + __forceinline vfloat16 abs (const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x7FFFFFFF))); } + __forceinline vfloat16 signmsk(const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x80000000))); } + + __forceinline vfloat16 rcp(const vfloat16& a) { +#if defined(__AVX512ER__) + return _mm512_rcp28_ps(a); +#else + const vfloat16 r = _mm512_rcp14_ps(a); + return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vfloat16(2.0f))); +#endif + } + + __forceinline vfloat16 sqr (const vfloat16& a) { return _mm512_mul_ps(a,a); } + __forceinline vfloat16 sqrt(const vfloat16& a) { return _mm512_sqrt_ps(a); } + + __forceinline vfloat16 rsqrt(const vfloat16& a) + { +#if defined(__AVX512VL__) + const vfloat16 r = _mm512_rsqrt14_ps(a); + return _mm512_fmadd_ps(_mm512_set1_ps(1.5f), r, + _mm512_mul_ps(_mm512_mul_ps(_mm512_mul_ps(a, _mm512_set1_ps(-0.5f)), r), _mm512_mul_ps(r, r))); +#else + return _mm512_rsqrt28_ps(a); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 operator +(const vfloat16& a, const vfloat16& b) { return _mm512_add_ps(a, b); } + __forceinline vfloat16 operator +(const vfloat16& a, float b) { return a + vfloat16(b); } + __forceinline vfloat16 operator +(float a, const vfloat16& b) { return vfloat16(a) + b; } + + __forceinline vfloat16 operator -(const vfloat16& a, const vfloat16& b) { return _mm512_sub_ps(a, b); } + __forceinline vfloat16 operator -(const vfloat16& a, float b) { return a - vfloat16(b); } + __forceinline vfloat16 operator -(float a, const vfloat16& b) { return vfloat16(a) - b; } + + __forceinline vfloat16 operator *(const vfloat16& a, const vfloat16& b) { return _mm512_mul_ps(a, b); } + __forceinline vfloat16 operator *(const vfloat16& a, float b) { return a * vfloat16(b); } + __forceinline vfloat16 operator *(float a, const vfloat16& b) { return vfloat16(a) * b; } + + __forceinline vfloat16 operator /(const vfloat16& a, const vfloat16& b) { return _mm512_div_ps(a,b); } + __forceinline vfloat16 operator /(const vfloat16& a, float b) { return a/vfloat16(b); } + __forceinline vfloat16 operator /(float a, const vfloat16& b) { return vfloat16(a)/b; } + + __forceinline vfloat16 operator &(const vfloat16& a, const vfloat16& b) { return _mm512_and_ps(a,b); } + __forceinline vfloat16 operator |(const vfloat16& a, const vfloat16& b) { return _mm512_or_ps(a,b); } + __forceinline vfloat16 operator ^(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b))); + } + + __forceinline vfloat16 min(const vfloat16& a, const vfloat16& b) { + return _mm512_min_ps(a,b); + } + __forceinline vfloat16 min(const vfloat16& a, float b) { + return _mm512_min_ps(a,vfloat16(b)); + } + __forceinline vfloat16 min(const float& a, const vfloat16& b) { + return _mm512_min_ps(vfloat16(a),b); + } + + __forceinline vfloat16 max(const vfloat16& a, const vfloat16& b) { + return _mm512_max_ps(a,b); + } + __forceinline vfloat16 max(const vfloat16& a, float b) { + return _mm512_max_ps(a,vfloat16(b)); + } + __forceinline vfloat16 max(const float& a, const vfloat16& b) { + return _mm512_max_ps(vfloat16(a),b); + } + + __forceinline vfloat16 mask_add(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) { return _mm512_mask_add_ps (c,mask,a,b); } + __forceinline vfloat16 mask_min(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) { + return _mm512_mask_min_ps(c,mask,a,b); + }; + __forceinline vfloat16 mask_max(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) { + return _mm512_mask_max_ps(c,mask,a,b); + }; + + __forceinline vfloat16 mini(const vfloat16& a, const vfloat16& b) { +#if !defined(__AVX512ER__) // SKX + const vint16 ai = _mm512_castps_si512(a); + const vint16 bi = _mm512_castps_si512(b); + const vint16 ci = _mm512_min_epi32(ai,bi); + return _mm512_castsi512_ps(ci); +#else // KNL + return min(a,b); +#endif + } + + __forceinline vfloat16 maxi(const vfloat16& a, const vfloat16& b) { +#if !defined(__AVX512ER__) // SKX + const vint16 ai = _mm512_castps_si512(a); + const vint16 bi = _mm512_castps_si512(b); + const vint16 ci = _mm512_max_epi32(ai,bi); + return _mm512_castsi512_ps(ci); +#else // KNL + return max(a,b); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(a,b,c); } + __forceinline vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a,b,c); } + __forceinline vfloat16 nmadd(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(a,b,c); } + __forceinline vfloat16 nmsub(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmsub_ps(a,b,c); } + + __forceinline vfloat16 mask_msub(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_fmsub_ps(a,mask,b,c); } + + __forceinline vfloat16 madd231 (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(c,b,a); } + __forceinline vfloat16 msub213 (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a,b,c); } + __forceinline vfloat16 msub231 (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(c,b,a); } + __forceinline vfloat16 msubr231(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(c,b,a); } + + + //////////////////////////////////////////////////////////////////////////////// + /// Operators with rounding + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 madd_round_down(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_round_ps(a,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } + __forceinline vfloat16 madd_round_up (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_round_ps(a,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } + + __forceinline vfloat16 mul_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_mul_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } + __forceinline vfloat16 mul_round_up (const vfloat16& a, const vfloat16& b) { return _mm512_mul_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } + + __forceinline vfloat16 add_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_add_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } + __forceinline vfloat16 add_round_up (const vfloat16& a, const vfloat16& b) { return _mm512_add_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } + + __forceinline vfloat16 sub_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_sub_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } + __forceinline vfloat16 sub_round_up (const vfloat16& a, const vfloat16& b) { return _mm512_sub_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } + + __forceinline vfloat16 div_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_div_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } + __forceinline vfloat16 div_round_up (const vfloat16& a, const vfloat16& b) { return _mm512_div_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } + + __forceinline vfloat16 mask_msub_round_down(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_fmsub_round_ps(a,mask,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } + __forceinline vfloat16 mask_msub_round_up (const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_fmsub_round_ps(a,mask,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } + + __forceinline vfloat16 mask_mul_round_down(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_mul_round_ps(a,mask,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } + __forceinline vfloat16 mask_mul_round_up (const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_mul_round_ps(a,mask,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } + + __forceinline vfloat16 mask_sub_round_down(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_sub_round_ps(a,mask,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } + __forceinline vfloat16 mask_sub_round_up (const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_sub_round_ps(a,mask,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } + + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16& operator +=(vfloat16& a, const vfloat16& b) { return a = a + b; } + __forceinline vfloat16& operator +=(vfloat16& a, float b) { return a = a + b; } + + __forceinline vfloat16& operator -=(vfloat16& a, const vfloat16& b) { return a = a - b; } + __forceinline vfloat16& operator -=(vfloat16& a, float b) { return a = a - b; } + + __forceinline vfloat16& operator *=(vfloat16& a, const vfloat16& b) { return a = a * b; } + __forceinline vfloat16& operator *=(vfloat16& a, float b) { return a = a * b; } + + __forceinline vfloat16& operator /=(vfloat16& a, const vfloat16& b) { return a = a / b; } + __forceinline vfloat16& operator /=(vfloat16& a, float b) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator ==(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 operator ==(const vfloat16& a, float b) { return a == vfloat16(b); } + __forceinline vboolf16 operator ==(float a, const vfloat16& b) { return vfloat16(a) == b; } + + __forceinline vboolf16 operator !=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 operator !=(const vfloat16& a, float b) { return a != vfloat16(b); } + __forceinline vboolf16 operator !=(float a, const vfloat16& b) { return vfloat16(a) != b; } + + __forceinline vboolf16 operator < (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 operator < (const vfloat16& a, float b) { return a < vfloat16(b); } + __forceinline vboolf16 operator < (float a, const vfloat16& b) { return vfloat16(a) < b; } + + __forceinline vboolf16 operator >=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 operator >=(const vfloat16& a, float b) { return a >= vfloat16(b); } + __forceinline vboolf16 operator >=(float a, const vfloat16& b) { return vfloat16(a) >= b; } + + __forceinline vboolf16 operator > (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 operator > (const vfloat16& a, float b) { return a > vfloat16(b); } + __forceinline vboolf16 operator > (float a, const vfloat16& b) { return vfloat16(a) > b; } + + __forceinline vboolf16 operator <=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 operator <=(const vfloat16& a, float b) { return a <= vfloat16(b); } + __forceinline vboolf16 operator <=(float a, const vfloat16& b) { return vfloat16(a) <= b; } + + __forceinline vboolf16 eq(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vboolf16 eq(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LE); } + + __forceinline vfloat16 select(const vboolf16& s, const vfloat16& t, const vfloat16& f) { + return _mm512_mask_blend_ps(s, f, t); + } + + __forceinline vfloat16 lerp(const vfloat16& a, const vfloat16& b, const vfloat16& t) { + return madd(t,b-a,a); + } + + __forceinline void xchg(vboolf16 m, vfloat16& a, vfloat16& b) + { + vfloat16 c = a; + a = select(m,b,a); + b = select(m,c,b); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 floor(const vfloat16& a) { + return _mm512_floor_ps(a); + } + __forceinline vfloat16 ceil (const vfloat16& a) { + return _mm512_ceil_ps(a); + } + __forceinline vfloat16 round (const vfloat16& a) { + return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + } + __forceinline vint16 floori (const vfloat16& a) { + return _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 unpacklo(const vfloat16& a, const vfloat16& b) { return _mm512_unpacklo_ps(a, b); } + __forceinline vfloat16 unpackhi(const vfloat16& a, const vfloat16& b) { return _mm512_unpackhi_ps(a, b); } + + template + __forceinline vfloat16 shuffle(const vfloat16& v) { + return _mm512_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); + } + + template + __forceinline vfloat16 shuffle(const vfloat16& v) { + return _mm512_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template + __forceinline vfloat16 shuffle4(const vfloat16& v) { + return _mm512_shuffle_f32x4(v, v ,_MM_SHUFFLE(i, i, i, i)); + } + + template + __forceinline vfloat16 shuffle4(const vfloat16& v) { + return _mm512_shuffle_f32x4(v, v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + __forceinline vfloat16 interleave_even(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_mask_shuffle_epi32(_mm512_castps_si512(a), mm512_int2mask(0xaaaa), _mm512_castps_si512(b), (_MM_PERM_ENUM)0xb1)); + } + + __forceinline vfloat16 interleave_odd(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_mask_shuffle_epi32(_mm512_castps_si512(b), mm512_int2mask(0x5555), _mm512_castps_si512(a), (_MM_PERM_ENUM)0xb1)); + } + + __forceinline vfloat16 interleave2_even(const vfloat16& a, const vfloat16& b) { + /* mask should be 8-bit but is 16-bit to reuse for interleave_even */ + return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a), mm512_int2mask(0xaaaa), _mm512_castps_si512(b), (_MM_PERM_ENUM)0xb1)); + } + + __forceinline vfloat16 interleave2_odd(const vfloat16& a, const vfloat16& b) { + /* mask should be 8-bit but is 16-bit to reuse for interleave_odd */ + return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b), mm512_int2mask(0x5555), _mm512_castps_si512(a), (_MM_PERM_ENUM)0xb1)); + } + + __forceinline vfloat16 interleave4_even(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a), mm512_int2mask(0xcc), _mm512_castps_si512(b), (_MM_PERM_ENUM)0x4e)); + } + + __forceinline vfloat16 interleave4_odd(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b), mm512_int2mask(0x33), _mm512_castps_si512(a), (_MM_PERM_ENUM)0x4e)); + } + + __forceinline vfloat16 permute(vfloat16 v, __m512i index) { + return _mm512_castsi512_ps(_mm512_permutexvar_epi32(index, _mm512_castps_si512(v))); + } + + __forceinline vfloat16 reverse(const vfloat16& v) { + return permute(v,_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)); + } + + template + __forceinline vfloat16 align_shift_right(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_alignr_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b),i)); + }; + + template + __forceinline vfloat16 mask_align_shift_right(const vboolf16& mask, vfloat16& c, const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_mask_alignr_epi32(_mm512_castps_si512(c),mask,_mm512_castps_si512(a),_mm512_castps_si512(b),i)); + }; + + __forceinline vfloat16 shift_left_1(const vfloat16& a) { + vfloat16 z = zero; + return mask_align_shift_right<15>(0xfffe,z,a,a); + } + + __forceinline vfloat16 shift_right_1(const vfloat16& x) { + return align_shift_right<1>(zero,x); + } + + __forceinline float toScalar(const vfloat16& v) { return mm512_cvtss_f32(v); } + + + template __forceinline vfloat16 insert4(const vfloat16& a, const vfloat4& b) { return _mm512_insertf32x4(a, b, i); } + + template + vfloat extractN(const vfloat16& v); + + template<> __forceinline vfloat4 extractN<4,0>(const vfloat16& v) { return _mm512_castps512_ps128(v); } + template<> __forceinline vfloat4 extractN<4,1>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 1); } + template<> __forceinline vfloat4 extractN<4,2>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 2); } + template<> __forceinline vfloat4 extractN<4,3>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 3); } + + template<> __forceinline vfloat8 extractN<8,0>(const vfloat16& v) { return _mm512_castps512_ps256(v); } + template<> __forceinline vfloat8 extractN<8,1>(const vfloat16& v) { return _mm512_extractf32x8_ps(v, 1); } + + template __forceinline vfloat4 extract4 (const vfloat16& v) { return _mm512_extractf32x4_ps(v, i); } + template<> __forceinline vfloat4 extract4<0>(const vfloat16& v) { return _mm512_castps512_ps128(v); } + + template __forceinline vfloat8 extract8 (const vfloat16& v) { return _mm512_extractf32x8_ps(v, i); } + template<> __forceinline vfloat8 extract8<0>(const vfloat16& v) { return _mm512_castps512_ps256(v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Transpose + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3) + { +#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL + vfloat16 a0a1_c0c1 = interleave_even(r0, r1); + vfloat16 a2a3_c2c3 = interleave_even(r2, r3); + vfloat16 b0b1_d0d1 = interleave_odd (r0, r1); + vfloat16 b2b3_d2d3 = interleave_odd (r2, r3); + + c0 = interleave2_even(a0a1_c0c1, a2a3_c2c3); + c1 = interleave2_even(b0b1_d0d1, b2b3_d2d3); + c2 = interleave2_odd (a0a1_c0c1, a2a3_c2c3); + c3 = interleave2_odd (b0b1_d0d1, b2b3_d2d3); +#else + vfloat16 a0a2_b0b2 = unpacklo(r0, r2); + vfloat16 c0c2_d0d2 = unpackhi(r0, r2); + vfloat16 a1a3_b1b3 = unpacklo(r1, r3); + vfloat16 c1c3_d1d3 = unpackhi(r1, r3); + + c0 = unpacklo(a0a2_b0b2, a1a3_b1b3); + c1 = unpackhi(a0a2_b0b2, a1a3_b1b3); + c2 = unpacklo(c0c2_d0d2, c1c3_d1d3); + c3 = unpackhi(c0c2_d0d2, c1c3_d1d3); +#endif + } + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, + const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, + const vfloat4& r8, const vfloat4& r9, const vfloat4& r10, const vfloat4& r11, + const vfloat4& r12, const vfloat4& r13, const vfloat4& r14, const vfloat4& r15, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3) + { + return transpose(vfloat16(r0, r4, r8, r12), vfloat16(r1, r5, r9, r13), vfloat16(r2, r6, r10, r14), vfloat16(r3, r7, r11, r15), + c0, c1, c2, c3); + } + + __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3, + const vfloat16& r4, const vfloat16& r5, const vfloat16& r6, const vfloat16& r7, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3, + vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7) + { + vfloat16 a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3; + transpose(r0, r1, r2, r3, a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3); + + vfloat16 a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7; + transpose(r4, r5, r6, r7, a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7); + + c0 = interleave4_even(a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7); + c1 = interleave4_even(b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7); + c2 = interleave4_even(c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7); + c3 = interleave4_even(d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7); + c4 = interleave4_odd (a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7); + c5 = interleave4_odd (b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7); + c6 = interleave4_odd (c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7); + c7 = interleave4_odd (d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7); + } + + __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, + const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7, + const vfloat8& r8, const vfloat8& r9, const vfloat8& r10, const vfloat8& r11, + const vfloat8& r12, const vfloat8& r13, const vfloat8& r14, const vfloat8& r15, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3, + vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7) + { + return transpose(vfloat16(r0, r8), vfloat16(r1, r9), vfloat16(r2, r10), vfloat16(r3, r11), + vfloat16(r4, r12), vfloat16(r5, r13), vfloat16(r6, r14), vfloat16(r7, r15), + c0, c1, c2, c3, c4, c5, c6, c7); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 vreduce_add2(vfloat16 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vfloat16 vreduce_add4(vfloat16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vfloat16 vreduce_add8(vfloat16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } + __forceinline vfloat16 vreduce_add (vfloat16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } + + __forceinline vfloat16 vreduce_min2(vfloat16 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_min4(vfloat16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vfloat16 vreduce_min8(vfloat16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_min (vfloat16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vfloat16 vreduce_max2(vfloat16 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_max4(vfloat16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vfloat16 vreduce_max8(vfloat16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_max (vfloat16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } + + __forceinline float reduce_add(const vfloat16& v) { return toScalar(vreduce_add(v)); } + __forceinline float reduce_min(const vfloat16& v) { return toScalar(vreduce_min(v)); } + __forceinline float reduce_max(const vfloat16& v) { return toScalar(vreduce_max(v)); } + + __forceinline size_t select_min(const vfloat16& v) { + return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_min(v)),_MM_CMPINT_EQ))); + } + + __forceinline size_t select_max(const vfloat16& v) { + return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_max(v)),_MM_CMPINT_EQ))); + } + + __forceinline size_t select_min(const vboolf16& valid, const vfloat16& v) + { + const vfloat16 a = select(valid,v,vfloat16(pos_inf)); + const vbool16 valid_min = valid & (a == vreduce_min(a)); + return bsf(movemask(any(valid_min) ? valid_min : valid)); + } + + __forceinline size_t select_max(const vboolf16& valid, const vfloat16& v) + { + const vfloat16 a = select(valid,v,vfloat16(neg_inf)); + const vbool16 valid_max = valid & (a == vreduce_max(a)); + return bsf(movemask(any(valid_max) ? valid_max : valid)); + } + + __forceinline vfloat16 prefix_sum(const vfloat16& a) + { + const vfloat16 z(zero); + vfloat16 v = a; + v = v + align_shift_right<16-1>(v,z); + v = v + align_shift_right<16-2>(v,z); + v = v + align_shift_right<16-4>(v,z); + v = v + align_shift_right<16-8>(v,z); + return v; + } + + __forceinline vfloat16 reverse_prefix_sum(const vfloat16& a) + { + const vfloat16 z(zero); + vfloat16 v = a; + v = v + align_shift_right<1>(z,v); + v = v + align_shift_right<2>(z,v); + v = v + align_shift_right<4>(z,v); + v = v + align_shift_right<8>(z,v); + return v; + } + + __forceinline vfloat16 prefix_min(const vfloat16& a) + { + const vfloat16 z(pos_inf); + vfloat16 v = a; + v = min(v,align_shift_right<16-1>(v,z)); + v = min(v,align_shift_right<16-2>(v,z)); + v = min(v,align_shift_right<16-4>(v,z)); + v = min(v,align_shift_right<16-8>(v,z)); + return v; + } + + __forceinline vfloat16 prefix_max(const vfloat16& a) + { + const vfloat16 z(neg_inf); + vfloat16 v = a; + v = max(v,align_shift_right<16-1>(v,z)); + v = max(v,align_shift_right<16-2>(v,z)); + v = max(v,align_shift_right<16-4>(v,z)); + v = max(v,align_shift_right<16-8>(v,z)); + return v; + } + + + __forceinline vfloat16 reverse_prefix_min(const vfloat16& a) + { + const vfloat16 z(pos_inf); + vfloat16 v = a; + v = min(v,align_shift_right<1>(z,v)); + v = min(v,align_shift_right<2>(z,v)); + v = min(v,align_shift_right<4>(z,v)); + v = min(v,align_shift_right<8>(z,v)); + return v; + } + + __forceinline vfloat16 reverse_prefix_max(const vfloat16& a) + { + const vfloat16 z(neg_inf); + vfloat16 v = a; + v = max(v,align_shift_right<1>(z,v)); + v = max(v,align_shift_right<2>(z,v)); + v = max(v,align_shift_right<4>(z,v)); + v = max(v,align_shift_right<8>(z,v)); + return v; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 loadAOS4to16f(const float& x, const float& y, const float& z) + { + vfloat16 f = zero; + f = select(0x1111,vfloat16::broadcast(&x),f); + f = select(0x2222,vfloat16::broadcast(&y),f); + f = select(0x4444,vfloat16::broadcast(&z),f); + return f; + } + + __forceinline vfloat16 loadAOS4to16f(unsigned int index, + const vfloat16& x, + const vfloat16& y, + const vfloat16& z) + { + vfloat16 f = zero; + f = select(0x1111,vfloat16::broadcast((float*)&x + index),f); + f = select(0x2222,vfloat16::broadcast((float*)&y + index),f); + f = select(0x4444,vfloat16::broadcast((float*)&z + index),f); + return f; + } + + __forceinline vfloat16 loadAOS4to16f(unsigned int index, + const vfloat16& x, + const vfloat16& y, + const vfloat16& z, + const vfloat16& fill) + { + vfloat16 f = fill; + f = select(0x1111,vfloat16::broadcast((float*)&x + index),f); + f = select(0x2222,vfloat16::broadcast((float*)&y + index),f); + f = select(0x4444,vfloat16::broadcast((float*)&z + index),f); + return f; + } + + __forceinline vfloat16 rcp_safe(const vfloat16& a) { + return rcp(select(a != vfloat16(zero), a, vfloat16(min_rcp_input))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat16& v) + { + cout << "<" << v[0]; + for (int i=1; i<16; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h b/thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h new file mode 100644 index 0000000000..5732c0fbc8 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h @@ -0,0 +1,925 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 4-wide SSE float type */ + template<> + struct vfloat<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 Bool; + typedef vint4 Int; + typedef vfloat4 Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128 v; float f[4]; int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat() {} + __forceinline vfloat(const vfloat4& other) { v = other.v; } + __forceinline vfloat4& operator =(const vfloat4& other) { v = other.v; return *this; } + + __forceinline vfloat(__m128 a) : v(a) {} + __forceinline operator const __m128&() const { return v; } + __forceinline operator __m128&() { return v; } + + __forceinline vfloat(float a) : v(_mm_set1_ps(a)) {} + __forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {} + + __forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {} +#if defined(__aarch64__) + __forceinline explicit vfloat(const vuint4& x) { + v = vcvtq_f32_u32(vreinterpretq_u32_s32(x.v)); + } +#else + __forceinline explicit vfloat(const vuint4& x) { + const __m128i a = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF)); + const __m128i b = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31 + const __m128 af = _mm_cvtepi32_ps(a); + const __m128 bf = _mm_castsi128_ps(b); + v = _mm_add_ps(af,bf); + } +#endif + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat(ZeroTy) : v(_mm_setzero_ps()) {} + __forceinline vfloat(OneTy) : v(_mm_set1_ps(1.0f)) {} + __forceinline vfloat(PosInfTy) : v(_mm_set1_ps(pos_inf)) {} + __forceinline vfloat(NegInfTy) : v(_mm_set1_ps(neg_inf)) {} + __forceinline vfloat(StepTy) : v(_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)) {} + __forceinline vfloat(NaNTy) : v(_mm_set1_ps(nan)) {} + __forceinline vfloat(UndefinedTy) : v(_mm_undefined_ps()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vfloat4 load (const void* a) { return _mm_load_ps((float*)a); } + static __forceinline vfloat4 loadu(const void* a) { return _mm_loadu_ps((float*)a); } + + static __forceinline void store (void* ptr, const vfloat4& v) { _mm_store_ps((float*)ptr,v); } + static __forceinline void storeu(void* ptr, const vfloat4& v) { _mm_storeu_ps((float*)ptr,v); } + +#if defined(__AVX512VL__) + + static __forceinline vfloat4 compact(const vboolf4& mask, vfloat4 &v) { + return _mm_mask_compress_ps(v, mask, v); + } + static __forceinline vfloat4 compact(const vboolf4& mask, vfloat4 &a, const vfloat4& b) { + return _mm_mask_compress_ps(a, mask, b); + } + + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_ps (_mm_setzero_ps(),mask,(float*)ptr); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_ps(_mm_setzero_ps(),mask,(float*)ptr); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_store_ps ((float*)ptr,mask,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_storeu_ps((float*)ptr,mask,v); } +#elif defined(__AVX__) + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); } +#else + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_load_ps ((float*)ptr),mask); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_loadu_ps((float*)ptr),mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { store (ptr,select(mask,v,load (ptr))); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { storeu(ptr,select(mask,v,loadu(ptr))); } +#endif + +#if defined(__AVX__) + static __forceinline vfloat4 broadcast(const void* a) { return _mm_broadcast_ss((float*)a); } +#else + static __forceinline vfloat4 broadcast(const void* a) { return _mm_set1_ps(*(float*)a); } +#endif + + static __forceinline vfloat4 load_nt (const float* ptr) { +#if defined (__SSE4_1__) + return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)ptr)); +#else + return _mm_load_ps(ptr); +#endif + } + +#if defined(__aarch64__) + static __forceinline vfloat4 load(const int8_t* ptr) { + return __m128(_mm_load4epi8_f32(((__m128i*)ptr))); + } +#elif defined(__SSE4_1__) + static __forceinline vfloat4 load(const int8_t* ptr) { + return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr))); + } +#else + static __forceinline vfloat4 load(const int8_t* ptr) { + return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); + } +#endif + +#if defined(__aarch64__) + static __forceinline vfloat4 load(const uint8_t* ptr) { + return __m128(_mm_load4epu8_f32(((__m128i*)ptr))); + } +#elif defined(__SSE4_1__) + static __forceinline vfloat4 load(const uint8_t* ptr) { + return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr))); + } +#else + static __forceinline vfloat4 load(const uint8_t* ptr) { + //return _mm_cvtpu8_ps(*(__m64*)ptr); // don't enable, will use MMX instructions + return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); + } +#endif + +#if defined(__aarch64__) + static __forceinline vfloat4 load(const short* ptr) { + return __m128(_mm_load4epi16_f32(((__m128i*)ptr))); + } +#elif defined(__SSE4_1__) + static __forceinline vfloat4 load(const short* ptr) { + return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr))); + } +#else + static __forceinline vfloat4 load(const short* ptr) { + return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); + } +#endif + + static __forceinline vfloat4 load(const unsigned short* ptr) { + return _mm_mul_ps(vfloat4(vint4::load(ptr)),vfloat4(1.0f/65535.0f)); + } + + static __forceinline void store_nt(void* ptr, const vfloat4& v) + { +#if defined (__SSE4_1__) +#if defined(__aarch64__) + _mm_stream_ps((float*)ptr,vreinterpretq_s32_f32(v.v)); +#else + _mm_stream_ps((float*)ptr,v); +#endif +#else + _mm_store_ps((float*)ptr,v); +#endif + } + + template + static __forceinline vfloat4 gather(const float* ptr, const vint4& index) { +#if defined(__AVX2__) && !defined(__aarch64__) + return _mm_i32gather_ps(ptr, index, scale); +#else + return vfloat4( + *(float*)(((int8_t*)ptr)+scale*index[0]), + *(float*)(((int8_t*)ptr)+scale*index[1]), + *(float*)(((int8_t*)ptr)+scale*index[2]), + *(float*)(((int8_t*)ptr)+scale*index[3])); +#endif + } + + template + static __forceinline vfloat4 gather(const vboolf4& mask, const float* ptr, const vint4& index) { + vfloat4 r = zero; +#if defined(__AVX512VL__) + return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale); +#elif defined(__AVX2__) && !defined(__aarch64__) + return _mm_mask_i32gather_ps(r, ptr, index, mask, scale); +#else + if (likely(mask[0])) r[0] = *(float*)(((int8_t*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(float*)(((int8_t*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(float*)(((int8_t*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(float*)(((int8_t*)ptr)+scale*index[3]); + return r; +#endif + } + + template + static __forceinline void scatter(void* ptr, const vint4& index, const vfloat4& v) + { +#if defined(__AVX512VL__) + _mm_i32scatter_ps((float*)ptr, index, v, scale); +#else + *(float*)(((int8_t*)ptr)+scale*index[0]) = v[0]; + *(float*)(((int8_t*)ptr)+scale*index[1]) = v[1]; + *(float*)(((int8_t*)ptr)+scale*index[2]) = v[2]; + *(float*)(((int8_t*)ptr)+scale*index[3]) = v[3]; +#endif + } + + template + static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vfloat4& v) + { +#if defined(__AVX512VL__) + _mm_mask_i32scatter_ps((float*)ptr ,mask, index, v, scale); +#else + if (likely(mask[0])) *(float*)(((int8_t*)ptr)+scale*index[0]) = v[0]; + if (likely(mask[1])) *(float*)(((int8_t*)ptr)+scale*index[1]) = v[1]; + if (likely(mask[2])) *(float*)(((int8_t*)ptr)+scale*index[2]) = v[2]; + if (likely(mask[3])) *(float*)(((int8_t*)ptr)+scale*index[3]) = v[3]; +#endif + } + + static __forceinline void store(const vboolf4& mask, int8_t* ptr, const vint4& ofs, const vfloat4& v) { + scatter<1>(mask,ptr,ofs,v); + } + static __forceinline void store(const vboolf4& mask, float* ptr, const vint4& ofs, const vfloat4& v) { + scatter<4>(mask,ptr,ofs,v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator [](size_t index) const { assert(index < 4); return f[index]; } + __forceinline float& operator [](size_t index) { assert(index < 4); return f[index]; } + + friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) { +#if defined(__AVX512VL__) + return _mm_mask_blend_ps(m, f, t); +#elif defined(__SSE4_1__) || (defined(__aarch64__)) + return _mm_blendv_ps(f, t, m); +#else + return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); +#endif + } + }; + + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 asFloat(const vint4& a) { return _mm_castsi128_ps(a); } + __forceinline vint4 asInt (const vfloat4& a) { return _mm_castps_si128(a); } + __forceinline vuint4 asUInt (const vfloat4& a) { return _mm_castps_si128(a); } + + __forceinline vint4 toInt (const vfloat4& a) { return vint4(a); } + __forceinline vfloat4 toFloat(const vint4& a) { return vfloat4(a); } + + __forceinline vfloat4 operator +(const vfloat4& a) { return a; } +#if defined(__aarch64__) + __forceinline vfloat4 operator -(const vfloat4& a) { + return vnegq_f32(a); + } +#else + __forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } +#endif + +#if defined(__aarch64__) + __forceinline vfloat4 abs(const vfloat4& a) { return _mm_abs_ps(a); } +#else + __forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); } +#endif + +#if defined(__AVX512VL__) + __forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); } +#else + __forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); } +#endif + +#if defined(__aarch64__) + __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a, vreinterpretq_f32_u32(v0x80000000)); } +#else + __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } +#endif + + __forceinline vfloat4 rcp(const vfloat4& a) + { +#if defined(__aarch64__) +#if defined(BUILD_IOS) + return vfloat4(vdivq_f32(vdupq_n_f32(1.0f),a.v)); +#else //BUILD_IOS + __m128 reciprocal = _mm_rcp_ps(a); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + // +1 round since NEON's reciprocal estimate instruction has less accuracy than SSE2's rcp. + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + return (const vfloat4)reciprocal; +#endif // BUILD_IOS +#else + +#if defined(__AVX512VL__) + const vfloat4 r = _mm_rcp14_ps(a); +#else + const vfloat4 r = _mm_rcp_ps(a); +#endif + +#if defined(__AVX2__) + return _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); +#else + return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); +#endif + +#endif //defined(__aarch64__) + } + __forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); } + __forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); } + + __forceinline vfloat4 rsqrt(const vfloat4& a) + { +#if defined(__aarch64__) + vfloat4 r = _mm_rsqrt_ps(a); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); + return r; +#else + +#if defined(__AVX512VL__) + const vfloat4 r = _mm_rsqrt14_ps(a); +#else + const vfloat4 r = _mm_rsqrt_ps(a); +#endif + +#if defined(__AVX2__) + return _mm_fmadd_ps(_mm_set1_ps(1.5f), r, + _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); +#else + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r), + _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); +#endif + +#endif + } + + __forceinline vboolf4 isnan(const vfloat4& a) { +#if defined(__aarch64__) + const vfloat4 b = _mm_and_ps(a, vreinterpretq_f32_u32(v0x7fffffff)); +#else + const vfloat4 b = _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); +#endif +#if defined(__AVX512VL__) + return _mm_cmp_epi32_mask(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000), _MM_CMPINT_GT); +#else + return _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000))); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 operator +(const vfloat4& a, const vfloat4& b) { return _mm_add_ps(a, b); } + __forceinline vfloat4 operator +(const vfloat4& a, float b) { return a + vfloat4(b); } + __forceinline vfloat4 operator +(float a, const vfloat4& b) { return vfloat4(a) + b; } + + __forceinline vfloat4 operator -(const vfloat4& a, const vfloat4& b) { return _mm_sub_ps(a, b); } + __forceinline vfloat4 operator -(const vfloat4& a, float b) { return a - vfloat4(b); } + __forceinline vfloat4 operator -(float a, const vfloat4& b) { return vfloat4(a) - b; } + + __forceinline vfloat4 operator *(const vfloat4& a, const vfloat4& b) { return _mm_mul_ps(a, b); } + __forceinline vfloat4 operator *(const vfloat4& a, float b) { return a * vfloat4(b); } + __forceinline vfloat4 operator *(float a, const vfloat4& b) { return vfloat4(a) * b; } + + __forceinline vfloat4 operator /(const vfloat4& a, const vfloat4& b) { return _mm_div_ps(a,b); } + __forceinline vfloat4 operator /(const vfloat4& a, float b) { return a/vfloat4(b); } + __forceinline vfloat4 operator /(float a, const vfloat4& b) { return vfloat4(a)/b; } + + __forceinline vfloat4 operator &(const vfloat4& a, const vfloat4& b) { return _mm_and_ps(a,b); } + __forceinline vfloat4 operator |(const vfloat4& a, const vfloat4& b) { return _mm_or_ps(a,b); } + __forceinline vfloat4 operator ^(const vfloat4& a, const vfloat4& b) { return _mm_xor_ps(a,b); } + __forceinline vfloat4 operator ^(const vfloat4& a, const vint4& b) { return _mm_xor_ps(a,_mm_castsi128_ps(b)); } + + __forceinline vfloat4 min(const vfloat4& a, const vfloat4& b) { return _mm_min_ps(a,b); } + __forceinline vfloat4 min(const vfloat4& a, float b) { return _mm_min_ps(a,vfloat4(b)); } + __forceinline vfloat4 min(float a, const vfloat4& b) { return _mm_min_ps(vfloat4(a),b); } + + __forceinline vfloat4 max(const vfloat4& a, const vfloat4& b) { return _mm_max_ps(a,b); } + __forceinline vfloat4 max(const vfloat4& a, float b) { return _mm_max_ps(a,vfloat4(b)); } + __forceinline vfloat4 max(float a, const vfloat4& b) { return _mm_max_ps(vfloat4(a),b); } + +#if defined(__SSE4_1__) || defined(__aarch64__) + + __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } + + __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } + + __forceinline vfloat4 minui(const vfloat4& a, const vfloat4& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_min_epu32(ai,bi); + return _mm_castsi128_ps(ci); + } + + __forceinline vfloat4 maxui(const vfloat4& a, const vfloat4& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_max_epu32(ai,bi); + return _mm_castsi128_ps(ci); + } +#else + __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { + return min(a,b); + } + + __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) { + return max(a,b); + } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmadd_ps(a,b,c); } + __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmsub_ps(a,b,c); } + __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmadd_ps(a,b,c); } + __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); } +#else + +#if defined(__aarch64__) + __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { + return _mm_madd_ps(a, b, c); //a*b+c; + } + __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { + return _mm_msub_ps(a, b, c); //-a*b+c; + } + __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { + return vnegq_f32(vfmaq_f32(c,a, b)); + } +#else + __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; } + __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;} + __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b-c; } +#endif + __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; } + +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4& operator +=(vfloat4& a, const vfloat4& b) { return a = a + b; } + __forceinline vfloat4& operator +=(vfloat4& a, float b) { return a = a + b; } + + __forceinline vfloat4& operator -=(vfloat4& a, const vfloat4& b) { return a = a - b; } + __forceinline vfloat4& operator -=(vfloat4& a, float b) { return a = a - b; } + + __forceinline vfloat4& operator *=(vfloat4& a, const vfloat4& b) { return a = a * b; } + __forceinline vfloat4& operator *=(vfloat4& a, float b) { return a = a * b; } + + __forceinline vfloat4& operator /=(vfloat4& a, const vfloat4& b) { return a = a / b; } + __forceinline vfloat4& operator /=(vfloat4& a, float b) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_EQ); } + __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_NE); } + __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LT); } + __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GE); } + __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GT); } + __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); } + __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a, b); } + __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); } +#if defined(__aarch64__) + __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpge_ps (a, b); } + __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpgt_ps (a, b); } +#else + __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); } + __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); } +#endif + __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); } +#endif + + __forceinline vboolf4 operator ==(const vfloat4& a, float b) { return a == vfloat4(b); } + __forceinline vboolf4 operator ==(float a, const vfloat4& b) { return vfloat4(a) == b; } + + __forceinline vboolf4 operator !=(const vfloat4& a, float b) { return a != vfloat4(b); } + __forceinline vboolf4 operator !=(float a, const vfloat4& b) { return vfloat4(a) != b; } + + __forceinline vboolf4 operator < (const vfloat4& a, float b) { return a < vfloat4(b); } + __forceinline vboolf4 operator < (float a, const vfloat4& b) { return vfloat4(a) < b; } + + __forceinline vboolf4 operator >=(const vfloat4& a, float b) { return a >= vfloat4(b); } + __forceinline vboolf4 operator >=(float a, const vfloat4& b) { return vfloat4(a) >= b; } + + __forceinline vboolf4 operator > (const vfloat4& a, float b) { return a > vfloat4(b); } + __forceinline vboolf4 operator > (float a, const vfloat4& b) { return vfloat4(a) > b; } + + __forceinline vboolf4 operator <=(const vfloat4& a, float b) { return a <= vfloat4(b); } + __forceinline vboolf4 operator <=(float a, const vfloat4& b) { return vfloat4(a) <= b; } + + __forceinline vboolf4 eq(const vfloat4& a, const vfloat4& b) { return a == b; } + __forceinline vboolf4 ne(const vfloat4& a, const vfloat4& b) { return a != b; } + __forceinline vboolf4 lt(const vfloat4& a, const vfloat4& b) { return a < b; } + __forceinline vboolf4 ge(const vfloat4& a, const vfloat4& b) { return a >= b; } + __forceinline vboolf4 gt(const vfloat4& a, const vfloat4& b) { return a > b; } + __forceinline vboolf4 le(const vfloat4& a, const vfloat4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a == b); } + __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a != b); } + __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a < b); } + __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a >= b); } + __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a > b); } + __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a <= b); } +#endif + + template + __forceinline vfloat4 select(const vfloat4& t, const vfloat4& f) + { +#if defined(__SSE4_1__) + return _mm_blend_ps(f, t, mask); +#else + return select(vboolf4(mask), t, f); +#endif + } + +#if defined(__aarch64__) + template<> __forceinline vfloat4 select<0>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vzero)); + } + template<> __forceinline vfloat4 select<1>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v000F)); + } + template<> __forceinline vfloat4 select<2>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v00F0)); + } + template<> __forceinline vfloat4 select<3>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v00FF)); + } + template<> __forceinline vfloat4 select<4>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0F00)); + } + template<> __forceinline vfloat4 select<5>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0F0F)); + } + template<> __forceinline vfloat4 select<6>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0FF0)); + } + template<> __forceinline vfloat4 select<7>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0FFF)); + } + template<> __forceinline vfloat4 select<8>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF000)); + } + template<> __forceinline vfloat4 select<9>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF00F)); + } + template<> __forceinline vfloat4 select<10>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF0F0)); + } + template<> __forceinline vfloat4 select<11>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF0FF)); + } + template<> __forceinline vfloat4 select<12>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFF00)); + } + template<> __forceinline vfloat4 select<13>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFF0F)); + } + template<> __forceinline vfloat4 select<14>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFFF0)); + } + template<> __forceinline vfloat4 select<15>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFFFF)); + } +#endif + + __forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t) { + return madd(t,b-a,a); + } + + __forceinline bool isvalid(const vfloat4& v) { + return all((v > vfloat4(-FLT_LARGE)) & (v < vfloat4(+FLT_LARGE))); + } + + __forceinline bool is_finite(const vfloat4& a) { + return all((a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX))); + } + + __forceinline bool is_finite(const vboolf4& valid, const vfloat4& a) { + return all(valid, (a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__aarch64__) + __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } // towards -inf + __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } // toward +inf + __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } // towards 0 + __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } // to nearest, ties to even. NOTE(LTE): arm clang uses vrndnq, old gcc uses vrndqn? +#elif defined (__SSE4_1__) + __forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } + __forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } + __forceinline vfloat4 trunc(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO ); } + __forceinline vfloat4 round(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } // (even) https://www.felixcloutier.com/x86/roundpd +#else + __forceinline vfloat4 floor(const vfloat4& a) { return vfloat4(floorf(a[0]),floorf(a[1]),floorf(a[2]),floorf(a[3])); } + __forceinline vfloat4 ceil (const vfloat4& a) { return vfloat4(ceilf (a[0]),ceilf (a[1]),ceilf (a[2]),ceilf (a[3])); } + __forceinline vfloat4 trunc(const vfloat4& a) { return vfloat4(truncf(a[0]),truncf(a[1]),truncf(a[2]),truncf(a[3])); } + __forceinline vfloat4 round(const vfloat4& a) { return vfloat4(roundf(a[0]),roundf(a[1]),roundf(a[2]),roundf(a[3])); } +#endif + __forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); } + + __forceinline vint4 floori(const vfloat4& a) { +#if defined(__aarch64__) + return vcvtq_s32_f32(floor(a)); +#elif defined(__SSE4_1__) + return vint4(floor(a)); +#else + return vint4(a-vfloat4(0.5f)); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a, b); } + __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); } + +#if defined(__aarch64__) + template + __forceinline vfloat4 shuffle(const vfloat4& v) { + return vreinterpretq_f32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); + } + template + __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) { + return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); + } +#else + template + __forceinline vfloat4 shuffle(const vfloat4& v) { + return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) { + return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } +#endif + +#if defined (__SSSE3__) + __forceinline vfloat4 shuffle8(const vfloat4& a, const vint4& shuf) { + return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf)); + } +#endif + +#if defined(__aarch64__) + template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v0022 )); } + template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v1133)); } + template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v0101)); } +#elif defined(__SSE3__) + template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); } + template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); } + template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); } +#endif + + template + __forceinline vfloat4 shuffle(const vfloat4& v) { + return shuffle(v); + } + +#if defined(__aarch64__) + template __forceinline float extract(const vfloat4& a); + template<> __forceinline float extract<0>(const vfloat4& b) { + return b[0]; + } + template<> __forceinline float extract<1>(const vfloat4& b) { + return b[1]; + } + template<> __forceinline float extract<2>(const vfloat4& b) { + return b[2]; + } + template<> __forceinline float extract<3>(const vfloat4& b) { + return b[3]; + } +#elif defined (__SSE4_1__) && !defined(__GNUC__) + template __forceinline float extract(const vfloat4& a) { return _mm_cvtss_f32(_mm_extract_ps(a,i)); } + template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); } +#else + template __forceinline float extract(const vfloat4& a) { return _mm_cvtss_f32(shuffle(a)); } + template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); } +#endif + + +#if defined(__aarch64__) + template __forceinline vfloat4 insert(const vfloat4& a, float b); + template<> __forceinline vfloat4 insert<0>(const vfloat4& a, float b) + { + vfloat4 c = a; + c[0] = b; + return c; + } + template<> __forceinline vfloat4 insert<1>(const vfloat4& a, float b) + { + vfloat4 c = a; + c[1] = b; + return c; + } + template<> __forceinline vfloat4 insert<2>(const vfloat4& a, float b) + { + vfloat4 c = a; + c[2] = b; + return c; + } + template<> __forceinline vfloat4 insert<3>(const vfloat4& a, float b) + { + vfloat4 c = a; + c[3] = b; + return c; + } +#elif defined (__SSE4_1__) + template __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } + template __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return insert(a, b); } + template __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert(a, _mm_set_ss(b)); } +#else + template __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { vfloat4 c = a; c[dst&3] = b[src&3]; return c; } + template __forceinline vfloat4 insert(const vfloat4& a, float b) { vfloat4 c = a; c[dst&3] = b; return c; } +#endif + +#if defined(__aarch64__) + __forceinline float toScalar(const vfloat4& v) { + return v[0]; + } +#else + __forceinline float toScalar(const vfloat4& v) { return _mm_cvtss_f32(v); } +#endif + __forceinline vfloat4 broadcast4f(const vfloat4& a, size_t k) { + return vfloat4::broadcast(&a[k]); + } + + __forceinline vfloat4 shift_right_1(const vfloat4& x) { + return _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(x), 4)); + } + +#if defined (__AVX2__) + __forceinline vfloat4 permute(const vfloat4 &a, const __m128i &index) { + return _mm_permutevar_ps(a,index); + } + + __forceinline vfloat4 broadcast1f(const void* a) { return _mm_broadcast_ss((float*)a); } + +#endif + +#if defined(__AVX512VL__) + template + __forceinline vfloat4 align_shift_right(const vfloat4& a, const vfloat4& b) { + return _mm_castsi128_ps(_mm_alignr_epi32(_mm_castps_si128(a), _mm_castps_si128(b), i)); + } +#endif + + + //////////////////////////////////////////////////////////////////////////////// + /// Sorting Network + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 sort_ascending(const vfloat4& v) + { + const vfloat4 a0 = v; + const vfloat4 b0 = shuffle<1,0,3,2>(a0); + const vfloat4 c0 = min(a0,b0); + const vfloat4 d0 = max(a0,b0); + const vfloat4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vfloat4 b1 = shuffle<2,3,0,1>(a1); + const vfloat4 c1 = min(a1,b1); + const vfloat4 d1 = max(a1,b1); + const vfloat4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vfloat4 b2 = shuffle<0,2,1,3>(a2); + const vfloat4 c2 = min(a2,b2); + const vfloat4 d2 = max(a2,b2); + const vfloat4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3; + } + + __forceinline vfloat4 sort_descending(const vfloat4& v) + { + const vfloat4 a0 = v; + const vfloat4 b0 = shuffle<1,0,3,2>(a0); + const vfloat4 c0 = max(a0,b0); + const vfloat4 d0 = min(a0,b0); + const vfloat4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vfloat4 b1 = shuffle<2,3,0,1>(a1); + const vfloat4 c1 = max(a1,b1); + const vfloat4 d1 = min(a1,b1); + const vfloat4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vfloat4 b2 = shuffle<0,2,1,3>(a2); + const vfloat4 c2 = max(a2,b2); + const vfloat4 d2 = min(a2,b2); + const vfloat4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Transpose + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2, vfloat4& c3) + { + vfloat4 l02 = unpacklo(r0,r2); + vfloat4 h02 = unpackhi(r0,r2); + vfloat4 l13 = unpacklo(r1,r3); + vfloat4 h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = unpacklo(h02,h13); + c3 = unpackhi(h02,h13); + } + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2) + { + vfloat4 l02 = unpacklo(r0,r2); + vfloat4 h02 = unpackhi(r0,r2); + vfloat4 l13 = unpacklo(r1,r3); + vfloat4 h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = unpacklo(h02,h13); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// +#if defined(__aarch64__) + __forceinline vfloat4 vreduce_min(const vfloat4& v) { float h = vminvq_f32(v); return vdupq_n_f32(h); } + __forceinline vfloat4 vreduce_max(const vfloat4& v) { float h = vmaxvq_f32(v); return vdupq_n_f32(h); } + __forceinline vfloat4 vreduce_add(const vfloat4& v) { float h = vaddvq_f32(v); return vdupq_n_f32(h); } +#else + __forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } + __forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } + __forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } +#endif + +#if defined(__aarch64__) + __forceinline float reduce_min(const vfloat4& v) { return vminvq_f32(v); } + __forceinline float reduce_max(const vfloat4& v) { return vmaxvq_f32(v); } + __forceinline float reduce_add(const vfloat4& v) { return vaddvq_f32(v); } +#else + __forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); } + __forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); } + __forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); } +#endif + + __forceinline size_t select_min(const vboolf4& valid, const vfloat4& v) + { + const vfloat4 a = select(valid,v,vfloat4(pos_inf)); + const vbool4 valid_min = valid & (a == vreduce_min(a)); + return bsf(movemask(any(valid_min) ? valid_min : valid)); + } + __forceinline size_t select_max(const vboolf4& valid, const vfloat4& v) + { + const vfloat4 a = select(valid,v,vfloat4(neg_inf)); + const vbool4 valid_max = valid & (a == vreduce_max(a)); + return bsf(movemask(any(valid_max) ? valid_max : valid)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float dot(const vfloat4& a, const vfloat4& b) { + return reduce_add(a*b); + } + + __forceinline vfloat4 cross(const vfloat4& a, const vfloat4& b) + { + const vfloat4 a0 = a; + const vfloat4 b0 = shuffle<1,2,0,3>(b); + const vfloat4 a1 = shuffle<1,2,0,3>(a); + const vfloat4 b1 = b; + return shuffle<1,2,0,3>(prod_diff(a0,b0,a1,b1)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; + } + +} diff --git a/thirdparty/embree-aarch64/common/simd/vfloat8_avx.h b/thirdparty/embree-aarch64/common/simd/vfloat8_avx.h new file mode 100644 index 0000000000..3c7e4a8cdc --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vfloat8_avx.h @@ -0,0 +1,847 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 8-wide AVX float type */ + template<> + struct vfloat<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { __m256 v; float f[8]; int i[8]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat() {} + __forceinline vfloat(const vfloat8& other) { v = other.v; } + __forceinline vfloat8& operator =(const vfloat8& other) { v = other.v; return *this; } + + __forceinline vfloat(__m256 a) : v(a) {} + __forceinline operator const __m256&() const { return v; } + __forceinline operator __m256&() { return v; } + + __forceinline explicit vfloat(const vfloat4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {} + __forceinline vfloat(const vfloat4& a, const vfloat4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} + + __forceinline explicit vfloat(const int8_t* a) : v(_mm256_loadu_ps((const float*)a)) {} + __forceinline vfloat(float a) : v(_mm256_set1_ps(a)) {} + __forceinline vfloat(float a, float b) : v(_mm256_set_ps(b, a, b, a, b, a, b, a)) {} + __forceinline vfloat(float a, float b, float c, float d) : v(_mm256_set_ps(d, c, b, a, d, c, b, a)) {} + __forceinline vfloat(float a, float b, float c, float d, float e, float f, float g, float h) : v(_mm256_set_ps(h, g, f, e, d, c, b, a)) {} + + __forceinline explicit vfloat(__m256i a) : v(_mm256_cvtepi32_ps(a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat(ZeroTy) : v(_mm256_setzero_ps()) {} + __forceinline vfloat(OneTy) : v(_mm256_set1_ps(1.0f)) {} + __forceinline vfloat(PosInfTy) : v(_mm256_set1_ps(pos_inf)) {} + __forceinline vfloat(NegInfTy) : v(_mm256_set1_ps(neg_inf)) {} + __forceinline vfloat(StepTy) : v(_mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)) {} + __forceinline vfloat(NaNTy) : v(_mm256_set1_ps(nan)) {} + __forceinline vfloat(UndefinedTy) : v(_mm256_undefined_ps()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vfloat8 broadcast(const void* a) { + return _mm256_broadcast_ss((float*)a); + } + + static __forceinline vfloat8 broadcast2(const float* a, const float* b) { +#if defined(__INTEL_COMPILER) + const vfloat8 v0 = _mm256_broadcast_ss(a); + const vfloat8 v1 = _mm256_broadcast_ss(b); + return _mm256_blend_ps(v1, v0, 0xf); +#else + return _mm256_set_ps(*b,*b,*b,*b,*a,*a,*a,*a); +#endif + } + + static __forceinline vfloat8 broadcast4f(const vfloat4* ptr) { + return _mm256_broadcast_ps((__m128*)ptr); + } + + static __forceinline vfloat8 load(const int8_t* ptr) { +#if defined(__AVX2__) + return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr))); +#else + return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); +#endif + } + + static __forceinline vfloat8 load(const uint8_t* ptr) { +#if defined(__AVX2__) + return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr))); +#else + return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); +#endif + } + + static __forceinline vfloat8 load(const short* ptr) { +#if defined(__AVX2__) + return _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr))); +#else + return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); +#endif + } + + static __forceinline vfloat8 load (const void* ptr) { return _mm256_load_ps((float*)ptr); } + static __forceinline vfloat8 loadu(const void* ptr) { return _mm256_loadu_ps((float*)ptr); } + + static __forceinline void store (void* ptr, const vfloat8& v) { return _mm256_store_ps((float*)ptr,v); } + static __forceinline void storeu(void* ptr, const vfloat8& v) { return _mm256_storeu_ps((float*)ptr,v); } + +#if defined(__AVX512VL__) + + static __forceinline vfloat8 compact(const vboolf8& mask, vfloat8 &v) { + return _mm256_mask_compress_ps(v, mask, v); + } + static __forceinline vfloat8 compact(const vboolf8& mask, vfloat8 &a, const vfloat8& b) { + return _mm256_mask_compress_ps(a, mask, b); + } + + static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_ps (_mm256_setzero_ps(),mask,(float*)ptr); } + static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_ps(_mm256_setzero_ps(),mask,(float*)ptr); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask,v); } +#elif defined(__aarch64__) + static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask.v); } + static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask.v); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,v); } +#else + static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); } + static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); } +#endif + +#if defined(__AVX2__) + static __forceinline vfloat8 load_nt(void* ptr) { + return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i*)ptr)); + } +#endif + + static __forceinline void store_nt(void* ptr, const vfloat8& v) { + _mm256_stream_ps((float*)ptr,v); + } + + template + static __forceinline vfloat8 gather(const float* ptr, const vint8& index) { +#if defined(__AVX2__) && !defined(__aarch64__) + return _mm256_i32gather_ps(ptr, index ,scale); +#else + return vfloat8( + *(float*)(((int8_t*)ptr)+scale*index[0]), + *(float*)(((int8_t*)ptr)+scale*index[1]), + *(float*)(((int8_t*)ptr)+scale*index[2]), + *(float*)(((int8_t*)ptr)+scale*index[3]), + *(float*)(((int8_t*)ptr)+scale*index[4]), + *(float*)(((int8_t*)ptr)+scale*index[5]), + *(float*)(((int8_t*)ptr)+scale*index[6]), + *(float*)(((int8_t*)ptr)+scale*index[7])); +#endif + } + + template + static __forceinline vfloat8 gather(const vboolf8& mask, const float* ptr, const vint8& index) { + vfloat8 r = zero; +#if defined(__AVX512VL__) + return _mm256_mmask_i32gather_ps(r, mask, index, ptr, scale); +#elif defined(__AVX2__) && !defined(__aarch64__) + return _mm256_mask_i32gather_ps(r, ptr, index, mask, scale); +#else + if (likely(mask[0])) r[0] = *(float*)(((int8_t*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(float*)(((int8_t*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(float*)(((int8_t*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(float*)(((int8_t*)ptr)+scale*index[3]); + if (likely(mask[4])) r[4] = *(float*)(((int8_t*)ptr)+scale*index[4]); + if (likely(mask[5])) r[5] = *(float*)(((int8_t*)ptr)+scale*index[5]); + if (likely(mask[6])) r[6] = *(float*)(((int8_t*)ptr)+scale*index[6]); + if (likely(mask[7])) r[7] = *(float*)(((int8_t*)ptr)+scale*index[7]); + return r; + #endif + } + + template + static __forceinline void scatter(void* ptr, const vint8& ofs, const vfloat8& v) + { +#if defined(__AVX512VL__) + _mm256_i32scatter_ps((float*)ptr, ofs, v, scale); +#else + *(float*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + *(float*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + *(float*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + *(float*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + *(float*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + *(float*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + *(float*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + *(float*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + template + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vfloat8& v) + { +#if defined(__AVX512VL__) + _mm256_mask_i32scatter_ps((float*)ptr, mask, ofs, v, scale); +#else + if (likely(mask[0])) *(float*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(float*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(float*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(float*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(float*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(float*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(float*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(float*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + static __forceinline void store(const vboolf8& mask, int8_t* ptr, const vint8& ofs, const vfloat8& v) { + scatter<1>(mask,ptr,ofs,v); + } + static __forceinline void store(const vboolf8& mask, float* ptr, const vint8& ofs, const vfloat8& v) { + scatter<4>(mask,ptr,ofs,v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator [](size_t index) const { assert(index < 8); return f[index]; } + __forceinline float& operator [](size_t index) { assert(index < 8); return f[index]; } + }; + + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 asFloat(const vint8& a) { return _mm256_castsi256_ps(a); } + __forceinline vint8 asInt (const vfloat8& a) { return _mm256_castps_si256(a); } + + __forceinline vint8 toInt (const vfloat8& a) { return vint8(a); } + __forceinline vfloat8 toFloat(const vint8& a) { return vfloat8(a); } + + __forceinline vfloat8 operator +(const vfloat8& a) { return a; } +#if !defined(__aarch64__) + __forceinline vfloat8 operator -(const vfloat8& a) { + const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); + return _mm256_xor_ps(a, mask); + } +#else + __forceinline vfloat8 operator -(const vfloat8& a) { + __m256 res; + res.lo = vnegq_f32(a.v.lo); + res.hi = vnegq_f32(a.v.hi); + return res; +} +#endif + +#if !defined(__aarch64__) +__forceinline vfloat8 abs(const vfloat8& a) { + const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)); + return _mm256_and_ps(a, mask); +} +#else +__forceinline vfloat8 abs(const vfloat8& a) { + __m256 res; + res.lo = vabsq_f32(a.v.lo); + res.hi = vabsq_f32(a.v.hi); + return res; +} +#endif + +#if !defined(__aarch64__) + __forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmp_ps(a, vfloat8(zero), _CMP_NGE_UQ)); } +#else + __forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmplt_ps(a, vfloat8(zero))); } +#endif + __forceinline vfloat8 signmsk(const vfloat8& a) { return _mm256_and_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); } + + + static __forceinline vfloat8 rcp(const vfloat8& a) + { +#if defined(BUILD_IOS) && defined(__aarch64__) + // ios devices are faster doing full divide, no need for NR fixup + vfloat8 ret; + const float32x4_t one = vdupq_n_f32(1.0f); + ret.v.lo = vdivq_f32(one, a.v.lo); + ret.v.hi = vdivq_f32(one, a.v.hi); + return ret; +#endif + +#if defined(__AVX512VL__) + const vfloat8 r = _mm256_rcp14_ps(a); +#else + const vfloat8 r = _mm256_rcp_ps(a); +#endif + +#if defined(__AVX2__) //&& !defined(aarch64) + return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vfloat8(2.0f))); +#else + return _mm256_mul_ps(r, _mm256_sub_ps(vfloat8(2.0f), _mm256_mul_ps(r, a))); +#endif + } + __forceinline vfloat8 sqr (const vfloat8& a) { return _mm256_mul_ps(a,a); } + __forceinline vfloat8 sqrt(const vfloat8& a) { return _mm256_sqrt_ps(a); } + + static __forceinline vfloat8 rsqrt(const vfloat8& a) + { +#if defined(__AVX512VL__) + const vfloat8 r = _mm256_rsqrt14_ps(a); +#else + const vfloat8 r = _mm256_rsqrt_ps(a); +#endif + +#if defined(__AVX2__) + return _mm256_fmadd_ps(_mm256_set1_ps(1.5f), r, + _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r))); +#else + return _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(1.5f), r), + _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r))); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 operator +(const vfloat8& a, const vfloat8& b) { return _mm256_add_ps(a, b); } + __forceinline vfloat8 operator +(const vfloat8& a, float b) { return a + vfloat8(b); } + __forceinline vfloat8 operator +(float a, const vfloat8& b) { return vfloat8(a) + b; } + + __forceinline vfloat8 operator -(const vfloat8& a, const vfloat8& b) { return _mm256_sub_ps(a, b); } + __forceinline vfloat8 operator -(const vfloat8& a, float b) { return a - vfloat8(b); } + __forceinline vfloat8 operator -(float a, const vfloat8& b) { return vfloat8(a) - b; } + + __forceinline vfloat8 operator *(const vfloat8& a, const vfloat8& b) { return _mm256_mul_ps(a, b); } + __forceinline vfloat8 operator *(const vfloat8& a, float b) { return a * vfloat8(b); } + __forceinline vfloat8 operator *(float a, const vfloat8& b) { return vfloat8(a) * b; } + + __forceinline vfloat8 operator /(const vfloat8& a, const vfloat8& b) { return _mm256_div_ps(a, b); } + __forceinline vfloat8 operator /(const vfloat8& a, float b) { return a / vfloat8(b); } + __forceinline vfloat8 operator /(float a, const vfloat8& b) { return vfloat8(a) / b; } + + __forceinline vfloat8 operator &(const vfloat8& a, const vfloat8& b) { return _mm256_and_ps(a,b); } + __forceinline vfloat8 operator |(const vfloat8& a, const vfloat8& b) { return _mm256_or_ps(a,b); } + __forceinline vfloat8 operator ^(const vfloat8& a, const vfloat8& b) { return _mm256_xor_ps(a,b); } + __forceinline vfloat8 operator ^(const vfloat8& a, const vint8& b) { return _mm256_xor_ps(a,_mm256_castsi256_ps(b)); } + + __forceinline vfloat8 min(const vfloat8& a, const vfloat8& b) { return _mm256_min_ps(a, b); } + __forceinline vfloat8 min(const vfloat8& a, float b) { return _mm256_min_ps(a, vfloat8(b)); } + __forceinline vfloat8 min(float a, const vfloat8& b) { return _mm256_min_ps(vfloat8(a), b); } + + __forceinline vfloat8 max(const vfloat8& a, const vfloat8& b) { return _mm256_max_ps(a, b); } + __forceinline vfloat8 max(const vfloat8& a, float b) { return _mm256_max_ps(a, vfloat8(b)); } + __forceinline vfloat8 max(float a, const vfloat8& b) { return _mm256_max_ps(vfloat8(a), b); } + + /* need "static __forceinline for MSVC, otherwise we'll link the wrong version in debug mode */ +#if defined(__AVX2__) + + static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) { + const vint8 ai = _mm256_castps_si256(a); + const vint8 bi = _mm256_castps_si256(b); + const vint8 ci = _mm256_min_epi32(ai,bi); + return _mm256_castsi256_ps(ci); + } + + static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) { + const vint8 ai = _mm256_castps_si256(a); + const vint8 bi = _mm256_castps_si256(b); + const vint8 ci = _mm256_max_epi32(ai,bi); + return _mm256_castsi256_ps(ci); + } + + static __forceinline vfloat8 minui(const vfloat8& a, const vfloat8& b) { + const vint8 ai = _mm256_castps_si256(a); + const vint8 bi = _mm256_castps_si256(b); + const vint8 ci = _mm256_min_epu32(ai,bi); + return _mm256_castsi256_ps(ci); + } + + static __forceinline vfloat8 maxui(const vfloat8& a, const vfloat8& b) { + const vint8 ai = _mm256_castps_si256(a); + const vint8 bi = _mm256_castps_si256(b); + const vint8 ci = _mm256_max_epu32(ai,bi); + return _mm256_castsi256_ps(ci); + } + +#else + + static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) { + return asFloat(min(asInt(a),asInt(b))); + } + + static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) { + return asFloat(max(asInt(a),asInt(b))); + } + +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + static __forceinline vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmadd_ps(a,b,c); } + static __forceinline vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmsub_ps(a,b,c); } + static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmadd_ps(a,b,c); } + static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmsub_ps(a,b,c); } +#else + static __forceinline vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b+c; } + static __forceinline vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b-c; } + static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b+c;} + static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b-c; } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8& operator +=(vfloat8& a, const vfloat8& b) { return a = a + b; } + __forceinline vfloat8& operator +=(vfloat8& a, float b) { return a = a + b; } + + __forceinline vfloat8& operator -=(vfloat8& a, const vfloat8& b) { return a = a - b; } + __forceinline vfloat8& operator -=(vfloat8& a, float b) { return a = a - b; } + + __forceinline vfloat8& operator *=(vfloat8& a, const vfloat8& b) { return a = a * b; } + __forceinline vfloat8& operator *=(vfloat8& a, float b) { return a = a * b; } + + __forceinline vfloat8& operator /=(vfloat8& a, const vfloat8& b) { return a = a / b; } + __forceinline vfloat8& operator /=(vfloat8& a, float b) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_EQ); } + static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_NE); } + static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LT); } + static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GE); } + static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GT); } + static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LE); } + + static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { + return _mm256_mask_blend_ps(m, f, t); + } +#elif !defined(__aarch64__) + __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); } + __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); } + __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); } + __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } + __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } + __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); } + + __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { + return _mm256_blendv_ps(f, t, m); + } +#else + __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmpeq_ps(a, b); } + __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpneq_ps(a, b); } + __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmplt_ps(a, b); } + __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpge_ps(a, b); } + __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmpgt_ps(a, b); } + __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmple_ps(a, b); } + + __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { + return _mm256_blendv_ps(f, t, m); + } + +#endif + + template + __forceinline vfloat8 select(const vfloat8& t, const vfloat8& f) { + return _mm256_blend_ps(f, t, mask); + } + + __forceinline vboolf8 operator ==(const vfloat8& a, const float& b) { return a == vfloat8(b); } + __forceinline vboolf8 operator ==(const float& a, const vfloat8& b) { return vfloat8(a) == b; } + + __forceinline vboolf8 operator !=(const vfloat8& a, const float& b) { return a != vfloat8(b); } + __forceinline vboolf8 operator !=(const float& a, const vfloat8& b) { return vfloat8(a) != b; } + + __forceinline vboolf8 operator < (const vfloat8& a, const float& b) { return a < vfloat8(b); } + __forceinline vboolf8 operator < (const float& a, const vfloat8& b) { return vfloat8(a) < b; } + + __forceinline vboolf8 operator >=(const vfloat8& a, const float& b) { return a >= vfloat8(b); } + __forceinline vboolf8 operator >=(const float& a, const vfloat8& b) { return vfloat8(a) >= b; } + + __forceinline vboolf8 operator > (const vfloat8& a, const float& b) { return a > vfloat8(b); } + __forceinline vboolf8 operator > (const float& a, const vfloat8& b) { return vfloat8(a) > b; } + + __forceinline vboolf8 operator <=(const vfloat8& a, const float& b) { return a <= vfloat8(b); } + __forceinline vboolf8 operator <=(const float& a, const vfloat8& b) { return vfloat8(a) <= b; } + + __forceinline vboolf8 eq(const vfloat8& a, const vfloat8& b) { return a == b; } + __forceinline vboolf8 ne(const vfloat8& a, const vfloat8& b) { return a != b; } + __forceinline vboolf8 lt(const vfloat8& a, const vfloat8& b) { return a < b; } + __forceinline vboolf8 ge(const vfloat8& a, const vfloat8& b) { return a >= b; } + __forceinline vboolf8 gt(const vfloat8& a, const vfloat8& b) { return a > b; } + __forceinline vboolf8 le(const vfloat8& a, const vfloat8& b) { return a <= b; } + +#if defined(__AVX512VL__) + static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); } + static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); } + static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); } +#else + static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a == b); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a != b); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a < b); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a >= b); } + static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a > b); } + static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a <= b); } +#endif + + __forceinline vfloat8 lerp(const vfloat8& a, const vfloat8& b, const vfloat8& t) { + return madd(t,b-a,a); + } + + __forceinline bool isvalid (const vfloat8& v) { + return all((v > vfloat8(-FLT_LARGE)) & (v < vfloat8(+FLT_LARGE))); + } + + __forceinline bool is_finite (const vfloat8& a) { + return all((a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX))); + } + + __forceinline bool is_finite (const vboolf8& valid, const vfloat8& a) { + return all(valid, (a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if !defined(__aarch64__) + __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF ); } + __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF ); } + __forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO ); } + __forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } +#else + __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_floor_ps(a); } + __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_ceil_ps(a); } +#endif + + + __forceinline vfloat8 frac (const vfloat8& a) { return a-floor(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 unpacklo(const vfloat8& a, const vfloat8& b) { return _mm256_unpacklo_ps(a, b); } + __forceinline vfloat8 unpackhi(const vfloat8& a, const vfloat8& b) { return _mm256_unpackhi_ps(a, b); } + + template + __forceinline vfloat8 shuffle(const vfloat8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); + } + + template + __forceinline vfloat8 shuffle4(const vfloat8& v) { + return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vfloat8 shuffle4(const vfloat8& a, const vfloat8& b) { + return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vfloat8 shuffle(const vfloat8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template + __forceinline vfloat8 shuffle(const vfloat8& a, const vfloat8& b) { + return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } + +#if !defined(__aarch64__) + template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v); } + template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v); } + template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } +#endif + + __forceinline vfloat8 broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); } + template __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a, b, i); } + template __forceinline vfloat4 extract4 (const vfloat8& a) { return _mm256_extractf128_ps(a, i); } + template<> __forceinline vfloat4 extract4<0>(const vfloat8& a) { return _mm256_castps256_ps128(a); } + + __forceinline float toScalar(const vfloat8& v) { return _mm_cvtss_f32(_mm256_castps256_ps128(v)); } + + __forceinline vfloat8 assign(const vfloat4& a) { return _mm256_castps128_ps256(a); } + +#if defined (__AVX2__) && !defined(__aarch64__) + __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) { + return _mm256_permutevar8x32_ps(a, index); + } +#endif + +#if defined(__AVX512VL__) + template + static __forceinline vfloat8 align_shift_right(const vfloat8& a, const vfloat8& b) { + return _mm256_castsi256_ps(_mm256_alignr_epi32(_mm256_castps_si256(a), _mm256_castps_si256(b), i)); + } +#endif + +#if defined (__AVX_I__) + template + static __forceinline vint4 convert_to_hf16(const vfloat8& a) { + return _mm256_cvtps_ph(a, mode); + } + + static __forceinline vfloat8 convert_from_hf16(const vint4& a) { + return _mm256_cvtph_ps(a); + } +#endif + + __forceinline vfloat4 broadcast4f(const vfloat8& a, const size_t k) { + return vfloat4::broadcast(&a[k]); + } + + __forceinline vfloat8 broadcast8f(const vfloat8& a, const size_t k) { + return vfloat8::broadcast(&a[k]); + } + +#if defined(__AVX512VL__) + static __forceinline vfloat8 shift_right_1(const vfloat8& x) { + return align_shift_right<1>(zero,x); + } +#else + static __forceinline vfloat8 shift_right_1(const vfloat8& x) { + const vfloat8 t0 = shuffle<1,2,3,0>(x); + const vfloat8 t1 = shuffle4<1,0>(t0); + return _mm256_blend_ps(t0,t1,0x88); + } +#endif + + __forceinline vint8 floori(const vfloat8& a) { + return vint8(floor(a)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Transpose + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3) + { + vfloat8 l02 = unpacklo(r0,r2); + vfloat8 h02 = unpackhi(r0,r2); + vfloat8 l13 = unpacklo(r1,r3); + vfloat8 h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = unpacklo(h02,h13); + c3 = unpackhi(h02,h13); + } + + __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2) + { + vfloat8 l02 = unpacklo(r0,r2); + vfloat8 h02 = unpackhi(r0,r2); + vfloat8 l13 = unpacklo(r1,r3); + vfloat8 h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = unpacklo(h02,h13); + } + + __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7, + vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3, vfloat8& c4, vfloat8& c5, vfloat8& c6, vfloat8& c7) + { + vfloat8 h0,h1,h2,h3; transpose(r0,r1,r2,r3,h0,h1,h2,h3); + vfloat8 h4,h5,h6,h7; transpose(r4,r5,r6,r7,h4,h5,h6,h7); + c0 = shuffle4<0,2>(h0,h4); + c1 = shuffle4<0,2>(h1,h5); + c2 = shuffle4<0,2>(h2,h6); + c3 = shuffle4<0,2>(h3,h7); + c4 = shuffle4<1,3>(h0,h4); + c5 = shuffle4<1,3>(h1,h5); + c6 = shuffle4<1,3>(h2,h6); + c7 = shuffle4<1,3>(h3,h7); + } + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, + vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3) + { + transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2, c3); + } + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, + vfloat8& c0, vfloat8& c1, vfloat8& c2) + { + transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// +#if !defined(__aarch64__) + __forceinline vfloat8 vreduce_min2(const vfloat8& v) { return min(v,shuffle<1,0,3,2>(v)); } + __forceinline vfloat8 vreduce_min4(const vfloat8& v) { vfloat8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vfloat8 vreduce_min (const vfloat8& v) { vfloat8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + __forceinline vfloat8 vreduce_max2(const vfloat8& v) { return max(v,shuffle<1,0,3,2>(v)); } + __forceinline vfloat8 vreduce_max4(const vfloat8& v) { vfloat8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vfloat8 vreduce_max (const vfloat8& v) { vfloat8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vfloat8 vreduce_add2(const vfloat8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vfloat8 vreduce_add4(const vfloat8& v) { vfloat8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vfloat8 vreduce_add (const vfloat8& v) { vfloat8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + __forceinline float reduce_min(const vfloat8& v) { return toScalar(vreduce_min(v)); } + __forceinline float reduce_max(const vfloat8& v) { return toScalar(vreduce_max(v)); } + __forceinline float reduce_add(const vfloat8& v) { return toScalar(vreduce_add(v)); } +#else + __forceinline float reduce_min(const vfloat8& v) { return vminvq_f32(_mm_min_ps(v.v.lo,v.v.hi)); } + __forceinline float reduce_max(const vfloat8& v) { return vmaxvq_f32(_mm_max_ps(v.v.lo,v.v.hi)); } + __forceinline vfloat8 vreduce_min(const vfloat8& v) { return vfloat8(reduce_min(v)); } + __forceinline vfloat8 vreduce_max(const vfloat8& v) { return vfloat8(reduce_max(v)); } + __forceinline float reduce_add(const vfloat8& v) { return vaddvq_f32(_mm_add_ps(v.v.lo,v.v.hi)); } + +#endif + __forceinline size_t select_min(const vboolf8& valid, const vfloat8& v) + { + const vfloat8 a = select(valid,v,vfloat8(pos_inf)); + const vbool8 valid_min = valid & (a == vreduce_min(a)); + return bsf(movemask(any(valid_min) ? valid_min : valid)); + } + + __forceinline size_t select_max(const vboolf8& valid, const vfloat8& v) + { + const vfloat8 a = select(valid,v,vfloat8(neg_inf)); + const vbool8 valid_max = valid & (a == vreduce_max(a)); + return bsf(movemask(any(valid_max) ? valid_max : valid)); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators (pairs of Vec3fa's) + //////////////////////////////////////////////////////////////////////////////// + + //__forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) { + // return vreduce_add4(a*b); + //} + + __forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) { + return _mm256_dp_ps(a,b,0x7F); + } + + __forceinline vfloat8 cross(const vfloat8& a, const vfloat8& b) + { + const vfloat8 a0 = a; + const vfloat8 b0 = shuffle<1,2,0,3>(b); + const vfloat8 a1 = shuffle<1,2,0,3>(a); + const vfloat8 b1 = b; + return shuffle<1,2,0,3>(msub(a0,b0,a1*b1)); + } + + //__forceinline float sqr_length (const vfloat<8>& a) { return dot(a,a); } + //__forceinline float rcp_length (const vfloat<8>& a) { return rsqrt(dot(a,a)); } + //__forceinline float rcp_length2(const vfloat<8>& a) { return rcp(dot(a,a)); } + //__forceinline float length (const vfloat<8>& a) { return sqrt(dot(a,a)); } + __forceinline vfloat<8> normalize(const vfloat<8>& a) { return a*rsqrt(dot(a,a)); } + //__forceinline float distance(const vfloat<8>& a, const vfloat<8>& b) { return length(a-b); } + //__forceinline float halfArea(const vfloat<8>& d) { return madd(d.x,(d.y+d.z),d.y*d.z); } + //__forceinline float area (const vfloat<8>& d) { return 2.0f*halfArea(d); } + //__forceinline vfloat<8> reflect(const vfloat<8>& V, const vfloat<8>& N) { return 2.0f*dot(V,N)*N-V; } + + //__forceinline vfloat<8> normalize_safe(const vfloat<8>& a) { + // const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); + //} + + //////////////////////////////////////////////////////////////////////////////// + /// In Register Sorting + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 sort_ascending(const vfloat8& v) + { + const vfloat8 a0 = v; + const vfloat8 b0 = shuffle<1,0,3,2>(a0); + const vfloat8 c0 = min(a0,b0); + const vfloat8 d0 = max(a0,b0); + const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vfloat8 b1 = shuffle<2,3,0,1>(a1); + const vfloat8 c1 = min(a1,b1); + const vfloat8 d1 = max(a1,b1); + const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vfloat8 b2 = shuffle<1,0,3,2>(a2); + const vfloat8 c2 = min(a2,b2); + const vfloat8 d2 = max(a2,b2); + const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vfloat8 b3 = shuffle4<1,0>(a3); + const vfloat8 c3 = min(a3,b3); + const vfloat8 d3 = max(a3,b3); + const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vfloat8 b4 = shuffle<2,3,0,1>(a4); + const vfloat8 c4 = min(a4,b4); + const vfloat8 d4 = max(a4,b4); + const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); + const vfloat8 b5 = shuffle<1,0,3,2>(a5); + const vfloat8 c5 = min(a5,b5); + const vfloat8 d5 = max(a5,b5); + const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + __forceinline vfloat8 sort_descending(const vfloat8& v) + { + const vfloat8 a0 = v; + const vfloat8 b0 = shuffle<1,0,3,2>(a0); + const vfloat8 c0 = max(a0,b0); + const vfloat8 d0 = min(a0,b0); + const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vfloat8 b1 = shuffle<2,3,0,1>(a1); + const vfloat8 c1 = max(a1,b1); + const vfloat8 d1 = min(a1,b1); + const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vfloat8 b2 = shuffle<1,0,3,2>(a2); + const vfloat8 c2 = max(a2,b2); + const vfloat8 d2 = min(a2,b2); + const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vfloat8 b3 = shuffle4<1,0>(a3); + const vfloat8 c3 = max(a3,b3); + const vfloat8 d3 = min(a3,b3); + const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vfloat8 b4 = shuffle<2,3,0,1>(a4); + const vfloat8 c4 = max(a4,b4); + const vfloat8 d4 = min(a4,b4); + const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); + const vfloat8 b5 = shuffle<1,0,3,2>(a5); + const vfloat8 c5 = max(a5,b5); + const vfloat8 d5 = min(a5,b5); + const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vint16_avx512.h b/thirdparty/embree-aarch64/common/simd/vint16_avx512.h new file mode 100644 index 0000000000..3249bc2b45 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vint16_avx512.h @@ -0,0 +1,490 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 16-wide AVX-512 integer type */ + template<> + struct vint<16> + { + ALIGNED_STRUCT_(64); + + typedef vboolf16 Bool; + typedef vint16 Int; + typedef vfloat16 Float; + + enum { size = 16 }; // number of SIMD elements + union { // data + __m512i v; + int i[16]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint16& t) { v = t.v; } + __forceinline vint16& operator =(const vint16& f) { v = f.v; return *this; } + + __forceinline vint(const __m512i& t) { v = t; } + __forceinline operator __m512i() const { return v; } + __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } + + __forceinline vint(int i) { + v = _mm512_set1_epi32(i); + } + + __forceinline vint(int a, int b, int c, int d) { + v = _mm512_set4_epi32(d,c,b,a); + } + + __forceinline vint(int a0 , int a1 , int a2 , int a3, + int a4 , int a5 , int a6 , int a7, + int a8 , int a9 , int a10, int a11, + int a12, int a13, int a14, int a15) + { + v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0); + } + + __forceinline vint(const vint4& i) { + v = _mm512_broadcast_i32x4(i); + } + + __forceinline vint(const vint4& a, const vint4& b, const vint4& c, const vint4& d) { + v = _mm512_castsi128_si512(a); + v = _mm512_inserti32x4(v, b, 1); + v = _mm512_inserti32x4(v, c, 2); + v = _mm512_inserti32x4(v, d, 3); + } + + __forceinline vint(const vint8& i) { + v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i)))); + } + + __forceinline vint(const vint8& a, const vint8& b) { + v = _mm512_castsi256_si512(a); + v = _mm512_inserti64x4(v, b, 1); + } + + __forceinline explicit vint(const __m512& f) { + v = _mm512_cvtps_epi32(f); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm512_setzero_epi32()) {} + __forceinline vint(OneTy) : v(_mm512_set1_epi32(1)) {} + __forceinline vint(PosInfTy) : v(_mm512_set1_epi32(pos_inf)) {} + __forceinline vint(NegInfTy) : v(_mm512_set1_epi32(neg_inf)) {} + __forceinline vint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + __forceinline vint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint16 load (const void* addr) { return _mm512_load_si512((int*)addr); } + + static __forceinline vint16 load(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_load_si128((__m128i*)ptr)); } + static __forceinline vint16 load(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_load_si256((__m256i*)ptr)); } + + static __forceinline vint16 loadu(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } + static __forceinline vint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); } + + static __forceinline vint16 loadu(const void* addr) { return _mm512_loadu_si512(addr); } + + static __forceinline vint16 load (const vboolf16& mask, const void* addr) { return _mm512_mask_load_epi32 (_mm512_setzero_epi32(),mask,addr); } + static __forceinline vint16 loadu(const vboolf16& mask, const void* addr) { return _mm512_mask_loadu_epi32(_mm512_setzero_epi32(),mask,addr); } + + static __forceinline void store (void* ptr, const vint16& v) { _mm512_store_si512 (ptr,v); } + static __forceinline void storeu(void* ptr, const vint16& v) { _mm512_storeu_si512(ptr,v); } + + static __forceinline void store (const vboolf16& mask, void* addr, const vint16& v2) { _mm512_mask_store_epi32(addr,mask,v2); } + static __forceinline void storeu(const vboolf16& mask, void* ptr, const vint16& f) { _mm512_mask_storeu_epi32((int*)ptr,mask,f); } + + static __forceinline void store_nt(void* __restrict__ ptr, const vint16& a) { _mm512_stream_si512((__m512i*)ptr,a); } + + /* pass by value to avoid compiler generating inefficient code */ + static __forceinline void storeu_compact(const vboolf16 mask, void* addr, vint16 reg) { + _mm512_mask_compressstoreu_epi32(addr,mask,reg); + } + + static __forceinline void storeu_compact_single(const vboolf16 mask, void* addr, vint16 reg) { + //_mm512_mask_compressstoreu_epi32(addr,mask,reg); + *(float*)addr = mm512_cvtss_f32(_mm512_mask_compress_ps(_mm512_castsi512_ps(reg),mask,_mm512_castsi512_ps(reg))); + } + + static __forceinline vint16 compact64bit(const vboolf16& mask, vint16 &v) { + return _mm512_mask_compress_epi64(v,mask,v); + } + + static __forceinline vint16 compact(const vboolf16& mask, vint16 &v) { + return _mm512_mask_compress_epi32(v,mask,v); + } + + static __forceinline vint16 compact(const vboolf16& mask, const vint16 &a, vint16 &b) { + return _mm512_mask_compress_epi32(a,mask,b); + } + + static __forceinline vint16 expand(const vboolf16& mask, const vint16& a, vint16& b) { + return _mm512_mask_expand_epi32(b,mask,a); + } + + template + static __forceinline vint16 gather(const int* ptr, const vint16& index) { + return _mm512_i32gather_epi32(index,ptr,scale); + } + + template + static __forceinline vint16 gather(const vboolf16& mask, const int* ptr, const vint16& index) { + return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale); + } + + template + static __forceinline vint16 gather(const vboolf16& mask, vint16& dest, const int* ptr, const vint16& index) { + return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale); + } + + template + static __forceinline void scatter(int* ptr, const vint16& index, const vint16& v) { + _mm512_i32scatter_epi32((int*)ptr,index,v,scale); + } + + template + static __forceinline void scatter(const vboolf16& mask, int* ptr, const vint16& index, const vint16& v) { + _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale); + } + + static __forceinline vint16 broadcast64bit(size_t v) { + return _mm512_set1_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int& operator [](size_t index) { assert(index < 16); return i[index]; } + __forceinline const int& operator [](size_t index) const { assert(index < 16); return i[index]; } + + __forceinline unsigned int uint (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; } + __forceinline size_t& uint64_t(size_t index) const { assert(index < 8); return ((size_t*)i)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 asBool(const vint16& a) { return _mm512_movepi32_mask(a); } + + __forceinline vint16 operator +(const vint16& a) { return a; } + __forceinline vint16 operator -(const vint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16 operator +(const vint16& a, const vint16& b) { return _mm512_add_epi32(a, b); } + __forceinline vint16 operator +(const vint16& a, int b) { return a + vint16(b); } + __forceinline vint16 operator +(int a, const vint16& b) { return vint16(a) + b; } + + __forceinline vint16 operator -(const vint16& a, const vint16& b) { return _mm512_sub_epi32(a, b); } + __forceinline vint16 operator -(const vint16& a, int b) { return a - vint16(b); } + __forceinline vint16 operator -(int a, const vint16& b) { return vint16(a) - b; } + + __forceinline vint16 operator *(const vint16& a, const vint16& b) { return _mm512_mullo_epi32(a, b); } + __forceinline vint16 operator *(const vint16& a, int b) { return a * vint16(b); } + __forceinline vint16 operator *(int a, const vint16& b) { return vint16(a) * b; } + + __forceinline vint16 operator &(const vint16& a, const vint16& b) { return _mm512_and_epi32(a, b); } + __forceinline vint16 operator &(const vint16& a, int b) { return a & vint16(b); } + __forceinline vint16 operator &(int a, const vint16& b) { return vint16(a) & b; } + + __forceinline vint16 operator |(const vint16& a, const vint16& b) { return _mm512_or_epi32(a, b); } + __forceinline vint16 operator |(const vint16& a, int b) { return a | vint16(b); } + __forceinline vint16 operator |(int a, const vint16& b) { return vint16(a) | b; } + + __forceinline vint16 operator ^(const vint16& a, const vint16& b) { return _mm512_xor_epi32(a, b); } + __forceinline vint16 operator ^(const vint16& a, int b) { return a ^ vint16(b); } + __forceinline vint16 operator ^(int a, const vint16& b) { return vint16(a) ^ b; } + + __forceinline vint16 operator <<(const vint16& a, int n) { return _mm512_slli_epi32(a, n); } + __forceinline vint16 operator >>(const vint16& a, int n) { return _mm512_srai_epi32(a, n); } + + __forceinline vint16 operator <<(const vint16& a, const vint16& n) { return _mm512_sllv_epi32(a, n); } + __forceinline vint16 operator >>(const vint16& a, const vint16& n) { return _mm512_srav_epi32(a, n); } + + __forceinline vint16 sll (const vint16& a, int b) { return _mm512_slli_epi32(a, b); } + __forceinline vint16 sra (const vint16& a, int b) { return _mm512_srai_epi32(a, b); } + __forceinline vint16 srl (const vint16& a, int b) { return _mm512_srli_epi32(a, b); } + + __forceinline vint16 min(const vint16& a, const vint16& b) { return _mm512_min_epi32(a, b); } + __forceinline vint16 min(const vint16& a, int b) { return min(a,vint16(b)); } + __forceinline vint16 min(int a, const vint16& b) { return min(vint16(a),b); } + + __forceinline vint16 max(const vint16& a, const vint16& b) { return _mm512_max_epi32(a, b); } + __forceinline vint16 max(const vint16& a, int b) { return max(a,vint16(b)); } + __forceinline vint16 max(int a, const vint16& b) { return max(vint16(a),b); } + + __forceinline vint16 umin(const vint16& a, const vint16& b) { return _mm512_min_epu32(a, b); } + __forceinline vint16 umax(const vint16& a, const vint16& b) { return _mm512_max_epu32(a, b); } + + __forceinline vint16 mask_add(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } + __forceinline vint16 mask_sub(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); } + + __forceinline vint16 mask_and(const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_and_epi32(c,m,a,b); } + __forceinline vint16 mask_or (const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_or_epi32(c,m,a,b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16& operator +=(vint16& a, const vint16& b) { return a = a + b; } + __forceinline vint16& operator +=(vint16& a, int b) { return a = a + b; } + + __forceinline vint16& operator -=(vint16& a, const vint16& b) { return a = a - b; } + __forceinline vint16& operator -=(vint16& a, int b) { return a = a - b; } + + __forceinline vint16& operator *=(vint16& a, const vint16& b) { return a = a * b; } + __forceinline vint16& operator *=(vint16& a, int b) { return a = a * b; } + + __forceinline vint16& operator &=(vint16& a, const vint16& b) { return a = a & b; } + __forceinline vint16& operator &=(vint16& a, int b) { return a = a & b; } + + __forceinline vint16& operator |=(vint16& a, const vint16& b) { return a = a | b; } + __forceinline vint16& operator |=(vint16& a, int b) { return a = a | b; } + + __forceinline vint16& operator <<=(vint16& a, int b) { return a = a << b; } + __forceinline vint16& operator >>=(vint16& a, int b) { return a = a >> b; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator ==(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 operator ==(const vint16& a, int b) { return a == vint16(b); } + __forceinline vboolf16 operator ==(int a, const vint16& b) { return vint16(a) == b; } + + __forceinline vboolf16 operator !=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 operator !=(const vint16& a, int b) { return a != vint16(b); } + __forceinline vboolf16 operator !=(int a, const vint16& b) { return vint16(a) != b; } + + __forceinline vboolf16 operator < (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 operator < (const vint16& a, int b) { return a < vint16(b); } + __forceinline vboolf16 operator < (int a, const vint16& b) { return vint16(a) < b; } + + __forceinline vboolf16 operator >=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 operator >=(const vint16& a, int b) { return a >= vint16(b); } + __forceinline vboolf16 operator >=(int a, const vint16& b) { return vint16(a) >= b; } + + __forceinline vboolf16 operator > (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 operator > (const vint16& a, int b) { return a > vint16(b); } + __forceinline vboolf16 operator > (int a, const vint16& b) { return vint16(a) > b; } + + __forceinline vboolf16 operator <=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 operator <=(const vint16& a, int b) { return a <= vint16(b); } + __forceinline vboolf16 operator <=(int a, const vint16& b) { return vint16(a) <= b; } + + __forceinline vboolf16 eq(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_le(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_gt(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + + __forceinline vboolf16 eq(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); } + + + __forceinline vint16 select(const vboolf16& m, const vint16& t, const vint16& f) { + return _mm512_mask_or_epi32(f,m,t,t); + } + + __forceinline void xchg(const vboolf16& m, vint16& a, vint16& b) { + const vint16 c = a; a = select(m,b,a); b = select(m,c,b); + } + + __forceinline vboolf16 test(const vboolf16& m, const vint16& a, const vint16& b) { + return _mm512_mask_test_epi32_mask(m,a,b); + } + + __forceinline vboolf16 test(const vint16& a, const vint16& b) { + return _mm512_test_epi32_mask(a,b); + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16 unpacklo(const vint16& a, const vint16& b) { return _mm512_unpacklo_epi32(a, b); } + __forceinline vint16 unpackhi(const vint16& a, const vint16& b) { return _mm512_unpackhi_epi32(a, b); } + + template + __forceinline vint16 shuffle(const vint16& v) { + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template + __forceinline vint16 shuffle(const vint16& v) { + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vint16 shuffle4(const vint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template + __forceinline vint16 shuffle4(const vint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vint16 align_shift_right(const vint16& a, const vint16& b) { + return _mm512_alignr_epi32(a, b, i); + }; + + __forceinline int toScalar(const vint16& v) { + return _mm_cvtsi128_si32(_mm512_castsi512_si128(v)); + } + + template __forceinline vint16 insert4(const vint16& a, const vint4& b) { return _mm512_inserti32x4(a, b, i); } + + __forceinline size_t extract64bit(const vint16& v) { + return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); + } + + template + vint extractN(const vint16& v); + + template<> __forceinline vint4 extractN<4,0>(const vint16& v) { return _mm512_castsi512_si128(v); } + template<> __forceinline vint4 extractN<4,1>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 1); } + template<> __forceinline vint4 extractN<4,2>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 2); } + template<> __forceinline vint4 extractN<4,3>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 3); } + + template<> __forceinline vint8 extractN<8,0>(const vint16& v) { return _mm512_castsi512_si256(v); } + template<> __forceinline vint8 extractN<8,1>(const vint16& v) { return _mm512_extracti32x8_epi32(v, 1); } + + template __forceinline vint4 extract4 (const vint16& v) { return _mm512_extracti32x4_epi32(v, i); } + template<> __forceinline vint4 extract4<0>(const vint16& v) { return _mm512_castsi512_si128(v); } + + template __forceinline vint8 extract8 (const vint16& v) { return _mm512_extracti32x8_epi32(v, i); } + template<> __forceinline vint8 extract8<0>(const vint16& v) { return _mm512_castsi512_si256(v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16 vreduce_min2(vint16 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vint16 vreduce_min4(vint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vint16 vreduce_min8(vint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } + __forceinline vint16 vreduce_min (vint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vint16 vreduce_max2(vint16 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vint16 vreduce_max4(vint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vint16 vreduce_max8(vint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } + __forceinline vint16 vreduce_max (vint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vint16 vreduce_and2(vint16 x) { return x & shuffle<1,0,3,2>(x); } + __forceinline vint16 vreduce_and4(vint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } + __forceinline vint16 vreduce_and8(vint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); } + __forceinline vint16 vreduce_and (vint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); } + + __forceinline vint16 vreduce_or2(vint16 x) { return x | shuffle<1,0,3,2>(x); } + __forceinline vint16 vreduce_or4(vint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } + __forceinline vint16 vreduce_or8(vint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); } + __forceinline vint16 vreduce_or (vint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); } + + __forceinline vint16 vreduce_add2(vint16 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vint16 vreduce_add4(vint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vint16 vreduce_add8(vint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } + __forceinline vint16 vreduce_add (vint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } + + __forceinline int reduce_min(const vint16& v) { return toScalar(vreduce_min(v)); } + __forceinline int reduce_max(const vint16& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_and(const vint16& v) { return toScalar(vreduce_and(v)); } + __forceinline int reduce_or (const vint16& v) { return toScalar(vreduce_or (v)); } + __forceinline int reduce_add(const vint16& v) { return toScalar(vreduce_add(v)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16 conflict(const vint16& index) + { + return _mm512_conflict_epi32(index); + } + + __forceinline vint16 conflict(const vboolf16& mask, vint16& dest, const vint16& index) + { + return _mm512_mask_conflict_epi32(dest,mask,index); + } + + __forceinline vint16 convert_uint32_t(const __m512& f) { + return _mm512_cvtps_epu32(f); + } + + __forceinline vint16 permute(vint16 v, vint16 index) { + return _mm512_permutexvar_epi32(index,v); + } + + __forceinline vint16 reverse(const vint16 &a) { + return permute(a,vint16(reverse_step)); + } + + __forceinline vint16 prefix_sum(const vint16& a) + { + const vint16 z(zero); + vint16 v = a; + v = v + align_shift_right<16-1>(v,z); + v = v + align_shift_right<16-2>(v,z); + v = v + align_shift_right<16-4>(v,z); + v = v + align_shift_right<16-8>(v,z); + return v; + } + + __forceinline vint16 reverse_prefix_sum(const vint16& a) + { + const vint16 z(zero); + vint16 v = a; + v = v + align_shift_right<1>(z,v); + v = v + align_shift_right<2>(z,v); + v = v + align_shift_right<4>(z,v); + v = v + align_shift_right<8>(z,v); + return v; + } + + /* this should use a vbool8 and a vint8_64...*/ + template + __forceinline void gather_prefetch64(const void* base_addr, const vbool16& mask, const vint16& offset) + { +#if defined(__AVX512PF__) + _mm512_mask_prefetch_i64gather_pd(offset, mask, base_addr, scale, hint); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vint16& v) + { + cout << "<" << v[0]; + for (int i=1; i<16; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vint4_sse2.h b/thirdparty/embree-aarch64/common/simd/vint4_sse2.h new file mode 100644 index 0000000000..96f105a7c5 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vint4_sse2.h @@ -0,0 +1,681 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../math/math.h" + +namespace embree +{ + /* 4-wide SSE integer type */ + template<> + struct vint<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 Bool; + typedef vint4 Int; + typedef vfloat4 Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128i v; int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint4& a) { v = a.v; } + __forceinline vint4& operator =(const vint4& a) { v = a.v; return *this; } + + __forceinline vint(__m128i a) : v(a) {} + __forceinline operator const __m128i&() const { return v; } + __forceinline operator __m128i&() { return v; } + + __forceinline vint(int a) : v(_mm_set1_epi32(a)) {} + __forceinline vint(int a, int b, int c, int d) : v(_mm_set_epi32(d, c, b, a)) {} + + __forceinline explicit vint(__m128 a) : v(_mm_cvtps_epi32(a)) {} +#if defined(__AVX512VL__) + __forceinline explicit vint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} +#else + __forceinline explicit vint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {} +#endif + + __forceinline vint(long long a, long long b) : v(_mm_set_epi64x(b,a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm_setzero_si128()) {} + __forceinline vint(OneTy) : v(_mm_set_epi32(1, 1, 1, 1)) {} + __forceinline vint(PosInfTy) : v(_mm_set_epi32(pos_inf, pos_inf, pos_inf, pos_inf)) {} + __forceinline vint(NegInfTy) : v(_mm_set_epi32(neg_inf, neg_inf, neg_inf, neg_inf)) {} + __forceinline vint(StepTy) : v(_mm_set_epi32(3, 2, 1, 0)) {} + __forceinline vint(ReverseStepTy) : v(_mm_set_epi32(0, 1, 2, 3)) {} + + __forceinline vint(TrueTy) { v = _mm_cmpeq_epi32(v,v); } + __forceinline vint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {} + + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint4 load (const void* a) { return _mm_load_si128((__m128i*)a); } + static __forceinline vint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); } + + static __forceinline void store (void* ptr, const vint4& v) { _mm_store_si128((__m128i*)ptr,v); } + static __forceinline void storeu(void* ptr, const vint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } + +#if defined(__AVX512VL__) + + static __forceinline vint4 compact(const vboolf4& mask, vint4 &v) { + return _mm_mask_compress_epi32(v, mask, v); + } + static __forceinline vint4 compact(const vboolf4& mask, vint4 &a, const vint4& b) { + return _mm_mask_compress_epi32(a, mask, b); + } + + static __forceinline vint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); } + static __forceinline vint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_store_epi32 (ptr,mask,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); } +#elif defined(__AVX__) + static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } +#else + static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); } + static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { store (ptr,select(mask,i,load (ptr))); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } +#endif + + +#if defined(__aarch64__) + static __forceinline vint4 load(const uint8_t* ptr) { + return _mm_load4epu8_epi32(((__m128i*)ptr)); + } + static __forceinline vint4 loadu(const uint8_t* ptr) { + return _mm_load4epu8_epi32(((__m128i*)ptr)); + } +#elif defined(__SSE4_1__) + static __forceinline vint4 load(const uint8_t* ptr) { + return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } + + static __forceinline vint4 loadu(const uint8_t* ptr) { + return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } +#else + + static __forceinline vint4 load(const uint8_t* ptr) { + return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); + } + + static __forceinline vint4 loadu(const uint8_t* ptr) { + return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); + } + +#endif + + static __forceinline vint4 load(const unsigned short* ptr) { +#if defined(__aarch64__) + return __m128i(vmovl_u16(vld1_u16(ptr))); +#elif defined (__SSE4_1__) + return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); +#else + return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); +#endif + } + + static __forceinline void store(uint8_t* ptr, const vint4& v) { +#if defined(__aarch64__) + int32x4_t x = v; + uint16x4_t y = vqmovn_u32(uint32x4_t(x)); + uint8x8_t z = vqmovn_u16(vcombine_u16(y, y)); + vst1_lane_u32((uint32_t *)ptr,uint32x2_t(z), 0); +#elif defined(__SSE4_1__) + __m128i x = v; + x = _mm_packus_epi32(x, x); + x = _mm_packus_epi16(x, x); + *(int*)ptr = _mm_cvtsi128_si32(x); +#else + for (size_t i=0;i<4;i++) + ptr[i] = (uint8_t)v[i]; +#endif + } + + static __forceinline void store(unsigned short* ptr, const vint4& v) { +#if defined(__aarch64__) + uint32x4_t x = uint32x4_t(v.v); + uint16x4_t y = vqmovn_u32(x); + vst1_u16(ptr, y); +#else + for (size_t i=0;i<4;i++) + ptr[i] = (unsigned short)v[i]; +#endif + } + + static __forceinline vint4 load_nt(void* ptr) { +#if defined(__aarch64__) || defined(__SSE4_1__) + return _mm_stream_load_si128((__m128i*)ptr); +#else + return _mm_load_si128((__m128i*)ptr); +#endif + } + + static __forceinline void store_nt(void* ptr, const vint4& v) { +#if !defined(__aarch64__) && defined(__SSE4_1__) + _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); +#else + _mm_store_si128((__m128i*)ptr,v); +#endif + } + + template + static __forceinline vint4 gather(const int* ptr, const vint4& index) { +#if defined(__AVX2__) && !defined(__aarch64__) + return _mm_i32gather_epi32(ptr, index, scale); +#else + return vint4( + *(int*)(((int8_t*)ptr)+scale*index[0]), + *(int*)(((int8_t*)ptr)+scale*index[1]), + *(int*)(((int8_t*)ptr)+scale*index[2]), + *(int*)(((int8_t*)ptr)+scale*index[3])); +#endif + } + + template + static __forceinline vint4 gather(const vboolf4& mask, const int* ptr, const vint4& index) { + vint4 r = zero; +#if defined(__AVX512VL__) + return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); +#elif defined(__AVX2__) && !defined(__aarch64__) + return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale); +#else + if (likely(mask[0])) r[0] = *(int*)(((int8_t*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(int*)(((int8_t*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(int*)(((int8_t*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(int*)(((int8_t*)ptr)+scale*index[3]); + return r; +#endif + } + + template + static __forceinline void scatter(void* ptr, const vint4& index, const vint4& v) + { +#if defined(__AVX512VL__) + _mm_i32scatter_epi32((int*)ptr, index, v, scale); +#else + *(int*)(((int8_t*)ptr)+scale*index[0]) = v[0]; + *(int*)(((int8_t*)ptr)+scale*index[1]) = v[1]; + *(int*)(((int8_t*)ptr)+scale*index[2]) = v[2]; + *(int*)(((int8_t*)ptr)+scale*index[3]) = v[3]; +#endif + } + + template + static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vint4& v) + { +#if defined(__AVX512VL__) + _mm_mask_i32scatter_epi32((int*)ptr, mask, index, v, scale); +#else + if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*index[0]) = v[0]; + if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*index[1]) = v[1]; + if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*index[2]) = v[2]; + if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*index[3]) = v[3]; +#endif + } + +#if defined(__x86_64__) || defined(__aarch64__) + static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator [](size_t index) const { assert(index < 4); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < 4); return i[index]; } + + friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) { +#if defined(__AVX512VL__) + return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t); +#elif defined(__aarch64__) + return _mm_castps_si128(_mm_blendv_ps((__m128)f.v,(__m128) t.v, (__m128)m.v)); +#elif defined(__SSE4_1__) + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); +#else + return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); +#endif + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 asBool(const vint4& a) { return _mm_movepi32_mask(a); } +#else + __forceinline vboolf4 asBool(const vint4& a) { return _mm_castsi128_ps(a); } +#endif + + __forceinline vint4 operator +(const vint4& a) { return a; } + __forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } +#if defined(__aarch64__) + __forceinline vint4 abs(const vint4& a) { return vabsq_s32(a.v); } +#elif defined(__SSSE3__) + __forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint4 operator +(const vint4& a, const vint4& b) { return _mm_add_epi32(a, b); } + __forceinline vint4 operator +(const vint4& a, int b) { return a + vint4(b); } + __forceinline vint4 operator +(int a, const vint4& b) { return vint4(a) + b; } + + __forceinline vint4 operator -(const vint4& a, const vint4& b) { return _mm_sub_epi32(a, b); } + __forceinline vint4 operator -(const vint4& a, int b) { return a - vint4(b); } + __forceinline vint4 operator -(int a, const vint4& b) { return vint4(a) - b; } + +#if (defined(__aarch64__)) || defined(__SSE4_1__) + __forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); } +#else + __forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } +#endif + __forceinline vint4 operator *(const vint4& a, int b) { return a * vint4(b); } + __forceinline vint4 operator *(int a, const vint4& b) { return vint4(a) * b; } + + __forceinline vint4 operator &(const vint4& a, const vint4& b) { return _mm_and_si128(a, b); } + __forceinline vint4 operator &(const vint4& a, int b) { return a & vint4(b); } + __forceinline vint4 operator &(int a, const vint4& b) { return vint4(a) & b; } + + __forceinline vint4 operator |(const vint4& a, const vint4& b) { return _mm_or_si128(a, b); } + __forceinline vint4 operator |(const vint4& a, int b) { return a | vint4(b); } + __forceinline vint4 operator |(int a, const vint4& b) { return vint4(a) | b; } + + __forceinline vint4 operator ^(const vint4& a, const vint4& b) { return _mm_xor_si128(a, b); } + __forceinline vint4 operator ^(const vint4& a, int b) { return a ^ vint4(b); } + __forceinline vint4 operator ^(int a, const vint4& b) { return vint4(a) ^ b; } + + __forceinline vint4 operator <<(const vint4& a, const int n) { return _mm_slli_epi32(a, n); } + __forceinline vint4 operator >>(const vint4& a, const int n) { return _mm_srai_epi32(a, n); } + + __forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); } + __forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); } + __forceinline vint4 srl (const vint4& a, int b) { return _mm_srli_epi32(a, b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint4& operator +=(vint4& a, const vint4& b) { return a = a + b; } + __forceinline vint4& operator +=(vint4& a, int b) { return a = a + b; } + + __forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; } + __forceinline vint4& operator -=(vint4& a, int b) { return a = a - b; } + +#if (defined(__aarch64__)) || defined(__SSE4_1__) + __forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; } + __forceinline vint4& operator *=(vint4& a, int b) { return a = a * b; } +#endif + + __forceinline vint4& operator &=(vint4& a, const vint4& b) { return a = a & b; } + __forceinline vint4& operator &=(vint4& a, int b) { return a = a & b; } + + __forceinline vint4& operator |=(vint4& a, const vint4& b) { return a = a | b; } + __forceinline vint4& operator |=(vint4& a, int b) { return a = a | b; } + + __forceinline vint4& operator <<=(vint4& a, int b) { return a = a << b; } + __forceinline vint4& operator >>=(vint4& a, int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } +#else + __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } + __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return !(a == b); } + __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmplt_epi32(a, b)); } + __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return !(a < b); } + __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epi32(a, b)); } + __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return !(a > b); } +#endif + + __forceinline vboolf4 operator ==(const vint4& a, int b) { return a == vint4(b); } + __forceinline vboolf4 operator ==(int a, const vint4& b) { return vint4(a) == b; } + + __forceinline vboolf4 operator !=(const vint4& a, int b) { return a != vint4(b); } + __forceinline vboolf4 operator !=(int a, const vint4& b) { return vint4(a) != b; } + + __forceinline vboolf4 operator < (const vint4& a, int b) { return a < vint4(b); } + __forceinline vboolf4 operator < (int a, const vint4& b) { return vint4(a) < b; } + + __forceinline vboolf4 operator >=(const vint4& a, int b) { return a >= vint4(b); } + __forceinline vboolf4 operator >=(int a, const vint4& b) { return vint4(a) >= b; } + + __forceinline vboolf4 operator > (const vint4& a, int b) { return a > vint4(b); } + __forceinline vboolf4 operator > (int a, const vint4& b) { return vint4(a) > b; } + + __forceinline vboolf4 operator <=(const vint4& a, int b) { return a <= vint4(b); } + __forceinline vboolf4 operator <=(int a, const vint4& b) { return vint4(a) <= b; } + + __forceinline vboolf4 eq(const vint4& a, const vint4& b) { return a == b; } + __forceinline vboolf4 ne(const vint4& a, const vint4& b) { return a != b; } + __forceinline vboolf4 lt(const vint4& a, const vint4& b) { return a < b; } + __forceinline vboolf4 ge(const vint4& a, const vint4& b) { return a >= b; } + __forceinline vboolf4 gt(const vint4& a, const vint4& b) { return a > b; } + __forceinline vboolf4 le(const vint4& a, const vint4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a == b); } + __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a != b); } + __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a < b); } + __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a >= b); } + __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a > b); } + __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a <= b); } +#endif + + template + __forceinline vint4 select(const vint4& t, const vint4& f) { +#if defined(__SSE4_1__) + return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); +#else + return select(vboolf4(mask), t, f); +#endif + } + + +#if defined(__aarch64__) || defined(__SSE4_1__) + __forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); } + __forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); } + + __forceinline vint4 umin(const vint4& a, const vint4& b) { return _mm_min_epu32(a, b); } + __forceinline vint4 umax(const vint4& a, const vint4& b) { return _mm_max_epu32(a, b); } + +#else + __forceinline vint4 min(const vint4& a, const vint4& b) { return select(a < b,a,b); } + __forceinline vint4 max(const vint4& a, const vint4& b) { return select(a < b,b,a); } +#endif + + __forceinline vint4 min(const vint4& a, int b) { return min(a,vint4(b)); } + __forceinline vint4 min(int a, const vint4& b) { return min(vint4(a),b); } + __forceinline vint4 max(const vint4& a, int b) { return max(a,vint4(b)); } + __forceinline vint4 max(int a, const vint4& b) { return max(vint4(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + __forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + +#if defined(__aarch64__) + template + __forceinline vint4 shuffle(const vint4& v) { + return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); + } + template + __forceinline vint4 shuffle(const vint4& a, const vint4& b) { + return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); + } +#else + template + __forceinline vint4 shuffle(const vint4& v) { + return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + template + __forceinline vint4 shuffle(const vint4& a, const vint4& b) { + return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } +#endif +#if defined(__SSE3__) + template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } + template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } + template<> __forceinline vint4 shuffle<0, 1, 0, 1>(const vint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); } +#endif + + template + __forceinline vint4 shuffle(const vint4& v) { + return shuffle(v); + } + +#if defined(__aarch64__) + template __forceinline int extract(const vint4& b); + template __forceinline vint4 insert(const vint4& a, const int b); +#elif defined(__SSE4_1__) + template __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); } + template __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); } +#else + template __forceinline int extract(const vint4& b) { return b[src&3]; } + template __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; } +#endif + +#if defined(__aarch64__) + template<> __forceinline int extract<0>(const vint4& b) { + return b.v[0]; + } + template<> __forceinline int extract<1>(const vint4& b) { + return b.v[1]; + } + template<> __forceinline int extract<2>(const vint4& b) { + return b.v[2]; + } + template<> __forceinline int extract<3>(const vint4& b) { + return b.v[3]; + } + template<> __forceinline vint4 insert<0>(const vint4& a, int b) + { + vint4 c = a; + c[0] = b; + return c; + } + template<> __forceinline vint4 insert<1>(const vint4& a, int b) + { + vint4 c = a; + c[1] = b; + return c; + } + template<> __forceinline vint4 insert<2>(const vint4& a, int b) + { + vint4 c = a; + c[2] = b; + return c; + } + template<> __forceinline vint4 insert<3>(const vint4& a, int b) + { + vint4 c = a; + c[3] = b; + return c; + } + + __forceinline int toScalar(const vint4& v) { + return v[0]; + } + + __forceinline size_t toSizeT(const vint4& v) { + uint64x2_t x = uint64x2_t(v.v); + return x[0]; + } +#else + template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); } + + __forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); } + + __forceinline size_t toSizeT(const vint4& v) { +#if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround + return toScalar(v); +#elif defined(__ARM_NEON) + // FIXME(LTE): Do we need a swap(i.e. use lane 1)? + return vgetq_lane_u64(*(reinterpret_cast(&v)), 0); +#else + return _mm_cvtsi128_si64(v); +#endif + } +#endif + +#if defined(__AVX512VL__) + + __forceinline vint4 permute(const vint4 &a, const vint4 &index) { + return _mm_castps_si128(_mm_permutevar_ps(_mm_castsi128_ps(a),index)); + } + + template + __forceinline vint4 align_shift_right(const vint4& a, const vint4& b) { + return _mm_alignr_epi32(a, b, i); + } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__aarch64__) || defined(__SSE4_1__) + +#if defined(__aarch64__) + __forceinline vint4 vreduce_min(const vint4& v) { int h = vminvq_s32(v); return vdupq_n_s32(h); } + __forceinline vint4 vreduce_max(const vint4& v) { int h = vmaxvq_s32(v); return vdupq_n_s32(h); } + __forceinline vint4 vreduce_add(const vint4& v) { int h = vaddvq_s32(v); return vdupq_n_s32(h); } + + __forceinline int reduce_min(const vint4& v) { return vminvq_s32(v); } + __forceinline int reduce_max(const vint4& v) { return vmaxvq_s32(v); } + __forceinline int reduce_add(const vint4& v) { return vaddvq_s32(v); } +#else + __forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } + __forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } + __forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } + + __forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); } + __forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); } +#endif + + __forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); } + __forceinline size_t select_max(const vint4& v) { return bsf(movemask(v == vreduce_max(v))); } + + __forceinline size_t select_min(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + __forceinline size_t select_max(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + +#else + + __forceinline int reduce_min(const vint4& v) { return min(v[0],v[1],v[2],v[3]); } + __forceinline int reduce_max(const vint4& v) { return max(v[0],v[1],v[2],v[3]); } + __forceinline int reduce_add(const vint4& v) { return v[0]+v[1]+v[2]+v[3]; } + +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Sorting networks + //////////////////////////////////////////////////////////////////////////////// + +#if (defined(__aarch64__)) || defined(__SSE4_1__) + + __forceinline vint4 usort_ascending(const vint4& v) + { + const vint4 a0 = v; + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = umin(a0,b0); + const vint4 d0 = umax(a0,b0); + const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = umin(a1,b1); + const vint4 d1 = umax(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = shuffle<0,2,1,3>(a2); + const vint4 c2 = umin(a2,b2); + const vint4 d2 = umax(a2,b2); + const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3; + } + + __forceinline vint4 usort_descending(const vint4& v) + { + const vint4 a0 = v; + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = umax(a0,b0); + const vint4 d0 = umin(a0,b0); + const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = umax(a1,b1); + const vint4 d1 = umin(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = shuffle<0,2,1,3>(a2); + const vint4 c2 = umax(a2,b2); + const vint4 d2 = umin(a2,b2); + const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3; + } + +#else + + __forceinline vint4 usort_ascending(const vint4& v) + { + const vint4 a0 = v-vint4(0x80000000); + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = min(a0,b0); + const vint4 d0 = max(a0,b0); + const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = min(a1,b1); + const vint4 d1 = max(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = shuffle<0,2,1,3>(a2); + const vint4 c2 = min(a2,b2); + const vint4 d2 = max(a2,b2); + const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3+vint4(0x80000000); + } + + __forceinline vint4 usort_descending(const vint4& v) + { + const vint4 a0 = v-vint4(0x80000000); + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = max(a0,b0); + const vint4 d0 = min(a0,b0); + const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = max(a1,b1); + const vint4 d1 = min(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = shuffle<0,2,1,3>(a2); + const vint4 c2 = max(a2,b2); + const vint4 d2 = min(a2,b2); + const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3+vint4(0x80000000); + } + +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vint4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; + } +} + diff --git a/thirdparty/embree-aarch64/common/simd/vint8_avx.h b/thirdparty/embree-aarch64/common/simd/vint8_avx.h new file mode 100644 index 0000000000..25a771284d --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vint8_avx.h @@ -0,0 +1,464 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + struct { __m128i vl,vh; }; + int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint8& a) { v = a.v; } + __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; } + + __forceinline vint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline operator __m256i&() { return v; } + + __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {} + + __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int vh) : v(_mm256_set_epi32(vh, g, f, e, d, c, b, a)) {} + + __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vint(OneTy) : v(_mm256_set_epi32(1,1,1,1,1,1,1,1)) {} + __forceinline vint(PosInfTy) : v(_mm256_set_epi32(pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf)) {} + __forceinline vint(NegInfTy) : v(_mm256_set_epi32(neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf)) {} + __forceinline vint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} + __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {} + __forceinline vint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); } + static __forceinline vint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); } + + static __forceinline vint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + static __forceinline vint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + + static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } + +#if !defined(__aarch64__) + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } +#else + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } +#endif + + static __forceinline void store_nt(void* ptr, const vint8& v) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline vint8 load(const uint8_t* ptr) { + vint4 il = vint4::load(ptr+0); + vint4 ih = vint4::load(ptr+4); + return vint8(il,ih); + } + + static __forceinline vint8 loadu(const uint8_t* ptr) { + vint4 il = vint4::loadu(ptr+0); + vint4 ih = vint4::loadu(ptr+4); + return vint8(il,ih); + } + + static __forceinline vint8 load(const unsigned short* ptr) { + vint4 il = vint4::load(ptr+0); + vint4 ih = vint4::load(ptr+4); + return vint8(il,ih); + } + + static __forceinline vint8 loadu(const unsigned short* ptr) { + vint4 il = vint4::loadu(ptr+0); + vint4 ih = vint4::loadu(ptr+4); + return vint8(il,ih); + } + + static __forceinline void store(uint8_t* ptr, const vint8& i) { + vint4 il(i.vl); + vint4 ih(i.vh); + vint4::store(ptr + 0,il); + vint4::store(ptr + 4,ih); + } + + static __forceinline void store(unsigned short* ptr, const vint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template + static __forceinline vint8 gather(const int* ptr, const vint8& index) { + return vint8( + *(int*)(((int8_t*)ptr)+scale*index[0]), + *(int*)(((int8_t*)ptr)+scale*index[1]), + *(int*)(((int8_t*)ptr)+scale*index[2]), + *(int*)(((int8_t*)ptr)+scale*index[3]), + *(int*)(((int8_t*)ptr)+scale*index[4]), + *(int*)(((int8_t*)ptr)+scale*index[5]), + *(int*)(((int8_t*)ptr)+scale*index[6]), + *(int*)(((int8_t*)ptr)+scale*index[7])); + } + + template + static __forceinline vint8 gather(const vboolf8& mask, const int* ptr, const vint8& index) { + vint8 r = zero; + if (likely(mask[0])) r[0] = *(int*)(((int8_t*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(int*)(((int8_t*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(int*)(((int8_t*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(int*)(((int8_t*)ptr)+scale*index[3]); + if (likely(mask[4])) r[4] = *(int*)(((int8_t*)ptr)+scale*index[4]); + if (likely(mask[5])) r[5] = *(int*)(((int8_t*)ptr)+scale*index[5]); + if (likely(mask[6])) r[6] = *(int*)(((int8_t*)ptr)+scale*index[6]); + if (likely(mask[7])) r[7] = *(int*)(((int8_t*)ptr)+scale*index[7]); + return r; + } + + template + static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v) + { + *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; + } + + template + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v) + { + if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; + } + + + static __forceinline vint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } + + __forceinline vint8 operator +(const vint8& a) { return a; } + __forceinline vint8 operator -(const vint8& a) { return vint8(_mm_sub_epi32(_mm_setzero_si128(), a.vl), _mm_sub_epi32(_mm_setzero_si128(), a.vh)); } + __forceinline vint8 abs (const vint8& a) { return vint8(_mm_abs_epi32(a.vl), _mm_abs_epi32(a.vh)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 operator +(const vint8& a, const vint8& b) { return vint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); } + __forceinline vint8 operator +(const vint8& a, int b) { return a + vint8(b); } + __forceinline vint8 operator +(int a, const vint8& b) { return vint8(a) + b; } + + __forceinline vint8 operator -(const vint8& a, const vint8& b) { return vint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); } + __forceinline vint8 operator -(const vint8& a, int b) { return a - vint8(b); } + __forceinline vint8 operator -(int a, const vint8& b) { return vint8(a) - b; } + + __forceinline vint8 operator *(const vint8& a, const vint8& b) { return vint8(_mm_mullo_epi32(a.vl, b.vl), _mm_mullo_epi32(a.vh, b.vh)); } + __forceinline vint8 operator *(const vint8& a, int b) { return a * vint8(b); } + __forceinline vint8 operator *(int a, const vint8& b) { return vint8(a) * b; } + + __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 operator &(const vint8& a, int b) { return a & vint8(b); } + __forceinline vint8 operator &(int a, const vint8& b) { return vint8(a) & b; } + + __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 operator |(const vint8& a, int b) { return a | vint8(b); } + __forceinline vint8 operator |(int a, const vint8& b) { return vint8(a) | b; } + + __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 operator ^(const vint8& a, int b) { return a ^ vint8(b); } + __forceinline vint8 operator ^(int a, const vint8& b) { return vint8(a) ^ b; } + + __forceinline vint8 operator <<(const vint8& a, int n) { return vint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); } + __forceinline vint8 operator >>(const vint8& a, int n) { return vint8(_mm_srai_epi32(a.vl, n), _mm_srai_epi32(a.vh, n)); } + + __forceinline vint8 sll (const vint8& a, int b) { return vint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); } + __forceinline vint8 sra (const vint8& a, int b) { return vint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); } + __forceinline vint8 srl (const vint8& a, int b) { return vint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); } + + __forceinline vint8 min(const vint8& a, const vint8& b) { return vint8(_mm_min_epi32(a.vl, b.vl), _mm_min_epi32(a.vh, b.vh)); } + __forceinline vint8 min(const vint8& a, int b) { return min(a,vint8(b)); } + __forceinline vint8 min(int a, const vint8& b) { return min(vint8(a),b); } + + __forceinline vint8 max(const vint8& a, const vint8& b) { return vint8(_mm_max_epi32(a.vl, b.vl), _mm_max_epi32(a.vh, b.vh)); } + __forceinline vint8 max(const vint8& a, int b) { return max(a,vint8(b)); } + __forceinline vint8 max(int a, const vint8& b) { return max(vint8(a),b); } + + __forceinline vint8 umin(const vint8& a, const vint8& b) { return vint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); } + __forceinline vint8 umin(const vint8& a, int b) { return umin(a,vint8(b)); } + __forceinline vint8 umin(int a, const vint8& b) { return umin(vint8(a),b); } + + __forceinline vint8 umax(const vint8& a, const vint8& b) { return vint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); } + __forceinline vint8 umax(const vint8& a, int b) { return umax(a,vint8(b)); } + __forceinline vint8 umax(int a, const vint8& b) { return umax(vint8(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8& operator +=(vint8& a, const vint8& b) { return a = a + b; } + __forceinline vint8& operator +=(vint8& a, int b) { return a = a + b; } + + __forceinline vint8& operator -=(vint8& a, const vint8& b) { return a = a - b; } + __forceinline vint8& operator -=(vint8& a, int b) { return a = a - b; } + + __forceinline vint8& operator *=(vint8& a, const vint8& b) { return a = a * b; } + __forceinline vint8& operator *=(vint8& a, int b) { return a = a * b; } + + __forceinline vint8& operator &=(vint8& a, const vint8& b) { return a = a & b; } + __forceinline vint8& operator &=(vint8& a, int b) { return a = a & b; } + + __forceinline vint8& operator |=(vint8& a, const vint8& b) { return a = a | b; } + __forceinline vint8& operator |=(vint8& a, int b) { return a = a | b; } + + __forceinline vint8& operator <<=(vint8& a, int b) { return a = a << b; } + __forceinline vint8& operator >>=(vint8& a, int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)), + _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); } + __forceinline vboolf8 operator ==(const vint8& a, int b) { return a == vint8(b); } + __forceinline vboolf8 operator ==(int a, const vint8& b) { return vint8(a) == b; } + + __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); } + __forceinline vboolf8 operator !=(const vint8& a, int b) { return a != vint8(b); } + __forceinline vboolf8 operator !=(int a, const vint8& b) { return vint8(a) != b; } + + __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epi32 (a.vl, b.vl)), + _mm_castsi128_ps(_mm_cmplt_epi32 (a.vh, b.vh))); } + __forceinline vboolf8 operator < (const vint8& a, int b) { return a < vint8(b); } + __forceinline vboolf8 operator < (int a, const vint8& b) { return vint8(a) < b; } + + __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a < b); } + __forceinline vboolf8 operator >=(const vint8& a, int b) { return a >= vint8(b); } + __forceinline vboolf8 operator >=(int a, const vint8& b) { return vint8(a) >= b; } + + __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epi32 (a.vl, b.vl)), + _mm_castsi128_ps(_mm_cmpgt_epi32 (a.vh, b.vh))); } + __forceinline vboolf8 operator > (const vint8& a, int b) { return a > vint8(b); } + __forceinline vboolf8 operator > (int a, const vint8& b) { return vint8(a) > b; } + + __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a > b); } + __forceinline vboolf8 operator <=(const vint8& a, int b) { return a <= vint8(b); } + __forceinline vboolf8 operator <=(int a, const vint8& b) { return vint8(a) <= b; } + + __forceinline vboolf8 eq(const vint8& a, const vint8& b) { return a == b; } + __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; } + __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a < b; } + __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; } + __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a > b; } + __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; } + + __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); } + __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); } + __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a < b); } + __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); } + __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a > b); } + __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); } + + __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + } + + __forceinline vint8 notand(const vboolf8& m, const vint8& f) { + return _mm256_castps_si256(_mm256_andnot_ps(m, _mm256_castsi256_ps(f))); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + + template + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template + __forceinline vint8 shuffle4(const vint8& v) { + return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vint8 shuffle4(const vint8& a, const vint8& b) { + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vint8 shuffle(const vint8& a, const vint8& b) { + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + + __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } + template __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); } + template __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); } + template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); } + + __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 vreduce_min2(const vint8& v) { return min(v,shuffle<1,0,3,2>(v)); } + __forceinline vint8 vreduce_min4(const vint8& v) { vint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vint8 vreduce_min (const vint8& v) { vint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + __forceinline vint8 vreduce_max2(const vint8& v) { return max(v,shuffle<1,0,3,2>(v)); } + __forceinline vint8 vreduce_max4(const vint8& v) { vint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vint8 vreduce_max (const vint8& v) { vint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vint8 vreduce_add2(const vint8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vint8 vreduce_add4(const vint8& v) { vint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vint8 vreduce_add (const vint8& v) { vint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + __forceinline int reduce_min(const vint8& v) { return toScalar(vreduce_min(v)); } + __forceinline int reduce_max(const vint8& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vint8& v) { return toScalar(vreduce_add(v)); } + + __forceinline size_t select_min(const vint8& v) { return bsf(movemask(v == vreduce_min(v))); } + __forceinline size_t select_max(const vint8& v) { return bsf(movemask(v == vreduce_max(v))); } + + __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + //////////////////////////////////////////////////////////////////////////////// + /// Sorting networks + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 usort_ascending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umin(a0,b0); + const vint8 d0 = umax(a0,b0); + const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umin(a1,b1); + const vint8 d1 = umax(a1,b1); + const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umin(a2,b2); + const vint8 d2 = umax(a2,b2); + const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2); + const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umin(a3,b3); + const vint8 d3 = umax(a3,b3); + const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3); + const vint8 b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umin(a4,b4); + const vint8 d4 = umax(a4,b4); + const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umin(a5,b5); + const vint8 d5 = umax(a5,b5); + const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5); + return a6; + } + + __forceinline vint8 usort_descending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umax(a0,b0); + const vint8 d0 = umin(a0,b0); + const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umax(a1,b1); + const vint8 d1 = umin(a1,b1); + const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umax(a2,b2); + const vint8 d2 = umin(a2,b2); + const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2); + const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umax(a3,b3); + const vint8 d3 = umin(a3,b3); + const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3); + const vint8 b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umax(a4,b4); + const vint8 d4 = umin(a4,b4); + const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umax(a5,b5); + const vint8 d5 = umin(a5,b5); + const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5); + return a6; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vint8_avx2.h b/thirdparty/embree-aarch64/common/simd/vint8_avx2.h new file mode 100644 index 0000000000..4937d972cf --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vint8_avx2.h @@ -0,0 +1,512 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint8& a) { v = a.v; } + __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; } + + __forceinline vint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline operator __m256i&() { return v; } + + __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + + __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} + + __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} + +#if defined(__AVX512VL__) + __forceinline explicit vint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {} +#else + __forceinline explicit vint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {} +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vint(OneTy) : v(_mm256_set1_epi32(1)) {} + __forceinline vint(PosInfTy) : v(_mm256_set1_epi32(pos_inf)) {} + __forceinline vint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {} + __forceinline vint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} + __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {} + __forceinline vint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint8 load(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vint8 loadu(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } + static __forceinline vint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } + + static __forceinline vint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); } + static __forceinline vint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); } + + static __forceinline void store (void* ptr, const vint8& v) { _mm256_store_si256((__m256i*)ptr,v); } + static __forceinline void storeu(void* ptr, const vint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); } + +#if defined(__AVX512VL__) + + static __forceinline vint8 compact(const vboolf8& mask, vint8 &v) { + return _mm256_mask_compress_epi32(v, mask, v); + } + static __forceinline vint8 compact(const vboolf8& mask, vint8 &a, const vint8& b) { + return _mm256_mask_compress_epi32(a, mask, b); + } + + static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); } + static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); } +#else + static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } +#endif + + static __forceinline vint8 load_nt(void* ptr) { + return _mm256_stream_load_si256((__m256i*)ptr); + } + + static __forceinline void store_nt(void* ptr, const vint8& v) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline void store(uint8_t* ptr, const vint8& i) + { + for (size_t j=0; j<8; j++) + ptr[j] = i[j]; + } + + static __forceinline void store(unsigned short* ptr, const vint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template + static __forceinline vint8 gather(const int *const ptr, const vint8& index) { + return _mm256_i32gather_epi32(ptr, index, scale); + } + + template + static __forceinline vint8 gather(const vboolf8& mask, const int *const ptr, const vint8& index) { + vint8 r = zero; +#if defined(__AVX512VL__) + return _mm256_mmask_i32gather_epi32(r, mask, index, ptr, scale); +#else + return _mm256_mask_i32gather_epi32(r, ptr, index, mask, scale); +#endif + } + + template + static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v) + { +#if defined(__AVX512VL__) + _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale); +#else + *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + template + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v) + { +#if defined(__AVX512VL__) + _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale); +#else + if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + static __forceinline vint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_movepi32_mask(a); } +#else + static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } +#endif + + __forceinline vint8 operator +(const vint8& a) { return a; } + __forceinline vint8 operator -(const vint8& a) { return _mm256_sub_epi32(_mm256_setzero_si256(), a); } + __forceinline vint8 abs (const vint8& a) { return _mm256_abs_epi32(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 operator +(const vint8& a, const vint8& b) { return _mm256_add_epi32(a, b); } + __forceinline vint8 operator +(const vint8& a, int b) { return a + vint8(b); } + __forceinline vint8 operator +(int a, const vint8& b) { return vint8(a) + b; } + + __forceinline vint8 operator -(const vint8& a, const vint8& b) { return _mm256_sub_epi32(a, b); } + __forceinline vint8 operator -(const vint8& a, int b) { return a - vint8(b); } + __forceinline vint8 operator -(int a, const vint8& b) { return vint8(a) - b; } + + __forceinline vint8 operator *(const vint8& a, const vint8& b) { return _mm256_mullo_epi32(a, b); } + __forceinline vint8 operator *(const vint8& a, int b) { return a * vint8(b); } + __forceinline vint8 operator *(int a, const vint8& b) { return vint8(a) * b; } + + __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_and_si256(a, b); } + __forceinline vint8 operator &(const vint8& a, int b) { return a & vint8(b); } + __forceinline vint8 operator &(int a, const vint8& b) { return vint8(a) & b; } + + __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_or_si256(a, b); } + __forceinline vint8 operator |(const vint8& a, int b) { return a | vint8(b); } + __forceinline vint8 operator |(int a, const vint8& b) { return vint8(a) | b; } + + __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_xor_si256(a, b); } + __forceinline vint8 operator ^(const vint8& a, int b) { return a ^ vint8(b); } + __forceinline vint8 operator ^(int a, const vint8& b) { return vint8(a) ^ b; } + + __forceinline vint8 operator <<(const vint8& a, int n) { return _mm256_slli_epi32(a, n); } + __forceinline vint8 operator >>(const vint8& a, int n) { return _mm256_srai_epi32(a, n); } + + __forceinline vint8 operator <<(const vint8& a, const vint8& n) { return _mm256_sllv_epi32(a, n); } + __forceinline vint8 operator >>(const vint8& a, const vint8& n) { return _mm256_srav_epi32(a, n); } + + __forceinline vint8 sll(const vint8& a, int b) { return _mm256_slli_epi32(a, b); } + __forceinline vint8 sra(const vint8& a, int b) { return _mm256_srai_epi32(a, b); } + __forceinline vint8 srl(const vint8& a, int b) { return _mm256_srli_epi32(a, b); } + + __forceinline vint8 sll(const vint8& a, const vint8& b) { return _mm256_sllv_epi32(a, b); } + __forceinline vint8 sra(const vint8& a, const vint8& b) { return _mm256_srav_epi32(a, b); } + __forceinline vint8 srl(const vint8& a, const vint8& b) { return _mm256_srlv_epi32(a, b); } + + __forceinline vint8 min(const vint8& a, const vint8& b) { return _mm256_min_epi32(a, b); } + __forceinline vint8 min(const vint8& a, int b) { return min(a,vint8(b)); } + __forceinline vint8 min(int a, const vint8& b) { return min(vint8(a),b); } + + __forceinline vint8 max(const vint8& a, const vint8& b) { return _mm256_max_epi32(a, b); } + __forceinline vint8 max(const vint8& a, int b) { return max(a,vint8(b)); } + __forceinline vint8 max(int a, const vint8& b) { return max(vint8(a),b); } + + __forceinline vint8 umin(const vint8& a, const vint8& b) { return _mm256_min_epu32(a, b); } + __forceinline vint8 umax(const vint8& a, const vint8& b) { return _mm256_max_epu32(a, b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8& operator +=(vint8& a, const vint8& b) { return a = a + b; } + __forceinline vint8& operator +=(vint8& a, int b) { return a = a + b; } + + __forceinline vint8& operator -=(vint8& a, const vint8& b) { return a = a - b; } + __forceinline vint8& operator -=(vint8& a, int b) { return a = a - b; } + + __forceinline vint8& operator *=(vint8& a, const vint8& b) { return a = a * b; } + __forceinline vint8& operator *=(vint8& a, int b) { return a = a * b; } + + __forceinline vint8& operator &=(vint8& a, const vint8& b) { return a = a & b; } + __forceinline vint8& operator &=(vint8& a, int b) { return a = a & b; } + + __forceinline vint8& operator |=(vint8& a, const vint8& b) { return a = a | b; } + __forceinline vint8& operator |=(vint8& a, int b) { return a = a | b; } + + __forceinline vint8& operator <<=(vint8& a, const int b) { return a = a << b; } + __forceinline vint8& operator >>=(vint8& a, const int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } + + static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { + return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t); + } +#else + static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); } + static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); } + static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b, a)); } + static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a < b); } + static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, b)); } + static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a > b); } + + static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + } +#endif + + template + __forceinline vint8 select(const vint8& t, const vint8& f) { + return _mm256_blend_epi32(f, t, mask); + } + + __forceinline vboolf8 operator ==(const vint8& a, int b) { return a == vint8(b); } + __forceinline vboolf8 operator ==(int a, const vint8& b) { return vint8(a) == b; } + + __forceinline vboolf8 operator !=(const vint8& a, int b) { return a != vint8(b); } + __forceinline vboolf8 operator !=(int a, const vint8& b) { return vint8(a) != b; } + + __forceinline vboolf8 operator < (const vint8& a, int b) { return a < vint8(b); } + __forceinline vboolf8 operator < (int a, const vint8& b) { return vint8(a) < b; } + + __forceinline vboolf8 operator >=(const vint8& a, int b) { return a >= vint8(b); } + __forceinline vboolf8 operator >=(int a, const vint8& b) { return vint8(a) >= b; } + + __forceinline vboolf8 operator > (const vint8& a, int b) { return a > vint8(b); } + __forceinline vboolf8 operator > (int a, const vint8& b) { return vint8(a) > b; } + + __forceinline vboolf8 operator <=(const vint8& a, int b) { return a <= vint8(b); } + __forceinline vboolf8 operator <=(int a, const vint8& b) { return vint8(a) <= b; } + + __forceinline vboolf8 eq(const vint8& a, const vint8& b) { return a == b; } + __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; } + __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a < b; } + __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; } + __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a > b; } + __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; } + +#if defined(__AVX512VL__) + static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); } + static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); } + static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a < b); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); } + static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a > b); } + static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_unpacklo_epi32(a, b); } + __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_unpackhi_epi32(a, b); } + + template + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template + __forceinline vint8 shuffle4(const vint8& v) { + return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vint8 shuffle4(const vint8& a, const vint8& b) { + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vint8 shuffle(const vint8& a, const vint8& b) { + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + + __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } + + template __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); } + template __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); } + template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); } + + __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + +#if !defined(__aarch64__) + +__forceinline vint8 permute(const vint8& v, const __m256i& index) { + return _mm256_permutevar8x32_epi32(v, index); + } + + __forceinline vint8 shuffle(const vint8& v, const __m256i& index) { + return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index)); + } + + + + template + static __forceinline vint8 align_shift_right(const vint8& a, const vint8& b) { +#if defined(__AVX512VL__) + return _mm256_alignr_epi32(a, b, i); +#else + return _mm256_alignr_epi8(a, b, 4*i); +#endif + } + +#endif + + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 vreduce_min2(const vint8& v) { return min(v,shuffle<1,0,3,2>(v)); } + __forceinline vint8 vreduce_min4(const vint8& v) { vint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vint8 vreduce_min (const vint8& v) { vint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + __forceinline vint8 vreduce_max2(const vint8& v) { return max(v,shuffle<1,0,3,2>(v)); } + __forceinline vint8 vreduce_max4(const vint8& v) { vint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vint8 vreduce_max (const vint8& v) { vint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vint8 vreduce_add2(const vint8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vint8 vreduce_add4(const vint8& v) { vint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vint8 vreduce_add (const vint8& v) { vint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + __forceinline int reduce_min(const vint8& v) { return toScalar(vreduce_min(v)); } + __forceinline int reduce_max(const vint8& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vint8& v) { return toScalar(vreduce_add(v)); } + + __forceinline size_t select_min(const vint8& v) { return bsf(movemask(v == vreduce_min(v))); } + __forceinline size_t select_max(const vint8& v) { return bsf(movemask(v == vreduce_max(v))); } + + __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + + __forceinline vint8 assign(const vint4& a) { return _mm256_castsi128_si256(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Sorting networks + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 usort_ascending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umin(a0,b0); + const vint8 d0 = umax(a0,b0); + const vint8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umin(a1,b1); + const vint8 d1 = umax(a1,b1); + const vint8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umin(a2,b2); + const vint8 d2 = umax(a2,b2); + const vint8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umin(a3,b3); + const vint8 d3 = umax(a3,b3); + const vint8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vint8 b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umin(a4,b4); + const vint8 d4 = umax(a4,b4); + const vint8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umin(a5,b5); + const vint8 d5 = umax(a5,b5); + const vint8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + __forceinline vint8 usort_descending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umax(a0,b0); + const vint8 d0 = umin(a0,b0); + const vint8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umax(a1,b1); + const vint8 d1 = umin(a1,b1); + const vint8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umax(a2,b2); + const vint8 d2 = umin(a2,b2); + const vint8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umax(a3,b3); + const vint8 d3 = umin(a3,b3); + const vint8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vint8 b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umax(a4,b4); + const vint8 d4 = umin(a4,b4); + const vint8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umax(a5,b5); + const vint8 d5 = umin(a5,b5); + const vint8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vllong4_avx2.h b/thirdparty/embree-aarch64/common/simd/vllong4_avx2.h new file mode 100644 index 0000000000..de3ebc16a7 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vllong4_avx2.h @@ -0,0 +1,358 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 4-wide AVX2 64-bit long long type */ + template<> + struct vllong<4> + { + ALIGNED_STRUCT_(32); + + typedef vboold4 Bool; + + enum { size = 4 }; // number of SIMD elements + union { // data + __m256i v; + long long i[4]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong() {} + __forceinline vllong(const vllong4& t) { v = t.v; } + __forceinline vllong4& operator =(const vllong4& f) { v = f.v; return *this; } + + __forceinline vllong(const __m256i& t) { v = t; } + __forceinline operator __m256i() const { return v; } + __forceinline operator __m256d() const { return _mm256_castsi256_pd(v); } + + + __forceinline vllong(long long i) { + v = _mm256_set1_epi64x(i); + } + + __forceinline vllong(long long a, long long b, long long c, long long d) { + v = _mm256_set_epi64x(d,c,b,a); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vllong(OneTy) : v(_mm256_set1_epi64x(1)) {} + __forceinline vllong(StepTy) : v(_mm256_set_epi64x(3,2,1,0)) {} + __forceinline vllong(ReverseStepTy) : v(_mm256_set_epi64x(0,1,2,3)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(void* __restrict__ ptr, const vllong4& a) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(a)); + } + + static __forceinline vllong4 loadu(const void* addr) + { + return _mm256_loadu_si256((__m256i*)addr); + } + + static __forceinline vllong4 load(const vllong4* addr) { + return _mm256_load_si256((__m256i*)addr); + } + + static __forceinline vllong4 load(const long long* addr) { + return _mm256_load_si256((__m256i*)addr); + } + + static __forceinline void store(void* ptr, const vllong4& v) { + _mm256_store_si256((__m256i*)ptr,v); + } + + static __forceinline void storeu(void* ptr, const vllong4& v) { + _mm256_storeu_si256((__m256i*)ptr,v); + } + + static __forceinline void storeu(const vboold4& mask, long long* ptr, const vllong4& f) { +#if defined(__AVX512VL__) + _mm256_mask_storeu_epi64(ptr,mask,f); +#else + _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f)); +#endif + } + + static __forceinline void store(const vboold4& mask, void* ptr, const vllong4& f) { +#if defined(__AVX512VL__) + _mm256_mask_store_epi64(ptr,mask,f); +#else + _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f)); +#endif + } + + static __forceinline vllong4 broadcast64bit(size_t v) { + return _mm256_set1_epi64x(v); + } + + static __forceinline size_t extract64bit(const vllong4& v) + { + return _mm_cvtsi128_si64(_mm256_castsi256_si128(v)); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline long long& operator [](size_t index) { assert(index < 4); return i[index]; } + __forceinline const long long& operator [](size_t index) const { assert(index < 4); return i[index]; } + + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong4 select(const vboold4& m, const vllong4& t, const vllong4& f) { + #if defined(__AVX512VL__) + return _mm256_mask_blend_epi64(m, f, t); + #else + return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(f), _mm256_castsi256_pd(t), m)); + #endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboold4 asBool(const vllong4& a) { return _mm256_movepi64_mask(a); } +#else + __forceinline vboold4 asBool(const vllong4& a) { return _mm256_castsi256_pd(a); } +#endif + + __forceinline vllong4 operator +(const vllong4& a) { return a; } + __forceinline vllong4 operator -(const vllong4& a) { return _mm256_sub_epi64(_mm256_setzero_si256(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong4 operator +(const vllong4& a, const vllong4& b) { return _mm256_add_epi64(a, b); } + __forceinline vllong4 operator +(const vllong4& a, long long b) { return a + vllong4(b); } + __forceinline vllong4 operator +(long long a, const vllong4& b) { return vllong4(a) + b; } + + __forceinline vllong4 operator -(const vllong4& a, const vllong4& b) { return _mm256_sub_epi64(a, b); } + __forceinline vllong4 operator -(const vllong4& a, long long b) { return a - vllong4(b); } + __forceinline vllong4 operator -(long long a, const vllong4& b) { return vllong4(a) - b; } + + /* only low 32bit part */ + __forceinline vllong4 operator *(const vllong4& a, const vllong4& b) { return _mm256_mul_epi32(a, b); } + __forceinline vllong4 operator *(const vllong4& a, long long b) { return a * vllong4(b); } + __forceinline vllong4 operator *(long long a, const vllong4& b) { return vllong4(a) * b; } + + __forceinline vllong4 operator &(const vllong4& a, const vllong4& b) { return _mm256_and_si256(a, b); } + __forceinline vllong4 operator &(const vllong4& a, long long b) { return a & vllong4(b); } + __forceinline vllong4 operator &(long long a, const vllong4& b) { return vllong4(a) & b; } + + __forceinline vllong4 operator |(const vllong4& a, const vllong4& b) { return _mm256_or_si256(a, b); } + __forceinline vllong4 operator |(const vllong4& a, long long b) { return a | vllong4(b); } + __forceinline vllong4 operator |(long long a, const vllong4& b) { return vllong4(a) | b; } + + __forceinline vllong4 operator ^(const vllong4& a, const vllong4& b) { return _mm256_xor_si256(a, b); } + __forceinline vllong4 operator ^(const vllong4& a, long long b) { return a ^ vllong4(b); } + __forceinline vllong4 operator ^(long long a, const vllong4& b) { return vllong4(a) ^ b; } + + __forceinline vllong4 operator <<(const vllong4& a, long long n) { return _mm256_slli_epi64(a, (int)n); } + //__forceinline vllong4 operator >>(const vllong4& a, long long n) { return _mm256_srai_epi64(a, n); } + + __forceinline vllong4 operator <<(const vllong4& a, const vllong4& n) { return _mm256_sllv_epi64(a, n); } + //__forceinline vllong4 operator >>(const vllong4& a, const vllong4& n) { return _mm256_srav_epi64(a, n); } + //__forceinline vllong4 sra(const vllong4& a, long long b) { return _mm256_srai_epi64(a, b); } + + __forceinline vllong4 srl(const vllong4& a, long long b) { return _mm256_srli_epi64(a, (int)b); } + + //__forceinline vllong4 min(const vllong4& a, const vllong4& b) { return _mm256_min_epi64(a, b); } + //__forceinline vllong4 min(const vllong4& a, long long b) { return min(a,vllong4(b)); } + //__forceinline vllong4 min(long long a, const vllong4& b) { return min(vllong4(a),b); } + + //__forceinline vllong4 max(const vllong4& a, const vllong4& b) { return _mm256_max_epi64(a, b); } + //__forceinline vllong4 max(const vllong4& a, long long b) { return max(a,vllong4(b)); } + //__forceinline vllong4 max(long long a, const vllong4& b) { return max(vllong4(a),b); } + +#if defined(__AVX512VL__) + __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_and_epi64(c,m,a,b); } + __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_or_epi64(c,m,a,b); } +#else + __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a & b, c); } + __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a | b, c); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong4& operator +=(vllong4& a, const vllong4& b) { return a = a + b; } + __forceinline vllong4& operator +=(vllong4& a, long long b) { return a = a + b; } + + __forceinline vllong4& operator -=(vllong4& a, const vllong4& b) { return a = a - b; } + __forceinline vllong4& operator -=(vllong4& a, long long b) { return a = a - b; } + + __forceinline vllong4& operator *=(vllong4& a, const vllong4& b) { return a = a * b; } + __forceinline vllong4& operator *=(vllong4& a, long long b) { return a = a * b; } + + __forceinline vllong4& operator &=(vllong4& a, const vllong4& b) { return a = a & b; } + __forceinline vllong4& operator &=(vllong4& a, long long b) { return a = a & b; } + + __forceinline vllong4& operator |=(vllong4& a, const vllong4& b) { return a = a | b; } + __forceinline vllong4& operator |=(vllong4& a, long long b) { return a = a | b; } + + __forceinline vllong4& operator <<=(vllong4& a, long long b) { return a = a << b; } + //__forceinline vllong4& operator >>=(vllong4& a, long long b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } +#else + __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmpeq_epi64(a,b); } + __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return !(a == b); } + __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(a,b); } + __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(b,a); } + __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return !(a < b); } + __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return !(a > b); } +#endif + + __forceinline vboold4 operator ==(const vllong4& a, long long b) { return a == vllong4(b); } + __forceinline vboold4 operator ==(long long a, const vllong4& b) { return vllong4(a) == b; } + + __forceinline vboold4 operator !=(const vllong4& a, long long b) { return a != vllong4(b); } + __forceinline vboold4 operator !=(long long a, const vllong4& b) { return vllong4(a) != b; } + + __forceinline vboold4 operator > (const vllong4& a, long long b) { return a > vllong4(b); } + __forceinline vboold4 operator > (long long a, const vllong4& b) { return vllong4(a) > b; } + + __forceinline vboold4 operator < (const vllong4& a, long long b) { return a < vllong4(b); } + __forceinline vboold4 operator < (long long a, const vllong4& b) { return vllong4(a) < b; } + + __forceinline vboold4 operator >=(const vllong4& a, long long b) { return a >= vllong4(b); } + __forceinline vboold4 operator >=(long long a, const vllong4& b) { return vllong4(a) >= b; } + + __forceinline vboold4 operator <=(const vllong4& a, long long b) { return a <= vllong4(b); } + __forceinline vboold4 operator <=(long long a, const vllong4& b) { return vllong4(a) <= b; } + + __forceinline vboold4 eq(const vllong4& a, const vllong4& b) { return a == b; } + __forceinline vboold4 ne(const vllong4& a, const vllong4& b) { return a != b; } + __forceinline vboold4 lt(const vllong4& a, const vllong4& b) { return a < b; } + __forceinline vboold4 ge(const vllong4& a, const vllong4& b) { return a >= b; } + __forceinline vboold4 gt(const vllong4& a, const vllong4& b) { return a > b; } + __forceinline vboold4 le(const vllong4& a, const vllong4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a == b); } + __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a != b); } + __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a < b); } + __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a >= b); } + __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a > b); } + __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a <= b); } +#endif + + __forceinline void xchg(const vboold4& m, vllong4& a, vllong4& b) { + const vllong4 c = a; a = select(m,b,a); b = select(m,c,b); + } + + __forceinline vboold4 test(const vllong4& a, const vllong4& b) { +#if defined(__AVX512VL__) + return _mm256_test_epi64_mask(a,b); +#else + return _mm256_testz_si256(a,b); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template + __forceinline vllong4 shuffle(const vllong4& v) { + return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(v), (i1 << 3) | (i0 << 2) | (i1 << 1) | i0)); + } + + template + __forceinline vllong4 shuffle(const vllong4& v) { + return shuffle(v); + } + + template + __forceinline vllong4 shuffle2(const vllong4& v) { + return _mm256_castpd_si256(_mm256_permute2f128_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(v), (i1 << 4) | i0)); + } + + __forceinline long long toScalar(const vllong4& v) { + return _mm_cvtsi128_si64(_mm256_castsi256_si128(v)); + } + +#if defined(__AVX512VL__) + __forceinline vllong4 permute(const vllong4& a, const __m256i& index) { + // workaround for GCC 7.x +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) + return _mm256_permutex2var_epi64(a,index,a); +#else + return _mm256_permutexvar_epi64(index,a); +#endif + } + + __forceinline vllong4 permutex2var(const vllong4& index, const vllong4& a, const vllong4& b) { + return _mm256_permutex2var_epi64(a,index,b); + } + +#endif + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + + __forceinline vllong4 vreduce_and2(const vllong4& x) { return x & shuffle<1,0>(x); } + __forceinline vllong4 vreduce_and (const vllong4& y) { const vllong4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); } + + __forceinline vllong4 vreduce_or2(const vllong4& x) { return x | shuffle<1,0>(x); } + __forceinline vllong4 vreduce_or (const vllong4& y) { const vllong4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); } + + __forceinline vllong4 vreduce_add2(const vllong4& x) { return x + shuffle<1,0>(x); } + __forceinline vllong4 vreduce_add (const vllong4& y) { const vllong4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); } + + __forceinline long long reduce_add(const vllong4& a) { return toScalar(vreduce_add(a)); } + __forceinline long long reduce_or (const vllong4& a) { return toScalar(vreduce_or(a)); } + __forceinline long long reduce_and(const vllong4& a) { return toScalar(vreduce_and(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vllong4& v) + { + cout << "<" << v[0]; + for (size_t i=1; i<4; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vllong8_avx512.h b/thirdparty/embree-aarch64/common/simd/vllong8_avx512.h new file mode 100644 index 0000000000..76dddd8991 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vllong8_avx512.h @@ -0,0 +1,381 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 8-wide AVX-512 64-bit long long type */ + template<> + struct vllong<8> + { + ALIGNED_STRUCT_(64); + + typedef vboold8 Bool; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m512i v; + long long i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong() {} + __forceinline vllong(const vllong8& t) { v = t.v; } + __forceinline vllong8& operator =(const vllong8& f) { v = f.v; return *this; } + + __forceinline vllong(const __m512i& t) { v = t; } + __forceinline operator __m512i() const { return v; } + __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } + + __forceinline vllong(long long i) { + v = _mm512_set1_epi64(i); + } + + __forceinline vllong(long long a, long long b, long long c, long long d) { + v = _mm512_set4_epi64(d,c,b,a); + } + + __forceinline vllong(long long a0, long long a1, long long a2, long long a3, + long long a4, long long a5, long long a6, long long a7) + { + v = _mm512_set_epi64(a7,a6,a5,a4,a3,a2,a1,a0); + } + + __forceinline vllong(const vllong<4>& i) { + v = _mm512_broadcast_i64x4(i); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong(ZeroTy) : v(_mm512_setzero_epi32()) {} + __forceinline vllong(OneTy) : v(_mm512_set1_epi64(1)) {} + __forceinline vllong(StepTy) : v(_mm512_set_epi64(7,6,5,4,3,2,1,0)) {} + __forceinline vllong(ReverseStepTy) : v(_mm512_setr_epi64(7,6,5,4,3,2,1,0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(void* __restrict__ ptr, const vllong8& a) { + _mm512_stream_si512((__m512i*)ptr,a); + } + + static __forceinline vllong8 loadu(const void* addr) { + return _mm512_loadu_si512(addr); + } + + static __forceinline vllong8 load(const vllong8* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vllong8 load(const long long* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vllong8 load(const uint8_t* ptr) { + return _mm512_cvtepu8_epi64(*(__m128i*)ptr); + } + + static __forceinline void store(void* ptr, const vllong8& v) { + _mm512_store_si512(ptr,v); + } + + static __forceinline void storeu(void* ptr, const vllong8& v) { + _mm512_storeu_si512(ptr,v); + } + + static __forceinline void storeu(const vboold8& mask, long long* ptr, const vllong8& f) { + _mm512_mask_storeu_epi64(ptr,mask,f); + } + + static __forceinline void store(const vboold8& mask, void* addr, const vllong8& v2) { + _mm512_mask_store_epi64(addr,mask,v2); + } + + /* pass by value to avoid compiler generating inefficient code */ + static __forceinline void storeu_compact(const vboold8 mask, void* addr, const vllong8& reg) { + _mm512_mask_compressstoreu_epi64(addr,mask,reg); + } + + static __forceinline vllong8 compact64bit(const vboold8& mask, vllong8& v) { + return _mm512_mask_compress_epi64(v,mask,v); + } + + static __forceinline vllong8 compact64bit(const vboold8& mask, vllong8& dest, const vllong8& source) { + return _mm512_mask_compress_epi64(dest,mask,source); + } + + static __forceinline vllong8 compact(const vboold8& mask, vllong8& v) { + return _mm512_mask_compress_epi64(v,mask,v); + } + + static __forceinline vllong8 compact(const vboold8& mask, const vllong8& a, vllong8& b) { + return _mm512_mask_compress_epi64(a,mask,b); + } + + static __forceinline vllong8 expand(const vboold8& mask, const vllong8& a, vllong8& b) { + return _mm512_mask_expand_epi64(b,mask,a); + } + + static __forceinline vllong8 broadcast64bit(size_t v) { + return _mm512_set1_epi64(v); + } + + static __forceinline size_t extract64bit(const vllong8& v) + { + return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline long long& operator [](size_t index) { assert(index < 8); return i[index]; } + __forceinline const long long& operator [](size_t index) const { assert(index < 8); return i[index]; } + + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 asBool(const vllong8& a) { return _mm512_movepi64_mask(a); } + + __forceinline vllong8 operator +(const vllong8& a) { return a; } + __forceinline vllong8 operator -(const vllong8& a) { return _mm512_sub_epi64(_mm512_setzero_epi32(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8 operator +(const vllong8& a, const vllong8& b) { return _mm512_add_epi64(a, b); } + __forceinline vllong8 operator +(const vllong8& a, long long b) { return a + vllong8(b); } + __forceinline vllong8 operator +(long long a, const vllong8& b) { return vllong8(a) + b; } + + __forceinline vllong8 operator -(const vllong8& a, const vllong8& b) { return _mm512_sub_epi64(a, b); } + __forceinline vllong8 operator -(const vllong8& a, long long b) { return a - vllong8(b); } + __forceinline vllong8 operator -(long long a, const vllong8& b) { return vllong8(a) - b; } + + __forceinline vllong8 operator *(const vllong8& a, const vllong8& b) { return _mm512_mullo_epi64(a, b); } + __forceinline vllong8 operator *(const vllong8& a, long long b) { return a * vllong8(b); } + __forceinline vllong8 operator *(long long a, const vllong8& b) { return vllong8(a) * b; } + + __forceinline vllong8 operator &(const vllong8& a, const vllong8& b) { return _mm512_and_epi64(a, b); } + __forceinline vllong8 operator &(const vllong8& a, long long b) { return a & vllong8(b); } + __forceinline vllong8 operator &(long long a, const vllong8& b) { return vllong8(a) & b; } + + __forceinline vllong8 operator |(const vllong8& a, const vllong8& b) { return _mm512_or_epi64(a, b); } + __forceinline vllong8 operator |(const vllong8& a, long long b) { return a | vllong8(b); } + __forceinline vllong8 operator |(long long a, const vllong8& b) { return vllong8(a) | b; } + + __forceinline vllong8 operator ^(const vllong8& a, const vllong8& b) { return _mm512_xor_epi64(a, b); } + __forceinline vllong8 operator ^(const vllong8& a, long long b) { return a ^ vllong8(b); } + __forceinline vllong8 operator ^(long long a, const vllong8& b) { return vllong8(a) ^ b; } + + __forceinline vllong8 operator <<(const vllong8& a, long long n) { return _mm512_slli_epi64(a, n); } + __forceinline vllong8 operator >>(const vllong8& a, long long n) { return _mm512_srai_epi64(a, n); } + + __forceinline vllong8 operator <<(const vllong8& a, const vllong8& n) { return _mm512_sllv_epi64(a, n); } + __forceinline vllong8 operator >>(const vllong8& a, const vllong8& n) { return _mm512_srav_epi64(a, n); } + + __forceinline vllong8 sll (const vllong8& a, long long b) { return _mm512_slli_epi64(a, b); } + __forceinline vllong8 sra (const vllong8& a, long long b) { return _mm512_srai_epi64(a, b); } + __forceinline vllong8 srl (const vllong8& a, long long b) { return _mm512_srli_epi64(a, b); } + + __forceinline vllong8 min(const vllong8& a, const vllong8& b) { return _mm512_min_epi64(a, b); } + __forceinline vllong8 min(const vllong8& a, long long b) { return min(a,vllong8(b)); } + __forceinline vllong8 min(long long a, const vllong8& b) { return min(vllong8(a),b); } + + __forceinline vllong8 max(const vllong8& a, const vllong8& b) { return _mm512_max_epi64(a, b); } + __forceinline vllong8 max(const vllong8& a, long long b) { return max(a,vllong8(b)); } + __forceinline vllong8 max(long long a, const vllong8& b) { return max(vllong8(a),b); } + + __forceinline vllong8 mask_add(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_add_epi64(c,m,a,b); } + __forceinline vllong8 mask_sub(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_sub_epi64(c,m,a,b); } + + __forceinline vllong8 mask_and(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_and_epi64(c,m,a,b); } + __forceinline vllong8 mask_or (const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_or_epi64(c,m,a,b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8& operator +=(vllong8& a, const vllong8& b) { return a = a + b; } + __forceinline vllong8& operator +=(vllong8& a, long long b) { return a = a + b; } + + __forceinline vllong8& operator -=(vllong8& a, const vllong8& b) { return a = a - b; } + __forceinline vllong8& operator -=(vllong8& a, long long b) { return a = a - b; } + + __forceinline vllong8& operator *=(vllong8& a, const vllong8& b) { return a = a * b; } + __forceinline vllong8& operator *=(vllong8& a, long long b) { return a = a * b; } + + __forceinline vllong8& operator &=(vllong8& a, const vllong8& b) { return a = a & b; } + __forceinline vllong8& operator &=(vllong8& a, long long b) { return a = a & b; } + + __forceinline vllong8& operator |=(vllong8& a, const vllong8& b) { return a = a | b; } + __forceinline vllong8& operator |=(vllong8& a, long long b) { return a = a | b; } + + __forceinline vllong8& operator <<=(vllong8& a, long long b) { return a = a << b; } + __forceinline vllong8& operator >>=(vllong8& a, long long b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator ==(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 operator ==(const vllong8& a, long long b) { return a == vllong8(b); } + __forceinline vboold8 operator ==(long long a, const vllong8& b) { return vllong8(a) == b; } + + __forceinline vboold8 operator !=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold8 operator !=(const vllong8& a, long long b) { return a != vllong8(b); } + __forceinline vboold8 operator !=(long long a, const vllong8& b) { return vllong8(a) != b; } + + __forceinline vboold8 operator < (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold8 operator < (const vllong8& a, long long b) { return a < vllong8(b); } + __forceinline vboold8 operator < (long long a, const vllong8& b) { return vllong8(a) < b; } + + __forceinline vboold8 operator >=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold8 operator >=(const vllong8& a, long long b) { return a >= vllong8(b); } + __forceinline vboold8 operator >=(long long a, const vllong8& b) { return vllong8(a) >= b; } + + __forceinline vboold8 operator > (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold8 operator > (const vllong8& a, long long b) { return a > vllong8(b); } + __forceinline vboold8 operator > (long long a, const vllong8& b) { return vllong8(a) > b; } + + __forceinline vboold8 operator <=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboold8 operator <=(const vllong8& a, long long b) { return a <= vllong8(b); } + __forceinline vboold8 operator <=(long long a, const vllong8& b) { return vllong8(a) <= b; } + + __forceinline vboold8 eq(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 ne(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold8 lt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold8 ge(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold8 gt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold8 le(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vboold8 eq(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 ne(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboold8 lt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboold8 ge(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboold8 gt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboold8 le(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LE); } + + __forceinline vllong8 select(const vboold8& m, const vllong8& t, const vllong8& f) { + return _mm512_mask_or_epi64(f,m,t,t); + } + + __forceinline void xchg(const vboold8& m, vllong8& a, vllong8& b) { + const vllong8 c = a; a = select(m,b,a); b = select(m,c,b); + } + + __forceinline vboold8 test(const vboold8& m, const vllong8& a, const vllong8& b) { + return _mm512_mask_test_epi64_mask(m,a,b); + } + + __forceinline vboold8 test(const vllong8& a, const vllong8& b) { + return _mm512_test_epi64_mask(a,b); + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template + __forceinline vllong8 shuffle(const vllong8& v) { + return _mm512_castpd_si512(_mm512_permute_pd(_mm512_castsi512_pd(v), (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0)); + } + + template + __forceinline vllong8 shuffle(const vllong8& v) { + return shuffle(v); + } + + template + __forceinline vllong8 shuffle(const vllong8& v) { + return _mm512_permutex_epi64(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template + __forceinline vllong8 shuffle4(const vllong8& v) { + return _mm512_shuffle_i64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2)); + } + + template + __forceinline vllong8 shuffle4(const vllong8& v) { + return shuffle4(v); + } + + template + __forceinline vllong8 align_shift_right(const vllong8& a, const vllong8& b) { + return _mm512_alignr_epi64(a, b, i); + }; + + __forceinline long long toScalar(const vllong8& v) { + return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); + } + + __forceinline vllong8 zeroExtend32Bit(const __m512i& a) { + return _mm512_cvtepu32_epi64(_mm512_castsi512_si256(a)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8 vreduce_min2(vllong8 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vllong8 vreduce_min4(vllong8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vllong8 vreduce_min (vllong8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); } + + __forceinline vllong8 vreduce_max2(vllong8 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vllong8 vreduce_max4(vllong8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vllong8 vreduce_max (vllong8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); } + + __forceinline vllong8 vreduce_and2(vllong8 x) { return x & shuffle<1,0,3,2>(x); } + __forceinline vllong8 vreduce_and4(vllong8 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } + __forceinline vllong8 vreduce_and (vllong8 x) { x = vreduce_and4(x); return x & shuffle4<1,0>(x); } + + __forceinline vllong8 vreduce_or2(vllong8 x) { return x | shuffle<1,0,3,2>(x); } + __forceinline vllong8 vreduce_or4(vllong8 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } + __forceinline vllong8 vreduce_or (vllong8 x) { x = vreduce_or4(x); return x | shuffle4<1,0>(x); } + + __forceinline vllong8 vreduce_add2(vllong8 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vllong8 vreduce_add4(vllong8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vllong8 vreduce_add (vllong8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); } + + __forceinline long long reduce_min(const vllong8& v) { return toScalar(vreduce_min(v)); } + __forceinline long long reduce_max(const vllong8& v) { return toScalar(vreduce_max(v)); } + __forceinline long long reduce_and(const vllong8& v) { return toScalar(vreduce_and(v)); } + __forceinline long long reduce_or (const vllong8& v) { return toScalar(vreduce_or (v)); } + __forceinline long long reduce_add(const vllong8& v) { return toScalar(vreduce_add(v)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8 permute(const vllong8& v, const vllong8& index) { + return _mm512_permutexvar_epi64(index,v); + } + + __forceinline vllong8 reverse(const vllong8& a) { + return permute(a,vllong8(reverse_step)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vllong8& v) + { + cout << "<" << v[0]; + for (size_t i=1; i<8; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vuint16_avx512.h b/thirdparty/embree-aarch64/common/simd/vuint16_avx512.h new file mode 100644 index 0000000000..39752611bb --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vuint16_avx512.h @@ -0,0 +1,443 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 16-wide AVX-512 unsigned integer type */ + template<> + struct vuint<16> + { + ALIGNED_STRUCT_(64); + + typedef vboolf16 Bool; + typedef vuint16 UInt; + typedef vfloat16 Float; + + enum { size = 16 }; // number of SIMD elements + union { // data + __m512i v; + unsigned int i[16]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint16& t) { v = t.v; } + __forceinline vuint16& operator =(const vuint16& f) { v = f.v; return *this; } + + __forceinline vuint(const __m512i& t) { v = t; } + __forceinline operator __m512i() const { return v; } + __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } + + __forceinline vuint(unsigned int i) { + v = _mm512_set1_epi32(i); + } + + __forceinline vuint(const vuint4& i) { + v = _mm512_broadcast_i32x4(i); + } + + __forceinline vuint(const vuint8& i) { + v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i)))); + } + + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) { + v = _mm512_set4_epi32(d,c,b,a); + } + + __forceinline vuint(unsigned int a0 , unsigned int a1 , unsigned int a2 , unsigned int a3, + unsigned int a4 , unsigned int a5 , unsigned int a6 , unsigned int a7, + unsigned int a8 , unsigned int a9 , unsigned int a10, unsigned int a11, + unsigned int a12, unsigned int a13, unsigned int a14, unsigned int a15) + { + v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0); + } + + __forceinline explicit vuint(const __m512& f) { + v = _mm512_cvtps_epu32(f); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm512_setzero_epi32()) {} + __forceinline vuint(OneTy) : v(_mm512_set1_epi32(1)) {} + __forceinline vuint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + __forceinline vuint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(void* __restrict__ ptr, const vuint16& a) { + _mm512_stream_si512((__m512i*)ptr,a); + } + + static __forceinline vuint16 loadu(const void* addr) + { + return _mm512_loadu_si512(addr); + } + + static __forceinline vuint16 loadu(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } + static __forceinline vuint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); } + + static __forceinline vuint16 load(const vuint16* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vuint16 load(const unsigned int* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vuint16 load(unsigned short* ptr) { return _mm512_cvtepu16_epi32(*(__m256i*)ptr); } + + + static __forceinline void store(void* ptr, const vuint16& v) { + _mm512_store_si512(ptr,v); + } + + static __forceinline void storeu(void* ptr, const vuint16& v) { + _mm512_storeu_si512(ptr,v); + } + + static __forceinline void storeu(const vboolf16& mask, void* ptr, const vuint16& f) { + _mm512_mask_storeu_epi32(ptr,mask,f); + } + + static __forceinline void store(const vboolf16& mask, void* addr, const vuint16& v2) { + _mm512_mask_store_epi32(addr,mask,v2); + } + + /* pass by value to avoid compiler generating inefficient code */ + static __forceinline void storeu_compact(const vboolf16 mask, void* addr, const vuint16 reg) { + _mm512_mask_compressstoreu_epi32(addr,mask,reg); + } + + static __forceinline void storeu_compact_single(const vboolf16 mask, void* addr, vuint16 reg) { + //_mm512_mask_compressstoreu_epi32(addr,mask,reg); + *(float*)addr = mm512_cvtss_f32(_mm512_mask_compress_ps(_mm512_castsi512_ps(reg),mask,_mm512_castsi512_ps(reg))); + } + + static __forceinline vuint16 compact64bit(const vboolf16& mask, vuint16& v) { + return _mm512_mask_compress_epi64(v,mask,v); + } + + static __forceinline vuint16 compact(const vboolf16& mask, vuint16& v) { + return _mm512_mask_compress_epi32(v,mask,v); + } + + static __forceinline vuint16 compact(const vboolf16& mask, const vuint16& a, vuint16& b) { + return _mm512_mask_compress_epi32(a,mask,b); + } + + static __forceinline vuint16 expand(const vboolf16& mask, const vuint16& a, vuint16& b) { + return _mm512_mask_expand_epi32(b,mask,a); + } + + template + static __forceinline vuint16 gather(const unsigned int* ptr, const vint16& index) { + return _mm512_i32gather_epi32(index,ptr,scale); + } + + template + static __forceinline vuint16 gather(const vboolf16& mask, const unsigned int* ptr, const vint16& index) { + return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale); + } + + template + static __forceinline vuint16 gather(const vboolf16& mask, vuint16& dest, const unsigned int* ptr, const vint16& index) { + return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale); + } + + template + static __forceinline void scatter(unsigned int* ptr, const vint16& index, const vuint16& v) { + _mm512_i32scatter_epi32((int*)ptr,index,v,scale); + } + + template + static __forceinline void scatter(const vboolf16& mask, unsigned int* ptr, const vint16& index, const vuint16& v) { + _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale); + } + + static __forceinline vuint16 broadcast64bit(size_t v) { + return _mm512_set1_epi64(v); + } + + static __forceinline size_t extract64bit(const vuint16& v) + { + return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int& operator [](size_t index) { assert(index < 16); return i[index]; } + __forceinline const unsigned int& operator [](size_t index) const { assert(index < 16); return i[index]; } + + __forceinline unsigned int uint (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; } + __forceinline size_t& uint64_t(size_t index) const { assert(index < 8); return ((size_t*)i)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 asBool(const vuint16& a) { return _mm512_movepi32_mask(a); } + + __forceinline vuint16 operator +(const vuint16& a) { return a; } + __forceinline vuint16 operator -(const vuint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16 operator +(const vuint16& a, const vuint16& b) { return _mm512_add_epi32(a, b); } + __forceinline vuint16 operator +(const vuint16& a, unsigned int b) { return a + vuint16(b); } + __forceinline vuint16 operator +(unsigned int a, const vuint16& b) { return vuint16(a) + b; } + + __forceinline vuint16 operator -(const vuint16& a, const vuint16& b) { return _mm512_sub_epi32(a, b); } + __forceinline vuint16 operator -(const vuint16& a, unsigned int b) { return a - vuint16(b); } + __forceinline vuint16 operator -(unsigned int a, const vuint16& b) { return vuint16(a) - b; } + + __forceinline vuint16 operator *(const vuint16& a, const vuint16& b) { return _mm512_mul_epu32(a, b); } + __forceinline vuint16 operator *(const vuint16& a, unsigned int b) { return a * vuint16(b); } + __forceinline vuint16 operator *(unsigned int a, const vuint16& b) { return vuint16(a) * b; } + + __forceinline vuint16 operator &(const vuint16& a, const vuint16& b) { return _mm512_and_epi32(a, b); } + __forceinline vuint16 operator &(const vuint16& a, unsigned int b) { return a & vuint16(b); } + __forceinline vuint16 operator &(unsigned int a, const vuint16& b) { return vuint16(a) & b; } + + __forceinline vuint16 operator |(const vuint16& a, const vuint16& b) { return _mm512_or_epi32(a, b); } + __forceinline vuint16 operator |(const vuint16& a, unsigned int b) { return a | vuint16(b); } + __forceinline vuint16 operator |(unsigned int a, const vuint16& b) { return vuint16(a) | b; } + + __forceinline vuint16 operator ^(const vuint16& a, const vuint16& b) { return _mm512_xor_epi32(a, b); } + __forceinline vuint16 operator ^(const vuint16& a, unsigned int b) { return a ^ vuint16(b); } + __forceinline vuint16 operator ^(unsigned int a, const vuint16& b) { return vuint16(a) ^ b; } + + __forceinline vuint16 operator <<(const vuint16& a, unsigned int n) { return _mm512_slli_epi32(a, n); } + __forceinline vuint16 operator >>(const vuint16& a, unsigned int n) { return _mm512_srli_epi32(a, n); } + + __forceinline vuint16 operator <<(const vuint16& a, const vuint16& n) { return _mm512_sllv_epi32(a, n); } + __forceinline vuint16 operator >>(const vuint16& a, const vuint16& n) { return _mm512_srlv_epi32(a, n); } + + __forceinline vuint16 sll (const vuint16& a, unsigned int b) { return _mm512_slli_epi32(a, b); } + __forceinline vuint16 sra (const vuint16& a, unsigned int b) { return _mm512_srai_epi32(a, b); } + __forceinline vuint16 srl (const vuint16& a, unsigned int b) { return _mm512_srli_epi32(a, b); } + + __forceinline vuint16 min(const vuint16& a, const vuint16& b) { return _mm512_min_epu32(a, b); } + __forceinline vuint16 min(const vuint16& a, unsigned int b) { return min(a,vuint16(b)); } + __forceinline vuint16 min(unsigned int a, const vuint16& b) { return min(vuint16(a),b); } + + __forceinline vuint16 max(const vuint16& a, const vuint16& b) { return _mm512_max_epu32(a, b); } + __forceinline vuint16 max(const vuint16& a, unsigned int b) { return max(a,vuint16(b)); } + __forceinline vuint16 max(unsigned int a, const vuint16& b) { return max(vuint16(a),b); } + + __forceinline vuint16 mask_add(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } + __forceinline vuint16 mask_sub(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); } + + __forceinline vuint16 mask_and(const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_and_epi32(c,m,a,b); } + __forceinline vuint16 mask_or (const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_or_epi32(c,m,a,b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16& operator +=(vuint16& a, const vuint16& b) { return a = a + b; } + __forceinline vuint16& operator +=(vuint16& a, unsigned int b) { return a = a + b; } + + __forceinline vuint16& operator -=(vuint16& a, const vuint16& b) { return a = a - b; } + __forceinline vuint16& operator -=(vuint16& a, unsigned int b) { return a = a - b; } + + __forceinline vuint16& operator *=(vuint16& a, const vuint16& b) { return a = a * b; } + __forceinline vuint16& operator *=(vuint16& a, unsigned int b) { return a = a * b; } + + __forceinline vuint16& operator &=(vuint16& a, const vuint16& b) { return a = a & b; } + __forceinline vuint16& operator &=(vuint16& a, unsigned int b) { return a = a & b; } + + __forceinline vuint16& operator |=(vuint16& a, const vuint16& b) { return a = a | b; } + __forceinline vuint16& operator |=(vuint16& a, unsigned int b) { return a = a | b; } + + __forceinline vuint16& operator <<=(vuint16& a, unsigned int b) { return a = a << b; } + __forceinline vuint16& operator >>=(vuint16& a, unsigned int b) { return a = a >> b; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator ==(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 operator ==(const vuint16& a, unsigned int b) { return a == vuint16(b); } + __forceinline vboolf16 operator ==(unsigned int a, const vuint16& b) { return vuint16(a) == b; } + + __forceinline vboolf16 operator !=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 operator !=(const vuint16& a, unsigned int b) { return a != vuint16(b); } + __forceinline vboolf16 operator !=(unsigned int a, const vuint16& b) { return vuint16(a) != b; } + + __forceinline vboolf16 operator < (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 operator < (const vuint16& a, unsigned int b) { return a < vuint16(b); } + __forceinline vboolf16 operator < (unsigned int a, const vuint16& b) { return vuint16(a) < b; } + + __forceinline vboolf16 operator >=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 operator >=(const vuint16& a, unsigned int b) { return a >= vuint16(b); } + __forceinline vboolf16 operator >=(unsigned int a, const vuint16& b) { return vuint16(a) >= b; } + + __forceinline vboolf16 operator > (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 operator > (const vuint16& a, unsigned int b) { return a > vuint16(b); } + __forceinline vboolf16 operator > (unsigned int a, const vuint16& b) { return vuint16(a) > b; } + + __forceinline vboolf16 operator <=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 operator <=(const vuint16& a, unsigned int b) { return a <= vuint16(b); } + __forceinline vboolf16 operator <=(unsigned int a, const vuint16& b) { return vuint16(a) <= b; } + + __forceinline vboolf16 eq(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vboolf16 eq(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); } + + + __forceinline vuint16 select(const vboolf16& m, const vuint16& t, const vuint16& f) { + return _mm512_mask_or_epi32(f,m,t,t); + } + + __forceinline void xchg(const vboolf16& m, vuint16& a, vuint16& b) { + const vuint16 c = a; a = select(m,b,a); b = select(m,c,b); + } + + __forceinline vboolf16 test(const vboolf16& m, const vuint16& a, const vuint16& b) { + return _mm512_mask_test_epi32_mask(m,a,b); + } + + __forceinline vboolf16 test(const vuint16& a, const vuint16& b) { + return _mm512_test_epi32_mask(a,b); + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template + __forceinline vuint16 shuffle(const vuint16& v) { + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template + __forceinline vuint16 shuffle(const vuint16& v) { + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vuint16 shuffle4(const vuint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v) ,_MM_SHUFFLE(i, i, i, i))); + } + + template + __forceinline vuint16 shuffle4(const vuint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vuint16 align_shift_right(const vuint16& a, const vuint16& b) { + return _mm512_alignr_epi32(a, b, i); + }; + + __forceinline unsigned int toScalar(const vuint16& v) { + return _mm_cvtsi128_si32(_mm512_castsi512_si128(v)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16 vreduce_min2(vuint16 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_min4(vuint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vuint16 vreduce_min8(vuint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_min (vuint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vuint16 vreduce_max2(vuint16 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_max4(vuint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vuint16 vreduce_max8(vuint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_max (vuint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vuint16 vreduce_and2(vuint16 x) { return x & shuffle<1,0,3,2>(x); } + __forceinline vuint16 vreduce_and4(vuint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } + __forceinline vuint16 vreduce_and8(vuint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); } + __forceinline vuint16 vreduce_and (vuint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); } + + __forceinline vuint16 vreduce_or2(vuint16 x) { return x | shuffle<1,0,3,2>(x); } + __forceinline vuint16 vreduce_or4(vuint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } + __forceinline vuint16 vreduce_or8(vuint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); } + __forceinline vuint16 vreduce_or (vuint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); } + + __forceinline vuint16 vreduce_add2(vuint16 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vuint16 vreduce_add4(vuint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vuint16 vreduce_add8(vuint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } + __forceinline vuint16 vreduce_add (vuint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } + + __forceinline unsigned int reduce_min(const vuint16& v) { return toScalar(vreduce_min(v)); } + __forceinline unsigned int reduce_max(const vuint16& v) { return toScalar(vreduce_max(v)); } + __forceinline unsigned int reduce_and(const vuint16& v) { return toScalar(vreduce_and(v)); } + __forceinline unsigned int reduce_or (const vuint16& v) { return toScalar(vreduce_or (v)); } + __forceinline unsigned int reduce_add(const vuint16& v) { return toScalar(vreduce_add(v)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16 permute(vuint16 v, vuint16 index) { + return _mm512_permutexvar_epi32(index,v); + } + + __forceinline vuint16 reverse(const vuint16& a) { + return permute(a,vuint16(reverse_step)); + } + + __forceinline vuint16 prefix_sum(const vuint16& a) + { + const vuint16 z(zero); + vuint16 v = a; + v = v + align_shift_right<16-1>(v,z); + v = v + align_shift_right<16-2>(v,z); + v = v + align_shift_right<16-4>(v,z); + v = v + align_shift_right<16-8>(v,z); + return v; + } + + __forceinline vuint16 reverse_prefix_sum(const vuint16& a) + { + const vuint16 z(zero); + vuint16 v = a; + v = v + align_shift_right<1>(z,v); + v = v + align_shift_right<2>(z,v); + v = v + align_shift_right<4>(z,v); + v = v + align_shift_right<8>(z,v); + return v; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vuint16& v) + { + cout << "<" << v[0]; + for (int i=1; i<16; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vuint4_sse2.h b/thirdparty/embree-aarch64/common/simd/vuint4_sse2.h new file mode 100644 index 0000000000..a3f393ebf2 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vuint4_sse2.h @@ -0,0 +1,499 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../math/math.h" + +namespace embree +{ + /* 4-wide SSE integer type */ + template<> + struct vuint<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 Bool; + typedef vuint4 Int; + typedef vfloat4 Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128i v; unsigned int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint4& a) { v = a.v; } + __forceinline vuint4& operator =(const vuint4& a) { v = a.v; return *this; } + + __forceinline vuint(const __m128i a) : v(a) {} + __forceinline operator const __m128i&() const { return v; } + __forceinline operator __m128i&() { return v; } + + + __forceinline vuint(unsigned int a) : v(_mm_set1_epi32(a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm_set_epi32(d, c, b, a)) {} + +#if defined(__AVX512VL__) + __forceinline explicit vuint(__m128 a) : v(_mm_cvtps_epu32(a)) {} +#endif + +#if defined(__AVX512VL__) + __forceinline explicit vuint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} +#else + __forceinline explicit vuint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {} +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm_setzero_si128()) {} + __forceinline vuint(OneTy) : v(_mm_set1_epi32(1)) {} + __forceinline vuint(PosInfTy) : v(_mm_set1_epi32(unsigned(pos_inf))) {} + __forceinline vuint(StepTy) : v(_mm_set_epi32(3, 2, 1, 0)) {} + __forceinline vuint(TrueTy) { v = _mm_cmpeq_epi32(v,v); } + __forceinline vuint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vuint4 load (const void* a) { return _mm_load_si128((__m128i*)a); } + static __forceinline vuint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); } + + static __forceinline void store (void* ptr, const vuint4& v) { _mm_store_si128((__m128i*)ptr,v); } + static __forceinline void storeu(void* ptr, const vuint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } + +#if defined(__AVX512VL__) + static __forceinline vuint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); } + static __forceinline vuint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_store_epi32 (ptr,mask,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); } +#elif defined(__AVX__) + static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } +#else + static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); } + static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { store (ptr,select(mask,i,load (ptr))); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } +#endif + +#if defined(__aarch64__) + static __forceinline vuint4 load(const uint8_t* ptr) { + return _mm_load4epu8_epi32(((__m128i*)ptr)); + } + static __forceinline vuint4 loadu(const uint8_t* ptr) { + return _mm_load4epu8_epi32(((__m128i*)ptr)); + } +#elif defined(__SSE4_1__) + static __forceinline vuint4 load(const uint8_t* ptr) { + return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } + + static __forceinline vuint4 loadu(const uint8_t* ptr) { + return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } + +#endif + + static __forceinline vuint4 load(const unsigned short* ptr) { +#if defined(__aarch64__) + return _mm_load4epu16_epi32(((__m128i*)ptr)); +#elif defined (__SSE4_1__) + return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); +#else + return vuint4(ptr[0],ptr[1],ptr[2],ptr[3]); +#endif + } + + static __forceinline void store_uint8(uint8_t* ptr, const vuint4& v) { +#if defined(__aarch64__) + uint32x4_t x = uint32x4_t(v.v); + uint16x4_t y = vqmovn_u32(x); + uint8x8_t z = vqmovn_u16(vcombine_u16(y, y)); + vst1_lane_u32((uint32_t *)ptr, uint32x2_t(z), 0); +#elif defined(__SSE4_1__) + __m128i x = v; + x = _mm_packus_epi32(x, x); + x = _mm_packus_epi16(x, x); + *(unsigned*)ptr = _mm_cvtsi128_si32(x); +#else + for (size_t i=0;i<4;i++) + ptr[i] = (uint8_t)v[i]; +#endif + } + + static __forceinline void store_uint8(unsigned short* ptr, const vuint4& v) { +#if defined(__aarch64__) + uint32x4_t x = (uint32x4_t)v.v; + uint16x4_t y = vqmovn_u32(x); + vst1_u16(ptr, y); +#else + for (size_t i=0;i<4;i++) + ptr[i] = (unsigned short)v[i]; +#endif + } + + static __forceinline vuint4 load_nt(void* ptr) { +#if (defined(__aarch64__)) || defined(__SSE4_1__) + return _mm_stream_load_si128((__m128i*)ptr); +#else + return _mm_load_si128((__m128i*)ptr); +#endif + } + + static __forceinline void store_nt(void* ptr, const vuint4& v) { +#if !defined(__aarch64__) && defined(__SSE4_1__) + _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); +#else + _mm_store_si128((__m128i*)ptr,v); +#endif + } + + template + static __forceinline vuint4 gather(const unsigned int* ptr, const vint4& index) { +#if defined(__AVX2__) && !defined(__aarch64__) + return _mm_i32gather_epi32((const int*)ptr, index, scale); +#else + return vuint4( + *(unsigned int*)(((int8_t*)ptr)+scale*index[0]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[1]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[2]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[3])); +#endif + } + + template + static __forceinline vuint4 gather(const vboolf4& mask, const unsigned int* ptr, const vint4& index) { + vuint4 r = zero; +#if defined(__AVX512VL__) + return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); +#elif defined(__AVX2__) && !defined(__aarch64__) + return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale); +#else + if (likely(mask[0])) r[0] = *(unsigned int*)(((int8_t*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(unsigned int*)(((int8_t*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(unsigned int*)(((int8_t*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(unsigned int*)(((int8_t*)ptr)+scale*index[3]); + return r; +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const unsigned int& operator [](size_t index) const { assert(index < 4); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < 4); return i[index]; } + + friend __forceinline vuint4 select(const vboolf4& m, const vuint4& t, const vuint4& f) { +#if defined(__AVX512VL__) + return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t); +#elif defined(__SSE4_1__) + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); +#else + return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); +#endif + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 asBool(const vuint4& a) { return _mm_movepi32_mask(a); } +#else + __forceinline vboolf4 asBool(const vuint4& a) { return _mm_castsi128_ps(a); } +#endif + + __forceinline vuint4 operator +(const vuint4& a) { return a; } + __forceinline vuint4 operator -(const vuint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint4 operator +(const vuint4& a, const vuint4& b) { return _mm_add_epi32(a, b); } + __forceinline vuint4 operator +(const vuint4& a, unsigned int b) { return a + vuint4(b); } + __forceinline vuint4 operator +(unsigned int a, const vuint4& b) { return vuint4(a) + b; } + + __forceinline vuint4 operator -(const vuint4& a, const vuint4& b) { return _mm_sub_epi32(a, b); } + __forceinline vuint4 operator -(const vuint4& a, unsigned int b) { return a - vuint4(b); } + __forceinline vuint4 operator -(unsigned int a, const vuint4& b) { return vuint4(a) - b; } + +//#if defined(__SSE4_1__) +// __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return _mm_mullo_epu32(a, b); } +//#else +// __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return vuint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } +//#endif +// __forceinline vuint4 operator *(const vuint4& a, unsigned int b) { return a * vuint4(b); } +// __forceinline vuint4 operator *(unsigned int a, const vuint4& b) { return vuint4(a) * b; } + + __forceinline vuint4 operator &(const vuint4& a, const vuint4& b) { return _mm_and_si128(a, b); } + __forceinline vuint4 operator &(const vuint4& a, unsigned int b) { return a & vuint4(b); } + __forceinline vuint4 operator &(unsigned int a, const vuint4& b) { return vuint4(a) & b; } + + __forceinline vuint4 operator |(const vuint4& a, const vuint4& b) { return _mm_or_si128(a, b); } + __forceinline vuint4 operator |(const vuint4& a, unsigned int b) { return a | vuint4(b); } + __forceinline vuint4 operator |(unsigned int a, const vuint4& b) { return vuint4(a) | b; } + + __forceinline vuint4 operator ^(const vuint4& a, const vuint4& b) { return _mm_xor_si128(a, b); } + __forceinline vuint4 operator ^(const vuint4& a, unsigned int b) { return a ^ vuint4(b); } + __forceinline vuint4 operator ^(unsigned int a, const vuint4& b) { return vuint4(a) ^ b; } + + __forceinline vuint4 operator <<(const vuint4& a, unsigned int n) { return _mm_slli_epi32(a, n); } + __forceinline vuint4 operator >>(const vuint4& a, unsigned int n) { return _mm_srli_epi32(a, n); } + + __forceinline vuint4 sll (const vuint4& a, unsigned int b) { return _mm_slli_epi32(a, b); } + __forceinline vuint4 sra (const vuint4& a, unsigned int b) { return _mm_srai_epi32(a, b); } + __forceinline vuint4 srl (const vuint4& a, unsigned int b) { return _mm_srli_epi32(a, b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint4& operator +=(vuint4& a, const vuint4& b) { return a = a + b; } + __forceinline vuint4& operator +=(vuint4& a, unsigned int b) { return a = a + b; } + + __forceinline vuint4& operator -=(vuint4& a, const vuint4& b) { return a = a - b; } + __forceinline vuint4& operator -=(vuint4& a, unsigned int b) { return a = a - b; } + +//#if defined(__SSE4_1__) +// __forceinline vuint4& operator *=(vuint4& a, const vuint4& b) { return a = a * b; } +// __forceinline vuint4& operator *=(vuint4& a, unsigned int b) { return a = a * b; } +//#endif + + __forceinline vuint4& operator &=(vuint4& a, const vuint4& b) { return a = a & b; } + __forceinline vuint4& operator &=(vuint4& a, unsigned int b) { return a = a & b; } + + __forceinline vuint4& operator |=(vuint4& a, const vuint4& b) { return a = a | b; } + __forceinline vuint4& operator |=(vuint4& a, unsigned int b) { return a = a | b; } + + __forceinline vuint4& operator <<=(vuint4& a, unsigned int b) { return a = a << b; } + __forceinline vuint4& operator >>=(vuint4& a, unsigned int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } +#else + __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } + __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return !(a == b); } + //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmplt_epu32(a, b)); } + //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return !(a < b); } + //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epu32(a, b)); } + //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return !(a > b); } +#endif + + __forceinline vboolf4 operator ==(const vuint4& a, unsigned int b) { return a == vuint4(b); } + __forceinline vboolf4 operator ==(unsigned int a, const vuint4& b) { return vuint4(a) == b; } + + __forceinline vboolf4 operator !=(const vuint4& a, unsigned int b) { return a != vuint4(b); } + __forceinline vboolf4 operator !=(unsigned int a, const vuint4& b) { return vuint4(a) != b; } + + //__forceinline vboolf4 operator < (const vuint4& a, unsigned int b) { return a < vuint4(b); } + //__forceinline vboolf4 operator < (unsigned int a, const vuint4& b) { return vuint4(a) < b; } + + //__forceinline vboolf4 operator >=(const vuint4& a, unsigned int b) { return a >= vuint4(b); } + //__forceinline vboolf4 operator >=(unsigned int a, const vuint4& b) { return vuint4(a) >= b; } + + //__forceinline vboolf4 operator > (const vuint4& a, unsigned int b) { return a > vuint4(b); } + //__forceinline vboolf4 operator > (unsigned int a, const vuint4& b) { return vuint4(a) > b; } + + //__forceinline vboolf4 operator <=(const vuint4& a, unsigned int b) { return a <= vuint4(b); } + //__forceinline vboolf4 operator <=(unsigned int a, const vuint4& b) { return vuint4(a) <= b; } + + __forceinline vboolf4 eq(const vuint4& a, const vuint4& b) { return a == b; } + __forceinline vboolf4 ne(const vuint4& a, const vuint4& b) { return a != b; } + //__forceinline vboolf4 lt(const vuint4& a, const vuint4& b) { return a < b; } + //__forceinline vboolf4 ge(const vuint4& a, const vuint4& b) { return a >= b; } + //__forceinline vboolf4 gt(const vuint4& a, const vuint4& b) { return a > b; } + //__forceinline vboolf4 le(const vuint4& a, const vuint4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); } + //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); } + //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); } + //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); } + //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a == b); } + __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a != b); } + //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a < b); } + //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a >= b); } + //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a > b); } + //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a <= b); } +#endif + + template + __forceinline vuint4 select(const vuint4& t, const vuint4& f) { +#if defined(__SSE4_1__) + return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); +#else + return select(vboolf4(mask), t, f); +#endif + } + +/*#if defined(__SSE4_1__) + __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return _mm_min_epu32(a, b); } + __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return _mm_max_epu32(a, b); } + +#else + __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return select(a < b,a,b); } + __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return select(a < b,b,a); } +#endif + + __forceinline vuint4 min(const vuint4& a, unsigned int b) { return min(a,vuint4(b)); } + __forceinline vuint4 min(unsigned int a, const vuint4& b) { return min(vuint4(a),b); } + __forceinline vuint4 max(const vuint4& a, unsigned int b) { return max(a,vuint4(b)); } + __forceinline vuint4 max(unsigned int a, const vuint4& b) { return max(vuint4(a),b); }*/ + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + __forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + +#if defined(__aarch64__) + template + __forceinline vuint4 shuffle(const vuint4& v) { + return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); + } + template + __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) { + return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); + } +#else + template + __forceinline vuint4 shuffle(const vuint4& v) { + return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + template + __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) { + return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } +#endif +#if defined(__SSE3__) + template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } + template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } + template<> __forceinline vuint4 shuffle<0, 1, 0, 1>(const vuint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); } +#endif + + template + __forceinline vuint4 shuffle(const vuint4& v) { + return shuffle(v); + } + +#if defined(__aarch64__) + template __forceinline unsigned int extract(const vuint4& b); + template __forceinline vuint4 insert(const vuint4& a, const unsigned b); +#elif defined(__SSE4_1__) + template __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b, src); } + template __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a, b, dst); } +#else + template __forceinline unsigned int extract(const vuint4& b) { return b[src&3]; } + template __forceinline vuint4 insert(const vuint4& a, const unsigned b) { vuint4 c = a; c[dst&3] = b; return c; } +#endif + +#if defined(__aarch64__) + template<> __forceinline unsigned int extract<0>(const vuint4& b) { + return b[0]; + } + template<> __forceinline unsigned int extract<1>(const vuint4& b) { + return b[1]; + } + template<> __forceinline unsigned int extract<2>(const vuint4& b) { + return b[2]; + } + template<> __forceinline unsigned int extract<3>(const vuint4& b) { + return b[3]; + } + + template<> __forceinline vuint4 insert<0>(const vuint4& a, unsigned b){ + vuint4 c = a; + c[0] = b; + return c; + } + template<> __forceinline vuint4 insert<1>(const vuint4& a, unsigned b){ + vuint4 c = a; + c[1] = b; + return c; + } + template<> __forceinline vuint4 insert<2>(const vuint4& a, unsigned b){ + vuint4 c = a; + c[2] = b; + return c; + } + template<> __forceinline vuint4 insert<3>(const vuint4& a, unsigned b){ + vuint4 c = a; + c[3] = b; + return c; + } + + __forceinline unsigned int toScalar(const vuint4& v) { + return v[0]; + } +#else + template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b); } + + __forceinline unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + +#if 0 +#if defined(__SSE4_1__) + + __forceinline vuint4 vreduce_min(const vuint4& v) { vuint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } + __forceinline vuint4 vreduce_max(const vuint4& v) { vuint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } + __forceinline vuint4 vreduce_add(const vuint4& v) { vuint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } + + __forceinline unsigned int reduce_min(const vuint4& v) { return toScalar(vreduce_min(v)); } + __forceinline unsigned int reduce_max(const vuint4& v) { return toScalar(vreduce_max(v)); } + __forceinline unsigned int reduce_add(const vuint4& v) { return toScalar(vreduce_add(v)); } + + __forceinline size_t select_min(const vuint4& v) { return bsf(movemask(v == vreduce_min(v))); } + __forceinline size_t select_max(const vuint4& v) { return bsf(movemask(v == vreduce_max(v))); } + + //__forceinline size_t select_min(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + //__forceinline size_t select_max(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + +#else + + __forceinline unsigned int reduce_min(const vuint4& v) { return min(v[0],v[1],v[2],v[3]); } + __forceinline unsigned int reduce_max(const vuint4& v) { return max(v[0],v[1],v[2],v[3]); } + __forceinline unsigned int reduce_add(const vuint4& v) { return v[0]+v[1]+v[2]+v[3]; } + +#endif +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vuint4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; + } +} + diff --git a/thirdparty/embree-aarch64/common/simd/vuint8_avx.h b/thirdparty/embree-aarch64/common/simd/vuint8_avx.h new file mode 100644 index 0000000000..d4e86ae92d --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vuint8_avx.h @@ -0,0 +1,379 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vuint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vuint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + struct { __m128i vl,vh; }; + unsigned int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint8& a) { v = a.v; } + __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; } + + __forceinline vuint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline operator __m256i&() { return v; } + + __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vuint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {} + + __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int vh) : v(_mm256_set_epi32(vh, g, f, e, d, c, b, a)) {} + + __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vuint(OneTy) : v(_mm256_set1_epi32(1)) {} + __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(0xFFFFFFFF)) {} + __forceinline vuint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} + __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vuint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); } + static __forceinline vuint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); } + + static __forceinline vuint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + static __forceinline vuint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + + static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } + +#if !defined(__aarch64__) + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } +#else + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } +#endif + static __forceinline void store_nt(void* ptr, const vuint8& v) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline vuint8 load(const uint8_t* ptr) { + vuint4 il = vuint4::load(ptr+0); + vuint4 ih = vuint4::load(ptr+4); + return vuint8(il,ih); + } + + static __forceinline vuint8 loadu(const uint8_t* ptr) { + vuint4 il = vuint4::loadu(ptr+0); + vuint4 ih = vuint4::loadu(ptr+4); + return vuint8(il,ih); + } + + static __forceinline vuint8 load(const unsigned short* ptr) { + vuint4 il = vuint4::load(ptr+0); + vuint4 ih = vuint4::load(ptr+4); + return vuint8(il,ih); + } + + static __forceinline vuint8 loadu(const unsigned short* ptr) { + vuint4 il = vuint4::loadu(ptr+0); + vuint4 ih = vuint4::loadu(ptr+4); + return vuint8(il,ih); + } + + static __forceinline void store(uint8_t* ptr, const vuint8& i) { + vuint4 il(i.vl); + vuint4 ih(i.vh); + vuint4::store(ptr + 0,il); + vuint4::store(ptr + 4,ih); + } + + static __forceinline void store(unsigned short* ptr, const vuint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template + static __forceinline vuint8 gather(const unsigned int* ptr, const vint8& index) { + return vuint8( + *(unsigned int*)(((int8_t*)ptr)+scale*index[0]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[1]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[2]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[3]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[4]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[5]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[6]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[7])); + } + + template + static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int* ptr, const vint8& index) { + vuint8 r = zero; + if (likely(mask[0])) r[0] = *(unsigned int*)(((int8_t*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(unsigned int*)(((int8_t*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(unsigned int*)(((int8_t*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(unsigned int*)(((int8_t*)ptr)+scale*index[3]); + if (likely(mask[4])) r[4] = *(unsigned int*)(((int8_t*)ptr)+scale*index[4]); + if (likely(mask[5])) r[5] = *(unsigned int*)(((int8_t*)ptr)+scale*index[5]); + if (likely(mask[6])) r[6] = *(unsigned int*)(((int8_t*)ptr)+scale*index[6]); + if (likely(mask[7])) r[7] = *(unsigned int*)(((int8_t*)ptr)+scale*index[7]); + return r; + } + + template + static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v) + { + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; + } + + template + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v) + { + if (likely(mask[0])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; + } + + + static __forceinline vuint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); } + + __forceinline vuint8 operator +(const vuint8& a) { return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return vuint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); } + __forceinline vuint8 operator +(const vuint8& a, unsigned int b) { return a + vuint8(b); } + __forceinline vuint8 operator +(unsigned int a, const vuint8& b) { return vuint8(a) + b; } + + __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return vuint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); } + __forceinline vuint8 operator -(const vuint8& a, unsigned int b) { return a - vuint8(b); } + __forceinline vuint8 operator -(unsigned int a, const vuint8& b) { return vuint8(a) - b; } + + //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return vuint8(_mm_mullo_epu32(a.vl, b.vl), _mm_mullo_epu32(a.vh, b.vh)); } + //__forceinline vuint8 operator *(const vuint8& a, unsigned int b) { return a * vuint8(b); } + //__forceinline vuint8 operator *(unsigned int a, const vuint8& b) { return vuint8(a) * b; } + + __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 operator &(const vuint8& a, unsigned int b) { return a & vuint8(b); } + __forceinline vuint8 operator &(unsigned int a, const vuint8& b) { return vuint8(a) & b; } + + __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 operator |(const vuint8& a, unsigned int b) { return a | vuint8(b); } + __forceinline vuint8 operator |(unsigned int a, const vuint8& b) { return vuint8(a) | b; } + + __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 operator ^(const vuint8& a, unsigned int b) { return a ^ vuint8(b); } + __forceinline vuint8 operator ^(unsigned int a, const vuint8& b) { return vuint8(a) ^ b; } + + __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return vuint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); } + __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return vuint8(_mm_srai_epi32(a.vl, n), _mm_srli_epi32(a.vh, n)); } + + __forceinline vuint8 sll (const vuint8& a, unsigned int b) { return vuint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); } + __forceinline vuint8 sra (const vuint8& a, unsigned int b) { return vuint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); } + __forceinline vuint8 srl (const vuint8& a, unsigned int b) { return vuint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); } + + __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return vuint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); } + __forceinline vuint8 min(const vuint8& a, unsigned int b) { return min(a,vuint8(b)); } + __forceinline vuint8 min(unsigned int a, const vuint8& b) { return min(vuint8(a),b); } + + __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return vuint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); } + __forceinline vuint8 max(const vuint8& a, unsigned int b) { return max(a,vuint8(b)); } + __forceinline vuint8 max(unsigned int a, const vuint8& b) { return max(vuint8(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; } + __forceinline vuint8& operator +=(vuint8& a, unsigned int b) { return a = a + b; } + + __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; } + __forceinline vuint8& operator -=(vuint8& a, unsigned int b) { return a = a - b; } + + //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; } + //__forceinline vuint8& operator *=(vuint8& a, unsigned int b) { return a = a * b; } + + __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; } + __forceinline vuint8& operator &=(vuint8& a, unsigned int b) { return a = a & b; } + + __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; } + __forceinline vuint8& operator |=(vuint8& a, unsigned int b) { return a = a | b; } + + __forceinline vuint8& operator <<=(vuint8& a, unsigned int b) { return a = a << b; } + __forceinline vuint8& operator >>=(vuint8& a, unsigned int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)), + _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); } + __forceinline vboolf8 operator ==(const vuint8& a, unsigned int b) { return a == vuint8(b); } + __forceinline vboolf8 operator ==(unsigned int a, const vuint8& b) { return vuint8(a) == b; } + + __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); } + __forceinline vboolf8 operator !=(const vuint8& a, unsigned int b) { return a != vuint8(b); } + __forceinline vboolf8 operator !=(unsigned int a, const vuint8& b) { return vuint8(a) != b; } + + //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epu32 (a.vl, b.vl)), + // _mm_castsi128_ps(_mm_cmplt_epu32 (a.vh, b.vh))); } + //__forceinline vboolf8 operator < (const vuint8& a, unsigned int b) { return a < vuint8(b); } + //__forceinline vboolf8 operator < (unsigned int a, const vuint8& b) { return vuint8(a) < b; } + + //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a < b); } + //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int b) { return a >= vuint8(b); } + //__forceinline vboolf8 operator >=(unsigned int a, const vuint8& b) { return vuint8(a) >= b; } + + //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epu32 (a.vl, b.vl)), + // _mm_castsi128_ps(_mm_cmpgt_epu32 (a.vh, b.vh))); } + //__forceinline vboolf8 operator > (const vuint8& a, unsigned int b) { return a > vuint8(b); } + //__forceinline vboolf8 operator > (unsigned int a, const vuint8& b) { return vuint8(a) > b; } + + //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a > b); } + //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int b) { return a <= vuint8(b); } + //__forceinline vboolf8 operator <=(unsigned int a, const vuint8& b) { return vuint8(a) <= b; } + + __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; } + __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; } + + __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); } + __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); } + + __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + } + + __forceinline vuint8 notand(const vboolf8& m, const vuint8& f) { + return _mm256_castps_si256(_mm256_andnot_ps(m, _mm256_castsi256_ps(f))); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + + template + __forceinline vuint8 shuffle(const vuint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template + __forceinline vuint8 shuffle4(const vuint8& v) { + return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) { + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vuint8 shuffle(const vuint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) { + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + + __forceinline vuint8 broadcast(const unsigned int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } + template __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); } + template __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); } + template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); } + + __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); } + //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); } + //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); } + //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); } + + //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); } + //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); } + + //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vuint8_avx2.h b/thirdparty/embree-aarch64/common/simd/vuint8_avx2.h new file mode 100644 index 0000000000..b2a965448d --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vuint8_avx2.h @@ -0,0 +1,439 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vuint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vuint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + unsigned int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint8& a) { v = a.v; } + __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; } + + __forceinline vuint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline operator __m256i&() { return v; } + + __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vuint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + + __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} + + __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} + +#if defined(__AVX512VL__) + __forceinline explicit vuint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {} +#else + __forceinline explicit vuint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {} +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vuint(OneTy) : v(_mm256_set1_epi32(1)) {} + __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(pos_inf)) {} + __forceinline vuint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {} + __forceinline vuint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} + __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vuint8 load(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vuint8 loadu(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vuint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } + static __forceinline vuint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } + + static __forceinline vuint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); } + static __forceinline vuint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); } + + static __forceinline void store (void* ptr, const vuint8& v) { _mm256_store_si256((__m256i*)ptr,v); } + static __forceinline void storeu(void* ptr, const vuint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); } + +#if defined(__AVX512VL__) + + static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &v) { + return _mm256_mask_compress_epi32(v, mask, v); + } + static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &a, const vuint8& b) { + return _mm256_mask_compress_epi32(a, mask, b); + } + + static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); } + static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); } +#else + static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } +#endif + + static __forceinline vuint8 load_nt(void* ptr) { + return _mm256_stream_load_si256((__m256i*)ptr); + } + + static __forceinline void store_nt(void* ptr, const vuint8& v) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline void store(uint8_t* ptr, const vuint8& i) + { + for (size_t j=0; j<8; j++) + ptr[j] = i[j]; + } + + static __forceinline void store(unsigned short* ptr, const vuint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template + static __forceinline vuint8 gather(const unsigned int *const ptr, const vint8& index) { + return _mm256_i32gather_epi32((const int*) ptr, index, scale); + } + + template + static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int *const ptr, const vint8& index) { + vuint8 r = zero; +#if defined(__AVX512VL__) + return _mm256_mmask_i32gather_epi32(r, mask, index, (const int*) ptr, scale); +#else + return _mm256_mask_i32gather_epi32(r, (const int*) ptr, index, mask, scale); +#endif + } + + template + static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v) + { +#if defined(__AVX512VL__) + _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale); +#else + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[0]) = v[0]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[1]) = v[1]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[2]) = v[2]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[3]) = v[3]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[4]) = v[4]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[5]) = v[5]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[6]) = v[6]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[7]) = v[7]; +#endif + } + + template + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v) + { +#if defined(__AVX512VL__) + _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale); +#else + if (likely(mask[0])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + static __forceinline vuint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_movepi32_mask(a); } +#else + __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); } +#endif + + __forceinline vuint8 operator +(const vuint8& a) { return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return _mm256_add_epi32(a, b); } + __forceinline vuint8 operator +(const vuint8& a, unsigned int b) { return a + vuint8(b); } + __forceinline vuint8 operator +(unsigned int a, const vuint8& b) { return vuint8(a) + b; } + + __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return _mm256_sub_epi32(a, b); } + __forceinline vuint8 operator -(const vuint8& a, unsigned int b) { return a - vuint8(b); } + __forceinline vuint8 operator -(unsigned int a, const vuint8& b) { return vuint8(a) - b; } + + //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return _mm256_mullo_epu32(a, b); } + //__forceinline vuint8 operator *(const vuint8& a, unsigned int b) { return a * vuint8(b); } + //__forceinline vuint8 operator *(unsigned int a, const vuint8& b) { return vuint8(a) * b; } + + __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_and_si256(a, b); } + __forceinline vuint8 operator &(const vuint8& a, unsigned int b) { return a & vuint8(b); } + __forceinline vuint8 operator &(unsigned int a, const vuint8& b) { return vuint8(a) & b; } + + __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_or_si256(a, b); } + __forceinline vuint8 operator |(const vuint8& a, unsigned int b) { return a | vuint8(b); } + __forceinline vuint8 operator |(unsigned int a, const vuint8& b) { return vuint8(a) | b; } + + __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_xor_si256(a, b); } + __forceinline vuint8 operator ^(const vuint8& a, unsigned int b) { return a ^ vuint8(b); } + __forceinline vuint8 operator ^(unsigned int a, const vuint8& b) { return vuint8(a) ^ b; } + + __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return _mm256_slli_epi32(a, n); } + __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return _mm256_srli_epi32(a, n); } + + __forceinline vuint8 operator <<(const vuint8& a, const vuint8& n) { return _mm256_sllv_epi32(a, n); } + __forceinline vuint8 operator >>(const vuint8& a, const vuint8& n) { return _mm256_srlv_epi32(a, n); } + + __forceinline vuint8 sll(const vuint8& a, unsigned int b) { return _mm256_slli_epi32(a, b); } + __forceinline vuint8 sra(const vuint8& a, unsigned int b) { return _mm256_srai_epi32(a, b); } + __forceinline vuint8 srl(const vuint8& a, unsigned int b) { return _mm256_srli_epi32(a, b); } + + __forceinline vuint8 sll(const vuint8& a, const vuint8& b) { return _mm256_sllv_epi32(a, b); } + __forceinline vuint8 sra(const vuint8& a, const vuint8& b) { return _mm256_srav_epi32(a, b); } + __forceinline vuint8 srl(const vuint8& a, const vuint8& b) { return _mm256_srlv_epi32(a, b); } + + __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return _mm256_min_epu32(a, b); } + __forceinline vuint8 min(const vuint8& a, unsigned int b) { return min(a,vuint8(b)); } + __forceinline vuint8 min(unsigned int a, const vuint8& b) { return min(vuint8(a),b); } + + __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return _mm256_max_epu32(a, b); } + __forceinline vuint8 max(const vuint8& a, unsigned int b) { return max(a,vuint8(b)); } + __forceinline vuint8 max(unsigned int a, const vuint8& b) { return max(vuint8(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; } + __forceinline vuint8& operator +=(vuint8& a, unsigned int b) { return a = a + b; } + + __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; } + __forceinline vuint8& operator -=(vuint8& a, unsigned int b) { return a = a - b; } + + //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; } + //__forceinline vuint8& operator *=(vuint8& a, unsigned int b) { return a = a * b; } + + __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; } + __forceinline vuint8& operator &=(vuint8& a, unsigned int b) { return a = a & b; } + + __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; } + __forceinline vuint8& operator |=(vuint8& a, unsigned int b) { return a = a | b; } + + __forceinline vuint8& operator <<=(vuint8& a, const unsigned int b) { return a = a << b; } + __forceinline vuint8& operator >>=(vuint8& a, const unsigned int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { + return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t); + } +#else + __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); } + __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); } + //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(b, a)); } + //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a < b); } + //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(a, b)); } + //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a > b); } + + __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + } +#endif + + template + __forceinline vuint8 select(const vuint8& t, const vuint8& f) { + return _mm256_blend_epi32(f, t, mask); + } + + __forceinline vboolf8 operator ==(const vuint8& a, unsigned int b) { return a == vuint8(b); } + __forceinline vboolf8 operator ==(unsigned int a, const vuint8& b) { return vuint8(a) == b; } + + __forceinline vboolf8 operator !=(const vuint8& a, unsigned int b) { return a != vuint8(b); } + __forceinline vboolf8 operator !=(unsigned int a, const vuint8& b) { return vuint8(a) != b; } + + //__forceinline vboolf8 operator < (const vuint8& a, unsigned int b) { return a < vuint8(b); } + //__forceinline vboolf8 operator < (unsigned int a, const vuint8& b) { return vuint8(a) < b; } + + //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int b) { return a >= vuint8(b); } + //__forceinline vboolf8 operator >=(unsigned int a, const vuint8& b) { return vuint8(a) >= b; } + + //__forceinline vboolf8 operator > (const vuint8& a, unsigned int b) { return a > vuint8(b); } + //__forceinline vboolf8 operator > (unsigned int a, const vuint8& b) { return vuint8(a) > b; } + + //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int b) { return a <= vuint8(b); } + //__forceinline vboolf8 operator <=(unsigned int a, const vuint8& b) { return vuint8(a) <= b; } + + __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; } + __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; } + //__forceinline vboolf8 lt(const vuint8& a, const vuint8& b) { return a < b; } + //__forceinline vboolf8 ge(const vuint8& a, const vuint8& b) { return a >= b; } + //__forceinline vboolf8 gt(const vuint8& a, const vuint8& b) { return a > b; } + //__forceinline vboolf8 le(const vuint8& a, const vuint8& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); } + __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); } + //__forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a < b); } + //__forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a >= b); } + //__forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a > b); } + //__forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a <= b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_unpacklo_epi32(a, b); } + __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_unpackhi_epi32(a, b); } + + template + __forceinline vuint8 shuffle(const vuint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template + __forceinline vuint8 shuffle4(const vuint8& v) { + return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) { + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vuint8 shuffle(const vuint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) { + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + + __forceinline vuint8 broadcast(const unsigned int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } + + template __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); } + template __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); } + template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); } + + __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + +#if !defined(__aarch64__) + + __forceinline vuint8 permute(const vuint8& v, const __m256i& index) { + return _mm256_permutevar8x32_epi32(v, index); + } + + __forceinline vuint8 shuffle(const vuint8& v, const __m256i& index) { + return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index)); + } + + template + __forceinline vuint8 align_shift_right(const vuint8& a, const vuint8& b) { +#if defined(__AVX512VL__) + return _mm256_alignr_epi32(a, b, i); +#else + return _mm256_alignr_epi8(a, b, 4*i); +#endif + } + +#endif + + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); } + //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); } + //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); } + //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); } + + //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); } + //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); } + + //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + __forceinline vuint8 assign(const vuint4& a) { return _mm256_castsi128_si256(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/sys/alloc.cpp b/thirdparty/embree-aarch64/common/sys/alloc.cpp new file mode 100644 index 0000000000..12f143f131 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/alloc.cpp @@ -0,0 +1,327 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "alloc.h" +#include "intrinsics.h" +#include "sysinfo.h" +#include "mutex.h" + +//////////////////////////////////////////////////////////////////////////////// +/// All Platforms +//////////////////////////////////////////////////////////////////////////////// + +namespace embree +{ + void* alignedMalloc(size_t size, size_t align) + { + if (size == 0) + return nullptr; + + assert((align & (align-1)) == 0); + void* ptr = _mm_malloc(size,align); + + if (size != 0 && ptr == nullptr) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + + return ptr; + } + + void alignedFree(void* ptr) + { + if (ptr) + _mm_free(ptr); + } + + static bool huge_pages_enabled = false; + static MutexSys os_init_mutex; + + __forceinline bool isHugePageCandidate(const size_t bytes) + { + if (!huge_pages_enabled) + return false; + + /* use huge pages only when memory overhead is low */ + const size_t hbytes = (bytes+PAGE_SIZE_2M-1) & ~size_t(PAGE_SIZE_2M-1); + return 66*(hbytes-bytes) < bytes; // at most 1.5% overhead + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#ifdef _WIN32 + +#define WIN32_LEAN_AND_MEAN +#include +#include + +namespace embree +{ + bool win_enable_selockmemoryprivilege (bool verbose) + { + HANDLE hToken; + if (!OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY | TOKEN_ADJUST_PRIVILEGES, &hToken)) { + if (verbose) std::cout << "WARNING: OpenProcessToken failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl; + return false; + } + + TOKEN_PRIVILEGES tp; + tp.PrivilegeCount = 1; + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + + if (!LookupPrivilegeValueW(nullptr, L"SeLockMemoryPrivilege", &tp.Privileges[0].Luid)) { + if (verbose) std::cout << "WARNING: LookupPrivilegeValue failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl; + return false; + } + + SetLastError(ERROR_SUCCESS); + if (!AdjustTokenPrivileges(hToken, FALSE, &tp, sizeof(tp), nullptr, 0)) { + if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed while trying to enable SeLockMemoryPrivilege" << std::endl; + return false; + } + + if (GetLastError() == ERROR_NOT_ALL_ASSIGNED) { + if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed to enable SeLockMemoryPrivilege: Add SeLockMemoryPrivilege for current user and run process in elevated mode (Run as administrator)." << std::endl; + return false; + } + + return true; + } + + bool os_init(bool hugepages, bool verbose) + { + Lock lock(os_init_mutex); + + if (!hugepages) { + huge_pages_enabled = false; + return true; + } + + if (GetLargePageMinimum() != PAGE_SIZE_2M) { + huge_pages_enabled = false; + return false; + } + + huge_pages_enabled = true; + return true; + } + + void* os_malloc(size_t bytes, bool& hugepages) + { + if (bytes == 0) { + hugepages = false; + return nullptr; + } + + /* try direct huge page allocation first */ + if (isHugePageCandidate(bytes)) + { + int flags = MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES; + char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); + if (ptr != nullptr) { + hugepages = true; + return ptr; + } + } + + /* fall back to 4k pages */ + int flags = MEM_COMMIT | MEM_RESERVE; + char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); + // -- GODOT start -- + // if (ptr == nullptr) throw std::bad_alloc(); + if (ptr == nullptr) abort(); + // -- GODOT end -- + hugepages = false; + return ptr; + } + + size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) + { + if (hugepages) // decommitting huge pages seems not to work under Windows + return bytesOld; + + const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; + bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1); + bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1); + if (bytesNew >= bytesOld) + return bytesOld; + + if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + + return bytesNew; + } + + void os_free(void* ptr, size_t bytes, bool hugepages) + { + if (bytes == 0) + return; + + if (!VirtualFree(ptr,0,MEM_RELEASE)) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + } + + void os_advise(void *ptr, size_t bytes) + { + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) + +#include +#include +#include +#include +#include + +#if defined(__MACOSX__) +#include +#endif + +namespace embree +{ + bool os_init(bool hugepages, bool verbose) + { + Lock lock(os_init_mutex); + + if (!hugepages) { + huge_pages_enabled = false; + return true; + } + +#if defined(__LINUX__) + + int hugepagesize = 0; + + std::ifstream file; + file.open("/proc/meminfo",std::ios::in); + if (!file.is_open()) { + if (verbose) std::cout << "WARNING: Could not open /proc/meminfo. Huge page support cannot get enabled!" << std::endl; + huge_pages_enabled = false; + return false; + } + + std::string line; + while (getline(file,line)) + { + std::stringstream sline(line); + while (!sline.eof() && sline.peek() == ' ') sline.ignore(); + std::string tag; getline(sline,tag,' '); + while (!sline.eof() && sline.peek() == ' ') sline.ignore(); + std::string val; getline(sline,val,' '); + while (!sline.eof() && sline.peek() == ' ') sline.ignore(); + std::string unit; getline(sline,unit,' '); + if (tag == "Hugepagesize:" && unit == "kB") { + hugepagesize = std::stoi(val)*1024; + break; + } + } + + if (hugepagesize != PAGE_SIZE_2M) + { + if (verbose) std::cout << "WARNING: Only 2MB huge pages supported. Huge page support cannot get enabled!" << std::endl; + huge_pages_enabled = false; + return false; + } +#endif + + huge_pages_enabled = true; + return true; + } + + void* os_malloc(size_t bytes, bool& hugepages) + { + if (bytes == 0) { + hugepages = false; + return nullptr; + } + + /* try direct huge page allocation first */ + if (isHugePageCandidate(bytes)) + { +#if defined(__MACOSX__) + void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); + if (ptr != MAP_FAILED) { + hugepages = true; + return ptr; + } +#elif defined(MAP_HUGETLB) + void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_HUGETLB, -1, 0); + if (ptr != MAP_FAILED) { + hugepages = true; + return ptr; + } +#endif + } + + /* fallback to 4k pages */ + void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + // -- GODOT start -- + // if (ptr == MAP_FAILED) throw std::bad_alloc(); + if (ptr == MAP_FAILED) abort(); + // -- GODOT end -- + hugepages = false; + + /* advise huge page hint for THP */ + os_advise(ptr,bytes); + return ptr; + } + + size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) + { + const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; + bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1); + bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1); + if (bytesNew >= bytesOld) + return bytesOld; + + if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + + return bytesNew; + } + + void os_free(void* ptr, size_t bytes, bool hugepages) + { + if (bytes == 0) + return; + + /* for hugepages we need to also align the size */ + const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; + bytes = (bytes+pageSize-1) & ~(pageSize-1); + if (munmap(ptr,bytes) == -1) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + } + + /* hint for transparent huge pages (THP) */ + void os_advise(void* pptr, size_t bytes) + { +#if defined(MADV_HUGEPAGE) + madvise(pptr,bytes,MADV_HUGEPAGE); +#endif + } +} + +#endif diff --git a/thirdparty/embree-aarch64/common/sys/alloc.h b/thirdparty/embree-aarch64/common/sys/alloc.h new file mode 100644 index 0000000000..5898ecda70 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/alloc.h @@ -0,0 +1,164 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include +#include + +namespace embree +{ +#define ALIGNED_STRUCT_(align) \ + void* operator new(size_t size) { return alignedMalloc(size,align); } \ + void operator delete(void* ptr) { alignedFree(ptr); } \ + void* operator new[](size_t size) { return alignedMalloc(size,align); } \ + void operator delete[](void* ptr) { alignedFree(ptr); } + +#define ALIGNED_CLASS_(align) \ + public: \ + ALIGNED_STRUCT_(align) \ + private: + + /*! aligned allocation */ + void* alignedMalloc(size_t size, size_t align); + void alignedFree(void* ptr); + + /*! allocator that performs aligned allocations */ + template + struct aligned_allocator + { + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + __forceinline pointer allocate( size_type n ) { + return (pointer) alignedMalloc(n*sizeof(value_type),alignment); + } + + __forceinline void deallocate( pointer p, size_type n ) { + return alignedFree(p); + } + + __forceinline void construct( pointer p, const_reference val ) { + new (p) T(val); + } + + __forceinline void destroy( pointer p ) { + p->~T(); + } + }; + + /*! allocates pages directly from OS */ + bool win_enable_selockmemoryprivilege(bool verbose); + bool os_init(bool hugepages, bool verbose); + void* os_malloc (size_t bytes, bool& hugepages); + size_t os_shrink (void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages); + void os_free (void* ptr, size_t bytes, bool hugepages); + void os_advise (void* ptr, size_t bytes); + + /*! allocator that performs OS allocations */ + template + struct os_allocator + { + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + __forceinline os_allocator () + : hugepages(false) {} + + __forceinline pointer allocate( size_type n ) { + return (pointer) os_malloc(n*sizeof(value_type),hugepages); + } + + __forceinline void deallocate( pointer p, size_type n ) { + return os_free(p,n*sizeof(value_type),hugepages); + } + + __forceinline void construct( pointer p, const_reference val ) { + new (p) T(val); + } + + __forceinline void destroy( pointer p ) { + p->~T(); + } + + bool hugepages; + }; + + /*! allocator for IDs */ + template + struct IDPool + { + typedef T value_type; + + IDPool () + : nextID(0) {} + + T allocate() + { + /* return ID from list */ + if (!IDs.empty()) + { + T id = *IDs.begin(); + IDs.erase(IDs.begin()); + return id; + } + + /* allocate new ID */ + else + { + if (size_t(nextID)+1 > max_id) + return -1; + + return nextID++; + } + } + + /* adds an ID provided by the user */ + bool add(T id) + { + if (id > max_id) + return false; + + /* check if ID should be in IDs set */ + if (id < nextID) { + auto p = IDs.find(id); + if (p == IDs.end()) return false; + IDs.erase(p); + return true; + } + + /* otherwise increase ID set */ + else + { + for (T i=nextID; i IDs; //!< stores deallocated IDs to be reused + T nextID; //!< next ID to use when IDs vector is empty + }; +} + diff --git a/thirdparty/embree-aarch64/common/sys/array.h b/thirdparty/embree-aarch64/common/sys/array.h new file mode 100644 index 0000000000..77722a39f6 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/array.h @@ -0,0 +1,222 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "alloc.h" + +namespace embree +{ + /*! static array with static size */ + template + class array_t + { + public: + + /********************** Iterators ****************************/ + + __forceinline T* begin() const { return items; }; + __forceinline T* end () const { return items+N; }; + + + /********************** Capacity ****************************/ + + __forceinline bool empty () const { return N == 0; } + __forceinline size_t size () const { return N; } + __forceinline size_t max_size () const { return N; } + + + /******************** Element access **************************/ + + __forceinline T& operator[](size_t i) { assert(i < N); return items[i]; } + __forceinline const T& operator[](size_t i) const { assert(i < N); return items[i]; } + + __forceinline T& at(size_t i) { assert(i < N); return items[i]; } + __forceinline const T& at(size_t i) const { assert(i < N); return items[i]; } + + __forceinline T& front() const { assert(N > 0); return items[0]; }; + __forceinline T& back () const { assert(N > 0); return items[N-1]; }; + + __forceinline T* data() { return items; }; + __forceinline const T* data() const { return items; }; + + private: + T items[N]; + }; + + /*! static array with dynamic size */ + template + class darray_t + { + public: + + __forceinline darray_t () : M(0) {} + + __forceinline darray_t (const T& v) : M(0) { + for (size_t i=0; i 0); return items[0]; }; + __forceinline T& back () const { assert(M > 0); return items[M-1]; }; + + __forceinline T* data() { return items; }; + __forceinline const T* data() const { return items; }; + + private: + size_t M; + T items[N]; + }; + + /*! dynamic sized array that is allocated on the stack */ +#define dynamic_large_stack_array(Ty,Name,N,max_stack_bytes) StackArray Name(N) + template + struct __aligned(64) StackArray + { + __forceinline StackArray (const size_t N) + : N(N) + { + if (N*sizeof(Ty) <= max_stack_bytes) + data = &arr[0]; + else + data = (Ty*) alignedMalloc(N*sizeof(Ty),64); + } + + __forceinline ~StackArray () { + if (data != &arr[0]) alignedFree(data); + } + + __forceinline operator Ty* () { return data; } + __forceinline operator const Ty* () const { return data; } + + __forceinline Ty& operator[](const int i) { assert(i>=0 && i=0 && i + struct __aligned(64) DynamicStackArray + { + __forceinline DynamicStackArray () + : data(&arr[0]) {} + + __forceinline ~DynamicStackArray () + { + if (!isStackAllocated()) + delete[] data; + } + + __forceinline bool isStackAllocated() const { + return data == &arr[0]; + } + + __forceinline size_t size() const + { + if (isStackAllocated()) return max_stack_elements; + else return max_total_elements; + } + + __forceinline void resize(size_t M) + { + assert(M <= max_total_elements); + if (likely(M <= max_stack_elements)) return; + if (likely(!isStackAllocated())) return; + + data = new Ty[max_total_elements]; + + for (size_t i=0; i=0 && ioperator[] (i) = other[i]; + } + + DynamicStackArray& operator= (const DynamicStackArray& other) + { + for (size_t i=0; ioperator[] (i) = other[i]; + + return *this; + } + + private: + Ty arr[max_stack_elements]; + Ty* data; + }; +} diff --git a/thirdparty/embree-aarch64/common/sys/atomic.h b/thirdparty/embree-aarch64/common/sys/atomic.h new file mode 100644 index 0000000000..ebfb8552c3 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/atomic.h @@ -0,0 +1,59 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include "intrinsics.h" + +namespace embree +{ +/* compiler memory barriers */ +#if defined(__INTEL_COMPILER) +//#define __memory_barrier() __memory_barrier() +#elif defined(__GNUC__) || defined(__clang__) +# define __memory_barrier() asm volatile("" ::: "memory") +#elif defined(_MSC_VER) +# define __memory_barrier() _ReadWriteBarrier() +#endif + + template + struct atomic : public std::atomic + { + atomic () {} + + atomic (const T& a) + : std::atomic(a) {} + + atomic (const atomic& a) { + this->store(a.load()); + } + + atomic& operator=(const atomic& other) { + this->store(other.load()); + return *this; + } + }; + + template + __forceinline void atomic_min(std::atomic& aref, const T& bref) + { + const T b = bref.load(); + while (true) { + T a = aref.load(); + if (a <= b) break; + if (aref.compare_exchange_strong(a,b)) break; + } + } + + template + __forceinline void atomic_max(std::atomic& aref, const T& bref) + { + const T b = bref.load(); + while (true) { + T a = aref.load(); + if (a >= b) break; + if (aref.compare_exchange_strong(a,b)) break; + } + } +} diff --git a/thirdparty/embree-aarch64/common/sys/barrier.cpp b/thirdparty/embree-aarch64/common/sys/barrier.cpp new file mode 100644 index 0000000000..0061d18db2 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/barrier.cpp @@ -0,0 +1,289 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "barrier.h" +#include "condition.h" +#include "regression.h" +#include "thread.h" + +#if defined (__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include + +namespace embree +{ + struct BarrierSysImplementation + { + __forceinline BarrierSysImplementation (size_t N) + : i(0), enterCount(0), exitCount(0), barrierSize(0) + { + events[0] = CreateEvent(nullptr, TRUE, FALSE, nullptr); + events[1] = CreateEvent(nullptr, TRUE, FALSE, nullptr); + init(N); + } + + __forceinline ~BarrierSysImplementation () + { + CloseHandle(events[0]); + CloseHandle(events[1]); + } + + __forceinline void init(size_t N) + { + barrierSize = N; + enterCount.store(N); + exitCount.store(N); + } + + __forceinline void wait() + { + /* every thread entering the barrier decrements this count */ + size_t i0 = i; + size_t cnt0 = enterCount--; + + /* all threads except the last one are wait in the barrier */ + if (cnt0 > 1) + { + if (WaitForSingleObject(events[i0], INFINITE) != WAIT_OBJECT_0) + THROW_RUNTIME_ERROR("WaitForSingleObjects failed"); + } + + /* the last thread starts all threads waiting at the barrier */ + else + { + i = 1-i; + enterCount.store(barrierSize); + if (SetEvent(events[i0]) == 0) + THROW_RUNTIME_ERROR("SetEvent failed"); + } + + /* every thread leaving the barrier decrements this count */ + size_t cnt1 = exitCount--; + + /* the last thread that left the barrier resets the event again */ + if (cnt1 == 1) + { + exitCount.store(barrierSize); + if (ResetEvent(events[i0]) == 0) + THROW_RUNTIME_ERROR("ResetEvent failed"); + } + } + + public: + HANDLE events[2]; + atomic i; + atomic enterCount; + atomic exitCount; + size_t barrierSize; + }; +} + +#else + +namespace embree +{ + struct BarrierSysImplementation + { + __forceinline BarrierSysImplementation (size_t N) + : count(0), barrierSize(0) + { + init(N); + } + + __forceinline void init(size_t N) + { + assert(count == 0); + count = 0; + barrierSize = N; + } + + __forceinline void wait() + { + mutex.lock(); + count++; + + if (count == barrierSize) { + count = 0; + cond.notify_all(); + mutex.unlock(); + return; + } + + cond.wait(mutex); + mutex.unlock(); + return; + } + + public: + MutexSys mutex; + ConditionSys cond; + volatile size_t count; + volatile size_t barrierSize; + }; +} + +#endif + +namespace embree +{ + BarrierSys::BarrierSys (size_t N) { + opaque = new BarrierSysImplementation(N); + } + + BarrierSys::~BarrierSys () { + delete (BarrierSysImplementation*) opaque; + } + + void BarrierSys::init(size_t count) { + ((BarrierSysImplementation*) opaque)->init(count); + } + + void BarrierSys::wait() { + ((BarrierSysImplementation*) opaque)->wait(); + } + + LinearBarrierActive::LinearBarrierActive (size_t N) + : count0(nullptr), count1(nullptr), mode(0), flag0(0), flag1(0), threadCount(0) + { + if (N == 0) N = getNumberOfLogicalThreads(); + init(N); + } + + LinearBarrierActive::~LinearBarrierActive() + { + delete[] count0; + delete[] count1; + } + + void LinearBarrierActive::init(size_t N) + { + if (threadCount != N) { + threadCount = N; + if (count0) delete[] count0; count0 = new unsigned char[N]; + if (count1) delete[] count1; count1 = new unsigned char[N]; + } + mode = 0; + flag0 = 0; + flag1 = 0; + for (size_t i=0; i threadID; + std::atomic numFailed; + std::vector threadResults; + + barrier_sys_regression_test() + : RegressionTest("barrier_sys_regression_test"), threadID(0), numFailed(0) + { + registerRegressionTest(this); + } + + static void thread_alloc(barrier_sys_regression_test* This) + { + size_t tid = This->threadID++; + for (size_t j=0; j<1000; j++) + { + This->barrier.wait(); + This->threadResults[tid] = tid; + This->barrier.wait(); + } + } + + bool run () + { + threadID.store(0); + numFailed.store(0); + + size_t numThreads = getNumberOfLogicalThreads(); + threadResults.resize(numThreads); + barrier.init(numThreads+1); + + /* create threads */ + std::vector threads; + for (size_t i=0; i cntr; + }; + + /*! fast active barrier that does not require initialization to some number of threads */ + struct BarrierActiveAutoReset + { + public: + BarrierActiveAutoReset () + : cntr0(0), cntr1(0) {} + + void wait (size_t threadCount) + { + cntr0.fetch_add(1); + while (cntr0 != threadCount) pause_cpu(); + cntr1.fetch_add(1); + while (cntr1 != threadCount) pause_cpu(); + cntr0.fetch_add(-1); + while (cntr0 != 0) pause_cpu(); + cntr1.fetch_add(-1); + while (cntr1 != 0) pause_cpu(); + } + + private: + std::atomic cntr0; + std::atomic cntr1; + }; + + class LinearBarrierActive + { + public: + + /*! construction and destruction */ + LinearBarrierActive (size_t threadCount = 0); + ~LinearBarrierActive(); + + private: + /*! class in non-copyable */ + LinearBarrierActive (const LinearBarrierActive& other) DELETED; // do not implement + LinearBarrierActive& operator= (const LinearBarrierActive& other) DELETED; // do not implement + + public: + /*! intializes the barrier with some number of threads */ + void init(size_t threadCount); + + /*! thread with threadIndex waits in the barrier */ + void wait (const size_t threadIndex); + + private: + volatile unsigned char* count0; + volatile unsigned char* count1; + volatile unsigned int mode; + volatile unsigned int flag0; + volatile unsigned int flag1; + volatile size_t threadCount; + }; +} + diff --git a/thirdparty/embree-aarch64/common/sys/condition.cpp b/thirdparty/embree-aarch64/common/sys/condition.cpp new file mode 100644 index 0000000000..0e7ca7af39 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/condition.cpp @@ -0,0 +1,81 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "condition.h" + +#if defined(__WIN32__) && !defined(PTHREADS_WIN32) + +#define WIN32_LEAN_AND_MEAN +#include + +namespace embree +{ + struct ConditionImplementation + { + __forceinline ConditionImplementation () { + InitializeConditionVariable(&cond); + } + + __forceinline ~ConditionImplementation () { + } + + __forceinline void wait(MutexSys& mutex_in) { + SleepConditionVariableCS(&cond, (LPCRITICAL_SECTION)mutex_in.mutex, INFINITE); + } + + __forceinline void notify_all() { + WakeAllConditionVariable(&cond); + } + + public: + CONDITION_VARIABLE cond; + }; +} +#endif + +#if defined(__UNIX__) || defined(PTHREADS_WIN32) +#include +namespace embree +{ + struct ConditionImplementation + { + __forceinline ConditionImplementation () { + pthread_cond_init(&cond,nullptr); + } + + __forceinline ~ConditionImplementation() { + pthread_cond_destroy(&cond); + } + + __forceinline void wait(MutexSys& mutex) { + pthread_cond_wait(&cond, (pthread_mutex_t*)mutex.mutex); + } + + __forceinline void notify_all() { + pthread_cond_broadcast(&cond); + } + + public: + pthread_cond_t cond; + }; +} +#endif + +namespace embree +{ + ConditionSys::ConditionSys () { + cond = new ConditionImplementation; + } + + ConditionSys::~ConditionSys() { + delete (ConditionImplementation*) cond; + } + + void ConditionSys::wait(MutexSys& mutex) { + ((ConditionImplementation*) cond)->wait(mutex); + } + + void ConditionSys::notify_all() { + ((ConditionImplementation*) cond)->notify_all(); + } +} diff --git a/thirdparty/embree-aarch64/common/sys/condition.h b/thirdparty/embree-aarch64/common/sys/condition.h new file mode 100644 index 0000000000..7a3a05aa81 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/condition.h @@ -0,0 +1,31 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "mutex.h" + +namespace embree +{ + class ConditionSys + { + public: + ConditionSys(); + ~ConditionSys(); + void wait( class MutexSys& mutex ); + void notify_all(); + + template + __forceinline void wait( class MutexSys& mutex, const Predicate& pred ) + { + while (!pred()) wait(mutex); + } + + private: + ConditionSys (const ConditionSys& other) DELETED; // do not implement + ConditionSys& operator= (const ConditionSys& other) DELETED; // do not implement + + protected: + void* cond; + }; +} diff --git a/thirdparty/embree-aarch64/common/sys/filename.cpp b/thirdparty/embree-aarch64/common/sys/filename.cpp new file mode 100644 index 0000000000..86182c1afb --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/filename.cpp @@ -0,0 +1,138 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "filename.h" +#include "sysinfo.h" + +namespace embree +{ +#ifdef __WIN32__ + const char path_sep = '\\'; +#else + const char path_sep = '/'; +#endif + + /*! create an empty filename */ + FileName::FileName () {} + + /*! create a valid filename from a string */ + FileName::FileName (const char* in) { + filename = in; + for (size_t i=0; i +#endif + +#if defined(__ARM_NEON) +#include "../math/SSE2NEON.h" +#if defined(NEON_AVX2_EMULATION) +#include "../math/AVX2NEON.h" +#endif +#else +#include +#endif + +#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER) + #if !defined(_tzcnt_u32) + #define _tzcnt_u32 __tzcnt_u32 + #endif + #if !defined(_tzcnt_u64) + #define _tzcnt_u64 __tzcnt_u64 + #endif +#endif + +#if defined(__aarch64__) +#if !defined(_lzcnt_u32) + #define _lzcnt_u32 __builtin_clz +#endif +#if !defined(_lzcnt_u32) + #define _lzcnt_u32 __builtin_clzll +#endif +#else +#if defined(__LZCNT__) + #if !defined(_lzcnt_u32) + #define _lzcnt_u32 __lzcnt32 + #endif + #if !defined(_lzcnt_u64) + #define _lzcnt_u64 __lzcnt64 + #endif +#endif +#endif + +#if defined(__WIN32__) +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +#endif + +/* normally defined in pmmintrin.h, but we always need this */ +#if !defined(_MM_SET_DENORMALS_ZERO_MODE) +#define _MM_DENORMALS_ZERO_ON (0x0040) +#define _MM_DENORMALS_ZERO_OFF (0x0000) +#define _MM_DENORMALS_ZERO_MASK (0x0040) +#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) +#endif + +namespace embree +{ + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + + __forceinline size_t read_tsc() + { + LARGE_INTEGER li; + QueryPerformanceCounter(&li); + return (size_t)li.QuadPart; + } + + __forceinline int bsf(int v) { +#if defined(__AVX2__) && !defined(__aarch64__) + return _tzcnt_u32(v); +#else + unsigned long r = 0; _BitScanForward(&r,v); return r; +#endif + } + + __forceinline unsigned bsf(unsigned v) { +#if defined(__AVX2__) && !defined(__aarch64__) + return _tzcnt_u32(v); +#else + unsigned long r = 0; _BitScanForward(&r,v); return r; +#endif + } + +#if defined(__X86_64__) + __forceinline size_t bsf(size_t v) { +#if defined(__AVX2__) + return _tzcnt_u64(v); +#else + unsigned long r = 0; _BitScanForward64(&r,v); return r; +#endif + } +#endif + + __forceinline int bscf(int& v) + { + int i = bsf(v); + v &= v-1; + return i; + } + + __forceinline unsigned bscf(unsigned& v) + { + unsigned i = bsf(v); + v &= v-1; + return i; + } + +#if defined(__X86_64__) + __forceinline size_t bscf(size_t& v) + { + size_t i = bsf(v); + v &= v-1; + return i; + } +#endif + + __forceinline int bsr(int v) { +#if defined(__AVX2__) && !defined(__aarch64__) + return 31 - _lzcnt_u32(v); +#else + unsigned long r = 0; _BitScanReverse(&r,v); return r; +#endif + } + + __forceinline unsigned bsr(unsigned v) { +#if defined(__AVX2__) && !defined(__aarch64__) + return 31 - _lzcnt_u32(v); +#else + unsigned long r = 0; _BitScanReverse(&r,v); return r; +#endif + } + +#if defined(__X86_64__) + __forceinline size_t bsr(size_t v) { +#if defined(__AVX2__) + return 63 -_lzcnt_u64(v); +#else + unsigned long r = 0; _BitScanReverse64(&r, v); return r; +#endif + } +#endif + + __forceinline int lzcnt(const int x) + { +#if defined(__AVX2__) && !defined(__aarch64__) + return _lzcnt_u32(x); +#else + if (unlikely(x == 0)) return 32; + return 31 - bsr(x); +#endif + } + + __forceinline int btc(int v, int i) { + long r = v; _bittestandcomplement(&r,i); return r; + } + + __forceinline int bts(int v, int i) { + long r = v; _bittestandset(&r,i); return r; + } + + __forceinline int btr(int v, int i) { + long r = v; _bittestandreset(&r,i); return r; + } + +#if defined(__X86_64__) + + __forceinline size_t btc(size_t v, size_t i) { + size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r; + } + + __forceinline size_t bts(size_t v, size_t i) { + __int64 r = v; _bittestandset64(&r,i); return r; + } + + __forceinline size_t btr(size_t v, size_t i) { + __int64 r = v; _bittestandreset64(&r,i); return r; + } + +#endif + + __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) { + return _InterlockedCompareExchange((volatile long*)p,v,c); + } + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#else + +#if defined(__i386__) && defined(__PIC__) + + __forceinline void __cpuid(int out[4], int op) + { + asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" + "cpuid\n\t" + "xchg{l}\t{%%}ebx, %1\n\t" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "0"(op)); + } + + __forceinline void __cpuid_count(int out[4], int op1, int op2) + { + asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" + "cpuid\n\t" + "xchg{l}\t{%%}ebx, %1\n\t" + : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3]) + : "0" (op1), "2" (op2)); + } + +#else + + __forceinline void __cpuid(int out[4], int op) { +#if defined(__ARM_NEON) + if (op == 0) { // Get CPU name + out[0] = 0x41524d20; + out[1] = 0x41524d20; + out[2] = 0x41524d20; + out[3] = 0x41524d20; + } +#else + asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op)); +#endif + } + +#if !defined(__ARM_NEON) + __forceinline void __cpuid_count(int out[4], int op1, int op2) { + asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2)); + } +#endif + +#endif + + __forceinline uint64_t read_tsc() { +#if defined(__ARM_NEON) + return 0; // FIXME(LTE): mimic rdtsc +#else + uint32_t high,low; + asm volatile ("rdtsc" : "=d"(high), "=a"(low)); + return (((uint64_t)high) << 32) + (uint64_t)low; +#endif + } + + __forceinline int bsf(int v) { +#if defined(__ARM_NEON) + return __builtin_ctz(v); +#else +#if defined(__AVX2__) + return _tzcnt_u32(v); +#else + int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; +#endif +#endif + } + +#if defined(__X86_64__) || defined(__aarch64__) + __forceinline unsigned bsf(unsigned v) + { +#if defined(__ARM_NEON) + return __builtin_ctz(v); +#else +#if defined(__AVX2__) + return _tzcnt_u32(v); +#else + unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; +#endif +#endif + } +#endif + + __forceinline size_t bsf(size_t v) { +#if defined(__AVX2__) && !defined(__aarch64__) +#if defined(__X86_64__) + return _tzcnt_u64(v); +#else + return _tzcnt_u32(v); +#endif +#elif defined(__ARM_NEON) + return __builtin_ctzl(v); +#else + size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; +#endif + } + + __forceinline int bscf(int& v) + { + int i = bsf(v); + v &= v-1; + return i; + } + +#if defined(__X86_64__) || defined(__aarch64__) + __forceinline unsigned int bscf(unsigned int& v) + { + unsigned int i = bsf(v); + v &= v-1; + return i; + } +#endif + + __forceinline size_t bscf(size_t& v) + { + size_t i = bsf(v); + v &= v-1; + return i; + } + + __forceinline int bsr(int v) { +#if defined(__AVX2__) && !defined(__aarch64__) + return 31 - _lzcnt_u32(v); +#elif defined(__ARM_NEON) + return __builtin_clz(v)^31; +#else + int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; +#endif + } + +#if defined(__X86_64__) || defined(__aarch64__) + __forceinline unsigned bsr(unsigned v) { +#if defined(__AVX2__) + return 31 - _lzcnt_u32(v); +#elif defined(__ARM_NEON) + return __builtin_clz(v)^31; +#else + unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; +#endif + } +#endif + + __forceinline size_t bsr(size_t v) { +#if defined(__AVX2__) && !defined(__aarch64__) +#if defined(__X86_64__) + return 63 - _lzcnt_u64(v); +#else + return 31 - _lzcnt_u32(v); +#endif +#elif defined(__aarch64__) + return (sizeof(v) * 8 - 1) - __builtin_clzl(v); +#else + size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; +#endif + } + + __forceinline int lzcnt(const int x) + { +#if defined(__AVX2__) && !defined(__aarch64__) + return _lzcnt_u32(x); +#else + if (unlikely(x == 0)) return 32; + return 31 - bsr(x); +#endif + } + + __forceinline size_t blsr(size_t v) { +#if defined(__AVX2__) && !defined(__aarch64__) +#if defined(__INTEL_COMPILER) + return _blsr_u64(v); +#else +#if defined(__X86_64__) + return __blsr_u64(v); +#else + return __blsr_u32(v); +#endif +#endif +#else + return v & (v-1); +#endif + } + + __forceinline int btc(int v, int i) { +#if defined(__aarch64__) + // _bittestandcomplement(long *a, long b) { + // unsigned char x = (*a >> b) & 1; + // *a = *a ^ (1 << b); + // return x; + + // We only need `*a` + return (v ^ (1 << i)); +#else + int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; +#endif + } + + __forceinline int bts(int v, int i) { +#if defined(__aarch64__) + // _bittestandset(long *a, long b) { + // unsigned char x = (*a >> b) & 1; + // *a = *a | (1 << b); + // return x; + return (v | (v << i)); +#else + int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#endif + } + + __forceinline int btr(int v, int i) { +#if defined(__aarch64__) + // _bittestandreset(long *a, long b) { + // unsigned char x = (*a >> b) & 1; + // *a = *a & ~(1 << b); + // return x; + return (v & ~(v << i)); +#else + int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#endif + } + + __forceinline size_t btc(size_t v, size_t i) { +#if defined(__aarch64__) + return (v ^ (1 << i)); +#else + size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; +#endif + } + + __forceinline size_t bts(size_t v, size_t i) { +#if defined(__aarch64__) + return (v | (v << i)); +#else + size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#endif + } + + __forceinline size_t btr(size_t v, size_t i) { +#if defined(__ARM_NEON) + return (v & ~(v << i)); +#else + size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#endif + } + + __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) { + return __sync_val_compare_and_swap(value, comparand, input); + } + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// All Platforms +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__clang__) || defined(__GNUC__) +#if !defined(_mm_undefined_ps) + __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); } +#endif +#if !defined(_mm_undefined_si128) + __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); } +#endif +#if !defined(_mm256_undefined_ps) && defined(__AVX__) + __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); } +#endif +#if !defined(_mm256_undefined_si256) && defined(__AVX__) + __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); } +#endif +#if !defined(_mm512_undefined_ps) && defined(__AVX512F__) + __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); } +#endif +#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__) + __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); } +#endif +#endif + +#if defined(__SSE4_2__) || defined(__ARM_NEON) + + __forceinline int popcnt(int in) { + return _mm_popcnt_u32(in); + } + + __forceinline unsigned popcnt(unsigned in) { + return _mm_popcnt_u32(in); + } + +#if defined(__X86_64__) || defined(__ARM_NEON) + __forceinline size_t popcnt(size_t in) { + return _mm_popcnt_u64(in); + } +#endif + +#endif + + __forceinline uint64_t rdtsc() + { + int dummy[4]; + __cpuid(dummy,0); + uint64_t clock = read_tsc(); + __cpuid(dummy,0); + return clock; + } + + __forceinline void pause_cpu(const size_t N = 8) + { + for (size_t i=0; i + +namespace embree +{ + /* opens a shared library */ + lib_t openLibrary(const std::string& file) + { + std::string fullName = file+".dll"; + FileName executable = getExecutableFileName(); + HANDLE handle = LoadLibrary((executable.path() + fullName).c_str()); + return lib_t(handle); + } + + /* returns address of a symbol from the library */ + void* getSymbol(lib_t lib, const std::string& sym) { + return reinterpret_cast(GetProcAddress(HMODULE(lib),sym.c_str())); + } + + /* closes the shared library */ + void closeLibrary(lib_t lib) { + FreeLibrary(HMODULE(lib)); + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) + +#include + +namespace embree +{ + /* opens a shared library */ + lib_t openLibrary(const std::string& file) + { +#if defined(__MACOSX__) + std::string fullName = "lib"+file+".dylib"; +#else + std::string fullName = "lib"+file+".so"; +#endif + void* lib = dlopen(fullName.c_str(), RTLD_NOW); + if (lib) return lib_t(lib); + FileName executable = getExecutableFileName(); + lib = dlopen((executable.path() + fullName).c_str(),RTLD_NOW); + if (lib == nullptr) { + const char* error = dlerror(); + if (error) { + THROW_RUNTIME_ERROR(error); + } else { + THROW_RUNTIME_ERROR("could not load library "+executable.str()); + } + } + return lib_t(lib); + } + + /* returns address of a symbol from the library */ + void* getSymbol(lib_t lib, const std::string& sym) { + return dlsym(lib,sym.c_str()); + } + + /* closes the shared library */ + void closeLibrary(lib_t lib) { + dlclose(lib); + } +} +#endif diff --git a/thirdparty/embree-aarch64/common/sys/library.h b/thirdparty/embree-aarch64/common/sys/library.h new file mode 100644 index 0000000000..c2164e9fbe --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/library.h @@ -0,0 +1,21 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" + +namespace embree +{ + /*! type for shared library */ + typedef struct opaque_lib_t* lib_t; + + /*! loads a shared library */ + lib_t openLibrary(const std::string& file); + + /*! returns address of a symbol from the library */ + void* getSymbol(lib_t lib, const std::string& sym); + + /*! unloads a shared library */ + void closeLibrary(lib_t lib); +} diff --git a/thirdparty/embree-aarch64/common/sys/mutex.cpp b/thirdparty/embree-aarch64/common/sys/mutex.cpp new file mode 100644 index 0000000000..11779bc9b9 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/mutex.cpp @@ -0,0 +1,58 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "mutex.h" +#include "regression.h" + +#if defined(__WIN32__) && !defined(PTHREADS_WIN32) + +#define WIN32_LEAN_AND_MEAN +#include + +namespace embree +{ + MutexSys::MutexSys() { mutex = new CRITICAL_SECTION; InitializeCriticalSection((CRITICAL_SECTION*)mutex); } + MutexSys::~MutexSys() { DeleteCriticalSection((CRITICAL_SECTION*)mutex); delete (CRITICAL_SECTION*)mutex; } + void MutexSys::lock() { EnterCriticalSection((CRITICAL_SECTION*)mutex); } + bool MutexSys::try_lock() { return TryEnterCriticalSection((CRITICAL_SECTION*)mutex) != 0; } + void MutexSys::unlock() { LeaveCriticalSection((CRITICAL_SECTION*)mutex); } +} +#endif + +#if defined(__UNIX__) || defined(PTHREADS_WIN32) +#include +namespace embree +{ + /*! system mutex using pthreads */ + MutexSys::MutexSys() + { + mutex = new pthread_mutex_t; + if (pthread_mutex_init((pthread_mutex_t*)mutex, nullptr) != 0) + THROW_RUNTIME_ERROR("pthread_mutex_init failed"); + } + + MutexSys::~MutexSys() + { + MAYBE_UNUSED bool ok = pthread_mutex_destroy((pthread_mutex_t*)mutex) == 0; + assert(ok); + delete (pthread_mutex_t*)mutex; + mutex = nullptr; + } + + void MutexSys::lock() + { + if (pthread_mutex_lock((pthread_mutex_t*)mutex) != 0) + THROW_RUNTIME_ERROR("pthread_mutex_lock failed"); + } + + bool MutexSys::try_lock() { + return pthread_mutex_trylock((pthread_mutex_t*)mutex) == 0; + } + + void MutexSys::unlock() + { + if (pthread_mutex_unlock((pthread_mutex_t*)mutex) != 0) + THROW_RUNTIME_ERROR("pthread_mutex_unlock failed"); + } +}; +#endif diff --git a/thirdparty/embree-aarch64/common/sys/mutex.h b/thirdparty/embree-aarch64/common/sys/mutex.h new file mode 100644 index 0000000000..1164210f23 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/mutex.h @@ -0,0 +1,98 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "intrinsics.h" +#include "atomic.h" + +namespace embree +{ + /*! system mutex */ + class MutexSys { + friend struct ConditionImplementation; + public: + MutexSys(); + ~MutexSys(); + + private: + MutexSys (const MutexSys& other) DELETED; // do not implement + MutexSys& operator= (const MutexSys& other) DELETED; // do not implement + + public: + void lock(); + bool try_lock(); + void unlock(); + + protected: + void* mutex; + }; + + /*! spinning mutex */ + class SpinLock + { + public: + + SpinLock () + : flag(false) {} + + __forceinline bool isLocked() { + return flag.load(); + } + + __forceinline void lock() + { + while (true) + { + while (flag.load()) + { + _mm_pause(); + _mm_pause(); + } + + bool expected = false; + if (flag.compare_exchange_strong(expected,true,std::memory_order_acquire)) + break; + } + } + + __forceinline bool try_lock() + { + bool expected = false; + if (flag.load() != expected) { + return false; + } + return flag.compare_exchange_strong(expected,true,std::memory_order_acquire); + } + + __forceinline void unlock() { + flag.store(false,std::memory_order_release); + } + + __forceinline void wait_until_unlocked() + { + while(flag.load()) + { + _mm_pause(); + _mm_pause(); + } + } + + public: + atomic flag; + }; + + /*! safe mutex lock and unlock helper */ + template class Lock { + public: + Lock (Mutex& mutex) : mutex(mutex), locked(true) { mutex.lock(); } + Lock (Mutex& mutex, bool locked) : mutex(mutex), locked(locked) {} + ~Lock() { if (locked) mutex.unlock(); } + __forceinline void lock() { assert(!locked); locked = true; mutex.lock(); } + __forceinline bool isLocked() const { return locked; } + protected: + Mutex& mutex; + bool locked; + }; +} diff --git a/thirdparty/embree-aarch64/common/sys/platform.h b/thirdparty/embree-aarch64/common/sys/platform.h new file mode 100644 index 0000000000..737f14aa6e --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/platform.h @@ -0,0 +1,387 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define _CRT_SECURE_NO_WARNINGS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////// +/// detect platform +//////////////////////////////////////////////////////////////////////////////// + +/* detect 32 or 64 platform */ +#if defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) +#define __X86_64__ +#endif + +/* detect Linux platform */ +#if defined(linux) || defined(__linux__) || defined(__LINUX__) +# if !defined(__LINUX__) +# define __LINUX__ +# endif +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* detect FreeBSD platform */ +#if defined(__FreeBSD__) || defined(__FREEBSD__) +# if !defined(__FREEBSD__) +# define __FREEBSD__ +# endif +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* detect Windows 95/98/NT/2000/XP/Vista/7/8/10 platform */ +#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)) && !defined(__CYGWIN__) +# if !defined(__WIN32__) +# define __WIN32__ +# endif +#endif + +/* detect Cygwin platform */ +#if defined(__CYGWIN__) +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* detect MAC OS X platform */ +#if defined(__APPLE__) || defined(MACOSX) || defined(__MACOSX__) +# if !defined(__MACOSX__) +# define __MACOSX__ +# endif +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* try to detect other Unix systems */ +#if defined(__unix__) || defined (unix) || defined(__unix) || defined(_unix) +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Macros +//////////////////////////////////////////////////////////////////////////////// + +#ifdef __WIN32__ +#define dll_export __declspec(dllexport) +#define dll_import __declspec(dllimport) +#else +#define dll_export __attribute__ ((visibility ("default"))) +#define dll_import +#endif + +#ifdef __WIN32__ +#if !defined(__noinline) +#define __noinline __declspec(noinline) +#endif +//#define __forceinline __forceinline +//#define __restrict __restrict +#if defined(__INTEL_COMPILER) +#define __restrict__ __restrict +#else +#define __restrict__ //__restrict // causes issues with MSVC +#endif +#if !defined(__thread) +// NOTE: Require `-fms-extensions` for clang +#define __thread __declspec(thread) +#endif +#if !defined(__aligned) +#if defined(__MINGW32__) +#define __aligned(...) __attribute__((aligned(__VA_ARGS__))) +#else +#define __aligned(...) __declspec(align(__VA_ARGS__)) +#endif +#endif +//#define __FUNCTION__ __FUNCTION__ +#define debugbreak() __debugbreak() + +#else +#if !defined(__noinline) +#define __noinline __attribute__((noinline)) +#endif +#if !defined(__forceinline) +#define __forceinline inline __attribute__((always_inline)) +#endif +//#define __restrict __restrict +//#define __thread __thread +#if !defined(__aligned) +#define __aligned(...) __attribute__((aligned(__VA_ARGS__))) +#endif +#if !defined(__FUNCTION__) +#define __FUNCTION__ __PRETTY_FUNCTION__ +#endif +#define debugbreak() asm ("int $3") +#endif + +#if defined(__clang__) || defined(__GNUC__) + #define MAYBE_UNUSED __attribute__((unused)) +#else + #define MAYBE_UNUSED +#endif + +#if defined(_MSC_VER) && (_MSC_VER < 1900) // before VS2015 deleted functions are not supported properly + #define DELETED +#else + #define DELETED = delete +#endif + +// -- GODOT start -- +#ifndef likely +// -- GODOT end -- +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#define likely(expr) (expr) +#define unlikely(expr) (expr) +#else +#define likely(expr) __builtin_expect((bool)(expr),true ) +#define unlikely(expr) __builtin_expect((bool)(expr),false) +#endif +// -- GODOT start -- +#endif +// -- GODOT end -- + +//////////////////////////////////////////////////////////////////////////////// +/// Error handling and debugging +//////////////////////////////////////////////////////////////////////////////// + +/* debug printing macros */ +#define STRING(x) #x +#define TOSTRING(x) STRING(x) +#define PING embree_cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << embree_endl +#define PRINT(x) embree_cout << STRING(x) << " = " << (x) << embree_endl +#define PRINT2(x,y) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << embree_endl +#define PRINT3(x,y,z) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << embree_endl +#define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl + +#if defined(DEBUG) // only report file and line in debug mode + // -- GODOT start -- + // #define THROW_RUNTIME_ERROR(str) + // throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + #define THROW_RUNTIME_ERROR(str) \ + printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); + // -- GODOT end -- +#else + // -- GODOT start -- + // #define THROW_RUNTIME_ERROR(str) + // throw std::runtime_error(str); + #define THROW_RUNTIME_ERROR(str) \ + abort(); + // -- GODOT end -- +#endif + +#define FATAL(x) THROW_RUNTIME_ERROR(x) +#define WARNING(x) { std::cerr << "Warning: " << x << embree_endl << std::flush; } + +#define NOT_IMPLEMENTED FATAL(std::string(__FUNCTION__) + " not implemented") + +//////////////////////////////////////////////////////////////////////////////// +/// Basic types +//////////////////////////////////////////////////////////////////////////////// + +/* default floating-point type */ +namespace embree { + typedef float real; +} + +/* windows does not have ssize_t */ +#if defined(__WIN32__) +#if defined(__X86_64__) || defined(__aarch64__) +typedef int64_t ssize_t; +#else +typedef int32_t ssize_t; +#endif +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Basic utility functions +//////////////////////////////////////////////////////////////////////////////// + +__forceinline std::string toString(long long value) { + return std::to_string(value); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Disable some compiler warnings +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__INTEL_COMPILER) +//#pragma warning(disable:265 ) // floating-point operation result is out of range +//#pragma warning(disable:383 ) // value copied to temporary, reference to temporary used +//#pragma warning(disable:869 ) // parameter was never referenced +//#pragma warning(disable:981 ) // operands are evaluated in unspecified order +//#pragma warning(disable:1418) // external function definition with no prior declaration +//#pragma warning(disable:1419) // external declaration in primary source file +//#pragma warning(disable:1572) // floating-point equality and inequality comparisons are unreliable +//#pragma warning(disable:94 ) // the size of an array must be greater than zero +//#pragma warning(disable:1599) // declaration hides parameter +//#pragma warning(disable:424 ) // extra ";" ignored +#pragma warning(disable:2196) // routine is both "inline" and "noinline" +//#pragma warning(disable:177 ) // label was declared but never referenced +//#pragma warning(disable:114 ) // function was referenced but not defined +//#pragma warning(disable:819 ) // template nesting depth does not match the previous declaration of function +#pragma warning(disable:15335) // was not vectorized: vectorization possible but seems inefficient +#endif + +#if defined(_MSC_VER) +//#pragma warning(disable:4200) // nonstandard extension used : zero-sized array in struct/union +#pragma warning(disable:4800) // forcing value to bool 'true' or 'false' (performance warning) +//#pragma warning(disable:4267) // '=' : conversion from 'size_t' to 'unsigned long', possible loss of data +#pragma warning(disable:4244) // 'argument' : conversion from 'ssize_t' to 'unsigned int', possible loss of data +//#pragma warning(disable:4355) // 'this' : used in base member initializer list +//#pragma warning(disable:391 ) // '<=' : signed / unsigned mismatch +//#pragma warning(disable:4018) // '<' : signed / unsigned mismatch +//#pragma warning(disable:4305) // 'initializing' : truncation from 'double' to 'float' +//#pragma warning(disable:4068) // unknown pragma +//#pragma warning(disable:4146) // unary minus operator applied to unsigned type, result still unsigned +//#pragma warning(disable:4838) // conversion from 'unsigned int' to 'const int' requires a narrowing conversion) +//#pragma warning(disable:4227) // anachronism used : qualifiers on reference are ignored +#pragma warning(disable:4503) // decorated name length exceeded, name was truncated +#pragma warning(disable:4180) // qualifier applied to function type has no meaning; ignored +#pragma warning(disable:4258) // definition from the for loop is ignored; the definition from the enclosing scope is used + +# if _MSC_VER < 1910 // prior to Visual studio 2017 (V141) +# pragma warning(disable:4101) // warning C4101: 'x': unreferenced local variable // a compiler bug issues wrong warnings +# pragma warning(disable:4789) // buffer '' of size 8 bytes will be overrun; 32 bytes will be written starting at offset 0 +# endif + +#endif + +#if defined(__clang__) && !defined(__INTEL_COMPILER) +//#pragma clang diagnostic ignored "-Wunknown-pragmas" +//#pragma clang diagnostic ignored "-Wunused-variable" +//#pragma clang diagnostic ignored "-Wreorder" +//#pragma clang diagnostic ignored "-Wmicrosoft" +//#pragma clang diagnostic ignored "-Wunused-private-field" +//#pragma clang diagnostic ignored "-Wunused-local-typedef" +//#pragma clang diagnostic ignored "-Wunused-function" +//#pragma clang diagnostic ignored "-Wnarrowing" +//#pragma clang diagnostic ignored "-Wc++11-narrowing" +//#pragma clang diagnostic ignored "-Wdeprecated-register" +//#pragma clang diagnostic ignored "-Wdeprecated-declarations" +#endif + +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) +#pragma GCC diagnostic ignored "-Wpragmas" +//#pragma GCC diagnostic ignored "-Wnarrowing" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +//#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +//#pragma GCC diagnostic ignored "-Warray-bounds" +#pragma GCC diagnostic ignored "-Wattributes" +#pragma GCC diagnostic ignored "-Wmisleading-indentation" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wparentheses" +#endif + +#if defined(__clang__) && defined(__WIN32__) +#pragma clang diagnostic ignored "-Wunused-parameter" +#pragma clang diagnostic ignored "-Wmicrosoft-cast" +#pragma clang diagnostic ignored "-Wmicrosoft-enum-value" +#pragma clang diagnostic ignored "-Wmicrosoft-include" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunknown-pragmas" +#endif + +/* disabling deprecated warning, please use only where use of deprecated Embree API functions is desired */ +#if defined(__WIN32__) && defined(__INTEL_COMPILER) +#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 1478)) // warning: function was declared deprecated +#define ENABLE_DEPRECATED_WARNING __pragma(warning (enable: 1478)) // warning: function was declared deprecated +#elif defined(__INTEL_COMPILER) +#define DISABLE_DEPRECATED_WARNING _Pragma("warning (disable: 1478)") // warning: function was declared deprecated +#define ENABLE_DEPRECATED_WARNING _Pragma("warning (enable : 1478)") // warning: function was declared deprecated +#elif defined(__clang__) +#define DISABLE_DEPRECATED_WARNING _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#define ENABLE_DEPRECATED_WARNING _Pragma("clang diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#elif defined(__GNUC__) +#define DISABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#define ENABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#elif defined(_MSC_VER) +#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 4996)) // warning: function was declared deprecated +#define ENABLE_DEPRECATED_WARNING __pragma(warning (enable : 4996)) // warning: function was declared deprecated +#endif + +/* embree output stream */ +#define embree_ostream std::ostream& +#define embree_cout std::cout +#define embree_cout_uniform std::cout +#define embree_endl std::endl + +//////////////////////////////////////////////////////////////////////////////// +/// Some macros for static profiling +//////////////////////////////////////////////////////////////////////////////// + +#if defined (__GNUC__) +#define IACA_SSC_MARK( MARK_ID ) \ +__asm__ __volatile__ ( \ + "\n\t movl $"#MARK_ID", %%ebx" \ + "\n\t .byte 0x64, 0x67, 0x90" \ + : : : "memory" ); + +#define IACA_UD_BYTES __asm__ __volatile__ ("\n\t .byte 0x0F, 0x0B"); + +#else +#define IACA_UD_BYTES {__asm _emit 0x0F \ + __asm _emit 0x0B} + +#define IACA_SSC_MARK(x) {__asm mov ebx, x\ + __asm _emit 0x64 \ + __asm _emit 0x67 \ + __asm _emit 0x90 } + +#define IACA_VC64_START __writegsbyte(111, 111); +#define IACA_VC64_END __writegsbyte(222, 222); + +#endif + +#define IACA_START {IACA_UD_BYTES \ + IACA_SSC_MARK(111)} +#define IACA_END {IACA_SSC_MARK(222) \ + IACA_UD_BYTES} + +namespace embree +{ + template + struct OnScopeExitHelper + { + OnScopeExitHelper (const Closure f) : active(true), f(f) {} + ~OnScopeExitHelper() { if (active) f(); } + void deactivate() { active = false; } + bool active; + const Closure f; + }; + + template + OnScopeExitHelper OnScopeExit(const Closure f) { + return OnScopeExitHelper(f); + } + +#define STRING_JOIN2(arg1, arg2) DO_STRING_JOIN2(arg1, arg2) +#define DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2 +#define ON_SCOPE_EXIT(code) \ + auto STRING_JOIN2(on_scope_exit_, __LINE__) = OnScopeExit([&](){code;}) + + template + std::unique_ptr make_unique(Ty* ptr) { + return std::unique_ptr(ptr); + } + +} diff --git a/thirdparty/embree-aarch64/common/sys/ref.h b/thirdparty/embree-aarch64/common/sys/ref.h new file mode 100644 index 0000000000..24648e6234 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/ref.h @@ -0,0 +1,122 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "atomic.h" + +namespace embree +{ + struct NullTy { + }; + + extern MAYBE_UNUSED NullTy null; + + class RefCount + { + public: + RefCount(int val = 0) : refCounter(val) {} + virtual ~RefCount() {}; + + virtual RefCount* refInc() { refCounter.fetch_add(1); return this; } + virtual void refDec() { if (refCounter.fetch_add(-1) == 1) delete this; } + private: + std::atomic refCounter; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Reference to single object + //////////////////////////////////////////////////////////////////////////////// + + template + class Ref + { + public: + Type* ptr; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Ref() : ptr(nullptr) {} + __forceinline Ref(NullTy) : ptr(nullptr) {} + __forceinline Ref(const Ref& input) : ptr(input.ptr) { if (ptr) ptr->refInc(); } + __forceinline Ref(Ref&& input) : ptr(input.ptr) { input.ptr = nullptr; } + + __forceinline Ref(Type* const input) : ptr(input) + { + if (ptr) + ptr->refInc(); + } + + __forceinline ~Ref() + { + if (ptr) + ptr->refDec(); + } + + __forceinline Ref& operator =(const Ref& input) + { + if (input.ptr) + input.ptr->refInc(); + if (ptr) + ptr->refDec(); + ptr = input.ptr; + return *this; + } + + __forceinline Ref& operator =(Ref&& input) + { + if (ptr) + ptr->refDec(); + ptr = input.ptr; + input.ptr = nullptr; + return *this; + } + + __forceinline Ref& operator =(Type* const input) + { + if (input) + input->refInc(); + if (ptr) + ptr->refDec(); + ptr = input; + return *this; + } + + __forceinline Ref& operator =(NullTy) + { + if (ptr) + ptr->refDec(); + ptr = nullptr; + return *this; + } + + __forceinline operator bool() const { return ptr != nullptr; } + + __forceinline const Type& operator *() const { return *ptr; } + __forceinline Type& operator *() { return *ptr; } + __forceinline const Type* operator ->() const { return ptr; } + __forceinline Type* operator ->() { return ptr; } + + template + __forceinline Ref cast() { return Ref(static_cast(ptr)); } + template + __forceinline const Ref cast() const { return Ref(static_cast(ptr)); } + + template + __forceinline Ref dynamicCast() { return Ref(dynamic_cast(ptr)); } + template + __forceinline const Ref dynamicCast() const { return Ref(dynamic_cast(ptr)); } + }; + + template __forceinline bool operator < (const Ref& a, const Ref& b) { return a.ptr < b.ptr; } + + template __forceinline bool operator ==(const Ref& a, NullTy ) { return a.ptr == nullptr; } + template __forceinline bool operator ==(NullTy , const Ref& b) { return nullptr == b.ptr; } + template __forceinline bool operator ==(const Ref& a, const Ref& b) { return a.ptr == b.ptr; } + + template __forceinline bool operator !=(const Ref& a, NullTy ) { return a.ptr != nullptr; } + template __forceinline bool operator !=(NullTy , const Ref& b) { return nullptr != b.ptr; } + template __forceinline bool operator !=(const Ref& a, const Ref& b) { return a.ptr != b.ptr; } +} diff --git a/thirdparty/embree-aarch64/common/sys/regression.cpp b/thirdparty/embree-aarch64/common/sys/regression.cpp new file mode 100644 index 0000000000..d95ff8dfe0 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/regression.cpp @@ -0,0 +1,30 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "regression.h" + +namespace embree +{ + /* registerRegressionTest is invoked from static initializers, thus + * we cannot have the regression_tests variable as global static + * variable due to issues with static variable initialization + * order. */ + std::vector& get_regression_tests() + { + static std::vector regression_tests; + return regression_tests; + } + + void registerRegressionTest(RegressionTest* test) + { + get_regression_tests().push_back(test); + } + + RegressionTest* getRegressionTest(size_t index) + { + if (index >= get_regression_tests().size()) + return nullptr; + + return get_regression_tests()[index]; + } +} diff --git a/thirdparty/embree-aarch64/common/sys/regression.h b/thirdparty/embree-aarch64/common/sys/regression.h new file mode 100644 index 0000000000..632f8d92cf --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/regression.h @@ -0,0 +1,25 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" + +#include + +namespace embree +{ + /*! virtual interface for all regression tests */ + struct RegressionTest + { + RegressionTest (std::string name) : name(name) {} + virtual bool run() = 0; + std::string name; + }; + + /*! registers a regression test */ + void registerRegressionTest(RegressionTest* test); + + /*! run all regression tests */ + RegressionTest* getRegressionTest(size_t index); +} diff --git a/thirdparty/embree-aarch64/common/sys/string.cpp b/thirdparty/embree-aarch64/common/sys/string.cpp new file mode 100644 index 0000000000..931244383e --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/string.cpp @@ -0,0 +1,42 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "string.h" + +#include +#include + +namespace embree +{ + char to_lower(char c) { return char(tolower(int(c))); } + char to_upper(char c) { return char(toupper(int(c))); } + std::string toLowerCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_lower); return dst; } + std::string toUpperCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_upper); return dst; } + + Vec2f string_to_Vec2f ( std::string str ) + { + size_t next = 0; + const float x = std::stof(str,&next); str = str.substr(next+1); + const float y = std::stof(str,&next); + return Vec2f(x,y); + } + + Vec3f string_to_Vec3f ( std::string str ) + { + size_t next = 0; + const float x = std::stof(str,&next); str = str.substr(next+1); + const float y = std::stof(str,&next); str = str.substr(next+1); + const float z = std::stof(str,&next); + return Vec3f(x,y,z); + } + + Vec4f string_to_Vec4f ( std::string str ) + { + size_t next = 0; + const float x = std::stof(str,&next); str = str.substr(next+1); + const float y = std::stof(str,&next); str = str.substr(next+1); + const float z = std::stof(str,&next); str = str.substr(next+1); + const float w = std::stof(str,&next); + return Vec4f(x,y,z,w); + } +} diff --git a/thirdparty/embree-aarch64/common/sys/string.h b/thirdparty/embree-aarch64/common/sys/string.h new file mode 100644 index 0000000000..2e9b0f88c3 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/string.h @@ -0,0 +1,37 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "../math/vec2.h" +#include "../math/vec3.h" +#include "../math/vec4.h" + +namespace embree +{ + class IOStreamStateRestorer + { + public: + IOStreamStateRestorer(std::ostream& iostream) + : iostream(iostream), flags(iostream.flags()), precision(iostream.precision()) { + } + + ~IOStreamStateRestorer() { + iostream.flags(flags); + iostream.precision(precision); + } + + private: + std::ostream& iostream; + std::ios::fmtflags flags; + std::streamsize precision; + }; + + std::string toLowerCase(const std::string& s); + std::string toUpperCase(const std::string& s); + + Vec2f string_to_Vec2f ( std::string str ); + Vec3f string_to_Vec3f ( std::string str ); + Vec4f string_to_Vec4f ( std::string str ); +} diff --git a/thirdparty/embree-aarch64/common/sys/sysinfo.cpp b/thirdparty/embree-aarch64/common/sys/sysinfo.cpp new file mode 100644 index 0000000000..1d11436770 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/sysinfo.cpp @@ -0,0 +1,676 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "sysinfo.h" +#include "intrinsics.h" +#include "string.h" +#include "ref.h" +#if defined(__FREEBSD__) +#include +#include +typedef cpuset_t cpu_set_t; +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// All Platforms +//////////////////////////////////////////////////////////////////////////////// + +namespace embree +{ + NullTy null; + + std::string getPlatformName() + { +#if defined(__LINUX__) && defined(__ANDROID__) && defined(__aarch64__) && defined(__ARM_NEON) + return "Android Linux (aarch64 / arm64)"; +#elif defined(__LINUX__) && defined(__ANDROID__) && defined(__X86_64__) + return "Android Linux (x64)"; +#elif defined(__LINUX__) && defined(__ANDROID__) && (defined(_X86_) || defined(__X86__) || defined(_M_IX86)) + return "Android Linux (x86)"; +#elif defined(__LINUX__) && !defined(__X86_64__) + return "Linux (32bit)"; +#elif defined(__LINUX__) && defined(__X86_64__) + return "Linux (64bit)"; +#elif defined(__FREEBSD__) && !defined(__X86_64__) + return "FreeBSD (32bit)"; +#elif defined(__FREEBSD__) && defined(__X86_64__) + return "FreeBSD (64bit)"; +#elif defined(__CYGWIN__) && !defined(__X86_64__) + return "Cygwin (32bit)"; +#elif defined(__CYGWIN__) && defined(__X86_64__) + return "Cygwin (64bit)"; +#elif defined(__WIN32__) && !defined(__X86_64__) + return "Windows (32bit)"; +#elif defined(__WIN32__) && defined(__X86_64__) + return "Windows (64bit)"; +#elif defined(TARGET_IPHONE_SIMULATOR) && defined(__X86_64__) + return "iOS Simulator (x64)"; +#elif defined(TARGET_OS_IPHONE) && defined(__aarch64__) && defined(__ARM_NEON) + return "iOS (aarch64 / arm64)"; +#elif defined(__MACOSX__) && !defined(__X86_64__) + return "Mac OS X (32bit)"; +#elif defined(__MACOSX__) && defined(__X86_64__) + return "Mac OS X (64bit)"; +#elif defined(__UNIX__) && defined(__aarch64__) + return "Unix (aarch64)"; +#elif defined(__UNIX__) && !defined(__X86_64__) + return "Unix (32bit)"; +#elif defined(__UNIX__) && defined(__X86_64__) + return "Unix (64bit)"; +#else + return "Unknown"; +#endif + } + + std::string getCompilerName() + { +#if defined(__INTEL_COMPILER) + int icc_mayor = __INTEL_COMPILER / 100 % 100; + int icc_minor = __INTEL_COMPILER % 100; + std::string version = "Intel Compiler "; + version += toString(icc_mayor); + version += "." + toString(icc_minor); +#if defined(__INTEL_COMPILER_UPDATE) + version += "." + toString(__INTEL_COMPILER_UPDATE); +#endif + return version; +#elif defined(__clang__) + return "CLANG " __clang_version__; +#elif defined (__GNUC__) + return "GCC " __VERSION__; +#elif defined(_MSC_VER) + std::string version = toString(_MSC_FULL_VER); + version.insert(4,"."); + version.insert(9,"."); + version.insert(2,"."); + return "Visual C++ Compiler " + version; +#else + return "Unknown Compiler"; +#endif + } + + std::string getCPUVendor() + { + int cpuinfo[4]; + __cpuid (cpuinfo, 0); + int name[4]; + name[0] = cpuinfo[1]; + name[1] = cpuinfo[3]; + name[2] = cpuinfo[2]; + name[3] = 0; + return (char*)name; + } + + CPU getCPUModel() + { + if (getCPUVendor() != "GenuineIntel") + return CPU::UNKNOWN; + + int out[4]; + __cpuid(out, 0); + if (out[0] < 1) return CPU::UNKNOWN; + __cpuid(out, 1); + + /* please see CPUID documentation for these formulas */ + uint32_t family_ID = (out[0] >> 8) & 0x0F; + uint32_t extended_family_ID = (out[0] >> 20) & 0xFF; + + uint32_t model_ID = (out[0] >> 4) & 0x0F; + uint32_t extended_model_ID = (out[0] >> 16) & 0x0F; + + uint32_t DisplayFamily = family_ID; + if (family_ID == 0x0F) + DisplayFamily += extended_family_ID; + + uint32_t DisplayModel = model_ID; + if (family_ID == 0x06 || family_ID == 0x0F) + DisplayModel += extended_model_ID << 4; + + uint32_t DisplayFamily_DisplayModel = (DisplayFamily << 8) + (DisplayModel << 0); + + // Data from Intel® 64 and IA-32 Architectures, Volume 4, Chapter 2, Table 2-1 (CPUID Signature Values of DisplayFamily_DisplayModel) + if (DisplayFamily_DisplayModel == 0x067D) return CPU::CORE_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x067E) return CPU::CORE_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x068C) return CPU::CORE_TIGER_LAKE; + if (DisplayFamily_DisplayModel == 0x06A5) return CPU::CORE_COMET_LAKE; + if (DisplayFamily_DisplayModel == 0x06A6) return CPU::CORE_COMET_LAKE; + if (DisplayFamily_DisplayModel == 0x0666) return CPU::CORE_CANNON_LAKE; + if (DisplayFamily_DisplayModel == 0x068E) return CPU::CORE_KABY_LAKE; + if (DisplayFamily_DisplayModel == 0x069E) return CPU::CORE_KABY_LAKE; + if (DisplayFamily_DisplayModel == 0x066A) return CPU::XEON_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x066C) return CPU::XEON_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x0655) return CPU::XEON_SKY_LAKE; + if (DisplayFamily_DisplayModel == 0x064E) return CPU::CORE_SKY_LAKE; + if (DisplayFamily_DisplayModel == 0x065E) return CPU::CORE_SKY_LAKE; + if (DisplayFamily_DisplayModel == 0x0656) return CPU::XEON_BROADWELL; + if (DisplayFamily_DisplayModel == 0x064F) return CPU::XEON_BROADWELL; + if (DisplayFamily_DisplayModel == 0x0647) return CPU::CORE_BROADWELL; + if (DisplayFamily_DisplayModel == 0x063D) return CPU::CORE_BROADWELL; + if (DisplayFamily_DisplayModel == 0x063F) return CPU::XEON_HASWELL; + if (DisplayFamily_DisplayModel == 0x063C) return CPU::CORE_HASWELL; + if (DisplayFamily_DisplayModel == 0x0645) return CPU::CORE_HASWELL; + if (DisplayFamily_DisplayModel == 0x0646) return CPU::CORE_HASWELL; + if (DisplayFamily_DisplayModel == 0x063E) return CPU::XEON_IVY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x063A) return CPU::CORE_IVY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062D) return CPU::SANDY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062F) return CPU::SANDY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062A) return CPU::SANDY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062E) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x0625) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x062C) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061E) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061F) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061A) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061D) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x0617) return CPU::CORE2; + if (DisplayFamily_DisplayModel == 0x060F) return CPU::CORE2; + if (DisplayFamily_DisplayModel == 0x060E) return CPU::CORE1; + + if (DisplayFamily_DisplayModel == 0x0685) return CPU::XEON_PHI_KNIGHTS_MILL; + if (DisplayFamily_DisplayModel == 0x0657) return CPU::XEON_PHI_KNIGHTS_LANDING; + + return CPU::UNKNOWN; + } + + std::string stringOfCPUModel(CPU model) + { + switch (model) { + case CPU::XEON_ICE_LAKE : return "Xeon Ice Lake"; + case CPU::CORE_ICE_LAKE : return "Core Ice Lake"; + case CPU::CORE_TIGER_LAKE : return "Core Tiger Lake"; + case CPU::CORE_COMET_LAKE : return "Core Comet Lake"; + case CPU::CORE_CANNON_LAKE : return "Core Cannon Lake"; + case CPU::CORE_KABY_LAKE : return "Core Kaby Lake"; + case CPU::XEON_SKY_LAKE : return "Xeon Sky Lake"; + case CPU::CORE_SKY_LAKE : return "Core Sky Lake"; + case CPU::XEON_PHI_KNIGHTS_MILL : return "Xeon Phi Knights Mill"; + case CPU::XEON_PHI_KNIGHTS_LANDING: return "Xeon Phi Knights Landing"; + case CPU::XEON_BROADWELL : return "Xeon Broadwell"; + case CPU::CORE_BROADWELL : return "Core Broadwell"; + case CPU::XEON_HASWELL : return "Xeon Haswell"; + case CPU::CORE_HASWELL : return "Core Haswell"; + case CPU::XEON_IVY_BRIDGE : return "Xeon Ivy Bridge"; + case CPU::CORE_IVY_BRIDGE : return "Core Ivy Bridge"; + case CPU::SANDY_BRIDGE : return "Sandy Bridge"; + case CPU::NEHALEM : return "Nehalem"; + case CPU::CORE2 : return "Core2"; + case CPU::CORE1 : return "Core"; + case CPU::ARM : return "Arm"; + case CPU::UNKNOWN : return "Unknown CPU"; + } + return "Unknown CPU (error)"; + } + +#if !defined(__ARM_NEON) + /* constants to access destination registers of CPUID instruction */ + static const int EAX = 0; + static const int EBX = 1; + static const int ECX = 2; + static const int EDX = 3; + + /* cpuid[eax=1].ecx */ + static const int CPU_FEATURE_BIT_SSE3 = 1 << 0; + static const int CPU_FEATURE_BIT_SSSE3 = 1 << 9; + static const int CPU_FEATURE_BIT_FMA3 = 1 << 12; + static const int CPU_FEATURE_BIT_SSE4_1 = 1 << 19; + static const int CPU_FEATURE_BIT_SSE4_2 = 1 << 20; + //static const int CPU_FEATURE_BIT_MOVBE = 1 << 22; + static const int CPU_FEATURE_BIT_POPCNT = 1 << 23; + //static const int CPU_FEATURE_BIT_XSAVE = 1 << 26; + static const int CPU_FEATURE_BIT_OXSAVE = 1 << 27; + static const int CPU_FEATURE_BIT_AVX = 1 << 28; + static const int CPU_FEATURE_BIT_F16C = 1 << 29; + static const int CPU_FEATURE_BIT_RDRAND = 1 << 30; + + /* cpuid[eax=1].edx */ + static const int CPU_FEATURE_BIT_SSE = 1 << 25; + static const int CPU_FEATURE_BIT_SSE2 = 1 << 26; + + /* cpuid[eax=0x80000001].ecx */ + static const int CPU_FEATURE_BIT_LZCNT = 1 << 5; + + /* cpuid[eax=7,ecx=0].ebx */ + static const int CPU_FEATURE_BIT_BMI1 = 1 << 3; + static const int CPU_FEATURE_BIT_AVX2 = 1 << 5; + static const int CPU_FEATURE_BIT_BMI2 = 1 << 8; + static const int CPU_FEATURE_BIT_AVX512F = 1 << 16; // AVX512F (foundation) + static const int CPU_FEATURE_BIT_AVX512DQ = 1 << 17; // AVX512DQ (doubleword and quadword instructions) + static const int CPU_FEATURE_BIT_AVX512PF = 1 << 26; // AVX512PF (prefetch gather/scatter instructions) + static const int CPU_FEATURE_BIT_AVX512ER = 1 << 27; // AVX512ER (exponential and reciprocal instructions) + static const int CPU_FEATURE_BIT_AVX512CD = 1 << 28; // AVX512CD (conflict detection instructions) + static const int CPU_FEATURE_BIT_AVX512BW = 1 << 30; // AVX512BW (byte and word instructions) + static const int CPU_FEATURE_BIT_AVX512VL = 1 << 31; // AVX512VL (vector length extensions) + static const int CPU_FEATURE_BIT_AVX512IFMA = 1 << 21; // AVX512IFMA (integer fused multiple-add instructions) + + /* cpuid[eax=7,ecx=0].ecx */ + static const int CPU_FEATURE_BIT_AVX512VBMI = 1 << 1; // AVX512VBMI (vector bit manipulation instructions) +#endif + +#if !defined(__ARM_NEON) + __noinline int64_t get_xcr0() + { + // https://github.com/opencv/opencv/blob/master/modules/core/src/system.cpp#L466 +#if defined (__WIN32__) && defined(_XCR_XFEATURE_ENABLED_MASK) + int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32 + xcr0 = _xgetbv(0); + return xcr0; +#else + int xcr0 = 0; + __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); + return xcr0; +#endif + } +#endif + + int getCPUFeatures() + { +#if defined(__ARM_NEON) + int cpu_features = CPU_FEATURE_NEON|CPU_FEATURE_SSE|CPU_FEATURE_SSE2; +#if defined(NEON_AVX2_EMULATION) + cpu_features |= CPU_FEATURE_SSE3|CPU_FEATURE_SSSE3|CPU_FEATURE_SSE42; + cpu_features |= CPU_FEATURE_XMM_ENABLED; + cpu_features |= CPU_FEATURE_YMM_ENABLED; + cpu_features |= CPU_FEATURE_SSE41 | CPU_FEATURE_RDRAND | CPU_FEATURE_F16C; + cpu_features |= CPU_FEATURE_POPCNT; + cpu_features |= CPU_FEATURE_AVX; + cpu_features |= CPU_FEATURE_AVX2; + cpu_features |= CPU_FEATURE_FMA3; + cpu_features |= CPU_FEATURE_LZCNT; + cpu_features |= CPU_FEATURE_BMI1; + cpu_features |= CPU_FEATURE_BMI2; + cpu_features |= CPU_FEATURE_NEON_2X; + + + +#endif + return cpu_features; + +#else + /* cache CPU features access */ + static int cpu_features = 0; + if (cpu_features) + return cpu_features; + + /* get number of CPUID leaves */ + int cpuid_leaf0[4]; + __cpuid(cpuid_leaf0, 0x00000000); + unsigned nIds = cpuid_leaf0[EAX]; + + /* get number of extended CPUID leaves */ + int cpuid_leafe[4]; + __cpuid(cpuid_leafe, 0x80000000); + unsigned nExIds = cpuid_leafe[EAX]; + + /* get CPUID leaves for EAX = 1,7, and 0x80000001 */ + int cpuid_leaf_1[4] = { 0,0,0,0 }; + int cpuid_leaf_7[4] = { 0,0,0,0 }; + int cpuid_leaf_e1[4] = { 0,0,0,0 }; + if (nIds >= 1) __cpuid (cpuid_leaf_1,0x00000001); +#if _WIN32 +#if _MSC_VER && (_MSC_FULL_VER < 160040219) +#else + if (nIds >= 7) __cpuidex(cpuid_leaf_7,0x00000007,0); +#endif +#else + if (nIds >= 7) __cpuid_count(cpuid_leaf_7,0x00000007,0); +#endif + if (nExIds >= 0x80000001) __cpuid(cpuid_leaf_e1,0x80000001); + + /* detect if OS saves XMM, YMM, and ZMM states */ + bool xmm_enabled = true; + bool ymm_enabled = false; + bool zmm_enabled = false; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_OXSAVE) { + int64_t xcr0 = get_xcr0(); + xmm_enabled = ((xcr0 & 0x02) == 0x02); /* checks if xmm are enabled in XCR0 */ + ymm_enabled = xmm_enabled && ((xcr0 & 0x04) == 0x04); /* checks if ymm state are enabled in XCR0 */ + zmm_enabled = ymm_enabled && ((xcr0 & 0xE0) == 0xE0); /* checks if OPMASK state, upper 256-bit of ZMM0-ZMM15 and ZMM16-ZMM31 state are enabled in XCR0 */ + } + if (xmm_enabled) cpu_features |= CPU_FEATURE_XMM_ENABLED; + if (ymm_enabled) cpu_features |= CPU_FEATURE_YMM_ENABLED; + if (zmm_enabled) cpu_features |= CPU_FEATURE_ZMM_ENABLED; + + if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE ) cpu_features |= CPU_FEATURE_SSE; + if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE2 ) cpu_features |= CPU_FEATURE_SSE2; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE3 ) cpu_features |= CPU_FEATURE_SSE3; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSSE3 ) cpu_features |= CPU_FEATURE_SSSE3; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_1) cpu_features |= CPU_FEATURE_SSE41; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_2) cpu_features |= CPU_FEATURE_SSE42; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_POPCNT) cpu_features |= CPU_FEATURE_POPCNT; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_AVX ) cpu_features |= CPU_FEATURE_AVX; + + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_F16C ) cpu_features |= CPU_FEATURE_F16C; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_RDRAND) cpu_features |= CPU_FEATURE_RDRAND; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX2 ) cpu_features |= CPU_FEATURE_AVX2; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_FMA3 ) cpu_features |= CPU_FEATURE_FMA3; + if (cpuid_leaf_e1[ECX] & CPU_FEATURE_BIT_LZCNT) cpu_features |= CPU_FEATURE_LZCNT; + if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI1 ) cpu_features |= CPU_FEATURE_BMI1; + if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI2 ) cpu_features |= CPU_FEATURE_BMI2; + + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512F ) cpu_features |= CPU_FEATURE_AVX512F; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512DQ ) cpu_features |= CPU_FEATURE_AVX512DQ; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512PF ) cpu_features |= CPU_FEATURE_AVX512PF; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512ER ) cpu_features |= CPU_FEATURE_AVX512ER; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512CD ) cpu_features |= CPU_FEATURE_AVX512CD; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512BW ) cpu_features |= CPU_FEATURE_AVX512BW; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512IFMA) cpu_features |= CPU_FEATURE_AVX512IFMA; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512VL ) cpu_features |= CPU_FEATURE_AVX512VL; + if (cpuid_leaf_7[ECX] & CPU_FEATURE_BIT_AVX512VBMI) cpu_features |= CPU_FEATURE_AVX512VBMI; + + return cpu_features; +#endif + } + + std::string stringOfCPUFeatures(int features) + { + std::string str; + if (features & CPU_FEATURE_XMM_ENABLED) str += "XMM "; + if (features & CPU_FEATURE_YMM_ENABLED) str += "YMM "; + if (features & CPU_FEATURE_ZMM_ENABLED) str += "ZMM "; + if (features & CPU_FEATURE_SSE ) str += "SSE "; + if (features & CPU_FEATURE_SSE2 ) str += "SSE2 "; + if (features & CPU_FEATURE_SSE3 ) str += "SSE3 "; + if (features & CPU_FEATURE_SSSE3 ) str += "SSSE3 "; + if (features & CPU_FEATURE_SSE41 ) str += "SSE4.1 "; + if (features & CPU_FEATURE_SSE42 ) str += "SSE4.2 "; + if (features & CPU_FEATURE_POPCNT) str += "POPCNT "; + if (features & CPU_FEATURE_AVX ) str += "AVX "; + if (features & CPU_FEATURE_F16C ) str += "F16C "; + if (features & CPU_FEATURE_RDRAND) str += "RDRAND "; + if (features & CPU_FEATURE_AVX2 ) str += "AVX2 "; + if (features & CPU_FEATURE_FMA3 ) str += "FMA3 "; + if (features & CPU_FEATURE_LZCNT ) str += "LZCNT "; + if (features & CPU_FEATURE_BMI1 ) str += "BMI1 "; + if (features & CPU_FEATURE_BMI2 ) str += "BMI2 "; + if (features & CPU_FEATURE_AVX512F) str += "AVX512F "; + if (features & CPU_FEATURE_AVX512DQ) str += "AVX512DQ "; + if (features & CPU_FEATURE_AVX512PF) str += "AVX512PF "; + if (features & CPU_FEATURE_AVX512ER) str += "AVX512ER "; + if (features & CPU_FEATURE_AVX512CD) str += "AVX512CD "; + if (features & CPU_FEATURE_AVX512BW) str += "AVX512BW "; + if (features & CPU_FEATURE_AVX512VL) str += "AVX512VL "; + if (features & CPU_FEATURE_AVX512IFMA) str += "AVX512IFMA "; + if (features & CPU_FEATURE_AVX512VBMI) str += "AVX512VBMI "; + if (features & CPU_FEATURE_NEON) str += "NEON "; + if (features & CPU_FEATURE_NEON_2X) str += "2xNEON "; + return str; + } + + std::string stringOfISA (int isa) + { + if (isa == SSE) return "SSE"; + if (isa == SSE2) return "SSE2"; + if (isa == SSE3) return "SSE3"; + if (isa == SSSE3) return "SSSE3"; + if (isa == SSE41) return "SSE4.1"; + if (isa == SSE42) return "SSE4.2"; + if (isa == AVX) return "AVX"; + if (isa == AVX2) return "AVX2"; + if (isa == AVX512KNL) return "AVX512KNL"; + if (isa == AVX512SKX) return "AVX512SKX"; + if (isa == NEON) return "NEON"; + if (isa == NEON_2X) return "2xNEON"; + return "UNKNOWN"; + } + + bool hasISA(int features, int isa) { + return (features & isa) == isa; + } + + std::string supportedTargetList (int features) + { + std::string v; + if (hasISA(features,SSE)) v += "SSE "; + if (hasISA(features,SSE2)) v += "SSE2 "; + if (hasISA(features,SSE3)) v += "SSE3 "; + if (hasISA(features,SSSE3)) v += "SSSE3 "; + if (hasISA(features,SSE41)) v += "SSE4.1 "; + if (hasISA(features,SSE42)) v += "SSE4.2 "; + if (hasISA(features,AVX)) v += "AVX "; + if (hasISA(features,AVXI)) v += "AVXI "; + if (hasISA(features,AVX2)) v += "AVX2 "; + if (hasISA(features,AVX512KNL)) v += "AVX512KNL "; + if (hasISA(features,AVX512SKX)) v += "AVX512SKX "; + if (hasISA(features,NEON)) v += "NEON "; + if (hasISA(features,NEON_2X)) v += "2xNEON "; + return v; + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include +#include + +namespace embree +{ + std::string getExecutableFileName() { + char filename[1024]; + if (!GetModuleFileName(nullptr, filename, sizeof(filename))) + return std::string(); + return std::string(filename); + } + + unsigned int getNumberOfLogicalThreads() + { + static int nThreads = -1; + if (nThreads != -1) return nThreads; + + typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)(); + typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD); + HMODULE hlib = LoadLibrary("Kernel32"); + GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount"); + GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc) GetProcAddress(hlib, "GetActiveProcessorCount"); + + if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount) + { + int groups = pGetActiveProcessorGroupCount(); + int totalProcessors = 0; + for (int i = 0; i < groups; i++) + totalProcessors += pGetActiveProcessorCount(i); + nThreads = totalProcessors; + } + else + { + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + nThreads = sysinfo.dwNumberOfProcessors; + } + assert(nThreads); + return nThreads; + } + + int getTerminalWidth() + { + HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE); + if (handle == INVALID_HANDLE_VALUE) return 80; + CONSOLE_SCREEN_BUFFER_INFO info; + memset(&info,0,sizeof(info)); + GetConsoleScreenBufferInfo(handle, &info); + return info.dwSize.X; + } + + double getSeconds() + { + LARGE_INTEGER freq, val; + QueryPerformanceFrequency(&freq); + QueryPerformanceCounter(&val); + return (double)val.QuadPart / (double)freq.QuadPart; + } + + void sleepSeconds(double t) { + Sleep(DWORD(1000.0*t)); + } + + size_t getVirtualMemoryBytes() + { + PROCESS_MEMORY_COUNTERS info; + GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ); + return (size_t)info.QuotaPeakPagedPoolUsage; + } + + size_t getResidentMemoryBytes() + { + PROCESS_MEMORY_COUNTERS info; + GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ); + return (size_t)info.WorkingSetSize; + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Linux Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__LINUX__) + +#include +#include + +namespace embree +{ + std::string getExecutableFileName() + { + std::string pid = "/proc/" + toString(getpid()) + "/exe"; + char buf[4096]; + memset(buf,0,sizeof(buf)); + if (readlink(pid.c_str(), buf, sizeof(buf)-1) == -1) + return std::string(); + return std::string(buf); + } + + size_t getVirtualMemoryBytes() + { + size_t virt, resident, shared; + std::ifstream buffer("/proc/self/statm"); + buffer >> virt >> resident >> shared; + return virt*sysconf(_SC_PAGE_SIZE); + } + + size_t getResidentMemoryBytes() + { + size_t virt, resident, shared; + std::ifstream buffer("/proc/self/statm"); + buffer >> virt >> resident >> shared; + return resident*sysconf(_SC_PAGE_SIZE); + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// FreeBSD Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined (__FreeBSD__) + +#include + +namespace embree +{ + std::string getExecutableFileName() + { + const int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1 }; + char buf[4096]; + memset(buf,0,sizeof(buf)); + size_t len = sizeof(buf)-1; + if (sysctl(mib, 4, buf, &len, 0x0, 0) == -1) + return std::string(); + return std::string(buf); + } + + size_t getVirtualMemoryBytes() { + return 0; + } + + size_t getResidentMemoryBytes() { + return 0; + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Mac OS X Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__MACOSX__) + +#include + +namespace embree +{ + std::string getExecutableFileName() + { + char buf[4096]; + uint32_t size = sizeof(buf); + if (_NSGetExecutablePath(buf, &size) != 0) + return std::string(); + return std::string(buf); + } + + size_t getVirtualMemoryBytes() { + return 0; + } + + size_t getResidentMemoryBytes() { + return 0; + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) + +#include +#include +#include +#include + +namespace embree +{ + unsigned int getNumberOfLogicalThreads() + { + static int nThreads = -1; + if (nThreads != -1) return nThreads; + +#if defined(__MACOSX__) || defined(__ANDROID__) + nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container + assert(nThreads); +#else + cpu_set_t set; + if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0) + nThreads = CPU_COUNT(&set); +#endif + + assert(nThreads); + return nThreads; + } + + int getTerminalWidth() + { + struct winsize info; + if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &info) < 0) return 80; + return info.ws_col; + } + + double getSeconds() { + struct timeval tp; gettimeofday(&tp,nullptr); + return double(tp.tv_sec) + double(tp.tv_usec)/1E6; + } + + void sleepSeconds(double t) { + usleep(1000000.0*t); + } +} +#endif + diff --git a/thirdparty/embree-aarch64/common/sys/sysinfo.h b/thirdparty/embree-aarch64/common/sys/sysinfo.h new file mode 100644 index 0000000000..8e313a59b3 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/sysinfo.h @@ -0,0 +1,192 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define CACHELINE_SIZE 64 + +#if !defined(PAGE_SIZE) + #define PAGE_SIZE 4096 +#endif + +#define PAGE_SIZE_2M (2*1024*1024) +#define PAGE_SIZE_4K (4*1024) + +#include "platform.h" + +/* define isa namespace and ISA bitvector */ +#if defined (__AVX512VL__) +# define isa avx512skx +# define ISA AVX512SKX +# define ISA_STR "AVX512SKX" +#elif defined (__AVX512F__) +# define isa avx512knl +# define ISA AVX512KNL +# define ISA_STR "AVX512KNL" +#elif defined (__AVX2__) +# define isa avx2 +# define ISA AVX2 +# define ISA_STR "AVX2" +#elif defined(__AVXI__) +# define isa avxi +# define ISA AVXI +# define ISA_STR "AVXI" +#elif defined(__AVX__) +# define isa avx +# define ISA AVX +# define ISA_STR "AVX" +#elif defined (__SSE4_2__) +# define isa sse42 +# define ISA SSE42 +# define ISA_STR "SSE4.2" +//#elif defined (__SSE4_1__) // we demote this to SSE2, MacOSX code compiles with SSE41 by default with XCode 11 +//# define isa sse41 +//# define ISA SSE41 +//# define ISA_STR "SSE4.1" +//#elif defined(__SSSE3__) // we demote this to SSE2, MacOSX code compiles with SSSE3 by default with ICC +//# define isa ssse3 +//# define ISA SSSE3 +//# define ISA_STR "SSSE3" +//#elif defined(__SSE3__) // we demote this to SSE2, MacOSX code compiles with SSE3 by default with clang +//# define isa sse3 +//# define ISA SSE3 +//# define ISA_STR "SSE3" +#elif defined(__SSE2__) || defined(__SSE3__) || defined(__SSSE3__) +# define isa sse2 +# define ISA SSE2 +# define ISA_STR "SSE2" +#elif defined(__SSE__) +# define isa sse +# define ISA SSE +# define ISA_STR "SSE" +#elif defined(__ARM_NEON) +// NOTE(LTE): Use sse2 for `isa` for the compatibility at the moment. +#define isa sse2 +#define ISA NEON +#define ISA_STR "NEON" +#else +#error Unknown ISA +#endif + +namespace embree +{ + enum class CPU + { + XEON_ICE_LAKE, + CORE_ICE_LAKE, + CORE_TIGER_LAKE, + CORE_COMET_LAKE, + CORE_CANNON_LAKE, + CORE_KABY_LAKE, + XEON_SKY_LAKE, + CORE_SKY_LAKE, + XEON_PHI_KNIGHTS_MILL, + XEON_PHI_KNIGHTS_LANDING, + XEON_BROADWELL, + CORE_BROADWELL, + XEON_HASWELL, + CORE_HASWELL, + XEON_IVY_BRIDGE, + CORE_IVY_BRIDGE, + SANDY_BRIDGE, + NEHALEM, + CORE2, + CORE1, + ARM, + UNKNOWN, + }; + + /*! get the full path to the running executable */ + std::string getExecutableFileName(); + + /*! return platform name */ + std::string getPlatformName(); + + /*! get the full name of the compiler */ + std::string getCompilerName(); + + /*! return the name of the CPU */ + std::string getCPUVendor(); + + /*! get microprocessor model */ + CPU getCPUModel(); + + /*! converts CPU model into string */ + std::string stringOfCPUModel(CPU model); + + /*! CPU features */ + static const int CPU_FEATURE_SSE = 1 << 0; + static const int CPU_FEATURE_SSE2 = 1 << 1; + static const int CPU_FEATURE_SSE3 = 1 << 2; + static const int CPU_FEATURE_SSSE3 = 1 << 3; + static const int CPU_FEATURE_SSE41 = 1 << 4; + static const int CPU_FEATURE_SSE42 = 1 << 5; + static const int CPU_FEATURE_POPCNT = 1 << 6; + static const int CPU_FEATURE_AVX = 1 << 7; + static const int CPU_FEATURE_F16C = 1 << 8; + static const int CPU_FEATURE_RDRAND = 1 << 9; + static const int CPU_FEATURE_AVX2 = 1 << 10; + static const int CPU_FEATURE_FMA3 = 1 << 11; + static const int CPU_FEATURE_LZCNT = 1 << 12; + static const int CPU_FEATURE_BMI1 = 1 << 13; + static const int CPU_FEATURE_BMI2 = 1 << 14; + static const int CPU_FEATURE_AVX512F = 1 << 16; + static const int CPU_FEATURE_AVX512DQ = 1 << 17; + static const int CPU_FEATURE_AVX512PF = 1 << 18; + static const int CPU_FEATURE_AVX512ER = 1 << 19; + static const int CPU_FEATURE_AVX512CD = 1 << 20; + static const int CPU_FEATURE_AVX512BW = 1 << 21; + static const int CPU_FEATURE_AVX512VL = 1 << 22; + static const int CPU_FEATURE_AVX512IFMA = 1 << 23; + static const int CPU_FEATURE_AVX512VBMI = 1 << 24; + static const int CPU_FEATURE_XMM_ENABLED = 1 << 25; + static const int CPU_FEATURE_YMM_ENABLED = 1 << 26; + static const int CPU_FEATURE_ZMM_ENABLED = 1 << 27; + static const int CPU_FEATURE_NEON = 1 << 28; + static const int CPU_FEATURE_NEON_2X = 1 << 29; + + /*! get CPU features */ + int getCPUFeatures(); + + /*! convert CPU features into a string */ + std::string stringOfCPUFeatures(int features); + + /*! creates a string of all supported targets that are supported */ + std::string supportedTargetList (int isa); + + /*! ISAs */ + static const int SSE = CPU_FEATURE_SSE | CPU_FEATURE_XMM_ENABLED; + static const int SSE2 = SSE | CPU_FEATURE_SSE2; + static const int SSE3 = SSE2 | CPU_FEATURE_SSE3; + static const int SSSE3 = SSE3 | CPU_FEATURE_SSSE3; + static const int SSE41 = SSSE3 | CPU_FEATURE_SSE41; + static const int SSE42 = SSE41 | CPU_FEATURE_SSE42 | CPU_FEATURE_POPCNT; + static const int AVX = SSE42 | CPU_FEATURE_AVX | CPU_FEATURE_YMM_ENABLED; + static const int AVXI = AVX | CPU_FEATURE_F16C | CPU_FEATURE_RDRAND; + static const int AVX2 = AVXI | CPU_FEATURE_AVX2 | CPU_FEATURE_FMA3 | CPU_FEATURE_BMI1 | CPU_FEATURE_BMI2 | CPU_FEATURE_LZCNT; + static const int AVX512KNL = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512PF | CPU_FEATURE_AVX512ER | CPU_FEATURE_AVX512CD | CPU_FEATURE_ZMM_ENABLED; + static const int AVX512SKX = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512DQ | CPU_FEATURE_AVX512CD | CPU_FEATURE_AVX512BW | CPU_FEATURE_AVX512VL | CPU_FEATURE_ZMM_ENABLED; + static const int NEON = CPU_FEATURE_NEON | CPU_FEATURE_SSE | CPU_FEATURE_SSE2; + static const int NEON_2X = CPU_FEATURE_NEON_2X | AVX2; + + /*! converts ISA bitvector into a string */ + std::string stringOfISA(int features); + + /*! return the number of logical threads of the system */ + unsigned int getNumberOfLogicalThreads(); + + /*! returns the size of the terminal window in characters */ + int getTerminalWidth(); + + /*! returns performance counter in seconds */ + double getSeconds(); + + /*! sleeps the specified number of seconds */ + void sleepSeconds(double t); + + /*! returns virtual address space occupied by process */ + size_t getVirtualMemoryBytes(); + + /*! returns resident memory required by process */ + size_t getResidentMemoryBytes(); +} diff --git a/thirdparty/embree-aarch64/common/sys/thread.cpp b/thirdparty/embree-aarch64/common/sys/thread.cpp new file mode 100644 index 0000000000..f9ea5b7d96 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/thread.cpp @@ -0,0 +1,429 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "thread.h" +#include "sysinfo.h" +#include "string.h" + +#include +#if defined(__ARM_NEON) +#include "../math/SSE2NEON.h" +#else +#include +#endif + +#if defined(PTHREADS_WIN32) +#pragma comment (lib, "pthreadVC.lib") +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include + +namespace embree +{ + /*! set the affinity of a given thread */ + void setAffinity(HANDLE thread, ssize_t affinity) + { + typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)(); + typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD); + typedef BOOL (WINAPI *SetThreadGroupAffinityFunc)(HANDLE, const GROUP_AFFINITY *, PGROUP_AFFINITY); + typedef BOOL (WINAPI *SetThreadIdealProcessorExFunc)(HANDLE, PPROCESSOR_NUMBER, PPROCESSOR_NUMBER); + HMODULE hlib = LoadLibrary("Kernel32"); + GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount"); + GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc)GetProcAddress(hlib, "GetActiveProcessorCount"); + SetThreadGroupAffinityFunc pSetThreadGroupAffinity = (SetThreadGroupAffinityFunc)GetProcAddress(hlib, "SetThreadGroupAffinity"); + SetThreadIdealProcessorExFunc pSetThreadIdealProcessorEx = (SetThreadIdealProcessorExFunc)GetProcAddress(hlib, "SetThreadIdealProcessorEx"); + if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount && pSetThreadGroupAffinity && pSetThreadIdealProcessorEx) + { + int groups = pGetActiveProcessorGroupCount(); + int totalProcessors = 0, group = 0, number = 0; + for (int i = 0; i affinity) { + group = i; + number = (int)affinity - totalProcessors; + break; + } + totalProcessors += processors; + } + + GROUP_AFFINITY groupAffinity; + groupAffinity.Group = (WORD)group; + groupAffinity.Mask = (KAFFINITY)(uint64_t(1) << number); + groupAffinity.Reserved[0] = 0; + groupAffinity.Reserved[1] = 0; + groupAffinity.Reserved[2] = 0; + if (!pSetThreadGroupAffinity(thread, &groupAffinity, nullptr)) + WARNING("SetThreadGroupAffinity failed"); // on purpose only a warning + + PROCESSOR_NUMBER processorNumber; + processorNumber.Group = group; + processorNumber.Number = number; + processorNumber.Reserved = 0; + if (!pSetThreadIdealProcessorEx(thread, &processorNumber, nullptr)) + WARNING("SetThreadIdealProcessorEx failed"); // on purpose only a warning + } + else + { + if (!SetThreadAffinityMask(thread, DWORD_PTR(uint64_t(1) << affinity))) + WARNING("SetThreadAffinityMask failed"); // on purpose only a warning + if (SetThreadIdealProcessor(thread, (DWORD)affinity) == (DWORD)-1) + WARNING("SetThreadIdealProcessor failed"); // on purpose only a warning + } + } + + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) { + setAffinity(GetCurrentThread(), affinity); + } + + struct ThreadStartupData + { + public: + ThreadStartupData (thread_func f, void* arg) + : f(f), arg(arg) {} + public: + thread_func f; + void* arg; + }; + + DWORD WINAPI threadStartup(LPVOID ptr) + { + ThreadStartupData* parg = (ThreadStartupData*) ptr; + _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); + parg->f(parg->arg); + delete parg; + parg = nullptr; + return 0; + } + +#if !defined(PTHREADS_WIN32) + + /*! creates a hardware thread running on specific core */ + thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID) + { + HANDLE thread = CreateThread(nullptr, stack_size, threadStartup, new ThreadStartupData(f,arg), 0, nullptr); + if (thread == nullptr) FATAL("CreateThread failed"); + if (threadID >= 0) setAffinity(thread, threadID); + return thread_t(thread); + } + + /*! the thread calling this function gets yielded */ + void yield() { + SwitchToThread(); + } + + /*! waits until the given thread has terminated */ + void join(thread_t tid) { + WaitForSingleObject(HANDLE(tid), INFINITE); + CloseHandle(HANDLE(tid)); + } + + /*! creates thread local storage */ + tls_t createTls() { + return tls_t(size_t(TlsAlloc())); + } + + /*! set the thread local storage pointer */ + void setTls(tls_t tls, void* const ptr) { + TlsSetValue(DWORD(size_t(tls)), ptr); + } + + /*! return the thread local storage pointer */ + void* getTls(tls_t tls) { + return TlsGetValue(DWORD(size_t(tls))); + } + + /*! destroys thread local storage identifier */ + void destroyTls(tls_t tls) { + TlsFree(DWORD(size_t(tls))); + } +#endif +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Linux Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__LINUX__) + +#include +#include +#include + +#if defined(__ANDROID__) +#include +#endif + +namespace embree +{ + static MutexSys mutex; + static std::vector threadIDs; + +#if !defined(__ANDROID__) // TODO(LTE): Implement for Android target + /* changes thread ID mapping such that we first fill up all thread on one core */ + size_t mapThreadID(size_t threadID) + { + Lock lock(mutex); + + if (threadIDs.size() == 0) + { + /* parse thread/CPU topology */ + for (size_t cpuID=0;;cpuID++) + { + std::fstream fs; + std::string cpu = std::string("/sys/devices/system/cpu/cpu") + std::to_string((long long)cpuID) + std::string("/topology/thread_siblings_list"); + fs.open (cpu.c_str(), std::fstream::in); + if (fs.fail()) break; + + int i; + while (fs >> i) + { + if (std::none_of(threadIDs.begin(),threadIDs.end(),[&] (int id) { return id == i; })) + threadIDs.push_back(i); + if (fs.peek() == ',') + fs.ignore(); + } + fs.close(); + } + +#if 0 + for (size_t i=0;i " << threadIDs[i] << std::endl; +#endif + + /* verify the mapping and do not use it if the mapping has errors */ + for (size_t i=0;i + +namespace embree +{ + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { + cpuset_t cset; + CPU_ZERO(&cset); + CPU_SET(affinity, &cset); + + pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset); + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// MacOSX Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__MACOSX__) + +#include +#include +#include + +namespace embree +{ + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { + thread_affinity_policy ap; + ap.affinity_tag = affinity; + if (thread_policy_set(mach_thread_self(),THREAD_AFFINITY_POLICY,(thread_policy_t)&ap,THREAD_AFFINITY_POLICY_COUNT) != KERN_SUCCESS) + WARNING("setting thread affinity failed"); // on purpose only a warning + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) || defined(PTHREADS_WIN32) + +#include +#include + +#if defined(__USE_NUMA__) +#include +#endif + +namespace embree +{ + struct ThreadStartupData + { + public: + ThreadStartupData (thread_func f, void* arg, int affinity) + : f(f), arg(arg), affinity(affinity) {} + public: + thread_func f; + void* arg; + ssize_t affinity; + }; + + static void* threadStartup(ThreadStartupData* parg) + { + _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); + + /*! Mac OS X does not support setting affinity at thread creation time */ +#if defined(__MACOSX__) + if (parg->affinity >= 0) + setAffinity(parg->affinity); +#endif + + parg->f(parg->arg); + delete parg; + parg = nullptr; + return nullptr; + } + + /*! creates a hardware thread running on specific core */ + thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID) + { + /* set stack size */ + pthread_attr_t attr; + pthread_attr_init(&attr); + if (stack_size > 0) pthread_attr_setstacksize (&attr, stack_size); + + /* create thread */ + pthread_t* tid = new pthread_t; + if (pthread_create(tid,&attr,(void*(*)(void*))threadStartup,new ThreadStartupData(f,arg,threadID)) != 0) { + pthread_attr_destroy(&attr); + delete tid; + FATAL("pthread_create failed"); + } + pthread_attr_destroy(&attr); + + /* set affinity */ +#if defined(__LINUX__) && !defined(__ANDROID__) + if (threadID >= 0) { + cpu_set_t cset; + CPU_ZERO(&cset); + threadID = mapThreadID(threadID); + CPU_SET(threadID, &cset); + pthread_setaffinity_np(*tid, sizeof(cset), &cset); + } +#elif defined(__FreeBSD__) + if (threadID >= 0) { + cpuset_t cset; + CPU_ZERO(&cset); + CPU_SET(threadID, &cset); + pthread_setaffinity_np(*tid, sizeof(cset), &cset); + } +#endif + + return thread_t(tid); + } + + /*! the thread calling this function gets yielded */ + void yield() { + sched_yield(); + } + + /*! waits until the given thread has terminated */ + void join(thread_t tid) { + if (pthread_join(*(pthread_t*)tid, nullptr) != 0) + FATAL("pthread_join failed"); + delete (pthread_t*)tid; + } + + /*! creates thread local storage */ + tls_t createTls() + { + pthread_key_t* key = new pthread_key_t; + if (pthread_key_create(key,nullptr) != 0) { + delete key; + FATAL("pthread_key_create failed"); + } + + return tls_t(key); + } + + /*! return the thread local storage pointer */ + void* getTls(tls_t tls) + { + assert(tls); + return pthread_getspecific(*(pthread_key_t*)tls); + } + + /*! set the thread local storage pointer */ + void setTls(tls_t tls, void* const ptr) + { + assert(tls); + if (pthread_setspecific(*(pthread_key_t*)tls, ptr) != 0) + FATAL("pthread_setspecific failed"); + } + + /*! destroys thread local storage identifier */ + void destroyTls(tls_t tls) + { + assert(tls); + if (pthread_key_delete(*(pthread_key_t*)tls) != 0) + FATAL("pthread_key_delete failed"); + delete (pthread_key_t*)tls; + } +} + +#endif diff --git a/thirdparty/embree-aarch64/common/sys/thread.h b/thirdparty/embree-aarch64/common/sys/thread.h new file mode 100644 index 0000000000..45da6e6a70 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/thread.h @@ -0,0 +1,46 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "mutex.h" +#include "alloc.h" +#include "vector.h" +#include + +namespace embree +{ + /*! type for thread */ + typedef struct opaque_thread_t* thread_t; + + /*! signature of thread start function */ + typedef void (*thread_func)(void*); + + /*! creates a hardware thread running on specific logical thread */ + thread_t createThread(thread_func f, void* arg, size_t stack_size = 0, ssize_t threadID = -1); + + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity); + + /*! the thread calling this function gets yielded */ + void yield(); + + /*! waits until the given thread has terminated */ + void join(thread_t tid); + + /*! type for handle to thread local storage */ + typedef struct opaque_tls_t* tls_t; + + /*! creates thread local storage */ + tls_t createTls(); + + /*! set the thread local storage pointer */ + void setTls(tls_t tls, void* const ptr); + + /*! return the thread local storage pointer */ + void* getTls(tls_t tls); + + /*! destroys thread local storage identifier */ + void destroyTls(tls_t tls); +} diff --git a/thirdparty/embree-aarch64/common/sys/vector.h b/thirdparty/embree-aarch64/common/sys/vector.h new file mode 100644 index 0000000000..e41794de7c --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/vector.h @@ -0,0 +1,242 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "alloc.h" +#include + +namespace embree +{ + template + class vector_t + { + public: + typedef T value_type; + typedef T* iterator; + typedef const T* const_iterator; + + __forceinline vector_t () + : size_active(0), size_alloced(0), items(nullptr) {} + + __forceinline explicit vector_t (size_t sz) + : size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); } + + template + __forceinline explicit vector_t (M alloc, size_t sz) + : alloc(alloc), size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); } + + __forceinline ~vector_t() { + clear(); + } + + __forceinline vector_t (const vector_t& other) + { + size_active = other.size_active; + size_alloced = other.size_alloced; + items = alloc.allocate(size_alloced); + for (size_t i=0; i 0); return items[0]; }; + __forceinline T& back () const { assert(size_active > 0); return items[size_active-1]; }; + + __forceinline T* data() { return items; }; + __forceinline const T* data() const { return items; }; + + + /******************** Modifiers **************************/ + + __forceinline void push_back(const T& nt) + { + const T v = nt; // need local copy as input reference could point to this vector + internal_resize(size_active,internal_grow_size(size_active+1)); + ::new (&items[size_active++]) T(v); + } + + __forceinline void pop_back() + { + assert(!empty()); + size_active--; + alloc.destroy(&items[size_active]); + } + + __forceinline void clear() + { + /* destroy elements */ + for (size_t i=0; i + using vector = vector_t>; + + /*! vector class that performs aligned allocations */ + template + using avector = vector_t::value> >; + + /*! vector class that performs OS allocations */ + template + using ovector = vector_t >; +} diff --git a/thirdparty/embree-aarch64/common/tasking/taskscheduler.h b/thirdparty/embree-aarch64/common/tasking/taskscheduler.h new file mode 100644 index 0000000000..9940e068d0 --- /dev/null +++ b/thirdparty/embree-aarch64/common/tasking/taskscheduler.h @@ -0,0 +1,17 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#if defined(TASKING_INTERNAL) +# include "taskschedulerinternal.h" +#elif defined(TASKING_GCD) && defined(BUILD_IOS) +# include "taskschedulergcd.h" +#elif defined(TASKING_TBB) +# include "taskschedulertbb.h" +#elif defined(TASKING_PPL) +# include "taskschedulerppl.h" +#else +# error "no tasking system enabled" +#endif + diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h b/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h new file mode 100644 index 0000000000..d31f8bb478 --- /dev/null +++ b/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h @@ -0,0 +1,49 @@ +#pragma once + +#include "../sys/platform.h" +#include "../sys/alloc.h" +#include "../sys/barrier.h" +#include "../sys/thread.h" +#include "../sys/mutex.h" +#include "../sys/condition.h" +#include "../sys/ref.h" + +#include + +namespace embree +{ + struct TaskScheduler + { + /*! initializes the task scheduler */ + static void create(size_t numThreads, bool set_affinity, bool start_threads); + + /*! destroys the task scheduler again */ + static void destroy() {} + + /* returns the ID of the current thread */ + static __forceinline size_t threadID() + { + return threadIndex(); + } + + /* returns the index (0..threadCount-1) of the current thread */ + static __forceinline size_t threadIndex() + { + currentThreadIndex = (currentThreadIndex + 1) % GCDNumThreads; + return currentThreadIndex; + } + + /* returns the total number of threads */ + static __forceinline size_t threadCount() + { + return GCDNumThreads; + } + + private: + static size_t GCDNumThreads; + static size_t currentThreadIndex; + + }; + +}; + diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp new file mode 100644 index 0000000000..ebf656d1a0 --- /dev/null +++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp @@ -0,0 +1,426 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "taskschedulerinternal.h" +#include "../math/math.h" +#include "../sys/sysinfo.h" +#include + +namespace embree +{ + RTC_NAMESPACE_BEGIN + + static MutexSys g_mutex; + size_t TaskScheduler::g_numThreads = 0; + __thread TaskScheduler* TaskScheduler::g_instance = nullptr; + std::vector> g_instance_vector; + __thread TaskScheduler::Thread* TaskScheduler::thread_local_thread = nullptr; + TaskScheduler::ThreadPool* TaskScheduler::threadPool = nullptr; + + template + __forceinline void TaskScheduler::steal_loop(Thread& thread, const Predicate& pred, const Body& body) + { + while (true) + { + /*! some rounds that yield */ + for (size_t i=0; i<32; i++) + { + /*! some spinning rounds */ + const size_t threadCount = thread.threadCount(); + for (size_t j=0; j<1024; j+=threadCount) + { + if (!pred()) return; + if (thread.scheduler->steal_from_other_threads(thread)) { + i=j=0; + body(); + } + } + yield(); + } + } + } + + /*! run this task */ + void TaskScheduler::Task::run_internal (Thread& thread) // FIXME: avoid as many dll_exports as possible + { + /* try to run if not already stolen */ + if (try_switch_state(INITIALIZED,DONE)) + { + Task* prevTask = thread.task; + thread.task = this; + // -- GODOT start -- + // try { + // if (thread.scheduler->cancellingException == nullptr) + closure->execute(); + // } catch (...) { + // if (thread.scheduler->cancellingException == nullptr) + // thread.scheduler->cancellingException = std::current_exception(); + // } + // -- GODOT end -- + thread.task = prevTask; + add_dependencies(-1); + } + + /* steal until all dependencies have completed */ + steal_loop(thread, + [&] () { return dependencies>0; }, + [&] () { while (thread.tasks.execute_local_internal(thread,this)); }); + + /* now signal our parent task that we are finished */ + if (parent) + parent->add_dependencies(-1); + } + + /*! run this task */ + dll_export void TaskScheduler::Task::run (Thread& thread) { + run_internal(thread); + } + + bool TaskScheduler::TaskQueue::execute_local_internal(Thread& thread, Task* parent) + { + /* stop if we run out of local tasks or reach the waiting task */ + if (right == 0 || &tasks[right-1] == parent) + return false; + + /* execute task */ + size_t oldRight = right; + tasks[right-1].run_internal(thread); + if (right != oldRight) { + THROW_RUNTIME_ERROR("you have to wait for spawned subtasks"); + } + + /* pop task and closure from stack */ + right--; + if (tasks[right].stackPtr != size_t(-1)) + stackPtr = tasks[right].stackPtr; + + /* also move left pointer */ + if (left >= right) left.store(right.load()); + + return right != 0; + } + + dll_export bool TaskScheduler::TaskQueue::execute_local(Thread& thread, Task* parent) { + return execute_local_internal(thread,parent); + } + + bool TaskScheduler::TaskQueue::steal(Thread& thread) + { + size_t l = left; + size_t r = right; + if (l < r) + { + l = left++; + if (l >= r) + return false; + } + else + return false; + + if (!tasks[l].try_steal(thread.tasks.tasks[thread.tasks.right])) + return false; + + thread.tasks.right++; + return true; + } + + /* we steal from the left */ + size_t TaskScheduler::TaskQueue::getTaskSizeAtLeft() + { + if (left >= right) return 0; + return tasks[left].N; + } + + void threadPoolFunction(std::pair* pair) + { + TaskScheduler::ThreadPool* pool = pair->first; + size_t threadIndex = pair->second; + delete pair; + pool->thread_loop(threadIndex); + } + + TaskScheduler::ThreadPool::ThreadPool(bool set_affinity) + : numThreads(0), numThreadsRunning(0), set_affinity(set_affinity), running(false) {} + + dll_export void TaskScheduler::ThreadPool::startThreads() + { + if (running) return; + setNumThreads(numThreads,true); + } + + void TaskScheduler::ThreadPool::setNumThreads(size_t newNumThreads, bool startThreads) + { + Lock lock(g_mutex); + assert(newNumThreads); + newNumThreads = min(newNumThreads, (size_t) getNumberOfLogicalThreads()); + + // We are observing a few % gain by increasing number threads by 2 on aarch64. +#if defined(__aarch64__) && defined(BUILD_IOS) + numThreads = newNumThreads*2; +#else + numThreads = newNumThreads; +#endif + numThreads = newNumThreads; + if (!startThreads && !running) return; + running = true; + size_t numThreadsActive = numThreadsRunning; + + mutex.lock(); + numThreadsRunning = newNumThreads; + mutex.unlock(); + condition.notify_all(); + + /* start new threads */ + for (size_t t=numThreadsActive; t(this,t); + threads.push_back(createThread((thread_func)threadPoolFunction,pair,4*1024*1024,set_affinity ? t : -1)); + } + + /* stop some threads if we reduce the number of threads */ + for (ssize_t t=numThreadsActive-1; t>=ssize_t(numThreadsRunning); t--) { + if (t == 0) continue; + embree::join(threads.back()); + threads.pop_back(); + } + } + + TaskScheduler::ThreadPool::~ThreadPool() + { + /* leave all taskschedulers */ + mutex.lock(); + numThreadsRunning = 0; + mutex.unlock(); + condition.notify_all(); + + /* wait for threads to terminate */ + for (size_t i=0; i& scheduler) + { + mutex.lock(); + schedulers.push_back(scheduler); + mutex.unlock(); + condition.notify_all(); + } + + dll_export void TaskScheduler::ThreadPool::remove(const Ref& scheduler) + { + Lock lock(mutex); + for (std::list >::iterator it = schedulers.begin(); it != schedulers.end(); it++) { + if (scheduler == *it) { + schedulers.erase(it); + return; + } + } + } + + void TaskScheduler::ThreadPool::thread_loop(size_t globalThreadIndex) + { + while (globalThreadIndex < numThreadsRunning) + { + Ref scheduler = NULL; + ssize_t threadIndex = -1; + { + Lock lock(mutex); + condition.wait(mutex, [&] () { return globalThreadIndex >= numThreadsRunning || !schedulers.empty(); }); + if (globalThreadIndex >= numThreadsRunning) break; + scheduler = schedulers.front(); + threadIndex = scheduler->allocThreadIndex(); + } + scheduler->thread_loop(threadIndex); + } + } + + TaskScheduler::TaskScheduler() + : threadCounter(0), anyTasksRunning(0), hasRootTask(false) + { + threadLocal.resize(2*getNumberOfLogicalThreads()); // FIXME: this has to be 2x as in the compatibility join mode with rtcCommitScene the worker threads also join. When disallowing rtcCommitScene to join a build we can remove the 2x. + for (size_t i=0; ithreadIndex; + else return 0; + } + + dll_export size_t TaskScheduler::threadIndex() + { + Thread* thread = TaskScheduler::thread(); + if (thread) return thread->threadIndex; + else return 0; + } + + dll_export size_t TaskScheduler::threadCount() { + return threadPool->size(); + } + + dll_export TaskScheduler* TaskScheduler::instance() + { + if (g_instance == NULL) { + Lock lock(g_mutex); + g_instance = new TaskScheduler; + g_instance_vector.push_back(g_instance); + } + return g_instance; + } + + void TaskScheduler::create(size_t numThreads, bool set_affinity, bool start_threads) + { + if (!threadPool) threadPool = new TaskScheduler::ThreadPool(set_affinity); + threadPool->setNumThreads(numThreads,start_threads); + } + + void TaskScheduler::destroy() { + delete threadPool; threadPool = nullptr; + } + + dll_export ssize_t TaskScheduler::allocThreadIndex() + { + size_t threadIndex = threadCounter++; + assert(threadIndex < threadLocal.size()); + return threadIndex; + } + + void TaskScheduler::join() + { + mutex.lock(); + size_t threadIndex = allocThreadIndex(); + condition.wait(mutex, [&] () { return hasRootTask.load(); }); + mutex.unlock(); + // -- GODOT start -- + // std::exception_ptr except = thread_loop(threadIndex); + // if (except != nullptr) std::rethrow_exception(except); + thread_loop(threadIndex); + // -- GODOT end -- + } + + void TaskScheduler::reset() { + hasRootTask = false; + } + + void TaskScheduler::wait_for_threads(size_t threadCount) + { + while (threadCounter < threadCount-1) + pause_cpu(); + } + + dll_export TaskScheduler::Thread* TaskScheduler::thread() { + return thread_local_thread; + } + + dll_export TaskScheduler::Thread* TaskScheduler::swapThread(Thread* thread) + { + Thread* old = thread_local_thread; + thread_local_thread = thread; + return old; + } + + dll_export bool TaskScheduler::wait() + { + Thread* thread = TaskScheduler::thread(); + if (thread == nullptr) return true; + while (thread->tasks.execute_local_internal(*thread,thread->task)) {}; + return thread->scheduler->cancellingException == nullptr; + } + +// -- GODOT start -- +// std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex) + void TaskScheduler::thread_loop(size_t threadIndex) +// -- GODOT end -- + { + /* allocate thread structure */ + std::unique_ptr mthread(new Thread(threadIndex,this)); // too large for stack allocation + Thread& thread = *mthread; + threadLocal[threadIndex].store(&thread); + Thread* oldThread = swapThread(&thread); + + /* main thread loop */ + while (anyTasksRunning) + { + steal_loop(thread, + [&] () { return anyTasksRunning > 0; }, + [&] () { + anyTasksRunning++; + while (thread.tasks.execute_local_internal(thread,nullptr)); + anyTasksRunning--; + }); + } + threadLocal[threadIndex].store(nullptr); + swapThread(oldThread); + + /* remember exception to throw */ + // -- GODOT start -- + // std::exception_ptr except = nullptr; + // if (cancellingException != nullptr) except = cancellingException; + // -- GODOT end -- + /* wait for all threads to terminate */ + threadCounter--; +#if defined(__WIN32__) + size_t loopIndex = 1; +#endif +#define LOOP_YIELD_THRESHOLD (4096) + while (threadCounter > 0) { +#if defined(__WIN32__) + if ((loopIndex % LOOP_YIELD_THRESHOLD) == 0) + yield(); + else + _mm_pause(); + loopIndex++; +#else + yield(); +#endif + } + // -- GODOT start -- + // return except; + return; + // -- GODOT end -- + } + + bool TaskScheduler::steal_from_other_threads(Thread& thread) + { + const size_t threadIndex = thread.threadIndex; + const size_t threadCount = this->threadCounter; + + for (size_t i=1; i= threadCount) otherThreadIndex -= threadCount; + + Thread* othread = threadLocal[otherThreadIndex].load(); + if (!othread) + continue; + + if (othread->tasks.steal(thread)) + return true; + } + + return false; + } + + dll_export void TaskScheduler::startThreads() { + threadPool->startThreads(); + } + + dll_export void TaskScheduler::addScheduler(const Ref& scheduler) { + threadPool->add(scheduler); + } + + dll_export void TaskScheduler::removeScheduler(const Ref& scheduler) { + threadPool->remove(scheduler); + } + + RTC_NAMESPACE_END +} diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h new file mode 100644 index 0000000000..8bd70b2b8c --- /dev/null +++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h @@ -0,0 +1,386 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/alloc.h" +#include "../sys/barrier.h" +#include "../sys/thread.h" +#include "../sys/mutex.h" +#include "../sys/condition.h" +#include "../sys/ref.h" +#include "../sys/atomic.h" +#include "../math/range.h" +#include "../../include/embree3/rtcore.h" + +#include + +namespace embree +{ + + /* The tasking system exports some symbols to be used by the tutorials. Thus we + hide is also in the API namespace when requested. */ + RTC_NAMESPACE_BEGIN + + struct TaskScheduler : public RefCount + { + ALIGNED_STRUCT_(64); + friend class Device; + + static const size_t TASK_STACK_SIZE = 4*1024; //!< task structure stack + static const size_t CLOSURE_STACK_SIZE = 512*1024; //!< stack for task closures + + struct Thread; + + /*! virtual interface for all tasks */ + struct TaskFunction { + virtual void execute() = 0; + }; + + /*! builds a task interface from a closure */ + template + struct ClosureTaskFunction : public TaskFunction + { + Closure closure; + __forceinline ClosureTaskFunction (const Closure& closure) : closure(closure) {} + void execute() { closure(); }; + }; + + struct __aligned(64) Task + { + /*! states a task can be in */ + enum { DONE, INITIALIZED }; + + /*! switch from one state to another */ + __forceinline void switch_state(int from, int to) + { + __memory_barrier(); + MAYBE_UNUSED bool success = state.compare_exchange_strong(from,to); + assert(success); + } + + /*! try to switch from one state to another */ + __forceinline bool try_switch_state(int from, int to) { + __memory_barrier(); + return state.compare_exchange_strong(from,to); + } + + /*! increment/decrement dependency counter */ + void add_dependencies(int n) { + dependencies+=n; + } + + /*! initialize all tasks to DONE state by default */ + __forceinline Task() + : state(DONE) {} + + /*! construction of new task */ + __forceinline Task (TaskFunction* closure, Task* parent, size_t stackPtr, size_t N) + : dependencies(1), stealable(true), closure(closure), parent(parent), stackPtr(stackPtr), N(N) + { + if (parent) parent->add_dependencies(+1); + switch_state(DONE,INITIALIZED); + } + + /*! construction of stolen task, stealing thread will decrement initial dependency */ + __forceinline Task (TaskFunction* closure, Task* parent) + : dependencies(1), stealable(false), closure(closure), parent(parent), stackPtr(-1), N(1) + { + switch_state(DONE,INITIALIZED); + } + + /*! try to steal this task */ + bool try_steal(Task& child) + { + if (!stealable) return false; + if (!try_switch_state(INITIALIZED,DONE)) return false; + new (&child) Task(closure, this); + return true; + } + + /*! run this task */ + dll_export void run(Thread& thread); + + void run_internal(Thread& thread); + + public: + std::atomic state; //!< state this task is in + std::atomic dependencies; //!< dependencies to wait for + std::atomic stealable; //!< true if task can be stolen + TaskFunction* closure; //!< the closure to execute + Task* parent; //!< parent task to signal when we are finished + size_t stackPtr; //!< stack location where closure is stored + size_t N; //!< approximative size of task + }; + + struct TaskQueue + { + TaskQueue () + : left(0), right(0), stackPtr(0) {} + + __forceinline void* alloc(size_t bytes, size_t align = 64) + { + size_t ofs = bytes + ((align - stackPtr) & (align-1)); + if (stackPtr + ofs > CLOSURE_STACK_SIZE) + // -- GODOT start -- + // throw std::runtime_error("closure stack overflow"); + abort(); + // -- GODOT end -- + stackPtr += ofs; + return &stack[stackPtr-bytes]; + } + + template + __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure) + { + if (right >= TASK_STACK_SIZE) + // -- GODOT start -- + // throw std::runtime_error("task stack overflow"); + abort(); + // -- GODOT end -- + + /* allocate new task on right side of stack */ + size_t oldStackPtr = stackPtr; + TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction))) ClosureTaskFunction(closure); + /* gcc 8 or later fails to compile without explicit .load() */ + new (&(tasks[right.load()])) Task(func,thread.task,oldStackPtr,size); + right++; + + /* also move left pointer */ + if (left >= right-1) left = right-1; + } + + dll_export bool execute_local(Thread& thread, Task* parent); + bool execute_local_internal(Thread& thread, Task* parent); + bool steal(Thread& thread); + size_t getTaskSizeAtLeft(); + + bool empty() { return right == 0; } + + public: + + /* task stack */ + Task tasks[TASK_STACK_SIZE]; + __aligned(64) std::atomic left; //!< threads steal from left + __aligned(64) std::atomic right; //!< new tasks are added to the right + + /* closure stack */ + __aligned(64) char stack[CLOSURE_STACK_SIZE]; + size_t stackPtr; + }; + + /*! thread local structure for each thread */ + struct Thread + { + ALIGNED_STRUCT_(64); + + Thread (size_t threadIndex, const Ref& scheduler) + : threadIndex(threadIndex), task(nullptr), scheduler(scheduler) {} + + __forceinline size_t threadCount() { + return scheduler->threadCounter; + } + + size_t threadIndex; //!< ID of this thread + TaskQueue tasks; //!< local task queue + Task* task; //!< current active task + Ref scheduler; //!< pointer to task scheduler + }; + + /*! pool of worker threads */ + struct ThreadPool + { + ThreadPool (bool set_affinity); + ~ThreadPool (); + + /*! starts the threads */ + dll_export void startThreads(); + + /*! sets number of threads to use */ + void setNumThreads(size_t numThreads, bool startThreads = false); + + /*! adds a task scheduler object for scheduling */ + dll_export void add(const Ref& scheduler); + + /*! remove the task scheduler object again */ + dll_export void remove(const Ref& scheduler); + + /*! returns number of threads of the thread pool */ + size_t size() const { return numThreads; } + + /*! main loop for all threads */ + void thread_loop(size_t threadIndex); + + private: + std::atomic numThreads; + std::atomic numThreadsRunning; + bool set_affinity; + std::atomic running; + std::vector threads; + + private: + MutexSys mutex; + ConditionSys condition; + std::list > schedulers; + }; + + TaskScheduler (); + ~TaskScheduler (); + + /*! initializes the task scheduler */ + static void create(size_t numThreads, bool set_affinity, bool start_threads); + + /*! destroys the task scheduler again */ + static void destroy(); + + /*! lets new worker threads join the tasking system */ + void join(); + void reset(); + + /*! let a worker thread allocate a thread index */ + dll_export ssize_t allocThreadIndex(); + + /*! wait for some number of threads available (threadCount includes main thread) */ + void wait_for_threads(size_t threadCount); + + /*! thread loop for all worker threads */ + // -- GODOT start -- + // std::exception_ptr thread_loop(size_t threadIndex); + void thread_loop(size_t threadIndex); + // -- GODOT end -- + + /*! steals a task from a different thread */ + bool steal_from_other_threads(Thread& thread); + + template + static void steal_loop(Thread& thread, const Predicate& pred, const Body& body); + + /* spawn a new task at the top of the threads task stack */ + template + void spawn_root(const Closure& closure, size_t size = 1, bool useThreadPool = true) + { + if (useThreadPool) startThreads(); + + size_t threadIndex = allocThreadIndex(); + std::unique_ptr mthread(new Thread(threadIndex,this)); // too large for stack allocation + Thread& thread = *mthread; + assert(threadLocal[threadIndex].load() == nullptr); + threadLocal[threadIndex] = &thread; + Thread* oldThread = swapThread(&thread); + thread.tasks.push_right(thread,size,closure); + { + Lock lock(mutex); + anyTasksRunning++; + hasRootTask = true; + condition.notify_all(); + } + + if (useThreadPool) addScheduler(this); + + while (thread.tasks.execute_local(thread,nullptr)); + anyTasksRunning--; + if (useThreadPool) removeScheduler(this); + + threadLocal[threadIndex] = nullptr; + swapThread(oldThread); + + /* remember exception to throw */ + std::exception_ptr except = nullptr; + if (cancellingException != nullptr) except = cancellingException; + + /* wait for all threads to terminate */ + threadCounter--; + while (threadCounter > 0) yield(); + cancellingException = nullptr; + + /* re-throw proper exception */ + if (except != nullptr) + std::rethrow_exception(except); + } + + /* spawn a new task at the top of the threads task stack */ + template + static __forceinline void spawn(size_t size, const Closure& closure) + { + Thread* thread = TaskScheduler::thread(); + if (likely(thread != nullptr)) thread->tasks.push_right(*thread,size,closure); + else instance()->spawn_root(closure,size); + } + + /* spawn a new task at the top of the threads task stack */ + template + static __forceinline void spawn(const Closure& closure) { + spawn(1,closure); + } + + /* spawn a new task set */ + template + static void spawn(const Index begin, const Index end, const Index blockSize, const Closure& closure) + { + spawn(end-begin, [=]() + { + if (end-begin <= blockSize) { + return closure(range(begin,end)); + } + const Index center = (begin+end)/2; + spawn(begin,center,blockSize,closure); + spawn(center,end ,blockSize,closure); + wait(); + }); + } + + /* work on spawned subtasks and wait until all have finished */ + dll_export static bool wait(); + + /* returns the ID of the current thread */ + dll_export static size_t threadID(); + + /* returns the index (0..threadCount-1) of the current thread */ + dll_export static size_t threadIndex(); + + /* returns the total number of threads */ + dll_export static size_t threadCount(); + + private: + + /* returns the thread local task list of this worker thread */ + dll_export static Thread* thread(); + + /* sets the thread local task list of this worker thread */ + dll_export static Thread* swapThread(Thread* thread); + + /*! returns the taskscheduler object to be used by the master thread */ + dll_export static TaskScheduler* instance(); + + /*! starts the threads */ + dll_export static void startThreads(); + + /*! adds a task scheduler object for scheduling */ + dll_export static void addScheduler(const Ref& scheduler); + + /*! remove the task scheduler object again */ + dll_export static void removeScheduler(const Ref& scheduler); + + private: + std::vector> threadLocal; + std::atomic threadCounter; + std::atomic anyTasksRunning; + std::atomic hasRootTask; + std::exception_ptr cancellingException; + MutexSys mutex; + ConditionSys condition; + + private: + static size_t g_numThreads; + static __thread TaskScheduler* g_instance; + static __thread Thread* thread_local_thread; + static ThreadPool* threadPool; + }; + + RTC_NAMESPACE_END + +#if defined(RTC_NAMESPACE) + using RTC_NAMESPACE::TaskScheduler; +#endif +} diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h b/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h new file mode 100644 index 0000000000..776f98cdac --- /dev/null +++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h @@ -0,0 +1,46 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/alloc.h" +#include "../sys/barrier.h" +#include "../sys/thread.h" +#include "../sys/mutex.h" +#include "../sys/condition.h" +#include "../sys/ref.h" + +#if !defined(__WIN32__) +#error PPL tasking system only available under windows +#endif + +#include + +namespace embree +{ + struct TaskScheduler + { + /*! initializes the task scheduler */ + static void create(size_t numThreads, bool set_affinity, bool start_threads); + + /*! destroys the task scheduler again */ + static void destroy(); + + /* returns the ID of the current thread */ + static __forceinline size_t threadID() { + return GetCurrentThreadId(); + } + + /* returns the index (0..threadCount-1) of the current thread */ + /* FIXME: threadIndex is NOT supported by PPL! */ + static __forceinline size_t threadIndex() { + return 0; + } + + /* returns the total number of threads */ + static __forceinline size_t threadCount() { + return GetMaximumProcessorCount(ALL_PROCESSOR_GROUPS) + 1; + } + }; +}; diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h b/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h new file mode 100644 index 0000000000..98dba26871 --- /dev/null +++ b/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h @@ -0,0 +1,67 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/alloc.h" +#include "../sys/barrier.h" +#include "../sys/thread.h" +#include "../sys/mutex.h" +#include "../sys/condition.h" +#include "../sys/ref.h" + +#if defined(__WIN32__) +# define NOMINMAX +#endif + +// We need to define these to avoid implicit linkage against +// tbb_debug.lib under Windows. When removing these lines debug build +// under Windows fails. +#define __TBB_NO_IMPLICIT_LINKAGE 1 +#define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1 +#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1 +#define TBB_PREVIEW_ISOLATED_TASK_GROUP 1 +#include "tbb/tbb.h" +#include "tbb/parallel_sort.h" + +namespace embree +{ + struct TaskScheduler + { + /*! initializes the task scheduler */ + static void create(size_t numThreads, bool set_affinity, bool start_threads); + + /*! destroys the task scheduler again */ + static void destroy(); + + /* returns the ID of the current thread */ + static __forceinline size_t threadID() + { + return threadIndex(); + } + + /* returns the index (0..threadCount-1) of the current thread */ + static __forceinline size_t threadIndex() + { +#if TBB_INTERFACE_VERSION >= 9100 + return tbb::this_task_arena::current_thread_index(); +#elif TBB_INTERFACE_VERSION >= 9000 + return tbb::task_arena::current_thread_index(); +#else + return 0; +#endif + } + + /* returns the total number of threads */ + static __forceinline size_t threadCount() { +#if TBB_INTERFACE_VERSION >= 9100 + return tbb::this_task_arena::max_concurrency(); +#else + return tbb::task_scheduler_init::default_num_threads(); +#endif + } + + }; + +}; -- cgit v1.2.3