diff options
author | Rémi Verschelde <remi@verschelde.fr> | 2021-05-21 18:30:02 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-05-21 18:30:02 +0200 |
commit | 3ee034451a9349e7de26decc662afefd7ab8c460 (patch) | |
tree | a8bec3fbb06c2eaca05a075f5ffe2cdd2d94f04a /thirdparty/embree-aarch64/common | |
parent | 8fa07eae145e1e37eb8708ce8c117188b58e3ecc (diff) | |
parent | 767e374dced69b45db0afb30ca2ccf0bbbeef672 (diff) |
Merge pull request #48885 from JFonS/upgrade_embree
Upgrade Embree to the latest official release (3.13.0).
Diffstat (limited to 'thirdparty/embree-aarch64/common')
114 files changed, 0 insertions, 26738 deletions
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h b/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h deleted file mode 100644 index 01f1f80f6c..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include <functional> -#include "parallel_reduce.h" - -namespace embree -{ - - template<typename Index, class UnaryPredicate> - __forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred) - { - bool ret = false; - -#if defined(TASKING_TBB) -#if TBB_INTERFACE_VERSION >= 12002 - tbb::task_group_context context; - tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred,&context](const tbb::blocked_range<size_t>& r) { - if (context.is_group_execution_cancelled()) return; - for (size_t i = r.begin(); i != r.end(); ++i) { - if (pred(i)) { - ret = true; - context.cancel_group_execution(); - } - } - }); -#else - tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred](const tbb::blocked_range<size_t>& r) { - if (tbb::task::self().is_cancelled()) return; - for (size_t i = r.begin(); i != r.end(); ++i) { - if (pred(i)) { - ret = true; - tbb::task::self().cancel_group_execution(); - } - } - }); -#endif -#else - ret = parallel_reduce (first, last, false, [pred](const range<size_t>& r)->bool { - bool localret = false; - for (auto i=r.begin(); i<r.end(); ++i) { - localret |= pred(i); - } - return localret; - }, - std::bit_or<bool>() - ); -#endif - - return ret; - } - -} // end namespace diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp deleted file mode 100644 index acddc0ff81..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// 
SPDX-License-Identifier: Apache-2.0 - -#include "parallel_filter.h" -#include "../sys/regression.h" -#include <map> - -namespace embree -{ - struct parallel_filter_regression_test : public RegressionTest - { - parallel_filter_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - auto pred = [&]( uint32_t v ) { return (v & 0x3) == 0; }; - - for (size_t N=10; N<1000000; N=size_t(2.1*N)) - { - size_t N0 = rand() % N; - - /* initialize array with random numbers */ - std::vector<uint32_t> src(N); - std::map<uint32_t,int> m; - for (size_t i=0; i<N; i++) src[i] = rand(); - - /* count elements up */ - for (size_t i=N0; i<N; i++) - if (pred(src[i])) - m[src[i]] = 0; - for (size_t i=N0; i<N; i++) - if (pred(src[i])) - m[src[i]]++; - - /* filter array */ - //size_t M = sequential_filter(src.data(),N0,N,pred); - size_t M = parallel_filter(src.data(),N0,N,size_t(1024),pred); - - /* check if filtered data is correct */ - for (size_t i=N0; i<M; i++) { - passed &= pred(src[i]); - m[src[i]]--; - } - for (size_t i=N0; i<M; i++) - passed &= (m[src[i]] == 0); - } - - return passed; - } - }; - - parallel_filter_regression_test parallel_filter_regression("parallel_filter_regression"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.h b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.h deleted file mode 100644 index 5823fc631f..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "parallel_for.h" - -namespace embree -{ - template<typename Ty, typename Index, typename Predicate> - inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate) - { - Index j = first; - for (Index i=first; i<last; i++) - if (predicate(data[i])) - data[j++] = data[i]; - - return 
j; - } - - template<typename Ty, typename Index, typename Predicate> - inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate) - { - /* sequential fallback */ - if (end-begin <= minStepSize) - return sequential_filter(data,begin,end,predicate); - - /* calculate number of tasks to use */ - enum { MAX_TASKS = 64 }; - const Index numThreads = TaskScheduler::threadCount(); - const Index numBlocks = (end-begin+minStepSize-1)/minStepSize; - const Index taskCount = min(numThreads,numBlocks,(Index)MAX_TASKS); - - /* filter blocks */ - Index nused[MAX_TASKS]; - Index nfree[MAX_TASKS]; - parallel_for(taskCount, [&](const Index taskIndex) - { - const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount; - const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount; - const Index i2 = sequential_filter(data,i0,i1,predicate); - nused[taskIndex] = i2-i0; - nfree[taskIndex] = i1-i2; - }); - - /* calculate offsets */ - Index sused=0; - Index sfree=0; - Index pfree[MAX_TASKS]; - for (Index i=0; i<taskCount; i++) - { - sused+=nused[i]; - Index cfree = nfree[i]; pfree[i] = sfree; sfree+=cfree; - } - - /* return if we did not filter out any element */ - assert(sfree <= end-begin); - assert(sused <= end-begin); - if (sused == end-begin) - return end; - - /* otherwise we have to copy misplaced elements around */ - parallel_for(taskCount, [&](const Index taskIndex) - { - /* destination to write elements to */ - Index dst = begin+(taskIndex+0)*(end-begin)/taskCount+nused[taskIndex]; - Index dst_end = min(dst+nfree[taskIndex],begin+sused); - if (dst_end <= dst) return; - - /* range of misplaced elements to copy to destination */ - Index r0 = pfree[taskIndex]; - Index r1 = r0+dst_end-dst; - - /* find range in misplaced elements in back to front order */ - Index k0=0; - for (Index i=taskCount-1; i>0; i--) - { - if (k0 > r1) break; - Index k1 = k0+nused[i]; - Index src = begin+(i+0)*(end-begin)/taskCount+nused[i]; - for 
(Index i=max(r0,k0); i<min(r1,k1); i++) { - Index isrc = src-i+k0-1; - assert(dst >= begin && dst < end); - assert(isrc >= begin && isrc < end); - data[dst++] = data[isrc]; - } - k0 = k1; - } - }); - - return begin+sused; - } -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp deleted file mode 100644 index ef070ebc4d..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_for.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_for_regression_test : public RegressionTest - { - parallel_for_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - const size_t M = 10; - for (size_t N=10; N<10000000; N=size_t(2.1*N)) - { - /* sequentially calculate sum of squares */ - size_t sum0 = 0; - for (size_t i=0; i<N; i++) { - sum0 += i*i; - } - - /* parallel calculation of sum of squares */ - for (size_t m=0; m<M; m++) - { - std::atomic<size_t> sum1(0); - parallel_for( size_t(0), size_t(N), size_t(1024), [&](const range<size_t>& r) - { - size_t s = 0; - for (size_t i=r.begin(); i<r.end(); i++) - s += i*i; - sum1 += s; - }); - passed = sum0 == sum1; - } - } - - return passed; - } - }; - - parallel_for_regression_test parallel_for_regression("parallel_for_regression_test"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for.h deleted file mode 100644 index 51d296fb16..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_for.h +++ /dev/null @@ -1,229 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../tasking/taskscheduler.h" -#include "../sys/array.h" -#include 
"../math/math.h" -#include "../math/range.h" - -#if defined(TASKING_GCD) && defined(BUILD_IOS) -#include <dispatch/dispatch.h> -#include <algorithm> -#include <type_traits> -#endif - -namespace embree -{ - /* parallel_for without range */ - template<typename Index, typename Func> - __forceinline void parallel_for( const Index N, const Func& func) - { -#if defined(TASKING_INTERNAL) - if (N) { - TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range<Index>& r) { - assert(r.size() == 1); - func(r.begin()); - }); - if (!TaskScheduler::wait()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - } -#elif defined(TASKING_GCD) && defined(BUILD_IOS) - - const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? TaskScheduler::threadCount() : 1; - const size_t length = N; - const size_t blockSize = (length + baselineNumBlocks-1) / baselineNumBlocks; - const size_t numBlocks = (length + blockSize-1) / blockSize; - - dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) { - - const size_t start = (currentBlock * blockSize); - const size_t blockLength = std::min(length - start, blockSize); - const size_t end = start + blockLength; - - for(size_t i=start; i < end; i++) - { - func(i); - } - }); - -#elif defined(TASKING_TBB) - #if TBB_INTERFACE_VERSION >= 12002 - tbb::task_group_context context; - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - },context); - if (context.is_group_execution_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #else - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - }); - if (tbb::task::self().is_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #endif - -#elif defined(TASKING_PPL) - concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - }); -#else -# error "no tasking system 
enabled" -#endif - } - - /* parallel for with range and granulatity */ - template<typename Index, typename Func> - __forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func) - { - assert(first <= last); -#if defined(TASKING_INTERNAL) - TaskScheduler::spawn(first,last,minStepSize,func); - if (!TaskScheduler::wait()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - -#elif defined(TASKING_GCD) && defined(BUILD_IOS) - - const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? 4*TaskScheduler::threadCount() : 1; - const size_t length = last - first; - const size_t blockSizeByThreads = (length + baselineNumBlocks-1) / baselineNumBlocks; - size_t blockSize = std::max<size_t>(minStepSize,blockSizeByThreads); - blockSize += blockSize % 4; - - const size_t numBlocks = (length + blockSize-1) / blockSize; - - dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) { - - const size_t start = first + (currentBlock * blockSize); - const size_t end = std::min<size_t>(last, start + blockSize); - - func( embree::range<Index>(start,end) ); - }); - - -#elif defined(TASKING_TBB) - #if TBB_INTERFACE_VERSION >= 12002 - tbb::task_group_context context; - tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) { - func(range<Index>(r.begin(),r.end())); - },context); - if (context.is_group_execution_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #else - tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) { - func(range<Index>(r.begin(),r.end())); - }); - if (tbb::task::self().is_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #endif - -#elif defined(TASKING_PPL) - concurrency::parallel_for(first, last, 
Index(1) /*minStepSize*/, [&](Index i) { - func(range<Index>(i,i+1)); - }); - -#else -# error "no tasking system enabled" -#endif - } - - /* parallel for with range */ - template<typename Index, typename Func> - __forceinline void parallel_for( const Index first, const Index last, const Func& func) - { - assert(first <= last); - parallel_for(first,last,(Index)1,func); - } - -#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001) - - template<typename Index, typename Func> - __forceinline void parallel_for_static( const Index N, const Func& func) - { - #if TBB_INTERFACE_VERSION >= 12002 - tbb::task_group_context context; - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - },tbb::simple_partitioner(),context); - if (context.is_group_execution_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #else - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - },tbb::simple_partitioner()); - if (tbb::task::self().is_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #endif - } - - typedef tbb::affinity_partitioner affinity_partitioner; - - template<typename Index, typename Func> - __forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap) - { - #if TBB_INTERFACE_VERSION >= 12002 - tbb::task_group_context context; - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - },ap,context); - if (context.is_group_execution_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #else - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - },ap); - if (tbb::task::self().is_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #endif - } - -#else - - template<typename Index, typename Func> - __forceinline void 
parallel_for_static( const Index N, const Func& func) - { - parallel_for(N,func); - } - - struct affinity_partitioner { - }; - - template<typename Index, typename Func> - __forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap) - { - parallel_for(N,func); - } - -#endif -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp deleted file mode 100644 index 0337611b35..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_for_for.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_for_for_regression_test : public RegressionTest - { - parallel_for_for_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - /* create vector with random numbers */ - size_t sum0 = 0; - size_t K = 0; - const size_t M = 1000; - std::vector<std::vector<size_t>* > array2(M); - for (size_t i=0; i<M; i++) { - const size_t N = rand() % 1024; - K+=N; - array2[i] = new std::vector<size_t>(N); - for (size_t j=0; j<N; j++) - sum0 += (*array2[i])[j] = rand(); - } - - /* array to test global index */ - std::vector<atomic<size_t>> verify_k(K); - for (size_t i=0; i<K; i++) verify_k[i].store(0); - - /* add all numbers using parallel_for_for */ - std::atomic<size_t> sum1(0); - parallel_for_for( array2, size_t(1), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k) -> size_t - { - size_t s = 0; - for (size_t i=r.begin(); i<r.end(); i++) { - s += (*v)[i]; - verify_k[k++]++; - } - sum1 += s; - return sum1; - }); - passed &= (sum0 == sum1); - - /* check global index */ - for (size_t i=0; i<K; i++) - passed &= (verify_k[i] == 1); - - /* delete vectors again */ - for (size_t i=0; i<array2.size(); i++) 
- delete array2[i]; - - return passed; - } - }; - - parallel_for_for_regression_test parallel_for_for_regression("parallel_for_for_regression_test"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h deleted file mode 100644 index 852b8a0900..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "parallel_for.h" - -namespace embree -{ - template<typename ArrayArray, typename Func> - __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) - { - size_t k=0; - for (size_t i=0; i!=array2.size(); ++i) { - const size_t N = array2[i]->size(); - if (N) func(array2[i],range<size_t>(0,N),k); - k+=N; - } - } - - class ParallelForForState - { - public: - - enum { MAX_TASKS = 64 }; - - __forceinline ParallelForForState () - : taskCount(0) {} - - template<typename ArrayArray> - __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) { - init(array2,minStepSize); - } - - template<typename ArrayArray> - __forceinline void init ( ArrayArray& array2, const size_t minStepSize ) - { - /* first calculate total number of elements */ - size_t N = 0; - for (size_t i=0; i<array2.size(); i++) { - N += array2[i] ? 
array2[i]->size() : 0; - } - this->N = N; - - /* calculate number of tasks to use */ - const size_t numThreads = TaskScheduler::threadCount(); - const size_t numBlocks = (N+minStepSize-1)/minStepSize; - taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS))); - - /* calculate start (i,j) for each task */ - size_t taskIndex = 0; - i0[taskIndex] = 0; - j0[taskIndex] = 0; - size_t k0 = (++taskIndex)*N/taskCount; - for (size_t i=0, k=0; taskIndex < taskCount; i++) - { - assert(i<array2.size()); - size_t j=0, M = array2[i] ? array2[i]->size() : 0; - while (j<M && k+M-j >= k0 && taskIndex < taskCount) { - assert(taskIndex<taskCount); - i0[taskIndex] = i; - j0[taskIndex] = j += k0-k; - k=k0; - k0 = (++taskIndex)*N/taskCount; - } - k+=M-j; - } - } - - __forceinline size_t size() const { - return N; - } - - public: - size_t i0[MAX_TASKS]; - size_t j0[MAX_TASKS]; - size_t taskCount; - size_t N; - }; - - template<typename ArrayArray, typename Func> - __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) - { - ParallelForForState state(array2,minStepSize); - - parallel_for(state.taskCount, [&](const size_t taskIndex) - { - /* calculate range */ - const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; - const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; - size_t i0 = state.i0[taskIndex]; - size_t j0 = state.j0[taskIndex]; - - /* iterate over arrays */ - size_t k=k0; - for (size_t i=i0; k<k1; i++) { - const size_t N = array2[i] ? 
array2[i]->size() : 0; - const size_t r0 = j0, r1 = min(N,r0+k1-k); - if (r1 > r0) func(array2[i],range<size_t>(r0,r1),k); - k+=r1-r0; j0 = 0; - } - }); - } - - template<typename ArrayArray, typename Func> - __forceinline void parallel_for_for( ArrayArray& array2, const Func& func ) - { - parallel_for_for(array2,1,func); - } - - template<typename ArrayArray, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) - { - ParallelForForState state(array2,minStepSize); - Value temp[ParallelForForState::MAX_TASKS]; - - for (size_t i=0; i<state.taskCount; i++) - temp[i] = identity; - - parallel_for(state.taskCount, [&](const size_t taskIndex) - { - /* calculate range */ - const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; - const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; - size_t i0 = state.i0[taskIndex]; - size_t j0 = state.j0[taskIndex]; - - /* iterate over arrays */ - size_t k=k0; - for (size_t i=i0; k<k1; i++) { - const size_t N = array2[i] ? 
array2[i]->size() : 0; - const size_t r0 = j0, r1 = min(N,r0+k1-k); - if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range<size_t>(r0,r1),k)); - k+=r1-r0; j0 = 0; - } - }); - - Value ret = identity; - for (size_t i=0; i<state.taskCount; i++) - ret = reduction(ret,temp[i]); - return ret; - } - - template<typename ArrayArray, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction) - { - return parallel_for_for_reduce(array2,1,identity,func,reduction); - } -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp deleted file mode 100644 index 0169d8e481..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_for_for_prefix_sum.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_for_for_prefix_sum_regression_test : public RegressionTest - { - parallel_for_for_prefix_sum_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - /* create vector with random numbers */ - const size_t M = 10; - std::vector<atomic<size_t>> flattened; - typedef std::vector<std::vector<size_t>* > ArrayArray; - ArrayArray array2(M); - size_t K = 0; - for (size_t i=0; i<M; i++) { - const size_t N = rand() % 10; - K += N; - array2[i] = new std::vector<size_t>(N); - for (size_t j=0; j<N; j++) - (*array2[i])[j] = rand() % 10; - } - - /* array to test global index */ - std::vector<atomic<size_t>> verify_k(K); - for (size_t i=0; i<K; i++) verify_k[i].store(0); - - ParallelForForPrefixSumState<size_t> state(array2,size_t(1)); - - /* dry run only counts */ - 
size_t S = parallel_for_for_prefix_sum0( state, array2, size_t(0), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k, size_t i) -> size_t - { - size_t s = 0; - for (size_t i=r.begin(); i<r.end(); i++) { - s += (*v)[i]; - verify_k[k++]++; - } - return s; - }, [](size_t v0, size_t v1) { return v0+v1; }); - - /* create properly sized output array */ - flattened.resize(S); - for (auto& a : flattened) a.store(0); - - /* now we actually fill the flattened array */ - parallel_for_for_prefix_sum1( state, array2, size_t(0), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k, size_t i, const size_t base) -> size_t - { - size_t s = 0; - for (size_t i=r.begin(); i<r.end(); i++) { - for (size_t j=0; j<(*v)[i]; j++) { - flattened[base+s+j]++; - } - s += (*v)[i]; - verify_k[k++]++; - } - return s; - }, [](size_t v0, size_t v1) { return v0+v1; }); - - /* check global index */ - for (size_t i=0; i<K; i++) - passed &= (verify_k[i] == 2); - - /* check if each element was assigned exactly once */ - for (size_t i=0; i<flattened.size(); i++) - passed &= (flattened[i] == 1); - - /* delete arrays again */ - for (size_t i=0; i<array2.size(); i++) - delete array2[i]; - - return passed; - } - }; - - parallel_for_for_prefix_sum_regression_test parallel_for_for_prefix_sum_regression("parallel_for_for_prefix_sum_regression_test"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h deleted file mode 100644 index d2671d8a6a..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "parallel_for_for.h" -#include "parallel_prefix_sum.h" - -namespace embree -{ - template<typename Value> - struct ParallelForForPrefixSumState : public ParallelForForState - { - __forceinline 
ParallelForForPrefixSumState () {} - - template<typename ArrayArray> - __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize) - : ParallelForForState(array2,minStepSize) {} - - ParallelPrefixSumState<Value> prefix_state; - }; - - template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, - const Value& identity, const Func& func, const Reduction& reduction) - { - /* calculate number of tasks to use */ - const size_t taskCount = state.taskCount; - /* perform parallel prefix sum */ - parallel_for(taskCount, [&](const size_t taskIndex) - { - const size_t k0 = (taskIndex+0)*state.size()/taskCount; - const size_t k1 = (taskIndex+1)*state.size()/taskCount; - size_t i0 = state.i0[taskIndex]; - size_t j0 = state.j0[taskIndex]; - - /* iterate over arrays */ - size_t k=k0; - Value N=identity; - for (size_t i=i0; k<k1; i++) { - const size_t size = array2[i] ? 
array2[i]->size() : 0; - const size_t r0 = j0, r1 = min(size,r0+k1-k); - if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i)); - k+=r1-r0; j0 = 0; - } - state.prefix_state.counts[taskIndex] = N; - }); - - /* calculate prefix sum */ - Value sum=identity; - for (size_t i=0; i<taskCount; i++) - { - const Value c = state.prefix_state.counts[i]; - state.prefix_state.sums[i] = sum; - sum=reduction(sum,c); - } - - return sum; - } - - template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, - const Value& identity, const Func& func, const Reduction& reduction) - { - /* calculate number of tasks to use */ - const size_t taskCount = state.taskCount; - /* perform parallel prefix sum */ - parallel_for(taskCount, [&](const size_t taskIndex) - { - const size_t k0 = (taskIndex+0)*state.size()/taskCount; - const size_t k1 = (taskIndex+1)*state.size()/taskCount; - size_t i0 = state.i0[taskIndex]; - size_t j0 = state.j0[taskIndex]; - - /* iterate over arrays */ - size_t k=k0; - Value N=identity; - for (size_t i=i0; k<k1; i++) { - const size_t size = array2[i] ? 
array2[i]->size() : 0; - const size_t r0 = j0, r1 = min(size,r0+k1-k); - if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N))); - k+=r1-r0; j0 = 0; - } - state.prefix_state.counts[taskIndex] = N; - }); - - /* calculate prefix sum */ - Value sum=identity; - for (size_t i=0; i<taskCount; i++) - { - const Value c = state.prefix_state.counts[i]; - state.prefix_state.sums[i] = sum; - sum=reduction(sum,c); - } - - return sum; - } - - template<typename ArrayArray, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, - const Value& identity, const Func& func, const Reduction& reduction) - { - return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction); - } - - template<typename ArrayArray, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, - const Value& identity, const Func& func, const Reduction& reduction) - { - return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction); - } -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp deleted file mode 100644 index 09dc303f81..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_map.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_map_regression_test : public RegressionTest - { - parallel_map_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - /* create key/value vectors with random numbers */ - const 
size_t N = 10000; - std::vector<uint32_t> keys(N); - std::vector<uint32_t> vals(N); - for (size_t i=0; i<N; i++) keys[i] = 2*unsigned(i)*647382649; - for (size_t i=0; i<N; i++) std::swap(keys[i],keys[rand()%N]); - for (size_t i=0; i<N; i++) vals[i] = 2*rand(); - - /* create map */ - parallel_map<uint32_t,uint32_t> map; - map.init(keys,vals); - - /* check that all keys are properly mapped */ - for (size_t i=0; i<N; i++) { - const uint32_t* val = map.lookup(keys[i]); - passed &= val && (*val == vals[i]); - } - - /* check that these keys are not in the map */ - for (size_t i=0; i<N; i++) { - passed &= !map.lookup(keys[i]+1); - } - - return passed; - } - }; - - parallel_map_regression_test parallel_map_regression("parallel_map_regression_test"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_map.h b/thirdparty/embree-aarch64/common/algorithms/parallel_map.h deleted file mode 100644 index 02e1a8f8d0..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_map.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "parallel_sort.h" - -namespace embree -{ - /*! implementation of a key/value map with parallel construction */ - template<typename Key, typename Val> - class parallel_map - { - /* key/value pair to build the map */ - struct KeyValue - { - __forceinline KeyValue () {} - - __forceinline KeyValue (const Key key, const Val val) - : key(key), val(val) {} - - __forceinline operator Key() const { - return key; - } - - public: - Key key; - Val val; - }; - - public: - - /*! parallel map constructors */ - parallel_map () {} - - /*! construction from pair of vectors */ - template<typename KeyVector, typename ValVector> - parallel_map (const KeyVector& keys, const ValVector& values) { init(keys,values); } - - /*! 
initialized the parallel map from a vector with keys and values */ - template<typename KeyVector, typename ValVector> - void init(const KeyVector& keys, const ValVector& values) - { - /* reserve sufficient space for all data */ - assert(keys.size() == values.size()); - vec.resize(keys.size()); - - /* generate key/value pairs */ - parallel_for( size_t(0), keys.size(), size_t(4*4096), [&](const range<size_t>& r) { - for (size_t i=r.begin(); i<r.end(); i++) - vec[i] = KeyValue((Key)keys[i],values[i]); - }); - - /* perform parallel radix sort of the key/value pairs */ - std::vector<KeyValue> temp(keys.size()); - radix_sort<KeyValue,Key>(vec.data(),temp.data(),keys.size()); - } - - /*! Returns a pointer to the value associated with the specified key. The pointer will be nullptr of the key is not contained in the map. */ - __forceinline const Val* lookup(const Key& key) const - { - typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key); - if (i == vec.end()) return nullptr; - if (i->key != key) return nullptr; - return &i->val; - } - - /*! If the key is in the map, the function returns the value associated with the key, otherwise it returns the default value. */ - __forceinline Val lookup(const Key& key, const Val& def) const - { - typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key); - if (i == vec.end()) return def; - if (i->key != key) return def; - return i->val; - } - - /*! 
clears all state */ - void clear() { - vec.clear(); - } - - private: - std::vector<KeyValue> vec; //!< vector containing sorted elements - }; -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp deleted file mode 100644 index eb20c4465d..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_partition.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_partition_regression_test : public RegressionTest - { - parallel_partition_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - for (size_t i=0; i<100; i++) - { - /* create random permutation */ - size_t N = std::rand() % 1000000; - std::vector<unsigned> array(N); - for (unsigned i=0; i<N; i++) array[i] = i; - for (auto& v : array) std::swap(v,array[std::rand()%array.size()]); - size_t split = std::rand() % (N+1); - - /* perform parallel partitioning */ - size_t left_sum = 0, right_sum = 0; - size_t mid = parallel_partitioning(array.data(),0,array.size(),0,left_sum,right_sum, - [&] ( size_t i ) { return i < split; }, - [] ( size_t& sum, unsigned v) { sum += v; }, - [] ( size_t& sum, size_t v) { sum += v; }, - 128); - - /*serial_partitioning(array.data(),0,array.size(),left_sum,right_sum, - [&] ( size_t i ) { return i < split; }, - [] ( size_t& left_sum, int v) { left_sum += v; });*/ - - /* verify result */ - passed &= mid == split; - passed &= left_sum == split*(split-1)/2; - passed &= right_sum == N*(N-1)/2-left_sum; - for (size_t i=0; i<split; i++) passed &= array[i] < split; - for (size_t i=split; i<N; i++) passed &= array[i] >= split; - } - - return passed; - } - }; - - parallel_partition_regression_test 
parallel_partition_regression("parallel_partition_regression_test"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h deleted file mode 100644 index 3b3ad7c854..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h +++ /dev/null @@ -1,283 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "parallel_for.h" -#include "../math/range.h" - -namespace embree -{ - /* serial partitioning */ - template<typename T, typename V, typename IsLeft, typename Reduction_T> - __forceinline size_t serial_partitioning(T* array, - const size_t begin, - const size_t end, - V& leftReduction, - V& rightReduction, - const IsLeft& is_left, - const Reduction_T& reduction_t) - { - T* l = array + begin; - T* r = array + end - 1; - - while(1) - { - /* *l < pivot */ - while (likely(l <= r && is_left(*l) )) - { - //prefetchw(l+4); // FIXME: enable? - reduction_t(leftReduction,*l); - ++l; - } - /* *r >= pivot) */ - while (likely(l <= r && !is_left(*r))) - { - //prefetchw(r-4); FIXME: enable? 
- reduction_t(rightReduction,*r); - --r; - } - if (r<l) break; - - reduction_t(leftReduction ,*r); - reduction_t(rightReduction,*l); - xchg(*l,*r); - l++; r--; - } - - return l - array; - } - - template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V> - class __aligned(64) parallel_partition_task - { - ALIGNED_CLASS_(64); - private: - - static const size_t MAX_TASKS = 64; - - T* array; - size_t N; - const IsLeft& is_left; - const Reduction_T& reduction_t; - const Reduction_V& reduction_v; - const Vi& identity; - - size_t numTasks; - __aligned(64) size_t counter_start[MAX_TASKS+1]; - __aligned(64) size_t counter_left[MAX_TASKS+1]; - __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS]; - __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS]; - __aligned(64) V leftReductions[MAX_TASKS]; - __aligned(64) V rightReductions[MAX_TASKS]; - - public: - - __forceinline parallel_partition_task(T* array, - const size_t N, - const Vi& identity, - const IsLeft& is_left, - const Reduction_T& reduction_t, - const Reduction_V& reduction_v, - const size_t BLOCK_SIZE) - - : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity), - numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {} - - __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges) - { - size_t i = 0; - while(index >= (size_t)r[i].size()) - { - assert(i < numRanges); - index -= (size_t)r[i].size(); - i++; - } - return &r[i]; - } - - __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges, - const size_t numRightMisplacedRanges, - const size_t startID, - const size_t endID) - { - size_t leftLocalIndex = startID; - size_t rightLocalIndex = startID; - const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges); - const range<ssize_t>* 
r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges); - - size_t l_left = l_range->size() - leftLocalIndex; - size_t r_left = r_range->size() - rightLocalIndex; - T *__restrict__ l = &array[l_range->begin() + leftLocalIndex]; - T *__restrict__ r = &array[r_range->begin() + rightLocalIndex]; - size_t size = endID - startID; - size_t items = min(size,min(l_left,r_left)); - - while (size) - { - if (unlikely(l_left == 0)) - { - l_range++; - l_left = l_range->size(); - l = &array[l_range->begin()]; - items = min(size,min(l_left,r_left)); - } - - if (unlikely(r_left == 0)) - { - r_range++; - r_left = r_range->size(); - r = &array[r_range->begin()]; - items = min(size,min(l_left,r_left)); - } - - size -= items; - l_left -= items; - r_left -= items; - - while(items) { - items--; - xchg(*l++,*r++); - } - } - } - - __forceinline size_t partition(V& leftReduction, V& rightReduction) - { - /* partition the individual ranges for each task */ - parallel_for(numTasks,[&] (const size_t taskID) { - const size_t startID = (taskID+0)*N/numTasks; - const size_t endID = (taskID+1)*N/numTasks; - V local_left(identity); - V local_right(identity); - const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t); - counter_start[taskID] = startID; - counter_left [taskID] = mid-startID; - leftReductions[taskID] = local_left; - rightReductions[taskID] = local_right; - }); - counter_start[numTasks] = N; - counter_left[numTasks] = 0; - - /* finalize the reductions */ - for (size_t i=0; i<numTasks; i++) { - reduction_v(leftReduction,leftReductions[i]); - reduction_v(rightReduction,rightReductions[i]); - } - - /* calculate mid point for partitioning */ - size_t mid = counter_left[0]; - for (size_t i=1; i<numTasks; i++) - mid += counter_left[i]; - const range<ssize_t> globalLeft (0,mid); - const range<ssize_t> globalRight(mid,N); - - /* calculate all left and right ranges that are on the wrong global side */ - size_t 
numMisplacedRangesLeft = 0; - size_t numMisplacedRangesRight = 0; - size_t numMisplacedItemsLeft = 0; - size_t numMisplacedItemsRight = 0; - - for (size_t i=0; i<numTasks; i++) - { - const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]); - const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]); - const range<ssize_t> left_misplaced = globalLeft. intersect(right_range); - const range<ssize_t> right_misplaced = globalRight.intersect(left_range); - - if (!left_misplaced.empty()) - { - numMisplacedItemsLeft += left_misplaced.size(); - leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced; - } - - if (!right_misplaced.empty()) - { - numMisplacedItemsRight += right_misplaced.size(); - rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced; - } - } - assert( numMisplacedItemsLeft == numMisplacedItemsRight ); - - /* if no items are misplaced we are done */ - if (numMisplacedItemsLeft == 0) - return mid; - - /* otherwise we copy the items to the right place in parallel */ - parallel_for(numTasks,[&] (const size_t taskID) { - const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks; - const size_t endID = (taskID+1)*numMisplacedItemsLeft/numTasks; - swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID); - }); - - return mid; - } - }; - - template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V> - __noinline size_t parallel_partitioning(T* array, - const size_t begin, - const size_t end, - const Vi &identity, - V &leftReduction, - V &rightReduction, - const IsLeft& is_left, - const Reduction_T& reduction_t, - const Reduction_V& reduction_v, - size_t BLOCK_SIZE = 128) - { - /* fall back to single threaded partitioning for small N */ - if (unlikely(end-begin < BLOCK_SIZE)) - return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); - - /* otherwise use parallel code 
*/ - else { - typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task; - std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); - return begin+p->partition(leftReduction,rightReduction); - } - } - - template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V> - __noinline size_t parallel_partitioning(T* array, - const size_t begin, - const size_t end, - const Vi &identity, - V &leftReduction, - V &rightReduction, - const IsLeft& is_left, - const Reduction_T& reduction_t, - const Reduction_V& reduction_v, - size_t BLOCK_SIZE, - size_t PARALLEL_THRESHOLD) - { - /* fall back to single threaded partitioning for small N */ - if (unlikely(end-begin < PARALLEL_THRESHOLD)) - return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); - - /* otherwise use parallel code */ - else { - typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task; - std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); - return begin+p->partition(leftReduction,rightReduction); - } - } - - - template<typename T, typename IsLeft> - inline size_t parallel_partitioning(T* array, - const size_t begin, - const size_t end, - const IsLeft& is_left, - size_t BLOCK_SIZE = 128) - { - size_t leftReduction = 0; - size_t rightReduction = 0; - return parallel_partitioning( - array,begin,end,0,leftReduction,rightReduction,is_left, - [] (size_t& t,const T& ref) { }, - [] (size_t& t0,size_t& t1) { }, - BLOCK_SIZE); - } - -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp deleted file mode 100644 index 685952c3dc..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp +++ /dev/null @@ -1,48 +0,0 @@ 
-// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_prefix_sum.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_prefix_sum_regression_test : public RegressionTest - { - parallel_prefix_sum_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - const size_t M = 10; - - for (size_t N=10; N<10000000; N=size_t(2.1*N)) - { - /* initialize array with random numbers */ - uint32_t sum0 = 0; - std::vector<uint32_t> src(N); - for (size_t i=0; i<N; i++) { - sum0 += src[i] = rand(); - } - - /* calculate parallel prefix sum */ - std::vector<uint32_t> dst(N); - for (auto& v : dst) v = 0; - - for (size_t i=0; i<M; i++) { - uint32_t sum1 = parallel_prefix_sum(src,dst,N,0,std::plus<uint32_t>()); - passed &= (sum0 == sum1); - } - - /* check if prefix sum is correct */ - for (size_t i=0, sum=0; i<N; sum+=src[i++]) - passed &= ((uint32_t)sum == dst[i]); - } - - return passed; - } - }; - - parallel_prefix_sum_regression_test parallel_prefix_sum_regression("parallel_prefix_sum_regression"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h deleted file mode 100644 index 117c7a79b0..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "parallel_for.h" - -namespace embree -{ - template<typename Value> - struct ParallelPrefixSumState - { - enum { MAX_TASKS = 64 }; - Value counts[MAX_TASKS]; - Value sums [MAX_TASKS]; - }; - - template<typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, 
const Reduction& reduction) - { - /* calculate number of tasks to use */ - const size_t numThreads = TaskScheduler::threadCount(); - const size_t numBlocks = (last-first+minStepSize-1)/minStepSize; - const size_t taskCount = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS)); - - /* perform parallel prefix sum */ - parallel_for(taskCount, [&](const size_t taskIndex) - { - const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount; - const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount; - state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]); - }); - - /* calculate prefix sum */ - Value sum=identity; - for (size_t i=0; i<taskCount; i++) - { - const Value c = state.counts[i]; - state.sums[i] = sum; - sum=reduction(sum,c); - } - - return sum; - } - - /*! parallel calculation of prefix sums */ - template<typename SrcArray, typename DstArray, typename Value, typename Add> - __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096) - { - /* perform single threaded prefix operation for small N */ - if (N < SINGLE_THREAD_THRESHOLD) - { - Value sum=identity; - for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum; - return sum; - } - - /* perform parallel prefix operation for large N */ - else - { - ParallelPrefixSumState<Value> state; - - /* initial run just sets up start values for subtasks */ - parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value { - - Value s = identity; - for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]); - return s; - - }, add); - - /* final run calculates prefix sum */ - return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value { - - Value s = identity; - for (size_t i=r.begin(); i<r.end(); i++) { - dst[i] = add(sum,s); - 
s = add(s,src[i]); - } - return s; - - }, add); - } - } -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp deleted file mode 100644 index 331fe4288e..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_reduce.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_reduce_regression_test : public RegressionTest - { - parallel_reduce_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - const size_t M = 10; - for (size_t N=10; N<10000000; N=size_t(2.1*N)) - { - /* sequentially calculate sum of squares */ - size_t sum0 = 0; - for (size_t i=0; i<N; i++) { - sum0 += i*i; - } - - /* parallel calculation of sum of squares */ - for (size_t m=0; m<M; m++) - { - size_t sum1 = parallel_reduce( size_t(0), size_t(N), size_t(1024), size_t(0), [&](const range<size_t>& r) -> size_t - { - size_t s = 0; - for (size_t i=r.begin(); i<r.end(); i++) - s += i*i; - return s; - }, - [](const size_t v0, const size_t v1) { - return v0+v1; - }); - passed = sum0 == sum1; - } - } - return passed; - } - }; - - parallel_reduce_regression_test parallel_reduce_regression("parallel_reduce_regression_test"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h deleted file mode 100644 index 0daf94e50e..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "parallel_for.h" - -namespace embree -{ - template<typename Index, typename Value, typename Func, typename Reduction> - __forceinline 
Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) - { - return func(range<Index>(first,last)); - } - - template<typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) - { - return func(range<Index>(first,last)); - } - - template<typename Index, typename Value, typename Func, typename Reduction> - __noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) - { - const Index maxTasks = 512; - const Index threadCount = (Index) TaskScheduler::threadCount(); - taskCount = min(taskCount,threadCount,maxTasks); - - /* parallel invokation of all tasks */ - dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack - parallel_for(taskCount, [&](const Index taskIndex) { - const Index k0 = first+(taskIndex+0)*(last-first)/taskCount; - const Index k1 = first+(taskIndex+1)*(last-first)/taskCount; - values[taskIndex] = func(range<Index>(k0,k1)); - }); - - /* perform reduction over all tasks */ - Value v = identity; - for (Index i=0; i<taskCount; i++) v = reduction(v,values[i]); - return v; - } - - template<typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) - { -#if defined(TASKING_INTERNAL) || (defined(TASKING_GCD) && defined(BUILD_IOS)) - - /* fast path for small number of iterations */ - Index taskCount = (last-first+minStepSize-1)/minStepSize; - if (likely(taskCount == 1)) { - return func(range<Index>(first,last)); - } - return 
parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction); - -#elif defined(TASKING_TBB) - #if TBB_INTERFACE_VERSION >= 12002 - tbb::task_group_context context; - const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity, - [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); }, - reduction,context); - // -- GODOT start -- - // if (context.is_group_execution_cancelled()) - // throw std::runtime_error("task cancelled"); - // -- GODOT end -- - return v; - #else - const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity, - [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); }, - reduction); - // -- GODOT start -- - // if (tbb::task::self().is_cancelled()) - // throw std::runtime_error("task cancelled"); - // -- GODOT end -- - return v; - #endif -#else // TASKING_PPL - struct AlignedValue - { - char storage[__alignof(Value)+sizeof(Value)]; - static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); }; - Value* getValuePtr() { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); } - const Value* getValuePtr() const { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); } - AlignedValue(const Value& v) { new(getValuePtr()) Value(v); } - AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); } - AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); }; - AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; }; - AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; }; - operator Value() const { return *getValuePtr(); } - }; - - struct Iterator_Index - { - Index v; - typedef std::forward_iterator_tag 
iterator_category; - typedef AlignedValue value_type; - typedef Index difference_type; - typedef Index distance_type; - typedef AlignedValue* pointer; - typedef AlignedValue& reference; - __forceinline Iterator_Index() {} - __forceinline Iterator_Index(Index v) : v(v) {} - __forceinline bool operator== (Iterator_Index other) { return v == other.v; } - __forceinline bool operator!= (Iterator_Index other) { return v != other.v; } - __forceinline Iterator_Index operator++() { return Iterator_Index(++v); } - __forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); } - }; - - auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) { - assert(begin.v < end.v); - return reduction(start, func(range<Index>(begin.v, end.v))); - }; - const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction); - return v; -#endif - } - - template<typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) - { - if (likely(last-first < parallel_threshold)) { - return func(range<Index>(first,last)); - } else { - return parallel_reduce(first,last,minStepSize,identity,func,reduction); - } - } - - template<typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_reduce( const range<Index> range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) - { - return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction); - } - - template<typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, 
const Func& func, const Reduction& reduction ) - { - auto funcr = [&] ( const range<Index> r ) { - Value v = identity; - for (Index i=r.begin(); i<r.end(); i++) - v = reduction(v,func(i)); - return v; - }; - return parallel_reduce(first,last,Index(1),identity,funcr,reduction); - } - - template<typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_reduce( const range<Index> range, const Value& identity, const Func& func, const Reduction& reduction ) - { - return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction); - } -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp deleted file mode 100644 index 20b639c1c9..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_set.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_set_regression_test : public RegressionTest - { - parallel_set_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - /* create vector with random numbers */ - const size_t N = 10000; - std::vector<uint32_t> unsorted(N); - for (size_t i=0; i<N; i++) unsorted[i] = 2*rand(); - - /* created set from numbers */ - parallel_set<uint32_t> sorted; - sorted.init(unsorted); - - /* check that all elements are in the set */ - for (size_t i=0; i<N; i++) { - passed &= sorted.lookup(unsorted[i]); - } - - /* check that these elements are not in the set */ - for (size_t i=0; i<N; i++) { - passed &= !sorted.lookup(unsorted[i]+1); - } - - return passed; - } - }; - - parallel_set_regression_test parallel_set_regression("parallel_set_regression_test"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_set.h 
b/thirdparty/embree-aarch64/common/algorithms/parallel_set.h deleted file mode 100644 index 640beba7ec..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_set.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "parallel_sort.h" - -namespace embree -{ - /* implementation of a set of values with parallel construction */ - template<typename T> - class parallel_set - { - public: - - /*! default constructor for the parallel set */ - parallel_set () {} - - /*! construction from vector */ - template<typename Vector> - parallel_set (const Vector& in) { init(in); } - - /*! initialized the parallel set from a vector */ - template<typename Vector> - void init(const Vector& in) - { - /* copy data to internal vector */ - vec.resize(in.size()); - parallel_for( size_t(0), in.size(), size_t(4*4096), [&](const range<size_t>& r) { - for (size_t i=r.begin(); i<r.end(); i++) - vec[i] = in[i]; - }); - - /* sort the data */ - std::vector<T> temp(in.size()); - radix_sort<T>(vec.data(),temp.data(),vec.size()); - } - - /*! tests if some element is in the set */ - __forceinline bool lookup(const T& elt) const { - return std::binary_search(vec.begin(), vec.end(), elt); - } - - /*! 
clears all state */ - void clear() { - vec.clear(); - } - - private: - std::vector<T> vec; //!< vector containing sorted elements - }; -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp deleted file mode 100644 index 5e7ec79ac1..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_sort.h" -#include "../sys/regression.h" - -namespace embree -{ - template<typename Key> - struct RadixSortRegressionTest : public RegressionTest - { - RadixSortRegressionTest(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - const size_t M = 10; - - for (size_t N=10; N<1000000; N=size_t(2.1*N)) - { - std::vector<Key> src(N); memset(src.data(),0,N*sizeof(Key)); - std::vector<Key> tmp(N); memset(tmp.data(),0,N*sizeof(Key)); - for (size_t i=0; i<N; i++) src[i] = uint64_t(rand())*uint64_t(rand()); - - /* calculate checksum */ - Key sum0 = 0; for (size_t i=0; i<N; i++) sum0 += src[i]; - - /* sort numbers */ - for (size_t i=0; i<M; i++) { - radix_sort<Key>(src.data(),tmp.data(),N); - } - - /* calculate checksum */ - Key sum1 = 0; for (size_t i=0; i<N; i++) sum1 += src[i]; - if (sum0 != sum1) passed = false; - - /* check if numbers are sorted */ - for (size_t i=1; i<N; i++) - passed &= src[i-1] <= src[i]; - } - - return passed; - } - }; - - RadixSortRegressionTest<uint32_t> test_u32("RadixSortRegressionTestU32"); - RadixSortRegressionTest<uint64_t> test_u64("RadixSortRegressionTestU64"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h deleted file mode 100644 index a758227c1b..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h +++ /dev/null @@ -1,457 +0,0 @@ -// 
Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../simd/simd.h" -#include "parallel_for.h" -#if defined(TASKING_GCD) && defined(BUILD_IOS) -#include "../sys/alloc.h" -#endif -#include <algorithm> - -namespace embree -{ - template<class T> - __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length) - { - for(size_t i = 1;i<length;++i) - { - T v = array[i]; - size_t j = i; - while(j > 0 && v < array[j-1]) - { - array[j] = array[j-1]; - --j; - } - array[j] = v; - } - } - - template<class T> - __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length) - { - for(size_t i = 1;i<length;++i) - { - T v = array[i]; - size_t j = i; - while(j > 0 && v > array[j-1]) - { - array[j] = array[j-1]; - --j; - } - array[j] = v; - } - } - - template<class T> - void quicksort_ascending(T *__restrict__ t, - const ssize_t begin, - const ssize_t end) - { - if (likely(begin < end)) - { - const T pivotvalue = t[begin]; - ssize_t left = begin - 1; - ssize_t right = end + 1; - - while(1) - { - while (t[--right] > pivotvalue); - while (t[++left] < pivotvalue); - - if (left >= right) break; - - const T temp = t[right]; - t[right] = t[left]; - t[left] = temp; - } - - const int pivot = right; - quicksort_ascending(t, begin, pivot); - quicksort_ascending(t, pivot + 1, end); - } - } - - template<class T> - void quicksort_decending(T *__restrict__ t, - const ssize_t begin, - const ssize_t end) - { - if (likely(begin < end)) - { - const T pivotvalue = t[begin]; - ssize_t left = begin - 1; - ssize_t right = end + 1; - - while(1) - { - while (t[--right] < pivotvalue); - while (t[++left] > pivotvalue); - - if (left >= right) break; - - const T temp = t[right]; - t[right] = t[left]; - t[left] = temp; - } - - const int pivot = right; - quicksort_decending(t, begin, pivot); - quicksort_decending(t, pivot + 1, end); - } - } - - - template<class T, ssize_t THRESHOLD> - void 
quicksort_insertionsort_ascending(T *__restrict__ t, - const ssize_t begin, - const ssize_t end) - { - if (likely(begin < end)) - { - const ssize_t size = end-begin+1; - if (likely(size <= THRESHOLD)) - { - insertionsort_ascending<T>(&t[begin],size); - } - else - { - const T pivotvalue = t[begin]; - ssize_t left = begin - 1; - ssize_t right = end + 1; - - while(1) - { - while (t[--right] > pivotvalue); - while (t[++left] < pivotvalue); - - if (left >= right) break; - - const T temp = t[right]; - t[right] = t[left]; - t[left] = temp; - } - - const ssize_t pivot = right; - quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot); - quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end); - } - } - } - - - template<class T, ssize_t THRESHOLD> - void quicksort_insertionsort_decending(T *__restrict__ t, - const ssize_t begin, - const ssize_t end) - { - if (likely(begin < end)) - { - const ssize_t size = end-begin+1; - if (likely(size <= THRESHOLD)) - { - insertionsort_decending<T>(&t[begin],size); - } - else - { - - const T pivotvalue = t[begin]; - ssize_t left = begin - 1; - ssize_t right = end + 1; - - while(1) - { - while (t[--right] < pivotvalue); - while (t[++left] > pivotvalue); - - if (left >= right) break; - - const T temp = t[right]; - t[right] = t[left]; - t[left] = temp; - } - - const ssize_t pivot = right; - quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, pivot); - quicksort_insertionsort_decending<T,THRESHOLD>(t, pivot + 1, end); - } - } - } - - template<typename T> - static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8) - { - static const unsigned int BITS = 8; - static const unsigned int BUCKETS = (1 << BITS); - static const unsigned int CMP_SORT_THRESHOLD = 16; - - __aligned(64) unsigned int count[BUCKETS]; - - /* clear buckets */ - for (size_t i=0;i<BUCKETS;i++) count[i] = 0; - - /* count buckets */ -#if defined(__INTEL_COMPILER) -#pragma nounroll -#endif - for (size_t i=0;i<num;i++) - 
count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++; - - /* prefix sums */ - __aligned(64) unsigned int head[BUCKETS]; - __aligned(64) unsigned int tail[BUCKETS]; - - head[0] = 0; - for (size_t i=1; i<BUCKETS; i++) - head[i] = head[i-1] + count[i-1]; - - for (size_t i=0; i<BUCKETS-1; i++) - tail[i] = head[i+1]; - - tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1]; - - assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]); - assert(tail[BUCKETS-1] == num); - - /* in-place swap */ - for (size_t i=0;i<BUCKETS;i++) - { - /* process bucket */ - while(head[i] < tail[i]) - { - T v = morton[head[i]]; - while(1) - { - const size_t b = (unsigned(v) >> shift) & (BUCKETS-1); - if (b == i) break; - std::swap(v,morton[head[b]++]); - } - assert((unsigned(v) >> shift & (BUCKETS-1)) == i); - morton[head[i]++] = v; - } - } - if (shift == 0) return; - - size_t offset = 0; - for (size_t i=0;i<BUCKETS;i++) - if (count[i]) - { - - for (size_t j=offset;j<offset+count[i]-1;j++) - assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i); - - if (unlikely(count[i] < CMP_SORT_THRESHOLD)) - insertionsort_ascending(morton + offset, count[i]); - else - radixsort32(morton + offset, count[i], shift-BITS); - - for (size_t j=offset;j<offset+count[i]-1;j++) - assert(morton[j] <= morton[j+1]); - - offset += count[i]; - } - } - - template<typename Ty, typename Key> - class ParallelRadixSort - { - static const size_t MAX_TASKS = 64; - static const size_t BITS = 8; - static const size_t BUCKETS = (1 << BITS); - typedef unsigned int TyRadixCount[BUCKETS]; - - template<typename T> - static bool compare(const T& v0, const T& v1) { - return (Key)v0 < (Key)v1; - } - - private: - ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement - ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement - - - public: - ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N) - : radixCount(nullptr), src(src), tmp(tmp), N(N) {} - - void 
sort(const size_t blockSize) - { - assert(blockSize > 0); - - /* perform single threaded sort for small N */ - if (N<=blockSize) // handles also special case of 0! - { - /* do inplace sort inside destination array */ - std::sort(src,src+N,compare<Ty>); - } - - /* perform parallel sort for large N */ - else - { - const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS)); - tbbRadixSort(numThreads); - } - } - - ~ParallelRadixSort() - { - alignedFree(radixCount); - radixCount = nullptr; - } - - private: - - void tbbRadixIteration0(const Key shift, - const Ty* __restrict const src, - Ty* __restrict const dst, - const size_t threadIndex, const size_t threadCount) - { - const size_t startID = (threadIndex+0)*N/threadCount; - const size_t endID = (threadIndex+1)*N/threadCount; - - /* mask to extract some number of bits */ - const Key mask = BUCKETS-1; - - /* count how many items go into the buckets */ - for (size_t i=0; i<BUCKETS; i++) - radixCount[threadIndex][i] = 0; - - /* iterate over src array and count buckets */ - unsigned int * __restrict const count = radixCount[threadIndex]; -#if defined(__INTEL_COMPILER) -#pragma nounroll -#endif - for (size_t i=startID; i<endID; i++) { -#if defined(__X86_64__) || defined(__aarch64__) - const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask; -#else - const Key index = ((Key)src[i] >> shift) & mask; -#endif - count[index]++; - } - } - - void tbbRadixIteration1(const Key shift, - const Ty* __restrict const src, - Ty* __restrict const dst, - const size_t threadIndex, const size_t threadCount) - { - const size_t startID = (threadIndex+0)*N/threadCount; - const size_t endID = (threadIndex+1)*N/threadCount; - - /* mask to extract some number of bits */ - const Key mask = BUCKETS-1; - - /* calculate total number of items for each bucket */ - __aligned(64) unsigned int total[BUCKETS]; - /* - for (size_t i=0; i<BUCKETS; i++) - total[i] = 0; - */ - for (size_t i=0; 
i<BUCKETS; i+=VSIZEX) - vintx::store(&total[i], zero); - - for (size_t i=0; i<threadCount; i++) - { - /* - for (size_t j=0; j<BUCKETS; j++) - total[j] += radixCount[i][j]; - */ - for (size_t j=0; j<BUCKETS; j+=VSIZEX) - vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j])); - } - - /* calculate start offset of each bucket */ - __aligned(64) unsigned int offset[BUCKETS]; - offset[0] = 0; - for (size_t i=1; i<BUCKETS; i++) - offset[i] = offset[i-1] + total[i-1]; - - /* calculate start offset of each bucket for this thread */ - for (size_t i=0; i<threadIndex; i++) - { - /* - for (size_t j=0; j<BUCKETS; j++) - offset[j] += radixCount[i][j]; - */ - for (size_t j=0; j<BUCKETS; j+=VSIZEX) - vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j])); - } - - /* copy items into their buckets */ -#if defined(__INTEL_COMPILER) -#pragma nounroll -#endif - for (size_t i=startID; i<endID; i++) { - const Ty elt = src[i]; -#if defined(__X86_64__) || defined(__aarch64__) - const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask; -#else - const size_t index = ((Key)src[i] >> shift) & mask; -#endif - dst[offset[index]++] = elt; - } - } - - void tbbRadixIteration(const Key shift, const bool last, - const Ty* __restrict src, Ty* __restrict dst, - const size_t numTasks) - { - affinity_partitioner ap; - parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap); - parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap); - } - - void tbbRadixSort(const size_t numTasks) - { - radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64); - - if (sizeof(Key) == sizeof(uint32_t)) { - tbbRadixIteration(0*BITS,0,src,tmp,numTasks); - tbbRadixIteration(1*BITS,0,tmp,src,numTasks); - tbbRadixIteration(2*BITS,0,src,tmp,numTasks); - tbbRadixIteration(3*BITS,1,tmp,src,numTasks); - } - else if 
(sizeof(Key) == sizeof(uint64_t)) - { - tbbRadixIteration(0*BITS,0,src,tmp,numTasks); - tbbRadixIteration(1*BITS,0,tmp,src,numTasks); - tbbRadixIteration(2*BITS,0,src,tmp,numTasks); - tbbRadixIteration(3*BITS,0,tmp,src,numTasks); - tbbRadixIteration(4*BITS,0,src,tmp,numTasks); - tbbRadixIteration(5*BITS,0,tmp,src,numTasks); - tbbRadixIteration(6*BITS,0,src,tmp,numTasks); - tbbRadixIteration(7*BITS,1,tmp,src,numTasks); - } - } - - private: - TyRadixCount* radixCount; - Ty* const src; - Ty* const tmp; - const size_t N; - }; - - template<typename Ty> - void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) - { - ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize); - } - - template<typename Ty, typename Key> - void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) - { - ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize); - } - - template<typename Ty> - void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { - radix_sort<Ty,uint32_t>(src,tmp,N,blockSize); - } - - template<typename Ty> - void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { - radix_sort<Ty,uint64_t>(src,tmp,N,blockSize); - } -} diff --git a/thirdparty/embree-aarch64/common/lexers/parsestream.h b/thirdparty/embree-aarch64/common/lexers/parsestream.h deleted file mode 100644 index db46dc114f..0000000000 --- a/thirdparty/embree-aarch64/common/lexers/parsestream.h +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "stringstream.h" -#include "../sys/filename.h" -#include "../math/vec2.h" -#include "../math/vec3.h" -#include "../math/col3.h" -#include "../math/color.h" - -namespace embree -{ - /*! 
helper class for simple command line parsing */ - class ParseStream : public Stream<std::string> - { - public: - ParseStream (const Ref<Stream<std::string> >& cin) : cin(cin) {} - - ParseStream (const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ", - const std::string& endl = "", bool multiLine = false) - : cin(new StringStream(cin,seps,endl,multiLine)) {} - - public: - ParseLocation location() { return cin->loc(); } - std::string next() { return cin->get(); } - - void force(const std::string& next) { - std::string token = getString(); - if (token != next) - THROW_RUNTIME_ERROR("token \""+next+"\" expected but token \""+token+"\" found"); - } - - std::string getString() { - return get(); - } - - FileName getFileName() { - return FileName(get()); - } - - int getInt () { - return atoi(get().c_str()); - } - - Vec2i getVec2i() { - int x = atoi(get().c_str()); - int y = atoi(get().c_str()); - return Vec2i(x,y); - } - - Vec3ia getVec3ia() { - int x = atoi(get().c_str()); - int y = atoi(get().c_str()); - int z = atoi(get().c_str()); - return Vec3ia(x,y,z); - } - - float getFloat() { - return (float)atof(get().c_str()); - } - - Vec2f getVec2f() { - float x = (float)atof(get().c_str()); - float y = (float)atof(get().c_str()); - return Vec2f(x,y); - } - - Vec3f getVec3f() { - float x = (float)atof(get().c_str()); - float y = (float)atof(get().c_str()); - float z = (float)atof(get().c_str()); - return Vec3f(x,y,z); - } - - Vec3fa getVec3fa() { - float x = (float)atof(get().c_str()); - float y = (float)atof(get().c_str()); - float z = (float)atof(get().c_str()); - return Vec3fa(x,y,z); - } - - Col3f getCol3f() { - float x = (float)atof(get().c_str()); - float y = (float)atof(get().c_str()); - float z = (float)atof(get().c_str()); - return Col3f(x,y,z); - } - - Color getColor() { - float r = (float)atof(get().c_str()); - float g = (float)atof(get().c_str()); - float b = (float)atof(get().c_str()); - return Color(r,g,b); - } - - private: - Ref<Stream<std::string> > 
cin; - }; -} diff --git a/thirdparty/embree-aarch64/common/lexers/stream.h b/thirdparty/embree-aarch64/common/lexers/stream.h deleted file mode 100644 index 3f75677e68..0000000000 --- a/thirdparty/embree-aarch64/common/lexers/stream.h +++ /dev/null @@ -1,215 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" -#include "../sys/ref.h" -#include "../sys/filename.h" -#include "../sys/string.h" - -#include <vector> -#include <iostream> -#include <cstdio> -#include <string.h> - -namespace embree -{ - /*! stores the location of a stream element in the source */ - class ParseLocation - { - public: - ParseLocation () : lineNumber(-1), colNumber(-1) {} - ParseLocation (std::shared_ptr<std::string> fileName, ssize_t lineNumber, ssize_t colNumber, ssize_t /*charNumber*/) - : fileName(fileName), lineNumber(lineNumber), colNumber(colNumber) {} - - std::string str() const - { - std::string str = "unknown"; - if (fileName) str = *fileName; - if (lineNumber >= 0) str += " line " + toString(lineNumber); - if (lineNumber >= 0 && colNumber >= 0) str += " character " + toString(colNumber); - return str; - } - - private: - std::shared_ptr<std::string> fileName; /// name of the file (or stream) the token is from - ssize_t lineNumber; /// the line number the token is from - ssize_t colNumber; /// the character number in the current line - }; - - /*! 
a stream class templated over the stream elements */ - template<typename T> class Stream : public RefCount - { - enum { BUF_SIZE = 1024 }; - - private: - virtual T next() = 0; - virtual ParseLocation location() = 0; - __forceinline std::pair<T,ParseLocation> nextHelper() { - ParseLocation l = location(); - T v = next(); - return std::pair<T,ParseLocation>(v,l); - } - __forceinline void push_back(const std::pair<T,ParseLocation>& v) { - if (past+future == BUF_SIZE) pop_front(); - size_t end = (start+past+future++)%BUF_SIZE; - buffer[end] = v; - } - __forceinline void pop_front() { - if (past == 0) THROW_RUNTIME_ERROR("stream buffer empty"); - start = (start+1)%BUF_SIZE; past--; - } - public: - Stream () : start(0), past(0), future(0), buffer(BUF_SIZE) {} - virtual ~Stream() {} - - public: - - const ParseLocation& loc() { - if (future == 0) push_back(nextHelper()); - return buffer[(start+past)%BUF_SIZE].second; - } - T get() { - if (future == 0) push_back(nextHelper()); - T t = buffer[(start+past)%BUF_SIZE].first; - past++; future--; - return t; - } - const T& peek() { - if (future == 0) push_back(nextHelper()); - return buffer[(start+past)%BUF_SIZE].first; - } - const T& unget(size_t n = 1) { - if (past < n) THROW_RUNTIME_ERROR ("cannot unget that many items"); - past -= n; future += n; - return peek(); - } - void drop() { - if (future == 0) push_back(nextHelper()); - past++; future--; - } - private: - size_t start,past,future; - std::vector<std::pair<T,ParseLocation> > buffer; - }; - - /*! 
warps an iostream stream */ - class StdStream : public Stream<int> - { - public: - StdStream (std::istream& cin, const std::string& name = "std::stream") - : cin(cin), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {} - ~StdStream() {} - ParseLocation location() { - return ParseLocation(name,lineNumber,colNumber,charNumber); - } - int next() { - int c = cin.get(); - if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; - charNumber++; - return c; - } - private: - std::istream& cin; - ssize_t lineNumber; /// the line number the token is from - ssize_t colNumber; /// the character number in the current line - ssize_t charNumber; /// the character in the file - std::shared_ptr<std::string> name; /// name of buffer - }; - - /*! creates a stream from a file */ - class FileStream : public Stream<int> - { - public: - - FileStream (FILE* file, const std::string& name = "file") - : file(file), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {} - - FileStream (const FileName& fileName) - : lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(fileName.str()))) - { - file = fopen(fileName.c_str(),"r"); - if (file == nullptr) THROW_RUNTIME_ERROR("cannot open file " + fileName.str()); - } - ~FileStream() { if (file) fclose(file); } - - public: - ParseLocation location() { - return ParseLocation(name,lineNumber,colNumber,charNumber); - } - - int next() { - int c = fgetc(file); - if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; - charNumber++; - return c; - } - - private: - FILE* file; - ssize_t lineNumber; /// the line number the token is from - ssize_t colNumber; /// the character number in the current line - ssize_t charNumber; /// the character in the file - std::shared_ptr<std::string> name; /// name of buffer - }; - - /*! 
creates a stream from a string */ - class StrStream : public Stream<int> - { - public: - - StrStream (const char* str) - : str(str), lineNumber(1), colNumber(0), charNumber(0) {} - - public: - ParseLocation location() { - return ParseLocation(std::shared_ptr<std::string>(),lineNumber,colNumber,charNumber); - } - - int next() { - int c = str[charNumber]; - if (c == 0) return EOF; - if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; - charNumber++; - return c; - } - - private: - const char* str; - ssize_t lineNumber; /// the line number the token is from - ssize_t colNumber; /// the character number in the current line - ssize_t charNumber; /// the character in the file - }; - - /*! creates a character stream from a command line */ - class CommandLineStream : public Stream<int> - { - public: - CommandLineStream (int argc, char** argv, const std::string& name = "command line") - : i(0), j(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) - { - if (argc > 0) { - for (size_t i=0; argv[0][i] && i<1024; i++) charNumber++; - charNumber++; - } - for (ssize_t k=1; k<argc; k++) args.push_back(argv[k]); - } - ~CommandLineStream() {} - public: - ParseLocation location() { - return ParseLocation(name,0,charNumber,charNumber); - } - int next() { - if (i == args.size()) return EOF; - if (j == args[i].size()) { i++; j=0; charNumber++; return ' '; } - charNumber++; - return args[i][j++]; - } - private: - size_t i,j; - std::vector<std::string> args; - ssize_t charNumber; /// the character in the file - std::shared_ptr<std::string> name; /// name of buffer - }; -} diff --git a/thirdparty/embree-aarch64/common/lexers/streamfilters.h b/thirdparty/embree-aarch64/common/lexers/streamfilters.h deleted file mode 100644 index 25580a77b8..0000000000 --- a/thirdparty/embree-aarch64/common/lexers/streamfilters.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once 
- -#include "stream.h" - -namespace embree -{ - /* removes all line comments from a stream */ - class LineCommentFilter : public Stream<int> - { - public: - LineCommentFilter (const FileName& fileName, const std::string& lineComment) - : cin(new FileStream(fileName)), lineComment(lineComment) {} - LineCommentFilter (Ref<Stream<int> > cin, const std::string& lineComment) - : cin(cin), lineComment(lineComment) {} - - ParseLocation location() { return cin->loc(); } - - int next() - { - /* look if the line comment starts here */ - for (size_t j=0; j<lineComment.size(); j++) { - if (cin->peek() != lineComment[j]) { cin->unget(j); goto not_found; } - cin->get(); - } - /* eat all characters until the end of the line (or file) */ - while (cin->peek() != '\n' && cin->peek() != EOF) cin->get(); - - not_found: - return cin->get(); - } - - private: - Ref<Stream<int> > cin; - std::string lineComment; - }; -} diff --git a/thirdparty/embree-aarch64/common/lexers/stringstream.cpp b/thirdparty/embree-aarch64/common/lexers/stringstream.cpp deleted file mode 100644 index 98dc80ad59..0000000000 --- a/thirdparty/embree-aarch64/common/lexers/stringstream.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "stringstream.h" - -namespace embree -{ - static const std::string stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\"; - - /* creates map for fast categorization of characters */ - static void createCharMap(bool map[256], const std::string& chrs) { - for (size_t i=0; i<256; i++) map[i] = false; - for (size_t i=0; i<chrs.size(); i++) map[uint8_t(chrs[i])] = true; - } - - /* simple tokenizer */ - StringStream::StringStream(const Ref<Stream<int> >& cin, const std::string& seps, const std::string& endl, bool multiLine) - : cin(cin), endl(endl), multiLine(multiLine) - { - createCharMap(isSepMap,seps); - createCharMap(isValidCharMap,stringChars); - } - - std::string 
StringStream::next() - { - /* skip separators */ - while (cin->peek() != EOF) { - if (endl != "" && cin->peek() == '\n') { cin->drop(); return endl; } - if (multiLine && cin->peek() == '\\') { - cin->drop(); - if (cin->peek() == '\n') { cin->drop(); continue; } - cin->unget(); - } - if (!isSeparator(cin->peek())) break; - cin->drop(); - } - - /* parse everything until the next separator */ - std::vector<char> str; str.reserve(64); - while (cin->peek() != EOF && !isSeparator(cin->peek())) { - int c = cin->get(); - // -- GODOT start -- - // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input"); - if (!isValidChar(c)) abort(); - // -- GODOT end -- - str.push_back((char)c); - } - str.push_back(0); - return std::string(str.data()); - } -} diff --git a/thirdparty/embree-aarch64/common/lexers/stringstream.h b/thirdparty/embree-aarch64/common/lexers/stringstream.h deleted file mode 100644 index e6dbd4aecc..0000000000 --- a/thirdparty/embree-aarch64/common/lexers/stringstream.h +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "stream.h" - -namespace embree -{ - /*! simple tokenizer that produces a string stream */ - class StringStream : public Stream<std::string> - { - public: - StringStream(const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ", - const std::string& endl = "", bool multiLine = false); - public: - ParseLocation location() { return cin->loc(); } - std::string next(); - private: - __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; } - __forceinline bool isValidChar(unsigned int c) const { return c<256 && isValidCharMap[c]; } - private: - Ref<Stream<int> > cin; /*! source character stream */ - bool isSepMap[256]; /*! map for fast classification of separators */ - bool isValidCharMap[256]; /*! map for valid characters */ - std::string endl; /*! 
the token of the end of line */ - bool multiLine; /*! whether to parse lines wrapped with \ */ - }; -} diff --git a/thirdparty/embree-aarch64/common/lexers/tokenstream.cpp b/thirdparty/embree-aarch64/common/lexers/tokenstream.cpp deleted file mode 100644 index d05be65862..0000000000 --- a/thirdparty/embree-aarch64/common/lexers/tokenstream.cpp +++ /dev/null @@ -1,181 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "tokenstream.h" -#include "../math/math.h" - -namespace embree -{ - /* shorthands for common sets of characters */ - const std::string TokenStream::alpha = "abcdefghijklmnopqrstuvwxyz"; - const std::string TokenStream::ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; - const std::string TokenStream::numbers = "0123456789"; - const std::string TokenStream::separators = "\n\t\r "; - const std::string TokenStream::stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\"; - - /* creates map for fast categorization of characters */ - static void createCharMap(bool map[256], const std::string& chrs) { - for (size_t i=0; i<256; i++) map[i] = false; - for (size_t i=0; i<chrs.size(); i++) map[uint8_t(chrs[i])] = true; - } - - /* build full tokenizer that takes list of valid characters and keywords */ - TokenStream::TokenStream(const Ref<Stream<int> >& cin, //< stream to read from - const std::string& alpha, //< valid characters for identifiers - const std::string& seps, //< characters that act as separators - const std::vector<std::string>& symbols) //< symbols - : cin(cin), symbols(symbols) - { - createCharMap(isAlphaMap,alpha); - createCharMap(isSepMap,seps); - createCharMap(isStringCharMap,stringChars); - } - - bool TokenStream::decDigits(std::string& str_o) - { - bool ok = false; - std::string str; - if (cin->peek() == '+' || cin->peek() == '-') str += (char)cin->get(); - while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); } - if (ok) str_o += str; - else 
cin->unget(str.size()); - return ok; - } - - bool TokenStream::decDigits1(std::string& str_o) - { - bool ok = false; - std::string str; - while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); } - if (ok) str_o += str; else cin->unget(str.size()); - return ok; - } - - bool TokenStream::trySymbol(const std::string& symbol) - { - size_t pos = 0; - while (pos < symbol.size()) { - if (symbol[pos] != cin->peek()) { cin->unget(pos); return false; } - cin->drop(); pos++; - } - return true; - } - - bool TokenStream::trySymbols(Token& token, const ParseLocation& loc) - { - for (size_t i=0; i<symbols.size(); i++) { - if (!trySymbol(symbols[i])) continue; - token = Token(symbols[i],Token::TY_SYMBOL,loc); - return true; - } - return false; - } - - bool TokenStream::tryFloat(Token& token, const ParseLocation& loc) - { - bool ok = false; - std::string str; - if (trySymbol("nan")) { - token = Token(float(nan)); - return true; - } - if (trySymbol("+inf")) { - token = Token(float(pos_inf)); - return true; - } - if (trySymbol("-inf")) { - token = Token(float(neg_inf)); - return true; - } - - if (decDigits(str)) - { - if (cin->peek() == '.') { - str += (char)cin->get(); - decDigits(str); - if (cin->peek() == 'e' || cin->peek() == 'E') { - str += (char)cin->get(); - if (decDigits(str)) ok = true; // 1.[2]E2 - } - else ok = true; // 1.[2] - } - else if (cin->peek() == 'e' || cin->peek() == 'E') { - str += (char)cin->get(); - if (decDigits(str)) ok = true; // 1E2 - } - } - else - { - if (cin->peek() == '.') { - str += (char)cin->get(); - if (decDigits(str)) { - if (cin->peek() == 'e' || cin->peek() == 'E') { - str += (char)cin->get(); - if (decDigits(str)) ok = true; // .3E2 - } - else ok = true; // .3 - } - } - } - if (ok) { - token = Token((float)atof(str.c_str()),loc); - } - else cin->unget(str.size()); - return ok; - } - - bool TokenStream::tryInt(Token& token, const ParseLocation& loc) { - std::string str; - if (decDigits(str)) { - token = Token(atoi(str.c_str()),loc); 
- return true; - } - return false; - } - - bool TokenStream::tryString(Token& token, const ParseLocation& loc) - { - std::string str; - if (cin->peek() != '\"') return false; - cin->drop(); - while (cin->peek() != '\"') { - const int c = cin->get(); - if (!isStringChar(c)) THROW_RUNTIME_ERROR("invalid string character "+std::string(1,c)+" at "+loc.str()); - str += (char)c; - } - cin->drop(); - token = Token(str,Token::TY_STRING,loc); - return true; - } - - bool TokenStream::tryIdentifier(Token& token, const ParseLocation& loc) - { - std::string str; - if (!isAlpha(cin->peek())) return false; - str += (char)cin->get(); - while (isAlphaNum(cin->peek())) str += (char)cin->get(); - token = Token(str,Token::TY_IDENTIFIER,loc); - return true; - } - - void TokenStream::skipSeparators() - { - /* skip separators */ - while (cin->peek() != EOF && isSeparator(cin->peek())) - cin->drop(); - } - - Token TokenStream::next() - { - Token token; - skipSeparators(); - ParseLocation loc = cin->loc(); - if (trySymbols (token,loc)) return token; /**< try to parse a symbol */ - if (tryFloat (token,loc)) return token; /**< try to parse float */ - if (tryInt (token,loc)) return token; /**< try to parse integer */ - if (tryString (token,loc)) return token; /**< try to parse string */ - if (tryIdentifier(token,loc)) return token; /**< try to parse identifier */ - if (cin->peek() == EOF ) return Token(loc); /**< return EOF token */ - return Token((char)cin->get(),loc); /**< return invalid character token */ - } -} diff --git a/thirdparty/embree-aarch64/common/lexers/tokenstream.h b/thirdparty/embree-aarch64/common/lexers/tokenstream.h deleted file mode 100644 index 72a7b4f2f3..0000000000 --- a/thirdparty/embree-aarch64/common/lexers/tokenstream.h +++ /dev/null @@ -1,164 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "stream.h" -#include <string> -#include <vector> - -namespace embree -{ - /*! 
token class */ - class Token - { - public: - - enum Type { TY_EOF, TY_CHAR, TY_INT, TY_FLOAT, TY_IDENTIFIER, TY_STRING, TY_SYMBOL }; - - Token ( const ParseLocation& loc = ParseLocation()) : ty(TY_EOF ), loc(loc) {} - Token (char c, const ParseLocation& loc = ParseLocation()) : ty(TY_CHAR ), c(c), loc(loc) {} - Token (int i, const ParseLocation& loc = ParseLocation()) : ty(TY_INT ), i(i), loc(loc) {} - Token (float f,const ParseLocation& loc = ParseLocation()) : ty(TY_FLOAT), f(f), loc(loc) {} - Token (std::string str, Type ty, const ParseLocation& loc = ParseLocation()) : ty(ty), str(str), loc(loc) {} - - static Token Eof() { return Token(); } - static Token Sym(std::string str) { return Token(str,TY_SYMBOL); } - static Token Str(std::string str) { return Token(str,TY_STRING); } - static Token Id (std::string str) { return Token(str,TY_IDENTIFIER); } - - char Char() const { - if (ty == TY_CHAR) return c; - THROW_RUNTIME_ERROR(loc.str()+": character expected"); - } - - int Int() const { - if (ty == TY_INT) return i; - THROW_RUNTIME_ERROR(loc.str()+": integer expected"); - } - - float Float(bool cast = true) const { - if (ty == TY_FLOAT) return f; - if (ty == TY_INT && cast) return (float)i; - THROW_RUNTIME_ERROR(loc.str()+": float expected"); - } - - std::string Identifier() const { - if (ty == TY_IDENTIFIER) return str; - THROW_RUNTIME_ERROR(loc.str()+": identifier expected"); - } - - std::string String() const { - if (ty == TY_STRING) return str; - THROW_RUNTIME_ERROR(loc.str()+": string expected"); - } - - std::string Symbol() const { - if (ty == TY_SYMBOL) return str; - THROW_RUNTIME_ERROR(loc.str()+": symbol expected"); - } - - const ParseLocation& Location() const { return loc; } - - friend bool operator==(const Token& a, const Token& b) - { - if (a.ty != b.ty) return false; - if (a.ty == TY_CHAR) return a.c == b.c; - if (a.ty == TY_INT) return a.i == b.i; - if (a.ty == TY_FLOAT) return a.f == b.f; - if (a.ty == TY_IDENTIFIER) return a.str == b.str; - if 
(a.ty == TY_STRING) return a.str == b.str; - if (a.ty == TY_SYMBOL) return a.str == b.str; - return true; - } - - friend bool operator!=(const Token& a, const Token& b) { - return !(a == b); - } - - friend bool operator <( const Token& a, const Token& b ) { - if (a.ty != b.ty) return (int)a.ty < (int)b.ty; - if (a.ty == TY_CHAR) return a.c < b.c; - if (a.ty == TY_INT) return a.i < b.i; - if (a.ty == TY_FLOAT) return a.f < b.f; - if (a.ty == TY_IDENTIFIER) return a.str < b.str; - if (a.ty == TY_STRING) return a.str < b.str; - if (a.ty == TY_SYMBOL) return a.str < b.str; - return false; - } - - friend std::ostream& operator<<(std::ostream& cout, const Token& t) - { - if (t.ty == TY_EOF) return cout << "eof"; - if (t.ty == TY_CHAR) return cout << "Char(" << t.c << ")"; - if (t.ty == TY_INT) return cout << "Int(" << t.i << ")"; - if (t.ty == TY_FLOAT) return cout << "Float(" << t.f << ")"; - if (t.ty == TY_IDENTIFIER) return cout << "Id(" << t.str << ")"; - if (t.ty == TY_STRING) return cout << "String(" << t.str << ")"; - if (t.ty == TY_SYMBOL) return cout << "Symbol(" << t.str << ")"; - return cout << "unknown"; - } - - private: - Type ty; //< the type of the token - union { - char c; //< data for char tokens - int i; //< data for int tokens - float f; //< data for float tokens - }; - std::string str; //< data for string and identifier tokens - ParseLocation loc; //< the location the token is from - }; - - /*! build full tokenizer that takes list of valid characters and keywords */ - class TokenStream : public Stream<Token> - { - public: - - /*! 
shorthands for common sets of characters */ - static const std::string alpha; - static const std::string ALPHA; - static const std::string numbers; - static const std::string separators; - static const std::string stringChars; - - public: - TokenStream(const Ref<Stream<int> >& cin, - const std::string& alpha, //< valid characters for identifiers - const std::string& seps, //< characters that act as separators - const std::vector<std::string>& symbols = std::vector<std::string>()); //< symbols - public: - ParseLocation location() { return cin->loc(); } - Token next(); - bool trySymbol(const std::string& symbol); - - private: - void skipSeparators(); - bool decDigits(std::string& str); - bool decDigits1(std::string& str); - bool trySymbols(Token& token, const ParseLocation& loc); - bool tryFloat(Token& token, const ParseLocation& loc); - bool tryInt(Token& token, const ParseLocation& loc); - bool tryString(Token& token, const ParseLocation& loc); - bool tryIdentifier(Token& token, const ParseLocation& loc); - - Ref<Stream<int> > cin; - bool isSepMap[256]; - bool isAlphaMap[256]; - bool isStringCharMap[256]; - std::vector<std::string> symbols; - - /*! checks if a character is a separator */ - __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; } - - /*! checks if a character is a number */ - __forceinline bool isDigit(unsigned int c) const { return c >= '0' && c <= '9'; } - - /*! checks if a character is valid inside a string */ - __forceinline bool isStringChar(unsigned int c) const { return c<256 && isStringCharMap[c]; } - - /*! 
checks if a character is legal for an identifier */ - __forceinline bool isAlpha(unsigned int c) const { return c<256 && isAlphaMap[c]; } - __forceinline bool isAlphaNum(unsigned int c) const { return isAlpha(c) || isDigit(c); } - }; -} diff --git a/thirdparty/embree-aarch64/common/math/AVX2NEON.h b/thirdparty/embree-aarch64/common/math/AVX2NEON.h deleted file mode 100644 index e8698ac56d..0000000000 --- a/thirdparty/embree-aarch64/common/math/AVX2NEON.h +++ /dev/null @@ -1,986 +0,0 @@ -#pragma once - -#include "SSE2NEON.h" - - -#define AVX2NEON_ABI static inline __attribute__((always_inline)) - - -struct __m256d; - -struct __m256 { - __m128 lo,hi; - __m256() {} -}; - - - - -struct __m256i { - __m128i lo,hi; - explicit __m256i(const __m256 a) : lo(__m128i(a.lo)),hi(__m128i(a.hi)) {} - operator __m256() const {__m256 res; res.lo = __m128(lo);res.hi = __m128(hi); return res;} - __m256i() {} -}; - - - - -struct __m256d { - float64x2_t lo,hi; - __m256d() {} - __m256d(const __m256& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {} - __m256d(const __m256i& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {} -}; - -#define UNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a) {type res;res.lo=basic_func(a.lo);res.hi=basic_func(a.hi);return res;} - - -#define BINARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=basic_func(a.lo,b.lo);res.hi=basic_func(a.hi,b.hi);return res;} -#define BINARY_AVX_OP_CAST(type,func,basic_func,bdst,bsrc) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=bdst(basic_func(bsrc(a.lo),bsrc(b.lo)));res.hi=bdst(basic_func(bsrc(a.hi),bsrc(b.hi)));return res;} - -#define TERNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b,const type& c) {type res;res.lo=basic_func(a.lo,b.lo,c.lo);res.hi=basic_func(a.hi,b.hi,c.hi);return res;} - - -#define CAST_SIMD_TYPE(to,name,from,basic_dst) AVX2NEON_ABI to name(const from& a) { to res; 
res.lo = basic_dst(a.lo); res.hi=basic_dst(a.hi); return res;} - - - -#define _mm_stream_load_si128 _mm_load_si128 -#define _mm256_stream_load_si256 _mm256_load_si256 - - -AVX2NEON_ABI -__m128 _mm_blend_ps (__m128 a, __m128 b, const int imm8) -{ - __m128 res; - for (int i=0;i<4;i++) - { - if (imm8 & (1<<i)) - { - res[i] = b[i]; - } - else{ - res[i] = a[i]; - } - } - - return res; -} - -AVX2NEON_ABI -__m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8) -{ - __m128i res; - for (int i=0;i<4;i++) - { - if (imm8 & (1<<i)) - { - res[i] = b[i]; - } - else{ - res[i] = a[i]; - } - } - return res; -} - -AVX2NEON_ABI -__m128 _mm_cmpngt_ps (__m128 a, __m128 b) -{ - return __m128(vmvnq_s32(__m128i(_mm_cmpgt_ps(a,b)))); -} - - -AVX2NEON_ABI -__m128i _mm_loadl_epi64 (__m128i const* mem_addr) -{ - int64x2_t y; - y[0] = *(int64_t *)mem_addr; - y[1] = 0; - return __m128i(y); -} - -AVX2NEON_ABI -int _mm_movemask_popcnt(__m128 a) -{ - return __builtin_popcount(_mm_movemask_ps(a)); -} - -AVX2NEON_ABI -__m128 _mm_maskload_ps (float const * mem_addr, __m128i mask) -{ - __m128 res; - for (int i=0;i<4;i++) { - if (mask[i] & 0x80000000) res[i] = mem_addr[i]; else res[i] = 0; - } - return res; -} - -AVX2NEON_ABI -void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a) -{ - for (int i=0;i<4;i++) { - if (mask[i] & 0x80000000) mem_addr[i] = a[i]; - } -} - -AVX2NEON_ABI -void _mm_maskstore_epi32 (int * mem_addr, __m128i mask, __m128i a) -{ - for (int i=0;i<4;i++) { - if (mask[i] & 0x80000000) mem_addr[i] = a[i]; - } -} - -AVX2NEON_ABI -__m128 _mm_fnmsub_ps (__m128 a, __m128 b, __m128 c) -{ - return vnegq_f32(vfmaq_f32(c,a,b)); -} - -#define _mm_fnmsub_ss _mm_fnmsub_ps - -AVX2NEON_ABI -__m128 _mm_fnmadd_ps (__m128 a, __m128 b, __m128 c) -{ - return vfmsq_f32(c,a,b); -} - -#define _mm_fnmadd_ss _mm_fnmadd_ps - - -AVX2NEON_ABI -__m128 _mm_broadcast_ss (float const * mem_addr) -{ - return vdupq_n_f32(*mem_addr); -} - - -AVX2NEON_ABI -__m128 _mm_fmsub_ps (__m128 a, __m128 b, __m128 
c) -{ - return vfmaq_f32(vnegq_f32(c),a,b); -} - -#define _mm_fmsub_ss _mm_fmsub_ps -#define _mm_fmadd_ps _mm_madd_ps -#define _mm_fmadd_ss _mm_madd_ps - - - -template<int code> -AVX2NEON_ABI float32x4_t dpps_neon(const float32x4_t& a,const float32x4_t& b) -{ - float v; - v = 0; - v += (code & 0x10) ? a[0]*b[0] : 0; - v += (code & 0x20) ? a[1]*b[1] : 0; - v += (code & 0x40) ? a[2]*b[2] : 0; - v += (code & 0x80) ? a[3]*b[3] : 0; - float32x4_t res; - res[0] = (code & 0x1) ? v : 0; - res[1] = (code & 0x2) ? v : 0; - res[2] = (code & 0x4) ? v : 0; - res[3] = (code & 0x8) ? v : 0; - return res; -} - -template<> -inline float32x4_t dpps_neon<0x7f>(const float32x4_t& a,const float32x4_t& b) -{ - float v; - float32x4_t m = _mm_mul_ps(a,b); - m[3] = 0; - v = vaddvq_f32(m); - return _mm_set1_ps(v); -} - -template<> -inline float32x4_t dpps_neon<0xff>(const float32x4_t& a,const float32x4_t& b) -{ - float v; - float32x4_t m = _mm_mul_ps(a,b); - v = vaddvq_f32(m); - return _mm_set1_ps(v); -} - -#define _mm_dp_ps(a,b,c) dpps_neon<c>((a),(b)) - - - -AVX2NEON_ABI -__m128 _mm_cmpnge_ps (__m128 a, __m128 b) -{ - return __m128(vmvnq_s32(__m128i(_mm_cmpge_ps(a,b)))); -} - - -AVX2NEON_ABI -__m128 _mm_permutevar_ps (__m128 a, __m128i b) -{ - __m128 x; - for (int i=0;i<4;i++) - { - x[i] = a[b[i&3]]; - } - return x; -} - -AVX2NEON_ABI -__m256i _mm256_setzero_si256() -{ - __m256i res; - res.lo = res.hi = vdupq_n_s32(0); - return res; -} - -AVX2NEON_ABI -__m256 _mm256_setzero_ps() -{ - __m256 res; - res.lo = res.hi = vdupq_n_f32(0.0f); - return res; -} - -AVX2NEON_ABI -__m256i _mm256_undefined_si256() -{ - return _mm256_setzero_si256(); -} - -AVX2NEON_ABI -__m256 _mm256_undefined_ps() -{ - return _mm256_setzero_ps(); -} - -CAST_SIMD_TYPE(__m256d,_mm256_castps_pd,__m256,float64x2_t) -CAST_SIMD_TYPE(__m256i,_mm256_castps_si256,__m256,__m128i) -CAST_SIMD_TYPE(__m256, _mm256_castsi256_ps, __m256i,__m128) -CAST_SIMD_TYPE(__m256, _mm256_castpd_ps ,__m256d,__m128) -CAST_SIMD_TYPE(__m256d, 
_mm256_castsi256_pd, __m256i,float64x2_t) -CAST_SIMD_TYPE(__m256i, _mm256_castpd_si256, __m256d,__m128i) - - - - -AVX2NEON_ABI -__m128 _mm256_castps256_ps128 (__m256 a) -{ - return a.lo; -} - -AVX2NEON_ABI -__m256i _mm256_castsi128_si256 (__m128i a) -{ - __m256i res; - res.lo = a ; - res.hi = vdupq_n_s32(0); - return res; -} - -AVX2NEON_ABI -__m128i _mm256_castsi256_si128 (__m256i a) -{ - return a.lo; -} - -AVX2NEON_ABI -__m256 _mm256_castps128_ps256 (__m128 a) -{ - __m256 res; - res.lo = a; - res.hi = vdupq_n_f32(0); - return res; -} - - -AVX2NEON_ABI -__m256 _mm256_broadcast_ss (float const * mem_addr) -{ - __m256 res; - res.lo = res.hi = vdupq_n_f32(*mem_addr); - return res; -} - - - -AVX2NEON_ABI -__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) -{ - __m128i lo = {e0,e1,e2,e3}, hi = {e4,e5,e6,e7}; - __m256i res; - res.lo = lo; res.hi = hi; - return res; - -} - -AVX2NEON_ABI -__m256i _mm256_set1_epi32 (int a) -{ - __m256i res; - res.lo = res.hi = vdupq_n_s32(a); - return res; -} - - - - -AVX2NEON_ABI -int _mm256_movemask_ps(const __m256& v) -{ - return (_mm_movemask_ps(v.hi) << 4) | _mm_movemask_ps(v.lo); -} - -template<int imm8> -AVX2NEON_ABI -__m256 __mm256_permute_ps (const __m256& a) -{ - __m256 res; - res.lo = _mm_shuffle_ps(a.lo,a.lo,imm8); - res.hi = _mm_shuffle_ps(a.hi,a.hi,imm8); - return res; - -} - -#define _mm256_permute_ps(a,c) __mm256_permute_ps<c>(a) - - -template<int imm8> -AVX2NEON_ABI -__m256 __mm256_shuffle_ps (const __m256 a,const __m256& b) -{ - __m256 res; - res.lo = _mm_shuffle_ps(a.lo,b.lo,imm8); - res.hi = _mm_shuffle_ps(a.hi,b.hi,imm8); - return res; - -} - -#define _mm256_shuffle_ps(a,b,c) __mm256_shuffle_ps<c>(a,b) - -AVX2NEON_ABI -__m256i _mm256_set1_epi64x (long long a) -{ - __m256i res; - int64x2_t t = vdupq_n_s64(a); - res.lo = res.hi = __m128i(t); - return res; -} - - -AVX2NEON_ABI -__m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8) -{ - __m256 res; - __m128 tmp; - switch (imm8 & 
0x7) - { - case 0: tmp = a.lo; break; - case 1: tmp = a.hi; break; - case 2: tmp = b.lo; break; - case 3: tmp = b.hi; break; - } - if (imm8 & 0x8) - tmp = _mm_setzero_ps(); - - - - res.lo = tmp; - imm8 >>= 4; - - switch (imm8 & 0x7) - { - case 0: tmp = a.lo; break; - case 1: tmp = a.hi; break; - case 2: tmp = b.lo; break; - case 3: tmp = b.hi; break; - } - if (imm8 & 0x8) - tmp = _mm_setzero_ps(); - - res.hi = tmp; - - return res; -} - -AVX2NEON_ABI -__m256 _mm256_moveldup_ps (__m256 a) -{ - __m256 res; - res.lo[0] = res.lo[1] = a.lo[0]; - res.lo[2] = res.lo[3] = a.lo[2]; - res.hi[0] = res.hi[1] = a.hi[0]; - res.hi[2] = res.hi[3] = a.hi[2]; - return res; - -} - -AVX2NEON_ABI -__m256 _mm256_movehdup_ps (__m256 a) -{ - __m256 res; - res.lo[0] = res.lo[1] = a.lo[1]; - res.lo[2] = res.lo[3] = a.lo[3]; - res.hi[0] = res.hi[1] = a.hi[1]; - res.hi[2] = res.hi[3] = a.hi[3]; - return res; -} - -AVX2NEON_ABI -__m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8) -{ - __m256 res = a; - if (imm8 & 1) res.hi = b; - else res.lo = b; - return res; -} - - -AVX2NEON_ABI -__m128 _mm256_extractf128_ps (__m256 a, const int imm8) -{ - if (imm8 & 1) return a.hi; - return a.lo; -} - - -AVX2NEON_ABI -__m256d _mm256_movedup_pd (__m256d a) -{ - __m256d res; - res.hi = a.hi; - res.lo[0] = res.lo[1] = a.lo[0]; - return res; -} - -AVX2NEON_ABI -__m256i _mm256_abs_epi32(__m256i a) -{ - __m256i res; - res.lo = vabsq_s32(a.lo); - res.hi = vabsq_s32(a.hi); - return res; -} - -UNARY_AVX_OP(__m256,_mm256_sqrt_ps,_mm_sqrt_ps) -UNARY_AVX_OP(__m256,_mm256_rsqrt_ps,_mm_rsqrt_ps) -UNARY_AVX_OP(__m256,_mm256_rcp_ps,_mm_rcp_ps) -UNARY_AVX_OP(__m256,_mm256_floor_ps,vrndmq_f32) -UNARY_AVX_OP(__m256,_mm256_ceil_ps,vrndpq_f32) - - -BINARY_AVX_OP(__m256i,_mm256_add_epi32,_mm_add_epi32) -BINARY_AVX_OP(__m256i,_mm256_sub_epi32,_mm_sub_epi32) -BINARY_AVX_OP(__m256i,_mm256_mullo_epi32,_mm_mullo_epi32) - -BINARY_AVX_OP(__m256i,_mm256_min_epi32,_mm_min_epi32) 
-BINARY_AVX_OP(__m256i,_mm256_max_epi32,_mm_max_epi32) -BINARY_AVX_OP_CAST(__m256i,_mm256_min_epu32,vminq_u32,__m128i,uint32x4_t) -BINARY_AVX_OP_CAST(__m256i,_mm256_max_epu32,vmaxq_u32,__m128i,uint32x4_t) - -BINARY_AVX_OP(__m256,_mm256_min_ps,_mm_min_ps) -BINARY_AVX_OP(__m256,_mm256_max_ps,_mm_max_ps) - -BINARY_AVX_OP(__m256,_mm256_add_ps,_mm_add_ps) -BINARY_AVX_OP(__m256,_mm256_mul_ps,_mm_mul_ps) -BINARY_AVX_OP(__m256,_mm256_sub_ps,_mm_sub_ps) -BINARY_AVX_OP(__m256,_mm256_div_ps,_mm_div_ps) - -BINARY_AVX_OP(__m256,_mm256_and_ps,_mm_and_ps) -BINARY_AVX_OP(__m256,_mm256_andnot_ps,_mm_andnot_ps) -BINARY_AVX_OP(__m256,_mm256_or_ps,_mm_or_ps) -BINARY_AVX_OP(__m256,_mm256_xor_ps,_mm_xor_ps) - -BINARY_AVX_OP_CAST(__m256d,_mm256_and_pd,vandq_s64,float64x2_t,int64x2_t) -BINARY_AVX_OP_CAST(__m256d,_mm256_or_pd,vorrq_s64,float64x2_t,int64x2_t) -BINARY_AVX_OP_CAST(__m256d,_mm256_xor_pd,veorq_s64,float64x2_t,int64x2_t) - - - -BINARY_AVX_OP(__m256i,_mm256_and_si256,_mm_and_si128) -BINARY_AVX_OP(__m256i,_mm256_or_si256,_mm_or_si128) -BINARY_AVX_OP(__m256i,_mm256_xor_si256,_mm_xor_si128) - - -BINARY_AVX_OP(__m256,_mm256_unpackhi_ps,_mm_unpackhi_ps) -BINARY_AVX_OP(__m256,_mm256_unpacklo_ps,_mm_unpacklo_ps) -TERNARY_AVX_OP(__m256,_mm256_blendv_ps,_mm_blendv_ps) - - -TERNARY_AVX_OP(__m256,_mm256_fmadd_ps,_mm_fmadd_ps) -TERNARY_AVX_OP(__m256,_mm256_fnmadd_ps,_mm_fnmadd_ps) -TERNARY_AVX_OP(__m256,_mm256_fmsub_ps,_mm_fmsub_ps) -TERNARY_AVX_OP(__m256,_mm256_fnmsub_ps,_mm_fnmsub_ps) - - -BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi32,_mm_unpackhi_epi32) -BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi32,_mm_unpacklo_epi32) - - -BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi32,_mm_cmpeq_epi32) -BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi32,_mm_cmpgt_epi32) -BINARY_AVX_OP(__m256,_mm256_cmpeq_ps,_mm_cmpeq_ps) -BINARY_AVX_OP(__m256,_mm256_cmpneq_ps,_mm_cmpneq_ps) -BINARY_AVX_OP(__m256,_mm256_cmpnlt_ps,_mm_cmpnlt_ps) -BINARY_AVX_OP(__m256,_mm256_cmpngt_ps,_mm_cmpngt_ps) 
-BINARY_AVX_OP(__m256,_mm256_cmpge_ps,_mm_cmpge_ps) -BINARY_AVX_OP(__m256,_mm256_cmpnge_ps,_mm_cmpnge_ps) -BINARY_AVX_OP(__m256,_mm256_cmplt_ps,_mm_cmplt_ps) -BINARY_AVX_OP(__m256,_mm256_cmple_ps,_mm_cmple_ps) -BINARY_AVX_OP(__m256,_mm256_cmpgt_ps,_mm_cmpgt_ps) -BINARY_AVX_OP(__m256,_mm256_cmpnle_ps,_mm_cmpnle_ps) - - -AVX2NEON_ABI -__m256i _mm256_cvtps_epi32 (__m256 a) -{ - __m256i res; - res.lo = _mm_cvtps_epi32(a.lo); - res.hi = _mm_cvtps_epi32(a.hi); - return res; - -} - -AVX2NEON_ABI -__m256i _mm256_cvttps_epi32 (__m256 a) -{ - __m256i res; - res.lo = _mm_cvttps_epi32(a.lo); - res.hi = _mm_cvttps_epi32(a.hi); - return res; - -} - -AVX2NEON_ABI -__m256 _mm256_loadu_ps (float const * mem_addr) -{ - __m256 res; - res.lo = *(__m128 *)(mem_addr + 0); - res.hi = *(__m128 *)(mem_addr + 4); - return res; -} -#define _mm256_load_ps _mm256_loadu_ps - - -AVX2NEON_ABI -int _mm256_testz_ps (const __m256& a, const __m256& b) -{ - __m256 t = a; - if (&a != &b) - t = _mm256_and_ps(a,b); - - __m128i l = vshrq_n_s32(__m128i(t.lo),31); - __m128i h = vshrq_n_s32(__m128i(t.hi),31); - return vaddvq_s32(vaddq_s32(l,h)) == 0; -} - - -AVX2NEON_ABI -__m256i _mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0) -{ - __m256i res; - int64x2_t t0 = {e0,e1}; - int64x2_t t1 = {e2,e3}; - res.lo = __m128i(t0); - res.hi = __m128i(t1); - return res; -} - -AVX2NEON_ABI -__m256d _mm256_setzero_pd () -{ - __m256d res; - res.lo = res.hi = vdupq_n_f64(0); - return res; -} - -AVX2NEON_ABI -int _mm256_movemask_pd (__m256d a) -{ - int res = 0; - uint64x2_t x; - x = uint64x2_t(a.lo); - res |= (x[0] >> 63) ? 1 : 0; - res |= (x[0] >> 63) ? 2 : 0; - x = uint64x2_t(a.hi); - res |= (x[0] >> 63) ? 4 : 0; - res |= (x[0] >> 63) ? 
8 : 0; - return res; -} - -AVX2NEON_ABI -__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b) -{ - __m256i res; - res.lo = __m128i(vceqq_s64(int64x2_t(a.lo),int64x2_t(b.lo))); - res.hi = __m128i(vceqq_s64(int64x2_t(a.hi),int64x2_t(b.hi))); - return res; -} - -AVX2NEON_ABI -__m256i _mm256_cmpeq_pd (__m256d a, __m256d b) -{ - __m256i res; - res.lo = __m128i(vceqq_f64(a.lo,b.lo)); - res.hi = __m128i(vceqq_f64(a.hi,b.hi)); - return res; -} - - -AVX2NEON_ABI -int _mm256_testz_pd (const __m256d& a, const __m256d& b) -{ - __m256d t = a; - - if (&a != &b) - t = _mm256_and_pd(a,b); - - return _mm256_movemask_pd(t) == 0; -} - -AVX2NEON_ABI -__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask) -{ - __m256d res; - uint64x2_t t = uint64x2_t(mask.lo); - res.lo[0] = (t[0] >> 63) ? b.lo[0] : a.lo[0]; - res.lo[1] = (t[1] >> 63) ? b.lo[1] : a.lo[1]; - t = uint64x2_t(mask.hi); - res.hi[0] = (t[0] >> 63) ? b.hi[0] : a.hi[0]; - res.hi[1] = (t[1] >> 63) ? b.hi[1] : a.hi[1]; - return res; -} - -template<int imm8> -__m256 __mm256_dp_ps (__m256 a, __m256 b) -{ - __m256 res; - res.lo = _mm_dp_ps(a.lo,b.lo,imm8); - res.hi = _mm_dp_ps(a.hi,b.hi,imm8); - return res; -} - -#define _mm256_dp_ps(a,b,c) __mm256_dp_ps<c>(a,b) - -AVX2NEON_ABI -double _mm256_permute4x64_pd_select(__m256d a, const int imm8) -{ - switch (imm8 & 3) { - case 0: - return a.lo[0]; - case 1: - return a.lo[1]; - case 2: - return a.hi[0]; - case 3: - return a.hi[1]; - } - __builtin_unreachable(); - return 0; -} - -AVX2NEON_ABI -__m256d _mm256_permute4x64_pd (__m256d a, const int imm8) -{ - __m256d res; - res.lo[0] = _mm256_permute4x64_pd_select(a,imm8 >> 0); - res.lo[1] = _mm256_permute4x64_pd_select(a,imm8 >> 2); - res.hi[0] = _mm256_permute4x64_pd_select(a,imm8 >> 4); - res.hi[1] = _mm256_permute4x64_pd_select(a,imm8 >> 6); - - return res; -} - -AVX2NEON_ABI -__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8) -{ - return __m256i(_mm256_insertf128_ps((__m256)a,(__m128)b,imm8)); -} - - -AVX2NEON_ABI 
-__m256i _mm256_loadu_si256 (__m256i const * mem_addr) -{ - __m256i res; - res.lo = *(__m128i *)((int32_t *)mem_addr + 0); - res.hi = *(__m128i *)((int32_t *)mem_addr + 4); - return res; -} - -#define _mm256_load_si256 _mm256_loadu_si256 - -AVX2NEON_ABI -void _mm256_storeu_ps (float * mem_addr, __m256 a) -{ - *(__m128 *)(mem_addr + 0) = a.lo; - *(__m128 *)(mem_addr + 4) = a.hi; - -} - -#define _mm256_store_ps _mm256_storeu_ps -#define _mm256_stream_ps _mm256_storeu_ps - - -AVX2NEON_ABI -void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a) -{ - *(__m128i *)((int *)mem_addr + 0) = a.lo; - *(__m128i *)((int *)mem_addr + 4) = a.hi; - -} - -#define _mm256_store_si256 _mm256_storeu_si256 - - - -AVX2NEON_ABI -__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask) -{ - __m256 res; - res.lo = _mm_maskload_ps(mem_addr,mask.lo); - res.hi = _mm_maskload_ps(mem_addr + 4,mask.hi); - return res; - -} - - -AVX2NEON_ABI -__m256i _mm256_cvtepu8_epi32 (__m128i a) -{ - __m256i res; - uint8x16_t x = uint8x16_t(a); - for (int i=0;i<4;i++) - { - res.lo[i] = x[i]; - res.hi[i] = x[i+4]; - } - return res; -} - - -AVX2NEON_ABI -__m256i _mm256_cvtepi8_epi32 (__m128i a) -{ - __m256i res; - int8x16_t x = int8x16_t(a); - for (int i=0;i<4;i++) - { - res.lo[i] = x[i]; - res.hi[i] = x[i+4]; - } - return res; -} - - -AVX2NEON_ABI -__m256i _mm256_cvtepu16_epi32 (__m128i a) -{ - __m256i res; - uint16x8_t x = uint16x8_t(a); - for (int i=0;i<4;i++) - { - res.lo[i] = x[i]; - res.hi[i] = x[i+4]; - } - return res; -} - -AVX2NEON_ABI -__m256i _mm256_cvtepi16_epi32 (__m128i a) -{ - __m256i res; - int16x8_t x = int16x8_t(a); - for (int i=0;i<4;i++) - { - res.lo[i] = x[i]; - res.hi[i] = x[i+4]; - } - return res; -} - - - -AVX2NEON_ABI -void _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a) -{ - _mm_maskstore_epi32(mem_addr,mask.lo,a.lo); - _mm_maskstore_epi32(mem_addr + 4,mask.hi,a.hi); -} - -AVX2NEON_ABI -__m256i _mm256_slli_epi32 (__m256i a, int imm8) -{ - __m256i res; - res.lo = 
_mm_slli_epi32(a.lo,imm8); - res.hi = _mm_slli_epi32(a.hi,imm8); - return res; -} - - -AVX2NEON_ABI -__m256i _mm256_srli_epi32 (__m256i a, int imm8) -{ - __m256i res; - res.lo = _mm_srli_epi32(a.lo,imm8); - res.hi = _mm_srli_epi32(a.hi,imm8); - return res; -} - -AVX2NEON_ABI -__m256i _mm256_srai_epi32 (__m256i a, int imm8) -{ - __m256i res; - res.lo = _mm_srai_epi32(a.lo,imm8); - res.hi = _mm_srai_epi32(a.hi,imm8); - return res; -} - - -AVX2NEON_ABI -__m256i _mm256_sllv_epi32 (__m256i a, __m256i count) -{ - __m256i res; - res.lo = vshlq_s32(a.lo,count.lo); - res.hi = vshlq_s32(a.hi,count.hi); - return res; - -} - - -AVX2NEON_ABI -__m256i _mm256_srav_epi32 (__m256i a, __m256i count) -{ - __m256i res; - res.lo = vshlq_s32(a.lo,vnegq_s32(count.lo)); - res.hi = vshlq_s32(a.hi,vnegq_s32(count.hi)); - return res; - -} - -AVX2NEON_ABI -__m256i _mm256_srlv_epi32 (__m256i a, __m256i count) -{ - __m256i res; - res.lo = __m128i(vshlq_u32(uint32x4_t(a.lo),vnegq_s32(count.lo))); - res.hi = __m128i(vshlq_u32(uint32x4_t(a.hi),vnegq_s32(count.hi))); - return res; - -} - - -AVX2NEON_ABI -__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8) -{ - return __m256i(_mm256_permute2f128_ps(__m256(a),__m256(b),imm8)); -} - - -AVX2NEON_ABI -__m128i _mm256_extractf128_si256 (__m256i a, const int imm8) -{ - if (imm8 & 1) return a.hi; - return a.lo; -} - -AVX2NEON_ABI -__m256 _mm256_set1_ps(float x) -{ - __m256 res; - res.lo = res.hi = vdupq_n_f32(x); - return res; -} - -AVX2NEON_ABI -__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) -{ - __m256 res; - res.lo = _mm_set_ps(e3,e2,e1,e0); - res.hi = _mm_set_ps(e7,e6,e5,e4); - return res; -} - -AVX2NEON_ABI -__m256 _mm256_broadcast_ps (__m128 const * mem_addr) -{ - __m256 res; - res.lo = res.hi = *mem_addr; - return res; -} - -AVX2NEON_ABI -__m256 _mm256_cvtepi32_ps (__m256i a) -{ - __m256 res; - res.lo = _mm_cvtepi32_ps(a.lo); - res.hi = _mm_cvtepi32_ps(a.hi); - return res; -} 
-AVX2NEON_ABI -void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a) -{ - for (int i=0;i<4;i++) { - if (mask.lo[i] & 0x80000000) mem_addr[i] = a.lo[i]; - if (mask.hi[i] & 0x80000000) mem_addr[i+4] = a.hi[i]; - } -} - -AVX2NEON_ABI -__m256d _mm256_andnot_pd (__m256d a, __m256d b) -{ - __m256d res; - res.lo = float64x2_t(_mm_andnot_ps(__m128(a.lo),__m128(b.lo))); - res.hi = float64x2_t(_mm_andnot_ps(__m128(a.hi),__m128(b.hi))); - return res; -} - -AVX2NEON_ABI -__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8) -{ - __m256 res; - res.lo = _mm_blend_ps(a.lo,b.lo,imm8 & 0xf); - res.hi = _mm_blend_ps(a.hi,b.hi,imm8 >> 4); - return res; - -} - - -AVX2NEON_ABI -__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8) -{ - __m256i res; - res.lo = _mm_blend_epi32(a.lo,b.lo,imm8 & 0xf); - res.hi = _mm_blend_epi32(a.hi,b.hi,imm8 >> 4); - return res; - -} - -AVX2NEON_ABI -__m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale) -{ - __m256i res; - for (int i=0;i<4;i++) - { - res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale)); - res.hi[i] = *(int *)((char *) base_addr + (vindex.hi[i]*scale)); - } - return res; -} - - -AVX2NEON_ABI -__m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale) -{ - __m256i res = _mm256_setzero_si256(); - for (int i=0;i<4;i++) - { - if (mask.lo[i] >> 31) res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale)); - if (mask.hi[i] >> 31) res.hi[i] = *(int *)((char *) base_addr + (vindex.hi[i]*scale)); - } - - return res; - -} - - diff --git a/thirdparty/embree-aarch64/common/math/SSE2NEON.h b/thirdparty/embree-aarch64/common/math/SSE2NEON.h deleted file mode 100644 index 2013151d31..0000000000 --- a/thirdparty/embree-aarch64/common/math/SSE2NEON.h +++ /dev/null @@ -1,1753 +0,0 @@ -#ifndef SSE2NEON_H -#define SSE2NEON_H - -// This header file provides a simple API translation layer -// between SSE intrinsics to 
their corresponding ARM NEON versions -// -// This header file does not (yet) translate *all* of the SSE intrinsics. -// Since this is in support of a specific porting effort, I have only -// included the intrinsics I needed to get my port to work. -// -// Questions/Comments/Feedback send to: jratcliffscarab@gmail.com -// -// If you want to improve or add to this project, send me an -// email and I will probably approve your access to the depot. -// -// Project is located here: -// -// https://github.com/jratcliff63367/sse2neon -// -// Show your appreciation for open source by sending me a bitcoin tip to the following -// address. -// -// TipJar: 1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p : -// https://blockchain.info/address/1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p -// -// -// Contributors to this project are: -// -// John W. Ratcliff : jratcliffscarab@gmail.com -// Brandon Rowlett : browlett@nvidia.com -// Ken Fast : kfast@gdeb.com -// Eric van Beurden : evanbeurden@nvidia.com -// -// -// ********************************************************************************************************************* -// Release notes for January 20, 2017 version: -// -// The unit tests have been refactored. They no longer assert on an error, instead they return a pass/fail condition -// The unit-tests now test 10,000 random float and int values against each intrinsic. -// -// SSE2NEON now supports 95 SSE intrinsics. 39 of them have formal unit tests which have been implemented and -// fully tested on NEON/ARM. The remaining 56 still need unit tests implemented. -// -// A struct is now defined in this header file called 'SIMDVec' which can be used by applications which -// attempt to access the contents of an _m128 struct directly. 
It is important to note that accessing the __m128 -// struct directly is bad coding practice by Microsoft: @see: https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx -// -// However, some legacy source code may try to access the contents of an __m128 struct directly so the developer -// can use the SIMDVec as an alias for it. Any casting must be done manually by the developer, as you cannot -// cast or otherwise alias the base NEON data type for intrinsic operations. -// -// A bug was found with the _mm_shuffle_ps intrinsic. If the shuffle permutation was not one of the ones with -// a custom/unique implementation causing it to fall through to the default shuffle implementation it was failing -// to return the correct value. This is now fixed. -// -// A bug was found with the _mm_cvtps_epi32 intrinsic. This converts floating point values to integers. -// It was not honoring the correct rounding mode. In SSE the default rounding mode when converting from float to int -// is to use 'round to even' otherwise known as 'bankers rounding'. ARMv7 did not support this feature but ARMv8 does. -// As it stands today, this header file assumes ARMv8. If you are trying to target really old ARM devices, you may get -// a build error. -// -// Support for a number of new intrinsics was added, however, none of them yet have unit-tests to 100% confirm they are -// producing the correct results on NEON. These unit tests will be added as soon as possible. -// -// Here is the list of new instrinsics which have been added: -// -// _mm_cvtss_f32 : extracts the lower order floating point value from the parameter -// _mm_add_ss : adds the scalar single - precision floating point values of a and b -// _mm_div_ps : Divides the four single - precision, floating - point values of a and b. -// _mm_div_ss : Divides the scalar single - precision floating point value of a by b. 
-// _mm_sqrt_ss : Computes the approximation of the square root of the scalar single - precision floating point value of in. -// _mm_rsqrt_ps : Computes the approximations of the reciprocal square roots of the four single - precision floating point values of in. -// _mm_comilt_ss : Compares the lower single - precision floating point scalar values of a and b using a less than operation -// _mm_comigt_ss : Compares the lower single - precision floating point scalar values of a and b using a greater than operation. -// _mm_comile_ss : Compares the lower single - precision floating point scalar values of a and b using a less than or equal operation. -// _mm_comige_ss : Compares the lower single - precision floating point scalar values of a and b using a greater than or equal operation. -// _mm_comieq_ss : Compares the lower single - precision floating point scalar values of a and b using an equality operation. -// _mm_comineq_s : Compares the lower single - precision floating point scalar values of a and b using an inequality operation -// _mm_unpackhi_epi8 : Interleaves the upper 8 signed or unsigned 8 - bit integers in a with the upper 8 signed or unsigned 8 - bit integers in b. -// _mm_unpackhi_epi16: Interleaves the upper 4 signed or unsigned 16 - bit integers in a with the upper 4 signed or unsigned 16 - bit integers in b. 
-// -// ********************************************************************************************************************* -/* -** The MIT license: -** -** Permission is hereby granted, free of charge, to any person obtaining a copy -** of this software and associated documentation files (the "Software"), to deal -** in the Software without restriction, including without limitation the rights -** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -** copies of the Software, and to permit persons to whom the Software is furnished -** to do so, subject to the following conditions: -** -** The above copyright notice and this permission notice shall be included in all -** copies or substantial portions of the Software. - -** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -** WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -#pragma once - -#define GCC 1 -#define ENABLE_CPP_VERSION 0 - -// enable precise emulation of _mm_min_ps and _mm_max_ps? -// This would slow down the computation a bit, but gives consistent result with x86 SSE2. -// (e.g. 
would solve a hole or NaN pixel in the rendering result) -#define USE_PRECISE_MINMAX_IMPLEMENTATION (1) - -#if GCC -#define FORCE_INLINE inline __attribute__((always_inline)) -#define ALIGN_STRUCT(x) __attribute__((aligned(x))) -#else -#define FORCE_INLINE inline -#define ALIGN_STRUCT(x) __declspec(align(x)) -#endif - -#include <stdint.h> -#include "arm_neon.h" -#if defined(__aarch64__) -#include "constants.h" -#endif - - -#if !defined(__has_builtin) -#define __has_builtin(x) (0) -#endif - -/*******************************************************/ -/* MACRO for shuffle parameter for _mm_shuffle_ps(). */ -/* Argument fp3 is a digit[0123] that represents the fp*/ -/* from argument "b" of mm_shuffle_ps that will be */ -/* placed in fp3 of result. fp2 is the same for fp2 in */ -/* result. fp1 is a digit[0123] that represents the fp */ -/* from argument "a" of mm_shuffle_ps that will be */ -/* places in fp1 of result. fp0 is the same for fp0 of */ -/* result */ -/*******************************************************/ -#if defined(__aarch64__) -#define _MN_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), (((fp1)*4)+0), (((fp1)*4)+1), (((fp1)*4)+2), (((fp1)*4)+3), (((fp0)*4)+0), (((fp0)*4)+1), (((fp0)*4)+2), (((fp0)*4)+3) } ) -#define _MF_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), (((fp1)*4)+16+0), (((fp1)*4)+16+1), (((fp1)*4)+16+2), (((fp1)*4)+16+3), (((fp0)*4)+16+0), (((fp0)*4)+16+1), (((fp0)*4)+16+2), (((fp0)*4)+16+3) } ) -#endif - -#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) (((fp3) << 6) | ((fp2) << 4) | \ - ((fp1) << 2) | ((fp0))) - -typedef float32x4_t __m128; -typedef int32x4_t __m128i; - -// union intended to allow direct access to an __m128 variable using the names that the MSVC -// compiler provides. 
This union should really only be used when trying to access the members -// of the vector as integer values. GCC/clang allow native access to the float members through -// a simple array access operator (in C since 4.6, in C++ since 4.8). -// -// Ideally direct accesses to SIMD vectors should not be used since it can cause a performance -// hit. If it really is needed however, the original __m128 variable can be aliased with a -// pointer to this union and used to access individual components. The use of this union should -// be hidden behind a macro that is used throughout the codebase to access the members instead -// of always declaring this type of variable. -typedef union ALIGN_STRUCT(16) SIMDVec -{ - float m128_f32[4]; // as floats - do not to use this. Added for convenience. - int8_t m128_i8[16]; // as signed 8-bit integers. - int16_t m128_i16[8]; // as signed 16-bit integers. - int32_t m128_i32[4]; // as signed 32-bit integers. - int64_t m128_i64[2]; // as signed 64-bit integers. - uint8_t m128_u8[16]; // as unsigned 8-bit integers. - uint16_t m128_u16[8]; // as unsigned 16-bit integers. - uint32_t m128_u32[4]; // as unsigned 32-bit integers. - uint64_t m128_u64[2]; // as unsigned 64-bit integers. 
- double m128_f64[2]; // as signed double -} SIMDVec; - -// ****************************************** -// CPU stuff -// ****************************************** - -typedef SIMDVec __m128d; - -#include <stdlib.h> - -#ifndef _MM_MASK_MASK -#define _MM_MASK_MASK 0x1f80 -#define _MM_MASK_DIV_ZERO 0x200 -#define _MM_FLUSH_ZERO_ON 0x8000 -#define _MM_DENORMALS_ZERO_ON 0x40 -#define _MM_MASK_DENORM 0x100 -#endif -#define _MM_SET_EXCEPTION_MASK(x) -#define _MM_SET_FLUSH_ZERO_MODE(x) -#define _MM_SET_DENORMALS_ZERO_MODE(x) - -FORCE_INLINE void _mm_pause() -{ -} - -FORCE_INLINE void _mm_mfence() -{ - __sync_synchronize(); -} - -#define _MM_HINT_T0 3 -#define _MM_HINT_T1 2 -#define _MM_HINT_T2 1 -#define _MM_HINT_NTA 0 - -FORCE_INLINE void _mm_prefetch(const void* ptr, unsigned int level) -{ - __builtin_prefetch(ptr); - -} - -FORCE_INLINE void* _mm_malloc(int size, int align) -{ - void *ptr; - // align must be multiple of sizeof(void *) for posix_memalign. - if (align < sizeof(void *)) { - align = sizeof(void *); - } - - if ((align % sizeof(void *)) != 0) { - // fallback to malloc - ptr = malloc(size); - } else { - if (posix_memalign(&ptr, align, size)) { - return 0; - } - } - - return ptr; -} - -FORCE_INLINE void _mm_free(void* ptr) -{ - free(ptr); -} - -FORCE_INLINE int _mm_getcsr() -{ - return 0; -} - -FORCE_INLINE void _mm_setcsr(int val) -{ - return; -} - -// ****************************************** -// Set/get methods -// ****************************************** - -// extracts the lower order floating point value from the parameter : https://msdn.microsoft.com/en-us/library/bb514059%28v=vs.120%29.aspx?f=255&MSPPError=-2147217396 -#if defined(__aarch64__) -FORCE_INLINE float _mm_cvtss_f32(const __m128& x) -{ - return x[0]; -} -#else -FORCE_INLINE float _mm_cvtss_f32(__m128 a) -{ - return vgetq_lane_f32(a, 0); -} -#endif - -// Sets the 128-bit value to zero https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx -FORCE_INLINE __m128i 
_mm_setzero_si128() -{ - return vdupq_n_s32(0); -} - -// Clears the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx -FORCE_INLINE __m128 _mm_setzero_ps(void) -{ - return vdupq_n_f32(0); -} - -// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx -FORCE_INLINE __m128 _mm_set1_ps(float _w) -{ - return vdupq_n_f32(_w); -} - -// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx -FORCE_INLINE __m128 _mm_set_ps1(float _w) -{ - return vdupq_n_f32(_w); -} - -// Sets the four single-precision, floating-point values to the four inputs. https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx -#if defined(__aarch64__) -FORCE_INLINE __m128 _mm_set_ps(const float w, const float z, const float y, const float x) -{ - float32x4_t t = { x, y, z, w }; - return t; -} - -// Sets the four single-precision, floating-point values to the four inputs in reverse order. https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx -FORCE_INLINE __m128 _mm_setr_ps(const float w, const float z , const float y , const float x ) -{ - float32x4_t t = { w, z, y, x }; - return t; -} -#else -FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) -{ - float __attribute__((aligned(16))) data[4] = { x, y, z, w }; - return vld1q_f32(data); -} - -// Sets the four single-precision, floating-point values to the four inputs in reverse order. https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx -FORCE_INLINE __m128 _mm_setr_ps(float w, float z , float y , float x ) -{ - float __attribute__ ((aligned (16))) data[4] = { w, z, y, x }; - return vld1q_f32(data); -} -#endif - -// Sets the 4 signed 32-bit integer values to i. 
https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx -FORCE_INLINE __m128i _mm_set1_epi32(int _i) -{ - return vdupq_n_s32(_i); -} - -//Set the first lane to of 4 signed single-position, floating-point number to w -#if defined(__aarch64__) -FORCE_INLINE __m128 _mm_set_ss(float _w) -{ - float32x4_t res = {_w, 0, 0, 0}; - return res; -} - -// Sets the 4 signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx -FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) -{ - int32x4_t t = {i0,i1,i2,i3}; - return t; -} -#else -FORCE_INLINE __m128 _mm_set_ss(float _w) -{ - __m128 val = _mm_setzero_ps(); - return vsetq_lane_f32(_w, val, 0); -} - -// Sets the 4 signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx -FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) -{ - int32_t __attribute__((aligned(16))) data[4] = { i0, i1, i2, i3 }; - return vld1q_s32(data); -} -#endif - -// Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx -FORCE_INLINE void _mm_store_ps(float *p, __m128 a) -{ - vst1q_f32(p, a); -} - -// Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx -FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) -{ - vst1q_f32(p, a); -} - -FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) -{ - vst1q_s32((int32_t*) p,a); -} - -// Stores four 32-bit integer values as (as a __m128i value) at the address p. https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx -FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a ) -{ - vst1q_s32((int32_t*) p,a); -} - -// Stores the lower single - precision, floating - point value. 
https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx -FORCE_INLINE void _mm_store_ss(float *p, __m128 a) -{ - vst1q_lane_f32(p, a, 0); -} - -// Reads the lower 64 bits of b and stores them into the lower 64 bits of a. https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx -FORCE_INLINE void _mm_storel_epi64(__m128i* a, __m128i b) -{ - *a = (__m128i)vsetq_lane_s64((int64_t)vget_low_s32(b), *(int64x2_t*)a, 0); -} - -// Loads a single single-precision, floating-point value, copying it into all four words https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx -FORCE_INLINE __m128 _mm_load1_ps(const float * p) -{ - return vld1q_dup_f32(p); -} - -// Loads four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx -FORCE_INLINE __m128 _mm_load_ps(const float * p) -{ - return vld1q_f32(p); -} - -// Loads four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_loadu_ps(const float * p) -{ - // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are equivalent for neon - return vld1q_f32(p); -} - -// Loads an single - precision, floating - point value into the low word and clears the upper three words. https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_load_ss(const float * p) -{ - __m128 result = vdupq_n_f32(0); - return vsetq_lane_f32(*p, result, 0); -} - -FORCE_INLINE __m128i _mm_loadu_si128(__m128i *p) -{ - return (__m128i)vld1q_s32((const int32_t*) p); -} - - -// ****************************************** -// Logic/Binary operations -// ****************************************** - -// Compares for inequality. 
https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) -{ - return (__m128)vmvnq_s32((__m128i)vceqq_f32(a, b)); -} - -// Computes the bitwise AND-NOT of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx -FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) -{ - return (__m128)vbicq_s32((__m128i)b, (__m128i)a); // *NOTE* argument swap -} - -// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the 128-bit value in a. https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx -FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) -{ - return (__m128i)vbicq_s32(b, a); // *NOTE* argument swap -} - -// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx -FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) -{ - return (__m128i)vandq_s32(a, b); -} - -// Computes the bitwise AND of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx -FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) -{ - return (__m128)vandq_s32((__m128i)a, (__m128i)b); -} - -// Computes the bitwise OR of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx -FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) -{ - return (__m128)vorrq_s32((__m128i)a, (__m128i)b); -} - -// Computes bitwise EXOR (exclusive-or) of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx -FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) -{ - return (__m128)veorq_s32((__m128i)a, (__m128i)b); -} - -// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. 
https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx -FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) -{ - return (__m128i)vorrq_s32(a, b); -} - -// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx -FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) -{ - return veorq_s32(a, b); -} - -// NEON does not provide this method -// Creates a 4-bit mask from the most significant bits of the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx -FORCE_INLINE int _mm_movemask_ps(__m128 a) -{ -#if ENABLE_CPP_VERSION // I am not yet convinced that the NEON version is faster than the C version of this - uint32x4_t &ia = *(uint32x4_t *)&a; - return (ia[0] >> 31) | ((ia[1] >> 30) & 2) | ((ia[2] >> 29) & 4) | ((ia[3] >> 28) & 8); -#else - -#if defined(__aarch64__) - uint32x4_t t2 = vandq_u32(vreinterpretq_u32_f32(a), embree::movemask_mask); - return vaddvq_u32(t2); -#else - static const uint32x4_t movemask = { 1, 2, 4, 8 }; - static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; - uint32x4_t t0 = vreinterpretq_u32_f32(a); - uint32x4_t t1 = vtstq_u32(t0, highbit); - uint32x4_t t2 = vandq_u32(t1, movemask); - uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2)); - return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1); -#endif - -#endif -} - -#if defined(__aarch64__) -FORCE_INLINE int _mm_movemask_popcnt_ps(__m128 a) -{ - uint32x4_t t2 = vandq_u32(vreinterpretq_u32_f32(a), embree::movemask_mask); - t2 = vreinterpretq_u32_u8(vcntq_u8(vreinterpretq_u8_u32(t2))); - return vaddvq_u32(t2); - -} -#endif - -// Takes the upper 64 bits of a and places it in the low end of the result -// Takes the lower 64 bits of b and places it into the high end of the result. 
-FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) -{ - return vcombine_f32(vget_high_f32(a), vget_low_f32(b)); -} - -// takes the lower two 32-bit values from a and swaps them and places in high end of result -// takes the higher two 32 bit values from b and swaps them and places in low end of result. -FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) -{ - return vcombine_f32(vrev64_f32(vget_low_f32(a)), vrev64_f32(vget_high_f32(b))); -} - -// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the high -FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) -{ - return vcombine_f32(vget_low_f32(a), vget_high_f32(b)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) -{ - return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 1)), vdup_n_f32(vgetq_lane_f32(b, 0))); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) -{ - return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 2)), vdup_n_f32(vgetq_lane_f32(b, 0))); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) -{ - return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 0)), vdup_n_f32(vgetq_lane_f32(b, 2))); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) -{ - float32_t a0 = vgetq_lane_f32(a, 0); - float32_t a2 = vgetq_lane_f32(a, 2); - float32x2_t aVal = vdup_n_f32(a2); - aVal = vset_lane_f32(a0, aVal, 1); - return vcombine_f32(aVal, vget_high_f32(b)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) -{ - return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 3)), vdup_n_f32(vgetq_lane_f32(b, 1))); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) -{ - float32_t b0 = vgetq_lane_f32(b, 0); - float32_t b2 = vgetq_lane_f32(b, 2); - float32x2_t bVal = vdup_n_f32(b0); - bVal = vset_lane_f32(b2, bVal, 1); - return vcombine_f32(vget_low_f32(a), bVal); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) -{ - float32_t b0 = vgetq_lane_f32(b, 0); - float32_t b2 = vgetq_lane_f32(b, 2); - 
float32x2_t bVal = vdup_n_f32(b0); - bVal = vset_lane_f32(b2, bVal, 1); - return vcombine_f32(vrev64_f32(vget_low_f32(a)), bVal); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) -{ - float32_t b0 = vgetq_lane_f32(b, 0); - float32_t b2 = vgetq_lane_f32(b, 2); - float32x2_t bVal = vdup_n_f32(b0); - bVal = vset_lane_f32(b2, bVal, 1); - return vcombine_f32(vget_high_f32(a), bVal); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) -{ - float32x2_t a21 = vget_high_f32(vextq_f32(a, a, 3)); - float32x2_t b03 = vget_low_f32(vextq_f32(b, b, 3)); - return vcombine_f32(a21, b03); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) -{ - float32x2_t a03 = vget_low_f32(vextq_f32(a, a, 3)); - float32x2_t b21 = vget_high_f32(vextq_f32(b, b, 3)); - return vcombine_f32(a03, b21); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) -{ - float32x2_t a10 = vget_low_f32(a); - float32x2_t b10 = vget_low_f32(b); - return vcombine_f32(a10, b10); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) -{ - float32x2_t a01 = vrev64_f32(vget_low_f32(a)); - float32x2_t b10 = vget_low_f32(b); - return vcombine_f32(a01, b10); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) -{ - float32x2_t a01 = vrev64_f32(vget_low_f32(a)); - float32x2_t b01 = vrev64_f32(vget_low_f32(b)); - return vcombine_f32(a01, b01); -} - -// NEON does not support a general purpose permute intrinsic -// Currently I am not sure whether the C implementation is faster or slower than the NEON version. -// Note, this has to be expanded as a template because the shuffle value must be an immediate value. -// The same is true on SSE as well. -// Selects four specific single-precision, floating-point values from a and b, based on the mask i. 
https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx -template <int i> -FORCE_INLINE __m128 _mm_shuffle_ps_default(const __m128& a, const __m128& b) -{ -#if ENABLE_CPP_VERSION // I am not convinced that the NEON version is faster than the C version yet. - __m128 ret; - ret[0] = a[i & 0x3]; - ret[1] = a[(i >> 2) & 0x3]; - ret[2] = b[(i >> 4) & 0x03]; - ret[3] = b[(i >> 6) & 0x03]; - return ret; -#else -# if __has_builtin(__builtin_shufflevector) - return __builtin_shufflevector( \ - a, b, (i) & (0x3), ((i) >> 2) & 0x3, - (((i) >> 4) & 0x3) + 4, (((i) >> 6) & 0x3) + 4); -# else - const int i0 = (i >> 0)&0x3; - const int i1 = (i >> 2)&0x3; - const int i2 = (i >> 4)&0x3; - const int i3 = (i >> 6)&0x3; - - if (&a == &b) - { - if (i0 == i1 && i0 == i2 && i0 == i3) - { - return (float32x4_t)vdupq_laneq_f32(a,i0); - } - static const uint8_t tbl[16] = { - (i0*4) + 0,(i0*4) + 1,(i0*4) + 2,(i0*4) + 3, - (i1*4) + 0,(i1*4) + 1,(i1*4) + 2,(i1*4) + 3, - (i2*4) + 0,(i2*4) + 1,(i2*4) + 2,(i2*4) + 3, - (i3*4) + 0,(i3*4) + 1,(i3*4) + 2,(i3*4) + 3 - }; - - return (float32x4_t)vqtbl1q_s8(int8x16_t(b),*(uint8x16_t *)tbl); - - } - else - { - - static const uint8_t tbl[16] = { - (i0*4) + 0,(i0*4) + 1,(i0*4) + 2,(i0*4) + 3, - (i1*4) + 0,(i1*4) + 1,(i1*4) + 2,(i1*4) + 3, - (i2*4) + 0 + 16,(i2*4) + 1 + 16,(i2*4) + 2 + 16,(i2*4) + 3 + 16, - (i3*4) + 0 + 16,(i3*4) + 1 + 16,(i3*4) + 2 + 16,(i3*4) + 3 + 16 - }; - - return float32x4_t(vqtbl2q_s8((int8x16x2_t){int8x16_t(a),int8x16_t(b)},*(uint8x16_t *)tbl)); - } -# endif //builtin(shufflevector) -#endif -} - -template <int i > -FORCE_INLINE __m128 _mm_shuffle_ps_function(const __m128& a, const __m128& b) -{ - switch (i) - { - case _MM_SHUFFLE(1, 0, 3, 2): - return _mm_shuffle_ps_1032(a, b); - break; - case _MM_SHUFFLE(2, 3, 0, 1): - return _mm_shuffle_ps_2301(a, b); - break; - case _MM_SHUFFLE(3, 2, 1, 0): - return _mm_shuffle_ps_3210(a, b); - break; - case _MM_SHUFFLE(0, 0, 1, 1): - return _mm_shuffle_ps_0011(a, b); - break; - 
case _MM_SHUFFLE(0, 0, 2, 2): - return _mm_shuffle_ps_0022(a, b); - break; - case _MM_SHUFFLE(2, 2, 0, 0): - return _mm_shuffle_ps_2200(a, b); - break; - case _MM_SHUFFLE(3, 2, 0, 2): - return _mm_shuffle_ps_3202(a, b); - break; - case _MM_SHUFFLE(1, 1, 3, 3): - return _mm_shuffle_ps_1133(a, b); - break; - case _MM_SHUFFLE(2, 0, 1, 0): - return _mm_shuffle_ps_2010(a, b); - break; - case _MM_SHUFFLE(2, 0, 0, 1): - return _mm_shuffle_ps_2001(a, b); - break; - case _MM_SHUFFLE(2, 0, 3, 2): - return _mm_shuffle_ps_2032(a, b); - break; - case _MM_SHUFFLE(0, 3, 2, 1): - return _mm_shuffle_ps_0321(a, b); - break; - case _MM_SHUFFLE(2, 1, 0, 3): - return _mm_shuffle_ps_2103(a, b); - break; - case _MM_SHUFFLE(1, 0, 1, 0): - return _mm_shuffle_ps_1010(a, b); - break; - case _MM_SHUFFLE(1, 0, 0, 1): - return _mm_shuffle_ps_1001(a, b); - break; - case _MM_SHUFFLE(0, 1, 0, 1): - return _mm_shuffle_ps_0101(a, b); - break; - } - return _mm_shuffle_ps_default<i>(a, b); -} - -# if __has_builtin(__builtin_shufflevector) -#define _mm_shuffle_ps(a,b,i) _mm_shuffle_ps_default<i>(a,b) -# else -#define _mm_shuffle_ps(a,b,i) _mm_shuffle_ps_function<i>(a,b) -#endif - -// Takes the upper 64 bits of a and places it in the low end of the result -// Takes the lower 64 bits of b and places it into the high end of the result. -FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a, __m128i b) -{ - return vcombine_s32(vget_high_s32(a), vget_low_s32(b)); -} - -// takes the lower two 32-bit values from a and swaps them and places in low end of result -// takes the higher two 32 bit values from b and swaps them and places in high end of result. 
-FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a, __m128i b) -{ - return vcombine_s32(vrev64_s32(vget_low_s32(a)), vrev64_s32(vget_high_s32(b))); -} - -// shift a right by 32 bits, and put the lower 32 bits of a into the upper 32 bits of b -// when a and b are the same, rotates the least significant 32 bits into the most signficant 32 bits, and shifts the rest down -FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a, __m128i b) -{ - return vextq_s32(a, b, 1); -} - -// shift a left by 32 bits, and put the upper 32 bits of b into the lower 32 bits of a -// when a and b are the same, rotates the most significant 32 bits into the least signficant 32 bits, and shifts the rest up -FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a, __m128i b) -{ - return vextq_s32(a, b, 3); -} - -// gets the lower 64 bits of a, and places it in the upper 64 bits -// gets the lower 64 bits of b and places it in the lower 64 bits -FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a, __m128i b) -{ - return vcombine_s32(vget_low_s32(a), vget_low_s32(a)); -} - -// gets the lower 64 bits of a, and places it in the upper 64 bits -// gets the lower 64 bits of b, swaps the 0 and 1 elements, and places it in the lower 64 bits -FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a, __m128i b) -{ - return vcombine_s32(vrev64_s32(vget_low_s32(a)), vget_low_s32(b)); -} - -// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the upper 64 bits -// gets the lower 64 bits of b, swaps the 0 and 1 elements, and places it in the lower 64 bits -FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a, __m128i b) -{ - return vcombine_s32(vrev64_s32(vget_low_s32(a)), vrev64_s32(vget_low_s32(b))); -} - -FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a, __m128i b) -{ - return vcombine_s32(vdup_n_s32(vgetq_lane_s32(a, 1)), vdup_n_s32(vgetq_lane_s32(b, 2))); -} - -FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a, __m128i b) -{ - return vcombine_s32(vdup_n_s32(vgetq_lane_s32(a, 2)), 
vrev64_s32(vget_low_s32(b))); -} - -FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a, __m128i b) -{ - return vcombine_s32(vget_high_s32(a), vdup_n_s32(vgetq_lane_s32(b, 3))); -} - -template <int i > -FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a, __m128i b) -{ -#if ENABLE_CPP_VERSION - __m128i ret; - ret[0] = a[i & 0x3]; - ret[1] = a[(i >> 2) & 0x3]; - ret[2] = b[(i >> 4) & 0x03]; - ret[3] = b[(i >> 6) & 0x03]; - return ret; -#else - __m128i ret = vmovq_n_s32(vgetq_lane_s32(a, i & 0x3)); - ret = vsetq_lane_s32(vgetq_lane_s32(a, (i >> 2) & 0x3), ret, 1); - ret = vsetq_lane_s32(vgetq_lane_s32(b, (i >> 4) & 0x3), ret, 2); - ret = vsetq_lane_s32(vgetq_lane_s32(b, (i >> 6) & 0x3), ret, 3); - return ret; -#endif -} - -template <int i > -FORCE_INLINE __m128i _mm_shuffle_epi32_function(__m128i a, __m128i b) -{ - switch (i) - { - case _MM_SHUFFLE(1, 0, 3, 2): return _mm_shuffle_epi_1032(a, b); break; - case _MM_SHUFFLE(2, 3, 0, 1): return _mm_shuffle_epi_2301(a, b); break; - case _MM_SHUFFLE(0, 3, 2, 1): return _mm_shuffle_epi_0321(a, b); break; - case _MM_SHUFFLE(2, 1, 0, 3): return _mm_shuffle_epi_2103(a, b); break; - case _MM_SHUFFLE(1, 0, 1, 0): return _mm_shuffle_epi_1010(a, b); break; - case _MM_SHUFFLE(1, 0, 0, 1): return _mm_shuffle_epi_1001(a, b); break; - case _MM_SHUFFLE(0, 1, 0, 1): return _mm_shuffle_epi_0101(a, b); break; - case _MM_SHUFFLE(2, 2, 1, 1): return _mm_shuffle_epi_2211(a, b); break; - case _MM_SHUFFLE(0, 1, 2, 2): return _mm_shuffle_epi_0122(a, b); break; - case _MM_SHUFFLE(3, 3, 3, 2): return _mm_shuffle_epi_3332(a, b); break; - default: return _mm_shuffle_epi32_default<i>(a, b); - } -} - -template <int i > -FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a) -{ - return vdupq_n_s32(vgetq_lane_s32(a, i)); -} - -template <int i> -FORCE_INLINE __m128i _mm_shuffle_epi32_single(__m128i a) -{ - switch (i) - { - case _MM_SHUFFLE(0, 0, 0, 0): return _mm_shuffle_epi32_splat<0>(a); break; - case _MM_SHUFFLE(1, 1, 1, 1): return 
_mm_shuffle_epi32_splat<1>(a); break; - case _MM_SHUFFLE(2, 2, 2, 2): return _mm_shuffle_epi32_splat<2>(a); break; - case _MM_SHUFFLE(3, 3, 3, 3): return _mm_shuffle_epi32_splat<3>(a); break; - default: return _mm_shuffle_epi32_function<i>(a, a); - } -} - -// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx -#define _mm_shuffle_epi32(a,i) _mm_shuffle_epi32_single<i>(a) - -template <int i> -FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a) -{ - int16x8_t ret = (int16x8_t)a; - int16x4_t highBits = vget_high_s16(ret); - ret = vsetq_lane_s16(vget_lane_s16(highBits, i & 0x3), ret, 4); - ret = vsetq_lane_s16(vget_lane_s16(highBits, (i >> 2) & 0x3), ret, 5); - ret = vsetq_lane_s16(vget_lane_s16(highBits, (i >> 4) & 0x3), ret, 6); - ret = vsetq_lane_s16(vget_lane_s16(highBits, (i >> 6) & 0x3), ret, 7); - return (__m128i)ret; -} - -// Shuffles the upper 4 signed or unsigned 16 - bit integers in a as specified by imm. https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx -#define _mm_shufflehi_epi16(a,i) _mm_shufflehi_epi16_function<i>(a) - -// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while shifting in zeros. : https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx -//#define _mm_slli_epi32(a, imm) (__m128i)vshlq_n_s32(a,imm) - -// Based on SIMDe -FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, const int imm8) -{ -#if defined(__aarch64__) - const int32x4_t s = vdupq_n_s32(imm8); - return vshlq_s32(a, s); -#else - int32_t __attribute__((aligned(16))) data[4]; - vst1q_s32(data, a); - const int s = (imm8 > 31) ? 0 : imm8; - data[0] = data[0] << s; - data[1] = data[1] << s; - data[2] = data[2] << s; - data[3] = data[3] << s; - - return vld1q_s32(data); -#endif -} - - -//Shifts the 4 signed or unsigned 32-bit integers in a right by count bits while shifting in zeros. 
https://msdn.microsoft.com/en-us/library/w486zcfa(v=vs.100).aspx -//#define _mm_srli_epi32( a, imm ) (__m128i)vshrq_n_u32((uint32x4_t)a, imm) - -// Based on SIMDe -FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, const int imm8) -{ -#if defined(__aarch64__) - const int shift = (imm8 > 31) ? 0 : imm8; // Unfortunately, we need to check for this case for embree. - const int32x4_t s = vdupq_n_s32(-shift); - return vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(a), s)); -#else - int32_t __attribute__((aligned(16))) data[4]; - vst1q_s32(data, a); - - const int s = (imm8 > 31) ? 0 : imm8; - - data[0] = data[0] >> s; - data[1] = data[1] >> s; - data[2] = data[2] >> s; - data[3] = data[3] >> s; - - return vld1q_s32(data); -#endif -} - - -// Shifts the 4 signed 32 - bit integers in a right by count bits while shifting in the sign bit. https://msdn.microsoft.com/en-us/library/z1939387(v=vs.100).aspx -//#define _mm_srai_epi32( a, imm ) vshrq_n_s32(a, imm) - -// Based on SIMDe -FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, const int imm8) -{ -#if defined(__aarch64__) - const int32x4_t s = vdupq_n_s32(-imm8); - return vshlq_s32(a, s); -#else - int32_t __attribute__((aligned(16))) data[4]; - vst1q_s32(data, a); - const uint32_t m = (uint32_t) ((~0U) << (32 - imm8)); - - for (int i = 0; i < 4; i++) { - uint32_t is_neg = ((uint32_t) (((data[i]) >> 31))); - data[i] = (data[i] >> imm8) | (m * is_neg); - } - - return vld1q_s32(data); -#endif -} - -// Shifts the 128 - bit value in a right by imm bytes while shifting in zeros.imm must be an immediate. https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx -//#define _mm_srli_si128( a, imm ) (__m128i)vmaxq_s8((int8x16_t)a, vextq_s8((int8x16_t)a, vdupq_n_s8(0), imm)) -#define _mm_srli_si128( a, imm ) (__m128i)vextq_s8((int8x16_t)a, vdupq_n_s8(0), (imm)) - -// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm must be an immediate. 
https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx -#define _mm_slli_si128( a, imm ) (__m128i)vextq_s8(vdupq_n_s8(0), (int8x16_t)a, 16 - (imm)) - -// NEON does not provide a version of this function, here is an article about some ways to repro the results. -// http://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon -// Creates a 16-bit mask from the most significant bits of the 16 signed or unsigned 8-bit integers in a and zero extends the upper bits. https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx -FORCE_INLINE int _mm_movemask_epi8(__m128i _a) -{ - uint8x16_t input = (uint8x16_t)_a; - const int8_t __attribute__((aligned(16))) xr[8] = { -7, -6, -5, -4, -3, -2, -1, 0 }; - uint8x8_t mask_and = vdup_n_u8(0x80); - int8x8_t mask_shift = vld1_s8(xr); - - uint8x8_t lo = vget_low_u8(input); - uint8x8_t hi = vget_high_u8(input); - - lo = vand_u8(lo, mask_and); - lo = vshl_u8(lo, mask_shift); - - hi = vand_u8(hi, mask_and); - hi = vshl_u8(hi, mask_shift); - - lo = vpadd_u8(lo, lo); - lo = vpadd_u8(lo, lo); - lo = vpadd_u8(lo, lo); - - hi = vpadd_u8(hi, hi); - hi = vpadd_u8(hi, hi); - hi = vpadd_u8(hi, hi); - - return ((hi[0] << 8) | (lo[0] & 0xFF)); -} - - -// ****************************************** -// Math operations -// ****************************************** - -// Subtracts the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx -FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) -{ - return vsubq_f32(a, b); -} - -FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) -{ - return vsubq_f32(a, b); -} - -// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or unsigned 32-bit integers of a. 
https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx -FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) -{ - return vsubq_s32(a, b); -} - -// Adds the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx -FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) -{ - return vaddq_f32(a, b); -} - -// adds the scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx -FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) -{ - const float32_t b0 = vgetq_lane_f32(b, 0); - float32x4_t value = vdupq_n_f32(0); - - //the upper values in the result must be the remnants of <a>. - value = vsetq_lane_f32(b0, value, 0); - return vaddq_f32(a, value); -} - -// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or unsigned 32-bit integers in b. https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx -FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) -{ - return vaddq_s32(a, b); -} - -// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx -FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) -{ - return (__m128i)vaddq_s16((int16x8_t)a, (int16x8_t)b); -} - -// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or unsigned 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx -FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) -{ - return (__m128i)vmulq_s16((int16x8_t)a, (int16x8_t)b); -} - -// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or unsigned 32-bit integers from b. 
https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx -FORCE_INLINE __m128i _mm_mullo_epi32 (__m128i a, __m128i b) -{ - return (__m128i)vmulq_s32((int32x4_t)a,(int32x4_t)b); -} - -// Multiplies the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx -FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) -{ - return vmulq_f32(a, b); -} - -FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) -{ - return vmulq_f32(a, b); -} - -// Computes the approximations of reciprocals of the four single-precision, floating-point values of a. https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx -FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) -{ -#if defined(BUILD_IOS) - return vdivq_f32(vdupq_n_f32(1.0f),in); - -#endif - // Get an initial estimate of 1/in. - float32x4_t reciprocal = vrecpeq_f32(in); - - // We only return estimated 1/in. - // Newton-Raphon iteration shold be done in the outside of _mm_rcp_ps(). - - // TODO(LTE): We could delete these ifdef? - reciprocal = vmulq_f32(vrecpsq_f32(in, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(in, reciprocal), reciprocal); - return reciprocal; - -} - -FORCE_INLINE __m128 _mm_rcp_ss(__m128 in) -{ - float32x4_t value; - float32x4_t result = in; - - value = _mm_rcp_ps(in); - return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); -} - -// Divides the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx -FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) -{ -#if defined(BUILD_IOS) - return vdivq_f32(a,b); -#else - float32x4_t reciprocal = _mm_rcp_ps(b); - - reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); - - // Add one more round of newton-raphson since NEON's reciprocal estimation has less accuracy compared to SSE2's rcp. 
- reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); - - // Another round for safety - reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); - - - return vmulq_f32(a, reciprocal); -#endif -} - -// Divides the scalar single-precision floating point value of a by b. https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx -FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) -{ - float32x4_t value; - float32x4_t result = a; - value = _mm_div_ps(a, b); - return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); -} - -// Computes the approximations of the reciprocal square roots of the four single-precision floating point values of in. https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx -FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) -{ - - float32x4_t value = vrsqrteq_f32(in); - - // TODO: We must debug and ensure that rsqrt(0) and rsqrt(-0) yield proper values. - // Related code snippets can be found here: https://cpp.hotexamples.com/examples/-/-/vrsqrteq_f32/cpp-vrsqrteq_f32-function-examples.html - // If we adapt this function, we might be able to avoid special zero treatment in _mm_sqrt_ps - - value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value)); - value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value)); - - // one more round to get better precision - value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value)); - - // another round for safety - value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value)); - - return value; -} - -FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) -{ - float32x4_t result = in; - - __m128 value = _mm_rsqrt_ps(in); - - return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); -} - - -// Computes the approximations of square roots of the four single-precision, floating-point values of a. First computes reciprocal square roots and then reciprocals of the four values. 
https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx -FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) -{ -#if defined(BUILD_IOS) - return vsqrtq_f32(in); -#else - __m128 reciprocal = _mm_rsqrt_ps(in); - - // We must treat sqrt(in == 0) in a special way. At this point reciprocal contains gargabe due to vrsqrteq_f32(0) returning +inf. - // We assign 0 to reciprocal wherever required. - const float32x4_t vzero = vdupq_n_f32(0.0f); - const uint32x4_t mask = vceqq_f32(in, vzero); - reciprocal = vbslq_f32(mask, vzero, reciprocal); - - // sqrt(x) = x * (1 / sqrt(x)) - return vmulq_f32(in, reciprocal); -#endif -} - -// Computes the approximation of the square root of the scalar single-precision floating point value of in. https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx -FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) -{ - float32x4_t value; - float32x4_t result = in; - - value = _mm_sqrt_ps(in); - return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); -} - - -// Computes the maximums of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx -FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) -{ -#if USE_PRECISE_MINMAX_IMPLEMENTATION - return vbslq_f32(vcltq_f32(b,a),a,b); -#else - // Faster, but would give inconsitent rendering(e.g. holes, NaN pixels) - return vmaxq_f32(a, b); -#endif -} - -// Computes the minima of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx -FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) -{ -#if USE_PRECISE_MINMAX_IMPLEMENTATION - return vbslq_f32(vcltq_f32(a,b),a,b); -#else - // Faster, but would give inconsitent rendering(e.g. holes, NaN pixels) - return vminq_f32(a, b); -#endif -} - -// Computes the maximum of the two lower scalar single-precision floating point values of a and b. 
https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx -FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) -{ - float32x4_t value; - float32x4_t result = a; - - value = _mm_max_ps(a, b); - return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); -} - -// Computes the minimum of the two lower scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx -FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) -{ - float32x4_t value; - float32x4_t result = a; - - - value = _mm_min_ps(a, b); - return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); -} - -// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx -FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) -{ - return (__m128i)vminq_s16((int16x8_t)a, (int16x8_t)b); -} - -// epi versions of min/max -// Computes the pariwise maximums of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx -FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b ) -{ - return vmaxq_s32(a,b); -} - -// Computes the pariwise minima of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx -FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b ) -{ - return vminq_s32(a,b); -} - -// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx -FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) -{ - int16x8_t ret = vqdmulhq_s16((int16x8_t)a, (int16x8_t)b); - ret = vshrq_n_s16(ret, 1); - return (__m128i)ret; -} - -// Computes pairwise add of each argument as single-precision, floating-point values a and b. 
-//https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx -FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b ) -{ -#if defined(__aarch64__) - return vpaddq_f32(a,b); -#else -// This does not work, no vpaddq... -// return (__m128) vpaddq_f32(a,b); - // - // get two f32x2_t values from a - // do vpadd - // put result in low half of f32x4 result - // - // get two f32x2_t values from b - // do vpadd - // put result in high half of f32x4 result - // - // combine - return vcombine_f32( vpadd_f32( vget_low_f32(a), vget_high_f32(a) ), vpadd_f32( vget_low_f32(b), vget_high_f32(b) ) ); -#endif -} - -// ****************************************** -// Compare operations -// ****************************************** - -// Compares for less than https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) -{ - return (__m128)vcltq_f32(a, b); -} - -FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) -{ - return (__m128) vmvnq_s32((__m128i)_mm_cmplt_ps(a,b)); -} - -// Compares for greater than. https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) -{ - return (__m128)vcgtq_f32(a, b); -} - -FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) -{ - return (__m128) _mm_cmpgt_ps(a,b); -} - - -// Compares for greater than or equal. https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) -{ - return (__m128)vcgeq_f32(a, b); -} - -// Compares for less than or equal. https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) -{ - return (__m128)vcleq_f32(a, b); -} - -// Compares for equality. 
https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) -{ - return (__m128)vceqq_f32(a, b); -} - -// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for less than. https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) -{ - return (__m128i)vcltq_s32(a, b); -} - -FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) -{ - return (__m128i) vceqq_s32(a,b); -} - -// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for greater than. https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) -{ - return (__m128i)vcgtq_s32(a, b); -} - -// Compares the four 32-bit floats in a and b to check if any values are NaN. Ordered compare between each value returns true for "orderable" and false for "not orderable" (NaN). https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx -// see also: -// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean -// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics -FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b ) -{ - // Note: NEON does not have ordered compare builtin - // Need to compare a eq a and b eq b to check for NaN - // Do AND of results to get final - return (__m128) vreinterpretq_f32_u32( vandq_u32( vceqq_f32(a,a), vceqq_f32(b,b) ) ); -} - -// Compares the lower single-precision floating point scalar values of a and b using a less than operation. : https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx -FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) -{ - uint32x4_t value; - - value = vcltq_f32(a, b); - return vgetq_lane_u32(value, 0); -} - -// Compares the lower single-precision floating point scalar values of a and b using a greater than operation. 
: https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx -FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) -{ - uint32x4_t value; - - value = vcgtq_f32(a, b); - return vgetq_lane_u32(value, 0); -} - -// Compares the lower single-precision floating point scalar values of a and b using a less than or equal operation. : https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx -FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) -{ - uint32x4_t value; - - value = vcleq_f32(a, b); - return vgetq_lane_u32(value, 0); -} - -// Compares the lower single-precision floating point scalar values of a and b using a greater than or equal operation. : https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx -FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) -{ - uint32x4_t value; - - value = vcgeq_f32(a, b); - return vgetq_lane_u32(value, 0); -} - -// Compares the lower single-precision floating point scalar values of a and b using an equality operation. : https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx -FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) -{ - uint32x4_t value; - - value = vceqq_f32(a, b); - return vgetq_lane_u32(value, 0); -} - -// Compares the lower single-precision floating point scalar values of a and b using an inequality operation. : https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx -FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) -{ - uint32x4_t value; - - value = vceqq_f32(a, b); - return !vgetq_lane_u32(value, 0); -} - -// according to the documentation, these intrinsics behave the same as the non-'u' versions. We'll just alias them here. 
-#define _mm_ucomilt_ss _mm_comilt_ss -#define _mm_ucomile_ss _mm_comile_ss -#define _mm_ucomigt_ss _mm_comigt_ss -#define _mm_ucomige_ss _mm_comige_ss -#define _mm_ucomieq_ss _mm_comieq_ss -#define _mm_ucomineq_ss _mm_comineq_ss - -// ****************************************** -// Conversions -// ****************************************** - -// Converts the four single-precision, floating-point values of a to signed 32-bit integer values using truncate. https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) -{ - return vcvtq_s32_f32(a); -} - -// Converts the four signed 32-bit integer values of a to single-precision, floating-point values https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) -{ - return vcvtq_f32_s32(a); -} - -// Converts the four single-precision, floating-point values of a to signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx -// *NOTE*. The default rounding mode on SSE is 'round to even', which ArmV7 does not support! -// It is supported on ARMv8 however. -FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) -{ -#if 1 - return vcvtnq_s32_f32(a); -#else - __m128 half = vdupq_n_f32(0.5f); - const __m128 sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(a), 31))); - const __m128 aPlusHalf = vaddq_f32(a, half); - const __m128 aRound = vsubq_f32(aPlusHalf, sign); - return vcvtq_s32_f32(aRound); -#endif -} - -// Moves the least significant 32 bits of a to a 32-bit integer. https://msdn.microsoft.com/en-us/library/5z7a9642%28v=vs.90%29.aspx -FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) -{ - return vgetq_lane_s32(a, 0); -} - -// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, zero extending the upper bits. 
https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) -{ - __m128i result = vdupq_n_s32(0); - return vsetq_lane_s32(a, result, 0); -} - - -// Applies a type cast to reinterpret four 32-bit floating point values passed in as a 128-bit parameter as packed 32-bit integers. https://msdn.microsoft.com/en-us/library/bb514099.aspx -FORCE_INLINE __m128i _mm_castps_si128(__m128 a) -{ -#if defined(__aarch64__) - return (__m128i)a; -#else - return *(const __m128i *)&a; -#endif -} - -// Applies a type cast to reinterpret four 32-bit integers passed in as a 128-bit parameter as packed 32-bit floating point values. https://msdn.microsoft.com/en-us/library/bb514029.aspx -FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) -{ -#if defined(__aarch64__) - return (__m128)a; -#else - return *(const __m128 *)&a; -#endif -} - -// Loads 128-bit value. : https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx -FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) -{ - return vld1q_s32((int32_t *)p); -} - -FORCE_INLINE __m128d _mm_castps_pd(const __m128 a) -{ - return *(const __m128d *)&a; -} - -FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) -{ - return *(const __m128d *)&a; -} -// ****************************************** -// Miscellaneous Operations -// ****************************************** - -// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and saturates. https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) -{ - return (__m128i)vcombine_s8(vqmovn_s16((int16x8_t)a), vqmovn_s16((int16x8_t)b)); -} - -// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned integers and saturates. 
https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx -FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) -{ - return (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)a), vqmovun_s16((int16x8_t)b)); -} - -// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers and saturates. https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) -{ - return (__m128i)vcombine_s16(vqmovn_s32(a), vqmovn_s32(b)); -} - -// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower 8 signed or unsigned 8-bit integers in b. https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) -{ - int8x8_t a1 = (int8x8_t)vget_low_s16((int16x8_t)a); - int8x8_t b1 = (int8x8_t)vget_low_s16((int16x8_t)b); - - int8x8x2_t result = vzip_s8(a1, b1); - - return (__m128i)vcombine_s8(result.val[0], result.val[1]); -} - -// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the lower 4 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) -{ - int16x4_t a1 = vget_low_s16((int16x8_t)a); - int16x4_t b1 = vget_low_s16((int16x8_t)b); - - int16x4x2_t result = vzip_s16(a1, b1); - - return (__m128i)vcombine_s16(result.val[0], result.val[1]); -} - -// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the lower 2 signed or unsigned 32 - bit integers in b. https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) -{ - int32x2_t a1 = vget_low_s32(a); - int32x2_t b1 = vget_low_s32(b); - - int32x2x2_t result = vzip_s32(a1, b1); - - return vcombine_s32(result.val[0], result.val[1]); -} - -// Selects and interleaves the lower two single-precision, floating-point values from a and b. 
https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) -{ - float32x2x2_t result = vzip_f32(vget_low_f32(a), vget_low_f32(b)); - return vcombine_f32(result.val[0], result.val[1]); -} - -// Selects and interleaves the upper two single-precision, floating-point values from a and b. https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) -{ - float32x2x2_t result = vzip_f32(vget_high_f32(a), vget_high_f32(b)); - return vcombine_f32(result.val[0], result.val[1]); -} - -// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper 8 signed or unsigned 8-bit integers in b. https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) -{ - int8x8_t a1 = (int8x8_t)vget_high_s16((int16x8_t)a); - int8x8_t b1 = (int8x8_t)vget_high_s16((int16x8_t)b); - - int8x8x2_t result = vzip_s8(a1, b1); - - return (__m128i)vcombine_s8(result.val[0], result.val[1]); -} - -// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the upper 4 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) -{ - int16x4_t a1 = vget_high_s16((int16x8_t)a); - int16x4_t b1 = vget_high_s16((int16x8_t)b); - - int16x4x2_t result = vzip_s16(a1, b1); - - return (__m128i)vcombine_s16(result.val[0], result.val[1]); -} - -// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the upper 2 signed or unsigned 32-bit integers in b. 
https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) -{ - int32x2_t a1 = vget_high_s32(a); - int32x2_t b1 = vget_high_s32(b); - - int32x2x2_t result = vzip_s32(a1, b1); - - return vcombine_s32(result.val[0], result.val[1]); -} - -// Extracts the selected signed or unsigned 16-bit integer from a and zero extends. https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx -#define _mm_extract_epi16( a, imm ) vgetq_lane_s16((int16x8_t)a, imm) - -// ****************************************** -// Streaming Extensions -// ****************************************** - -// Guarantees that every preceding store is globally visible before any subsequent store. https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx -FORCE_INLINE void _mm_sfence(void) -{ - __sync_synchronize(); -} - -// Stores the data in a to the address p without polluting the caches. If the cache line containing address p is already in the cache, the cache will be updated.Address p must be 16 - byte aligned. https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx -FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) -{ - *p = a; -} - -// Cache line containing p is flushed and invalidated from all caches in the coherency domain. : https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx -FORCE_INLINE void _mm_clflush(void const*p) -{ - // no corollary for Neon? -} - -FORCE_INLINE __m128i _mm_set_epi64x(int64_t a, int64_t b) -{ - // Stick to the flipped behavior of x86. 
- int64_t __attribute__((aligned(16))) data[2] = { b, a }; - return (__m128i)vld1q_s64(data); -} - -FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) -{ - return (__m128i)vmovq_n_s64(_i); -} - -#if defined(__aarch64__) -FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 c) -{ - int32x4_t mask = vshrq_n_s32(__m128i(c),31); - return vbslq_f32( uint32x4_t(mask), b, a); -} - -FORCE_INLINE __m128i _mm_load4epu8_epi32(__m128i *ptr) -{ - uint8x8_t t0 = vld1_u8((uint8_t*)ptr); - uint16x8_t t1 = vmovl_u8(t0); - uint32x4_t t2 = vmovl_u16(vget_low_u16(t1)); - return vreinterpretq_s32_u32(t2); -} - -FORCE_INLINE __m128i _mm_load4epu16_epi32(__m128i *ptr) -{ - uint16x8_t t0 = vld1q_u16((uint16_t*)ptr); - uint32x4_t t1 = vmovl_u16(vget_low_u16(t0)); - return vreinterpretq_s32_u32(t1); -} - -FORCE_INLINE __m128i _mm_load4epi8_f32(__m128i *ptr) -{ - int8x8_t t0 = vld1_s8((int8_t*)ptr); - int16x8_t t1 = vmovl_s8(t0); - int32x4_t t2 = vmovl_s16(vget_low_s16(t1)); - float32x4_t t3 = vcvtq_f32_s32(t2); - return vreinterpretq_s32_f32(t3); -} - -FORCE_INLINE __m128i _mm_load4epu8_f32(__m128i *ptr) -{ - uint8x8_t t0 = vld1_u8((uint8_t*)ptr); - uint16x8_t t1 = vmovl_u8(t0); - uint32x4_t t2 = vmovl_u16(vget_low_u16(t1)); - return vreinterpretq_s32_u32(t2); -} - -FORCE_INLINE __m128i _mm_load4epi16_f32(__m128i *ptr) -{ - int16x8_t t0 = vld1q_s16((int16_t*)ptr); - int32x4_t t1 = vmovl_s16(vget_low_s16(t0)); - float32x4_t t2 = vcvtq_f32_s32(t1); - return vreinterpretq_s32_f32(t2); -} - -FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) -{ - return (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)a), vqmovun_s16((int16x8_t)b)); -} - -FORCE_INLINE __m128i _mm_stream_load_si128(__m128i* ptr) -{ - // No non-temporal load on a single register on ARM. - return vreinterpretq_s32_u8(vld1q_u8((uint8_t*)ptr)); -} - -FORCE_INLINE void _mm_stream_ps(float* ptr, __m128i a) -{ - // No non-temporal store on a single register on ARM. 
- vst1q_f32((float*)ptr, vreinterpretq_f32_s32(a)); -} - -FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) -{ - return vreinterpretq_s32_u32(vminq_u32(vreinterpretq_u32_s32(a), vreinterpretq_u32_s32(b))); -} - -FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) -{ - return vreinterpretq_s32_u32(vmaxq_u32(vreinterpretq_u32_s32(a), vreinterpretq_u32_s32(b))); -} - -FORCE_INLINE __m128 _mm_abs_ps(__m128 a) -{ - return vabsq_f32(a); -} - -FORCE_INLINE __m128 _mm_madd_ps(__m128 a, __m128 b, __m128 c) -{ - return vmlaq_f32(c, a, b); -} - -FORCE_INLINE __m128 _mm_msub_ps(__m128 a, __m128 b, __m128 c) -{ - return vmlsq_f32(c, a, b); -} - -FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) -{ - return vabsq_s32(a); -} -#endif //defined(__aarch64__) - -// Count the number of bits set to 1 in unsigned 32-bit integer a, and -// return that count in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 -FORCE_INLINE int _mm_popcnt_u32(unsigned int a) -{ - return (int)vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t)a))); -} - -// Count the number of bits set to 1 in unsigned 64-bit integer a, and -// return that count in dst. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64 -FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) -{ - return (int64_t)vaddlv_u8(vcnt_u8(vcreate_u8(a))); -} - -#endif diff --git a/thirdparty/embree-aarch64/common/math/affinespace.h b/thirdparty/embree-aarch64/common/math/affinespace.h deleted file mode 100644 index 32452fbe72..0000000000 --- a/thirdparty/embree-aarch64/common/math/affinespace.h +++ /dev/null @@ -1,361 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "linearspace2.h" -#include "linearspace3.h" -#include "quaternion.h" -#include "bbox.h" -#include "vec4.h" - -namespace embree -{ - #define VectorT typename L::Vector - #define ScalarT typename L::Vector::Scalar - - //////////////////////////////////////////////////////////////////////////////// - // Affine Space - //////////////////////////////////////////////////////////////////////////////// - - template<typename L> - struct AffineSpaceT - { - L l; /*< linear part of affine space */ - VectorT p; /*< affine part of affine space */ - - //////////////////////////////////////////////////////////////////////////////// - // Constructors, Assignment, Cast, Copy Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline AffineSpaceT ( ) { } - __forceinline AffineSpaceT ( const AffineSpaceT& other ) { l = other.l; p = other.p; } - __forceinline AffineSpaceT ( const L & other ) { l = other ; p = VectorT(zero); } - __forceinline AffineSpaceT& operator=( const AffineSpaceT& other ) { l = other.l; p = other.p; return *this; } - - __forceinline AffineSpaceT( const VectorT& vx, const VectorT& vy, const VectorT& vz, const VectorT& p ) : l(vx,vy,vz), p(p) {} - __forceinline AffineSpaceT( const L& l, const VectorT& p ) : l(l), p(p) {} - - template<typename L1> __forceinline AffineSpaceT( const AffineSpaceT<L1>& s ) : l(s.l), p(s.p) {} - - 
//////////////////////////////////////////////////////////////////////////////// - // Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline AffineSpaceT( ZeroTy ) : l(zero), p(zero) {} - __forceinline AffineSpaceT( OneTy ) : l(one), p(zero) {} - - /*! return matrix for scaling */ - static __forceinline AffineSpaceT scale(const VectorT& s) { return L::scale(s); } - - /*! return matrix for translation */ - static __forceinline AffineSpaceT translate(const VectorT& p) { return AffineSpaceT(one,p); } - - /*! return matrix for rotation, only in 2D */ - static __forceinline AffineSpaceT rotate(const ScalarT& r) { return L::rotate(r); } - - /*! return matrix for rotation around arbitrary point (2D) or axis (3D) */ - static __forceinline AffineSpaceT rotate(const VectorT& u, const ScalarT& r) { return L::rotate(u,r); } - - /*! return matrix for rotation around arbitrary axis and point, only in 3D */ - static __forceinline AffineSpaceT rotate(const VectorT& p, const VectorT& u, const ScalarT& r) { return translate(+p) * rotate(u,r) * translate(-p); } - - /*! 
return matrix for looking at given point, only in 3D */ - static __forceinline AffineSpaceT lookat(const VectorT& eye, const VectorT& point, const VectorT& up) { - VectorT Z = normalize(point-eye); - VectorT U = normalize(cross(up,Z)); - VectorT V = normalize(cross(Z,U)); - return AffineSpaceT(L(U,V,Z),eye); - } - - }; - - // template specialization to get correct identity matrix for type AffineSpace3fa - template<> - __forceinline AffineSpaceT<LinearSpace3ff>::AffineSpaceT( OneTy ) : l(one), p(0.f, 0.f, 0.f, 1.f) {} - - //////////////////////////////////////////////////////////////////////////////// - // Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename L> __forceinline AffineSpaceT<L> operator -( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(-a.l,-a.p); } - template<typename L> __forceinline AffineSpaceT<L> operator +( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(+a.l,+a.p); } - template<typename L> __forceinline AffineSpaceT<L> rcp( const AffineSpaceT<L>& a ) { L il = rcp(a.l); return AffineSpaceT<L>(il,-(il*a.p)); } - - //////////////////////////////////////////////////////////////////////////////// - // Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename L> __forceinline const AffineSpaceT<L> operator +( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l+b.l,a.p+b.p); } - template<typename L> __forceinline const AffineSpaceT<L> operator -( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l-b.l,a.p-b.p); } - - template<typename L> __forceinline const AffineSpaceT<L> operator *( const ScalarT & a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a*b.l,a*b.p); } - template<typename L> __forceinline const AffineSpaceT<L> operator *( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l*b.l,a.l*b.p+a.p); } - 
template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a * rcp(b); } - template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const ScalarT & b ) { return a * rcp(b); } - - template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a * b; } - template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const ScalarT & b ) { return a = a * b; } - template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a / b; } - template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const ScalarT & b ) { return a = a / b; } - - template<typename L> __forceinline VectorT xfmPoint (const AffineSpaceT<L>& m, const VectorT& p) { return madd(VectorT(p.x),m.l.vx,madd(VectorT(p.y),m.l.vy,madd(VectorT(p.z),m.l.vz,m.p))); } - template<typename L> __forceinline VectorT xfmVector(const AffineSpaceT<L>& m, const VectorT& v) { return xfmVector(m.l,v); } - template<typename L> __forceinline VectorT xfmNormal(const AffineSpaceT<L>& m, const VectorT& n) { return xfmNormal(m.l,n); } - - __forceinline const BBox<Vec3fa> xfmBounds(const AffineSpaceT<LinearSpace3<Vec3fa> >& m, const BBox<Vec3fa>& b) - { - BBox3fa dst = empty; - const Vec3fa p0(b.lower.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p0)); - const Vec3fa p1(b.lower.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p1)); - const Vec3fa p2(b.lower.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p2)); - const Vec3fa p3(b.lower.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p3)); - const Vec3fa p4(b.upper.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p4)); - const Vec3fa p5(b.upper.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p5)); - const Vec3fa p6(b.upper.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p6)); - const Vec3fa 
p7(b.upper.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p7)); - return dst; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename L> __forceinline bool operator ==( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l == b.l && a.p == b.p; } - template<typename L> __forceinline bool operator !=( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l != b.l || a.p != b.p; } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - template<typename L> __forceinline AffineSpaceT<L> select ( const typename L::Vector::Scalar::Bool& s, const AffineSpaceT<L>& t, const AffineSpaceT<L>& f ) { - return AffineSpaceT<L>(select(s,t.l,f.l),select(s,t.p,f.p)); - } - - //////////////////////////////////////////////////////////////////////////////// - // Output Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename L> static embree_ostream operator<<(embree_ostream cout, const AffineSpaceT<L>& m) { - return cout << "{ l = " << m.l << ", p = " << m.p << " }"; - } - - //////////////////////////////////////////////////////////////////////////////// - // Template Instantiations - //////////////////////////////////////////////////////////////////////////////// - - typedef AffineSpaceT<LinearSpace2f> AffineSpace2f; - typedef AffineSpaceT<LinearSpace3f> AffineSpace3f; - typedef AffineSpaceT<LinearSpace3fa> AffineSpace3fa; - typedef AffineSpaceT<LinearSpace3fx> AffineSpace3fx; - typedef AffineSpaceT<LinearSpace3ff> AffineSpace3ff; - typedef AffineSpaceT<Quaternion3f > OrthonormalSpace3f; - - template<int N> using AffineSpace3vf = AffineSpaceT<LinearSpace3<Vec3<vfloat<N>>>>; - typedef 
AffineSpaceT<LinearSpace3<Vec3<vfloat<4>>>> AffineSpace3vf4; - typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<8>>>> AffineSpace3vf8; - typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<16>>>> AffineSpace3vf16; - - template<int N> using AffineSpace3vff = AffineSpaceT<LinearSpace3<Vec4<vfloat<N>>>>; - typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<4>>>> AffineSpace3vfa4; - typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<8>>>> AffineSpace3vfa8; - typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<16>>>> AffineSpace3vfa16; - - ////////////////////////////////////////////////////////////////////////////// - /// Interpolation - ////////////////////////////////////////////////////////////////////////////// - template<typename T, typename R> - __forceinline AffineSpaceT<T> lerp(const AffineSpaceT<T>& M0, - const AffineSpaceT<T>& M1, - const R& t) - { - return AffineSpaceT<T>(lerp(M0.l,M1.l,t),lerp(M0.p,M1.p,t)); - } - - // slerp interprets the 16 floats of the matrix M = D * R * S as components of - // three matrizes (D, R, S) that are interpolated individually. 
- template<typename T> __forceinline AffineSpaceT<LinearSpace3<Vec3<T>>> - slerp(const AffineSpaceT<LinearSpace3<Vec4<T>>>& M0, - const AffineSpaceT<LinearSpace3<Vec4<T>>>& M1, - const T& t) - { - QuaternionT<T> q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w); - QuaternionT<T> q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w); - QuaternionT<T> q = slerp(q0, q1, t); - - AffineSpaceT<LinearSpace3<Vec3<T>>> S = lerp(M0, M1, t); - AffineSpaceT<LinearSpace3<Vec3<T>>> D(one); - D.p.x = S.l.vx.y; - D.p.y = S.l.vx.z; - D.p.z = S.l.vy.z; - S.l.vx.y = 0; - S.l.vx.z = 0; - S.l.vy.z = 0; - - AffineSpaceT<LinearSpace3<Vec3<T>>> R = LinearSpace3<Vec3<T>>(q); - return D * R * S; - } - - // this is a specialized version for Vec3fa because that does - // not play along nicely with the other templated Vec3/Vec4 types - __forceinline AffineSpace3fa slerp(const AffineSpace3ff& M0, - const AffineSpace3ff& M1, - const float& t) - { - Quaternion3f q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w); - Quaternion3f q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w); - Quaternion3f q = slerp(q0, q1, t); - - AffineSpace3fa S = lerp(M0, M1, t); - AffineSpace3fa D(one); - D.p.x = S.l.vx.y; - D.p.y = S.l.vx.z; - D.p.z = S.l.vy.z; - S.l.vx.y = 0; - S.l.vx.z = 0; - S.l.vy.z = 0; - - AffineSpace3fa R = LinearSpace3fa(q); - return D * R * S; - } - - __forceinline AffineSpace3fa quaternionDecompositionToAffineSpace(const AffineSpace3ff& qd) - { - // compute affine transform from quaternion decomposition - Quaternion3f q(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w); - AffineSpace3fa M = qd; - AffineSpace3fa D(one); - D.p.x = M.l.vx.y; - D.p.y = M.l.vx.z; - D.p.z = M.l.vy.z; - M.l.vx.y = 0; - M.l.vx.z = 0; - M.l.vy.z = 0; - AffineSpace3fa R = LinearSpace3fa(q); - return D * R * M; - } - - __forceinline void quaternionDecomposition(const AffineSpace3ff& qd, Vec3fa& T, Quaternion3f& q, AffineSpace3fa& S) - { - q = Quaternion3f(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w); - S = qd; - T.x = qd.l.vx.y; - T.y = qd.l.vx.z; - T.z = 
qd.l.vy.z; - S.l.vx.y = 0; - S.l.vx.z = 0; - S.l.vy.z = 0; - } - - __forceinline AffineSpace3fx quaternionDecomposition(Vec3fa const& T, Quaternion3f const& q, AffineSpace3fa const& S) - { - AffineSpace3ff M = S; - M.l.vx.w = q.i; - M.l.vy.w = q.j; - M.l.vz.w = q.k; - M.p.w = q.r; - M.l.vx.y = T.x; - M.l.vx.z = T.y; - M.l.vy.z = T.z; - return M; - } - - struct __aligned(16) QuaternionDecomposition - { - float scale_x = 1.f; - float scale_y = 1.f; - float scale_z = 1.f; - float skew_xy = 0.f; - float skew_xz = 0.f; - float skew_yz = 0.f; - float shift_x = 0.f; - float shift_y = 0.f; - float shift_z = 0.f; - float quaternion_r = 1.f; - float quaternion_i = 0.f; - float quaternion_j = 0.f; - float quaternion_k = 0.f; - float translation_x = 0.f; - float translation_y = 0.f; - float translation_z = 0.f; - }; - - __forceinline QuaternionDecomposition quaternionDecomposition(AffineSpace3ff const& M) - { - QuaternionDecomposition qd; - qd.scale_x = M.l.vx.x; - qd.scale_y = M.l.vy.y; - qd.scale_z = M.l.vz.z; - qd.shift_x = M.p.x; - qd.shift_y = M.p.y; - qd.shift_z = M.p.z; - qd.translation_x = M.l.vx.y; - qd.translation_y = M.l.vx.z; - qd.translation_z = M.l.vy.z; - qd.skew_xy = M.l.vy.x; - qd.skew_xz = M.l.vz.x; - qd.skew_yz = M.l.vz.y; - qd.quaternion_r = M.p.w; - qd.quaternion_i = M.l.vx.w; - qd.quaternion_j = M.l.vy.w; - qd.quaternion_k = M.l.vz.w; - return qd; - } - - //////////////////////////////////////////////////////////////////////////////// - /* - * ! 
Template Specialization for 2D: return matrix for rotation around point - * (rotation around arbitrarty vector is not meaningful in 2D) - */ - template<> __forceinline - AffineSpace2f AffineSpace2f::rotate(const Vec2f& p, const float& r) { - return translate(+p)*AffineSpace2f(LinearSpace2f::rotate(r))*translate(-p); - } - - //////////////////////////////////////////////////////////////////////////////// - // Similarity Transform - // - // checks, if M is a similarity transformation, i.e if there exists a factor D - // such that for all x,y: distance(Mx, My) = D * distance(x, y) - //////////////////////////////////////////////////////////////////////////////// - __forceinline bool similarityTransform(const AffineSpace3fa& M, float* D) - { - if (D) *D = 0.f; - if (abs(dot(M.l.vx, M.l.vy)) > 1e-5f) return false; - if (abs(dot(M.l.vx, M.l.vz)) > 1e-5f) return false; - if (abs(dot(M.l.vy, M.l.vz)) > 1e-5f) return false; - - const float D_x = dot(M.l.vx, M.l.vx); - const float D_y = dot(M.l.vy, M.l.vy); - const float D_z = dot(M.l.vz, M.l.vz); - - if (abs(D_x - D_y) > 1e-5f || - abs(D_x - D_z) > 1e-5f || - abs(D_y - D_z) > 1e-5f) - return false; - - if (D) *D = sqrtf(D_x); - return true; - } - - __forceinline void AffineSpace3fa_store_unaligned(const AffineSpace3fa &source, AffineSpace3fa* ptr) - { - Vec3fa::storeu(&ptr->l.vx, source.l.vx); - Vec3fa::storeu(&ptr->l.vy, source.l.vy); - Vec3fa::storeu(&ptr->l.vz, source.l.vz); - Vec3fa::storeu(&ptr->p, source.p); - } - - __forceinline AffineSpace3fa AffineSpace3fa_load_unaligned(AffineSpace3fa* ptr) - { - AffineSpace3fa space; - space.l.vx = Vec3fa::loadu(&ptr->l.vx); - space.l.vy = Vec3fa::loadu(&ptr->l.vy); - space.l.vz = Vec3fa::loadu(&ptr->l.vz); - space.p = Vec3fa::loadu(&ptr->p); - return space; - } - - #undef VectorT - #undef ScalarT -} diff --git a/thirdparty/embree-aarch64/common/math/bbox.h b/thirdparty/embree-aarch64/common/math/bbox.h deleted file mode 100644 index 29bb13912b..0000000000 --- 
a/thirdparty/embree-aarch64/common/math/bbox.h +++ /dev/null @@ -1,331 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "vec2.h" -#include "vec3.h" - -namespace embree -{ - namespace internal { - - template <typename T> __forceinline T divideByTwo(const T& v) { return v / T(2); } - template <> __forceinline float divideByTwo<float>(const float& v) { return v * 0.5f; } - template <> __forceinline double divideByTwo<double>(const double& v) { return v * 0.5; } - - } // namespace internal - template<typename T> - struct BBox - { - T lower, upper; - - //////////////////////////////////////////////////////////////////////////////// - /// Construction - //////////////////////////////////////////////////////////////////////////////// - - __forceinline BBox ( ) { } - template<typename T1> - __forceinline BBox ( const BBox<T1>& other ) : lower(other.lower), upper(other.upper) {} - __forceinline BBox& operator=( const BBox& other ) { lower = other.lower; upper = other.upper; return *this; } - - __forceinline BBox ( const T& v ) : lower(v), upper(v) {} - __forceinline BBox ( const T& lower, const T& upper ) : lower(lower), upper(upper) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Extending Bounds - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const BBox& extend(const BBox& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; } - __forceinline const BBox& extend(const T & other) { lower = min(lower,other ); upper = max(upper,other ); return *this; } - - /*! tests if box is empty */ - __forceinline bool empty() const { for (int i=0; i<T::N; i++) if (lower[i] > upper[i]) return true; return false; } - - /*! computes the size of the box */ - __forceinline T size() const { return upper - lower; } - - /*! 
computes the center of the box */ - __forceinline T center() const { return internal::divideByTwo<T>(lower+upper); } - - /*! computes twice the center of the box */ - __forceinline T center2() const { return lower+upper; } - - /*! merges two boxes */ - __forceinline static const BBox merge (const BBox& a, const BBox& b) { - return BBox(min(a.lower, b.lower), max(a.upper, b.upper)); - } - - /*! enlarge box by some scaling factor */ - __forceinline BBox enlarge_by(const float a) const { - return BBox(lower - T(a)*abs(lower), upper + T(a)*abs(upper)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline BBox( EmptyTy ) : lower(pos_inf), upper(neg_inf) {} - __forceinline BBox( FullTy ) : lower(neg_inf), upper(pos_inf) {} - __forceinline BBox( FalseTy ) : lower(pos_inf), upper(neg_inf) {} - __forceinline BBox( TrueTy ) : lower(neg_inf), upper(pos_inf) {} - __forceinline BBox( NegInfTy ): lower(pos_inf), upper(neg_inf) {} - __forceinline BBox( PosInfTy ): lower(neg_inf), upper(pos_inf) {} - }; - - template<> __forceinline bool BBox<float>::empty() const { - return lower > upper; - } - -#if defined(__SSE__) || defined(__ARM_NEON) - template<> __forceinline bool BBox<Vec3fa>::empty() const { - return !all(le_mask(lower,upper)); - } - template<> __forceinline bool BBox<Vec3fx>::empty() const { - return !all(le_mask(lower,upper)); - } -#endif - - /*! tests if box is finite */ - __forceinline bool isvalid( const BBox<Vec3fa>& v ) { - return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE))); - } - - /*! tests if box is finite and non-empty*/ - __forceinline bool isvalid_non_empty( const BBox<Vec3fa>& v ) { - return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)) & le_mask(v.lower,v.upper)); - } - - /*! 
tests if box has finite entries */ - __forceinline bool is_finite( const BBox<Vec3fa>& b) { - return is_finite(b.lower) && is_finite(b.upper); - } - - /*! test if point contained in box */ - __forceinline bool inside ( const BBox<Vec3fa>& b, const Vec3fa& p ) { return all(ge_mask(p,b.lower) & le_mask(p,b.upper)); } - - /*! computes the center of the box */ - template<typename T> __forceinline const T center2(const BBox<T>& box) { return box.lower + box.upper; } - template<typename T> __forceinline const T center (const BBox<T>& box) { return internal::divideByTwo<T>(center2(box)); } - - /*! computes the volume of a bounding box */ - __forceinline float volume ( const BBox<Vec3fa>& b ) { return reduce_mul(b.size()); } - __forceinline float safeVolume( const BBox<Vec3fa>& b ) { if (b.empty()) return 0.0f; else return volume(b); } - - /*! computes the volume of a bounding box */ - __forceinline float volume( const BBox<Vec3f>& b ) { return reduce_mul(b.size()); } - - /*! computes the surface area of a bounding box */ - template<typename T> __forceinline const T area( const BBox<Vec2<T> >& b ) { const Vec2<T> d = b.size(); return d.x*d.y; } - - template<typename T> __forceinline const T halfArea( const BBox<Vec3<T> >& b ) { return halfArea(b.size()); } - template<typename T> __forceinline const T area( const BBox<Vec3<T> >& b ) { return T(2)*halfArea(b); } - - __forceinline float halfArea( const BBox<Vec3fa>& b ) { return halfArea(b.size()); } - __forceinline float area( const BBox<Vec3fa>& b ) { return 2.0f*halfArea(b); } - - __forceinline float halfArea( const BBox<Vec3fx>& b ) { return halfArea(b.size()); } - __forceinline float area( const BBox<Vec3fx>& b ) { return 2.0f*halfArea(b); } - - template<typename Vec> __forceinline float safeArea( const BBox<Vec>& b ) { if (b.empty()) return 0.0f; else return area(b); } - - template<typename T> __forceinline float expectedApproxHalfArea(const BBox<T>& box) { - return halfArea(box); - } - - /*! 
merges bounding boxes and points */ - template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const T& b ) { return BBox<T>(min(a.lower, b ), max(a.upper, b )); } - template<typename T> __forceinline const BBox<T> merge( const T& a, const BBox<T>& b ) { return BBox<T>(min(a , b.lower), max(a , b.upper)); } - template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(min(a.lower, b.lower), max(a.upper, b.upper)); } - - /*! Merges three boxes. */ - template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return merge(a,merge(b,c)); } - - /*! Merges four boxes. */ - template<typename T> __forceinline BBox<T> merge(const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d) { - return merge(merge(a,b),merge(c,d)); - } - - /*! Comparison Operators */ - template<typename T> __forceinline bool operator==( const BBox<T>& a, const BBox<T>& b ) { return a.lower == b.lower && a.upper == b.upper; } - template<typename T> __forceinline bool operator!=( const BBox<T>& a, const BBox<T>& b ) { return a.lower != b.lower || a.upper != b.upper; } - - /*! scaling */ - template<typename T> __forceinline BBox<T> operator *( const float& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); } - template<typename T> __forceinline BBox<T> operator *( const T& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); } - - /*! 
translations */ - template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower+b.lower,a.upper+b.upper); } - template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower-b.lower,a.upper-b.upper); } - template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const T & b ) { return BBox<T>(a.lower+b ,a.upper+b ); } - template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const T & b ) { return BBox<T>(a.lower-b ,a.upper-b ); } - - /*! extension */ - template<typename T> __forceinline BBox<T> enlarge(const BBox<T>& a, const T& b) { return BBox<T>(a.lower-b, a.upper+b); } - - /*! intersect bounding boxes */ - template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(max(a.lower, b.lower), min(a.upper, b.upper)); } - template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return intersect(a,intersect(b,c)); } - template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d ) { return intersect(intersect(a,b),intersect(c,d)); } - - /*! subtract bounds from each other */ - template<typename T> __forceinline void subtract(const BBox<T>& a, const BBox<T>& b, BBox<T>& c, BBox<T>& d) - { - c.lower = a.lower; - c.upper = min(a.upper,b.lower); - d.lower = max(a.lower,b.upper); - d.upper = a.upper; - } - - /*! tests if bounding boxes (and points) are disjoint (empty intersection) */ - template<typename T> __inline bool disjoint( const BBox<T>& a, const BBox<T>& b ) { return intersect(a,b).empty(); } - template<typename T> __inline bool disjoint( const BBox<T>& a, const T& b ) { return disjoint(a,BBox<T>(b)); } - template<typename T> __inline bool disjoint( const T& a, const BBox<T>& b ) { return disjoint(BBox<T>(a),b); } - - /*! 
tests if bounding boxes (and points) are conjoint (non-empty intersection) */ - template<typename T> __inline bool conjoint( const BBox<T>& a, const BBox<T>& b ) { return !intersect(a,b).empty(); } - template<typename T> __inline bool conjoint( const BBox<T>& a, const T& b ) { return conjoint(a,BBox<T>(b)); } - template<typename T> __inline bool conjoint( const T& a, const BBox<T>& b ) { return conjoint(BBox<T>(a),b); } - - /*! subset relation */ - template<typename T> __inline bool subset( const BBox<T>& a, const BBox<T>& b ) - { - for ( size_t i = 0; i < T::N; i++ ) if ( a.lower[i] < b.lower[i] ) return false; - for ( size_t i = 0; i < T::N; i++ ) if ( a.upper[i] > b.upper[i] ) return false; - return true; - } - - template<> __inline bool subset( const BBox<Vec3fa>& a, const BBox<Vec3fa>& b ) { - return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); - } - - template<> __inline bool subset( const BBox<Vec3fx>& a, const BBox<Vec3fx>& b ) { - return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); - } - - /*! blending */ - template<typename T> - __forceinline BBox<T> lerp(const BBox<T>& b0, const BBox<T>& b1, const float t) { - return BBox<T>(lerp(b0.lower,b1.lower,t),lerp(b0.upper,b1.upper,t)); - } - - /*! output operator */ - template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const BBox<T>& box) { - return cout << "[" << box.lower << "; " << box.upper << "]"; - } - - /*! 
default template instantiations */ - typedef BBox<float> BBox1f; - typedef BBox<Vec2f> BBox2f; - typedef BBox<Vec2fa> BBox2fa; - typedef BBox<Vec3f> BBox3f; - typedef BBox<Vec3fa> BBox3fa; - typedef BBox<Vec3fx> BBox3fx; - typedef BBox<Vec3ff> BBox3ff; -} - -//////////////////////////////////////////////////////////////////////////////// -/// SSE / AVX / MIC specializations -//////////////////////////////////////////////////////////////////////////////// - -#if defined (__SSE__) || defined(__ARM_NEON) -#include "../simd/sse.h" -#endif - -#if defined (__AVX__) -#include "../simd/avx.h" -#endif - -#if defined(__AVX512F__) -#include "../simd/avx512.h" -#endif - -namespace embree -{ - template<int N> - __forceinline BBox<Vec3<vfloat<N>>> transpose(const BBox3fa* bounds); - - template<> - __forceinline BBox<Vec3<vfloat4>> transpose<4>(const BBox3fa* bounds) - { - BBox<Vec3<vfloat4>> dest; - - transpose((vfloat4&)bounds[0].lower, - (vfloat4&)bounds[1].lower, - (vfloat4&)bounds[2].lower, - (vfloat4&)bounds[3].lower, - dest.lower.x, - dest.lower.y, - dest.lower.z); - - transpose((vfloat4&)bounds[0].upper, - (vfloat4&)bounds[1].upper, - (vfloat4&)bounds[2].upper, - (vfloat4&)bounds[3].upper, - dest.upper.x, - dest.upper.y, - dest.upper.z); - - return dest; - } - -#if defined(__AVX__) - template<> - __forceinline BBox<Vec3<vfloat8>> transpose<8>(const BBox3fa* bounds) - { - BBox<Vec3<vfloat8>> dest; - - transpose((vfloat4&)bounds[0].lower, - (vfloat4&)bounds[1].lower, - (vfloat4&)bounds[2].lower, - (vfloat4&)bounds[3].lower, - (vfloat4&)bounds[4].lower, - (vfloat4&)bounds[5].lower, - (vfloat4&)bounds[6].lower, - (vfloat4&)bounds[7].lower, - dest.lower.x, - dest.lower.y, - dest.lower.z); - - transpose((vfloat4&)bounds[0].upper, - (vfloat4&)bounds[1].upper, - (vfloat4&)bounds[2].upper, - (vfloat4&)bounds[3].upper, - (vfloat4&)bounds[4].upper, - (vfloat4&)bounds[5].upper, - (vfloat4&)bounds[6].upper, - (vfloat4&)bounds[7].upper, - dest.upper.x, - dest.upper.y, - dest.upper.z); 
- - return dest; - } -#endif - - template<int N> - __forceinline BBox3fa merge(const BBox3fa* bounds); - - template<> - __forceinline BBox3fa merge<4>(const BBox3fa* bounds) - { - const Vec3fa lower = min(min(bounds[0].lower,bounds[1].lower), - min(bounds[2].lower,bounds[3].lower)); - const Vec3fa upper = max(max(bounds[0].upper,bounds[1].upper), - max(bounds[2].upper,bounds[3].upper)); - return BBox3fa(lower,upper); - } - -#if defined(__AVX__) - template<> - __forceinline BBox3fa merge<8>(const BBox3fa* bounds) - { - const Vec3fa lower = min(min(min(bounds[0].lower,bounds[1].lower),min(bounds[2].lower,bounds[3].lower)), - min(min(bounds[4].lower,bounds[5].lower),min(bounds[6].lower,bounds[7].lower))); - const Vec3fa upper = max(max(max(bounds[0].upper,bounds[1].upper),max(bounds[2].upper,bounds[3].upper)), - max(max(bounds[4].upper,bounds[5].upper),max(bounds[6].upper,bounds[7].upper))); - return BBox3fa(lower,upper); - } -#endif -} - diff --git a/thirdparty/embree-aarch64/common/math/col3.h b/thirdparty/embree-aarch64/common/math/col3.h deleted file mode 100644 index f52015fb88..0000000000 --- a/thirdparty/embree-aarch64/common/math/col3.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "math.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// RGB Color Class - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> struct Col3 - { - T r, g, b; - - //////////////////////////////////////////////////////////////////////////////// - /// Construction - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Col3 ( ) { } - __forceinline Col3 ( const Col3& other ) { r = other.r; g = other.g; b = other.b; } - __forceinline Col3& operator=( const Col3& other ) { r = other.r; g = other.g; b = other.b; return *this; } - - 
__forceinline explicit Col3 (const T& v) : r(v), g(v), b(v) {} - __forceinline Col3 (const T& r, const T& g, const T& b) : r(r), g(g), b(b) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Col3 (ZeroTy) : r(zero) , g(zero) , b(zero) {} - __forceinline Col3 (OneTy) : r(one) , g(one) , b(one) {} - __forceinline Col3 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf) {} - __forceinline Col3 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf) {} - }; - - /*! output operator */ - template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col3<T>& a) { - return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")"; - } - - /*! default template instantiations */ - typedef Col3<uint8_t > Col3uc; - typedef Col3<float > Col3f; -} diff --git a/thirdparty/embree-aarch64/common/math/col4.h b/thirdparty/embree-aarch64/common/math/col4.h deleted file mode 100644 index 90df293f8e..0000000000 --- a/thirdparty/embree-aarch64/common/math/col4.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "math.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// RGBA Color Class - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> struct Col4 - { - T r, g, b, a; - - //////////////////////////////////////////////////////////////////////////////// - /// Construction - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Col4 ( ) { } - __forceinline Col4 ( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; } - __forceinline Col4& operator=( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; return *this; } - - __forceinline 
explicit Col4 (const T& v) : r(v), g(v), b(v), a(v) {} - __forceinline Col4 (const T& r, const T& g, const T& b, const T& a) : r(r), g(g), b(b), a(a) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Col4 (ZeroTy) : r(zero) , g(zero) , b(zero) , a(zero) {} - __forceinline Col4 (OneTy) : r(one) , g(one) , b(one) , a(one) {} - __forceinline Col4 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf), a(pos_inf) {} - __forceinline Col4 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf), a(neg_inf) {} - }; - - /*! output operator */ - template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col4<T>& a) { - return cout << "(" << a.r << ", " << a.g << ", " << a.b << ", " << a.a << ")"; - } - - /*! default template instantiations */ - typedef Col4<uint8_t > Col4uc; - typedef Col4<float > Col4f; -} diff --git a/thirdparty/embree-aarch64/common/math/color.h b/thirdparty/embree-aarch64/common/math/color.h deleted file mode 100644 index c3083e4fc0..0000000000 --- a/thirdparty/embree-aarch64/common/math/color.h +++ /dev/null @@ -1,257 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "constants.h" -#include "col3.h" -#include "col4.h" - -#include "../simd/sse.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// SSE RGBA Color Class - //////////////////////////////////////////////////////////////////////////////// - - struct Color4 - { - union { - __m128 m128; - struct { float r,g,b,a; }; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Construction - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Color4 () {} - __forceinline Color4 ( const __m128 a ) : m128(a) {} - - 
__forceinline explicit Color4 (const float v) : m128(_mm_set1_ps(v)) {} - __forceinline Color4 (const float r, const float g, const float b, const float a) : m128(_mm_set_ps(a,b,g,r)) {} - - __forceinline explicit Color4 ( const Col3uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(255.0f,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); } - __forceinline explicit Color4 ( const Col3f& other ) { m128 = _mm_set_ps(1.0f,other.b,other.g,other.r); } - __forceinline explicit Color4 ( const Col4uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(other.a,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); } - __forceinline explicit Color4 ( const Col4f& other ) { m128 = _mm_set_ps(other.a,other.b,other.g,other.r); } - - __forceinline Color4 ( const Color4& other ) : m128(other.m128) {} - __forceinline Color4& operator=( const Color4& other ) { m128 = other.m128; return *this; } - - __forceinline operator const __m128&() const { return m128; } - __forceinline operator __m128&() { return m128; } - - //////////////////////////////////////////////////////////////////////////////// - /// Set - //////////////////////////////////////////////////////////////////////////////// - - __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; } - __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = a; } - __forceinline void set(Col3uc& d) const - { - vfloat4 s = clamp(vfloat4(m128))*255.0f; - d.r = (uint8_t)(s[0]); - d.g = (uint8_t)(s[1]); - d.b = (uint8_t)(s[2]); - } - __forceinline void set(Col4uc& d) const - { - vfloat4 s = clamp(vfloat4(m128))*255.0f; - d.r = (uint8_t)(s[0]); - d.g = (uint8_t)(s[1]); - d.b = (uint8_t)(s[2]); - d.a = (uint8_t)(s[3]); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Color4( ZeroTy ) : m128(_mm_set1_ps(0.0f)) {} - __forceinline Color4( OneTy ) : m128(_mm_set1_ps(1.0f)) {} - 
__forceinline Color4( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} - __forceinline Color4( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} - }; - - //////////////////////////////////////////////////////////////////////////////// - /// SSE RGB Color Class - //////////////////////////////////////////////////////////////////////////////// - - struct Color - { - union { - __m128 m128; - struct { float r,g,b; }; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Construction - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Color () {} - __forceinline Color ( const __m128 a ) : m128(a) {} - - __forceinline explicit Color (const float v) : m128(_mm_set1_ps(v)) {} - __forceinline Color (const float r, const float g, const float b) : m128(_mm_set_ps(0.0f,b,g,r)) {} - - __forceinline Color ( const Color& other ) : m128(other.m128) {} - __forceinline Color& operator=( const Color& other ) { m128 = other.m128; return *this; } - - __forceinline Color ( const Color4& other ) : m128(other.m128) {} - __forceinline Color& operator=( const Color4& other ) { m128 = other.m128; return *this; } - - __forceinline operator const __m128&() const { return m128; } - __forceinline operator __m128&() { return m128; } - - //////////////////////////////////////////////////////////////////////////////// - /// Set - //////////////////////////////////////////////////////////////////////////////// - - __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; } - __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = 1.0f; } - __forceinline void set(Col3uc& d) const - { - vfloat4 s = clamp(vfloat4(m128))*255.0f; - d.r = (uint8_t)(s[0]); - d.g = (uint8_t)(s[1]); - d.b = (uint8_t)(s[2]); - } - __forceinline void set(Col4uc& d) const - { - vfloat4 s = clamp(vfloat4(m128))*255.0f; - d.r = (uint8_t)(s[0]); - d.g = (uint8_t)(s[1]); - d.b = (uint8_t)(s[2]); - d.a = 255; - } - - 
//////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Color( ZeroTy ) : m128(_mm_set1_ps(0.0f)) {} - __forceinline Color( OneTy ) : m128(_mm_set1_ps(1.0f)) {} - __forceinline Color( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} - __forceinline Color( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const Color operator +( const Color& a ) { return a; } - __forceinline const Color operator -( const Color& a ) { - const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - return _mm_xor_ps(a.m128, mask); - } - __forceinline const Color abs ( const Color& a ) { - const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); - return _mm_and_ps(a.m128, mask); - } - __forceinline const Color rcp ( const Color& a ) - { -#if defined(__aarch64__) && defined(BUILD_IOS) - __m128 reciprocal = _mm_rcp_ps(a.m128); - reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); - return (const Color)reciprocal; -#else -#if defined(__AVX512VL__) - const Color r = _mm_rcp14_ps(a.m128); -#else - const Color r = _mm_rcp_ps(a.m128); -#endif - return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); -#endif //defined(__aarch64__) && defined(BUILD_IOS) - } - __forceinline const Color rsqrt( const Color& a ) - { -#if defined(__aarch64__) && defined(BUILD_IOS) - __m128 r = _mm_rsqrt_ps(a.m128); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); - return r; -#else - -#if defined(__AVX512VL__) - __m128 r = _mm_rsqrt14_ps(a.m128); -#else - __m128 r = _mm_rsqrt_ps(a.m128); -#endif - 
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); - -#endif //defined(__aarch64__) && defined(BUILD_IOS) - } - __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const Color operator +( const Color& a, const Color& b ) { return _mm_add_ps(a.m128, b.m128); } - __forceinline const Color operator -( const Color& a, const Color& b ) { return _mm_sub_ps(a.m128, b.m128); } - __forceinline const Color operator *( const Color& a, const Color& b ) { return _mm_mul_ps(a.m128, b.m128); } - __forceinline const Color operator *( const Color& a, const float b ) { return a * Color(b); } - __forceinline const Color operator *( const float a, const Color& b ) { return Color(a) * b; } - __forceinline const Color operator /( const Color& a, const Color& b ) { return a * rcp(b); } - __forceinline const Color operator /( const Color& a, const float b ) { return a * rcp(b); } - - __forceinline const Color min( const Color& a, const Color& b ) { return _mm_min_ps(a.m128,b.m128); } - __forceinline const Color max( const Color& a, const Color& b ) { return _mm_max_ps(a.m128,b.m128); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const Color operator+=(Color& a, const Color& b) { return a = a + b; } - __forceinline const Color operator-=(Color& a, const Color& b) { return a = a - b; } - __forceinline const Color operator*=(Color& a, const Color& b) { return a = a * b; } - __forceinline const Color operator/=(Color& a, const Color& b) { return a = a / b; } - __forceinline const Color operator*=(Color& a, const 
float b ) { return a = a * b; } - __forceinline const Color operator/=(Color& a, const float b ) { return a = a / b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline float reduce_add(const Color& v) { return v.r+v.g+v.b; } - __forceinline float reduce_mul(const Color& v) { return v.r*v.g*v.b; } - __forceinline float reduce_min(const Color& v) { return min(v.r,v.g,v.b); } - __forceinline float reduce_max(const Color& v) { return max(v.r,v.g,v.b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator ==( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } - __forceinline bool operator !=( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } - __forceinline bool operator < ( const Color& a, const Color& b ) { - if (a.r != b.r) return a.r < b.r; - if (a.g != b.g) return a.g < b.g; - if (a.b != b.b) return a.b < b.b; - return false; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const Color select( bool s, const Color& t, const Color& f ) { - __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); - return blendv_ps(f, t, mask); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Special Operators - //////////////////////////////////////////////////////////////////////////////// - - /*! 
computes luminance of a color */ - __forceinline float luminance (const Color& a) { return madd(0.212671f,a.r,madd(0.715160f,a.g,0.072169f*a.b)); } - - /*! output operator */ - __forceinline embree_ostream operator<<(embree_ostream cout, const Color& a) { - return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")"; - } -} diff --git a/thirdparty/embree-aarch64/common/math/constants.cpp b/thirdparty/embree-aarch64/common/math/constants.cpp deleted file mode 100644 index eeff131664..0000000000 --- a/thirdparty/embree-aarch64/common/math/constants.cpp +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#if defined(__aarch64__) -#include <arm_neon.h> -#endif - -#include "constants.h" - -namespace embree -{ - TrueTy True; - FalseTy False; - ZeroTy zero; - OneTy one; - NegInfTy neg_inf; - PosInfTy inf; - PosInfTy pos_inf; - NaNTy nan; - UlpTy ulp; - PiTy pi; - OneOverPiTy one_over_pi; - TwoPiTy two_pi; - OneOverTwoPiTy one_over_two_pi; - FourPiTy four_pi; - OneOverFourPiTy one_over_four_pi; - StepTy step; - ReverseStepTy reverse_step; - EmptyTy empty; - UndefinedTy undefined; - -#if defined(__aarch64__) -const uint32x4_t movemask_mask = { 1, 2, 4, 8 }; -const uint32x4_t vzero = { 0, 0, 0, 0 }; -const uint32x4_t v0x80000000 = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; -const uint32x4_t v0x7fffffff = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; -const uint32x4_t v000F = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }; -const uint32x4_t v00F0 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }; -const uint32x4_t v00FF = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }; -const uint32x4_t v0F00 = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }; -const uint32x4_t v0F0F = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }; -const uint32x4_t v0FF0 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; -const uint32x4_t v0FFF = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; -const 
uint32x4_t vF000 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }; -const uint32x4_t vF00F = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF }; -const uint32x4_t vF0F0 = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }; -const uint32x4_t vF0FF = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }; -const uint32x4_t vFF00 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }; -const uint32x4_t vFF0F = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }; -const uint32x4_t vFFF0 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; -const uint32x4_t vFFFF = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; -const uint8x16_t v0022 = {0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11}; -const uint8x16_t v1133 = {4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15}; -const uint8x16_t v0101 = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7}; -const float32x4_t vOne = { 1.0f, 1.0f, 1.0f, 1.0f }; -const float32x4_t vmOne = { -1.0f, -1.0f, -1.0f, -1.0f }; -const float32x4_t vInf = { INFINITY, INFINITY, INFINITY, INFINITY }; -const float32x4_t vmInf = { -INFINITY, -INFINITY, -INFINITY, -INFINITY }; -#endif - -} diff --git a/thirdparty/embree-aarch64/common/math/constants.h b/thirdparty/embree-aarch64/common/math/constants.h deleted file mode 100644 index e80abec80f..0000000000 --- a/thirdparty/embree-aarch64/common/math/constants.h +++ /dev/null @@ -1,239 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" - -#include <limits> - -#define _USE_MATH_DEFINES -#include <math.h> // using cmath causes issues under Windows -#include <cfloat> -#include <climits> - -// Math constants may not be defined in libcxx + mingw + strict C++ standard -#if defined(__MINGW32__) - -// TODO(LTE): use constexpr -#ifndef M_PI -#define M_PI 3.14159265358979323846 -#endif -#ifndef M_1_PI -#define M_1_PI 0.31830988618379067154 -#endif - -#endif // __MINGW32__ - -namespace embree -{ - static MAYBE_UNUSED const float one_over_255 = 1.0f/255.0f; 
- static MAYBE_UNUSED const float min_rcp_input = 1E-18f; // for abs(x) >= min_rcp_input the newton raphson rcp calculation does not fail - - /* we consider floating point numbers in that range as valid input numbers */ - static MAYBE_UNUSED float FLT_LARGE = 1.844E18f; - - struct TrueTy { - __forceinline operator bool( ) const { return true; } - }; - - extern MAYBE_UNUSED TrueTy True; - - struct FalseTy { - __forceinline operator bool( ) const { return false; } - }; - - extern MAYBE_UNUSED FalseTy False; - - struct ZeroTy - { - __forceinline operator double ( ) const { return 0; } - __forceinline operator float ( ) const { return 0; } - __forceinline operator long long( ) const { return 0; } - __forceinline operator unsigned long long( ) const { return 0; } - __forceinline operator long ( ) const { return 0; } - __forceinline operator unsigned long ( ) const { return 0; } - __forceinline operator int ( ) const { return 0; } - __forceinline operator unsigned int ( ) const { return 0; } - __forceinline operator short ( ) const { return 0; } - __forceinline operator unsigned short ( ) const { return 0; } - __forceinline operator int8_t ( ) const { return 0; } - __forceinline operator uint8_t ( ) const { return 0; } - }; - - extern MAYBE_UNUSED ZeroTy zero; - - struct OneTy - { - __forceinline operator double ( ) const { return 1; } - __forceinline operator float ( ) const { return 1; } - __forceinline operator long long( ) const { return 1; } - __forceinline operator unsigned long long( ) const { return 1; } - __forceinline operator long ( ) const { return 1; } - __forceinline operator unsigned long ( ) const { return 1; } - __forceinline operator int ( ) const { return 1; } - __forceinline operator unsigned int ( ) const { return 1; } - __forceinline operator short ( ) const { return 1; } - __forceinline operator unsigned short ( ) const { return 1; } - __forceinline operator int8_t ( ) const { return 1; } - __forceinline operator uint8_t ( ) const { return 1; } - 
}; - - extern MAYBE_UNUSED OneTy one; - - struct NegInfTy - { - __forceinline operator double ( ) const { return -std::numeric_limits<double>::infinity(); } - __forceinline operator float ( ) const { return -std::numeric_limits<float>::infinity(); } - __forceinline operator long long( ) const { return std::numeric_limits<long long>::min(); } - __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::min(); } - __forceinline operator long ( ) const { return std::numeric_limits<long>::min(); } - __forceinline operator unsigned long ( ) const { return std::numeric_limits<unsigned long>::min(); } - __forceinline operator int ( ) const { return std::numeric_limits<int>::min(); } - __forceinline operator unsigned int ( ) const { return std::numeric_limits<unsigned int>::min(); } - __forceinline operator short ( ) const { return std::numeric_limits<short>::min(); } - __forceinline operator unsigned short ( ) const { return std::numeric_limits<unsigned short>::min(); } - __forceinline operator int8_t ( ) const { return std::numeric_limits<int8_t>::min(); } - __forceinline operator uint8_t ( ) const { return std::numeric_limits<uint8_t>::min(); } - - }; - - extern MAYBE_UNUSED NegInfTy neg_inf; - - struct PosInfTy - { - __forceinline operator double ( ) const { return std::numeric_limits<double>::infinity(); } - __forceinline operator float ( ) const { return std::numeric_limits<float>::infinity(); } - __forceinline operator long long( ) const { return std::numeric_limits<long long>::max(); } - __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::max(); } - __forceinline operator long ( ) const { return std::numeric_limits<long>::max(); } - __forceinline operator unsigned long ( ) const { return std::numeric_limits<unsigned long>::max(); } - __forceinline operator int ( ) const { return std::numeric_limits<int>::max(); } - __forceinline operator unsigned int ( ) const { return 
std::numeric_limits<unsigned int>::max(); } - __forceinline operator short ( ) const { return std::numeric_limits<short>::max(); } - __forceinline operator unsigned short ( ) const { return std::numeric_limits<unsigned short>::max(); } - __forceinline operator int8_t ( ) const { return std::numeric_limits<int8_t>::max(); } - __forceinline operator uint8_t ( ) const { return std::numeric_limits<uint8_t>::max(); } - }; - - extern MAYBE_UNUSED PosInfTy inf; - extern MAYBE_UNUSED PosInfTy pos_inf; - - struct NaNTy - { - __forceinline operator double( ) const { return std::numeric_limits<double>::quiet_NaN(); } - __forceinline operator float ( ) const { return std::numeric_limits<float>::quiet_NaN(); } - }; - - extern MAYBE_UNUSED NaNTy nan; - - struct UlpTy - { - __forceinline operator double( ) const { return std::numeric_limits<double>::epsilon(); } - __forceinline operator float ( ) const { return std::numeric_limits<float>::epsilon(); } - }; - - extern MAYBE_UNUSED UlpTy ulp; - - struct PiTy - { - __forceinline operator double( ) const { return double(M_PI); } - __forceinline operator float ( ) const { return float(M_PI); } - }; - - extern MAYBE_UNUSED PiTy pi; - - struct OneOverPiTy - { - __forceinline operator double( ) const { return double(M_1_PI); } - __forceinline operator float ( ) const { return float(M_1_PI); } - }; - - extern MAYBE_UNUSED OneOverPiTy one_over_pi; - - struct TwoPiTy - { - __forceinline operator double( ) const { return double(2.0*M_PI); } - __forceinline operator float ( ) const { return float(2.0*M_PI); } - }; - - extern MAYBE_UNUSED TwoPiTy two_pi; - - struct OneOverTwoPiTy - { - __forceinline operator double( ) const { return double(0.5*M_1_PI); } - __forceinline operator float ( ) const { return float(0.5*M_1_PI); } - }; - - extern MAYBE_UNUSED OneOverTwoPiTy one_over_two_pi; - - struct FourPiTy - { - __forceinline operator double( ) const { return double(4.0*M_PI); } - __forceinline operator float ( ) const { return float(4.0*M_PI); } 
- }; - - extern MAYBE_UNUSED FourPiTy four_pi; - - struct OneOverFourPiTy - { - __forceinline operator double( ) const { return double(0.25*M_1_PI); } - __forceinline operator float ( ) const { return float(0.25*M_1_PI); } - }; - - extern MAYBE_UNUSED OneOverFourPiTy one_over_four_pi; - - struct StepTy { - }; - - extern MAYBE_UNUSED StepTy step; - - struct ReverseStepTy { - }; - - extern MAYBE_UNUSED ReverseStepTy reverse_step; - - struct EmptyTy { - }; - - extern MAYBE_UNUSED EmptyTy empty; - - struct FullTy { - }; - - extern MAYBE_UNUSED FullTy full; - - struct UndefinedTy { - }; - - extern MAYBE_UNUSED UndefinedTy undefined; - -#if defined(__aarch64__) - extern const uint32x4_t movemask_mask; - extern const uint32x4_t vzero; - extern const uint32x4_t v0x80000000; - extern const uint32x4_t v0x7fffffff; - extern const uint32x4_t v000F; - extern const uint32x4_t v00F0; - extern const uint32x4_t v00FF; - extern const uint32x4_t v0F00; - extern const uint32x4_t v0F0F; - extern const uint32x4_t v0FF0; - extern const uint32x4_t v0FFF; - extern const uint32x4_t vF000; - extern const uint32x4_t vF00F; - extern const uint32x4_t vF0F0; - extern const uint32x4_t vF0FF; - extern const uint32x4_t vFF00; - extern const uint32x4_t vFF0F; - extern const uint32x4_t vFFF0; - extern const uint32x4_t vFFFF; - extern const uint8x16_t v0022; - extern const uint8x16_t v1133; - extern const uint8x16_t v0101; - extern const float32x4_t vOne; - extern const float32x4_t vmOne; - extern const float32x4_t vInf; - extern const float32x4_t vmInf; -#endif -} diff --git a/thirdparty/embree-aarch64/common/math/interval.h b/thirdparty/embree-aarch64/common/math/interval.h deleted file mode 100644 index f06478e881..0000000000 --- a/thirdparty/embree-aarch64/common/math/interval.h +++ /dev/null @@ -1,161 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "vec2.h" -#include "vec3.h" -#include "bbox.h" - -namespace embree -{ - 
template<typename V> - struct Interval - { - V lower, upper; - - __forceinline Interval() {} - __forceinline Interval ( const Interval& other ) { lower = other.lower; upper = other.upper; } - __forceinline Interval& operator=( const Interval& other ) { lower = other.lower; upper = other.upper; return *this; } - - __forceinline Interval(const V& a) : lower(a), upper(a) {} - __forceinline Interval(const V& lower, const V& upper) : lower(lower), upper(upper) {} - __forceinline Interval(const BBox<V>& a) : lower(a.lower), upper(a.upper) {} - - /*! tests if box is empty */ - //__forceinline bool empty() const { return lower > upper; } - - /*! computes the size of the interval */ - __forceinline V size() const { return upper - lower; } - - __forceinline V center() const { return 0.5f*(lower+upper); } - - __forceinline const Interval& extend(const Interval& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; } - __forceinline const Interval& extend(const V & other) { lower = min(lower,other ); upper = max(upper,other ); return *this; } - - __forceinline friend Interval operator +( const Interval& a, const Interval& b ) { - return Interval(a.lower+b.lower,a.upper+b.upper); - } - - __forceinline friend Interval operator -( const Interval& a, const Interval& b ) { - return Interval(a.lower-b.upper,a.upper-b.lower); - } - - __forceinline friend Interval operator -( const Interval& a, const V& b ) { - return Interval(a.lower-b,a.upper-b); - } - - __forceinline friend Interval operator *( const Interval& a, const Interval& b ) - { - const V ll = a.lower*b.lower; - const V lu = a.lower*b.upper; - const V ul = a.upper*b.lower; - const V uu = a.upper*b.upper; - return Interval(min(ll,lu,ul,uu),max(ll,lu,ul,uu)); - } - - __forceinline friend Interval merge( const Interval& a, const Interval& b) { - return Interval(min(a.lower,b.lower),max(a.upper,b.upper)); - } - - __forceinline friend Interval merge( const Interval& a, const Interval& b, const 
Interval& c) { - return merge(merge(a,b),c); - } - - __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c, const Interval& d) { - return merge(merge(a,b),merge(c,d)); - } - - /*! intersect bounding boxes */ - __forceinline friend const Interval intersect( const Interval& a, const Interval& b ) { return Interval(max(a.lower, b.lower), min(a.upper, b.upper)); } - __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c ) { return intersect(a,intersect(b,c)); } - __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c, const Interval& d ) { return intersect(intersect(a,b),intersect(c,d)); } - - friend embree_ostream operator<<(embree_ostream cout, const Interval& a) { - return cout << "[" << a.lower << ", " << a.upper << "]"; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Interval( EmptyTy ) : lower(pos_inf), upper(neg_inf) {} - __forceinline Interval( FullTy ) : lower(neg_inf), upper(pos_inf) {} - }; - - __forceinline bool isEmpty(const Interval<float>& v) { - return v.lower > v.upper; - } - - __forceinline vboolx isEmpty(const Interval<vfloatx>& v) { - return v.lower > v.upper; - } - - /*! 
subset relation */ - template<typename T> __forceinline bool subset( const Interval<T>& a, const Interval<T>& b ) { - return (a.lower > b.lower) && (a.upper < b.upper); - } - - template<typename T> __forceinline bool subset( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) { - return subset(a.x,b.x) && subset(a.y,b.y); - } - - template<typename T> __forceinline const Vec2<Interval<T>> intersect( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) { - return Vec2<Interval<T>>(intersect(a.x,b.x),intersect(a.y,b.y)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline Interval<T> select ( bool s, const Interval<T>& t, const Interval<T>& f ) { - return Interval<T>(select(s,t.lower,f.lower),select(s,t.upper,f.upper)); - } - - template<typename T> __forceinline Interval<T> select ( const typename T::Bool& s, const Interval<T>& t, const Interval<T>& f ) { - return Interval<T>(select(s,t.lower,f.lower),select(s,t.upper,f.upper)); - } - - __forceinline int numRoots(const Interval<float>& p0, const Interval<float>& p1) - { - float eps = 1E-4f; - bool neg0 = p0.lower < eps; bool pos0 = p0.upper > -eps; - bool neg1 = p1.lower < eps; bool pos1 = p1.upper > -eps; - return (neg0 && pos1) || (pos0 && neg1) || (neg0 && pos0) || (neg1 && pos1); - } - - typedef Interval<float> Interval1f; - typedef Vec2<Interval<float>> Interval2f; - typedef Vec3<Interval<float>> Interval3f; - -inline void swap(float& a, float& b) { float tmp = a; a = b; b = tmp; } - -inline Interval1f shift(const Interval1f& v, float shift) { return Interval1f(v.lower + shift, v.upper + shift); } - -#define TWO_PI (2.0*M_PI) -inline Interval1f sin(Interval1f interval) -{ - if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); } - if (interval.upper > TWO_PI) { interval = shift(interval, 
-TWO_PI*floor(interval.upper/TWO_PI)); } - if (interval.lower < 0) { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); } - float sinLower = sin(interval.lower); - float sinUpper = sin(interval.upper); - if (sinLower > sinUpper) swap(sinLower, sinUpper); - if (interval.lower < M_PI / 2.0 && interval.upper > M_PI / 2.0) sinUpper = 1.0; - if (interval.lower < 3.0 * M_PI / 2.0 && interval.upper > 3.0 * M_PI / 2.0) sinLower = -1.0; - return Interval1f(sinLower, sinUpper); -} - -inline Interval1f cos(Interval1f interval) -{ - if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); } - if (interval.upper > TWO_PI) { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); } - if (interval.lower < 0) { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); } - float cosLower = cos(interval.lower); - float cosUpper = cos(interval.upper); - if (cosLower > cosUpper) swap(cosLower, cosUpper); - if (interval.lower < M_PI && interval.upper > M_PI) cosLower = -1.0; - return Interval1f(cosLower, cosUpper); -} -#undef TWO_PI -} diff --git a/thirdparty/embree-aarch64/common/math/lbbox.h b/thirdparty/embree-aarch64/common/math/lbbox.h deleted file mode 100644 index 95df4a918d..0000000000 --- a/thirdparty/embree-aarch64/common/math/lbbox.h +++ /dev/null @@ -1,289 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bbox.h" -#include "range.h" - -namespace embree -{ - template<typename T> - __forceinline std::pair<T,T> globalLinear(const std::pair<T,T>& v, const BBox1f& dt) - { - const float rcp_dt_size = float(1.0f)/dt.size(); - const T g0 = lerp(v.first,v.second,-dt.lower*rcp_dt_size); - const T g1 = lerp(v.first,v.second,(1.0f-dt.lower)*rcp_dt_size); - return std::make_pair(g0,g1); - } - - template<typename T> - struct LBBox - { - public: - __forceinline LBBox () {} - - template<typename T1> - __forceinline LBBox ( const LBBox<T1>& other ) - : 
bounds0(other.bounds0), bounds1(other.bounds1) {} - - __forceinline LBBox& operator= ( const LBBox& other ) { - bounds0 = other.bounds0; bounds1 = other.bounds1; return *this; - } - - __forceinline LBBox (EmptyTy) - : bounds0(EmptyTy()), bounds1(EmptyTy()) {} - - __forceinline explicit LBBox ( const BBox<T>& bounds) - : bounds0(bounds), bounds1(bounds) { } - - __forceinline LBBox ( const BBox<T>& bounds0, const BBox<T>& bounds1) - : bounds0(bounds0), bounds1(bounds1) { } - - LBBox ( const avector<BBox<T>>& bounds ) - { - assert(bounds.size()); - BBox<T> b0 = bounds.front(); - BBox<T> b1 = bounds.back(); - for (size_t i=1; i<bounds.size()-1; i++) { - const float f = float(i)/float(bounds.size()-1); - const BBox<T> bt = lerp(b0,b1,f); - const T dlower = min(bounds[i].lower-bt.lower,T(zero)); - const T dupper = max(bounds[i].upper-bt.upper,T(zero)); - b0.lower += dlower; b1.lower += dlower; - b0.upper += dupper; b1.upper += dupper; - } - bounds0 = b0; - bounds1 = b1; - } - - /*! calculates the linear bounds of a primitive for the specified time range */ - template<typename BoundsFunc> - __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range, float numTimeSegments) - { - const float lower = time_range.lower*numTimeSegments; - const float upper = time_range.upper*numTimeSegments; - const float ilowerf = floor(lower); - const float iupperf = ceil(upper); - const int ilower = (int)ilowerf; - const int iupper = (int)iupperf; - - const BBox<T> blower0 = bounds(ilower); - const BBox<T> bupper1 = bounds(iupper); - - if (iupper-ilower == 1) { - bounds0 = lerp(blower0, bupper1, lower-ilowerf); - bounds1 = lerp(bupper1, blower0, iupperf-upper); - return; - } - - const BBox<T> blower1 = bounds(ilower+1); - const BBox<T> bupper0 = bounds(iupper-1); - BBox<T> b0 = lerp(blower0, blower1, lower-ilowerf); - BBox<T> b1 = lerp(bupper1, bupper0, iupperf-upper); - - for (int i = ilower+1; i < iupper; i++) - { - const float f = (float(i)/numTimeSegments - time_range.lower) 
/ time_range.size(); - const BBox<T> bt = lerp(b0, b1, f); - const BBox<T> bi = bounds(i); - const T dlower = min(bi.lower-bt.lower, T(zero)); - const T dupper = max(bi.upper-bt.upper, T(zero)); - b0.lower += dlower; b1.lower += dlower; - b0.upper += dupper; b1.upper += dupper; - } - - bounds0 = b0; - bounds1 = b1; - } - - /*! calculates the linear bounds of a primitive for the specified time range */ - template<typename BoundsFunc> - __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range_in, const BBox1f& geom_time_range, float geom_time_segments) - { - /* normalize global time_range_in to local geom_time_range */ - const BBox1f time_range((time_range_in.lower-geom_time_range.lower)/geom_time_range.size(), - (time_range_in.upper-geom_time_range.lower)/geom_time_range.size()); - - const float lower = time_range.lower*geom_time_segments; - const float upper = time_range.upper*geom_time_segments; - const float ilowerf = floor(lower); - const float iupperf = ceil(upper); - const float ilowerfc = max(0.0f,ilowerf); - const float iupperfc = min(iupperf,geom_time_segments); - const int ilowerc = (int)ilowerfc; - const int iupperc = (int)iupperfc; - assert(iupperc-ilowerc > 0); - - /* this larger iteration range guarantees that we process borders of geom_time_range is (partially) inside time_range_in */ - const int ilower_iter = max(-1,(int)ilowerf); - const int iupper_iter = min((int)iupperf,(int)geom_time_segments+1); - - const BBox<T> blower0 = bounds(ilowerc); - const BBox<T> bupper1 = bounds(iupperc); - if (iupper_iter-ilower_iter == 1) { - bounds0 = lerp(blower0, bupper1, max(0.0f,lower-ilowerfc)); - bounds1 = lerp(bupper1, blower0, max(0.0f,iupperfc-upper)); - return; - } - - const BBox<T> blower1 = bounds(ilowerc+1); - const BBox<T> bupper0 = bounds(iupperc-1); - BBox<T> b0 = lerp(blower0, blower1, max(0.0f,lower-ilowerfc)); - BBox<T> b1 = lerp(bupper1, bupper0, max(0.0f,iupperfc-upper)); - - for (int i = ilower_iter+1; i < iupper_iter; i++) - { - 
const float f = (float(i)/geom_time_segments - time_range.lower) / time_range.size(); - const BBox<T> bt = lerp(b0, b1, f); - const BBox<T> bi = bounds(i); - const T dlower = min(bi.lower-bt.lower, T(zero)); - const T dupper = max(bi.upper-bt.upper, T(zero)); - b0.lower += dlower; b1.lower += dlower; - b0.upper += dupper; b1.upper += dupper; - } - - bounds0 = b0; - bounds1 = b1; - } - - /*! calculates the linear bounds of a primitive for the specified time range */ - template<typename BoundsFunc> - __forceinline LBBox(const BoundsFunc& bounds, const range<int>& time_range, int numTimeSegments) - { - const int ilower = time_range.begin(); - const int iupper = time_range.end(); - - BBox<T> b0 = bounds(ilower); - BBox<T> b1 = bounds(iupper); - - if (iupper-ilower == 1) - { - bounds0 = b0; - bounds1 = b1; - return; - } - - for (int i = ilower+1; i<iupper; i++) - { - const float f = float(i - time_range.begin()) / float(time_range.size()); - const BBox<T> bt = lerp(b0, b1, f); - const BBox<T> bi = bounds(i); - const T dlower = min(bi.lower-bt.lower, T(zero)); - const T dupper = max(bi.upper-bt.upper, T(zero)); - b0.lower += dlower; b1.lower += dlower; - b0.upper += dupper; b1.upper += dupper; - } - - bounds0 = b0; - bounds1 = b1; - } - - public: - - __forceinline bool empty() const { - return bounds().empty(); - } - - __forceinline BBox<T> bounds () const { - return merge(bounds0,bounds1); - } - - __forceinline BBox<T> interpolate( const float t ) const { - return lerp(bounds0,bounds1,t); - } - - __forceinline LBBox<T> interpolate( const BBox1f& dt ) const { - return LBBox<T>(interpolate(dt.lower),interpolate(dt.upper)); - } - - __forceinline void extend( const LBBox& other ) { - bounds0.extend(other.bounds0); - bounds1.extend(other.bounds1); - } - - __forceinline float expectedHalfArea() const; - - __forceinline float expectedHalfArea(const BBox1f& dt) const { - return interpolate(dt).expectedHalfArea(); - } - - __forceinline float expectedApproxHalfArea() const { - 
return 0.5f*(halfArea(bounds0) + halfArea(bounds1)); - } - - /* calculates bounds for [0,1] time range from bounds in dt time range */ - __forceinline LBBox global(const BBox1f& dt) const - { - const float rcp_dt_size = 1.0f/dt.size(); - const BBox<T> b0 = interpolate(-dt.lower*rcp_dt_size); - const BBox<T> b1 = interpolate((1.0f-dt.lower)*rcp_dt_size); - return LBBox(b0,b1); - } - - /*! Comparison Operators */ - //template<typename TT> friend __forceinline bool operator==( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; } - //template<typename TT> friend __forceinline bool operator!=( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; } - friend __forceinline bool operator==( const LBBox& a, const LBBox& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; } - friend __forceinline bool operator!=( const LBBox& a, const LBBox& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; } - - /*! output operator */ - friend __forceinline embree_ostream operator<<(embree_ostream cout, const LBBox& box) { - return cout << "LBBox { " << box.bounds0 << "; " << box.bounds1 << " }"; - } - - public: - BBox<T> bounds0, bounds1; - }; - - /*! 
tests if box is finite */ - template<typename T> - __forceinline bool isvalid( const LBBox<T>& v ) { - return isvalid(v.bounds0) && isvalid(v.bounds1); - } - - template<typename T> - __forceinline bool isvalid_non_empty( const LBBox<T>& v ) { - return isvalid_non_empty(v.bounds0) && isvalid_non_empty(v.bounds1); - } - - template<typename T> - __forceinline T expectedArea(const T& a0, const T& a1, const T& b0, const T& b1) - { - const T da = a1-a0; - const T db = b1-b0; - return a0*b0+(a0*db+da*b0)*T(0.5f) + da*db*T(1.0f/3.0f); - } - - template<> __forceinline float LBBox<Vec3fa>::expectedHalfArea() const - { - const Vec3fa d0 = bounds0.size(); - const Vec3fa d1 = bounds1.size(); - return reduce_add(expectedArea(Vec3fa(d0.x,d0.y,d0.z), - Vec3fa(d1.x,d1.y,d1.z), - Vec3fa(d0.y,d0.z,d0.x), - Vec3fa(d1.y,d1.z,d1.x))); - } - - template<typename T> - __forceinline float expectedApproxHalfArea(const LBBox<T>& box) { - return box.expectedApproxHalfArea(); - } - - template<typename T> - __forceinline LBBox<T> merge(const LBBox<T>& a, const LBBox<T>& b) { - return LBBox<T>(merge(a.bounds0, b.bounds0), merge(a.bounds1, b.bounds1)); - } - - /*! subset relation */ - template<typename T> __inline bool subset( const LBBox<T>& a, const LBBox<T>& b ) { - return subset(a.bounds0,b.bounds0) && subset(a.bounds1,b.bounds1); - } - - /*! 
default template instantiations */ - typedef LBBox<float> LBBox1f; - typedef LBBox<Vec2f> LBBox2f; - typedef LBBox<Vec3f> LBBox3f; - typedef LBBox<Vec3fa> LBBox3fa; - typedef LBBox<Vec3fx> LBBox3fx; -} diff --git a/thirdparty/embree-aarch64/common/math/linearspace2.h b/thirdparty/embree-aarch64/common/math/linearspace2.h deleted file mode 100644 index b9a382962c..0000000000 --- a/thirdparty/embree-aarch64/common/math/linearspace2.h +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "vec2.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// 2D Linear Transform (2x2 Matrix) - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> struct LinearSpace2 - { - typedef T Vector; - typedef typename T::Scalar Scalar; - - /*! default matrix constructor */ - __forceinline LinearSpace2 ( ) {} - __forceinline LinearSpace2 ( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; } - __forceinline LinearSpace2& operator=( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; return *this; } - - template<typename L1> __forceinline LinearSpace2( const LinearSpace2<L1>& s ) : vx(s.vx), vy(s.vy) {} - - /*! matrix construction from column vectors */ - __forceinline LinearSpace2(const Vector& vx, const Vector& vy) - : vx(vx), vy(vy) {} - - /*! matrix construction from row mayor data */ - __forceinline LinearSpace2(const Scalar& m00, const Scalar& m01, - const Scalar& m10, const Scalar& m11) - : vx(m00,m10), vy(m01,m11) {} - - /*! compute the determinant of the matrix */ - __forceinline const Scalar det() const { return vx.x*vy.y - vx.y*vy.x; } - - /*! compute adjoint matrix */ - __forceinline const LinearSpace2 adjoint() const { return LinearSpace2(vy.y,-vy.x,-vx.y,vx.x); } - - /*! 
compute inverse matrix */ - __forceinline const LinearSpace2 inverse() const { return adjoint()/det(); } - - /*! compute transposed matrix */ - __forceinline const LinearSpace2 transposed() const { return LinearSpace2(vx.x,vx.y,vy.x,vy.y); } - - /*! returns first row of matrix */ - __forceinline Vector row0() const { return Vector(vx.x,vy.x); } - - /*! returns second row of matrix */ - __forceinline Vector row1() const { return Vector(vx.y,vy.y); } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline LinearSpace2( ZeroTy ) : vx(zero), vy(zero) {} - __forceinline LinearSpace2( OneTy ) : vx(one, zero), vy(zero, one) {} - - /*! return matrix for scaling */ - static __forceinline LinearSpace2 scale(const Vector& s) { - return LinearSpace2(s.x, 0, - 0 , s.y); - } - - /*! return matrix for rotation */ - static __forceinline LinearSpace2 rotate(const Scalar& r) { - Scalar s = sin(r), c = cos(r); - return LinearSpace2(c, -s, - s, c); - } - - /*! return closest orthogonal matrix (i.e. a general rotation including reflection) */ - LinearSpace2 orthogonal() const - { - LinearSpace2 m = *this; - - // mirrored? - Scalar mirror(one); - if (m.det() < Scalar(zero)) { - m.vx = -m.vx; - mirror = -mirror; - } - - // rotation - for (int i = 0; i < 99; i++) { - const LinearSpace2 m_next = 0.5 * (m + m.transposed().inverse()); - const LinearSpace2 d = m_next - m; - m = m_next; - // norm^2 of difference small enough? - if (max(dot(d.vx, d.vx), dot(d.vy, d.vy)) < 1e-8) - break; - } - - // rotation * mirror_x - return LinearSpace2(mirror*m.vx, m.vy); - } - - public: - - /*! 
the column vectors of the matrix */ - Vector vx,vy; - }; - - //////////////////////////////////////////////////////////////////////////////// - // Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a ) { return LinearSpace2<T>(-a.vx,-a.vy); } - template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a ) { return LinearSpace2<T>(+a.vx,+a.vy); } - template<typename T> __forceinline LinearSpace2<T> rcp ( const LinearSpace2<T>& a ) { return a.inverse(); } - - //////////////////////////////////////////////////////////////////////////////// - // Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx+b.vx,a.vy+b.vy); } - template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx-b.vx,a.vy-b.vy); } - - template<typename T> __forceinline LinearSpace2<T> operator*(const typename T::Scalar & a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); } - template<typename T> __forceinline T operator*(const LinearSpace2<T>& a, const T & b) { return b.x*a.vx + b.y*a.vy; } - template<typename T> __forceinline LinearSpace2<T> operator*(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); } - - template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const typename T::Scalar & b) { return LinearSpace2<T>(a.vx/b, a.vy/b); } - template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return a * rcp(b); } - - template<typename T> __forceinline LinearSpace2<T>& operator *=( LinearSpace2<T>& a, const 
LinearSpace2<T>& b ) { return a = a * b; } - template<typename T> __forceinline LinearSpace2<T>& operator /=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a / b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline bool operator ==( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx == b.vx && a.vy == b.vy; } - template<typename T> __forceinline bool operator !=( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx != b.vx || a.vy != b.vy; } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace2<T>& m) { - return cout << "{ vx = " << m.vx << ", vy = " << m.vy << "}"; - } - - /*! Shortcuts for common linear spaces. */ - typedef LinearSpace2<Vec2f> LinearSpace2f; - typedef LinearSpace2<Vec2fa> LinearSpace2fa; -} diff --git a/thirdparty/embree-aarch64/common/math/linearspace3.h b/thirdparty/embree-aarch64/common/math/linearspace3.h deleted file mode 100644 index 12b5bb776b..0000000000 --- a/thirdparty/embree-aarch64/common/math/linearspace3.h +++ /dev/null @@ -1,213 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "vec3.h" -#include "quaternion.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// 3D Linear Transform (3x3 Matrix) - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> struct LinearSpace3 - { - typedef T Vector; - typedef typename T::Scalar Scalar; - - /*! 
default matrix constructor */ - __forceinline LinearSpace3 ( ) {} - __forceinline LinearSpace3 ( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; } - __forceinline LinearSpace3& operator=( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; return *this; } - - template<typename L1> __forceinline LinearSpace3( const LinearSpace3<L1>& s ) : vx(s.vx), vy(s.vy), vz(s.vz) {} - - /*! matrix construction from column vectors */ - __forceinline LinearSpace3(const Vector& vx, const Vector& vy, const Vector& vz) - : vx(vx), vy(vy), vz(vz) {} - - /*! construction from quaternion */ - __forceinline LinearSpace3( const QuaternionT<Scalar>& q ) - : vx((q.r*q.r + q.i*q.i - q.j*q.j - q.k*q.k), 2.0f*(q.i*q.j + q.r*q.k), 2.0f*(q.i*q.k - q.r*q.j)) - , vy(2.0f*(q.i*q.j - q.r*q.k), (q.r*q.r - q.i*q.i + q.j*q.j - q.k*q.k), 2.0f*(q.j*q.k + q.r*q.i)) - , vz(2.0f*(q.i*q.k + q.r*q.j), 2.0f*(q.j*q.k - q.r*q.i), (q.r*q.r - q.i*q.i - q.j*q.j + q.k*q.k)) {} - - /*! matrix construction from row mayor data */ - __forceinline LinearSpace3(const Scalar& m00, const Scalar& m01, const Scalar& m02, - const Scalar& m10, const Scalar& m11, const Scalar& m12, - const Scalar& m20, const Scalar& m21, const Scalar& m22) - : vx(m00,m10,m20), vy(m01,m11,m21), vz(m02,m12,m22) {} - - /*! compute the determinant of the matrix */ - __forceinline const Scalar det() const { return dot(vx,cross(vy,vz)); } - - /*! compute adjoint matrix */ - __forceinline const LinearSpace3 adjoint() const { return LinearSpace3(cross(vy,vz),cross(vz,vx),cross(vx,vy)).transposed(); } - - /*! compute inverse matrix */ - __forceinline const LinearSpace3 inverse() const { return adjoint()/det(); } - - /*! compute transposed matrix */ - __forceinline const LinearSpace3 transposed() const { return LinearSpace3(vx.x,vx.y,vx.z,vy.x,vy.y,vy.z,vz.x,vz.y,vz.z); } - - /*! returns first row of matrix */ - __forceinline Vector row0() const { return Vector(vx.x,vy.x,vz.x); } - - /*! 
returns second row of matrix */ - __forceinline Vector row1() const { return Vector(vx.y,vy.y,vz.y); } - - /*! returns third row of matrix */ - __forceinline Vector row2() const { return Vector(vx.z,vy.z,vz.z); } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline LinearSpace3( ZeroTy ) : vx(zero), vy(zero), vz(zero) {} - __forceinline LinearSpace3( OneTy ) : vx(one, zero, zero), vy(zero, one, zero), vz(zero, zero, one) {} - - /*! return matrix for scaling */ - static __forceinline LinearSpace3 scale(const Vector& s) { - return LinearSpace3(s.x, 0, 0, - 0 , s.y, 0, - 0 , 0, s.z); - } - - /*! return matrix for rotation around arbitrary axis */ - static __forceinline LinearSpace3 rotate(const Vector& _u, const Scalar& r) { - Vector u = normalize(_u); - Scalar s = sin(r), c = cos(r); - return LinearSpace3(u.x*u.x+(1-u.x*u.x)*c, u.x*u.y*(1-c)-u.z*s, u.x*u.z*(1-c)+u.y*s, - u.x*u.y*(1-c)+u.z*s, u.y*u.y+(1-u.y*u.y)*c, u.y*u.z*(1-c)-u.x*s, - u.x*u.z*(1-c)-u.y*s, u.y*u.z*(1-c)+u.x*s, u.z*u.z+(1-u.z*u.z)*c); - } - - public: - - /*! the column vectors of the matrix */ - Vector vx,vy,vz; - }; - - /*! 
compute transposed matrix */ - template<> __forceinline const LinearSpace3<Vec3fa> LinearSpace3<Vec3fa>::transposed() const { - vfloat4 rx,ry,rz; transpose((vfloat4&)vx,(vfloat4&)vy,(vfloat4&)vz,vfloat4(zero),rx,ry,rz); - return LinearSpace3<Vec3fa>(Vec3fa(rx),Vec3fa(ry),Vec3fa(rz)); - } - - template<typename T> - __forceinline const LinearSpace3<T> transposed(const LinearSpace3<T>& xfm) { - return xfm.transposed(); - } - - //////////////////////////////////////////////////////////////////////////////// - // Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a ) { return LinearSpace3<T>(-a.vx,-a.vy,-a.vz); } - template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a ) { return LinearSpace3<T>(+a.vx,+a.vy,+a.vz); } - template<typename T> __forceinline LinearSpace3<T> rcp ( const LinearSpace3<T>& a ) { return a.inverse(); } - - /* constructs a coordinate frame form a normalized normal */ - template<typename T> __forceinline LinearSpace3<T> frame(const T& N) - { - const T dx0(0,N.z,-N.y); - const T dx1(-N.z,0,N.x); - const T dx = normalize(select(dot(dx0,dx0) > dot(dx1,dx1),dx0,dx1)); - const T dy = normalize(cross(N,dx)); - return LinearSpace3<T>(dx,dy,N); - } - - /* constructs a coordinate frame from a normal and approximate x-direction */ - template<typename T> __forceinline LinearSpace3<T> frame(const T& N, const T& dxi) - { - if (abs(dot(dxi,N)) > 0.99f) return frame(N); // fallback in case N and dxi are very parallel - const T dx = normalize(cross(dxi,N)); - const T dy = normalize(cross(N,dx)); - return LinearSpace3<T>(dx,dy,N); - } - - /* clamps linear space to range -1 to +1 */ - template<typename T> __forceinline LinearSpace3<T> clamp(const LinearSpace3<T>& space) { - return LinearSpace3<T>(clamp(space.vx,T(-1.0f),T(1.0f)), - clamp(space.vy,T(-1.0f),T(1.0f)), - 
clamp(space.vz,T(-1.0f),T(1.0f))); - } - - //////////////////////////////////////////////////////////////////////////////// - // Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx+b.vx,a.vy+b.vy,a.vz+b.vz); } - template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx-b.vx,a.vy-b.vy,a.vz-b.vz); } - - template<typename T> __forceinline LinearSpace3<T> operator*(const typename T::Scalar & a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); } - template<typename T> __forceinline T operator*(const LinearSpace3<T>& a, const T & b) { return madd(T(b.x),a.vx,madd(T(b.y),a.vy,T(b.z)*a.vz)); } - template<typename T> __forceinline LinearSpace3<T> operator*(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); } - - template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const typename T::Scalar & b) { return LinearSpace3<T>(a.vx/b, a.vy/b, a.vz/b); } - template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return a * rcp(b); } - - template<typename T> __forceinline LinearSpace3<T>& operator *=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a * b; } - template<typename T> __forceinline LinearSpace3<T>& operator /=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a / b; } - - template<typename T> __forceinline T xfmPoint (const LinearSpace3<T>& s, const T & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } - template<typename T> __forceinline T xfmVector(const LinearSpace3<T>& s, const T & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } - template<typename T> __forceinline T 
xfmNormal(const LinearSpace3<T>& s, const T & a) { return xfmVector(s.inverse().transposed(),a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline bool operator ==( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; } - template<typename T> __forceinline bool operator !=( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline LinearSpace3<T> select ( const typename T::Scalar::Bool& s, const LinearSpace3<T>& t, const LinearSpace3<T>& f ) { - return LinearSpace3<T>(select(s,t.vx,f.vx),select(s,t.vy,f.vy),select(s,t.vz,f.vz)); - } - - /*! blending */ - template<typename T> - __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0, const LinearSpace3<T>& l1, const float t) - { - return LinearSpace3<T>(lerp(l0.vx,l1.vx,t), - lerp(l0.vy,l1.vy,t), - lerp(l0.vz,l1.vz,t)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace3<T>& m) { - return cout << "{ vx = " << m.vx << ", vy = " << m.vy << ", vz = " << m.vz << "}"; - } - - /*! Shortcuts for common linear spaces. 
*/ - typedef LinearSpace3<Vec3f> LinearSpace3f; - typedef LinearSpace3<Vec3fa> LinearSpace3fa; - typedef LinearSpace3<Vec3fx> LinearSpace3fx; - typedef LinearSpace3<Vec3ff> LinearSpace3ff; - - template<int N> using LinearSpace3vf = LinearSpace3<Vec3<vfloat<N>>>; - typedef LinearSpace3<Vec3<vfloat<4>>> LinearSpace3vf4; - typedef LinearSpace3<Vec3<vfloat<8>>> LinearSpace3vf8; - typedef LinearSpace3<Vec3<vfloat<16>>> LinearSpace3vf16; - - /*! blending */ - template<typename T, typename S> - __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0, - const LinearSpace3<T>& l1, - const S& t) - { - return LinearSpace3<T>(lerp(l0.vx,l1.vx,t), - lerp(l0.vy,l1.vy,t), - lerp(l0.vz,l1.vz,t)); - } - -} diff --git a/thirdparty/embree-aarch64/common/math/math.h b/thirdparty/embree-aarch64/common/math/math.h deleted file mode 100644 index 6d54abd44d..0000000000 --- a/thirdparty/embree-aarch64/common/math/math.h +++ /dev/null @@ -1,451 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" -#include "../sys/intrinsics.h" -#include "constants.h" -#include <cmath> - -#if defined(__ARM_NEON) -#include "SSE2NEON.h" -#if defined(NEON_AVX2_EMULATION) -#include "AVX2NEON.h" -#endif -#else -#include <emmintrin.h> -#include <xmmintrin.h> -#include <immintrin.h> -#endif - -#if defined(__WIN32__) && !defined(__MINGW32__) -#if (__MSV_VER <= 1700) -namespace std -{ - __forceinline bool isinf ( const float x ) { return _finite(x) == 0; } - __forceinline bool isnan ( const float x ) { return _isnan(x) != 0; } - __forceinline bool isfinite (const float x) { return _finite(x) != 0; } -} -#endif -#endif - -namespace embree -{ - __forceinline bool isvalid ( const float& v ) { - return (v > -FLT_LARGE) & (v < +FLT_LARGE); - } - - __forceinline int cast_f2i(float f) { - union { float f; int i; } v; v.f = f; return v.i; - } - - __forceinline float cast_i2f(int i) { - union { float f; int i; } v; v.i = i; return v.f; 
- } - - __forceinline int toInt (const float& a) { return int(a); } - __forceinline float toFloat(const int& a) { return float(a); } - -#if defined(__WIN32__) && !defined(__MINGW32__) - __forceinline bool finite ( const float x ) { return _finite(x) != 0; } -#endif - - __forceinline float sign ( const float x ) { return x<0?-1.0f:1.0f; } - __forceinline float sqr ( const float x ) { return x*x; } - - __forceinline float rcp ( const float x ) - { -#if defined(__aarch64__) - // Move scalar to vector register and do rcp. - __m128 a; - a[0] = x; - float32x4_t reciprocal = vrecpeq_f32(a); - reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); - return reciprocal[0]; -#else - - const __m128 a = _mm_set_ss(x); - -#if defined(__AVX512VL__) - const __m128 r = _mm_rcp14_ss(_mm_set_ss(0.0f),a); -#else - const __m128 r = _mm_rcp_ss(a); -#endif - -#if defined(__AVX2__) - return _mm_cvtss_f32(_mm_mul_ss(r,_mm_fnmadd_ss(r, a, _mm_set_ss(2.0f)))); -#else - return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a)))); -#endif - -#endif //defined(__aarch64__) - } - - __forceinline float signmsk ( const float x ) { -#if defined(__aarch64__) - // FP and Neon shares same vector register in arm64 - __m128 a; - __m128i b; - a[0] = x; - b[0] = 0x80000000; - a = _mm_and_ps(a, vreinterpretq_f32_s32(b)); - return a[0]; -#else - return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); -#endif - } - __forceinline float xorf( const float x, const float y ) { -#if defined(__aarch64__) - // FP and Neon shares same vector register in arm64 - __m128 a; - __m128 b; - a[0] = x; - b[0] = y; - a = _mm_xor_ps(a, b); - return a[0]; -#else - return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y))); -#endif - } - __forceinline float andf( const float x, const unsigned y ) { -#if defined(__aarch64__) - // FP and Neon shares same vector register in arm64 - __m128 a; - 
__m128i b; - a[0] = x; - b[0] = y; - a = _mm_and_ps(a, vreinterpretq_f32_s32(b)); - return a[0]; -#else - return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y)))); -#endif - } - __forceinline float rsqrt( const float x ) - { -#if defined(__aarch64__) - // FP and Neon shares same vector register in arm64 - __m128 a; - a[0] = x; - __m128 value = _mm_rsqrt_ps(a); - value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); - value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); - return value[0]; -#else - - const __m128 a = _mm_set_ss(x); -#if defined(__AVX512VL__) - const __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a); -#else - const __m128 r = _mm_rsqrt_ss(a); -#endif - const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), - _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); - return _mm_cvtss_f32(c); -#endif - } - -#if defined(__WIN32__) && (__MSC_VER <= 1700) && !defined(__MINGW32__) - __forceinline float nextafter(float x, float y) { if ((x<y) == (x>0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); } - __forceinline double nextafter(double x, double y) { return _nextafter(x, y); } - __forceinline int roundf(float f) { return (int)(f + 0.5f); } -#else - __forceinline float nextafter(float x, float y) { return ::nextafterf(x, y); } - __forceinline double nextafter(double x, double y) { return ::nextafter(x, y); } -#endif - - __forceinline float abs ( const float x ) { return ::fabsf(x); } - __forceinline float acos ( const float x ) { return ::acosf (x); } - __forceinline float asin ( const float x ) { return ::asinf (x); } - __forceinline float atan ( const float x ) { return ::atanf (x); } - __forceinline float atan2( const float y, const float x ) { return ::atan2f(y, x); } - __forceinline float cos ( const float x ) { return ::cosf (x); } - __forceinline float cosh ( const float x ) { return ::coshf (x); } - __forceinline float exp ( const float x ) { return 
::expf (x); } - __forceinline float fmod ( const float x, const float y ) { return ::fmodf (x, y); } - __forceinline float log ( const float x ) { return ::logf (x); } - __forceinline float log10( const float x ) { return ::log10f(x); } - __forceinline float pow ( const float x, const float y ) { return ::powf (x, y); } - __forceinline float sin ( const float x ) { return ::sinf (x); } - __forceinline float sinh ( const float x ) { return ::sinhf (x); } - __forceinline float sqrt ( const float x ) { return ::sqrtf (x); } - __forceinline float tan ( const float x ) { return ::tanf (x); } - __forceinline float tanh ( const float x ) { return ::tanhf (x); } - __forceinline float floor( const float x ) { return ::floorf (x); } - __forceinline float ceil ( const float x ) { return ::ceilf (x); } - __forceinline float frac ( const float x ) { return x-floor(x); } - - __forceinline double abs ( const double x ) { return ::fabs(x); } - __forceinline double sign ( const double x ) { return x<0?-1.0:1.0; } - __forceinline double acos ( const double x ) { return ::acos (x); } - __forceinline double asin ( const double x ) { return ::asin (x); } - __forceinline double atan ( const double x ) { return ::atan (x); } - __forceinline double atan2( const double y, const double x ) { return ::atan2(y, x); } - __forceinline double cos ( const double x ) { return ::cos (x); } - __forceinline double cosh ( const double x ) { return ::cosh (x); } - __forceinline double exp ( const double x ) { return ::exp (x); } - __forceinline double fmod ( const double x, const double y ) { return ::fmod (x, y); } - __forceinline double log ( const double x ) { return ::log (x); } - __forceinline double log10( const double x ) { return ::log10(x); } - __forceinline double pow ( const double x, const double y ) { return ::pow (x, y); } - __forceinline double rcp ( const double x ) { return 1.0/x; } - __forceinline double rsqrt( const double x ) { return 1.0/::sqrt(x); } - __forceinline double sin ( 
const double x ) { return ::sin (x); } - __forceinline double sinh ( const double x ) { return ::sinh (x); } - __forceinline double sqr ( const double x ) { return x*x; } - __forceinline double sqrt ( const double x ) { return ::sqrt (x); } - __forceinline double tan ( const double x ) { return ::tan (x); } - __forceinline double tanh ( const double x ) { return ::tanh (x); } - __forceinline double floor( const double x ) { return ::floor (x); } - __forceinline double ceil ( const double x ) { return ::ceil (x); } - -#if defined(__aarch64__) - __forceinline float mini(float a, float b) { - // FP and Neon shares same vector register in arm64 - __m128 x; - __m128 y; - x[0] = a; - y[0] = b; - x = _mm_min_ps(x, y); - return x[0]; - } -#elif defined(__SSE4_1__) - __forceinline float mini(float a, float b) { - const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); - const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); - const __m128i ci = _mm_min_epi32(ai,bi); - return _mm_cvtss_f32(_mm_castsi128_ps(ci)); - } -#endif - -#if defined(__aarch64__) - __forceinline float maxi(float a, float b) { - // FP and Neon shares same vector register in arm64 - __m128 x; - __m128 y; - x[0] = a; - y[0] = b; - x = _mm_max_ps(x, y); - return x[0]; - } -#elif defined(__SSE4_1__) - __forceinline float maxi(float a, float b) { - const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); - const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); - const __m128i ci = _mm_max_epi32(ai,bi); - return _mm_cvtss_f32(_mm_castsi128_ps(ci)); - } -#endif - - template<typename T> - __forceinline T twice(const T& a) { return a+a; } - - __forceinline int min(int a, int b) { return a<b ? a:b; } - __forceinline unsigned min(unsigned a, unsigned b) { return a<b ? a:b; } - __forceinline int64_t min(int64_t a, int64_t b) { return a<b ? a:b; } - __forceinline float min(float a, float b) { return a<b ? a:b; } - __forceinline double min(double a, double b) { return a<b ? 
a:b; } -#if defined(__X86_64__) || defined(__aarch64__) - __forceinline size_t min(size_t a, size_t b) { return a<b ? a:b; } -#endif - - template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); } - template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); } - template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d, const T& e) { return min(min(min(a,b),min(c,d)),e); } - - template<typename T> __forceinline T mini(const T& a, const T& b, const T& c) { return mini(mini(a,b),c); } - template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d) { return mini(mini(a,b),mini(c,d)); } - template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d, const T& e) { return mini(mini(mini(a,b),mini(c,d)),e); } - - __forceinline int max(int a, int b) { return a<b ? b:a; } - __forceinline unsigned max(unsigned a, unsigned b) { return a<b ? b:a; } - __forceinline int64_t max(int64_t a, int64_t b) { return a<b ? b:a; } - __forceinline float max(float a, float b) { return a<b ? b:a; } - __forceinline double max(double a, double b) { return a<b ? b:a; } -#if defined(__X86_64__) || defined(__aarch64__) - __forceinline size_t max(size_t a, size_t b) { return a<b ? 
b:a; } -#endif - - template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); } - template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); } - template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d, const T& e) { return max(max(max(a,b),max(c,d)),e); } - - template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c) { return maxi(maxi(a,b),c); } - template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d) { return maxi(maxi(a,b),maxi(c,d)); } - template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d, const T& e) { return maxi(maxi(maxi(a,b),maxi(c,d)),e); } - -#if defined(__MACOSX__) - __forceinline ssize_t min(ssize_t a, ssize_t b) { return a<b ? a:b; } - __forceinline ssize_t max(ssize_t a, ssize_t b) { return a<b ? b:a; } -#endif - -#if defined(__MACOSX__) && !defined(__INTEL_COMPILER) - __forceinline void sincosf(float x, float *sin, float *cos) { - __sincosf(x,sin,cos); - } -#endif - -#if defined(__WIN32__) || defined(__FreeBSD__) - __forceinline void sincosf(float x, float *s, float *c) { - *s = sinf(x); *c = cosf(x); - } -#endif - - template<typename T> __forceinline T clamp(const T& x, const T& lower = T(zero), const T& upper = T(one)) { return max(min(x,upper),lower); } - template<typename T> __forceinline T clampz(const T& x, const T& upper) { return max(T(zero), min(x,upper)); } - - template<typename T> __forceinline T deg2rad ( const T& x ) { return x * T(1.74532925199432957692e-2f); } - template<typename T> __forceinline T rad2deg ( const T& x ) { return x * T(5.72957795130823208768e1f); } - template<typename T> __forceinline T sin2cos ( const T& x ) { return sqrt(max(T(zero),T(one)-x*x)); } - template<typename T> __forceinline T cos2sin ( const T& x ) { return sin2cos(x); } - -#if defined(__AVX2__) - __forceinline float madd ( const 
float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } - __forceinline float msub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } - __forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } - __forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } -#elif defined (__aarch64__) && defined(__clang__) -#pragma clang fp contract(fast) - - -__forceinline float madd ( const float a, const float b, const float c) { return a*b + c; } -__forceinline float msub ( const float a, const float b, const float c) { return a*b - c; } -__forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; } -__forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); } - -#pragma clang fp contract(on) -#else - __forceinline float madd ( const float a, const float b, const float c) { return a*b+c; } - __forceinline float msub ( const float a, const float b, const float c) { return a*b-c; } - __forceinline float nmadd ( const float a, const float b, const float c) { return -a*b+c;} - __forceinline float nmsub ( const float a, const float b, const float c) { return -a*b-c; } -#endif - - /*! 
random functions */ - template<typename T> T random() { return T(0); } -#if defined(_WIN32) - template<> __forceinline int random() { return int(rand()) ^ (int(rand()) << 8) ^ (int(rand()) << 16); } - template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 8) ^ (uint32_t(rand()) << 16); } -#else - template<> __forceinline int random() { return int(rand()); } - template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 16); } -#endif - template<> __forceinline float random() { return rand()/float(RAND_MAX); } - template<> __forceinline double random() { return rand()/double(RAND_MAX); } - -#if _WIN32 - __forceinline double drand48() { - return double(rand())/double(RAND_MAX); - } - - __forceinline void srand48(long seed) { - return srand(seed); - } -#endif - - /*! selects */ - __forceinline bool select(bool s, bool t , bool f) { return s ? t : f; } - __forceinline int select(bool s, int t, int f) { return s ? t : f; } - __forceinline float select(bool s, float t, float f) { return s ? t : f; } - - __forceinline bool all(bool s) { return s; } - - __forceinline float lerp(const float v0, const float v1, const float t) { - return madd(1.0f-t,v0,t*v1); - } - - template<typename T> - __forceinline T lerp2(const float x0, const float x1, const float x2, const float x3, const T& u, const T& v) { - return madd((1.0f-u),madd((1.0f-v),T(x0),v*T(x2)),u*madd((1.0f-v),T(x1),v*T(x3))); - } - - /*! exchange */ - template<typename T> __forceinline void xchg ( T& a, T& b ) { const T tmp = a; a = b; b = tmp; } - - - template<typename T> __forceinline T prod_diff(const T& a,const T& b,const T& c,const T& d) { -#if 1//!defined(__aarch64__) - return msub(a,b,c*d); -#else - return nmadd(c,d,a*b); -#endif - } - - /*! 
bit reverse operation */ - template<class T> - __forceinline T bitReverse(const T& vin) - { - T v = vin; - v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); - v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); - v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); - v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); - v = ( v >> 16 ) | ( v << 16); - return v; - } - - /*! bit interleave operation */ - template<class T> - __forceinline T bitInterleave(const T& xin, const T& yin, const T& zin) - { - T x = xin, y = yin, z = zin; - x = (x | (x << 16)) & 0x030000FF; - x = (x | (x << 8)) & 0x0300F00F; - x = (x | (x << 4)) & 0x030C30C3; - x = (x | (x << 2)) & 0x09249249; - - y = (y | (y << 16)) & 0x030000FF; - y = (y | (y << 8)) & 0x0300F00F; - y = (y | (y << 4)) & 0x030C30C3; - y = (y | (y << 2)) & 0x09249249; - - z = (z | (z << 16)) & 0x030000FF; - z = (z | (z << 8)) & 0x0300F00F; - z = (z | (z << 4)) & 0x030C30C3; - z = (z | (z << 2)) & 0x09249249; - - return x | (y << 1) | (z << 2); - } - -#if defined(__AVX2__) && !defined(__aarch64__) - - template<> - __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi) - { - const unsigned int xx = pdep(xi,0x49249249 /* 0b01001001001001001001001001001001 */ ); - const unsigned int yy = pdep(yi,0x92492492 /* 0b10010010010010010010010010010010 */); - const unsigned int zz = pdep(zi,0x24924924 /* 0b00100100100100100100100100100100 */); - return xx | yy | zz; - } - -#endif - - /*! 
bit interleave operation for 64bit data types*/ - template<class T> - __forceinline T bitInterleave64(const T& xin, const T& yin, const T& zin){ - T x = xin & 0x1fffff; - T y = yin & 0x1fffff; - T z = zin & 0x1fffff; - - x = (x | x << 32) & 0x1f00000000ffff; - x = (x | x << 16) & 0x1f0000ff0000ff; - x = (x | x << 8) & 0x100f00f00f00f00f; - x = (x | x << 4) & 0x10c30c30c30c30c3; - x = (x | x << 2) & 0x1249249249249249; - - y = (y | y << 32) & 0x1f00000000ffff; - y = (y | y << 16) & 0x1f0000ff0000ff; - y = (y | y << 8) & 0x100f00f00f00f00f; - y = (y | y << 4) & 0x10c30c30c30c30c3; - y = (y | y << 2) & 0x1249249249249249; - - z = (z | z << 32) & 0x1f00000000ffff; - z = (z | z << 16) & 0x1f0000ff0000ff; - z = (z | z << 8) & 0x100f00f00f00f00f; - z = (z | z << 4) & 0x10c30c30c30c30c3; - z = (z | z << 2) & 0x1249249249249249; - - return x | (y << 1) | (z << 2); - } -} diff --git a/thirdparty/embree-aarch64/common/math/obbox.h b/thirdparty/embree-aarch64/common/math/obbox.h deleted file mode 100644 index 032b56904e..0000000000 --- a/thirdparty/embree-aarch64/common/math/obbox.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bbox.h" -#include "linearspace3.h" - -namespace embree -{ - /*! 
Oriented bounding box */ - template<typename T> - struct OBBox - { - public: - - __forceinline OBBox () {} - - __forceinline OBBox (EmptyTy) - : space(one), bounds(empty) {} - - __forceinline OBBox (const BBox<T>& bounds) - : space(one), bounds(bounds) {} - - __forceinline OBBox (const LinearSpace3<T>& space, const BBox<T>& bounds) - : space(space), bounds(bounds) {} - - friend embree_ostream operator<<(embree_ostream cout, const OBBox& p) { - return cout << "{ space = " << p.space << ", bounds = " << p.bounds << "}"; - } - - public: - LinearSpace3<T> space; //!< orthonormal transformation - BBox<T> bounds; //!< bounds in transformed space - }; - - typedef OBBox<Vec3f> OBBox3f; - typedef OBBox<Vec3fa> OBBox3fa; -} diff --git a/thirdparty/embree-aarch64/common/math/quaternion.h b/thirdparty/embree-aarch64/common/math/quaternion.h deleted file mode 100644 index 20c69bc62f..0000000000 --- a/thirdparty/embree-aarch64/common/math/quaternion.h +++ /dev/null @@ -1,254 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "vec3.h" -#include "vec4.h" - -#include "transcendental.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////// - // Quaternion Struct - //////////////////////////////////////////////////////////////// - - template<typename T> - struct QuaternionT - { - typedef Vec3<T> Vector; - - //////////////////////////////////////////////////////////////////////////////// - /// Construction - //////////////////////////////////////////////////////////////////////////////// - - __forceinline QuaternionT () { } - __forceinline QuaternionT ( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; } - __forceinline QuaternionT& operator=( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; return *this; } - - __forceinline QuaternionT( const T& r ) : r(r), i(zero), j(zero), k(zero) {} - __forceinline explicit 
QuaternionT( const Vec3<T>& v ) : r(zero), i(v.x), j(v.y), k(v.z) {} - __forceinline explicit QuaternionT( const Vec4<T>& v ) : r(v.x), i(v.y), j(v.z), k(v.w) {} - __forceinline QuaternionT( const T& r, const T& i, const T& j, const T& k ) : r(r), i(i), j(j), k(k) {} - __forceinline QuaternionT( const T& r, const Vec3<T>& v ) : r(r), i(v.x), j(v.y), k(v.z) {} - - __inline QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz ); - __inline QuaternionT( const T& yaw, const T& pitch, const T& roll ); - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline QuaternionT( ZeroTy ) : r(zero), i(zero), j(zero), k(zero) {} - __forceinline QuaternionT( OneTy ) : r( one), i(zero), j(zero), k(zero) {} - - /*! return quaternion for rotation around arbitrary axis */ - static __forceinline QuaternionT rotate(const Vec3<T>& u, const T& r) { - return QuaternionT<T>(cos(T(0.5)*r),sin(T(0.5)*r)*normalize(u)); - } - - /*! 
returns the vector part (i, j, k) of the quaternion (the unnormalized rotation axis) */ - __forceinline Vec3<T> v( ) const { return Vec3<T>(i, j, k); } - - public: - T r, i, j, k; - }; - - template<typename T> __forceinline QuaternionT<T> operator *( const T & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a * b.r, a * b.i, a * b.j, a * b.k); } - template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const T & b ) { return QuaternionT<T>(a.r * b, a.i * b, a.j * b, a.k * b); } - - //////////////////////////////////////////////////////////////// - // Unary Operators - //////////////////////////////////////////////////////////////// - - template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a ) { return QuaternionT<T>(+a.r, +a.i, +a.j, +a.k); } - template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a ) { return QuaternionT<T>(-a.r, -a.i, -a.j, -a.k); } - template<typename T> __forceinline QuaternionT<T> conj ( const QuaternionT<T>& a ) { return QuaternionT<T>(a.r, -a.i, -a.j, -a.k); } - template<typename T> __forceinline T abs ( const QuaternionT<T>& a ) { return sqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } - template<typename T> __forceinline QuaternionT<T> rcp ( const QuaternionT<T>& a ) { return conj(a)*rcp(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } - template<typename T> __forceinline QuaternionT<T> normalize ( const QuaternionT<T>& a ) { return a*rsqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } - - // evaluates a*q-p - template<typename T> __forceinline QuaternionT<T> - msub(const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p) - { - return QuaternionT<T>(msub(a, q.r, p.r), - msub(a, q.i, p.i), - msub(a, q.j, p.j), - msub(a, q.k, p.k)); - } - // evaluates a*q+p - template<typename T> __forceinline QuaternionT<T> - madd (const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p) - { - return QuaternionT<T>(madd(a, q.r, p.r), - madd(a, q.i, p.i), - madd(a, q.j, p.j), - madd(a, q.k, 
p.k)); - } - - //////////////////////////////////////////////////////////////// - // Binary Operators - //////////////////////////////////////////////////////////////// - - template<typename T> __forceinline QuaternionT<T> operator +( const T & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a + b.r, b.i, b.j, b.k); } - template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const T & b ) { return QuaternionT<T>(a.r + b, a.i, a.j, a.k); } - template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r + b.r, a.i + b.i, a.j + b.j, a.k + b.k); } - template<typename T> __forceinline QuaternionT<T> operator -( const T & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a - b.r, -b.i, -b.j, -b.k); } - template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const T & b ) { return QuaternionT<T>(a.r - b, a.i, a.j, a.k); } - template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r - b.r, a.i - b.i, a.j - b.j, a.k - b.k); } - - template<typename T> __forceinline Vec3<T> operator *( const QuaternionT<T>& a, const Vec3<T> & b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } - template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const QuaternionT<T>& b ) { - return QuaternionT<T>(a.r*b.r - a.i*b.i - a.j*b.j - a.k*b.k, - a.r*b.i + a.i*b.r + a.j*b.k - a.k*b.j, - a.r*b.j - a.i*b.k + a.j*b.r + a.k*b.i, - a.r*b.k + a.i*b.j - a.j*b.i + a.k*b.r); - } - template<typename T> __forceinline QuaternionT<T> operator /( const T & a, const QuaternionT<T>& b ) { return a*rcp(b); } - template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const T & b ) { return a*rcp(b); } - template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return 
a*rcp(b); } - - template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const T & b ) { return a = a+b; } - template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a+b; } - template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const T & b ) { return a = a-b; } - template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a-b; } - template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const T & b ) { return a = a*b; } - template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*b; } - template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const T & b ) { return a = a*rcp(b); } - template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*rcp(b); } - - template<typename T, typename M> __forceinline QuaternionT<T> - select(const M& m, const QuaternionT<T>& q, const QuaternionT<T>& p) - { - return QuaternionT<T>(select(m, q.r, p.r), - select(m, q.i, p.i), - select(m, q.j, p.j), - select(m, q.k, p.k)); - } - - - template<typename T> __forceinline Vec3<T> xfmPoint ( const QuaternionT<T>& a, const Vec3<T>& b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } - template<typename T> __forceinline Vec3<T> xfmVector( const QuaternionT<T>& a, const Vec3<T>& b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } - template<typename T> __forceinline Vec3<T> xfmNormal( const QuaternionT<T>& a, const Vec3<T>& b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } - - template<typename T> __forceinline T dot(const QuaternionT<T>& a, const QuaternionT<T>& b) { return a.r*b.r + a.i*b.i + a.j*b.j + a.k*b.k; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - 
//////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline bool operator ==( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r == b.r && a.i == b.i && a.j == b.j && a.k == b.k; } - template<typename T> __forceinline bool operator !=( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r != b.r || a.i != b.i || a.j != b.j || a.k != b.k; } - - - //////////////////////////////////////////////////////////////////////////////// - /// Orientation Functions - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> QuaternionT<T>::QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz ) - { - if ( vx.x + vy.y + vz.z >= T(zero) ) - { - const T t = T(one) + (vx.x + vy.y + vz.z); - const T s = rsqrt(t)*T(0.5f); - r = t*s; - i = (vy.z - vz.y)*s; - j = (vz.x - vx.z)*s; - k = (vx.y - vy.x)*s; - } - else if ( vx.x >= max(vy.y, vz.z) ) - { - const T t = (T(one) + vx.x) - (vy.y + vz.z); - const T s = rsqrt(t)*T(0.5f); - r = (vy.z - vz.y)*s; - i = t*s; - j = (vx.y + vy.x)*s; - k = (vz.x + vx.z)*s; - } - else if ( vy.y >= vz.z ) // if ( vy.y >= max(vz.z, vx.x) ) - { - const T t = (T(one) + vy.y) - (vz.z + vx.x); - const T s = rsqrt(t)*T(0.5f); - r = (vz.x - vx.z)*s; - i = (vx.y + vy.x)*s; - j = t*s; - k = (vy.z + vz.y)*s; - } - else //if ( vz.z >= max(vy.y, vx.x) ) - { - const T t = (T(one) + vz.z) - (vx.x + vy.y); - const T s = rsqrt(t)*T(0.5f); - r = (vx.y - vy.x)*s; - i = (vz.x + vx.z)*s; - j = (vy.z + vz.y)*s; - k = t*s; - } - } - - template<typename T> QuaternionT<T>::QuaternionT( const T& yaw, const T& pitch, const T& roll ) - { - const T cya = cos(yaw *T(0.5f)); - const T cpi = cos(pitch*T(0.5f)); - const T cro = cos(roll *T(0.5f)); - const T sya = sin(yaw *T(0.5f)); - const T spi = sin(pitch*T(0.5f)); - const T sro = sin(roll *T(0.5f)); - r = cro*cya*cpi + sro*sya*spi; - i = cro*cya*spi + sro*sya*cpi; - j = cro*sya*cpi - 
sro*cya*spi; - k = sro*cya*cpi - cro*sya*spi; - } - - ////////////////////////////////////////////////////////////////////////////// - /// Output Operators - ////////////////////////////////////////////////////////////////////////////// - - template<typename T> static embree_ostream operator<<(embree_ostream cout, const QuaternionT<T>& q) { - return cout << "{ r = " << q.r << ", i = " << q.i << ", j = " << q.j << ", k = " << q.k << " }"; - } - - /*! default template instantiations */ - typedef QuaternionT<float> Quaternion3f; - typedef QuaternionT<double> Quaternion3d; - - template<int N> using Quaternion3vf = QuaternionT<vfloat<N>>; - typedef QuaternionT<vfloat<4>> Quaternion3vf4; - typedef QuaternionT<vfloat<8>> Quaternion3vf8; - typedef QuaternionT<vfloat<16>> Quaternion3vf16; - - ////////////////////////////////////////////////////////////////////////////// - /// Interpolation - ////////////////////////////////////////////////////////////////////////////// - template<typename T> - __forceinline QuaternionT<T>lerp(const QuaternionT<T>& q0, - const QuaternionT<T>& q1, - const T& factor) - { - QuaternionT<T> q; - q.r = lerp(q0.r, q1.r, factor); - q.i = lerp(q0.i, q1.i, factor); - q.j = lerp(q0.j, q1.j, factor); - q.k = lerp(q0.k, q1.k, factor); - return q; - } - - template<typename T> - __forceinline QuaternionT<T> slerp(const QuaternionT<T>& q0, - const QuaternionT<T>& q1_, - const T& t) - { - T cosTheta = dot(q0, q1_); - QuaternionT<T> q1 = select(cosTheta < 0.f, -q1_, q1_); - cosTheta = select(cosTheta < 0.f, -cosTheta, cosTheta); - if (unlikely(all(cosTheta > 0.9995f))) { - return normalize(lerp(q0, q1, t)); - } - const T phi = t * fastapprox::acos(cosTheta); - T sinPhi, cosPhi; - fastapprox::sincos(phi, sinPhi, cosPhi); - QuaternionT<T> qperp = sinPhi * normalize(msub(cosTheta, q0, q1)); - return msub(cosPhi, q0, qperp); - } -} diff --git a/thirdparty/embree-aarch64/common/math/range.h b/thirdparty/embree-aarch64/common/math/range.h deleted file mode 100644 
index 762d9cd9ea..0000000000 --- a/thirdparty/embree-aarch64/common/math/range.h +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" -#include "../math/math.h" - -namespace embree -{ - template<typename Ty> - struct range - { - __forceinline range() {} - - __forceinline range(const Ty& begin) - : _begin(begin), _end(begin+1) {} - - __forceinline range(const Ty& begin, const Ty& end) - : _begin(begin), _end(end) {} - - __forceinline range(const range& other) - : _begin(other._begin), _end(other._end) {} - - template<typename T1> - __forceinline range(const range<T1>& other) - : _begin(Ty(other._begin)), _end(Ty(other._end)) {} - - template<typename T1> - __forceinline range& operator =(const range<T1>& other) { - _begin = other._begin; - _end = other._end; - return *this; - } - - __forceinline Ty begin() const { - return _begin; - } - - __forceinline Ty end() const { - return _end; - } - - __forceinline range intersect(const range& r) const { - return range (max(_begin,r._begin),min(_end,r._end)); - } - - __forceinline Ty size() const { - return _end - _begin; - } - - __forceinline bool empty() const { - return _end <= _begin; - } - - __forceinline Ty center() const { - return (_begin + _end)/2; - } - - __forceinline std::pair<range,range> split() const - { - const Ty _center = center(); - return std::make_pair(range(_begin,_center),range(_center,_end)); - } - - __forceinline void split(range& left_o, range& right_o) const - { - const Ty _center = center(); - left_o = range(_begin,_center); - right_o = range(_center,_end); - } - - __forceinline friend bool operator< (const range& r0, const range& r1) { - return r0.size() < r1.size(); - } - - friend embree_ostream operator<<(embree_ostream cout, const range& r) { - return cout << "range [" << r.begin() << ", " << r.end() << "]"; - } - - Ty _begin, _end; - }; - - template<typename Ty> - range<Ty> 
make_range(const Ty& begin, const Ty& end) { - return range<Ty>(begin,end); - } - - template<typename Ty> - struct extended_range : public range<Ty> - { - __forceinline extended_range () {} - - __forceinline extended_range (const Ty& begin) - : range<Ty>(begin), _ext_end(begin+1) {} - - __forceinline extended_range (const Ty& begin, const Ty& end) - : range<Ty>(begin,end), _ext_end(end) {} - - __forceinline extended_range (const Ty& begin, const Ty& end, const Ty& ext_end) - : range<Ty>(begin,end), _ext_end(ext_end) {} - - __forceinline Ty ext_end() const { - return _ext_end; - } - - __forceinline Ty ext_size() const { - return _ext_end - range<Ty>::_begin; - } - - __forceinline Ty ext_range_size() const { - return _ext_end - range<Ty>::_end; - } - - __forceinline bool has_ext_range() const { - assert(_ext_end >= range<Ty>::_end); - return (_ext_end - range<Ty>::_end) > 0; - } - - __forceinline void set_ext_range(const size_t ext_end){ - assert(ext_end >= range<Ty>::_end); - _ext_end = ext_end; - } - - __forceinline void move_right(const size_t plus){ - range<Ty>::_begin += plus; - range<Ty>::_end += plus; - _ext_end += plus; - } - - friend embree_ostream operator<<(embree_ostream cout, const extended_range& r) { - return cout << "extended_range [" << r.begin() << ", " << r.end() << " (" << r.ext_end() << ")]"; - } - - Ty _ext_end; - }; -} diff --git a/thirdparty/embree-aarch64/common/math/transcendental.h b/thirdparty/embree-aarch64/common/math/transcendental.h deleted file mode 100644 index 6855d82b53..0000000000 --- a/thirdparty/embree-aarch64/common/math/transcendental.h +++ /dev/null @@ -1,525 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -// Transcendental functions from "ispc": https://github.com/ispc/ispc/ -// Most of the transcendental implementations in ispc code come from -// Solomon Boulos's "syrah": https://github.com/boulos/syrah/ - -#include "../simd/simd.h" - -namespace embree -{ - 
-namespace fastapprox -{ - -template <typename T> -__forceinline T sin(const T &v) -{ - static const float piOverTwoVec = 1.57079637050628662109375; - static const float twoOverPiVec = 0.636619746685028076171875; - auto scaled = v * twoOverPiVec; - auto kReal = floor(scaled); - auto k = toInt(kReal); - - // Reduced range version of x - auto x = v - kReal * piOverTwoVec; - auto kMod4 = k & 3; - auto sinUseCos = (kMod4 == 1 | kMod4 == 3); - auto flipSign = (kMod4 > 1); - - // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2, - // 4, 6, 8, 10|], [|single...|], [0;Pi/2]); - static const float sinC2 = -0.16666667163372039794921875; - static const float sinC4 = +8.333347737789154052734375e-3; - static const float sinC6 = -1.9842604524455964565277099609375e-4; - static const float sinC8 = +2.760012648650445044040679931640625e-6; - static const float sinC10 = -2.50293279435709337121807038784027099609375e-8; - - static const float cosC2 = -0.5; - static const float cosC4 = +4.166664183139801025390625e-2; - static const float cosC6 = -1.388833043165504932403564453125e-3; - static const float cosC8 = +2.47562347794882953166961669921875e-5; - static const float cosC10 = -2.59630184018533327616751194000244140625e-7; - - auto outside = select(sinUseCos, 1., x); - auto c2 = select(sinUseCos, T(cosC2), T(sinC2)); - auto c4 = select(sinUseCos, T(cosC4), T(sinC4)); - auto c6 = select(sinUseCos, T(cosC6), T(sinC6)); - auto c8 = select(sinUseCos, T(cosC8), T(sinC8)); - auto c10 = select(sinUseCos, T(cosC10), T(sinC10)); - - auto x2 = x * x; - auto formula = x2 * c10 + c8; - formula = x2 * formula + c6; - formula = x2 * formula + c4; - formula = x2 * formula + c2; - formula = x2 * formula + 1.; - formula *= outside; - - formula = select(flipSign, -formula, formula); - return formula; -} - -template <typename T> -__forceinline T cos(const T &v) -{ - static const float piOverTwoVec = 1.57079637050628662109375; - static const float twoOverPiVec = 
0.636619746685028076171875; - auto scaled = v * twoOverPiVec; - auto kReal = floor(scaled); - auto k = toInt(kReal); - - // Reduced range version of x - auto x = v - kReal * piOverTwoVec; - - auto kMod4 = k & 3; - auto cosUseCos = (kMod4 == 0 | kMod4 == 2); - auto flipSign = (kMod4 == 1 | kMod4 == 2); - - const float sinC2 = -0.16666667163372039794921875; - const float sinC4 = +8.333347737789154052734375e-3; - const float sinC6 = -1.9842604524455964565277099609375e-4; - const float sinC8 = +2.760012648650445044040679931640625e-6; - const float sinC10 = -2.50293279435709337121807038784027099609375e-8; - - const float cosC2 = -0.5; - const float cosC4 = +4.166664183139801025390625e-2; - const float cosC6 = -1.388833043165504932403564453125e-3; - const float cosC8 = +2.47562347794882953166961669921875e-5; - const float cosC10 = -2.59630184018533327616751194000244140625e-7; - - auto outside = select(cosUseCos, 1., x); - auto c2 = select(cosUseCos, T(cosC2), T(sinC2)); - auto c4 = select(cosUseCos, T(cosC4), T(sinC4)); - auto c6 = select(cosUseCos, T(cosC6), T(sinC6)); - auto c8 = select(cosUseCos, T(cosC8), T(sinC8)); - auto c10 = select(cosUseCos, T(cosC10), T(sinC10)); - - auto x2 = x * x; - auto formula = x2 * c10 + c8; - formula = x2 * formula + c6; - formula = x2 * formula + c4; - formula = x2 * formula + c2; - formula = x2 * formula + 1.; - formula *= outside; - - formula = select(flipSign, -formula, formula); - return formula; -} - -template <typename T> -__forceinline void sincos(const T &v, T &sinResult, T &cosResult) -{ - const float piOverTwoVec = 1.57079637050628662109375; - const float twoOverPiVec = 0.636619746685028076171875; - auto scaled = v * twoOverPiVec; - auto kReal = floor(scaled); - auto k = toInt(kReal); - - // Reduced range version of x - auto x = v - kReal * piOverTwoVec; - auto kMod4 = k & 3; - auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2)); - auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3)); - auto sinFlipSign = (kMod4 > 1); - auto cosFlipSign 
= ((kMod4 == 1) | (kMod4 == 2)); - - const float oneVec = +1.; - const float sinC2 = -0.16666667163372039794921875; - const float sinC4 = +8.333347737789154052734375e-3; - const float sinC6 = -1.9842604524455964565277099609375e-4; - const float sinC8 = +2.760012648650445044040679931640625e-6; - const float sinC10 = -2.50293279435709337121807038784027099609375e-8; - - const float cosC2 = -0.5; - const float cosC4 = +4.166664183139801025390625e-2; - const float cosC6 = -1.388833043165504932403564453125e-3; - const float cosC8 = +2.47562347794882953166961669921875e-5; - const float cosC10 = -2.59630184018533327616751194000244140625e-7; - - auto x2 = x * x; - - auto sinFormula = x2 * sinC10 + sinC8; - auto cosFormula = x2 * cosC10 + cosC8; - sinFormula = x2 * sinFormula + sinC6; - cosFormula = x2 * cosFormula + cosC6; - - sinFormula = x2 * sinFormula + sinC4; - cosFormula = x2 * cosFormula + cosC4; - - sinFormula = x2 * sinFormula + sinC2; - cosFormula = x2 * cosFormula + cosC2; - - sinFormula = x2 * sinFormula + oneVec; - cosFormula = x2 * cosFormula + oneVec; - - sinFormula *= x; - - sinResult = select(sinUseCos, cosFormula, sinFormula); - cosResult = select(cosUseCos, cosFormula, sinFormula); - - sinResult = select(sinFlipSign, -sinResult, sinResult); - cosResult = select(cosFlipSign, -cosResult, cosResult); -} - -template <typename T> -__forceinline T tan(const T &v) -{ - const float piOverFourVec = 0.785398185253143310546875; - const float fourOverPiVec = 1.27323949337005615234375; - - auto xLt0 = v < 0.; - auto y = select(xLt0, -v, v); - auto scaled = y * fourOverPiVec; - - auto kReal = floor(scaled); - auto k = toInt(kReal); - - auto x = y - kReal * piOverFourVec; - - // If k & 1, x -= Pi/4 - auto needOffset = (k & 1) != 0; - x = select(needOffset, x - piOverFourVec, x); - - // If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To... 
- auto kMod4 = k & 3; - auto useCotan = (kMod4 == 1) | (kMod4 == 2); - - const float oneVec = 1.0; - - const float tanC2 = +0.33333075046539306640625; - const float tanC4 = +0.13339905440807342529296875; - const float tanC6 = +5.3348250687122344970703125e-2; - const float tanC8 = +2.46033705770969390869140625e-2; - const float tanC10 = +2.892402000725269317626953125e-3; - const float tanC12 = +9.500005282461643218994140625e-3; - - const float cotC2 = -0.3333333432674407958984375; - const float cotC4 = -2.222204394638538360595703125e-2; - const float cotC6 = -2.11752182804048061370849609375e-3; - const float cotC8 = -2.0846328698098659515380859375e-4; - const float cotC10 = -2.548247357481159269809722900390625e-5; - const float cotC12 = -3.5257363606433500535786151885986328125e-7; - - auto x2 = x * x; - T z; - if (any(useCotan)) - { - auto cotVal = x2 * cotC12 + cotC10; - cotVal = x2 * cotVal + cotC8; - cotVal = x2 * cotVal + cotC6; - cotVal = x2 * cotVal + cotC4; - cotVal = x2 * cotVal + cotC2; - cotVal = x2 * cotVal + oneVec; - // The equation is for x * cot(x) but we need -x * cot(x) for the tan part. 
- cotVal /= -x; - z = cotVal; - } - auto useTan = !useCotan; - if (any(useTan)) - { - auto tanVal = x2 * tanC12 + tanC10; - tanVal = x2 * tanVal + tanC8; - tanVal = x2 * tanVal + tanC6; - tanVal = x2 * tanVal + tanC4; - tanVal = x2 * tanVal + tanC2; - tanVal = x2 * tanVal + oneVec; - // Equation was for tan(x)/x - tanVal *= x; - z = select(useTan, tanVal, z); - } - return select(xLt0, -z, z); -} - -template <typename T> -__forceinline T asin(const T &x0) -{ - auto isneg = (x0 < 0.f); - auto x = abs(x0); - auto isnan = (x > 1.f); - - // sollya - // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|], - // [1e-20;.9999999999999999]); - // avg error: 1.1105439e-06, max error 1.3187528e-06 - auto v = 1.57079517841339111328125f + - x * (-0.21450997889041900634765625f + - x * (8.78556668758392333984375e-2f + - x * (-4.489909112453460693359375e-2f + - x * (1.928029954433441162109375e-2f + - x * (-4.3095736764371395111083984375e-3f))))); - - v *= -sqrt(1.f - x); - v = v + 1.57079637050628662109375f; - - v = select(v < 0.f, T(0.f), v); - v = select(isneg, -v, v); - v = select(isnan, T(cast_i2f(0x7fc00000)), v); - - return v; -} - -template <typename T> -__forceinline T acos(const T &v) -{ - return 1.57079637050628662109375f - asin(v); -} - -template <typename T> -__forceinline T atan(const T &v) -{ - const float piOverTwoVec = 1.57079637050628662109375; - // atan(-x) = -atan(x) (so flip from negative to positive first) - // If x > 1 -> atan(x) = Pi/2 - atan(1/x) - auto xNeg = v < 0.f; - auto xFlipped = select(xNeg, -v, v); - - auto xGt1 = xFlipped > 1.; - auto x = select(xGt1, rcpSafe(xFlipped), xFlipped); - - // These coefficients approximate atan(x)/x - const float atanC0 = +0.99999988079071044921875; - const float atanC2 = -0.3333191573619842529296875; - const float atanC4 = +0.199689209461212158203125; - const float atanC6 = -0.14015688002109527587890625; - const float atanC8 = +9.905083477497100830078125e-2; - const float atanC10 = 
-5.93664981424808502197265625e-2; - const float atanC12 = +2.417283318936824798583984375e-2; - const float atanC14 = -4.6721356920897960662841796875e-3; - - auto x2 = x * x; - auto result = x2 * atanC14 + atanC12; - result = x2 * result + atanC10; - result = x2 * result + atanC8; - result = x2 * result + atanC6; - result = x2 * result + atanC4; - result = x2 * result + atanC2; - result = x2 * result + atanC0; - result *= x; - - result = select(xGt1, piOverTwoVec - result, result); - result = select(xNeg, -result, result); - return result; -} - -template <typename T> -__forceinline T atan2(const T &y, const T &x) -{ - const float piVec = 3.1415926536; - // atan2(y, x) = - // - // atan2(y > 0, x = +-0) -> Pi/2 - // atan2(y < 0, x = +-0) -> -Pi/2 - // atan2(y = +-0, x < +0) -> +-Pi - // atan2(y = +-0, x >= +0) -> +-0 - // - // atan2(y >= 0, x < 0) -> Pi + atan(y/x) - // atan2(y < 0, x < 0) -> -Pi + atan(y/x) - // atan2(y, x > 0) -> atan(y/x) - // - // and then a bunch of code for dealing with infinities. 
- auto yOverX = y * rcpSafe(x); - auto atanArg = atan(yOverX); - auto xLt0 = x < 0.f; - auto yLt0 = y < 0.f; - auto offset = select(xLt0, - select(yLt0, T(-piVec), T(piVec)), 0.f); - return offset + atanArg; -} - -template <typename T> -__forceinline T exp(const T &v) -{ - const float ln2Part1 = 0.6931457519; - const float ln2Part2 = 1.4286067653e-6; - const float oneOverLn2 = 1.44269502162933349609375; - - auto scaled = v * oneOverLn2; - auto kReal = floor(scaled); - auto k = toInt(kReal); - - // Reduced range version of x - auto x = v - kReal * ln2Part1; - x -= kReal * ln2Part2; - - // These coefficients are for e^x in [0, ln(2)] - const float one = 1.; - const float c2 = 0.4999999105930328369140625; - const float c3 = 0.166668415069580078125; - const float c4 = 4.16539050638675689697265625e-2; - const float c5 = 8.378830738365650177001953125e-3; - const float c6 = 1.304379315115511417388916015625e-3; - const float c7 = 2.7555381529964506626129150390625e-4; - - auto result = x * c7 + c6; - result = x * result + c5; - result = x * result + c4; - result = x * result + c3; - result = x * result + c2; - result = x * result + one; - result = x * result + one; - - // Compute 2^k (should differ for float and double, but I'll avoid - // it for now and just do floats) - const int fpbias = 127; - auto biasedN = k + fpbias; - auto overflow = kReal > fpbias; - // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0) - // we've got underflow. -127 * ln(2) -> -88.02. So the most - // negative float input that doesn't result in zero is like -88. 
- auto underflow = kReal <= -fpbias; - const int infBits = 0x7f800000; - biasedN <<= 23; - // Reinterpret this thing as float - auto twoToTheN = asFloat(biasedN); - // Handle both doubles and floats (hopefully eliding the copy for float) - auto elemtype2n = twoToTheN; - result *= elemtype2n; - result = select(overflow, cast_i2f(infBits), result); - result = select(underflow, 0., result); - return result; -} - -// Range reduction for logarithms takes log(x) -> log(2^n * y) -> n -// * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)). -template <typename T, typename R> -__forceinline void __rangeReduceLog(const T &input, - T &reduced, - R &exponent) -{ - auto intVersion = asInt(input); - // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM - // exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000 - // 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0 - // non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111 - // = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF - - //const int exponentMask(0x7F800000) - static const int nonexponentMask = 0x807FFFFF; - - // We want the reduced version to have an exponent of -1 which is - // -1 + 127 after biasing or 126 - static const int exponentNeg1 = (126l << 23); - // NOTE(boulos): We don't need to mask anything out since we know - // the sign bit has to be 0. If it's 1, we need to return infinity/nan - // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN). 
- auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128] - - auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2 - exponent = offsetExponent - 127; // get the real value - - // Blend the offset_exponent with the original input (do this in - // int for now, until I decide if float can have & and &not) - auto blended = (intVersion & nonexponentMask) | (exponentNeg1); - reduced = asFloat(blended); -} - -template <typename T> struct ExponentType { }; -template <int N> struct ExponentType<vfloat<N>> { typedef vint<N> Ty; }; -template <> struct ExponentType<float> { typedef int Ty; }; - -template <typename T> -__forceinline T log(const T &v) -{ - T reduced; - typename ExponentType<T>::Ty exponent; - - const int nanBits = 0x7fc00000; - const int negInfBits = 0xFF800000; - const float nan = cast_i2f(nanBits); - const float negInf = cast_i2f(negInfBits); - auto useNan = v < 0.; - auto useInf = v == 0.; - auto exceptional = useNan | useInf; - const float one = 1.0; - - auto patched = select(exceptional, one, v); - __rangeReduceLog(patched, reduced, exponent); - - const float ln2 = 0.693147182464599609375; - - auto x1 = one - reduced; - const float c1 = +0.50000095367431640625; - const float c2 = +0.33326041698455810546875; - const float c3 = +0.2519190013408660888671875; - const float c4 = +0.17541764676570892333984375; - const float c5 = +0.3424419462680816650390625; - const float c6 = -0.599632322788238525390625; - const float c7 = +1.98442304134368896484375; - const float c8 = -2.4899270534515380859375; - const float c9 = +1.7491014003753662109375; - - auto result = x1 * c9 + c8; - result = x1 * result + c7; - result = x1 * result + c6; - result = x1 * result + c5; - result = x1 * result + c4; - result = x1 * result + c3; - result = x1 * result + c2; - result = x1 * result + c1; - result = x1 * result + one; - - // Equation was for -(ln(red)/(1-red)) - result *= -x1; - result += toFloat(exponent) 
* ln2; - - return select(exceptional, - select(useNan, T(nan), T(negInf)), - result); -} - -template <typename T> -__forceinline T pow(const T &x, const T &y) -{ - auto x1 = abs(x); - auto z = exp(y * log(x1)); - - // Handle special cases - const float twoOver23 = 8388608.0f; - auto yInt = y == round(y); - auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit - - // x == 0 - z = select(x == 0.0f, - select(y < 0.0f, T(inf) | signmsk(x), - select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z); - - // x < 0 - auto xNegative = x < 0.0f; - if (any(xNegative)) - { - auto z1 = z | asFloat(yOddInt); - z1 = select(yInt, z1, std::numeric_limits<float>::quiet_NaN()); - z = select(xNegative, z1, z); - } - - auto xFinite = isfinite(x); - auto yFinite = isfinite(y); - if (all(xFinite & yFinite)) - return z; - - // x finite and y infinite - z = select(andn(xFinite, yFinite), - select(x1 == 1.0f, 1.0f, - select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z); - - // x infinite - z = select(xFinite, z, - select(y == 0.0f, 1.0f, - select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x))); - - return z; -} - -template <typename T> -__forceinline T pow(const T &x, float y) -{ - return pow(x, T(y)); -} - -} // namespace fastapprox - -} // namespace embree diff --git a/thirdparty/embree-aarch64/common/math/vec2.h b/thirdparty/embree-aarch64/common/math/vec2.h deleted file mode 100644 index a619459e9c..0000000000 --- a/thirdparty/embree-aarch64/common/math/vec2.h +++ /dev/null @@ -1,235 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "math.h" - -namespace embree -{ - struct Vec2fa; - - //////////////////////////////////////////////////////////////////////////////// - /// Generic 2D vector Class - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> struct Vec2 - { - enum { N = 2 }; - union { - struct { T x, y; }; -#if !(defined(__WIN32__) && 
_MSC_VER == 1800) // workaround for older VS 2013 compiler - T components[N]; -#endif - }; - - typedef T Scalar; - - //////////////////////////////////////////////////////////////////////////////// - /// Construction - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec2( ) {} - __forceinline explicit Vec2( const T& a ) : x(a), y(a) {} - __forceinline Vec2( const T& x, const T& y ) : x(x), y(y) {} - - __forceinline Vec2( const Vec2& other ) { x = other.x; y = other.y; } - __forceinline Vec2( const Vec2fa& other ); - - template<typename T1> __forceinline Vec2( const Vec2<T1>& a ) : x(T(a.x)), y(T(a.y)) {} - template<typename T1> __forceinline Vec2& operator =( const Vec2<T1>& other ) { x = other.x; y = other.y; return *this; } - - __forceinline Vec2& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec2( ZeroTy ) : x(zero), y(zero) {} - __forceinline Vec2( OneTy ) : x(one), y(one) {} - __forceinline Vec2( PosInfTy ) : x(pos_inf), y(pos_inf) {} - __forceinline Vec2( NegInfTy ) : x(neg_inf), y(neg_inf) {} - -#if defined(__WIN32__) && _MSC_VER == 1800 // workaround for older VS 2013 compiler - __forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return (&x)[axis]; } - __forceinline T& operator [](const size_t axis) { assert(axis < 2); return (&x)[axis]; } -#else - __forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return components[axis]; } - __forceinline T& operator [](const size_t axis ) { assert(axis < 2); return components[axis]; } -#endif - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - 
/*! Unary plus: applies unary + to each component. */
template<typename T>
__forceinline Vec2<T> operator +(const Vec2<T>& a)
{
  return Vec2<T>(+a.x, +a.y);
}

/*! Negation: applies unary - to each component. */
template<typename T>
__forceinline Vec2<T> operator -(const Vec2<T>& a)
{
  return Vec2<T>(-a.x, -a.y);
}

/*! The functions below forward to the scalar overload of the same name,
 *  once for x and once for y. */
template<typename T>
__forceinline Vec2<T> abs(const Vec2<T>& a)
{
  return Vec2<T>(abs(a.x), abs(a.y));
}

template<typename T>
__forceinline Vec2<T> rcp(const Vec2<T>& a)
{
  return Vec2<T>(rcp(a.x), rcp(a.y));
}

template<typename T>
__forceinline Vec2<T> rsqrt(const Vec2<T>& a)
{
  return Vec2<T>(rsqrt(a.x), rsqrt(a.y));
}

template<typename T>
__forceinline Vec2<T> sqrt(const Vec2<T>& a)
{
  return Vec2<T>(sqrt(a.x), sqrt(a.y));
}

template<typename T>
__forceinline Vec2<T> frac(const Vec2<T>& a)
{
  return Vec2<T>(frac(a.x), frac(a.y));
}

////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////

/*! Addition: vector+vector, vector+scalar and scalar+vector. A scalar operand
 *  is combined with both components. */
template<typename T>
__forceinline Vec2<T> operator +(const Vec2<T>& a, const Vec2<T>& b)
{
  return Vec2<T>(a.x + b.x, a.y + b.y);
}

template<typename T>
__forceinline Vec2<T> operator +(const Vec2<T>& a, const T& b)
{
  return Vec2<T>(a.x + b, a.y + b);
}

template<typename T>
__forceinline Vec2<T> operator +(const T& a, const Vec2<T>& b)
{
  return Vec2<T>(a + b.x, a + b.y);
}

/*! Subtraction: vector-vector, vector-scalar and scalar-vector. */
template<typename T>
__forceinline Vec2<T> operator -(const Vec2<T>& a, const Vec2<T>& b)
{
  return Vec2<T>(a.x - b.x, a.y - b.y);
}

template<typename T>
__forceinline Vec2<T> operator -(const Vec2<T>& a, const T& b)
{
  return Vec2<T>(a.x - b, a.y - b);
}

template<typename T>
__forceinline Vec2<T> operator -(const T& a, const Vec2<T>& b)
{
  return Vec2<T>(a - b.x, a - b.y);
}

/*! Multiplication: component-wise product, scalar*vector and vector*scalar. */
template<typename T>
__forceinline Vec2<T> operator *(const Vec2<T>& a, const Vec2<T>& b)
{
  return Vec2<T>(a.x * b.x, a.y * b.y);
}

template<typename T>
__forceinline Vec2<T> operator *(const T& a, const Vec2<T>& b)
{
  return Vec2<T>(a * b.x, a * b.y);
}

template<typename T>
__forceinline Vec2<T> operator *(const Vec2<T>& a, const T& b)
{
  return Vec2<T>(a.x * b, a.y * b);
}

/*! Division: component-wise quotient, vector/scalar and scalar/vector. */
template<typename T>
__forceinline Vec2<T> operator /(const Vec2<T>& a, const Vec2<T>& b)
{
  return Vec2<T>(a.x / b.x, a.y / b.y);
}

template<typename T>
__forceinline Vec2<T> operator /(const Vec2<T>& a, const T& b)
{
  return Vec2<T>(a.x / b, a.y / b);
}

template<typename T>
__forceinline Vec2<T> operator /(const T& a, const Vec2<T>& b)
{
  return Vec2<T>(a / b.x, a / b.y);
}

/*! Component-wise minimum and maximum, using the scalar min/max overloads. */
template<typename T>
__forceinline Vec2<T> min(const Vec2<T>& a, const Vec2<T>& b)
{
  return Vec2<T>(min(a.x, b.x), min(a.y, b.y));
}

template<typename T>
__forceinline Vec2<T> max(const Vec2<T>& a, const Vec2<T>& b)
{
  return Vec2<T>(max(a.x, b.x), max(a.y, b.y));
}

////////////////////////////////////////////////////////////////////////////////
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////

/*! madd/msub/nmadd/nmsub applied per component; each call forwards to the
 *  scalar function of the same name with the matching components of a, b, c. */
template<typename T>
__forceinline Vec2<T> madd(const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c)
{
  return Vec2<T>(madd(a.x, b.x, c.x), madd(a.y, b.y, c.y));
}

template<typename T>
__forceinline Vec2<T> msub(const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c)
{
  return Vec2<T>(msub(a.x, b.x, c.x), msub(a.y, b.y, c.y));
}

template<typename T>
__forceinline Vec2<T> nmadd(const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c)
{
  return Vec2<T>(nmadd(a.x, b.x, c.x), nmadd(a.y, b.y, c.y));
}

template<typename T>
__forceinline Vec2<T> nmsub(const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c)
{
  return Vec2<T>(nmsub(a.x, b.x, c.x), nmsub(a.y, b.y, c.y));
}

/*! Variants whose first operand is a scalar shared by both components. */
template<typename T>
__forceinline Vec2<T> madd(const T& a, const Vec2<T>& b, const Vec2<T>& c)
{
  return Vec2<T>(madd(a, b.x, c.x), madd(a, b.y, c.y));
}

template<typename T>
__forceinline Vec2<T> msub(const T& a, const Vec2<T>& b, const Vec2<T>& c)
{
  return Vec2<T>(msub(a, b.x, c.x), msub(a, b.y, c.y));
}

template<typename T> __forceinline Vec2<T> nmadd ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y) ); } - template<typename T> __forceinline Vec2<T> nmsub ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y) ); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline Vec2<T>& operator +=( Vec2<T>& a, const Vec2<T>& b ) { a.x += b.x; a.y += b.y; return a; } - template<typename T> __forceinline Vec2<T>& operator -=( Vec2<T>& a, const Vec2<T>& b ) { a.x -= b.x; a.y -= b.y; return a; } - template<typename T> __forceinline Vec2<T>& operator *=( Vec2<T>& a, const T& b ) { a.x *= b ; a.y *= b ; return a; } - template<typename T> __forceinline Vec2<T>& operator /=( Vec2<T>& a, const T& b ) { a.x /= b ; a.y /= b ; return a; } - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline T reduce_add( const Vec2<T>& a ) { return a.x + a.y; } - template<typename T> __forceinline T reduce_mul( const Vec2<T>& a ) { return a.x * a.y; } - template<typename T> __forceinline T reduce_min( const Vec2<T>& a ) { return min(a.x, a.y); } - template<typename T> __forceinline T reduce_max( const Vec2<T>& a ) { return max(a.x, a.y); } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline bool operator ==( const Vec2<T>& a, const Vec2<T>& b ) { return a.x == b.x && a.y == b.y; } - template<typename T> __forceinline bool operator !=( const Vec2<T>& a, const Vec2<T>& b ) { 
return a.x != b.x || a.y != b.y; } - template<typename T> __forceinline bool operator < ( const Vec2<T>& a, const Vec2<T>& b ) { - if (a.x != b.x) return a.x < b.x; - if (a.y != b.y) return a.y < b.y; - return false; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Shift Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline Vec2<T> shift_right_1( const Vec2<T>& a ) { - return Vec2<T>(shift_right_1(a.x),shift_right_1(a.y)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline T dot ( const Vec2<T>& a, const Vec2<T>& b ) { return madd(a.x,b.x,a.y*b.y); } - template<typename T> __forceinline Vec2<T> cross ( const Vec2<T>& a ) { return Vec2<T>(-a.y,a.x); } - template<typename T> __forceinline T length ( const Vec2<T>& a ) { return sqrt(dot(a,a)); } - template<typename T> __forceinline Vec2<T> normalize( const Vec2<T>& a ) { return a*rsqrt(dot(a,a)); } - template<typename T> __forceinline T distance ( const Vec2<T>& a, const Vec2<T>& b ) { return length(a-b); } - template<typename T> __forceinline T det ( const Vec2<T>& a, const Vec2<T>& b ) { return a.x*b.y - a.y*b.x; } - - template<typename T> __forceinline Vec2<T> normalize_safe( const Vec2<T>& a ) { - const T d = dot(a,a); return select(d == T( zero ),a, a*rsqrt(d) ); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline Vec2<T> select ( bool s, const Vec2<T>& t, const Vec2<T>& f ) { - return Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y)); - } - - template<typename T> __forceinline Vec2<T> select ( const Vec2<bool>& s, const Vec2<T>& t, const 
Vec2<T>& f ) { - return Vec2<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y)); - } - - template<typename T> __forceinline Vec2<T> select ( const typename T::Bool& s, const Vec2<T>& t, const Vec2<T>& f ) { - return Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y)); - } - - template<typename T> - __forceinline Vec2<T> lerp(const Vec2<T>& v0, const Vec2<T>& v1, const T& t) { - return madd(Vec2<T>(T(1.0f)-t),v0,t*v1); - } - - template<typename T> __forceinline int maxDim ( const Vec2<T>& a ) - { - const Vec2<T> b = abs(a); - if (b.x > b.y) return 0; - else return 1; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2<T>& a) { - return cout << "(" << a.x << ", " << a.y << ")"; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Default template instantiations - //////////////////////////////////////////////////////////////////////////////// - - typedef Vec2<bool > Vec2b; - typedef Vec2<int > Vec2i; - typedef Vec2<float> Vec2f; -} - -#include "vec2fa.h" - -#if defined(__SSE__) || defined(__ARM_NEON) -#include "../simd/sse.h" -#endif - -#if defined(__AVX__) -#include "../simd/avx.h" -#endif - -#if defined(__AVX512F__) -#include "../simd/avx512.h" -#endif - -namespace embree -{ - template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} - -#if defined(__SSE__) || defined(__ARM_NEON) - template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} -#endif - -#if defined(__AVX__) - template<> __forceinline Vec2<vfloat8>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} -#endif - -#if defined(__AVX512F__) - template<> __forceinline Vec2<vfloat16>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} -#endif -} diff --git a/thirdparty/embree-aarch64/common/math/vec2fa.h 
b/thirdparty/embree-aarch64/common/math/vec2fa.h deleted file mode 100644 index 451ecd556c..0000000000 --- a/thirdparty/embree-aarch64/common/math/vec2fa.h +++ /dev/null @@ -1,317 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/alloc.h" -#include "math.h" -#include "../simd/sse.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// SSE Vec2fa Type - //////////////////////////////////////////////////////////////////////////////// - - struct __aligned(16) Vec2fa - { - ALIGNED_STRUCT_(16); - - typedef float Scalar; - enum { N = 2 }; - union { - __m128 m128; - struct { float x,y,az,aw; }; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec2fa( ) {} - __forceinline Vec2fa( const __m128 a ) : m128(a) {} - - __forceinline Vec2fa ( const Vec2<float>& other ) { x = other.x; y = other.y; } - __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; } - - __forceinline Vec2fa ( const Vec2fa& other ) { m128 = other.m128; } - __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; } - - __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {} - __forceinline Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {} - - __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} - - __forceinline operator const __m128&() const { return m128; } - __forceinline operator __m128&() { return m128; } - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline Vec2fa load( const 
void* const a ) { - return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)))); - } - - static __forceinline Vec2fa loadu( const void* const a ) { - return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)))); - } - - static __forceinline void storeu ( void* ptr, const Vec2fa& v ) { - _mm_storeu_ps((float*)ptr,v); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec2fa( ZeroTy ) : m128(_mm_setzero_ps()) {} - __forceinline Vec2fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {} - __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} - __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; } - __forceinline float& operator []( const size_t index ) { assert(index < 2); return (&x)[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; } - __forceinline Vec2fa operator -( const Vec2fa& a ) { - const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - return _mm_xor_ps(a.m128, mask); - } - __forceinline Vec2fa abs ( const Vec2fa& a ) { - const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); - return _mm_and_ps(a.m128, mask); - } - __forceinline Vec2fa sign ( const Vec2fa& a ) { - return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero))); - } - - __forceinline Vec2fa rcp ( const Vec2fa& a ) 
- { -#if defined(__aarch64__) - __m128 reciprocal = _mm_rcp_ps(a.m128); - reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); - return (const Vec2fa)reciprocal; -#else -#if defined(__AVX512VL__) - const Vec2fa r = _mm_rcp14_ps(a.m128); -#else - const Vec2fa r = _mm_rcp_ps(a.m128); -#endif - -#if defined(__AVX2__) - const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); -#else - const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); - //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); -#endif - - return res; -#endif //defined(__aarch64__) - } - - __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); } - __forceinline Vec2fa sqr ( const Vec2fa& a ) { return _mm_mul_ps(a,a); } - - __forceinline Vec2fa rsqrt( const Vec2fa& a ) - { -#if defined(__aarch64__) - __m128 r = _mm_rsqrt_ps(a.m128); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); - return r; -#else - -#if defined(__AVX512VL__) - __m128 r = _mm_rsqrt14_ps(a.m128); -#else - __m128 r = _mm_rsqrt_ps(a.m128); -#endif - return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); - -#endif - } - - __forceinline Vec2fa zero_fix(const Vec2fa& a) { - return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); - } - __forceinline Vec2fa rcp_safe(const Vec2fa& a) { - return rcp(zero_fix(a)); - } - __forceinline Vec2fa log ( const Vec2fa& a ) { - return Vec2fa(logf(a.x),logf(a.y)); - } - - __forceinline Vec2fa exp ( const Vec2fa& a ) { - return Vec2fa(expf(a.x),expf(a.y)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - 
__forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); } - __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); } - __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); } - __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); } - __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; } - __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); } - __forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } - __forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } - - __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); } - __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); } - -#if defined(__aarch64__) || defined(__SSE4_1__) - __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) { - const vint4 ai = _mm_castps_si128(a); - const vint4 bi = _mm_castps_si128(b); - const vint4 ci = _mm_min_epi32(ai,bi); - return _mm_castsi128_ps(ci); - } -#endif - -#if defined(__aarch64__) || defined(__SSE4_1__) - __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) { - const vint4 ai = _mm_castps_si128(a); - const vint4 bi = _mm_castps_si128(b); - const vint4 ci = _mm_max_epi32(ai,bi); - return _mm_castsi128_ps(ci); - } -#endif - - __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) { - return Vec2fa(powf(a.x,b),powf(a.y,b)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX2__) - __forceinline Vec2fa madd ( const Vec2fa& a, const 
Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); } - __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); } - __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); } - __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); } -#else - __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; } - __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; } - __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;} - __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; } -#endif - - __forceinline Vec2fa madd ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); } - __forceinline Vec2fa msub ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); } - __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); } - __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; } - __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; } - __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; } - __forceinline Vec2fa& operator *=( Vec2fa& a, const float b ) { return a = a * b; } - __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; } - __forceinline Vec2fa& operator /=( Vec2fa& a, const float b ) { return a = a / b; } - - 
//////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; } - __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; } - __forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); } - __forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; } - __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; } - - //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__SSE4_1__) - __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { - return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F)); - } -#else - __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { - return reduce_add(a*b); - } -#endif - - __forceinline Vec2fa cross ( const Vec2fa& a ) { - return Vec2fa(-a.y,a.x); - } - - __forceinline float sqr_length ( const Vec2fa& a ) { return dot(a,a); } - __forceinline float rcp_length ( const Vec2fa& a ) { return rsqrt(dot(a,a)); } - __forceinline float rcp_length2( const Vec2fa& a ) { return rcp(dot(a,a)); } - __forceinline float length ( const Vec2fa& a ) { return sqrt(dot(a,a)); } - __forceinline Vec2fa normalize( const Vec2fa& a ) { return a*rsqrt(dot(a,a)); } - __forceinline float distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); } - - 
//////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) { - __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); - return blendv_ps(f, t, mask); - } - - __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) { - return madd(1.0f-t,v0,t*v1); - } - - __forceinline int maxDim ( const Vec2fa& a ) - { - const Vec2fa b = abs(a); - if (b.x > b.y) return 0; - else return 1; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Rounding Functions - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) -__forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); } -__forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); } -//__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); } -#elif defined (__SSE4_1__) - //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } - __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } - __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } -#else - //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); } - __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); } - __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) 
{ - return cout << "(" << a.x << ", " << a.y << ")"; - } - - typedef Vec2fa Vec2fa_t; -} diff --git a/thirdparty/embree-aarch64/common/math/vec3.h b/thirdparty/embree-aarch64/common/math/vec3.h deleted file mode 100644 index 1870321715..0000000000 --- a/thirdparty/embree-aarch64/common/math/vec3.h +++ /dev/null @@ -1,349 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "math.h" - -namespace embree -{ - struct Vec3fa; - - //////////////////////////////////////////////////////////////////////////////// - /// Generic 3D vector Class - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> struct Vec3 - { - enum { N = 3 }; - - union { - struct { - T x, y, z; - }; -#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler - T components[N]; -#endif - }; - - typedef T Scalar; - - //////////////////////////////////////////////////////////////////////////////// - /// Construction - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3( ) {} - __forceinline explicit Vec3( const T& a ) : x(a), y(a), z(a) {} - __forceinline Vec3( const T& x, const T& y, const T& z ) : x(x), y(y), z(z) {} - - __forceinline Vec3( const Vec3& other ) { x = other.x; y = other.y; z = other.z; } - __forceinline Vec3( const Vec3fa& other ); - - template<typename T1> __forceinline Vec3( const Vec3<T1>& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)) {} - template<typename T1> __forceinline Vec3& operator =(const Vec3<T1>& other) { x = other.x; y = other.y; z = other.z; return *this; } - - __forceinline Vec3& operator =(const Vec3& other) { x = other.x; y = other.y; z = other.z; return *this; } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3( ZeroTy ) : 
x(zero), y(zero), z(zero) {} - __forceinline Vec3( OneTy ) : x(one), y(one), z(one) {} - __forceinline Vec3( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf) {} - __forceinline Vec3( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf) {} - -#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler - __forceinline const T& operator []( const size_t axis ) const { assert(axis < 3); return (&x)[axis]; } - __forceinline T& operator []( const size_t axis ) { assert(axis < 3); return (&x)[axis]; } -#else - __forceinline const T& operator [](const size_t axis) const { assert(axis < 3); return components[axis]; } - __forceinline T& operator [](const size_t axis) { assert(axis < 3); return components[axis]; } -#endif - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a ) { return Vec3<T>(+a.x, +a.y, +a.z); } - template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a ) { return Vec3<T>(-a.x, -a.y, -a.z); } - template<typename T> __forceinline Vec3<T> abs ( const Vec3<T>& a ) { return Vec3<T>(abs (a.x), abs (a.y), abs (a.z)); } - template<typename T> __forceinline Vec3<T> rcp ( const Vec3<T>& a ) { return Vec3<T>(rcp (a.x), rcp (a.y), rcp (a.z)); } - template<typename T> __forceinline Vec3<T> rsqrt ( const Vec3<T>& a ) { return Vec3<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z)); } - template<typename T> __forceinline Vec3<T> sqrt ( const Vec3<T>& a ) { return Vec3<T>(sqrt (a.x), sqrt (a.y), sqrt (a.z)); } - - template<typename T> __forceinline Vec3<T> zero_fix( const Vec3<T>& a ) - { - return Vec3<T>(select(abs(a.x)<min_rcp_input,T(min_rcp_input),a.x), - select(abs(a.y)<min_rcp_input,T(min_rcp_input),a.y), - select(abs(a.z)<min_rcp_input,T(min_rcp_input),a.z)); - } - template<typename T> __forceinline Vec3<T> rcp_safe(const 
Vec3<T>& a) { return rcp(zero_fix(a)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x + b.x, a.y + b.y, a.z + b.z); } - template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x - b.x, a.y - b.y, a.z - b.z); } - template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x * b.x, a.y * b.y, a.z * b.z); } - template<typename T> __forceinline Vec3<T> operator *( const T& a, const Vec3<T>& b ) { return Vec3<T>(a * b.x, a * b.y, a * b.z); } - template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const T& b ) { return Vec3<T>(a.x * b , a.y * b , a.z * b ); } - template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const T& b ) { return Vec3<T>(a.x / b , a.y / b , a.z / b ); } - template<typename T> __forceinline Vec3<T> operator /( const T& a, const Vec3<T>& b ) { return Vec3<T>(a / b.x, a / b.y, a / b.z); } - template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x / b.x, a.y / b.y, a.z / b.z); } - - template<typename T> __forceinline Vec3<T> min(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); } - template<typename T> __forceinline Vec3<T> max(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); } - - template<typename T> __forceinline Vec3<T> operator >>( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x >> b, a.y >> b, a.z >> b); } - template<typename T> __forceinline Vec3<T> operator <<( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x << b, a.y << b, a.z << b); } - - 
//////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline Vec3<T> madd ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z)); } - template<typename T> __forceinline Vec3<T> msub ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z)); } - template<typename T> __forceinline Vec3<T> nmadd ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z));} - template<typename T> __forceinline Vec3<T> nmsub ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z)); } - - template<typename T> __forceinline Vec3<T> madd ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z)); } - template<typename T> __forceinline Vec3<T> msub ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z)); } - template<typename T> __forceinline Vec3<T> nmadd ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z));} - template<typename T> __forceinline Vec3<T> nmsub ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const T b ) { a.x += b; a.y += b; a.z += b; return a; } - template<typename T> __forceinline Vec3<T>& operator 
+=( Vec3<T>& a, const Vec3<T>& b ) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } - template<typename T> __forceinline Vec3<T>& operator -=( Vec3<T>& a, const Vec3<T>& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } - template<typename T> __forceinline Vec3<T>& operator *=( Vec3<T>& a, const T& b ) { a.x *= b ; a.y *= b ; a.z *= b ; return a; } - template<typename T> __forceinline Vec3<T>& operator /=( Vec3<T>& a, const T& b ) { a.x /= b ; a.y /= b ; a.z /= b ; return a; } - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline T reduce_add( const Vec3<T>& a ) { return a.x + a.y + a.z; } - template<typename T> __forceinline T reduce_mul( const Vec3<T>& a ) { return a.x * a.y * a.z; } - template<typename T> __forceinline T reduce_min( const Vec3<T>& a ) { return min(a.x, a.y, a.z); } - template<typename T> __forceinline T reduce_max( const Vec3<T>& a ) { return max(a.x, a.y, a.z); } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline bool operator ==( const Vec3<T>& a, const Vec3<T>& b ) { return a.x == b.x && a.y == b.y && a.z == b.z; } - template<typename T> __forceinline bool operator !=( const Vec3<T>& a, const Vec3<T>& b ) { return a.x != b.x || a.y != b.y || a.z != b.z; } - template<typename T> __forceinline bool operator < ( const Vec3<T>& a, const Vec3<T>& b ) { - if (a.x != b.x) return a.x < b.x; - if (a.y != b.y) return a.y < b.y; - if (a.z != b.z) return a.z < b.z; - return false; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Shift Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename 
T> __forceinline Vec3<T> shift_right_1( const Vec3<T>& a ) { - return Vec3<T>(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline Vec3<T> select ( bool s, const Vec3<T>& t, const Vec3<T>& f ) { - return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z)); - } - - template<typename T> __forceinline Vec3<T> select ( const Vec3<bool>& s, const Vec3<T>& t, const Vec3<T>& f ) { - return Vec3<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z)); - } - - template<typename T> __forceinline Vec3<T> select ( const typename T::Bool& s, const Vec3<T>& t, const Vec3<T>& f ) { - return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z)); - } - - template<typename T> - __forceinline Vec3<T> lerp(const Vec3<T>& v0, const Vec3<T>& v1, const T& t) { - return madd(Vec3<T>(T(1.0f)-t),v0,t*v1); - } - - template<typename T> __forceinline int maxDim ( const Vec3<T>& a ) - { - const Vec3<T> b = abs(a); - if (b.x > b.y) { - if (b.x > b.z) return 0; else return 2; - } else { - if (b.y > b.z) return 1; else return 2; - } - } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline Vec3<bool> eq_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x==b.x,a.y==b.y,a.z==b.z); } - template<typename T> __forceinline Vec3<bool> neq_mask(const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x!=b.x,a.y!=b.y,a.z!=b.z); } - template<typename T> __forceinline Vec3<bool> lt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x< b.x,a.y< b.y,a.z< b.z); } - template<typename T> __forceinline Vec3<bool> le_mask( const Vec3<T>& a, const Vec3<T>& 
b ) { return Vec3<bool>(a.x<=b.x,a.y<=b.y,a.z<=b.z); } - template<typename T> __forceinline Vec3<bool> gt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x> b.x,a.y> b.y,a.z> b.z); } - template<typename T> __forceinline Vec3<bool> ge_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x>=b.x,a.y>=b.y,a.z>=b.z); } - - //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline T sqr ( const Vec3<T>& a ) { return dot(a,a); } - template<typename T> __forceinline T dot ( const Vec3<T>& a, const Vec3<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,a.z*b.z)); } - template<typename T> __forceinline T length ( const Vec3<T>& a ) { return sqrt(sqr(a)); } - template<typename T> __forceinline T rcp_length( const Vec3<T>& a ) { return rsqrt(sqr(a)); } - template<typename T> __forceinline Vec3<T> normalize( const Vec3<T>& a ) { return a*rsqrt(sqr(a)); } - template<typename T> __forceinline T distance ( const Vec3<T>& a, const Vec3<T>& b ) { return length(a-b); } - template<typename T> __forceinline Vec3<T> cross ( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(prod_diff(a.y,b.z,a.z,b.y), prod_diff(a.z,b.x,a.x,b.z), prod_diff(a.x,b.y,a.y,b.x)); } - template<typename T> __forceinline Vec3<T> stable_triangle_normal( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c ) - { - const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x; - const T bc_x = b.z*c.y, bc_y = b.x*c.z, bc_z = b.y*c.x; - const Vec3<T> cross_ab(msub(a.y,b.z,ab_x), msub(a.z,b.x,ab_y), msub(a.x,b.y,ab_z)); - const Vec3<T> cross_bc(msub(b.y,c.z,bc_x), msub(b.z,c.x,bc_y), msub(b.x,c.y,bc_z)); - const auto sx = abs(ab_x) < abs(bc_x); - const auto sy = abs(ab_y) < abs(bc_y); - const auto sz = abs(ab_z) < abs(bc_z); - return Vec3<T>(select(sx,cross_ab.x,cross_bc.x), - select(sy,cross_ab.y,cross_bc.y), - 
select(sz,cross_ab.z,cross_bc.z)); - } - - template<typename T> __forceinline T sum ( const Vec3<T>& a ) { return a.x+a.y+a.z; } - - template<typename T> __forceinline T halfArea ( const Vec3<T>& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } - template<typename T> __forceinline T area ( const Vec3<T>& d ) { return 2.0f*halfArea(d); } - - template<typename T> __forceinline Vec3<T> normalize_safe( const Vec3<T>& a ) { - const T d = dot(a,a); return select(d == T( zero ), a , a*rsqrt(d) ); - } - - template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& P, const Vec3<T>& Q0, const Vec3<T>& Q1) - { - const Vec3<T> N = cross(P-Q0,Q1-Q0); - const Vec3<T> D = Q1-Q0; - return dot(N,N)*rcp(dot(D,D)); - } - - template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& PmQ0, const Vec3<T>& Q1mQ0) - { - const Vec3<T> N = cross(PmQ0,Q1mQ0); - const Vec3<T> D = Q1mQ0; - return dot(N,N)*rcp(dot(D,D)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3<T>& a) { - return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; - } - - typedef Vec3<bool > Vec3b; - typedef Vec3<int > Vec3i; - typedef Vec3<float> Vec3f; -} - -#include "vec3ba.h" -#include "vec3ia.h" -#include "vec3fa.h" - -//////////////////////////////////////////////////////////////////////////////// -/// SSE / AVX / MIC specializations -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__SSE__) || defined(__ARM_NEON) -#include "../simd/sse.h" -#endif - -#if defined(__AVX__) -#include "../simd/avx.h" -#endif - -#if defined(__AVX512F__) -#include "../simd/avx512.h" -#endif - -namespace embree -{ - template<typename Out, typename In> - __forceinline Vec3<Out> broadcast(const Vec3<In>& a, const 
size_t k) { - return Vec3<Out>(Out(a.x[k]), Out(a.y[k]), Out(a.z[k])); - } - - template<> __forceinline Vec3<float>::Vec3(const Vec3fa& a) { x = a.x; y = a.y; z = a.z; } - -#if defined(__AVX__) - template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) { - x = a.x; y = a.y; z = a.z; - } -#elif defined(__SSE__) || defined(__ARM_NEON) - template<> - __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) { - const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); - } -#endif - -#if defined(__SSE__) || defined(__ARM_NEON) - __forceinline Vec3<vfloat4> broadcast4f(const Vec3<vfloat4>& a, const size_t k) { - return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k])); - } - - template<> - __forceinline Vec3<vfloat4> broadcast<vfloat4,vfloat4>(const Vec3<vfloat4>& a, const size_t k) { - return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k])); - } - - template<int i0, int i1, int i2, int i3> - __forceinline Vec3<vfloat4> shuffle(const Vec3<vfloat4>& b) { - return Vec3<vfloat4>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z)); - } -#endif - -#if defined(__AVX__) - template<> - __forceinline Vec3<vfloat8>::Vec3(const Vec3fa& a) { - x = a.x; y = a.y; z = a.z; - } - __forceinline Vec3<vfloat4> broadcast4f(const Vec3<vfloat8>& a, const size_t k) { - return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k])); - } - __forceinline Vec3<vfloat8> broadcast8f(const Vec3<vfloat4>& a, const size_t k) { - return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); - } - __forceinline Vec3<vfloat8> broadcast8f(const Vec3<vfloat8>& a, const size_t k) { - return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); - } - - template<> - __forceinline Vec3<vfloat8> 
broadcast<vfloat8,vfloat4>(const Vec3<vfloat4>& a, const size_t k) { - return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); - } - template<> - __forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat8>(const Vec3<vfloat8>& a, const size_t k) { - return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); - } - - template<int i0, int i1, int i2, int i3> - __forceinline Vec3<vfloat8> shuffle(const Vec3<vfloat8>& b) { - return Vec3<vfloat8>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z)); - } -#endif - -#if defined(__AVX512F__) - template<> __forceinline Vec3<vfloat16>::Vec3(const Vec3fa& a) : x(a.x), y(a.y), z(a.z) {} -#endif -} diff --git a/thirdparty/embree-aarch64/common/math/vec3ba.h b/thirdparty/embree-aarch64/common/math/vec3ba.h deleted file mode 100644 index 90f31739c2..0000000000 --- a/thirdparty/embree-aarch64/common/math/vec3ba.h +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/alloc.h" -#include "math.h" -#include "../simd/sse.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// SSE Vec3ba Type - //////////////////////////////////////////////////////////////////////////////// - - struct __aligned(16) Vec3ba - { - ALIGNED_STRUCT_(16); - - union { - __m128 m128; - struct { int x,y,z; }; - }; - - typedef int Scalar; - enum { N = 3 }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ba( ) {} - __forceinline Vec3ba( const __m128 input ) : m128(input) {} - __forceinline Vec3ba( const Vec3ba& other ) : m128(other.m128) {} - __forceinline Vec3ba& operator =(const Vec3ba& other) { m128 = 
other.m128; return *this; } - - __forceinline explicit Vec3ba( bool a ) - : m128(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} - __forceinline Vec3ba( bool a, bool b, bool c) - : m128(mm_lookupmask_ps[(size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {} - - __forceinline operator const __m128&() const { return m128; } - __forceinline operator __m128&() { return m128; } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ba( FalseTy ) : m128(_mm_setzero_ps()) {} - __forceinline Vec3ba( TrueTy ) : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } - __forceinline int& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } - }; - - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ba operator !( const Vec3ba& a ) { return _mm_xor_ps(a.m128, Vec3ba(embree::True)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ba operator &( const Vec3ba& a, const Vec3ba& b ) { return _mm_and_ps(a.m128, b.m128); } - __forceinline Vec3ba operator |( const Vec3ba& a, const Vec3ba& b ) { return _mm_or_ps (a.m128, b.m128); } - __forceinline Vec3ba operator ^( const Vec3ba& a, const Vec3ba& b ) { return _mm_xor_ps(a.m128, 
b.m128); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ba& operator &=( Vec3ba& a, const Vec3ba& b ) { return a = a & b; } - __forceinline Vec3ba& operator |=( Vec3ba& a, const Vec3ba& b ) { return a = a | b; } - __forceinline Vec3ba& operator ^=( Vec3ba& a, const Vec3ba& b ) { return a = a ^ b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator ==( const Vec3ba& a, const Vec3ba& b ) { - return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) == 7; - } - __forceinline bool operator !=( const Vec3ba& a, const Vec3ba& b ) { - return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) != 7; - } - __forceinline bool operator < ( const Vec3ba& a, const Vec3ba& b ) { - if (a.x != b.x) return a.x < b.x; - if (a.y != b.y) return a.y < b.y; - if (a.z != b.z) return a.z < b.z; - return false; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool reduce_and( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) == 0x7; } - __forceinline bool reduce_or ( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) != 0x0; } - - __forceinline bool all ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x7; } - __forceinline bool any ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) != 0x0; } - __forceinline bool none ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x0; } - - __forceinline size_t movemask(const Vec3ba& a) { 
return _mm_movemask_ps(a) & 0x7; } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ba& a) { - return cout << "(" << (a.x ? "1" : "0") << ", " << (a.y ? "1" : "0") << ", " << (a.z ? "1" : "0") << ")"; - } -} diff --git a/thirdparty/embree-aarch64/common/math/vec3fa.h b/thirdparty/embree-aarch64/common/math/vec3fa.h deleted file mode 100644 index 6163cfb596..0000000000 --- a/thirdparty/embree-aarch64/common/math/vec3fa.h +++ /dev/null @@ -1,810 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/alloc.h" -#include "math.h" -#include "../simd/sse.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// SSE Vec3fa Type - //////////////////////////////////////////////////////////////////////////////// - - struct __aligned(16) Vec3fa - { - ALIGNED_STRUCT_(16); - - typedef float Scalar; - enum { N = 3 }; - union { - __m128 m128; - struct { float x,y,z; }; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fa( ) {} - __forceinline Vec3fa( const __m128 a ) : m128(a) {} - - __forceinline Vec3fa ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); } - //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; } - - __forceinline Vec3fa ( const Vec3fa& other ) { m128 = other.m128; } - __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; } - - __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {} 
- __forceinline Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} - - __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} - - __forceinline explicit operator const vfloat4() const { return vfloat4(m128); } - __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); } - __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); } - __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } - - //__forceinline operator const __m128&() const { return m128; } - //__forceinline operator __m128&() { return m128; } - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline Vec3fa load( const void* const a ) { -#if defined(__aarch64__) - __m128 t = _mm_load_ps((float*)a); - t[3] = 0.0f; - return Vec3fa(t); -#else - return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); -#endif - } - - static __forceinline Vec3fa loadu( const void* const a ) { - return Vec3fa(_mm_loadu_ps((float*)a)); - } - - static __forceinline void storeu ( void* ptr, const Vec3fa& v ) { - _mm_storeu_ps((float*)ptr,v.m128); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fa( ZeroTy ) : m128(_mm_setzero_ps()) {} - __forceinline Vec3fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {} - __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} - __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - 
__forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } - __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; } - __forceinline Vec3fa operator -( const Vec3fa& a ) { -#if defined(__aarch64__) - return vnegq_f32(a.m128); -#else - const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - - return _mm_xor_ps(a.m128, mask); -#endif - } - __forceinline Vec3fa abs ( const Vec3fa& a ) { -#if defined(__aarch64__) - return _mm_abs_ps(a.m128); -#else - const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); - return _mm_and_ps(a.m128, mask); -#endif - } - __forceinline Vec3fa sign ( const Vec3fa& a ) { -#if defined(__aarch64__) - Vec3fa r = blendv_ps(vOne, vmOne, _mm_cmplt_ps (a.m128,vdupq_n_f32(0.0f))); - return r; -#else - return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128)); -#endif - } - - __forceinline Vec3fa rcp ( const Vec3fa& a ) - { -#if defined(__aarch64__) && defined(BUILD_IOS) - return vdivq_f32(vdupq_n_f32(1.0f),a.m128); -#elif defined(__aarch64__) - __m128 reciprocal = _mm_rcp_ps(a.m128); - reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); - return (const Vec3fa)reciprocal; -#else - -#if defined(__AVX512VL__) - const Vec3fa r = _mm_rcp14_ps(a.m128); -#else - const Vec3fa r = _mm_rcp_ps(a.m128); -#endif - -#if defined(__AVX2__) - const Vec3fa res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); -#else - const Vec3fa res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); - //return _mm_sub_ps(_mm_add_ps(r, r), 
_mm_mul_ps(_mm_mul_ps(r, r), a)); -#endif - - return res; -#endif //defined(__aarch64__) - } - - __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); } - __forceinline Vec3fa sqr ( const Vec3fa& a ) { return _mm_mul_ps(a.m128,a.m128); } - - __forceinline Vec3fa rsqrt( const Vec3fa& a ) - { -#if defined(__aarch64__) - __m128 r = _mm_rsqrt_ps(a.m128); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); - return r; -#else - -#if defined(__AVX512VL__) - __m128 r = _mm_rsqrt14_ps(a.m128); -#else - __m128 r = _mm_rsqrt_ps(a.m128); -#endif - return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); -#endif - } - - __forceinline Vec3fa zero_fix(const Vec3fa& a) { - return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); - } - __forceinline Vec3fa rcp_safe(const Vec3fa& a) { - return rcp(zero_fix(a)); - } - __forceinline Vec3fa log ( const Vec3fa& a ) { - return Vec3fa(logf(a.x),logf(a.y),logf(a.z)); - } - - __forceinline Vec3fa exp ( const Vec3fa& a ) { - return Vec3fa(expf(a.x),expf(a.y),expf(a.z)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); } - __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); } - __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); } - __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); } - __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; } - __forceinline Vec3fa operator /( const Vec3fa& a, const 
Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); } - __forceinline Vec3fa operator /( const Vec3fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } - __forceinline Vec3fa operator /( const float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } - - __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); } - __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); } - -#if defined(__aarch64__) || defined(__SSE4_1__) - __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) { - const vint4 ai = _mm_castps_si128(a.m128); - const vint4 bi = _mm_castps_si128(b.m128); - const vint4 ci = _mm_min_epi32(ai,bi); - return _mm_castsi128_ps(ci); - } -#endif - -#if defined(__aarch64__) || defined(__SSE4_1__) - __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) { - const vint4 ai = _mm_castps_si128(a.m128); - const vint4 bi = _mm_castps_si128(b.m128); - const vint4 ci = _mm_max_epi32(ai,bi); - return _mm_castsi128_ps(ci); - } -#endif - - __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) { - return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX2__) - __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } - __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } - __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } - __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } -#else - -#if defined(__aarch64__) - __forceinline Vec3fa madd ( const Vec3fa& a, 
const Vec3fa& b, const Vec3fa& c) { - return _mm_madd_ps(a.m128, b.m128, c.m128); //a*b+c; - } - __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { - return _mm_msub_ps(a.m128, b.m128, c.m128); //-a*b+c; - } - __forceinline Vec3fa nmsub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { - Vec3fa t = _mm_madd_ps(a.m128, b.m128, c.m128); - return -t; - } - __forceinline Vec3fa msub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { - return _mm_madd_ps(a.m128,b.m128,vnegq_f32(c.m128)); //a*b-c - } - -#else - __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; } - __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;} - __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; } - __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; } -#endif - -#endif - - __forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); } - __forceinline Vec3fa msub ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); } - __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); } - __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; } - __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; } - __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; } - __forceinline Vec3fa& operator *=( Vec3fa& a, const float b ) { return a = a * b; } - __forceinline Vec3fa& operator /=( 
Vec3fa& a, const Vec3fa& b ) { return a = a / b; } - __forceinline Vec3fa& operator /=( Vec3fa& a, const float b ) { return a = a / b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) && defined(BUILD_IOS) - __forceinline float reduce_add(const Vec3fa& v) { - float32x4_t t = v.m128; - t[3] = 0.0f; - return vaddvq_f32(t); - } - - __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } - __forceinline float reduce_min(const Vec3fa& v) { - float32x4_t t = v.m128; - t[3] = t[2]; - return vminvq_f32(t); - } - __forceinline float reduce_max(const Vec3fa& v) { - float32x4_t t = v.m128; - t[3] = t[2]; - return vmaxvq_f32(t); - } -#else - __forceinline float reduce_add(const Vec3fa& v) { - const vfloat4 a(v.m128); - const vfloat4 b = shuffle<1>(a); - const vfloat4 c = shuffle<2>(a); - return _mm_cvtss_f32(a+b+c); - } - - __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } - __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); } - __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } - __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } - - __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); } - __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } - __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& 
b ) { return _mm_cmplt_ps (a.m128, b.m128); } - __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); } - #if defined(__aarch64__) - __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); } - __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); } -#else - __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); } - __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); } -#endif - - __forceinline bool isvalid ( const Vec3fa& v ) { - return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE))); - } - - __forceinline bool is_finite ( const Vec3fa& a ) { - return all(ge_mask(a,Vec3fa(-FLT_MAX)) & le_mask(a,Vec3fa(+FLT_MAX))); - } - - __forceinline bool isvalid4 ( const Vec3fa& v ) { - return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE))); - } - - __forceinline bool is_finite4 ( const Vec3fa& a ) { - return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX))); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__SSE4_1__) - __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) { - return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); - } -#else - __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) { - return reduce_add(a*b); - } -#endif - - __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b ) - { - vfloat4 a0 = vfloat4(a.m128); - vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128)); - vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128)); - vfloat4 b1 = vfloat4(b.m128); - return Vec3fa(shuffle<1,2,0,3>(prod_diff(a0,b0,a1,b1))); - } - - __forceinline float sqr_length 
( const Vec3fa& a ) { return dot(a,a); } - __forceinline float rcp_length ( const Vec3fa& a ) { return rsqrt(dot(a,a)); } - __forceinline float rcp_length2( const Vec3fa& a ) { return rcp(dot(a,a)); } - __forceinline float length ( const Vec3fa& a ) { return sqrt(dot(a,a)); } - __forceinline Vec3fa normalize( const Vec3fa& a ) { return a*rsqrt(dot(a,a)); } - __forceinline float distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a-b); } - __forceinline float halfArea ( const Vec3fa& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } - __forceinline float area ( const Vec3fa& d ) { return 2.0f*halfArea(d); } - - __forceinline Vec3fa normalize_safe( const Vec3fa& a ) { - const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); - } - - /*! differentiated normalization */ - __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp) - { - const float pp = dot(p,p); - const float pdp = dot(p,dp); - return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) { - __m128 mask = s ? 
_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); - return blendv_ps(f.m128, t.m128, mask); - } - - __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) { - return blendv_ps(f.m128, t.m128, s); - } - - __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) { - return madd(1.0f-t,v0,t*v1); - } - - __forceinline int maxDim ( const Vec3fa& a ) - { - const Vec3fa b = abs(a); - if (b.x > b.y) { - if (b.x > b.z) return 0; else return 2; - } else { - if (b.y > b.z) return 1; else return 2; - } - } - - //////////////////////////////////////////////////////////////////////////////// - /// Rounding Functions - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) - __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); } - __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); } - __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); } -#elif defined (__SSE4_1__) - __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } - __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } - __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } -#else - __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); } - __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); } - __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) { - return 
cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; - } - - typedef Vec3fa Vec3fa_t; - - - //////////////////////////////////////////////////////////////////////////////// - /// SSE Vec3fx Type - //////////////////////////////////////////////////////////////////////////////// - - struct __aligned(16) Vec3fx - { - ALIGNED_STRUCT_(16); - - typedef float Scalar; - enum { N = 3 }; - union { - __m128 m128; - struct { float x,y,z; union { int a; unsigned u; float w; }; }; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fx( ) {} - __forceinline Vec3fx( const __m128 a ) : m128(a) {} - - __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {} - __forceinline operator Vec3fa () const { return Vec3fa(m128); } - - __forceinline explicit Vec3fx ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); } - //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; } - - __forceinline Vec3fx ( const Vec3fx& other ) { m128 = other.m128; } - - __forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; } - - __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {} - __forceinline Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} - - __forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; } - __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; } - __forceinline Vec3fx( const Vec3fa& other, const float w1) { -#if defined (__aarch64__) - m128 = other.m128; m128[3] = w1; -#elif defined (__SSE4_1__) - m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4); -#else - const vint4 mask(-1,-1,-1,0); - m128 = 
select(vboolf4(_mm_castsi128_ps(mask)),vfloat4(other.m128),vfloat4(w1)); -#endif - } - //__forceinline Vec3fx( const float x, const float y, const float z, const int a) : x(x), y(y), z(z), a(a) {} // not working properly! - //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly! - __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {} - - //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} - - __forceinline explicit operator const vfloat4() const { return vfloat4(m128); } - __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); } - __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); } - __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } - - //__forceinline operator const __m128&() const { return m128; } - //__forceinline operator __m128&() { return m128; } - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline Vec3fx load( const void* const a ) { - return Vec3fx(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); - } - - static __forceinline Vec3fx loadu( const void* const a ) { - return Vec3fx(_mm_loadu_ps((float*)a)); - } - - static __forceinline void storeu ( void* ptr, const Vec3fx& v ) { - _mm_storeu_ps((float*)ptr,v.m128); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fx( ZeroTy ) : m128(_mm_setzero_ps()) {} - __forceinline Vec3fx( OneTy ) : m128(_mm_set1_ps(1.0f)) {} - __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} - 
__forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } - __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; } - __forceinline Vec3fx operator -( const Vec3fx& a ) { - const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - return _mm_xor_ps(a.m128, mask); - } - __forceinline Vec3fx abs ( const Vec3fx& a ) { - const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); - return _mm_and_ps(a.m128, mask); - } - __forceinline Vec3fx sign ( const Vec3fx& a ) { - return blendv_ps(Vec3fx(one).m128, (-Vec3fx(one)).m128, _mm_cmplt_ps (a.m128,Vec3fx(zero).m128)); - } - - __forceinline Vec3fx rcp ( const Vec3fx& a ) - { -#if defined(__AVX512VL__) - const Vec3fx r = _mm_rcp14_ps(a.m128); -#else - const Vec3fx r = _mm_rcp_ps(a.m128); -#endif - -#if defined(__AVX2__) - const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); -#else - const Vec3fx res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); - //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); -#endif - - return res; - } - - __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.m128); } - __forceinline Vec3fx sqr ( const Vec3fx& a ) { return _mm_mul_ps(a.m128,a.m128); } - - __forceinline Vec3fx rsqrt( const Vec3fx& a ) - { -#if defined(__AVX512VL__) - __m128 r = _mm_rsqrt14_ps(a.m128); -#else - __m128 r = 
_mm_rsqrt_ps(a.m128); -#endif - return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); - } - - __forceinline Vec3fx zero_fix(const Vec3fx& a) { - return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); - } - __forceinline Vec3fx rcp_safe(const Vec3fx& a) { - return rcp(zero_fix(a)); - } - __forceinline Vec3fx log ( const Vec3fx& a ) { - return Vec3fx(logf(a.x),logf(a.y),logf(a.z)); - } - - __forceinline Vec3fx exp ( const Vec3fx& a ) { - return Vec3fx(expf(a.x),expf(a.y),expf(a.z)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.m128, b.m128); } - __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.m128, b.m128); } - __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.m128, b.m128); } - __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); } - __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; } - __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.m128,b.m128); } - __forceinline Vec3fx operator /( const Vec3fx& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } - __forceinline Vec3fx operator /( const float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } - - __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); } - __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); } - -#if defined(__SSE4_1__) || defined(__aarch64__) - __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) { - const 
vint4 ai = _mm_castps_si128(a.m128); - const vint4 bi = _mm_castps_si128(b.m128); - const vint4 ci = _mm_min_epi32(ai,bi); - return _mm_castsi128_ps(ci); - } -#endif - -#if defined(__SSE4_1__) || defined(__aarch64__) - __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) { - const vint4 ai = _mm_castps_si128(a.m128); - const vint4 bi = _mm_castps_si128(b.m128); - const vint4 ci = _mm_max_epi32(ai,bi); - return _mm_castsi128_ps(ci); - } -#endif - - __forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) { - return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX2__) - __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } - __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } - __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } - __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } -#else - __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; } - __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; } - __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b+c;} - __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b-c; } -#endif - - __forceinline Vec3fx madd ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); } - __forceinline Vec3fx msub ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(Vec3fx(a),b,c); } - __forceinline Vec3fx nmadd ( const float a, 
const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); } - __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; } - __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; } - __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; } - __forceinline Vec3fx& operator *=( Vec3fx& a, const float b ) { return a = a * b; } - __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; } - __forceinline Vec3fx& operator /=( Vec3fx& a, const float b ) { return a = a / b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline float reduce_add(const Vec3fx& v) { - const vfloat4 a(v.m128); - const vfloat4 b = shuffle<1>(a); - const vfloat4 c = shuffle<2>(a); - return _mm_cvtss_f32(a+b+c); - } - - __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; } - __forceinline float reduce_min(const Vec3fx& v) { return min(v.x,v.y,v.z); } - __forceinline float reduce_max(const Vec3fx& v) { return max(v.x,v.y,v.z); } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } - __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } - - __forceinline 
Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a.m128, b.m128); } - __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } - __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.m128, b.m128); } - __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.m128, b.m128); } - __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } - __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } - - __forceinline bool isvalid ( const Vec3fx& v ) { - return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE))); - } - - __forceinline bool is_finite ( const Vec3fx& a ) { - return all(ge_mask(a,Vec3fx(-FLT_MAX)) & le_mask(a,Vec3fx(+FLT_MAX))); - } - - __forceinline bool isvalid4 ( const Vec3fx& v ) { - return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE))); - } - - __forceinline bool is_finite4 ( const Vec3fx& a ) { - return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX))); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__SSE4_1__) - __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { - return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); - } -#else - __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { - return reduce_add(a*b); - } -#endif - - __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b ) - { - vfloat4 a0 = vfloat4(a.m128); - vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128)); - vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128)); - vfloat4 b1 = vfloat4(b.m128); - return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1))); - } - - __forceinline float 
sqr_length ( const Vec3fx& a ) { return dot(a,a); } - __forceinline float rcp_length ( const Vec3fx& a ) { return rsqrt(dot(a,a)); } - __forceinline float rcp_length2( const Vec3fx& a ) { return rcp(dot(a,a)); } - __forceinline float length ( const Vec3fx& a ) { return sqrt(dot(a,a)); } - __forceinline Vec3fx normalize( const Vec3fx& a ) { return a*rsqrt(dot(a,a)); } - __forceinline float distance ( const Vec3fx& a, const Vec3fx& b ) { return length(a-b); } - __forceinline float halfArea ( const Vec3fx& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } - __forceinline float area ( const Vec3fx& d ) { return 2.0f*halfArea(d); } - - __forceinline Vec3fx normalize_safe( const Vec3fx& a ) { - const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); - } - - /*! differentiated normalization */ - __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp) - { - const float pp = dot(p,p); - const float pdp = dot(p,dp); - return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) { - __m128 mask = s ? 
_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); - return blendv_ps(f.m128, t.m128, mask); - } - - __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) { - return blendv_ps(f.m128, t.m128, s); - } - - __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) { - return madd(1.0f-t,v0,t*v1); - } - - __forceinline int maxDim ( const Vec3fx& a ) - { - const Vec3fx b = abs(a); - if (b.x > b.y) { - if (b.x > b.z) return 0; else return 2; - } else { - if (b.y > b.z) return 1; else return 2; - } - } - - //////////////////////////////////////////////////////////////////////////////// - /// Rounding Functions - //////////////////////////////////////////////////////////////////////////////// - -#if defined (__SSE4_1__) && !defined(__aarch64__) - __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } - __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } - __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } -#else - __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); } - __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); } - __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (a.x),ceilf (a.y),ceilf (a.z)); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) { - return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; - } - - - typedef Vec3fx Vec3ff; -} diff --git a/thirdparty/embree-aarch64/common/math/vec3ia.h b/thirdparty/embree-aarch64/common/math/vec3ia.h deleted file mode 100644 index 
737f67fd72..0000000000 --- a/thirdparty/embree-aarch64/common/math/vec3ia.h +++ /dev/null @@ -1,210 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/alloc.h" -#include "math.h" -#include "../simd/sse.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// SSE Vec3ia Type - //////////////////////////////////////////////////////////////////////////////// - - struct __aligned(16) Vec3ia - { - ALIGNED_STRUCT_(16); - - union { - __m128i m128; - struct { int x,y,z; }; - }; - - typedef int Scalar; - enum { N = 3 }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ia( ) {} - __forceinline Vec3ia( const __m128i a ) : m128(a) {} - __forceinline Vec3ia( const Vec3ia& other ) : m128(other.m128) {} - __forceinline Vec3ia& operator =(const Vec3ia& other) { m128 = other.m128; return *this; } - - __forceinline explicit Vec3ia( const int a ) : m128(_mm_set1_epi32(a)) {} - __forceinline Vec3ia( const int x, const int y, const int z) : m128(_mm_set_epi32(z, z, y, x)) {} - __forceinline explicit Vec3ia( const __m128 a ) : m128(_mm_cvtps_epi32(a)) {} - - __forceinline operator const __m128i&() const { return m128; } - __forceinline operator __m128i&() { return m128; } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ia( ZeroTy ) : m128(_mm_setzero_si128()) {} - __forceinline Vec3ia( OneTy ) : m128(_mm_set1_epi32(1)) {} - __forceinline Vec3ia( PosInfTy ) : m128(_mm_set1_epi32(pos_inf)) {} - __forceinline Vec3ia( NegInfTy ) : m128(_mm_set1_epi32(neg_inf)) {} - - 
//////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } - __forceinline int& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } - }; - - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ia operator +( const Vec3ia& a ) { return a; } - __forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); } -#if (defined(__aarch64__)) - __forceinline Vec3ia abs ( const Vec3ia& a ) { return vabsq_s32(a.m128); } -#elif defined(__SSSE3__) - __forceinline Vec3ia abs ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ia operator +( const Vec3ia& a, const Vec3ia& b ) { return _mm_add_epi32(a.m128, b.m128); } - __forceinline Vec3ia operator +( const Vec3ia& a, const int b ) { return a+Vec3ia(b); } - __forceinline Vec3ia operator +( const int a, const Vec3ia& b ) { return Vec3ia(a)+b; } - - __forceinline Vec3ia operator -( const Vec3ia& a, const Vec3ia& b ) { return _mm_sub_epi32(a.m128, b.m128); } - __forceinline Vec3ia operator -( const Vec3ia& a, const int b ) { return a-Vec3ia(b); } - __forceinline Vec3ia operator -( const int a, const Vec3ia& b ) { return Vec3ia(a)-b; } - -#if defined(__aarch64__) || defined(__SSE4_1__) - __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); } - __forceinline Vec3ia operator *( const Vec3ia& a, const int b ) { 
return a * Vec3ia(b); } - __forceinline Vec3ia operator *( const int a, const Vec3ia& b ) { return Vec3ia(a) * b; } -#endif - - __forceinline Vec3ia operator &( const Vec3ia& a, const Vec3ia& b ) { return _mm_and_si128(a.m128, b.m128); } - __forceinline Vec3ia operator &( const Vec3ia& a, const int b ) { return a & Vec3ia(b); } - __forceinline Vec3ia operator &( const int a, const Vec3ia& b ) { return Vec3ia(a) & b; } - - __forceinline Vec3ia operator |( const Vec3ia& a, const Vec3ia& b ) { return _mm_or_si128(a.m128, b.m128); } - __forceinline Vec3ia operator |( const Vec3ia& a, const int b ) { return a | Vec3ia(b); } - __forceinline Vec3ia operator |( const int a, const Vec3ia& b ) { return Vec3ia(a) | b; } - - __forceinline Vec3ia operator ^( const Vec3ia& a, const Vec3ia& b ) { return _mm_xor_si128(a.m128, b.m128); } - __forceinline Vec3ia operator ^( const Vec3ia& a, const int b ) { return a ^ Vec3ia(b); } - __forceinline Vec3ia operator ^( const int a, const Vec3ia& b ) { return Vec3ia(a) ^ b; } - -#if !defined(__ARM_NEON) - __forceinline Vec3ia operator <<( const Vec3ia& a, const int n ) { return _mm_slli_epi32(a.m128, n); } - __forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) { return _mm_srai_epi32(a.m128, n); } - - __forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { return _mm_slli_epi32(a.m128, b); } - __forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { return _mm_srai_epi32(a.m128, b); } - __forceinline Vec3ia srl ( const Vec3ia& a, const int b ) { return _mm_srli_epi32(a.m128, b); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ia& operator +=( Vec3ia& a, const Vec3ia& b ) { return a = a + b; } - __forceinline Vec3ia& operator +=( Vec3ia& a, const int& b ) { return a = a + b; } - - __forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { 
return a = a - b; } - __forceinline Vec3ia& operator -=( Vec3ia& a, const int& b ) { return a = a - b; } - -#if defined(__aarch64__) || defined(__SSE4_1__) - __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; } - __forceinline Vec3ia& operator *=( Vec3ia& a, const int& b ) { return a = a * b; } -#endif - - __forceinline Vec3ia& operator &=( Vec3ia& a, const Vec3ia& b ) { return a = a & b; } - __forceinline Vec3ia& operator &=( Vec3ia& a, const int& b ) { return a = a & b; } - - __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; } - __forceinline Vec3ia& operator |=( Vec3ia& a, const int& b ) { return a = a | b; } - -#if !defined(__ARM_NEON) - __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; } - __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) - __forceinline int reduce_add(const Vec3ia& v) { - int32x4_t t = v.m128; - t[3] = 0; - return vaddvq_s32(t); - - } - __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } - __forceinline int reduce_min(const Vec3ia& v) { - int32x4_t t = (__m128i)blendv_ps((__m128)v0x7fffffff, (__m128)v.m128, (__m128)vFFF0); - return vminvq_s32(t); - - } - __forceinline int reduce_max(const Vec3ia& v) { - int32x4_t t = (__m128i)blendv_ps((__m128)v0x80000000, (__m128)v.m128, (__m128)vFFF0); - return vmaxvq_s32(t); - - } -#else - __forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; } - __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } - __forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); } - __forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); } -#endif - 
//////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator ==( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) == 7; } - __forceinline bool operator !=( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) != 7; } - __forceinline bool operator < ( const Vec3ia& a, const Vec3ia& b ) { - if (a.x != b.x) return a.x < b.x; - if (a.y != b.y) return a.y < b.y; - if (a.z != b.z) return a.z < b.z; - return false; - } - - __forceinline Vec3ba eq_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpeq_epi32 (a.m128, b.m128)); } - __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.m128, b.m128)); } - __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.m128, b.m128)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) { -#if defined(__aarch64__) || defined(__SSE4_1__) - return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); -#else - return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f)); -#endif - } - -#if defined(__aarch64__) || defined(__SSE4_1__) - __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); } - __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); } -#else - __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return select(lt_mask(a,b),a,b); } - 
__forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return select(gt_mask(a,b),a,b); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ia& a) { - return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; - } -} diff --git a/thirdparty/embree-aarch64/common/math/vec4.h b/thirdparty/embree-aarch64/common/math/vec4.h deleted file mode 100644 index d16542f507..0000000000 --- a/thirdparty/embree-aarch64/common/math/vec4.h +++ /dev/null @@ -1,258 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "math.h" -#include "vec3.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// Generic 4D vector Class - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> struct Vec4 - { - enum { N = 4 }; - union { - struct { T x, y, z, w; }; -#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler - T components[N]; -#endif - }; - - typedef T Scalar; - - //////////////////////////////////////////////////////////////////////////////// - /// Construction - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec4( ) {} - __forceinline explicit Vec4( const T& a ) : x(a), y(a), z(a), w(a) {} - __forceinline Vec4( const T& x, const T& y, const T& z, const T& w ) : x(x), y(y), z(z), w(w) {} - __forceinline Vec4( const Vec3<T>& xyz, const T& w ) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {} - - __forceinline Vec4( const Vec4& other ) { x = other.x; y = other.y; z = other.z; w = other.w; } - __forceinline Vec4( const Vec3fx& other ); - - template<typename T1> __forceinline Vec4( const Vec4<T1>& a ) : x(T(a.x)), 
y(T(a.y)), z(T(a.z)), w(T(a.w)) {} - template<typename T1> __forceinline Vec4& operator =(const Vec4<T1>& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; } - - __forceinline Vec4& operator =(const Vec4& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; } - - __forceinline operator Vec3<T> () const { return Vec3<T>(x,y,z); } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec4( ZeroTy ) : x(zero), y(zero), z(zero), w(zero) {} - __forceinline Vec4( OneTy ) : x(one), y(one), z(one), w(one) {} - __forceinline Vec4( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf), w(pos_inf) {} - __forceinline Vec4( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf), w(neg_inf) {} - -#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler - __forceinline const T& operator [](const size_t axis) const { assert(axis < 4); return (&x)[axis]; } - __forceinline T& operator [](const size_t axis) { assert(axis < 4); return (&x)[axis]; } -#else - __forceinline const T& operator [](const size_t axis ) const { assert(axis < 4); return components[axis]; } - __forceinline T& operator [](const size_t axis) { assert(axis < 4); return components[axis]; } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Swizzles - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3<T> xyz() const { return Vec3<T>(x, y, z); } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a ) { return Vec4<T>(+a.x, +a.y, +a.z, +a.w); } - template<typename T> __forceinline Vec4<T> operator -( 
const Vec4<T>& a ) { return Vec4<T>(-a.x, -a.y, -a.z, -a.w); } - template<typename T> __forceinline Vec4<T> abs ( const Vec4<T>& a ) { return Vec4<T>(abs (a.x), abs (a.y), abs (a.z), abs (a.w)); } - template<typename T> __forceinline Vec4<T> rcp ( const Vec4<T>& a ) { return Vec4<T>(rcp (a.x), rcp (a.y), rcp (a.z), rcp (a.w)); } - template<typename T> __forceinline Vec4<T> rsqrt ( const Vec4<T>& a ) { return Vec4<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z), rsqrt(a.w)); } - template<typename T> __forceinline Vec4<T> sqrt ( const Vec4<T>& a ) { return Vec4<T>(sqrt (a.x), sqrt (a.y), sqrt (a.z), sqrt (a.w)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } - template<typename T> __forceinline Vec4<T> operator -( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } - template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } - template<typename T> __forceinline Vec4<T> operator *( const T& a, const Vec4<T>& b ) { return Vec4<T>(a * b.x, a * b.y, a * b.z, a * b.w); } - template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const T& b ) { return Vec4<T>(a.x * b , a.y * b , a.z * b , a.w * b ); } - template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); } - template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const T& b ) { return Vec4<T>(a.x / b , a.y / b , a.z / b , a.w / b ); } - template<typename T> __forceinline Vec4<T> operator /( const T& a, const Vec4<T>& b ) { return Vec4<T>(a / b.x, a / b.y, a 
/ b.z, a / b.w); } - - template<typename T> __forceinline Vec4<T> min(const Vec4<T>& a, const Vec4<T>& b) { return Vec4<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); } - template<typename T> __forceinline Vec4<T> max(const Vec4<T>& a, const Vec4<T>& b) { return Vec4<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline Vec4<T> madd ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z), madd(a.w,b.w,c.w)); } - template<typename T> __forceinline Vec4<T> msub ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z), msub(a.w,b.w,c.w)); } - template<typename T> __forceinline Vec4<T> nmadd ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z),nmadd(a.w,b.w,c.w)); } - template<typename T> __forceinline Vec4<T> nmsub ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z),nmsub(a.w,b.w,c.w)); } - - template<typename T> __forceinline Vec4<T> madd ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z), madd(a,b.w,c.w)); } - template<typename T> __forceinline Vec4<T> msub ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z), msub(a,b.w,c.w)); } - template<typename T> __forceinline Vec4<T> nmadd ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z),nmadd(a,b.w,c.w)); } - template<typename T> __forceinline Vec4<T> nmsub ( const T& a, const 
Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z),nmsub(a,b.w,c.w)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline Vec4<T>& operator +=( Vec4<T>& a, const Vec4<T>& b ) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } - template<typename T> __forceinline Vec4<T>& operator -=( Vec4<T>& a, const Vec4<T>& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; } - template<typename T> __forceinline Vec4<T>& operator *=( Vec4<T>& a, const T& b ) { a.x *= b ; a.y *= b ; a.z *= b ; a.w *= b ; return a; } - template<typename T> __forceinline Vec4<T>& operator /=( Vec4<T>& a, const T& b ) { a.x /= b ; a.y /= b ; a.z /= b ; a.w /= b ; return a; } - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline T reduce_add( const Vec4<T>& a ) { return a.x + a.y + a.z + a.w; } - template<typename T> __forceinline T reduce_mul( const Vec4<T>& a ) { return a.x * a.y * a.z * a.w; } - template<typename T> __forceinline T reduce_min( const Vec4<T>& a ) { return min(a.x, a.y, a.z, a.w); } - template<typename T> __forceinline T reduce_max( const Vec4<T>& a ) { return max(a.x, a.y, a.z, a.w); } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline bool operator ==( const Vec4<T>& a, const Vec4<T>& b ) { return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; } - template<typename T> __forceinline bool operator !=( const Vec4<T>& a, const Vec4<T>& b ) { return a.x != b.x || a.y != b.y 
|| a.z != b.z || a.w != b.w; } - template<typename T> __forceinline bool operator < ( const Vec4<T>& a, const Vec4<T>& b ) { - if (a.x != b.x) return a.x < b.x; - if (a.y != b.y) return a.y < b.y; - if (a.z != b.z) return a.z < b.z; - if (a.w != b.w) return a.w < b.w; - return false; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Shift Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline Vec4<T> shift_right_1( const Vec4<T>& a ) { - return Vec4<T>(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z),shift_right_1(a.w)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline T dot ( const Vec4<T>& a, const Vec4<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,madd(a.z,b.z,a.w*b.w))); } - - template<typename T> __forceinline T length ( const Vec4<T>& a ) { return sqrt(dot(a,a)); } - template<typename T> __forceinline Vec4<T> normalize( const Vec4<T>& a ) { return a*rsqrt(dot(a,a)); } - template<typename T> __forceinline T distance ( const Vec4<T>& a, const Vec4<T>& b ) { return length(a-b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline Vec4<T> select ( bool s, const Vec4<T>& t, const Vec4<T>& f ) { - return Vec4<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w)); - } - - template<typename T> __forceinline Vec4<T> select ( const Vec4<bool>& s, const Vec4<T>& t, const Vec4<T>& f ) { - return Vec4<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z),select(s.w,t.w,f.w)); - } - - template<typename T> __forceinline Vec4<T> select ( const typename T::Bool& s, 
const Vec4<T>& t, const Vec4<T>& f ) { - return Vec4<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w)); - } - - template<typename T> - __forceinline Vec4<T> lerp(const Vec4<T>& v0, const Vec4<T>& v1, const T& t) { - return madd(Vec4<T>(T(1.0f)-t),v0,t*v1); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec4<T>& a) { - return cout << "(" << a.x << ", " << a.y << ", " << a.z << ", " << a.w << ")"; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Default template instantiations - //////////////////////////////////////////////////////////////////////////////// - - typedef Vec4<bool > Vec4b; - typedef Vec4<uint8_t > Vec4uc; - typedef Vec4<int > Vec4i; - typedef Vec4<float > Vec4f; -} - -#include "vec3ba.h" -#include "vec3ia.h" -#include "vec3fa.h" - -//////////////////////////////////////////////////////////////////////////////// -/// SSE / AVX / MIC specializations -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__SSE__) || defined(__ARM_NEON) -#include "../simd/sse.h" -#endif - -#if defined __AVX__ -#include "../simd/avx.h" -#endif - -#if defined __AVX512F__ -#include "../simd/avx512.h" -#endif - -namespace embree -{ - template<> __forceinline Vec4<float>::Vec4( const Vec3fx& a ) { x = a.x; y = a.y; z = a.z; w = a.w; } - -#if defined(__AVX__) - template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) { - x = a.x; y = a.y; z = a.z; w = a.w; - } -#elif defined(__SSE__) || defined(__ARM_NEON) - template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) { - const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v); - } -#endif - 
-#if defined(__SSE__) || defined(__ARM_NEON) - __forceinline Vec4<vfloat4> broadcast4f( const Vec4<vfloat4>& a, const size_t k ) { - return Vec4<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]), vfloat4::broadcast(&a.w[k])); - } -#endif - -#if defined(__AVX__) - template<> __forceinline Vec4<vfloat8>::Vec4( const Vec3fx& a ) { - x = a.x; y = a.y; z = a.z; w = a.w; - } - __forceinline Vec4<vfloat4> broadcast4f( const Vec4<vfloat8>& a, const size_t k ) { - return Vec4<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]), vfloat4::broadcast(&a.w[k])); - } - __forceinline Vec4<vfloat8> broadcast8f( const Vec4<vfloat4>& a, const size_t k ) { - return Vec4<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]), vfloat8::broadcast(&a.w[k])); - } - __forceinline Vec4<vfloat8> broadcast8f( const Vec4<vfloat8>& a, const size_t k ) { - return Vec4<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]), vfloat8::broadcast(&a.w[k])); - } -#endif - -#if defined(__AVX512F__) - template<> __forceinline Vec4<vfloat16>::Vec4( const Vec3fx& a ) : x(a.x), y(a.y), z(a.z), w(a.w) {} -#endif -} diff --git a/thirdparty/embree-aarch64/common/simd/avx.h b/thirdparty/embree-aarch64/common/simd/avx.h deleted file mode 100644 index c840e41805..0000000000 --- a/thirdparty/embree-aarch64/common/simd/avx.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "sse.h" - -#if defined(__AVX512VL__) -#include "vboolf8_avx512.h" -#include "vboold4_avx512.h" -#else -#include "vboolf8_avx.h" -#include "vboold4_avx.h" -#endif - -#if defined(__AVX2__) -#include "vint8_avx2.h" -#include "vuint8_avx2.h" -#if defined(__X86_64__) -#include "vllong4_avx2.h" -#endif -#else -#include "vint8_avx.h" -#include "vuint8_avx.h" -#endif -#include "vfloat8_avx.h" -#if 
defined(__X86_64__) -#include "vdouble4_avx.h" -#endif - -#if defined(__AVX512F__) -#include "avx512.h" -#endif - diff --git a/thirdparty/embree-aarch64/common/simd/avx512.h b/thirdparty/embree-aarch64/common/simd/avx512.h deleted file mode 100644 index 25414ab5b1..0000000000 --- a/thirdparty/embree-aarch64/common/simd/avx512.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" -#include "../sys/intrinsics.h" -#include "../math/constants.h" -#include "../sys/alloc.h" -#include "varying.h" - -#include "vboolf16_avx512.h" -#include "vint16_avx512.h" -#include "vuint16_avx512.h" -#include "vfloat16_avx512.h" - -#include "vboold8_avx512.h" -#include "vllong8_avx512.h" -#include "vdouble8_avx512.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// Prefetching - //////////////////////////////////////////////////////////////////////////////// - -#define PFHINT_L1 0 -#define PFHINT_L2 1 -#define PFHINT_NT 2 - - template<const unsigned int mode> - __forceinline void prefetch(const void * __restrict__ const m) - { - if (mode == PFHINT_L1) - _mm_prefetch((const char*)m,_MM_HINT_T0); - else if (mode == PFHINT_L2) - _mm_prefetch((const char*)m,_MM_HINT_T1); - else if (mode == PFHINT_NT) - _mm_prefetch((const char*)m,_MM_HINT_NTA); - } -} diff --git a/thirdparty/embree-aarch64/common/simd/simd.h b/thirdparty/embree-aarch64/common/simd/simd.h deleted file mode 100644 index 647851110b..0000000000 --- a/thirdparty/embree-aarch64/common/simd/simd.h +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../math/math.h" - -/* include SSE wrapper classes */ -#if defined(__SSE__) || defined(__ARM_NEON) -# include "sse.h" -#endif - -/* include AVX wrapper classes */ -#if defined(__AVX__) -# include "avx.h" -#endif - -/* include 
AVX512 wrapper classes */ -#if defined (__AVX512F__) -# include "avx512.h" -#endif - -namespace embree -{ - template <int N> - __forceinline vbool<N> isfinite(const vfloat<N>& v) - { - return (v >= vfloat<N>(-std::numeric_limits<float>::max())) - & (v <= vfloat<N>( std::numeric_limits<float>::max())); - } - - /* foreach unique */ - template<typename vbool, typename vint, typename Closure> - __forceinline void foreach_unique(const vbool& valid0, const vint& vi, const Closure& closure) - { - vbool valid1 = valid0; - while (any(valid1)) { - const int j = int(bsf(movemask(valid1))); - const int i = vi[j]; - const vbool valid2 = valid1 & (i == vi); - valid1 = andn(valid1, valid2); - closure(valid2, i); - } - } - - /* returns the next unique value i in vi and the corresponding valid_i mask */ - template<typename vbool, typename vint> - __forceinline int next_unique(vbool& valid, const vint& vi, /*out*/ vbool& valid_i) - { - assert(any(valid)); - const int j = int(bsf(movemask(valid))); - const int i = vi[j]; - valid_i = valid & (i == vi); - valid = andn(valid, valid_i); - return i; - } - - /* foreach unique index */ - template<typename vbool, typename vint, typename Closure> - __forceinline void foreach_unique_index(const vbool& valid0, const vint& vi, const Closure& closure) - { - vbool valid1 = valid0; - while (any(valid1)) { - const int j = int(bsf(movemask(valid1))); - const int i = vi[j]; - const vbool valid2 = valid1 & (i == vi); - valid1 = andn(valid1, valid2); - closure(valid2, i, j); - } - } - - /* returns the index of the next unique value i in vi and the corresponding valid_i mask */ - template<typename vbool, typename vint> - __forceinline int next_unique_index(vbool& valid, const vint& vi, /*out*/ vbool& valid_i) - { - assert(any(valid)); - const int j = int(bsf(movemask(valid))); - const int i = vi[j]; - valid_i = valid & (i == vi); - valid = andn(valid, valid_i); - return j; - } - - template<typename Closure> - __forceinline void foreach2(int x0, int x1, 
int y0, int y1, const Closure& closure) - { - __aligned(64) int U[2*VSIZEX]; - __aligned(64) int V[2*VSIZEX]; - int index = 0; - for (int y=y0; y<y1; y++) { - const bool lasty = y+1>=y1; - const vintx vy = y; - for (int x=x0; x<x1; ) { //x+=VSIZEX) { - const bool lastx = x+VSIZEX >= x1; - vintx vx = x+vintx(step); - vintx::storeu(&U[index], vx); - vintx::storeu(&V[index], vy); - const int dx = min(x1-x,VSIZEX); - index += dx; - x += dx; - if (index >= VSIZEX || (lastx && lasty)) { - const vboolx valid = vintx(step) < vintx(index); - closure(valid, vintx::load(U), vintx::load(V)); - x-= max(0, index-VSIZEX); - index = 0; - } - } - } - } -} diff --git a/thirdparty/embree-aarch64/common/simd/sse.cpp b/thirdparty/embree-aarch64/common/simd/sse.cpp deleted file mode 100644 index 1732cfa421..0000000000 --- a/thirdparty/embree-aarch64/common/simd/sse.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "sse.h" - -namespace embree -{ - const __m128 mm_lookupmask_ps[16] = { - _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0, 0)), - _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0,-1)), - _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1, 0)), - _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1,-1)), - _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0, 0)), - _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0,-1)), - _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1, 0)), - _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1,-1)), - _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)), - _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0,-1)), - _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1, 0)), - _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1,-1)), - _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0, 0)), - _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0,-1)), - _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1, 0)), - _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1,-1)) - }; - - const __m128d mm_lookupmask_pd[4] = { - _mm_castsi128_pd(_mm_set_epi32( 0, 0, 0, 0)), - _mm_castsi128_pd(_mm_set_epi32( 0, 0,-1,-1)), - 
_mm_castsi128_pd(_mm_set_epi32(-1,-1, 0, 0)), - _mm_castsi128_pd(_mm_set_epi32(-1,-1,-1,-1)) - }; - -} diff --git a/thirdparty/embree-aarch64/common/simd/sse.h b/thirdparty/embree-aarch64/common/simd/sse.h deleted file mode 100644 index 6bc818b55b..0000000000 --- a/thirdparty/embree-aarch64/common/simd/sse.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" -#include "../sys/intrinsics.h" -#include "../sys/alloc.h" -#include "../math/constants.h" -#include "varying.h" - -namespace embree -{ -#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__) - __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { - return _mm_blendv_ps(f,t,mask); - } -#else - __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { - return _mm_or_ps(_mm_and_ps(mask, t), _mm_andnot_ps(mask, f)); - } -#endif - - extern const __m128 mm_lookupmask_ps[16]; - extern const __m128d mm_lookupmask_pd[4]; -} - -#if defined(__AVX512VL__) -#include "vboolf4_avx512.h" -#else -#include "vboolf4_sse2.h" -#endif -#include "vint4_sse2.h" -#include "vuint4_sse2.h" -#include "vfloat4_sse2.h" diff --git a/thirdparty/embree-aarch64/common/simd/varying.h b/thirdparty/embree-aarch64/common/simd/varying.h deleted file mode 100644 index 9a46817da9..0000000000 --- a/thirdparty/embree-aarch64/common/simd/varying.h +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" - -namespace embree -{ - /* Varying numeric types */ - template<int N> - struct vfloat - { - union { float f[N]; int i[N]; }; - __forceinline const float& operator [](size_t index) const { assert(index < N); return f[index]; } - __forceinline float& operator [](size_t index) { assert(index < N); return f[index]; } - }; - - template<int N> - struct vdouble - { - union { double f[N]; long long i[N]; }; - 
__forceinline const double& operator [](size_t index) const { assert(index < N); return f[index]; } - __forceinline double& operator [](size_t index) { assert(index < N); return f[index]; } - }; - - template<int N> - struct vint - { - int i[N]; - __forceinline const int& operator [](size_t index) const { assert(index < N); return i[index]; } - __forceinline int& operator [](size_t index) { assert(index < N); return i[index]; } - }; - - template<int N> - struct vuint - { - unsigned int i[N]; - __forceinline const unsigned int& operator [](size_t index) const { assert(index < N); return i[index]; } - __forceinline unsigned int& operator [](size_t index) { assert(index < N); return i[index]; } - }; - - template<int N> - struct vllong - { - long long i[N]; - __forceinline const long long& operator [](size_t index) const { assert(index < N); return i[index]; } - __forceinline long long& operator [](size_t index) { assert(index < N); return i[index]; } - }; - - /* Varying bool types */ - template<int N> struct vboolf { int i[N]; }; // for float/int - template<int N> struct vboold { long long i[N]; }; // for double/long long - - /* Aliases to default types */ - template<int N> using vreal = vfloat<N>; - template<int N> using vbool = vboolf<N>; - - /* Varying size constants */ -#if defined(__AVX512VL__) // SKX - const int VSIZEX = 8; // default size - const int VSIZEL = 16; // large size -#elif defined(__AVX512F__) // KNL - const int VSIZEX = 16; - const int VSIZEL = 16; -#elif defined(__AVX__) - const int VSIZEX = 8; - const int VSIZEL = 8; -#else - const int VSIZEX = 4; - const int VSIZEL = 4; -#endif - - /* Extends varying size N to optimal or up to max(N, N2) */ - template<int N, int N2 = VSIZEX> - struct vextend - { -#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL - /* use 16-wide SIMD calculations on KNL even for 4 and 8 wide SIMD */ - static const int size = (N2 == VSIZEX) ? 
VSIZEX : N; - #define SIMD_MODE(N) N, 16 -#else - /* calculate with same SIMD width otherwise */ - static const int size = N; - #define SIMD_MODE(N) N, N -#endif - }; - - /* 4-wide shortcuts */ - typedef vfloat<4> vfloat4; - typedef vdouble<4> vdouble4; - typedef vreal<4> vreal4; - typedef vint<4> vint4; - typedef vuint<4> vuint4; - typedef vllong<4> vllong4; - typedef vbool<4> vbool4; - typedef vboolf<4> vboolf4; - typedef vboold<4> vboold4; - - /* 8-wide shortcuts */ - typedef vfloat<8> vfloat8; - typedef vdouble<8> vdouble8; - typedef vreal<8> vreal8; - typedef vint<8> vint8; - typedef vuint<8> vuint8; - typedef vllong<8> vllong8; - typedef vbool<8> vbool8; - typedef vboolf<8> vboolf8; - typedef vboold<8> vboold8; - - /* 16-wide shortcuts */ - typedef vfloat<16> vfloat16; - typedef vdouble<16> vdouble16; - typedef vreal<16> vreal16; - typedef vint<16> vint16; - typedef vuint<16> vuint16; - typedef vllong<16> vllong16; - typedef vbool<16> vbool16; - typedef vboolf<16> vboolf16; - typedef vboold<16> vboold16; - - /* Default shortcuts */ - typedef vfloat<VSIZEX> vfloatx; - typedef vdouble<VSIZEX> vdoublex; - typedef vreal<VSIZEX> vrealx; - typedef vint<VSIZEX> vintx; - typedef vuint<VSIZEX> vuintx; - typedef vllong<VSIZEX> vllongx; - typedef vbool<VSIZEX> vboolx; - typedef vboolf<VSIZEX> vboolfx; - typedef vboold<VSIZEX> vbooldx; -} diff --git a/thirdparty/embree-aarch64/common/simd/vboold4_avx.h b/thirdparty/embree-aarch64/common/simd/vboold4_avx.h deleted file mode 100644 index 6505ee56f3..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vboold4_avx.h +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 4-wide AVX bool type for 64bit data types*/ - template<> - struct vboold<4> - { - ALIGNED_STRUCT_(32); - - typedef vboold4 Bool; - - enum { size = 4 }; // number of SIMD elements - union { // data - __m256d v; - struct { __m128d vl,vh; }; - long long 
i[4]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold() {} - __forceinline vboold(const vboold4& a) { v = a.v; } - __forceinline vboold4& operator =(const vboold4& a) { v = a.v; return *this; } - - __forceinline vboold(__m256d a) : v(a) {} - __forceinline vboold(__m256i a) : v(_mm256_castsi256_pd(a)) {} - - __forceinline operator const __m256() const { return _mm256_castpd_ps(v); } - __forceinline operator const __m256i() const { return _mm256_castpd_si256(v); } - __forceinline operator const __m256d() const { return v; } - - __forceinline vboold(int a) - { - assert(a >= 0 && a <= 255); -#if defined (__AVX2__) - const __m256i mask = _mm256_set_epi64x(0x8, 0x4, 0x2, 0x1); - const __m256i b = _mm256_set1_epi64x(a); - const __m256i c = _mm256_and_si256(b,mask); - v = _mm256_castsi256_pd(_mm256_cmpeq_epi64(c,mask)); -#else - vl = mm_lookupmask_pd[a & 0x3]; - vh = mm_lookupmask_pd[a >> 2]; -#endif - } - - __forceinline vboold(__m128d a, __m128d b) : vl(a), vh(b) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold(FalseTy) : v(_mm256_setzero_pd()) {} -#if !defined(__aarch64__) - __forceinline vboold(TrueTy) : v(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ)) {} -#else - __forceinline vboold(TrueTy) : v(_mm256_cmpeq_pd(_mm256_setzero_pd(), _mm256_setzero_pd())) {} -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm256_movemask_pd(v) >> index) & 1; } - __forceinline long 
long& operator [](size_t index) { assert(index < 4); return i[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold4 operator !(const vboold4& a) { return _mm256_xor_pd(a, vboold4(embree::True)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm256_and_pd(a, b); } - __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm256_or_pd (a, b); } - __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); } - - __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm256_andnot_pd(b, a); } - - __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; } - __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; } - __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); } - __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(_mm256_xor_pd(a,b),vboold4(embree::True)); } - - __forceinline vboold4 select(const vboold4& mask, const vboold4& t, const vboold4& f) { - return _mm256_blendv_pd(f, t, mask); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - 
//////////////////////////////////////////////////////////////////////////////// - -#if !defined(__aarch64__) - __forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a, b); } - __forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a, b); } -#endif - -#if defined(__AVX2__) - template<int i0, int i1, int i2, int i3> - __forceinline vboold4 shuffle(const vboold4& v) { - return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i3, i2, i1, i0)); - } - - template<int i> - __forceinline vboold4 shuffle(const vboold4& v) { - return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i, i, i, i)); - } -#endif - - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool reduce_and(const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; } - __forceinline bool reduce_or (const vboold4& a) { return !_mm256_testz_pd(a,a); } - - __forceinline bool all (const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; } - __forceinline bool any (const vboold4& a) { return !_mm256_testz_pd(a,a); } - __forceinline bool none(const vboold4& a) { return _mm256_testz_pd(a,a) != 0; } - - __forceinline bool all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); } - __forceinline bool any (const vboold4& valid, const vboold4& b) { return any(valid & b); } - __forceinline bool none(const vboold4& valid, const vboold4& b) { return none(valid & b); } - - __forceinline unsigned int movemask(const vboold4& a) { return _mm256_movemask_pd(a); } - __forceinline size_t popcnt (const vboold4& a) { return popcnt((size_t)_mm256_movemask_pd(a)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Get/Set Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline 
bool get(const vboold4& a, size_t index) { return a[index]; } - __forceinline void set (vboold4& a, size_t index) { a[index] = -1; } - __forceinline void clear(vboold4& a, size_t index) { a[index] = 0; } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " - << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vboold4_avx512.h b/thirdparty/embree-aarch64/common/simd/vboold4_avx512.h deleted file mode 100644 index 4fe730d713..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vboold4_avx512.h +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 4-wide AVX-512 bool type */ - template<> - struct vboold<4> - { - typedef vboold4 Bool; - typedef vint4 Int; - - enum { size = 4 }; // number of SIMD elements - __mmask8 v; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold() {} - __forceinline vboold(const vboold4& t) { v = t.v; } - __forceinline vboold4& operator =(const vboold4& f) { v = f.v; return *this; } - - __forceinline vboold(const __mmask8 &t) { v = t; } - __forceinline operator __mmask8() const { return v; } - - __forceinline vboold(bool b) { v = b ? 
0xf : 0x0; } - __forceinline vboold(int t) { v = (__mmask8)t; } - __forceinline vboold(unsigned int t) { v = (__mmask8)t; } - - /* return int8 mask */ - __forceinline __m128i mask8() const { - return _mm_movm_epi8(v); - } - - /* return int32 mask */ - __forceinline __m128i mask32() const { - return _mm_movm_epi32(v); - } - - /* return int64 mask */ - __forceinline __m256i mask64() const { - return _mm256_movm_epi64(v); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold(FalseTy) : v(0x0) {} - __forceinline vboold(TrueTy) : v(0xf) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator [](size_t index) const { - assert(index < 4); return (mm512_mask2int(v) >> index) & 1; - } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold4 operator !(const vboold4& a) { return _mm512_kandn(a, 0xf); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm512_kand(a, b); } - __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm512_kor(a, b); } - __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); } - - __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm512_kandn(b, a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - 
//////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; } - __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; } - __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); } - __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } - - __forceinline vboold4 select(const vboold4& s, const vboold4& a, const vboold4& b) { - return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline int all (const vboold4& a) { return a.v == 0xf; } - __forceinline int any (const vboold4& a) { return _mm512_kortestz(a, a) == 0; } - __forceinline int none(const vboold4& a) { return _mm512_kortestz(a, a) != 0; } - - __forceinline int all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); } - __forceinline int any (const vboold4& valid, const vboold4& b) { return any(valid & b); } - __forceinline int none(const vboold4& valid, const vboold4& b) { return none(valid & b); } - - __forceinline size_t movemask(const vboold4& a) { return _mm512_kmov(a); } - __forceinline size_t popcnt (const vboold4& a) { return popcnt(a.v); } - - //////////////////////////////////////////////////////////////////////////////// - /// Conversion Operations - //////////////////////////////////////////////////////////////////////////////// - - 
__forceinline unsigned int toInt(const vboold4& a) { return mm512_mask2int(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Get/Set Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool get(const vboold4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; } - __forceinline void set(vboold4& a, size_t index) { assert(index < 4); a |= 1 << index; } - __forceinline void clear(vboold4& a, size_t index) { assert(index < 4); a = andn(a, 1 << index); } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a) - { - cout << "<"; - for (size_t i=0; i<4; i++) { - if ((a.v >> i) & 1) cout << "1"; else cout << "0"; - } - return cout << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vboold8_avx512.h b/thirdparty/embree-aarch64/common/simd/vboold8_avx512.h deleted file mode 100644 index fdf3f00de5..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vboold8_avx512.h +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX-512 bool type */ - template<> - struct vboold<8> - { - typedef vboold8 Bool; - typedef vint8 Int; - - enum { size = 8 }; // number of SIMD elements - __mmask8 v; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold() {} - __forceinline vboold(const vboold8& t) { v = t.v; } - __forceinline vboold8& operator =(const vboold8& f) { v = f.v; return *this; } - - __forceinline vboold(const __mmask8& t) { v = 
t; } - __forceinline operator __mmask8() const { return v; } - - __forceinline vboold(bool b) { v = b ? 0xff : 0x00; } - __forceinline vboold(int t) { v = (__mmask8)t; } - __forceinline vboold(unsigned int t) { v = (__mmask8)t; } - - /* return int8 mask */ - __forceinline __m128i mask8() const { -#if defined(__AVX512BW__) - return _mm_movm_epi8(v); -#else - const __m512i f = _mm512_set1_epi64(0); - const __m512i t = _mm512_set1_epi64(-1); - const __m512i m = _mm512_mask_or_epi64(f,v,t,t); - return _mm512_cvtepi64_epi8(m); -#endif - } - - /* return int64 mask */ - __forceinline __m512i mask64() const { -#if defined(__AVX512DQ__) - return _mm512_movm_epi64(v); -#else - const __m512i f = _mm512_set1_epi64(0); - const __m512i t = _mm512_set1_epi64(-1); - return _mm512_mask_or_epi64(f,v,t,t); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold(FalseTy) : v(0x00) {} - __forceinline vboold(TrueTy) : v(0xff) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator [](size_t index) const { - assert(index < 8); return (mm512_mask2int(v) >> index) & 1; - } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold8 operator !(const vboold8& a) { return _mm512_knot(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold8 operator &(const vboold8& a, const vboold8& b) { return _mm512_kand(a, b); } - __forceinline vboold8 operator 
|(const vboold8& a, const vboold8& b) { return _mm512_kor(a, b); } - __forceinline vboold8 operator ^(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); } - - __forceinline vboold8 andn(const vboold8& a, const vboold8& b) { return _mm512_kandn(b, a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold8& operator &=(vboold8& a, const vboold8& b) { return a = a & b; } - __forceinline vboold8& operator |=(vboold8& a, const vboold8& b) { return a = a | b; } - __forceinline vboold8& operator ^=(vboold8& a, const vboold8& b) { return a = a ^ b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold8 operator !=(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); } - __forceinline vboold8 operator ==(const vboold8& a, const vboold8& b) { return _mm512_kxnor(a, b); } - - __forceinline vboold8 select(const vboold8& s, const vboold8& a, const vboold8& b) { - return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline int all (const vboold8& a) { return a.v == 0xff; } - __forceinline int any (const vboold8& a) { return _mm512_kortestz(a, a) == 0; } - __forceinline int none(const vboold8& a) { return _mm512_kortestz(a, a) != 0; } - - __forceinline int all (const vboold8& valid, const vboold8& b) { return all((!valid) | b); } - __forceinline int any (const vboold8& valid, const vboold8& b) { return any(valid & b); } - __forceinline int none(const vboold8& valid, const vboold8& b) { return none(valid & 
b); } - - __forceinline size_t movemask(const vboold8& a) { return _mm512_kmov(a); } - __forceinline size_t popcnt (const vboold8& a) { return popcnt(a.v); } - - //////////////////////////////////////////////////////////////////////////////// - /// Conversion Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline unsigned int toInt(const vboold8& a) { return mm512_mask2int(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Get/Set Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool get(const vboold8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; } - __forceinline void set(vboold8& a, size_t index) { assert(index < 8); a |= 1 << index; } - __forceinline void clear(vboold8& a, size_t index) { assert(index < 8); a = andn(a, 1 << index); } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vboold8& a) - { - cout << "<"; - for (size_t i=0; i<8; i++) { - if ((a.v >> i) & 1) cout << "1"; else cout << "0"; - } - return cout << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h b/thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h deleted file mode 100644 index 238cdc8eb9..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 16-wide AVX-512 bool type */ - template<> - struct vboolf<16> - { - typedef vboolf16 Bool; - typedef vint16 Int; - typedef vfloat16 Float; - - enum { size = 16 }; // number of SIMD elements - __mmask16 v; // data - - 
//////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf() {} - __forceinline vboolf(const vboolf16& t) { v = t.v; } - __forceinline vboolf16& operator =(const vboolf16& f) { v = f.v; return *this; } - - __forceinline vboolf(const __mmask16& t) { v = t; } - __forceinline operator __mmask16() const { return v; } - - __forceinline vboolf(bool b) { v = b ? 0xFFFF : 0x0000; } - __forceinline vboolf(int t) { v = (__mmask16)t; } - __forceinline vboolf(unsigned int t) { v = (__mmask16)t; } - - /* return int8 mask */ - __forceinline __m128i mask8() const { -#if defined(__AVX512BW__) - return _mm_movm_epi8(v); -#else - const __m512i f = _mm512_set1_epi32(0); - const __m512i t = _mm512_set1_epi32(-1); - const __m512i m = _mm512_mask_or_epi32(f,v,t,t); - return _mm512_cvtepi32_epi8(m); -#endif - } - - /* return int32 mask */ - __forceinline __m512i mask32() const { -#if defined(__AVX512DQ__) - return _mm512_movm_epi32(v); -#else - const __m512i f = _mm512_set1_epi32(0); - const __m512i t = _mm512_set1_epi32(-1); - return _mm512_mask_or_epi32(f,v,t,t); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf(FalseTy) : v(0x0000) {} - __forceinline vboolf(TrueTy) : v(0xffff) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator [](size_t index) const { - assert(index < 16); return (mm512_mask2int(v) >> index) & 1; - } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - 
//////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf16 operator !(const vboolf16& a) { return _mm512_knot(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf16 operator &(const vboolf16& a, const vboolf16& b) { return _mm512_kand(a,b); } - __forceinline vboolf16 operator |(const vboolf16& a, const vboolf16& b) { return _mm512_kor(a,b); } - __forceinline vboolf16 operator ^(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a,b); } - - __forceinline vboolf16 andn(const vboolf16& a, const vboolf16& b) { return _mm512_kandn(b,a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf16& operator &=(vboolf16& a, const vboolf16& b) { return a = a & b; } - __forceinline vboolf16& operator |=(vboolf16& a, const vboolf16& b) { return a = a | b; } - __forceinline vboolf16& operator ^=(vboolf16& a, const vboolf16& b) { return a = a ^ b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf16 operator !=(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a, b); } - __forceinline vboolf16 operator ==(const vboolf16& a, const vboolf16& b) { return _mm512_kxnor(a, b); } - - __forceinline vboolf16 select(const vboolf16& s, const vboolf16& a, const vboolf16& b) { - return _mm512_kor(_mm512_kand(s,a),_mm512_kandn(s,b)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operations - 
//////////////////////////////////////////////////////////////////////////////// - - __forceinline int all (const vboolf16& a) { return _mm512_kortestc(a,a) != 0; } - __forceinline int any (const vboolf16& a) { return _mm512_kortestz(a,a) == 0; } - __forceinline int none(const vboolf16& a) { return _mm512_kortestz(a,a) != 0; } - - __forceinline int all (const vboolf16& valid, const vboolf16& b) { return all((!valid) | b); } - __forceinline int any (const vboolf16& valid, const vboolf16& b) { return any(valid & b); } - __forceinline int none(const vboolf16& valid, const vboolf16& b) { return none(valid & b); } - - __forceinline size_t movemask(const vboolf16& a) { return _mm512_kmov(a); } - __forceinline size_t popcnt (const vboolf16& a) { return popcnt(a.v); } - - //////////////////////////////////////////////////////////////////////////////// - /// Convertion Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline unsigned int toInt (const vboolf16& a) { return mm512_mask2int(a); } - __forceinline vboolf16 toMask(const int& a) { return mm512_int2mask(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Get/Set Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool get(const vboolf16& a, size_t index) { assert(index < 16); return (toInt(a) >> index) & 1; } - __forceinline void set(vboolf16& a, size_t index) { assert(index < 16); a |= 1 << index; } - __forceinline void clear(vboolf16& a, size_t index) { assert(index < 16); a = andn(a, 1 << index); } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf16& a) - { - cout << "<"; - for (size_t i=0; i<16; i++) { - if ((a.v >> i) & 1) cout << "1"; 
else cout << "0"; - } - return cout << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h b/thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h deleted file mode 100644 index 2ae4c4470e..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 4-wide AVX-512 bool type */ - template<> - struct vboolf<4> - { - typedef vboolf4 Bool; - typedef vint4 Int; - - enum { size = 4 }; // number of SIMD elements - __mmask8 v; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf() {} - __forceinline vboolf(const vboolf4& t) { v = t.v; } - __forceinline vboolf4& operator =(const vboolf4& f) { v = f.v; return *this; } - - __forceinline vboolf(const __mmask8 &t) { v = t; } - __forceinline operator __mmask8() const { return v; } - - __forceinline vboolf(bool b) { v = b ? 
0xf : 0x0; } - __forceinline vboolf(int t) { v = (__mmask8)t; } - __forceinline vboolf(unsigned int t) { v = (__mmask8)t; } - - __forceinline vboolf(bool a, bool b, bool c, bool d) - : v((__mmask8)((int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {} - - /* return int8 mask */ - __forceinline __m128i mask8() const { - return _mm_movm_epi8(v); - } - - /* return int32 mask */ - __forceinline __m128i mask32() const { - return _mm_movm_epi32(v); - } - - /* return int64 mask */ - __forceinline __m256i mask64() const { - return _mm256_movm_epi64(v); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf(FalseTy) : v(0x0) {} - __forceinline vboolf(TrueTy) : v(0xf) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator [](size_t index) const { - assert(index < 4); return (mm512_mask2int(v) >> index) & 1; - } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf4 operator !(const vboolf4& a) { return _mm512_kandn(a, 0xf); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm512_kand(a, b); } - __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm512_kor(a, b); } - __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); } - - __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return 
_mm512_kandn(b, a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; } - __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; } - __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); } - __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } - - __forceinline vboolf4 select(const vboolf4& s, const vboolf4& a, const vboolf4& b) { - return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline int all (const vboolf4& a) { return a.v == 0xf; } - __forceinline int any (const vboolf4& a) { return _mm512_kortestz(a, a) == 0; } - __forceinline int none(const vboolf4& a) { return _mm512_kortestz(a, a) != 0; } - - __forceinline int all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); } - __forceinline int any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); } - __forceinline int none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); } - - __forceinline size_t movemask(const vboolf4& a) { return _mm512_kmov(a); } - __forceinline size_t popcnt (const vboolf4& a) { return popcnt(a.v); } - - 
//////////////////////////////////////////////////////////////////////////////// - /// Conversion Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline unsigned int toInt(const vboolf4& a) { return mm512_mask2int(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Get/Set Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool get(const vboolf4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; } - __forceinline void set(vboolf4& a, size_t index) { assert(index < 4); a |= 1 << index; } - __forceinline void clear(vboolf4& a, size_t index) { assert(index < 4); a = andn(a, 1 << index); } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) - { - cout << "<"; - for (size_t i=0; i<4; i++) { - if ((a.v >> i) & 1) cout << "1"; else cout << "0"; - } - return cout << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h b/thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h deleted file mode 100644 index ed53b3c783..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 4-wide SSE bool type */ - template<> - struct vboolf<4> - { - ALIGNED_STRUCT_(16); - - typedef vboolf4 Bool; - typedef vint4 Int; - typedef vfloat4 Float; - - enum { size = 4 }; // number of SIMD elements - union { __m128 v; int i[4]; }; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - 
//////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf() {} - __forceinline vboolf(const vboolf4& other) { v = other.v; } - __forceinline vboolf4& operator =(const vboolf4& other) { v = other.v; return *this; } - - __forceinline vboolf(__m128 input) : v(input) {} - __forceinline operator const __m128&() const { return v; } - __forceinline operator const __m128i() const { return _mm_castps_si128(v); } - __forceinline operator const __m128d() const { return _mm_castps_pd(v); } - - __forceinline vboolf(bool a) - : v(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} - __forceinline vboolf(bool a, bool b) - : v(mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {} - __forceinline vboolf(bool a, bool b, bool c, bool d) - : v(mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {} -#if defined(__aarch64__) && defined(BUILD_IOS) - __forceinline vboolf(int mask) { v = mm_lookupmask_ps[mask]; } - __forceinline vboolf(unsigned int mask) { v = mm_lookupmask_ps[mask]; } -#else - __forceinline vboolf(int mask) { assert(mask >= 0 && mask < 16); v = mm_lookupmask_ps[mask]; } - __forceinline vboolf(unsigned int mask) { assert(mask < 16); v = mm_lookupmask_ps[mask]; } -#endif - /* return int32 mask */ - __forceinline __m128i mask32() const { - return _mm_castps_si128(v); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf(FalseTy) : v(_mm_setzero_ps()) {} - __forceinline vboolf(TrueTy) : v(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - -#if 
defined(__aarch64__) && defined(BUILD_IOS) - __forceinline bool operator [](size_t index) const { return (_mm_movemask_ps(v) >> index) & 1; } - __forceinline int& operator [](size_t index) { return i[index]; } -#else - __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm_movemask_ps(v) >> index) & 1; } - __forceinline int& operator [](size_t index) { assert(index < 4); return i[index]; } -#endif - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf4 operator !(const vboolf4& a) { return _mm_xor_ps(a, vboolf4(embree::True)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm_and_ps(a, b); } - __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm_or_ps (a, b); } - __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); } - - __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm_andnot_ps(b, a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; } - __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; } - __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - 
__forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); } - __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } - - __forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) { -#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__) - return _mm_blendv_ps(f, t, m); -#else - return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a, b); } - __forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a, b); } - -#if defined(__aarch64__) - template<int i0, int i1, int i2, int i3> - __forceinline vboolf4 shuffle(const vboolf4& v) { - return vreinterpretq_f32_u8(vqtbl1q_u8( vreinterpretq_u8_s32(v), _MN_SHUFFLE(i0, i1, i2, i3))); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) { - return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); - } -#else - template<int i0, int i1, int i2, int i3> - __forceinline vboolf4 shuffle(const vboolf4& v) { - return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) { - return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); - } -#endif - - template<int i0> - __forceinline vboolf4 shuffle(const vboolf4& v) { - return shuffle<i0,i0,i0,i0>(v); - } - -#if defined(__SSE3__) - template<> __forceinline vboolf4 shuffle<0, 0, 2, 2>(const vboolf4& v) { return _mm_moveldup_ps(v); } - 
template<> __forceinline vboolf4 shuffle<1, 1, 3, 3>(const vboolf4& v) { return _mm_movehdup_ps(v); } - template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v)); } -#endif - -#if defined(__SSE4_1__) && !defined(__aarch64__) - template<int dst, int src, int clr> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } - template<int dst, int src> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return insert<dst, src, 0>(a, b); } - template<int dst> __forceinline vboolf4 insert(const vboolf4& a, const bool b) { return insert<dst, 0>(a, vboolf4(b)); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool reduce_and(const vboolf4& a) { return _mm_movemask_ps(a) == 0xf; } - __forceinline bool reduce_or (const vboolf4& a) { return _mm_movemask_ps(a) != 0x0; } - - __forceinline bool all (const vboolf4& b) { return _mm_movemask_ps(b) == 0xf; } - __forceinline bool any (const vboolf4& b) { return _mm_movemask_ps(b) != 0x0; } - __forceinline bool none(const vboolf4& b) { return _mm_movemask_ps(b) == 0x0; } - - __forceinline bool all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); } - __forceinline bool any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); } - __forceinline bool none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); } - - __forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a); } -#if defined(__aarch64__) && defined(BUILD_IOS) -__forceinline size_t popcnt(const vboolf4& a) { return _mm_movemask_popcnt_ps(a); } -#else -#if defined(__SSE4_2__) - __forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a)); } -#else - __forceinline size_t 
popcnt(const vboolf4& a) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); } -#endif -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Get/Set Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool get(const vboolf4& a, size_t index) { return a[index]; } - __forceinline void set(vboolf4& a, size_t index) { a[index] = -1; } - __forceinline void clear(vboolf4& a, size_t index) { a[index] = 0; } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vboolf8_avx.h b/thirdparty/embree-aarch64/common/simd/vboolf8_avx.h deleted file mode 100644 index 4f64741b55..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vboolf8_avx.h +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX bool type */ - template<> - struct vboolf<8> - { - ALIGNED_STRUCT_(32); - - typedef vboolf8 Bool; - typedef vint8 Int; - typedef vfloat8 Float; - - enum { size = 8 }; // number of SIMD elements - union { // data - __m256 v; - struct { __m128 vl,vh; }; - int i[8]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf() {} - __forceinline vboolf(const vboolf8& a) { v = a.v; } - __forceinline vboolf8& operator =(const vboolf8& a) { v = a.v; return *this; } - - __forceinline vboolf(__m256 a) : v(a) {} - __forceinline operator 
const __m256&() const { return v; } - __forceinline operator const __m256i() const { return _mm256_castps_si256(v); } - __forceinline operator const __m256d() const { return _mm256_castps_pd(v); } - - __forceinline vboolf(int a) - { - assert(a >= 0 && a <= 255); -#if defined (__AVX2__) - const __m256i mask = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1); - const __m256i b = _mm256_set1_epi32(a); - const __m256i c = _mm256_and_si256(b,mask); - v = _mm256_castsi256_ps(_mm256_cmpeq_epi32(c,mask)); -#else - vl = mm_lookupmask_ps[a & 0xF]; - vh = mm_lookupmask_ps[a >> 4]; -#endif - } - - __forceinline vboolf(const vboolf4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {} - __forceinline vboolf(const vboolf4& a, const vboolf4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} - __forceinline vboolf(__m128 a, __m128 b) : vl(a), vh(b) {} - - __forceinline vboolf(bool a) : v(vboolf8(vboolf4(a), vboolf4(a))) {} - __forceinline vboolf(bool a, bool b) : v(vboolf8(vboolf4(a), vboolf4(b))) {} - __forceinline vboolf(bool a, bool b, bool c, bool d) : v(vboolf8(vboolf4(a,b), vboolf4(c,d))) {} - __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) : v(vboolf8(vboolf4(a,b,c,d), vboolf4(e,f,g,h))) {} - - /* return int32 mask */ - __forceinline __m256i mask32() const { - return _mm256_castps_si256(v); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf(FalseTy) : v(_mm256_setzero_ps()) {} -#if !defined(__aarch64__) - __forceinline vboolf(TrueTy) : v(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_OQ)) {} -#else - __forceinline vboolf(TrueTy) : v(_mm256_cmpeq_ps(_mm256_setzero_ps(), _mm256_setzero_ps())) {} -#endif - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - 
//////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator [](size_t index) const { assert(index < 8); return (_mm256_movemask_ps(v) >> index) & 1; } - __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 operator !(const vboolf8& a) { return _mm256_xor_ps(a, vboolf8(embree::True)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm256_and_ps(a, b); } - __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm256_or_ps (a, b); } - __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); } - - __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm256_andnot_ps(b, a); } - - __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; } - __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; } - __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); } - __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(_mm256_xor_ps(a,b),vboolf8(embree::True)); } - - __forceinline vboolf8 select(const vboolf8& mask, const vboolf8& t, const 
vboolf8& f) { - return _mm256_blendv_ps(f, t, mask); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 unpacklo(const vboolf8& a, const vboolf8& b) { return _mm256_unpacklo_ps(a, b); } - __forceinline vboolf8 unpackhi(const vboolf8& a, const vboolf8& b) { return _mm256_unpackhi_ps(a, b); } - - template<int i> - __forceinline vboolf8 shuffle(const vboolf8& v) { - return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); - } - - template<int i0, int i1> - __forceinline vboolf8 shuffle4(const vboolf8& v) { - return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0)); - } - - template<int i0, int i1> - __forceinline vboolf8 shuffle4(const vboolf8& a, const vboolf8& b) { - return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0)); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vboolf8 shuffle(const vboolf8& v) { - return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vboolf8 shuffle(const vboolf8& a, const vboolf8& b) { - return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); - } - - template<> __forceinline vboolf8 shuffle<0, 0, 2, 2>(const vboolf8& v) { return _mm256_moveldup_ps(v); } - template<> __forceinline vboolf8 shuffle<1, 1, 3, 3>(const vboolf8& v) { return _mm256_movehdup_ps(v); } - template<> __forceinline vboolf8 shuffle<0, 1, 0, 1>(const vboolf8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } - - template<int i> __forceinline vboolf8 insert4(const vboolf8& a, const vboolf4& b) { return _mm256_insertf128_ps(a, b, i); } - template<int i> __forceinline vboolf4 extract4 (const vboolf8& a) { return _mm256_extractf128_ps(a, i); } - template<> __forceinline vboolf4 extract4<0>(const vboolf8& a) { return _mm256_castps256_ps128(a); } - - 
//////////////////////////////////////////////////////////////////////////////// - /// Reduction Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool reduce_and(const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; } - __forceinline bool reduce_or (const vboolf8& a) { return !_mm256_testz_ps(a,a); } - - __forceinline bool all (const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; } - __forceinline bool any (const vboolf8& a) { return !_mm256_testz_ps(a,a); } - __forceinline bool none(const vboolf8& a) { return _mm256_testz_ps(a,a) != 0; } - - __forceinline bool all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); } - __forceinline bool any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); } - __forceinline bool none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); } - - __forceinline unsigned int movemask(const vboolf8& a) { return _mm256_movemask_ps(a); } - __forceinline size_t popcnt (const vboolf8& a) { return popcnt((size_t)_mm256_movemask_ps(a)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Get/Set Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool get(const vboolf8& a, size_t index) { return a[index]; } - __forceinline void set(vboolf8& a, size_t index) { a[index] = -1; } - __forceinline void clear(vboolf8& a, size_t index) { a[index] = 0; } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " - << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; - } -} diff --git 
a/thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h b/thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h deleted file mode 100644 index 2a52b554c7..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX-512 bool type */ - template<> - struct vboolf<8> - { - typedef vboolf8 Bool; - typedef vint8 Int; - - enum { size = 8 }; // number of SIMD elements - __mmask8 v; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf() {} - __forceinline vboolf(const vboolf8& t) { v = t.v; } - __forceinline vboolf8& operator =(const vboolf8& f) { v = f.v; return *this; } - - __forceinline vboolf(const __mmask8 &t) { v = t; } - __forceinline operator __mmask8() const { return v; } - - __forceinline vboolf(bool b) { v = b ? 
0xff : 0x00; } - __forceinline vboolf(int t) { v = (__mmask8)t; } - __forceinline vboolf(unsigned int t) { v = (__mmask8)t; } - - __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) - : v((__mmask8)((int(h) << 7) | (int(g) << 6) | (int(f) << 5) | (int(e) << 4) | (int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {} - - /* return int8 mask */ - __forceinline __m128i mask8() const { - return _mm_movm_epi8(v); - } - - /* return int32 mask */ - __forceinline __m256i mask32() const { - return _mm256_movm_epi32(v); - } - - /* return int64 mask */ - __forceinline __m512i mask64() const { - return _mm512_movm_epi64(v); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf(FalseTy) : v(0x00) {} - __forceinline vboolf(TrueTy) : v(0xff) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator [](size_t index) const { - assert(index < 8); return (mm512_mask2int(v) >> index) & 1; - } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 operator !(const vboolf8& a) { return _mm512_knot(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm512_kand(a, b); } - __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm512_kor(a, b); } - __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return 
_mm512_kxor(a, b); } - - __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm512_kandn(b, a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; } - __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; } - __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); } - __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm512_kxnor(a, b); } - - __forceinline vboolf8 select(const vboolf8& s, const vboolf8& a, const vboolf8& b) { - return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline int all (const vboolf8& a) { return a.v == 0xff; } - __forceinline int any (const vboolf8& a) { return _mm512_kortestz(a, a) == 0; } - __forceinline int none(const vboolf8& a) { return _mm512_kortestz(a, a) != 0; } - - __forceinline int all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); } - __forceinline int any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); } - __forceinline int none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); } - - __forceinline size_t movemask(const vboolf8& a) { return _mm512_kmov(a); } - __forceinline size_t popcnt (const vboolf8& a) { return 
popcnt(a.v); } - - //////////////////////////////////////////////////////////////////////////////// - /// Conversion Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline unsigned int toInt(const vboolf8& a) { return mm512_mask2int(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Get/Set Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool get(const vboolf8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; } - __forceinline void set(vboolf8& a, size_t index) { assert(index < 8); a |= 1 << index; } - __forceinline void clear(vboolf8& a, size_t index) { assert(index < 8); a = andn(a, 1 << index); } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) - { - cout << "<"; - for (size_t i=0; i<8; i++) { - if ((a.v >> i) & 1) cout << "1"; else cout << "0"; - } - return cout << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vdouble4_avx.h b/thirdparty/embree-aarch64/common/simd/vdouble4_avx.h deleted file mode 100644 index 1f65b45d7e..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vdouble4_avx.h +++ /dev/null @@ -1,324 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 4-wide AVX 64-bit double type */ - template<> - struct vdouble<4> - { - ALIGNED_STRUCT_(32); - - typedef vboold4 Bool; - - enum { size = 4 }; // number of SIMD elements - union { // data - __m256d v; - double i[4]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - 
//////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble() {} - __forceinline vdouble(const vdouble4& t) { v = t.v; } - __forceinline vdouble4& operator =(const vdouble4& f) { v = f.v; return *this; } - - __forceinline vdouble(const __m256d& t) { v = t; } - __forceinline operator __m256d() const { return v; } - - __forceinline vdouble(double i) { - v = _mm256_set1_pd(i); - } - - __forceinline vdouble(double a, double b, double c, double d) { - v = _mm256_set_pd(d,c,b,a); - } - - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble(ZeroTy) : v(_mm256_setzero_pd()) {} - __forceinline vdouble(OneTy) : v(_mm256_set1_pd(1)) {} - __forceinline vdouble(StepTy) : v(_mm256_set_pd(3.0,2.0,1.0,0.0)) {} - __forceinline vdouble(ReverseStepTy) : v(_mm256_setr_pd(3.0,2.0,1.0,0.0)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline void store_nt(double *__restrict__ ptr, const vdouble4& a) { - _mm256_stream_pd(ptr, a); - } - - static __forceinline vdouble4 loadu(const double* addr) { - return _mm256_loadu_pd(addr); - } - - static __forceinline vdouble4 load(const vdouble4* addr) { - return _mm256_load_pd((double*)addr); - } - - static __forceinline vdouble4 load(const double* addr) { - return _mm256_load_pd(addr); - } - - static __forceinline void store(double* ptr, const vdouble4& v) { - _mm256_store_pd(ptr, v); - } - - static __forceinline void storeu(double* ptr, const vdouble4& v) { - _mm256_storeu_pd(ptr, v); - } - - static __forceinline vdouble4 broadcast(const void* a) { return _mm256_set1_pd(*(double*)a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Array 
Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline double& operator [](size_t index) { assert(index < 4); return i[index]; } - __forceinline const double& operator [](size_t index) const { assert(index < 4); return i[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX2__) - __forceinline vdouble4 asDouble(const vllong4& a) { return _mm256_castsi256_pd(a); } - __forceinline vllong4 asLLong (const vdouble4& a) { return _mm256_castpd_si256(a); } -#endif - - __forceinline vdouble4 operator +(const vdouble4& a) { return a; } - __forceinline vdouble4 operator -(const vdouble4& a) { return _mm256_sub_pd(_mm256_setzero_pd(), a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble4 operator +(const vdouble4& a, const vdouble4& b) { return _mm256_add_pd(a, b); } - __forceinline vdouble4 operator +(const vdouble4& a, double b) { return a + vdouble4(b); } - __forceinline vdouble4 operator +(double a, const vdouble4& b) { return vdouble4(a) + b; } - - __forceinline vdouble4 operator -(const vdouble4& a, const vdouble4& b) { return _mm256_sub_pd(a, b); } - __forceinline vdouble4 operator -(const vdouble4& a, double b) { return a - vdouble4(b); } - __forceinline vdouble4 operator -(double a, const vdouble4& b) { return vdouble4(a) - b; } - - __forceinline vdouble4 operator *(const vdouble4& a, const vdouble4& b) { return _mm256_mul_pd(a, b); } - __forceinline vdouble4 operator *(const vdouble4& a, double b) { return a * vdouble4(b); } - __forceinline vdouble4 operator *(double a, const vdouble4& b) { return vdouble4(a) * b; } - - __forceinline vdouble4 operator &(const vdouble4& a, const 
vdouble4& b) { return _mm256_and_pd(a, b); } - __forceinline vdouble4 operator &(const vdouble4& a, double b) { return a & vdouble4(b); } - __forceinline vdouble4 operator &(double a, const vdouble4& b) { return vdouble4(a) & b; } - - __forceinline vdouble4 operator |(const vdouble4& a, const vdouble4& b) { return _mm256_or_pd(a, b); } - __forceinline vdouble4 operator |(const vdouble4& a, double b) { return a | vdouble4(b); } - __forceinline vdouble4 operator |(double a, const vdouble4& b) { return vdouble4(a) | b; } - - __forceinline vdouble4 operator ^(const vdouble4& a, const vdouble4& b) { return _mm256_xor_pd(a, b); } - __forceinline vdouble4 operator ^(const vdouble4& a, double b) { return a ^ vdouble4(b); } - __forceinline vdouble4 operator ^(double a, const vdouble4& b) { return vdouble4(a) ^ b; } - - __forceinline vdouble4 min(const vdouble4& a, const vdouble4& b) { return _mm256_min_pd(a, b); } - __forceinline vdouble4 min(const vdouble4& a, double b) { return min(a,vdouble4(b)); } - __forceinline vdouble4 min(double a, const vdouble4& b) { return min(vdouble4(a),b); } - - __forceinline vdouble4 max(const vdouble4& a, const vdouble4& b) { return _mm256_max_pd(a, b); } - __forceinline vdouble4 max(const vdouble4& a, double b) { return max(a,vdouble4(b)); } - __forceinline vdouble4 max(double a, const vdouble4& b) { return max(vdouble4(a),b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__FMA__) - __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmadd_pd(a,b,c); } - __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmsub_pd(a,b,c); } - __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmadd_pd(a,b,c); } - __forceinline vdouble4 nmsub(const 
vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmsub_pd(a,b,c); } -#else - __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b+c; } - __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b-c; } - __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b+c;} - __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b-c; } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble4& operator +=(vdouble4& a, const vdouble4& b) { return a = a + b; } - __forceinline vdouble4& operator +=(vdouble4& a, double b) { return a = a + b; } - - __forceinline vdouble4& operator -=(vdouble4& a, const vdouble4& b) { return a = a - b; } - __forceinline vdouble4& operator -=(vdouble4& a, double b) { return a = a - b; } - - __forceinline vdouble4& operator *=(vdouble4& a, const vdouble4& b) { return a = a * b; } - __forceinline vdouble4& operator *=(vdouble4& a, double b) { return a = a * b; } - - __forceinline vdouble4& operator &=(vdouble4& a, const vdouble4& b) { return a = a & b; } - __forceinline vdouble4& operator &=(vdouble4& a, double b) { return a = a & b; } - - __forceinline vdouble4& operator |=(vdouble4& a, const vdouble4& b) { return a = a | b; } - __forceinline vdouble4& operator |=(vdouble4& a, double b) { return a = a | b; } - - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_EQ); } - __forceinline vboold4 
operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_NE); } - __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LT); } - __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GE); } - __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GT); } - __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LE); } -#elif !defined(__aarch64__) - __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); } - __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); } - __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); } - __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); } - __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); } - __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); } -#else - __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmpeq_pd(a, b); } - __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpneq_pd(a, b); } - __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmplt_pd(a, b); } - __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpnlt_pd(a, b); } - __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmpnle_pd(a, b); } - __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmple_pd(a, 
b); } -#endif - - __forceinline vboold4 operator ==(const vdouble4& a, double b) { return a == vdouble4(b); } - __forceinline vboold4 operator ==(double a, const vdouble4& b) { return vdouble4(a) == b; } - - __forceinline vboold4 operator !=(const vdouble4& a, double b) { return a != vdouble4(b); } - __forceinline vboold4 operator !=(double a, const vdouble4& b) { return vdouble4(a) != b; } - - __forceinline vboold4 operator < (const vdouble4& a, double b) { return a < vdouble4(b); } - __forceinline vboold4 operator < (double a, const vdouble4& b) { return vdouble4(a) < b; } - - __forceinline vboold4 operator >=(const vdouble4& a, double b) { return a >= vdouble4(b); } - __forceinline vboold4 operator >=(double a, const vdouble4& b) { return vdouble4(a) >= b; } - - __forceinline vboold4 operator > (const vdouble4& a, double b) { return a > vdouble4(b); } - __forceinline vboold4 operator > (double a, const vdouble4& b) { return vdouble4(a) > b; } - - __forceinline vboold4 operator <=(const vdouble4& a, double b) { return a <= vdouble4(b); } - __forceinline vboold4 operator <=(double a, const vdouble4& b) { return vdouble4(a) <= b; } - - __forceinline vboold4 eq(const vdouble4& a, const vdouble4& b) { return a == b; } - __forceinline vboold4 ne(const vdouble4& a, const vdouble4& b) { return a != b; } - __forceinline vboold4 lt(const vdouble4& a, const vdouble4& b) { return a < b; } - __forceinline vboold4 ge(const vdouble4& a, const vdouble4& b) { return a >= b; } - __forceinline vboold4 gt(const vdouble4& a, const vdouble4& b) { return a > b; } - __forceinline vboold4 le(const vdouble4& a, const vdouble4& b) { return a <= b; } - -#if defined(__AVX512VL__) - __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_EQ); } - __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_NE); } - __forceinline 
vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LT); } - __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GE); } - __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GT); } - __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LE); } -#else - __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a == b); } - __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a != b); } - __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a < b); } - __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a >= b); } - __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a > b); } - __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a <= b); } -#endif - - __forceinline vdouble4 select(const vboold4& m, const vdouble4& t, const vdouble4& f) { -#if defined(__AVX512VL__) - return _mm256_mask_blend_pd(m, f, t); -#else - return _mm256_blendv_pd(f, t, m); -#endif - } - - __forceinline void xchg(const vboold4& m, vdouble4& a, vdouble4& b) { - const vdouble4 c = a; a = select(m,b,a); b = select(m,c,b); - } - - __forceinline vboold4 test(const vdouble4& a, const vdouble4& b) { -#if defined(__AVX512VL__) - return _mm256_test_epi64_mask(_mm256_castpd_si256(a),_mm256_castpd_si256(b)); -#else - return _mm256_testz_si256(_mm256_castpd_si256(a),_mm256_castpd_si256(b)); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - // 
Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - template<int i0, int i1> - __forceinline vdouble4 shuffle(const vdouble4& v) { - return _mm256_permute_pd(v, (i1 << 3) | (i0 << 2) | (i1 << 1) | i0); - } - - template<int i> - __forceinline vdouble4 shuffle(const vdouble4& v) { - return shuffle<i, i>(v); - } - - template<int i0, int i1> - __forceinline vdouble4 shuffle2(const vdouble4& v) { - return _mm256_permute2f128_pd(v, v, (i1 << 4) | i0); - } - - __forceinline double toScalar(const vdouble4& v) { - return _mm_cvtsd_f64(_mm256_castpd256_pd128(v)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble4 vreduce_min2(const vdouble4& x) { return min(x, shuffle<1,0>(x)); } - __forceinline vdouble4 vreduce_min (const vdouble4& y) { const vdouble4 x = vreduce_min2(y); return min(x, shuffle2<1,0>(x)); } - - __forceinline vdouble4 vreduce_max2(const vdouble4& x) { return max(x,shuffle<1,0>(x)); } - __forceinline vdouble4 vreduce_max (const vdouble4& y) { const vdouble4 x = vreduce_max2(y); return max(x, shuffle2<1,0>(x)); } - - __forceinline vdouble4 vreduce_and2(const vdouble4& x) { return x & shuffle<1,0>(x); } - __forceinline vdouble4 vreduce_and (const vdouble4& y) { const vdouble4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); } - - __forceinline vdouble4 vreduce_or2(const vdouble4& x) { return x | shuffle<1,0>(x); } - __forceinline vdouble4 vreduce_or (const vdouble4& y) { const vdouble4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); } - - __forceinline vdouble4 vreduce_add2(const vdouble4& x) { return x + shuffle<1,0>(x); } - __forceinline vdouble4 vreduce_add (const vdouble4& y) { const vdouble4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); } - - __forceinline double reduce_add(const vdouble4& a) { return 
toScalar(vreduce_add(a)); } - __forceinline double reduce_min(const vdouble4& a) { return toScalar(vreduce_min(a)); } - __forceinline double reduce_max(const vdouble4& a) { return toScalar(vreduce_max(a)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Memory load and store operations - //////////////////////////////////////////////////////////////////////////////// - - - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble4& v) - { - cout << "<" << v[0]; - for (size_t i=1; i<4; i++) cout << ", " << v[i]; - cout << ">"; - return cout; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h b/thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h deleted file mode 100644 index 4eec7d2f6a..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h +++ /dev/null @@ -1,356 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX-512 64-bit double type */ - template<> - struct vdouble<8> - { - ALIGNED_STRUCT_(64); - - typedef vboold8 Bool; - - enum { size = 8 }; // number of SIMD elements - union { // data - __m512d v; - double i[8]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble() {} - __forceinline vdouble(const vdouble8& t) { v = t.v; } - __forceinline vdouble8& operator =(const vdouble8& f) { v = f.v; return *this; } - - __forceinline vdouble(const __m512d& t) { v = t; } - __forceinline operator __m512d() const { return v; } - __forceinline operator __m256d() const { return 
_mm512_castpd512_pd256(v); } - - __forceinline vdouble(double i) { - v = _mm512_set1_pd(i); - } - - __forceinline vdouble(double a, double b, double c, double d) { - v = _mm512_set4_pd(d,c,b,a); - } - - __forceinline vdouble(double a0, double a1, double a2, double a3, - double a4, double a5, double a6, double a7) - { - v = _mm512_set_pd(a7,a6,a5,a4,a3,a2,a1,a0); - } - - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble(ZeroTy) : v(_mm512_setzero_pd()) {} - __forceinline vdouble(OneTy) : v(_mm512_set1_pd(1)) {} - __forceinline vdouble(StepTy) : v(_mm512_set_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {} - __forceinline vdouble(ReverseStepTy) : v(_mm512_setr_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline void store_nt(void *__restrict__ ptr, const vdouble8& a) { - _mm512_stream_pd((double*)ptr, a); - } - - static __forceinline vdouble8 loadu(const void* addr) { - return _mm512_loadu_pd((double*)addr); - } - - static __forceinline vdouble8 load(const vdouble8* addr) { - return _mm512_load_pd((double*)addr); - } - - static __forceinline vdouble8 load(const double* addr) { - return _mm512_load_pd(addr); - } - - static __forceinline void store(void* ptr, const vdouble8& v) { - _mm512_store_pd(ptr, v); - } - - static __forceinline void storeu(void* ptr, const vdouble8& v) { - _mm512_storeu_pd(ptr, v); - } - - static __forceinline void storeu(const vboold8& mask, double* ptr, const vdouble8& f) { - _mm512_mask_storeu_pd(ptr, mask, f); - } - - static __forceinline void store(const vboold8& mask, void* addr, const vdouble8& v2) { - _mm512_mask_store_pd(addr, mask, v2); - } - - /* pass by value to avoid compiler generating 
inefficient code */ - static __forceinline void storeu_compact(const vboold8 mask,void * addr, const vdouble8& reg) { - _mm512_mask_compressstoreu_pd(addr, mask, reg); - } - - static __forceinline vdouble8 compact64bit(const vboold8& mask, vdouble8& v) { - return _mm512_mask_compress_pd(v, mask, v); - } - - static __forceinline vdouble8 compact(const vboold8& mask, vdouble8& v) { - return _mm512_mask_compress_pd(v, mask, v); - } - - static __forceinline vdouble8 compact(const vboold8& mask, const vdouble8& a, vdouble8& b) { - return _mm512_mask_compress_pd(a, mask, b); - } - - static __forceinline vdouble8 broadcast(const void* a) { return _mm512_set1_pd(*(double*)a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline double& operator [](size_t index) { assert(index < 8); return i[index]; } - __forceinline const double& operator [](size_t index) const { assert(index < 8); return i[index]; } - - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble8 asDouble(const vllong8& a) { return _mm512_castsi512_pd(a); } - __forceinline vllong8 asLLong (const vdouble8& a) { return _mm512_castpd_si512(a); } - - __forceinline vdouble8 operator +(const vdouble8& a) { return a; } - __forceinline vdouble8 operator -(const vdouble8& a) { return _mm512_sub_pd(_mm512_setzero_pd(), a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble8 operator +(const vdouble8& a, const vdouble8& b) { return _mm512_add_pd(a, b); } - __forceinline vdouble8 operator +(const vdouble8& a, double b) { return a + vdouble8(b); 
} - __forceinline vdouble8 operator +(double a, const vdouble8& b) { return vdouble8(a) + b; } - - __forceinline vdouble8 operator -(const vdouble8& a, const vdouble8& b) { return _mm512_sub_pd(a, b); } - __forceinline vdouble8 operator -(const vdouble8& a, double b) { return a - vdouble8(b); } - __forceinline vdouble8 operator -(double a, const vdouble8& b) { return vdouble8(a) - b; } - - __forceinline vdouble8 operator *(const vdouble8& a, const vdouble8& b) { return _mm512_mul_pd(a, b); } - __forceinline vdouble8 operator *(const vdouble8& a, double b) { return a * vdouble8(b); } - __forceinline vdouble8 operator *(double a, const vdouble8& b) { return vdouble8(a) * b; } - - __forceinline vdouble8 operator &(const vdouble8& a, const vdouble8& b) { return _mm512_and_pd(a, b); } - __forceinline vdouble8 operator &(const vdouble8& a, double b) { return a & vdouble8(b); } - __forceinline vdouble8 operator &(double a, const vdouble8& b) { return vdouble8(a) & b; } - - __forceinline vdouble8 operator |(const vdouble8& a, const vdouble8& b) { return _mm512_or_pd(a, b); } - __forceinline vdouble8 operator |(const vdouble8& a, double b) { return a | vdouble8(b); } - __forceinline vdouble8 operator |(double a, const vdouble8& b) { return vdouble8(a) | b; } - - __forceinline vdouble8 operator ^(const vdouble8& a, const vdouble8& b) { return _mm512_xor_pd(a, b); } - __forceinline vdouble8 operator ^(const vdouble8& a, double b) { return a ^ vdouble8(b); } - __forceinline vdouble8 operator ^(double a, const vdouble8& b) { return vdouble8(a) ^ b; } - - __forceinline vdouble8 operator <<(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), n)); } - __forceinline vdouble8 operator >>(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), n)); } - - __forceinline vdouble8 operator <<(const vdouble8& a, const vllong8& n) { return 
_mm512_castsi512_pd(_mm512_sllv_epi64(_mm512_castpd_si512(a), n)); } - __forceinline vdouble8 operator >>(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_srav_epi64(_mm512_castpd_si512(a), n)); } - - __forceinline vdouble8 sll (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), b)); } - __forceinline vdouble8 sra (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), b)); } - __forceinline vdouble8 srl (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srli_epi64(_mm512_castpd_si512(a), b)); } - - __forceinline vdouble8 min(const vdouble8& a, const vdouble8& b) { return _mm512_min_pd(a, b); } - __forceinline vdouble8 min(const vdouble8& a, double b) { return min(a,vdouble8(b)); } - __forceinline vdouble8 min(double a, const vdouble8& b) { return min(vdouble8(a),b); } - - __forceinline vdouble8 max(const vdouble8& a, const vdouble8& b) { return _mm512_max_pd(a, b); } - __forceinline vdouble8 max(const vdouble8& a, double b) { return max(a,vdouble8(b)); } - __forceinline vdouble8 max(double a, const vdouble8& b) { return max(vdouble8(a),b); } - - __forceinline vdouble8 mask_add(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_add_pd(c,mask,a,b); } - __forceinline vdouble8 mask_sub(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_sub_pd(c,mask,a,b); } - - __forceinline vdouble8 mask_and(const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_and_pd(c,m,a,b); } - __forceinline vdouble8 mask_or (const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_or_pd(c,m,a,b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - 
//////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble8 madd (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmadd_pd(a,b,c); } - __forceinline vdouble8 msub (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmsub_pd(a,b,c); } - __forceinline vdouble8 nmadd(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmadd_pd(a,b,c); } - __forceinline vdouble8 nmsub(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmsub_pd(a,b,c); } - - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble8& operator +=(vdouble8& a, const vdouble8& b) { return a = a + b; } - __forceinline vdouble8& operator +=(vdouble8& a, double b) { return a = a + b; } - - __forceinline vdouble8& operator -=(vdouble8& a, const vdouble8& b) { return a = a - b; } - __forceinline vdouble8& operator -=(vdouble8& a, double b) { return a = a - b; } - - __forceinline vdouble8& operator *=(vdouble8& a, const vdouble8& b) { return a = a * b; } - __forceinline vdouble8& operator *=(vdouble8& a, double b) { return a = a * b; } - - __forceinline vdouble8& operator &=(vdouble8& a, const vdouble8& b) { return a = a & b; } - __forceinline vdouble8& operator &=(vdouble8& a, double b) { return a = a & b; } - - __forceinline vdouble8& operator |=(vdouble8& a, const vdouble8& b) { return a = a | b; } - __forceinline vdouble8& operator |=(vdouble8& a, double b) { return a = a | b; } - - __forceinline vdouble8& operator <<=(vdouble8& a, const double b) { return a = a << b; } - __forceinline vdouble8& operator >>=(vdouble8& a, const double b) { return a = a >> b; } - - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - 
//////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold8 operator ==(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboold8 operator ==(const vdouble8& a, double b) { return a == vdouble8(b); } - __forceinline vboold8 operator ==(double a, const vdouble8& b) { return vdouble8(a) == b; } - - __forceinline vboold8 operator !=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboold8 operator !=(const vdouble8& a, double b) { return a != vdouble8(b); } - __forceinline vboold8 operator !=(double a, const vdouble8& b) { return vdouble8(a) != b; } - - __forceinline vboold8 operator < (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboold8 operator < (const vdouble8& a, double b) { return a < vdouble8(b); } - __forceinline vboold8 operator < (double a, const vdouble8& b) { return vdouble8(a) < b; } - - __forceinline vboold8 operator >=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboold8 operator >=(const vdouble8& a, double b) { return a >= vdouble8(b); } - __forceinline vboold8 operator >=(double a, const vdouble8& b) { return vdouble8(a) >= b; } - - __forceinline vboold8 operator > (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboold8 operator > (const vdouble8& a, double b) { return a > vdouble8(b); } - __forceinline vboold8 operator > (double a, const vdouble8& b) { return vdouble8(a) > b; } - - __forceinline vboold8 operator <=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); } - __forceinline vboold8 operator <=(const vdouble8& a, double b) { return a <= vdouble8(b); } - __forceinline vboold8 operator <=(double a, const vdouble8& b) { return vdouble8(a) <= b; } - - __forceinline vboold8 eq(const 
vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboold8 ne(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboold8 lt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboold8 ge(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboold8 gt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboold8 le(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); } - - __forceinline vboold8 eq(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_EQ); } - __forceinline vboold8 ne(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_NE); } - __forceinline vboold8 lt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LT); } - __forceinline vboold8 ge(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GE); } - __forceinline vboold8 gt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GT); } - __forceinline vboold8 le(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LE); } - - __forceinline vdouble8 select(const vboold8& m, const vdouble8& t, const vdouble8& f) { - return _mm512_mask_or_pd(f,m,t,t); - } - - __forceinline void xchg(const vboold8& m, vdouble8& a, vdouble8& b) { - const vdouble8 c = a; a = select(m,b,a); b = select(m,c,b); - } - - __forceinline vboold8 test(const vboold8& m, const vdouble8& a, const vdouble8& b) { - return _mm512_mask_test_epi64_mask(m,_mm512_castpd_si512(a),_mm512_castpd_si512(b)); - } - - __forceinline 
vboold8 test(const vdouble8& a, const vdouble8& b) { - return _mm512_test_epi64_mask(_mm512_castpd_si512(a),_mm512_castpd_si512(b)); - } - - //////////////////////////////////////////////////////////////////////////////// - // Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - template<int i0, int i1> - __forceinline vdouble8 shuffle(const vdouble8& v) { - return _mm512_permute_pd(v, (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0); - } - - template<int i> - __forceinline vdouble8 shuffle(const vdouble8& v) { - return shuffle<i, i>(v); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vdouble8 shuffle(const vdouble8& v) { - return _mm512_permutex_pd(v, _MM_SHUFFLE(i3, i2, i1, i0)); - } - - template<int i0, int i1> - __forceinline vdouble8 shuffle4(const vdouble8& v) { - return _mm512_shuffle_f64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2)); - } - - template<int i> - __forceinline vdouble8 shuffle4(const vdouble8& v) { - return shuffle4<i, i>(v); - } - - template<int i> - __forceinline vdouble8 align_shift_right(const vdouble8& a, const vdouble8& b) { - return _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(a), _mm512_castpd_si512(b), i)); - } - - __forceinline double toScalar(const vdouble8& v) { - return _mm_cvtsd_f64(_mm512_castpd512_pd128(v)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble8 vreduce_add2(vdouble8 x) { return x + shuffle<1,0,3,2>(x); } - __forceinline vdouble8 vreduce_add4(vdouble8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } - __forceinline vdouble8 vreduce_add (vdouble8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); } - - __forceinline vdouble8 vreduce_min2(vdouble8 x) { return min(x, shuffle<1,0,3,2>(x)); } - 
__forceinline vdouble8 vreduce_min4(vdouble8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } - __forceinline vdouble8 vreduce_min (vdouble8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); } - - __forceinline vdouble8 vreduce_max2(vdouble8 x) { return max(x, shuffle<1,0,3,2>(x)); } - __forceinline vdouble8 vreduce_max4(vdouble8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } - __forceinline vdouble8 vreduce_max (vdouble8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); } - - __forceinline double reduce_add(const vdouble8& v) { return toScalar(vreduce_add(v)); } - __forceinline double reduce_min(const vdouble8& v) { return toScalar(vreduce_min(v)); } - __forceinline double reduce_max(const vdouble8& v) { return toScalar(vreduce_max(v)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Memory load and store operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble8 permute(const vdouble8& v, const vllong8& index) { - return _mm512_permutexvar_pd(index, v); - } - - __forceinline vdouble8 reverse(const vdouble8& a) { - return permute(a, vllong8(reverse_step)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble8& v) - { - cout << "<" << v[0]; - for (size_t i=1; i<8; i++) cout << ", " << v[i]; - cout << ">"; - return cout; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h b/thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h deleted file mode 100644 index aed2419b77..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h +++ /dev/null @@ -1,771 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once 
- -namespace embree -{ - /* 16-wide AVX-512 float type */ - template<> - struct vfloat<16> - { - ALIGNED_STRUCT_(64); - - typedef vboolf16 Bool; - typedef vint16 Int; - typedef vfloat16 Float; - - enum { size = 16 }; // number of SIMD elements - union { // data - __m512 v; - float f[16]; - int i[16]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat() {} - __forceinline vfloat(const vfloat16& t) { v = t; } - __forceinline vfloat16& operator =(const vfloat16& f) { v = f.v; return *this; } - - __forceinline vfloat(const __m512& t) { v = t; } - __forceinline operator __m512() const { return v; } - __forceinline operator __m256() const { return _mm512_castps512_ps256(v); } - __forceinline operator __m128() const { return _mm512_castps512_ps128(v); } - - __forceinline vfloat(float f) { - v = _mm512_set1_ps(f); - } - - __forceinline vfloat(float a, float b, float c, float d) { - v = _mm512_set4_ps(a, b, c, d); - } - - __forceinline vfloat(const vfloat4& i) { - v = _mm512_broadcast_f32x4(i); - } - - __forceinline vfloat(const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d) { - v = _mm512_castps128_ps512(a); - v = _mm512_insertf32x4(v, b, 1); - v = _mm512_insertf32x4(v, c, 2); - v = _mm512_insertf32x4(v, d, 3); - } - - __forceinline vfloat(const vboolf16& mask, const vfloat4& a, const vfloat4& b) { - v = _mm512_broadcast_f32x4(a); - v = _mm512_mask_broadcast_f32x4(v,mask,b); - } - - __forceinline vfloat(const vfloat8& i) { - v = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(i))); - } - - __forceinline vfloat(const vfloat8& a, const vfloat8& b) { - v = _mm512_castps256_ps512(a); -#if defined(__AVX512DQ__) - v = _mm512_insertf32x8(v, b, 1); -#else - v = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(v), _mm256_castps_pd(b), 1)); -#endif - } - - /* 
WARNING: due to f64x4 the mask is considered as an 8bit mask */ - __forceinline vfloat(const vboolf16& mask, const vfloat8& a, const vfloat8& b) { - __m512d aa = _mm512_broadcast_f64x4(_mm256_castps_pd(a)); - aa = _mm512_mask_broadcast_f64x4(aa,mask,_mm256_castps_pd(b)); - v = _mm512_castpd_ps(aa); - } - - __forceinline explicit vfloat(const vint16& a) { - v = _mm512_cvtepi32_ps(a); - } - - __forceinline explicit vfloat(const vuint16& a) { - v = _mm512_cvtepu32_ps(a); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat(ZeroTy) : v(_mm512_setzero_ps()) {} - __forceinline vfloat(OneTy) : v(_mm512_set1_ps(1.0f)) {} - __forceinline vfloat(PosInfTy) : v(_mm512_set1_ps(pos_inf)) {} - __forceinline vfloat(NegInfTy) : v(_mm512_set1_ps(neg_inf)) {} - __forceinline vfloat(StepTy) : v(_mm512_set_ps(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} - __forceinline vfloat(NaNTy) : v(_mm512_set1_ps(nan)) {} - __forceinline vfloat(UndefinedTy) : v(_mm512_undefined_ps()) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline vfloat16 load (const void* ptr) { return _mm512_load_ps((float*)ptr); } - static __forceinline vfloat16 loadu(const void* ptr) { return _mm512_loadu_ps((float*)ptr); } - - static __forceinline vfloat16 load (const vboolf16& mask, const void* ptr) { return _mm512_mask_load_ps (_mm512_setzero_ps(),mask,(float*)ptr); } - static __forceinline vfloat16 loadu(const vboolf16& mask, const void* ptr) { return _mm512_mask_loadu_ps(_mm512_setzero_ps(),mask,(float*)ptr); } - - static __forceinline void store (void* ptr, const vfloat16& v) { _mm512_store_ps ((float*)ptr,v); } - static __forceinline void storeu(void* ptr, const vfloat16& v) { 
_mm512_storeu_ps((float*)ptr,v); } - - static __forceinline void store (const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_store_ps ((float*)ptr,mask,v); } - static __forceinline void storeu(const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_storeu_ps((float*)ptr,mask,v); } - - static __forceinline void store_nt(void* __restrict__ ptr, const vfloat16& a) { - _mm512_stream_ps((float*)ptr,a); - } - - static __forceinline vfloat16 broadcast(const float* f) { - return _mm512_set1_ps(*f); - } - - static __forceinline vfloat16 compact(const vboolf16& mask, vfloat16 &v) { - return _mm512_mask_compress_ps(v, mask, v); - } - static __forceinline vfloat16 compact(const vboolf16& mask, vfloat16 &a, const vfloat16& b) { - return _mm512_mask_compress_ps(a, mask, b); - } - - static __forceinline vfloat16 expand(const vboolf16& mask, const vfloat16& a, vfloat16& b) { - return _mm512_mask_expand_ps(b, mask, a); - } - - static __forceinline vfloat16 loadu_compact(const vboolf16& mask, const void* ptr) { - return _mm512_mask_expandloadu_ps(_mm512_setzero_ps(), mask, (float*)ptr); - } - - static __forceinline void storeu_compact(const vboolf16& mask, float *addr, const vfloat16 reg) { - _mm512_mask_compressstoreu_ps(addr, mask, reg); - } - - static __forceinline void storeu_compact_single(const vboolf16& mask, float * addr, const vfloat16& reg) { - //_mm512_mask_compressstoreu_ps(addr,mask,reg); - *addr = mm512_cvtss_f32(_mm512_mask_compress_ps(reg, mask, reg)); - } - - template<int scale = 4> - static __forceinline vfloat16 gather(const float* ptr, const vint16& index) { - return _mm512_i32gather_ps(index, ptr, scale); - } - - template<int scale = 4> - static __forceinline vfloat16 gather(const vboolf16& mask, const float* ptr, const vint16& index) { - vfloat16 r = zero; - return _mm512_mask_i32gather_ps(r, mask, index, ptr, scale); - } - - template<int scale = 4> - static __forceinline void scatter(float* ptr, const vint16& index, const vfloat16& v) { 
- _mm512_i32scatter_ps(ptr, index, v, scale); - } - - template<int scale = 4> - static __forceinline void scatter(const vboolf16& mask, float* ptr, const vint16& index, const vfloat16& v) { - _mm512_mask_i32scatter_ps(ptr, mask, index, v, scale); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline float& operator [](size_t index) { assert(index < 16); return f[index]; } - __forceinline const float& operator [](size_t index) const { assert(index < 16); return f[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat16 asFloat(const vint16& a) { return _mm512_castsi512_ps(a); } - __forceinline vint16 asInt (const vfloat16& a) { return _mm512_castps_si512(a); } - __forceinline vuint16 asUInt (const vfloat16& a) { return _mm512_castps_si512(a); } - - __forceinline vint16 toInt (const vfloat16& a) { return vint16(a); } - __forceinline vfloat16 toFloat(const vint16& a) { return vfloat16(a); } - - __forceinline vfloat16 operator +(const vfloat16& a) { return a; } - __forceinline vfloat16 operator -(const vfloat16& a) { return _mm512_mul_ps(a,vfloat16(-1)); } - - __forceinline vfloat16 abs (const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x7FFFFFFF))); } - __forceinline vfloat16 signmsk(const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x80000000))); } - - __forceinline vfloat16 rcp(const vfloat16& a) { -#if defined(__AVX512ER__) - return _mm512_rcp28_ps(a); -#else - const vfloat16 r = _mm512_rcp14_ps(a); - return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vfloat16(2.0f))); -#endif - } - - __forceinline vfloat16 sqr (const vfloat16& a) { 
return _mm512_mul_ps(a,a); } - __forceinline vfloat16 sqrt(const vfloat16& a) { return _mm512_sqrt_ps(a); } - - __forceinline vfloat16 rsqrt(const vfloat16& a) - { -#if defined(__AVX512VL__) - const vfloat16 r = _mm512_rsqrt14_ps(a); - return _mm512_fmadd_ps(_mm512_set1_ps(1.5f), r, - _mm512_mul_ps(_mm512_mul_ps(_mm512_mul_ps(a, _mm512_set1_ps(-0.5f)), r), _mm512_mul_ps(r, r))); -#else - return _mm512_rsqrt28_ps(a); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat16 operator +(const vfloat16& a, const vfloat16& b) { return _mm512_add_ps(a, b); } - __forceinline vfloat16 operator +(const vfloat16& a, float b) { return a + vfloat16(b); } - __forceinline vfloat16 operator +(float a, const vfloat16& b) { return vfloat16(a) + b; } - - __forceinline vfloat16 operator -(const vfloat16& a, const vfloat16& b) { return _mm512_sub_ps(a, b); } - __forceinline vfloat16 operator -(const vfloat16& a, float b) { return a - vfloat16(b); } - __forceinline vfloat16 operator -(float a, const vfloat16& b) { return vfloat16(a) - b; } - - __forceinline vfloat16 operator *(const vfloat16& a, const vfloat16& b) { return _mm512_mul_ps(a, b); } - __forceinline vfloat16 operator *(const vfloat16& a, float b) { return a * vfloat16(b); } - __forceinline vfloat16 operator *(float a, const vfloat16& b) { return vfloat16(a) * b; } - - __forceinline vfloat16 operator /(const vfloat16& a, const vfloat16& b) { return _mm512_div_ps(a,b); } - __forceinline vfloat16 operator /(const vfloat16& a, float b) { return a/vfloat16(b); } - __forceinline vfloat16 operator /(float a, const vfloat16& b) { return vfloat16(a)/b; } - - __forceinline vfloat16 operator &(const vfloat16& a, const vfloat16& b) { return _mm512_and_ps(a,b); } - __forceinline vfloat16 operator |(const vfloat16& a, const vfloat16& b) { return _mm512_or_ps(a,b); } - 
/* Bitwise XOR on the raw lane patterns, done in the integer domain. */
__forceinline vfloat16 operator ^(const vfloat16& a, const vfloat16& b) {
  return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b)));
}

/* Lane-wise minimum; the float overloads broadcast the scalar first. */
__forceinline vfloat16 min(const vfloat16& a, const vfloat16& b) {
  return _mm512_min_ps(a,b);
}
__forceinline vfloat16 min(const vfloat16& a, float b) {
  return _mm512_min_ps(a,vfloat16(b));
}
__forceinline vfloat16 min(const float& a, const vfloat16& b) {
  return _mm512_min_ps(vfloat16(a),b);
}

/* Lane-wise maximum; the float overloads broadcast the scalar first. */
__forceinline vfloat16 max(const vfloat16& a, const vfloat16& b) {
  return _mm512_max_ps(a,b);
}
__forceinline vfloat16 max(const vfloat16& a, float b) {
  return _mm512_max_ps(a,vfloat16(b));
}
__forceinline vfloat16 max(const float& a, const vfloat16& b) {
  return _mm512_max_ps(vfloat16(a),b);
}

/* Masked forms: lanes enabled in mask receive op(a,b), the remaining lanes
   are copied from c. (The stray ';' that used to follow the bodies of
   mask_min and mask_max was an empty declaration and has been dropped.) */
__forceinline vfloat16 mask_add(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) { return _mm512_mask_add_ps (c,mask,a,b); }
__forceinline vfloat16 mask_min(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) {
  return _mm512_mask_min_ps(c,mask,a,b);
}
__forceinline vfloat16 mask_max(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) {
  return _mm512_mask_max_ps(c,mask,a,b);
}

/* Minimum computed with a signed 32-bit integer compare on the float bit
   patterns when AVX512ER is not defined; plain min() otherwise.
   NOTE(review): ordering IEEE bit patterns as signed integers matches float
   ordering only for non-negative values -- presumably callers guarantee
   that; confirm before using with negative inputs. */
__forceinline vfloat16 mini(const vfloat16& a, const vfloat16& b) {
#if !defined(__AVX512ER__) // SKX
  const vint16 ai = _mm512_castps_si512(a);
  const vint16 bi = _mm512_castps_si512(b);
  const vint16 ci = _mm512_min_epi32(ai,bi);
  return _mm512_castsi512_ps(ci);
#else // KNL
  return min(a,b);
#endif
}

/* Maximum counterpart of mini(); the same review note applies. */
__forceinline vfloat16 maxi(const vfloat16& a, const vfloat16& b) {
#if !defined(__AVX512ER__) // SKX
  const vint16 ai = _mm512_castps_si512(a);
  const vint16 bi = _mm512_castps_si512(b);
  const vint16 ci = _mm512_max_epi32(ai,bi);
  return _mm512_castsi512_ps(ci);
#else // KNL
  return max(a,b);
#endif
}

//////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(a,b,c); } - __forceinline vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a,b,c); } - __forceinline vfloat16 nmadd(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(a,b,c); } - __forceinline vfloat16 nmsub(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmsub_ps(a,b,c); } - - __forceinline vfloat16 mask_msub(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_fmsub_ps(a,mask,b,c); } - - __forceinline vfloat16 madd231 (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(c,b,a); } - __forceinline vfloat16 msub213 (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a,b,c); } - __forceinline vfloat16 msub231 (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(c,b,a); } - __forceinline vfloat16 msubr231(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(c,b,a); } - - - //////////////////////////////////////////////////////////////////////////////// - /// Operators with rounding - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat16 madd_round_down(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_round_ps(a,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } - __forceinline vfloat16 madd_round_up (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_round_ps(a,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } - - __forceinline vfloat16 mul_round_down(const vfloat16& a, const vfloat16& b) { 
return _mm512_mul_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } - __forceinline vfloat16 mul_round_up (const vfloat16& a, const vfloat16& b) { return _mm512_mul_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } - - __forceinline vfloat16 add_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_add_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } - __forceinline vfloat16 add_round_up (const vfloat16& a, const vfloat16& b) { return _mm512_add_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } - - __forceinline vfloat16 sub_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_sub_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } - __forceinline vfloat16 sub_round_up (const vfloat16& a, const vfloat16& b) { return _mm512_sub_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } - - __forceinline vfloat16 div_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_div_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } - __forceinline vfloat16 div_round_up (const vfloat16& a, const vfloat16& b) { return _mm512_div_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } - - __forceinline vfloat16 mask_msub_round_down(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_fmsub_round_ps(a,mask,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } - __forceinline vfloat16 mask_msub_round_up (const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_fmsub_round_ps(a,mask,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } - - __forceinline vfloat16 mask_mul_round_down(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_mul_round_ps(a,mask,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } - __forceinline vfloat16 mask_mul_round_up (const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return 
_mm512_mask_mul_round_ps(a,mask,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } - - __forceinline vfloat16 mask_sub_round_down(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_sub_round_ps(a,mask,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } - __forceinline vfloat16 mask_sub_round_up (const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_sub_round_ps(a,mask,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } - - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat16& operator +=(vfloat16& a, const vfloat16& b) { return a = a + b; } - __forceinline vfloat16& operator +=(vfloat16& a, float b) { return a = a + b; } - - __forceinline vfloat16& operator -=(vfloat16& a, const vfloat16& b) { return a = a - b; } - __forceinline vfloat16& operator -=(vfloat16& a, float b) { return a = a - b; } - - __forceinline vfloat16& operator *=(vfloat16& a, const vfloat16& b) { return a = a * b; } - __forceinline vfloat16& operator *=(vfloat16& a, float b) { return a = a * b; } - - __forceinline vfloat16& operator /=(vfloat16& a, const vfloat16& b) { return a = a / b; } - __forceinline vfloat16& operator /=(vfloat16& a, float b) { return a = a / b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf16 operator ==(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 operator ==(const vfloat16& a, float b) { return a == vfloat16(b); } - __forceinline vboolf16 operator ==(float a, const vfloat16& b) { return vfloat16(a) == b; } - - __forceinline vboolf16 operator !=(const vfloat16& a, const 
vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 operator !=(const vfloat16& a, float b) { return a != vfloat16(b); } - __forceinline vboolf16 operator !=(float a, const vfloat16& b) { return vfloat16(a) != b; } - - __forceinline vboolf16 operator < (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 operator < (const vfloat16& a, float b) { return a < vfloat16(b); } - __forceinline vboolf16 operator < (float a, const vfloat16& b) { return vfloat16(a) < b; } - - __forceinline vboolf16 operator >=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 operator >=(const vfloat16& a, float b) { return a >= vfloat16(b); } - __forceinline vboolf16 operator >=(float a, const vfloat16& b) { return vfloat16(a) >= b; } - - __forceinline vboolf16 operator > (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 operator > (const vfloat16& a, float b) { return a > vfloat16(b); } - __forceinline vboolf16 operator > (float a, const vfloat16& b) { return vfloat16(a) > b; } - - __forceinline vboolf16 operator <=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); } - __forceinline vboolf16 operator <=(const vfloat16& a, float b) { return a <= vfloat16(b); } - __forceinline vboolf16 operator <=(float a, const vfloat16& b) { return vfloat16(a) <= b; } - - __forceinline vboolf16 eq(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 ne(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 lt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 ge(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 
gt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 le(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); } - - __forceinline vboolf16 eq(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 ne(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 lt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 ge(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 gt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 le(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LE); } - - __forceinline vfloat16 select(const vboolf16& s, const vfloat16& t, const vfloat16& f) { - return _mm512_mask_blend_ps(s, f, t); - } - - __forceinline vfloat16 lerp(const vfloat16& a, const vfloat16& b, const vfloat16& t) { - return madd(t,b-a,a); - } - - __forceinline void xchg(vboolf16 m, vfloat16& a, vfloat16& b) - { - vfloat16 c = a; - a = select(m,b,a); - b = select(m,c,b); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Rounding Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat16 floor(const vfloat16& a) { - return _mm512_floor_ps(a); - } - __forceinline vfloat16 ceil (const vfloat16& a) { - return _mm512_ceil_ps(a); - } - __forceinline vfloat16 round (const vfloat16& a) { - return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - } - 
__forceinline vint16 floori (const vfloat16& a) { - return _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat16 unpacklo(const vfloat16& a, const vfloat16& b) { return _mm512_unpacklo_ps(a, b); } - __forceinline vfloat16 unpackhi(const vfloat16& a, const vfloat16& b) { return _mm512_unpackhi_ps(a, b); } - - template<int i> - __forceinline vfloat16 shuffle(const vfloat16& v) { - return _mm512_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vfloat16 shuffle(const vfloat16& v) { - return _mm512_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); - } - - template<int i> - __forceinline vfloat16 shuffle4(const vfloat16& v) { - return _mm512_shuffle_f32x4(v, v ,_MM_SHUFFLE(i, i, i, i)); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vfloat16 shuffle4(const vfloat16& v) { - return _mm512_shuffle_f32x4(v, v, _MM_SHUFFLE(i3, i2, i1, i0)); - } - - __forceinline vfloat16 interleave_even(const vfloat16& a, const vfloat16& b) { - return _mm512_castsi512_ps(_mm512_mask_shuffle_epi32(_mm512_castps_si512(a), mm512_int2mask(0xaaaa), _mm512_castps_si512(b), (_MM_PERM_ENUM)0xb1)); - } - - __forceinline vfloat16 interleave_odd(const vfloat16& a, const vfloat16& b) { - return _mm512_castsi512_ps(_mm512_mask_shuffle_epi32(_mm512_castps_si512(b), mm512_int2mask(0x5555), _mm512_castps_si512(a), (_MM_PERM_ENUM)0xb1)); - } - - __forceinline vfloat16 interleave2_even(const vfloat16& a, const vfloat16& b) { - /* mask should be 8-bit but is 16-bit to reuse for interleave_even */ - return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a), mm512_int2mask(0xaaaa), _mm512_castps_si512(b), (_MM_PERM_ENUM)0xb1)); - } - - __forceinline vfloat16 
interleave2_odd(const vfloat16& a, const vfloat16& b) { - /* mask should be 8-bit but is 16-bit to reuse for interleave_odd */ - return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b), mm512_int2mask(0x5555), _mm512_castps_si512(a), (_MM_PERM_ENUM)0xb1)); - } - - __forceinline vfloat16 interleave4_even(const vfloat16& a, const vfloat16& b) { - return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a), mm512_int2mask(0xcc), _mm512_castps_si512(b), (_MM_PERM_ENUM)0x4e)); - } - - __forceinline vfloat16 interleave4_odd(const vfloat16& a, const vfloat16& b) { - return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b), mm512_int2mask(0x33), _mm512_castps_si512(a), (_MM_PERM_ENUM)0x4e)); - } - - __forceinline vfloat16 permute(vfloat16 v, __m512i index) { - return _mm512_castsi512_ps(_mm512_permutexvar_epi32(index, _mm512_castps_si512(v))); - } - - __forceinline vfloat16 reverse(const vfloat16& v) { - return permute(v,_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)); - } - - template<int i> - __forceinline vfloat16 align_shift_right(const vfloat16& a, const vfloat16& b) { - return _mm512_castsi512_ps(_mm512_alignr_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b),i)); - }; - - template<int i> - __forceinline vfloat16 mask_align_shift_right(const vboolf16& mask, vfloat16& c, const vfloat16& a, const vfloat16& b) { - return _mm512_castsi512_ps(_mm512_mask_alignr_epi32(_mm512_castps_si512(c),mask,_mm512_castps_si512(a),_mm512_castps_si512(b),i)); - }; - - __forceinline vfloat16 shift_left_1(const vfloat16& a) { - vfloat16 z = zero; - return mask_align_shift_right<15>(0xfffe,z,a,a); - } - - __forceinline vfloat16 shift_right_1(const vfloat16& x) { - return align_shift_right<1>(zero,x); - } - - __forceinline float toScalar(const vfloat16& v) { return mm512_cvtss_f32(v); } - - - template<int i> __forceinline vfloat16 insert4(const vfloat16& a, const vfloat4& b) { return _mm512_insertf32x4(a, b, i); } - - 
template<int N, int i> - vfloat<N> extractN(const vfloat16& v); - - template<> __forceinline vfloat4 extractN<4,0>(const vfloat16& v) { return _mm512_castps512_ps128(v); } - template<> __forceinline vfloat4 extractN<4,1>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 1); } - template<> __forceinline vfloat4 extractN<4,2>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 2); } - template<> __forceinline vfloat4 extractN<4,3>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 3); } - - template<> __forceinline vfloat8 extractN<8,0>(const vfloat16& v) { return _mm512_castps512_ps256(v); } - template<> __forceinline vfloat8 extractN<8,1>(const vfloat16& v) { return _mm512_extractf32x8_ps(v, 1); } - - template<int i> __forceinline vfloat4 extract4 (const vfloat16& v) { return _mm512_extractf32x4_ps(v, i); } - template<> __forceinline vfloat4 extract4<0>(const vfloat16& v) { return _mm512_castps512_ps128(v); } - - template<int i> __forceinline vfloat8 extract8 (const vfloat16& v) { return _mm512_extractf32x8_ps(v, i); } - template<> __forceinline vfloat8 extract8<0>(const vfloat16& v) { return _mm512_castps512_ps256(v); } - - //////////////////////////////////////////////////////////////////////////////// - /// Transpose - //////////////////////////////////////////////////////////////////////////////// - - __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3, - vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3) - { -#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL - vfloat16 a0a1_c0c1 = interleave_even(r0, r1); - vfloat16 a2a3_c2c3 = interleave_even(r2, r3); - vfloat16 b0b1_d0d1 = interleave_odd (r0, r1); - vfloat16 b2b3_d2d3 = interleave_odd (r2, r3); - - c0 = interleave2_even(a0a1_c0c1, a2a3_c2c3); - c1 = interleave2_even(b0b1_d0d1, b2b3_d2d3); - c2 = interleave2_odd (a0a1_c0c1, a2a3_c2c3); - c3 = interleave2_odd (b0b1_d0d1, b2b3_d2d3); -#else - vfloat16 a0a2_b0b2 = unpacklo(r0, r2); - 
vfloat16 c0c2_d0d2 = unpackhi(r0, r2); - vfloat16 a1a3_b1b3 = unpacklo(r1, r3); - vfloat16 c1c3_d1d3 = unpackhi(r1, r3); - - c0 = unpacklo(a0a2_b0b2, a1a3_b1b3); - c1 = unpackhi(a0a2_b0b2, a1a3_b1b3); - c2 = unpacklo(c0c2_d0d2, c1c3_d1d3); - c3 = unpackhi(c0c2_d0d2, c1c3_d1d3); -#endif - } - - __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, - const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, - const vfloat4& r8, const vfloat4& r9, const vfloat4& r10, const vfloat4& r11, - const vfloat4& r12, const vfloat4& r13, const vfloat4& r14, const vfloat4& r15, - vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3) - { - return transpose(vfloat16(r0, r4, r8, r12), vfloat16(r1, r5, r9, r13), vfloat16(r2, r6, r10, r14), vfloat16(r3, r7, r11, r15), - c0, c1, c2, c3); - } - - __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3, - const vfloat16& r4, const vfloat16& r5, const vfloat16& r6, const vfloat16& r7, - vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3, - vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7) - { - vfloat16 a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3; - transpose(r0, r1, r2, r3, a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3); - - vfloat16 a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7; - transpose(r4, r5, r6, r7, a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7); - - c0 = interleave4_even(a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7); - c1 = interleave4_even(b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7); - c2 = interleave4_even(c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7); - c3 = interleave4_even(d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7); - c4 = interleave4_odd (a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7); - c5 = interleave4_odd (b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7); - c6 = interleave4_odd (c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7); - 
c7 = interleave4_odd (d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7); - } - - __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, - const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7, - const vfloat8& r8, const vfloat8& r9, const vfloat8& r10, const vfloat8& r11, - const vfloat8& r12, const vfloat8& r13, const vfloat8& r14, const vfloat8& r15, - vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3, - vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7) - { - return transpose(vfloat16(r0, r8), vfloat16(r1, r9), vfloat16(r2, r10), vfloat16(r3, r11), - vfloat16(r4, r12), vfloat16(r5, r13), vfloat16(r6, r14), vfloat16(r7, r15), - c0, c1, c2, c3, c4, c5, c6, c7); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat16 vreduce_add2(vfloat16 x) { return x + shuffle<1,0,3,2>(x); } - __forceinline vfloat16 vreduce_add4(vfloat16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } - __forceinline vfloat16 vreduce_add8(vfloat16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } - __forceinline vfloat16 vreduce_add (vfloat16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } - - __forceinline vfloat16 vreduce_min2(vfloat16 x) { return min(x, shuffle<1,0,3,2>(x)); } - __forceinline vfloat16 vreduce_min4(vfloat16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } - __forceinline vfloat16 vreduce_min8(vfloat16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } - __forceinline vfloat16 vreduce_min (vfloat16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } - - __forceinline vfloat16 vreduce_max2(vfloat16 x) { return max(x, shuffle<1,0,3,2>(x)); } - __forceinline vfloat16 vreduce_max4(vfloat16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } - __forceinline vfloat16 
vreduce_max8(vfloat16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } - __forceinline vfloat16 vreduce_max (vfloat16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } - - __forceinline float reduce_add(const vfloat16& v) { return toScalar(vreduce_add(v)); } - __forceinline float reduce_min(const vfloat16& v) { return toScalar(vreduce_min(v)); } - __forceinline float reduce_max(const vfloat16& v) { return toScalar(vreduce_max(v)); } - - __forceinline size_t select_min(const vfloat16& v) { - return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_min(v)),_MM_CMPINT_EQ))); - } - - __forceinline size_t select_max(const vfloat16& v) { - return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_max(v)),_MM_CMPINT_EQ))); - } - - __forceinline size_t select_min(const vboolf16& valid, const vfloat16& v) - { - const vfloat16 a = select(valid,v,vfloat16(pos_inf)); - const vbool16 valid_min = valid & (a == vreduce_min(a)); - return bsf(movemask(any(valid_min) ? valid_min : valid)); - } - - __forceinline size_t select_max(const vboolf16& valid, const vfloat16& v) - { - const vfloat16 a = select(valid,v,vfloat16(neg_inf)); - const vbool16 valid_max = valid & (a == vreduce_max(a)); - return bsf(movemask(any(valid_max) ? 
valid_max : valid)); - } - - __forceinline vfloat16 prefix_sum(const vfloat16& a) - { - const vfloat16 z(zero); - vfloat16 v = a; - v = v + align_shift_right<16-1>(v,z); - v = v + align_shift_right<16-2>(v,z); - v = v + align_shift_right<16-4>(v,z); - v = v + align_shift_right<16-8>(v,z); - return v; - } - - __forceinline vfloat16 reverse_prefix_sum(const vfloat16& a) - { - const vfloat16 z(zero); - vfloat16 v = a; - v = v + align_shift_right<1>(z,v); - v = v + align_shift_right<2>(z,v); - v = v + align_shift_right<4>(z,v); - v = v + align_shift_right<8>(z,v); - return v; - } - - __forceinline vfloat16 prefix_min(const vfloat16& a) - { - const vfloat16 z(pos_inf); - vfloat16 v = a; - v = min(v,align_shift_right<16-1>(v,z)); - v = min(v,align_shift_right<16-2>(v,z)); - v = min(v,align_shift_right<16-4>(v,z)); - v = min(v,align_shift_right<16-8>(v,z)); - return v; - } - - __forceinline vfloat16 prefix_max(const vfloat16& a) - { - const vfloat16 z(neg_inf); - vfloat16 v = a; - v = max(v,align_shift_right<16-1>(v,z)); - v = max(v,align_shift_right<16-2>(v,z)); - v = max(v,align_shift_right<16-4>(v,z)); - v = max(v,align_shift_right<16-8>(v,z)); - return v; - } - - - __forceinline vfloat16 reverse_prefix_min(const vfloat16& a) - { - const vfloat16 z(pos_inf); - vfloat16 v = a; - v = min(v,align_shift_right<1>(z,v)); - v = min(v,align_shift_right<2>(z,v)); - v = min(v,align_shift_right<4>(z,v)); - v = min(v,align_shift_right<8>(z,v)); - return v; - } - - __forceinline vfloat16 reverse_prefix_max(const vfloat16& a) - { - const vfloat16 z(neg_inf); - vfloat16 v = a; - v = max(v,align_shift_right<1>(z,v)); - v = max(v,align_shift_right<2>(z,v)); - v = max(v,align_shift_right<4>(z,v)); - v = max(v,align_shift_right<8>(z,v)); - return v; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Memory load and store operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat16 
loadAOS4to16f(const float& x, const float& y, const float& z) - { - vfloat16 f = zero; - f = select(0x1111,vfloat16::broadcast(&x),f); - f = select(0x2222,vfloat16::broadcast(&y),f); - f = select(0x4444,vfloat16::broadcast(&z),f); - return f; - } - - __forceinline vfloat16 loadAOS4to16f(unsigned int index, - const vfloat16& x, - const vfloat16& y, - const vfloat16& z) - { - vfloat16 f = zero; - f = select(0x1111,vfloat16::broadcast((float*)&x + index),f); - f = select(0x2222,vfloat16::broadcast((float*)&y + index),f); - f = select(0x4444,vfloat16::broadcast((float*)&z + index),f); - return f; - } - - __forceinline vfloat16 loadAOS4to16f(unsigned int index, - const vfloat16& x, - const vfloat16& y, - const vfloat16& z, - const vfloat16& fill) - { - vfloat16 f = fill; - f = select(0x1111,vfloat16::broadcast((float*)&x + index),f); - f = select(0x2222,vfloat16::broadcast((float*)&y + index),f); - f = select(0x4444,vfloat16::broadcast((float*)&z + index),f); - return f; - } - - __forceinline vfloat16 rcp_safe(const vfloat16& a) { - return rcp(select(a != vfloat16(zero), a, vfloat16(min_rcp_input))); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat16& v) - { - cout << "<" << v[0]; - for (int i=1; i<16; i++) cout << ", " << v[i]; - cout << ">"; - return cout; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h b/thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h deleted file mode 100644 index 5732c0fbc8..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h +++ /dev/null @@ -1,925 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 4-wide SSE float type */ - template<> - struct vfloat<4> - { - ALIGNED_STRUCT_(16); - - typedef 
vboolf4 Bool; - typedef vint4 Int; - typedef vfloat4 Float; - - enum { size = 4 }; // number of SIMD elements - union { __m128 v; float f[4]; int i[4]; }; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat() {} - __forceinline vfloat(const vfloat4& other) { v = other.v; } - __forceinline vfloat4& operator =(const vfloat4& other) { v = other.v; return *this; } - - __forceinline vfloat(__m128 a) : v(a) {} - __forceinline operator const __m128&() const { return v; } - __forceinline operator __m128&() { return v; } - - __forceinline vfloat(float a) : v(_mm_set1_ps(a)) {} - __forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {} - - __forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {} -#if defined(__aarch64__) - __forceinline explicit vfloat(const vuint4& x) { - v = vcvtq_f32_u32(vreinterpretq_u32_s32(x.v)); - } -#else - __forceinline explicit vfloat(const vuint4& x) { - const __m128i a = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF)); - const __m128i b = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31 - const __m128 af = _mm_cvtepi32_ps(a); - const __m128 bf = _mm_castsi128_ps(b); - v = _mm_add_ps(af,bf); - } -#endif - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat(ZeroTy) : v(_mm_setzero_ps()) {} - __forceinline vfloat(OneTy) : v(_mm_set1_ps(1.0f)) {} - __forceinline vfloat(PosInfTy) : v(_mm_set1_ps(pos_inf)) {} - __forceinline vfloat(NegInfTy) : v(_mm_set1_ps(neg_inf)) {} - __forceinline vfloat(StepTy) : v(_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)) {} - __forceinline vfloat(NaNTy) : v(_mm_set1_ps(nan)) {} - __forceinline vfloat(UndefinedTy) 
: v(_mm_undefined_ps()) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline vfloat4 load (const void* a) { return _mm_load_ps((float*)a); } - static __forceinline vfloat4 loadu(const void* a) { return _mm_loadu_ps((float*)a); } - - static __forceinline void store (void* ptr, const vfloat4& v) { _mm_store_ps((float*)ptr,v); } - static __forceinline void storeu(void* ptr, const vfloat4& v) { _mm_storeu_ps((float*)ptr,v); } - -#if defined(__AVX512VL__) - - static __forceinline vfloat4 compact(const vboolf4& mask, vfloat4 &v) { - return _mm_mask_compress_ps(v, mask, v); - } - static __forceinline vfloat4 compact(const vboolf4& mask, vfloat4 &a, const vfloat4& b) { - return _mm_mask_compress_ps(a, mask, b); - } - - static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_ps (_mm_setzero_ps(),mask,(float*)ptr); } - static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_ps(_mm_setzero_ps(),mask,(float*)ptr); } - - static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_store_ps ((float*)ptr,mask,v); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_storeu_ps((float*)ptr,mask,v); } -#elif defined(__AVX__) - static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); } - static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); } - - static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); } -#else - static __forceinline 
vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_load_ps ((float*)ptr),mask); } - static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_loadu_ps((float*)ptr),mask); } - - static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { store (ptr,select(mask,v,load (ptr))); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { storeu(ptr,select(mask,v,loadu(ptr))); } -#endif - -#if defined(__AVX__) - static __forceinline vfloat4 broadcast(const void* a) { return _mm_broadcast_ss((float*)a); } -#else - static __forceinline vfloat4 broadcast(const void* a) { return _mm_set1_ps(*(float*)a); } -#endif - - static __forceinline vfloat4 load_nt (const float* ptr) { -#if defined (__SSE4_1__) - return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)ptr)); -#else - return _mm_load_ps(ptr); -#endif - } - -#if defined(__aarch64__) - static __forceinline vfloat4 load(const int8_t* ptr) { - return __m128(_mm_load4epi8_f32(((__m128i*)ptr))); - } -#elif defined(__SSE4_1__) - static __forceinline vfloat4 load(const int8_t* ptr) { - return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr))); - } -#else - static __forceinline vfloat4 load(const int8_t* ptr) { - return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); - } -#endif - -#if defined(__aarch64__) - static __forceinline vfloat4 load(const uint8_t* ptr) { - return __m128(_mm_load4epu8_f32(((__m128i*)ptr))); - } -#elif defined(__SSE4_1__) - static __forceinline vfloat4 load(const uint8_t* ptr) { - return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr))); - } -#else - static __forceinline vfloat4 load(const uint8_t* ptr) { - //return _mm_cvtpu8_ps(*(__m64*)ptr); // don't enable, will use MMX instructions - return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); - } -#endif - -#if defined(__aarch64__) - static __forceinline vfloat4 load(const short* ptr) { - return 
__m128(_mm_load4epi16_f32(((__m128i*)ptr))); - } -#elif defined(__SSE4_1__) - static __forceinline vfloat4 load(const short* ptr) { - return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr))); - } -#else - static __forceinline vfloat4 load(const short* ptr) { - return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); - } -#endif - - static __forceinline vfloat4 load(const unsigned short* ptr) { - return _mm_mul_ps(vfloat4(vint4::load(ptr)),vfloat4(1.0f/65535.0f)); - } - - static __forceinline void store_nt(void* ptr, const vfloat4& v) - { -#if defined (__SSE4_1__) -#if defined(__aarch64__) - _mm_stream_ps((float*)ptr,vreinterpretq_s32_f32(v.v)); -#else - _mm_stream_ps((float*)ptr,v); -#endif -#else - _mm_store_ps((float*)ptr,v); -#endif - } - - template<int scale = 4> - static __forceinline vfloat4 gather(const float* ptr, const vint4& index) { -#if defined(__AVX2__) && !defined(__aarch64__) - return _mm_i32gather_ps(ptr, index, scale); -#else - return vfloat4( - *(float*)(((int8_t*)ptr)+scale*index[0]), - *(float*)(((int8_t*)ptr)+scale*index[1]), - *(float*)(((int8_t*)ptr)+scale*index[2]), - *(float*)(((int8_t*)ptr)+scale*index[3])); -#endif - } - - template<int scale = 4> - static __forceinline vfloat4 gather(const vboolf4& mask, const float* ptr, const vint4& index) { - vfloat4 r = zero; -#if defined(__AVX512VL__) - return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale); -#elif defined(__AVX2__) && !defined(__aarch64__) - return _mm_mask_i32gather_ps(r, ptr, index, mask, scale); -#else - if (likely(mask[0])) r[0] = *(float*)(((int8_t*)ptr)+scale*index[0]); - if (likely(mask[1])) r[1] = *(float*)(((int8_t*)ptr)+scale*index[1]); - if (likely(mask[2])) r[2] = *(float*)(((int8_t*)ptr)+scale*index[2]); - if (likely(mask[3])) r[3] = *(float*)(((int8_t*)ptr)+scale*index[3]); - return r; -#endif - } - - template<int scale = 4> - static __forceinline void scatter(void* ptr, const vint4& index, const vfloat4& v) - { -#if defined(__AVX512VL__) - 
_mm_i32scatter_ps((float*)ptr, index, v, scale); -#else - *(float*)(((int8_t*)ptr)+scale*index[0]) = v[0]; - *(float*)(((int8_t*)ptr)+scale*index[1]) = v[1]; - *(float*)(((int8_t*)ptr)+scale*index[2]) = v[2]; - *(float*)(((int8_t*)ptr)+scale*index[3]) = v[3]; -#endif - } - - template<int scale = 4> - static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vfloat4& v) - { -#if defined(__AVX512VL__) - _mm_mask_i32scatter_ps((float*)ptr ,mask, index, v, scale); -#else - if (likely(mask[0])) *(float*)(((int8_t*)ptr)+scale*index[0]) = v[0]; - if (likely(mask[1])) *(float*)(((int8_t*)ptr)+scale*index[1]) = v[1]; - if (likely(mask[2])) *(float*)(((int8_t*)ptr)+scale*index[2]) = v[2]; - if (likely(mask[3])) *(float*)(((int8_t*)ptr)+scale*index[3]) = v[3]; -#endif - } - - static __forceinline void store(const vboolf4& mask, int8_t* ptr, const vint4& ofs, const vfloat4& v) { - scatter<1>(mask,ptr,ofs,v); - } - static __forceinline void store(const vboolf4& mask, float* ptr, const vint4& ofs, const vfloat4& v) { - scatter<4>(mask,ptr,ofs,v); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const float& operator [](size_t index) const { assert(index < 4); return f[index]; } - __forceinline float& operator [](size_t index) { assert(index < 4); return f[index]; } - - friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) { -#if defined(__AVX512VL__) - return _mm_mask_blend_ps(m, f, t); -#elif defined(__SSE4_1__) || (defined(__aarch64__)) - return _mm_blendv_ps(f, t, m); -#else - return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); -#endif - } - }; - - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - 
__forceinline vfloat4 asFloat(const vint4& a) { return _mm_castsi128_ps(a); } - __forceinline vint4 asInt (const vfloat4& a) { return _mm_castps_si128(a); } - __forceinline vuint4 asUInt (const vfloat4& a) { return _mm_castps_si128(a); } - - __forceinline vint4 toInt (const vfloat4& a) { return vint4(a); } - __forceinline vfloat4 toFloat(const vint4& a) { return vfloat4(a); } - - __forceinline vfloat4 operator +(const vfloat4& a) { return a; } -#if defined(__aarch64__) - __forceinline vfloat4 operator -(const vfloat4& a) { - return vnegq_f32(a); - } -#else - __forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } -#endif - -#if defined(__aarch64__) - __forceinline vfloat4 abs(const vfloat4& a) { return _mm_abs_ps(a); } -#else - __forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); } -#endif - -#if defined(__AVX512VL__) - __forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); } -#else - __forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); } -#endif - -#if defined(__aarch64__) - __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a, vreinterpretq_f32_u32(v0x80000000)); } -#else - __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } -#endif - - __forceinline vfloat4 rcp(const vfloat4& a) - { -#if defined(__aarch64__) -#if defined(BUILD_IOS) - return vfloat4(vdivq_f32(vdupq_n_f32(1.0f),a.v)); -#else //BUILD_IOS - __m128 reciprocal = _mm_rcp_ps(a); - reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); - // +1 round since NEON's reciprocal estimate instruction has less accuracy than SSE2's rcp. 
- reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); - return (const vfloat4)reciprocal; -#endif // BUILD_IOS -#else - -#if defined(__AVX512VL__) - const vfloat4 r = _mm_rcp14_ps(a); -#else - const vfloat4 r = _mm_rcp_ps(a); -#endif - -#if defined(__AVX2__) - return _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); -#else - return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); -#endif - -#endif //defined(__aarch64__) - } - __forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); } - __forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); } - - __forceinline vfloat4 rsqrt(const vfloat4& a) - { -#if defined(__aarch64__) - vfloat4 r = _mm_rsqrt_ps(a); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); - return r; -#else - -#if defined(__AVX512VL__) - const vfloat4 r = _mm_rsqrt14_ps(a); -#else - const vfloat4 r = _mm_rsqrt_ps(a); -#endif - -#if defined(__AVX2__) - return _mm_fmadd_ps(_mm_set1_ps(1.5f), r, - _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); -#else - return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r), - _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); -#endif - -#endif - } - - __forceinline vboolf4 isnan(const vfloat4& a) { -#if defined(__aarch64__) - const vfloat4 b = _mm_and_ps(a, vreinterpretq_f32_u32(v0x7fffffff)); -#else - const vfloat4 b = _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); -#endif -#if defined(__AVX512VL__) - return _mm_cmp_epi32_mask(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000), _MM_CMPINT_GT); -#else - return _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000))); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - 
- __forceinline vfloat4 operator +(const vfloat4& a, const vfloat4& b) { return _mm_add_ps(a, b); } - __forceinline vfloat4 operator +(const vfloat4& a, float b) { return a + vfloat4(b); } - __forceinline vfloat4 operator +(float a, const vfloat4& b) { return vfloat4(a) + b; } - - __forceinline vfloat4 operator -(const vfloat4& a, const vfloat4& b) { return _mm_sub_ps(a, b); } - __forceinline vfloat4 operator -(const vfloat4& a, float b) { return a - vfloat4(b); } - __forceinline vfloat4 operator -(float a, const vfloat4& b) { return vfloat4(a) - b; } - - __forceinline vfloat4 operator *(const vfloat4& a, const vfloat4& b) { return _mm_mul_ps(a, b); } - __forceinline vfloat4 operator *(const vfloat4& a, float b) { return a * vfloat4(b); } - __forceinline vfloat4 operator *(float a, const vfloat4& b) { return vfloat4(a) * b; } - - __forceinline vfloat4 operator /(const vfloat4& a, const vfloat4& b) { return _mm_div_ps(a,b); } - __forceinline vfloat4 operator /(const vfloat4& a, float b) { return a/vfloat4(b); } - __forceinline vfloat4 operator /(float a, const vfloat4& b) { return vfloat4(a)/b; } - - __forceinline vfloat4 operator &(const vfloat4& a, const vfloat4& b) { return _mm_and_ps(a,b); } - __forceinline vfloat4 operator |(const vfloat4& a, const vfloat4& b) { return _mm_or_ps(a,b); } - __forceinline vfloat4 operator ^(const vfloat4& a, const vfloat4& b) { return _mm_xor_ps(a,b); } - __forceinline vfloat4 operator ^(const vfloat4& a, const vint4& b) { return _mm_xor_ps(a,_mm_castsi128_ps(b)); } - - __forceinline vfloat4 min(const vfloat4& a, const vfloat4& b) { return _mm_min_ps(a,b); } - __forceinline vfloat4 min(const vfloat4& a, float b) { return _mm_min_ps(a,vfloat4(b)); } - __forceinline vfloat4 min(float a, const vfloat4& b) { return _mm_min_ps(vfloat4(a),b); } - - __forceinline vfloat4 max(const vfloat4& a, const vfloat4& b) { return _mm_max_ps(a,b); } - __forceinline vfloat4 max(const vfloat4& a, float b) { return _mm_max_ps(a,vfloat4(b)); } - 
__forceinline vfloat4 max(float a, const vfloat4& b) { return _mm_max_ps(vfloat4(a),b); } - -#if defined(__SSE4_1__) || defined(__aarch64__) - - __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { - const vint4 ai = _mm_castps_si128(a); - const vint4 bi = _mm_castps_si128(b); - const vint4 ci = _mm_min_epi32(ai,bi); - return _mm_castsi128_ps(ci); - } - - __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) { - const vint4 ai = _mm_castps_si128(a); - const vint4 bi = _mm_castps_si128(b); - const vint4 ci = _mm_max_epi32(ai,bi); - return _mm_castsi128_ps(ci); - } - - __forceinline vfloat4 minui(const vfloat4& a, const vfloat4& b) { - const vint4 ai = _mm_castps_si128(a); - const vint4 bi = _mm_castps_si128(b); - const vint4 ci = _mm_min_epu32(ai,bi); - return _mm_castsi128_ps(ci); - } - - __forceinline vfloat4 maxui(const vfloat4& a, const vfloat4& b) { - const vint4 ai = _mm_castps_si128(a); - const vint4 bi = _mm_castps_si128(b); - const vint4 ci = _mm_max_epu32(ai,bi); - return _mm_castsi128_ps(ci); - } -#else - __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { - return min(a,b); - } - - __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) { - return max(a,b); - } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX2__) - __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmadd_ps(a,b,c); } - __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmsub_ps(a,b,c); } - __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmadd_ps(a,b,c); } - __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); } -#else - -#if defined(__aarch64__) - __forceinline vfloat4 madd (const 
vfloat4& a, const vfloat4& b, const vfloat4& c) { - return _mm_madd_ps(a, b, c); //a*b+c; - } - __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { - return _mm_msub_ps(a, b, c); //-a*b+c; - } - __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { - return vnegq_f32(vfmaq_f32(c,a, b)); - } -#else - __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; } - __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;} - __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b-c; } -#endif - __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; } - -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat4& operator +=(vfloat4& a, const vfloat4& b) { return a = a + b; } - __forceinline vfloat4& operator +=(vfloat4& a, float b) { return a = a + b; } - - __forceinline vfloat4& operator -=(vfloat4& a, const vfloat4& b) { return a = a - b; } - __forceinline vfloat4& operator -=(vfloat4& a, float b) { return a = a - b; } - - __forceinline vfloat4& operator *=(vfloat4& a, const vfloat4& b) { return a = a * b; } - __forceinline vfloat4& operator *=(vfloat4& a, float b) { return a = a * b; } - - __forceinline vfloat4& operator /=(vfloat4& a, const vfloat4& b) { return a = a / b; } - __forceinline vfloat4& operator /=(vfloat4& a, float b) { return a = a / b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, 
b, _MM_CMPINT_EQ); } - __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_NE); } - __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LT); } - __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GE); } - __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GT); } - __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LE); } -#else - __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); } - __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a, b); } - __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); } -#if defined(__aarch64__) - __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpge_ps (a, b); } - __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpgt_ps (a, b); } -#else - __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); } - __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); } -#endif - __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); } -#endif - - __forceinline vboolf4 operator ==(const vfloat4& a, float b) { return a == vfloat4(b); } - __forceinline vboolf4 operator ==(float a, const vfloat4& b) { return vfloat4(a) == b; } - - __forceinline vboolf4 operator !=(const vfloat4& a, float b) { return a != vfloat4(b); } - __forceinline vboolf4 operator !=(float a, const vfloat4& b) { return vfloat4(a) != b; } - - __forceinline vboolf4 operator < (const vfloat4& a, float b) { return a < vfloat4(b); } - __forceinline vboolf4 
operator < (float a, const vfloat4& b) { return vfloat4(a) < b; } - - __forceinline vboolf4 operator >=(const vfloat4& a, float b) { return a >= vfloat4(b); } - __forceinline vboolf4 operator >=(float a, const vfloat4& b) { return vfloat4(a) >= b; } - - __forceinline vboolf4 operator > (const vfloat4& a, float b) { return a > vfloat4(b); } - __forceinline vboolf4 operator > (float a, const vfloat4& b) { return vfloat4(a) > b; } - - __forceinline vboolf4 operator <=(const vfloat4& a, float b) { return a <= vfloat4(b); } - __forceinline vboolf4 operator <=(float a, const vfloat4& b) { return vfloat4(a) <= b; } - - __forceinline vboolf4 eq(const vfloat4& a, const vfloat4& b) { return a == b; } - __forceinline vboolf4 ne(const vfloat4& a, const vfloat4& b) { return a != b; } - __forceinline vboolf4 lt(const vfloat4& a, const vfloat4& b) { return a < b; } - __forceinline vboolf4 ge(const vfloat4& a, const vfloat4& b) { return a >= b; } - __forceinline vboolf4 gt(const vfloat4& a, const vfloat4& b) { return a > b; } - __forceinline vboolf4 le(const vfloat4& a, const vfloat4& b) { return a <= b; } - -#if defined(__AVX512VL__) - __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); } - __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); } - __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); } - __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); } - __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); } - __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); 
} -#else - __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a == b); } - __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a != b); } - __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a < b); } - __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a >= b); } - __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a > b); } - __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a <= b); } -#endif - - template<int mask> - __forceinline vfloat4 select(const vfloat4& t, const vfloat4& f) - { -#if defined(__SSE4_1__) - return _mm_blend_ps(f, t, mask); -#else - return select(vboolf4(mask), t, f); -#endif - } - -#if defined(__aarch64__) - template<> __forceinline vfloat4 select<0>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vzero)); - } - template<> __forceinline vfloat4 select<1>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v000F)); - } - template<> __forceinline vfloat4 select<2>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v00F0)); - } - template<> __forceinline vfloat4 select<3>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v00FF)); - } - template<> __forceinline vfloat4 select<4>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0F00)); - } - template<> __forceinline vfloat4 select<5>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0F0F)); - } - template<> __forceinline vfloat4 select<6>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0FF0)); - } - template<> __forceinline vfloat4 
select<7>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0FFF)); - } - template<> __forceinline vfloat4 select<8>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF000)); - } - template<> __forceinline vfloat4 select<9>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF00F)); - } - template<> __forceinline vfloat4 select<10>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF0F0)); - } - template<> __forceinline vfloat4 select<11>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF0FF)); - } - template<> __forceinline vfloat4 select<12>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFF00)); - } - template<> __forceinline vfloat4 select<13>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFF0F)); - } - template<> __forceinline vfloat4 select<14>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFFF0)); - } - template<> __forceinline vfloat4 select<15>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFFFF)); - } -#endif - - __forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t) { - return madd(t,b-a,a); - } - - __forceinline bool isvalid(const vfloat4& v) { - return all((v > vfloat4(-FLT_LARGE)) & (v < vfloat4(+FLT_LARGE))); - } - - __forceinline bool is_finite(const vfloat4& a) { - return all((a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX))); - } - - __forceinline bool is_finite(const vboolf4& valid, const vfloat4& a) { - return all(valid, (a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX))); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Rounding Functions - 
//////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) - __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } // towards -inf - __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } // toward +inf - __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } // towards 0 - __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } // to nearest, ties to even. NOTE(LTE): arm clang uses vrndnq, old gcc uses vrndqn? -#elif defined (__SSE4_1__) - __forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } - __forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } - __forceinline vfloat4 trunc(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO ); } - __forceinline vfloat4 round(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } // (even) https://www.felixcloutier.com/x86/roundpd -#else - __forceinline vfloat4 floor(const vfloat4& a) { return vfloat4(floorf(a[0]),floorf(a[1]),floorf(a[2]),floorf(a[3])); } - __forceinline vfloat4 ceil (const vfloat4& a) { return vfloat4(ceilf (a[0]),ceilf (a[1]),ceilf (a[2]),ceilf (a[3])); } - __forceinline vfloat4 trunc(const vfloat4& a) { return vfloat4(truncf(a[0]),truncf(a[1]),truncf(a[2]),truncf(a[3])); } - __forceinline vfloat4 round(const vfloat4& a) { return vfloat4(roundf(a[0]),roundf(a[1]),roundf(a[2]),roundf(a[3])); } -#endif - __forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); } - - __forceinline vint4 floori(const vfloat4& a) { -#if defined(__aarch64__) - return vcvtq_s32_f32(floor(a)); -#elif defined(__SSE4_1__) - return vint4(floor(a)); -#else - return vint4(a-vfloat4(0.5f)); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - 
//////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a, b); } - __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); } - -#if defined(__aarch64__) - template<int i0, int i1, int i2, int i3> - __forceinline vfloat4 shuffle(const vfloat4& v) { - return vreinterpretq_f32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); - } - template<int i0, int i1, int i2, int i3> - __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) { - return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); - } -#else - template<int i0, int i1, int i2, int i3> - __forceinline vfloat4 shuffle(const vfloat4& v) { - return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) { - return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); - } -#endif - -#if defined (__SSSE3__) - __forceinline vfloat4 shuffle8(const vfloat4& a, const vint4& shuf) { - return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf)); - } -#endif - -#if defined(__aarch64__) - template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v0022 )); } - template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v1133)); } - template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v0101)); } -#elif defined(__SSE3__) - template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); } - template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); } - template<> __forceinline 
vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); } -#endif - - template<int i> - __forceinline vfloat4 shuffle(const vfloat4& v) { - return shuffle<i,i,i,i>(v); - } - -#if defined(__aarch64__) - template<int i> __forceinline float extract(const vfloat4& a); - template<> __forceinline float extract<0>(const vfloat4& b) { - return b[0]; - } - template<> __forceinline float extract<1>(const vfloat4& b) { - return b[1]; - } - template<> __forceinline float extract<2>(const vfloat4& b) { - return b[2]; - } - template<> __forceinline float extract<3>(const vfloat4& b) { - return b[3]; - } -#elif defined (__SSE4_1__) && !defined(__GNUC__) - template<int i> __forceinline float extract(const vfloat4& a) { return _mm_cvtss_f32(_mm_extract_ps(a,i)); } - template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); } -#else - template<int i> __forceinline float extract(const vfloat4& a) { return _mm_cvtss_f32(shuffle<i,i,i,i>(a)); } - template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); } -#endif - - -#if defined(__aarch64__) - template<int dst> __forceinline vfloat4 insert(const vfloat4& a, float b); - template<> __forceinline vfloat4 insert<0>(const vfloat4& a, float b) - { - vfloat4 c = a; - c[0] = b; - return c; - } - template<> __forceinline vfloat4 insert<1>(const vfloat4& a, float b) - { - vfloat4 c = a; - c[1] = b; - return c; - } - template<> __forceinline vfloat4 insert<2>(const vfloat4& a, float b) - { - vfloat4 c = a; - c[2] = b; - return c; - } - template<> __forceinline vfloat4 insert<3>(const vfloat4& a, float b) - { - vfloat4 c = a; - c[3] = b; - return c; - } -#elif defined (__SSE4_1__) - template<int dst, int src, int clr> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } - template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { 
return insert<dst, src, 0>(a, b); } - template<int dst> __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert<dst, 0>(a, _mm_set_ss(b)); } -#else - template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { vfloat4 c = a; c[dst&3] = b[src&3]; return c; } - template<int dst> __forceinline vfloat4 insert(const vfloat4& a, float b) { vfloat4 c = a; c[dst&3] = b; return c; } -#endif - -#if defined(__aarch64__) - __forceinline float toScalar(const vfloat4& v) { - return v[0]; - } -#else - __forceinline float toScalar(const vfloat4& v) { return _mm_cvtss_f32(v); } -#endif - __forceinline vfloat4 broadcast4f(const vfloat4& a, size_t k) { - return vfloat4::broadcast(&a[k]); - } - - __forceinline vfloat4 shift_right_1(const vfloat4& x) { - return _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(x), 4)); - } - -#if defined (__AVX2__) - __forceinline vfloat4 permute(const vfloat4 &a, const __m128i &index) { - return _mm_permutevar_ps(a,index); - } - - __forceinline vfloat4 broadcast1f(const void* a) { return _mm_broadcast_ss((float*)a); } - -#endif - -#if defined(__AVX512VL__) - template<int i> - __forceinline vfloat4 align_shift_right(const vfloat4& a, const vfloat4& b) { - return _mm_castsi128_ps(_mm_alignr_epi32(_mm_castps_si128(a), _mm_castps_si128(b), i)); - } -#endif - - - //////////////////////////////////////////////////////////////////////////////// - /// Sorting Network - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat4 sort_ascending(const vfloat4& v) - { - const vfloat4 a0 = v; - const vfloat4 b0 = shuffle<1,0,3,2>(a0); - const vfloat4 c0 = min(a0,b0); - const vfloat4 d0 = max(a0,b0); - const vfloat4 a1 = select<0x5 /* 0b0101 */>(c0,d0); - const vfloat4 b1 = shuffle<2,3,0,1>(a1); - const vfloat4 c1 = min(a1,b1); - const vfloat4 d1 = max(a1,b1); - const vfloat4 a2 = select<0x3 /* 0b0011 */>(c1,d1); - const vfloat4 b2 = shuffle<0,2,1,3>(a2); - const 
vfloat4 c2 = min(a2,b2); - const vfloat4 d2 = max(a2,b2); - const vfloat4 a3 = select<0x2 /* 0b0010 */>(c2,d2); - return a3; - } - - __forceinline vfloat4 sort_descending(const vfloat4& v) - { - const vfloat4 a0 = v; - const vfloat4 b0 = shuffle<1,0,3,2>(a0); - const vfloat4 c0 = max(a0,b0); - const vfloat4 d0 = min(a0,b0); - const vfloat4 a1 = select<0x5 /* 0b0101 */>(c0,d0); - const vfloat4 b1 = shuffle<2,3,0,1>(a1); - const vfloat4 c1 = max(a1,b1); - const vfloat4 d1 = min(a1,b1); - const vfloat4 a2 = select<0x3 /* 0b0011 */>(c1,d1); - const vfloat4 b2 = shuffle<0,2,1,3>(a2); - const vfloat4 c2 = max(a2,b2); - const vfloat4 d2 = min(a2,b2); - const vfloat4 a3 = select<0x2 /* 0b0010 */>(c2,d2); - return a3; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Transpose - //////////////////////////////////////////////////////////////////////////////// - - __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2, vfloat4& c3) - { - vfloat4 l02 = unpacklo(r0,r2); - vfloat4 h02 = unpackhi(r0,r2); - vfloat4 l13 = unpacklo(r1,r3); - vfloat4 h13 = unpackhi(r1,r3); - c0 = unpacklo(l02,l13); - c1 = unpackhi(l02,l13); - c2 = unpacklo(h02,h13); - c3 = unpackhi(h02,h13); - } - - __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2) - { - vfloat4 l02 = unpacklo(r0,r2); - vfloat4 h02 = unpackhi(r0,r2); - vfloat4 l13 = unpacklo(r1,r3); - vfloat4 h13 = unpackhi(r1,r3); - c0 = unpacklo(l02,l13); - c1 = unpackhi(l02,l13); - c2 = unpacklo(h02,h13); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) - __forceinline vfloat4 vreduce_min(const vfloat4& v) { float h = vminvq_f32(v); return vdupq_n_f32(h); } - 
__forceinline vfloat4 vreduce_max(const vfloat4& v) { float h = vmaxvq_f32(v); return vdupq_n_f32(h); } - __forceinline vfloat4 vreduce_add(const vfloat4& v) { float h = vaddvq_f32(v); return vdupq_n_f32(h); } -#else - __forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } - __forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } - __forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } -#endif - -#if defined(__aarch64__) - __forceinline float reduce_min(const vfloat4& v) { return vminvq_f32(v); } - __forceinline float reduce_max(const vfloat4& v) { return vmaxvq_f32(v); } - __forceinline float reduce_add(const vfloat4& v) { return vaddvq_f32(v); } -#else - __forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); } - __forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); } - __forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); } -#endif - - __forceinline size_t select_min(const vboolf4& valid, const vfloat4& v) - { - const vfloat4 a = select(valid,v,vfloat4(pos_inf)); - const vbool4 valid_min = valid & (a == vreduce_min(a)); - return bsf(movemask(any(valid_min) ? valid_min : valid)); - } - __forceinline size_t select_max(const vboolf4& valid, const vfloat4& v) - { - const vfloat4 a = select(valid,v,vfloat4(neg_inf)); - const vbool4 valid_max = valid & (a == vreduce_max(a)); - return bsf(movemask(any(valid_max) ? 
valid_max : valid)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline float dot(const vfloat4& a, const vfloat4& b) { - return reduce_add(a*b); - } - - __forceinline vfloat4 cross(const vfloat4& a, const vfloat4& b) - { - const vfloat4 a0 = a; - const vfloat4 b0 = shuffle<1,2,0,3>(b); - const vfloat4 a1 = shuffle<1,2,0,3>(a); - const vfloat4 b1 = b; - return shuffle<1,2,0,3>(prod_diff(a0,b0,a1,b1)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat4& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; - } - -} diff --git a/thirdparty/embree-aarch64/common/simd/vfloat8_avx.h b/thirdparty/embree-aarch64/common/simd/vfloat8_avx.h deleted file mode 100644 index 3c7e4a8cdc..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vfloat8_avx.h +++ /dev/null @@ -1,847 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX float type */ - template<> - struct vfloat<8> - { - ALIGNED_STRUCT_(32); - - typedef vboolf8 Bool; - typedef vint8 Int; - typedef vfloat8 Float; - - enum { size = 8 }; // number of SIMD elements - union { __m256 v; float f[8]; int i[8]; }; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat() {} - __forceinline vfloat(const vfloat8& other) { v = other.v; } - __forceinline vfloat8& operator =(const vfloat8& other) { v = other.v; return *this; } - - 
__forceinline vfloat(__m256 a) : v(a) {} - __forceinline operator const __m256&() const { return v; } - __forceinline operator __m256&() { return v; } - - __forceinline explicit vfloat(const vfloat4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {} - __forceinline vfloat(const vfloat4& a, const vfloat4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} - - __forceinline explicit vfloat(const int8_t* a) : v(_mm256_loadu_ps((const float*)a)) {} - __forceinline vfloat(float a) : v(_mm256_set1_ps(a)) {} - __forceinline vfloat(float a, float b) : v(_mm256_set_ps(b, a, b, a, b, a, b, a)) {} - __forceinline vfloat(float a, float b, float c, float d) : v(_mm256_set_ps(d, c, b, a, d, c, b, a)) {} - __forceinline vfloat(float a, float b, float c, float d, float e, float f, float g, float h) : v(_mm256_set_ps(h, g, f, e, d, c, b, a)) {} - - __forceinline explicit vfloat(__m256i a) : v(_mm256_cvtepi32_ps(a)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat(ZeroTy) : v(_mm256_setzero_ps()) {} - __forceinline vfloat(OneTy) : v(_mm256_set1_ps(1.0f)) {} - __forceinline vfloat(PosInfTy) : v(_mm256_set1_ps(pos_inf)) {} - __forceinline vfloat(NegInfTy) : v(_mm256_set1_ps(neg_inf)) {} - __forceinline vfloat(StepTy) : v(_mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)) {} - __forceinline vfloat(NaNTy) : v(_mm256_set1_ps(nan)) {} - __forceinline vfloat(UndefinedTy) : v(_mm256_undefined_ps()) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline vfloat8 broadcast(const void* a) { - return _mm256_broadcast_ss((float*)a); - } - - static __forceinline vfloat8 broadcast2(const float* a, const float* b) { -#if 
defined(__INTEL_COMPILER) - const vfloat8 v0 = _mm256_broadcast_ss(a); - const vfloat8 v1 = _mm256_broadcast_ss(b); - return _mm256_blend_ps(v1, v0, 0xf); -#else - return _mm256_set_ps(*b,*b,*b,*b,*a,*a,*a,*a); -#endif - } - - static __forceinline vfloat8 broadcast4f(const vfloat4* ptr) { - return _mm256_broadcast_ps((__m128*)ptr); - } - - static __forceinline vfloat8 load(const int8_t* ptr) { -#if defined(__AVX2__) - return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr))); -#else - return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); -#endif - } - - static __forceinline vfloat8 load(const uint8_t* ptr) { -#if defined(__AVX2__) - return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr))); -#else - return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); -#endif - } - - static __forceinline vfloat8 load(const short* ptr) { -#if defined(__AVX2__) - return _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr))); -#else - return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); -#endif - } - - static __forceinline vfloat8 load (const void* ptr) { return _mm256_load_ps((float*)ptr); } - static __forceinline vfloat8 loadu(const void* ptr) { return _mm256_loadu_ps((float*)ptr); } - - static __forceinline void store (void* ptr, const vfloat8& v) { return _mm256_store_ps((float*)ptr,v); } - static __forceinline void storeu(void* ptr, const vfloat8& v) { return _mm256_storeu_ps((float*)ptr,v); } - -#if defined(__AVX512VL__) - - static __forceinline vfloat8 compact(const vboolf8& mask, vfloat8 &v) { - return _mm256_mask_compress_ps(v, mask, v); - } - static __forceinline vfloat8 compact(const vboolf8& mask, vfloat8 &a, const vfloat8& b) { - return _mm256_mask_compress_ps(a, mask, b); - } - - static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_ps (_mm256_setzero_ps(),mask,(float*)ptr); } - static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { 
return _mm256_mask_loadu_ps(_mm256_setzero_ps(),mask,(float*)ptr); } - - static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask,v); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask,v); } -#elif defined(__aarch64__) - static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask.v); } - static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask.v); } - - static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,v); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,v); } -#else - static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); } - static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); } - - static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); } -#endif - -#if defined(__AVX2__) - static __forceinline vfloat8 load_nt(void* ptr) { - return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i*)ptr)); - } -#endif - - static __forceinline void store_nt(void* ptr, const vfloat8& v) { - _mm256_stream_ps((float*)ptr,v); - } - - template<int scale = 4> - static __forceinline vfloat8 gather(const float* ptr, const vint8& index) { -#if defined(__AVX2__) && !defined(__aarch64__) - return _mm256_i32gather_ps(ptr, index ,scale); -#else - return vfloat8( - 
*(float*)(((int8_t*)ptr)+scale*index[0]), - *(float*)(((int8_t*)ptr)+scale*index[1]), - *(float*)(((int8_t*)ptr)+scale*index[2]), - *(float*)(((int8_t*)ptr)+scale*index[3]), - *(float*)(((int8_t*)ptr)+scale*index[4]), - *(float*)(((int8_t*)ptr)+scale*index[5]), - *(float*)(((int8_t*)ptr)+scale*index[6]), - *(float*)(((int8_t*)ptr)+scale*index[7])); -#endif - } - - template<int scale = 4> - static __forceinline vfloat8 gather(const vboolf8& mask, const float* ptr, const vint8& index) { - vfloat8 r = zero; -#if defined(__AVX512VL__) - return _mm256_mmask_i32gather_ps(r, mask, index, ptr, scale); -#elif defined(__AVX2__) && !defined(__aarch64__) - return _mm256_mask_i32gather_ps(r, ptr, index, mask, scale); -#else - if (likely(mask[0])) r[0] = *(float*)(((int8_t*)ptr)+scale*index[0]); - if (likely(mask[1])) r[1] = *(float*)(((int8_t*)ptr)+scale*index[1]); - if (likely(mask[2])) r[2] = *(float*)(((int8_t*)ptr)+scale*index[2]); - if (likely(mask[3])) r[3] = *(float*)(((int8_t*)ptr)+scale*index[3]); - if (likely(mask[4])) r[4] = *(float*)(((int8_t*)ptr)+scale*index[4]); - if (likely(mask[5])) r[5] = *(float*)(((int8_t*)ptr)+scale*index[5]); - if (likely(mask[6])) r[6] = *(float*)(((int8_t*)ptr)+scale*index[6]); - if (likely(mask[7])) r[7] = *(float*)(((int8_t*)ptr)+scale*index[7]); - return r; - #endif - } - - template<int scale = 4> - static __forceinline void scatter(void* ptr, const vint8& ofs, const vfloat8& v) - { -#if defined(__AVX512VL__) - _mm256_i32scatter_ps((float*)ptr, ofs, v, scale); -#else - *(float*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; - *(float*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; - *(float*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; - *(float*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; - *(float*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; - *(float*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; - *(float*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; - *(float*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; -#endif - } - - template<int scale = 4> - static __forceinline void scatter(const 
vboolf8& mask, void* ptr, const vint8& ofs, const vfloat8& v) - { -#if defined(__AVX512VL__) - _mm256_mask_i32scatter_ps((float*)ptr, mask, ofs, v, scale); -#else - if (likely(mask[0])) *(float*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; - if (likely(mask[1])) *(float*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; - if (likely(mask[2])) *(float*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; - if (likely(mask[3])) *(float*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; - if (likely(mask[4])) *(float*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; - if (likely(mask[5])) *(float*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; - if (likely(mask[6])) *(float*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; - if (likely(mask[7])) *(float*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; -#endif - } - - static __forceinline void store(const vboolf8& mask, int8_t* ptr, const vint8& ofs, const vfloat8& v) { - scatter<1>(mask,ptr,ofs,v); - } - static __forceinline void store(const vboolf8& mask, float* ptr, const vint8& ofs, const vfloat8& v) { - scatter<4>(mask,ptr,ofs,v); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const float& operator [](size_t index) const { assert(index < 8); return f[index]; } - __forceinline float& operator [](size_t index) { assert(index < 8); return f[index]; } - }; - - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat8 asFloat(const vint8& a) { return _mm256_castsi256_ps(a); } - __forceinline vint8 asInt (const vfloat8& a) { return _mm256_castps_si256(a); } - - __forceinline vint8 toInt (const vfloat8& a) { return vint8(a); } - __forceinline vfloat8 toFloat(const vint8& a) { return vfloat8(a); } - - __forceinline vfloat8 operator +(const vfloat8& a) { return a; } -#if !defined(__aarch64__) - 
__forceinline vfloat8 operator -(const vfloat8& a) { - const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - return _mm256_xor_ps(a, mask); - } -#else - __forceinline vfloat8 operator -(const vfloat8& a) { - __m256 res; - res.lo = vnegq_f32(a.v.lo); - res.hi = vnegq_f32(a.v.hi); - return res; -} -#endif - -#if !defined(__aarch64__) -__forceinline vfloat8 abs(const vfloat8& a) { - const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)); - return _mm256_and_ps(a, mask); -} -#else -__forceinline vfloat8 abs(const vfloat8& a) { - __m256 res; - res.lo = vabsq_f32(a.v.lo); - res.hi = vabsq_f32(a.v.hi); - return res; -} -#endif - -#if !defined(__aarch64__) - __forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmp_ps(a, vfloat8(zero), _CMP_NGE_UQ)); } -#else - __forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmplt_ps(a, vfloat8(zero))); } -#endif - __forceinline vfloat8 signmsk(const vfloat8& a) { return _mm256_and_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); } - - - static __forceinline vfloat8 rcp(const vfloat8& a) - { -#if defined(BUILD_IOS) && defined(__aarch64__) - // ios devices are faster doing full divide, no need for NR fixup - vfloat8 ret; - const float32x4_t one = vdupq_n_f32(1.0f); - ret.v.lo = vdivq_f32(one, a.v.lo); - ret.v.hi = vdivq_f32(one, a.v.hi); - return ret; -#endif - -#if defined(__AVX512VL__) - const vfloat8 r = _mm256_rcp14_ps(a); -#else - const vfloat8 r = _mm256_rcp_ps(a); -#endif - -#if defined(__AVX2__) //&& !defined(aarch64) - return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vfloat8(2.0f))); -#else - return _mm256_mul_ps(r, _mm256_sub_ps(vfloat8(2.0f), _mm256_mul_ps(r, a))); -#endif - } - __forceinline vfloat8 sqr (const vfloat8& a) { return _mm256_mul_ps(a,a); } - __forceinline vfloat8 sqrt(const vfloat8& a) { return _mm256_sqrt_ps(a); } - - static __forceinline vfloat8 rsqrt(const 
vfloat8& a) - { -#if defined(__AVX512VL__) - const vfloat8 r = _mm256_rsqrt14_ps(a); -#else - const vfloat8 r = _mm256_rsqrt_ps(a); -#endif - -#if defined(__AVX2__) - return _mm256_fmadd_ps(_mm256_set1_ps(1.5f), r, - _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r))); -#else - return _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(1.5f), r), - _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r))); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat8 operator +(const vfloat8& a, const vfloat8& b) { return _mm256_add_ps(a, b); } - __forceinline vfloat8 operator +(const vfloat8& a, float b) { return a + vfloat8(b); } - __forceinline vfloat8 operator +(float a, const vfloat8& b) { return vfloat8(a) + b; } - - __forceinline vfloat8 operator -(const vfloat8& a, const vfloat8& b) { return _mm256_sub_ps(a, b); } - __forceinline vfloat8 operator -(const vfloat8& a, float b) { return a - vfloat8(b); } - __forceinline vfloat8 operator -(float a, const vfloat8& b) { return vfloat8(a) - b; } - - __forceinline vfloat8 operator *(const vfloat8& a, const vfloat8& b) { return _mm256_mul_ps(a, b); } - __forceinline vfloat8 operator *(const vfloat8& a, float b) { return a * vfloat8(b); } - __forceinline vfloat8 operator *(float a, const vfloat8& b) { return vfloat8(a) * b; } - - __forceinline vfloat8 operator /(const vfloat8& a, const vfloat8& b) { return _mm256_div_ps(a, b); } - __forceinline vfloat8 operator /(const vfloat8& a, float b) { return a / vfloat8(b); } - __forceinline vfloat8 operator /(float a, const vfloat8& b) { return vfloat8(a) / b; } - - __forceinline vfloat8 operator &(const vfloat8& a, const vfloat8& b) { return _mm256_and_ps(a,b); } - __forceinline vfloat8 operator |(const vfloat8& a, const vfloat8& b) { return 
_mm256_or_ps(a,b); } - __forceinline vfloat8 operator ^(const vfloat8& a, const vfloat8& b) { return _mm256_xor_ps(a,b); } - __forceinline vfloat8 operator ^(const vfloat8& a, const vint8& b) { return _mm256_xor_ps(a,_mm256_castsi256_ps(b)); } - - __forceinline vfloat8 min(const vfloat8& a, const vfloat8& b) { return _mm256_min_ps(a, b); } - __forceinline vfloat8 min(const vfloat8& a, float b) { return _mm256_min_ps(a, vfloat8(b)); } - __forceinline vfloat8 min(float a, const vfloat8& b) { return _mm256_min_ps(vfloat8(a), b); } - - __forceinline vfloat8 max(const vfloat8& a, const vfloat8& b) { return _mm256_max_ps(a, b); } - __forceinline vfloat8 max(const vfloat8& a, float b) { return _mm256_max_ps(a, vfloat8(b)); } - __forceinline vfloat8 max(float a, const vfloat8& b) { return _mm256_max_ps(vfloat8(a), b); } - - /* need "static __forceinline for MSVC, otherwise we'll link the wrong version in debug mode */ -#if defined(__AVX2__) - - static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) { - const vint8 ai = _mm256_castps_si256(a); - const vint8 bi = _mm256_castps_si256(b); - const vint8 ci = _mm256_min_epi32(ai,bi); - return _mm256_castsi256_ps(ci); - } - - static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) { - const vint8 ai = _mm256_castps_si256(a); - const vint8 bi = _mm256_castps_si256(b); - const vint8 ci = _mm256_max_epi32(ai,bi); - return _mm256_castsi256_ps(ci); - } - - static __forceinline vfloat8 minui(const vfloat8& a, const vfloat8& b) { - const vint8 ai = _mm256_castps_si256(a); - const vint8 bi = _mm256_castps_si256(b); - const vint8 ci = _mm256_min_epu32(ai,bi); - return _mm256_castsi256_ps(ci); - } - - static __forceinline vfloat8 maxui(const vfloat8& a, const vfloat8& b) { - const vint8 ai = _mm256_castps_si256(a); - const vint8 bi = _mm256_castps_si256(b); - const vint8 ci = _mm256_max_epu32(ai,bi); - return _mm256_castsi256_ps(ci); - } - -#else - - static __forceinline vfloat8 mini(const vfloat8& a, const 
vfloat8& b) { - return asFloat(min(asInt(a),asInt(b))); - } - - static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) { - return asFloat(max(asInt(a),asInt(b))); - } - -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX2__) - static __forceinline vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmadd_ps(a,b,c); } - static __forceinline vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmsub_ps(a,b,c); } - static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmadd_ps(a,b,c); } - static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmsub_ps(a,b,c); } -#else - static __forceinline vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b+c; } - static __forceinline vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b-c; } - static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b+c;} - static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b-c; } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat8& operator +=(vfloat8& a, const vfloat8& b) { return a = a + b; } - __forceinline vfloat8& operator +=(vfloat8& a, float b) { return a = a + b; } - - __forceinline vfloat8& operator -=(vfloat8& a, const vfloat8& b) { return a = a - b; } - __forceinline vfloat8& operator -=(vfloat8& a, float b) { return a = a - b; } - - __forceinline vfloat8& operator *=(vfloat8& a, const vfloat8& b) { return a = a * b; } 
- __forceinline vfloat8& operator *=(vfloat8& a, float b) { return a = a * b; } - - __forceinline vfloat8& operator /=(vfloat8& a, const vfloat8& b) { return a = a / b; } - __forceinline vfloat8& operator /=(vfloat8& a, float b) { return a = a / b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_EQ); } - static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_NE); } - static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LT); } - static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GE); } - static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GT); } - static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LE); } - - static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { - return _mm256_mask_blend_ps(m, f, t); - } -#elif !defined(__aarch64__) - __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); } - __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); } - __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); } - __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } - __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return 
_mm256_cmp_ps(a, b, _CMP_NLE_US); } - __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); } - - __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { - return _mm256_blendv_ps(f, t, m); - } -#else - __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmpeq_ps(a, b); } - __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpneq_ps(a, b); } - __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmplt_ps(a, b); } - __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpge_ps(a, b); } - __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmpgt_ps(a, b); } - __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmple_ps(a, b); } - - __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { - return _mm256_blendv_ps(f, t, m); - } - -#endif - - template<int mask> - __forceinline vfloat8 select(const vfloat8& t, const vfloat8& f) { - return _mm256_blend_ps(f, t, mask); - } - - __forceinline vboolf8 operator ==(const vfloat8& a, const float& b) { return a == vfloat8(b); } - __forceinline vboolf8 operator ==(const float& a, const vfloat8& b) { return vfloat8(a) == b; } - - __forceinline vboolf8 operator !=(const vfloat8& a, const float& b) { return a != vfloat8(b); } - __forceinline vboolf8 operator !=(const float& a, const vfloat8& b) { return vfloat8(a) != b; } - - __forceinline vboolf8 operator < (const vfloat8& a, const float& b) { return a < vfloat8(b); } - __forceinline vboolf8 operator < (const float& a, const vfloat8& b) { return vfloat8(a) < b; } - - __forceinline vboolf8 operator >=(const vfloat8& a, const float& b) { return a >= vfloat8(b); } - __forceinline vboolf8 operator >=(const float& a, const vfloat8& b) { return vfloat8(a) >= b; } - - 
__forceinline vboolf8 operator > (const vfloat8& a, const float& b) { return a > vfloat8(b); } - __forceinline vboolf8 operator > (const float& a, const vfloat8& b) { return vfloat8(a) > b; } - - __forceinline vboolf8 operator <=(const vfloat8& a, const float& b) { return a <= vfloat8(b); } - __forceinline vboolf8 operator <=(const float& a, const vfloat8& b) { return vfloat8(a) <= b; } - - __forceinline vboolf8 eq(const vfloat8& a, const vfloat8& b) { return a == b; } - __forceinline vboolf8 ne(const vfloat8& a, const vfloat8& b) { return a != b; } - __forceinline vboolf8 lt(const vfloat8& a, const vfloat8& b) { return a < b; } - __forceinline vboolf8 ge(const vfloat8& a, const vfloat8& b) { return a >= b; } - __forceinline vboolf8 gt(const vfloat8& a, const vfloat8& b) { return a > b; } - __forceinline vboolf8 le(const vfloat8& a, const vfloat8& b) { return a <= b; } - -#if defined(__AVX512VL__) - static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); } - static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); } - static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); } - static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); } - static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); } - static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); } -#else - static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a == b); } - static __forceinline vboolf8 
ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a != b); } - static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a < b); } - static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a >= b); } - static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a > b); } - static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a <= b); } -#endif - - __forceinline vfloat8 lerp(const vfloat8& a, const vfloat8& b, const vfloat8& t) { - return madd(t,b-a,a); - } - - __forceinline bool isvalid (const vfloat8& v) { - return all((v > vfloat8(-FLT_LARGE)) & (v < vfloat8(+FLT_LARGE))); - } - - __forceinline bool is_finite (const vfloat8& a) { - return all((a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX))); - } - - __forceinline bool is_finite (const vboolf8& valid, const vfloat8& a) { - return all(valid, (a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX))); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Rounding Functions - //////////////////////////////////////////////////////////////////////////////// - -#if !defined(__aarch64__) - __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF ); } - __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF ); } - __forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO ); } - __forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } -#else - __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_floor_ps(a); } - __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_ceil_ps(a); } -#endif - - - __forceinline vfloat8 frac (const vfloat8& a) { return a-floor(a); } - - 
//////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat8 unpacklo(const vfloat8& a, const vfloat8& b) { return _mm256_unpacklo_ps(a, b); } - __forceinline vfloat8 unpackhi(const vfloat8& a, const vfloat8& b) { return _mm256_unpackhi_ps(a, b); } - - template<int i> - __forceinline vfloat8 shuffle(const vfloat8& v) { - return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); - } - - template<int i0, int i1> - __forceinline vfloat8 shuffle4(const vfloat8& v) { - return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0)); - } - - template<int i0, int i1> - __forceinline vfloat8 shuffle4(const vfloat8& a, const vfloat8& b) { - return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0)); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vfloat8 shuffle(const vfloat8& v) { - return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vfloat8 shuffle(const vfloat8& a, const vfloat8& b) { - return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); - } - -#if !defined(__aarch64__) - template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v); } - template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v); } - template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } -#endif - - __forceinline vfloat8 broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); } - template<size_t i> __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a, b, i); } - template<size_t i> __forceinline vfloat4 extract4 (const vfloat8& a) { return _mm256_extractf128_ps(a, i); } - template<> __forceinline vfloat4 extract4<0>(const vfloat8& a) 
{ return _mm256_castps256_ps128(a); } - - __forceinline float toScalar(const vfloat8& v) { return _mm_cvtss_f32(_mm256_castps256_ps128(v)); } - - __forceinline vfloat8 assign(const vfloat4& a) { return _mm256_castps128_ps256(a); } - -#if defined (__AVX2__) && !defined(__aarch64__) - __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) { - return _mm256_permutevar8x32_ps(a, index); - } -#endif - -#if defined(__AVX512VL__) - template<int i> - static __forceinline vfloat8 align_shift_right(const vfloat8& a, const vfloat8& b) { - return _mm256_castsi256_ps(_mm256_alignr_epi32(_mm256_castps_si256(a), _mm256_castps_si256(b), i)); - } -#endif - -#if defined (__AVX_I__) - template<const int mode> - static __forceinline vint4 convert_to_hf16(const vfloat8& a) { - return _mm256_cvtps_ph(a, mode); - } - - static __forceinline vfloat8 convert_from_hf16(const vint4& a) { - return _mm256_cvtph_ps(a); - } -#endif - - __forceinline vfloat4 broadcast4f(const vfloat8& a, const size_t k) { - return vfloat4::broadcast(&a[k]); - } - - __forceinline vfloat8 broadcast8f(const vfloat8& a, const size_t k) { - return vfloat8::broadcast(&a[k]); - } - -#if defined(__AVX512VL__) - static __forceinline vfloat8 shift_right_1(const vfloat8& x) { - return align_shift_right<1>(zero,x); - } -#else - static __forceinline vfloat8 shift_right_1(const vfloat8& x) { - const vfloat8 t0 = shuffle<1,2,3,0>(x); - const vfloat8 t1 = shuffle4<1,0>(t0); - return _mm256_blend_ps(t0,t1,0x88); - } -#endif - - __forceinline vint8 floori(const vfloat8& a) { - return vint8(floor(a)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Transpose - //////////////////////////////////////////////////////////////////////////////// - - __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3) - { - vfloat8 l02 = unpacklo(r0,r2); - vfloat8 h02 = unpackhi(r0,r2); - 
vfloat8 l13 = unpacklo(r1,r3); - vfloat8 h13 = unpackhi(r1,r3); - c0 = unpacklo(l02,l13); - c1 = unpackhi(l02,l13); - c2 = unpacklo(h02,h13); - c3 = unpackhi(h02,h13); - } - - __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2) - { - vfloat8 l02 = unpacklo(r0,r2); - vfloat8 h02 = unpackhi(r0,r2); - vfloat8 l13 = unpacklo(r1,r3); - vfloat8 h13 = unpackhi(r1,r3); - c0 = unpacklo(l02,l13); - c1 = unpackhi(l02,l13); - c2 = unpacklo(h02,h13); - } - - __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7, - vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3, vfloat8& c4, vfloat8& c5, vfloat8& c6, vfloat8& c7) - { - vfloat8 h0,h1,h2,h3; transpose(r0,r1,r2,r3,h0,h1,h2,h3); - vfloat8 h4,h5,h6,h7; transpose(r4,r5,r6,r7,h4,h5,h6,h7); - c0 = shuffle4<0,2>(h0,h4); - c1 = shuffle4<0,2>(h1,h5); - c2 = shuffle4<0,2>(h2,h6); - c3 = shuffle4<0,2>(h3,h7); - c4 = shuffle4<1,3>(h0,h4); - c5 = shuffle4<1,3>(h1,h5); - c6 = shuffle4<1,3>(h2,h6); - c7 = shuffle4<1,3>(h3,h7); - } - - __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, - vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3) - { - transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2, c3); - } - - __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, - vfloat8& c0, vfloat8& c1, vfloat8& c2) - { - transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - 
//////////////////////////////////////////////////////////////////////////////// -#if !defined(__aarch64__) - __forceinline vfloat8 vreduce_min2(const vfloat8& v) { return min(v,shuffle<1,0,3,2>(v)); } - __forceinline vfloat8 vreduce_min4(const vfloat8& v) { vfloat8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } - __forceinline vfloat8 vreduce_min (const vfloat8& v) { vfloat8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } - - __forceinline vfloat8 vreduce_max2(const vfloat8& v) { return max(v,shuffle<1,0,3,2>(v)); } - __forceinline vfloat8 vreduce_max4(const vfloat8& v) { vfloat8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } - __forceinline vfloat8 vreduce_max (const vfloat8& v) { vfloat8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } - - __forceinline vfloat8 vreduce_add2(const vfloat8& v) { return v + shuffle<1,0,3,2>(v); } - __forceinline vfloat8 vreduce_add4(const vfloat8& v) { vfloat8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } - __forceinline vfloat8 vreduce_add (const vfloat8& v) { vfloat8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } - - __forceinline float reduce_min(const vfloat8& v) { return toScalar(vreduce_min(v)); } - __forceinline float reduce_max(const vfloat8& v) { return toScalar(vreduce_max(v)); } - __forceinline float reduce_add(const vfloat8& v) { return toScalar(vreduce_add(v)); } -#else - __forceinline float reduce_min(const vfloat8& v) { return vminvq_f32(_mm_min_ps(v.v.lo,v.v.hi)); } - __forceinline float reduce_max(const vfloat8& v) { return vmaxvq_f32(_mm_max_ps(v.v.lo,v.v.hi)); } - __forceinline vfloat8 vreduce_min(const vfloat8& v) { return vfloat8(reduce_min(v)); } - __forceinline vfloat8 vreduce_max(const vfloat8& v) { return vfloat8(reduce_max(v)); } - __forceinline float reduce_add(const vfloat8& v) { return vaddvq_f32(_mm_add_ps(v.v.lo,v.v.hi)); } - -#endif - __forceinline size_t select_min(const vboolf8& valid, const vfloat8& v) - { - const vfloat8 a = 
select(valid,v,vfloat8(pos_inf)); - const vbool8 valid_min = valid & (a == vreduce_min(a)); - return bsf(movemask(any(valid_min) ? valid_min : valid)); - } - - __forceinline size_t select_max(const vboolf8& valid, const vfloat8& v) - { - const vfloat8 a = select(valid,v,vfloat8(neg_inf)); - const vbool8 valid_max = valid & (a == vreduce_max(a)); - return bsf(movemask(any(valid_max) ? valid_max : valid)); - } - - - //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators (pairs of Vec3fa's) - //////////////////////////////////////////////////////////////////////////////// - - //__forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) { - // return vreduce_add4(a*b); - //} - - __forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) { - return _mm256_dp_ps(a,b,0x7F); - } - - __forceinline vfloat8 cross(const vfloat8& a, const vfloat8& b) - { - const vfloat8 a0 = a; - const vfloat8 b0 = shuffle<1,2,0,3>(b); - const vfloat8 a1 = shuffle<1,2,0,3>(a); - const vfloat8 b1 = b; - return shuffle<1,2,0,3>(msub(a0,b0,a1*b1)); - } - - //__forceinline float sqr_length (const vfloat<8>& a) { return dot(a,a); } - //__forceinline float rcp_length (const vfloat<8>& a) { return rsqrt(dot(a,a)); } - //__forceinline float rcp_length2(const vfloat<8>& a) { return rcp(dot(a,a)); } - //__forceinline float length (const vfloat<8>& a) { return sqrt(dot(a,a)); } - __forceinline vfloat<8> normalize(const vfloat<8>& a) { return a*rsqrt(dot(a,a)); } - //__forceinline float distance(const vfloat<8>& a, const vfloat<8>& b) { return length(a-b); } - //__forceinline float halfArea(const vfloat<8>& d) { return madd(d.x,(d.y+d.z),d.y*d.z); } - //__forceinline float area (const vfloat<8>& d) { return 2.0f*halfArea(d); } - //__forceinline vfloat<8> reflect(const vfloat<8>& V, const vfloat<8>& N) { return 2.0f*dot(V,N)*N-V; } - - //__forceinline vfloat<8> normalize_safe(const vfloat<8>& a) { - // const float d = dot(a,a); if (unlikely(d 
== 0.0f)) return a; else return a*rsqrt(d); - //} - - //////////////////////////////////////////////////////////////////////////////// - /// In Register Sorting - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat8 sort_ascending(const vfloat8& v) - { - const vfloat8 a0 = v; - const vfloat8 b0 = shuffle<1,0,3,2>(a0); - const vfloat8 c0 = min(a0,b0); - const vfloat8 d0 = max(a0,b0); - const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); - const vfloat8 b1 = shuffle<2,3,0,1>(a1); - const vfloat8 c1 = min(a1,b1); - const vfloat8 d1 = max(a1,b1); - const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); - const vfloat8 b2 = shuffle<1,0,3,2>(a2); - const vfloat8 c2 = min(a2,b2); - const vfloat8 d2 = max(a2,b2); - const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); - const vfloat8 b3 = shuffle4<1,0>(a3); - const vfloat8 c3 = min(a3,b3); - const vfloat8 d3 = max(a3,b3); - const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3); - const vfloat8 b4 = shuffle<2,3,0,1>(a4); - const vfloat8 c4 = min(a4,b4); - const vfloat8 d4 = max(a4,b4); - const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); - const vfloat8 b5 = shuffle<1,0,3,2>(a5); - const vfloat8 c5 = min(a5,b5); - const vfloat8 d5 = max(a5,b5); - const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); - return a6; - } - - __forceinline vfloat8 sort_descending(const vfloat8& v) - { - const vfloat8 a0 = v; - const vfloat8 b0 = shuffle<1,0,3,2>(a0); - const vfloat8 c0 = max(a0,b0); - const vfloat8 d0 = min(a0,b0); - const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); - const vfloat8 b1 = shuffle<2,3,0,1>(a1); - const vfloat8 c1 = max(a1,b1); - const vfloat8 d1 = min(a1,b1); - const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); - const vfloat8 b2 = shuffle<1,0,3,2>(a2); - const vfloat8 c2 = max(a2,b2); - const vfloat8 d2 = min(a2,b2); - const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); - const vfloat8 b3 = shuffle4<1,0>(a3); - const vfloat8 
c3 = max(a3,b3); - const vfloat8 d3 = min(a3,b3); - const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3); - const vfloat8 b4 = shuffle<2,3,0,1>(a4); - const vfloat8 c4 = max(a4,b4); - const vfloat8 d4 = min(a4,b4); - const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); - const vfloat8 b5 = shuffle<1,0,3,2>(a5); - const vfloat8 c5 = max(a5,b5); - const vfloat8 d5 = min(a5,b5); - const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); - return a6; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat8& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vint16_avx512.h b/thirdparty/embree-aarch64/common/simd/vint16_avx512.h deleted file mode 100644 index 3249bc2b45..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vint16_avx512.h +++ /dev/null @@ -1,490 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 16-wide AVX-512 integer type */ - template<> - struct vint<16> - { - ALIGNED_STRUCT_(64); - - typedef vboolf16 Bool; - typedef vint16 Int; - typedef vfloat16 Float; - - enum { size = 16 }; // number of SIMD elements - union { // data - __m512i v; - int i[16]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint() {} - __forceinline vint(const vint16& t) { v = t.v; } - __forceinline vint16& operator =(const vint16& f) { v = f.v; return *this; } - - __forceinline vint(const __m512i& t) { v = t; } - __forceinline 
operator __m512i() const { return v; } - __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } - - __forceinline vint(int i) { - v = _mm512_set1_epi32(i); - } - - __forceinline vint(int a, int b, int c, int d) { - v = _mm512_set4_epi32(d,c,b,a); - } - - __forceinline vint(int a0 , int a1 , int a2 , int a3, - int a4 , int a5 , int a6 , int a7, - int a8 , int a9 , int a10, int a11, - int a12, int a13, int a14, int a15) - { - v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0); - } - - __forceinline vint(const vint4& i) { - v = _mm512_broadcast_i32x4(i); - } - - __forceinline vint(const vint4& a, const vint4& b, const vint4& c, const vint4& d) { - v = _mm512_castsi128_si512(a); - v = _mm512_inserti32x4(v, b, 1); - v = _mm512_inserti32x4(v, c, 2); - v = _mm512_inserti32x4(v, d, 3); - } - - __forceinline vint(const vint8& i) { - v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i)))); - } - - __forceinline vint(const vint8& a, const vint8& b) { - v = _mm512_castsi256_si512(a); - v = _mm512_inserti64x4(v, b, 1); - } - - __forceinline explicit vint(const __m512& f) { - v = _mm512_cvtps_epi32(f); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint(ZeroTy) : v(_mm512_setzero_epi32()) {} - __forceinline vint(OneTy) : v(_mm512_set1_epi32(1)) {} - __forceinline vint(PosInfTy) : v(_mm512_set1_epi32(pos_inf)) {} - __forceinline vint(NegInfTy) : v(_mm512_set1_epi32(neg_inf)) {} - __forceinline vint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} - __forceinline vint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - 
//////////////////////////////////////////////////////////////////////////////// - - static __forceinline vint16 load (const void* addr) { return _mm512_load_si512((int*)addr); } - - static __forceinline vint16 load(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_load_si128((__m128i*)ptr)); } - static __forceinline vint16 load(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_load_si256((__m256i*)ptr)); } - - static __forceinline vint16 loadu(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } - static __forceinline vint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); } - - static __forceinline vint16 loadu(const void* addr) { return _mm512_loadu_si512(addr); } - - static __forceinline vint16 load (const vboolf16& mask, const void* addr) { return _mm512_mask_load_epi32 (_mm512_setzero_epi32(),mask,addr); } - static __forceinline vint16 loadu(const vboolf16& mask, const void* addr) { return _mm512_mask_loadu_epi32(_mm512_setzero_epi32(),mask,addr); } - - static __forceinline void store (void* ptr, const vint16& v) { _mm512_store_si512 (ptr,v); } - static __forceinline void storeu(void* ptr, const vint16& v) { _mm512_storeu_si512(ptr,v); } - - static __forceinline void store (const vboolf16& mask, void* addr, const vint16& v2) { _mm512_mask_store_epi32(addr,mask,v2); } - static __forceinline void storeu(const vboolf16& mask, void* ptr, const vint16& f) { _mm512_mask_storeu_epi32((int*)ptr,mask,f); } - - static __forceinline void store_nt(void* __restrict__ ptr, const vint16& a) { _mm512_stream_si512((__m512i*)ptr,a); } - - /* pass by value to avoid compiler generating inefficient code */ - static __forceinline void storeu_compact(const vboolf16 mask, void* addr, vint16 reg) { - _mm512_mask_compressstoreu_epi32(addr,mask,reg); - } - - static __forceinline void storeu_compact_single(const vboolf16 mask, void* addr, vint16 reg) { - 
//_mm512_mask_compressstoreu_epi32(addr,mask,reg); - *(float*)addr = mm512_cvtss_f32(_mm512_mask_compress_ps(_mm512_castsi512_ps(reg),mask,_mm512_castsi512_ps(reg))); - } - - static __forceinline vint16 compact64bit(const vboolf16& mask, vint16 &v) { - return _mm512_mask_compress_epi64(v,mask,v); - } - - static __forceinline vint16 compact(const vboolf16& mask, vint16 &v) { - return _mm512_mask_compress_epi32(v,mask,v); - } - - static __forceinline vint16 compact(const vboolf16& mask, const vint16 &a, vint16 &b) { - return _mm512_mask_compress_epi32(a,mask,b); - } - - static __forceinline vint16 expand(const vboolf16& mask, const vint16& a, vint16& b) { - return _mm512_mask_expand_epi32(b,mask,a); - } - - template<int scale = 4> - static __forceinline vint16 gather(const int* ptr, const vint16& index) { - return _mm512_i32gather_epi32(index,ptr,scale); - } - - template<int scale = 4> - static __forceinline vint16 gather(const vboolf16& mask, const int* ptr, const vint16& index) { - return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale); - } - - template<int scale = 4> - static __forceinline vint16 gather(const vboolf16& mask, vint16& dest, const int* ptr, const vint16& index) { - return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale); - } - - template<int scale = 4> - static __forceinline void scatter(int* ptr, const vint16& index, const vint16& v) { - _mm512_i32scatter_epi32((int*)ptr,index,v,scale); - } - - template<int scale = 4> - static __forceinline void scatter(const vboolf16& mask, int* ptr, const vint16& index, const vint16& v) { - _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale); - } - - static __forceinline vint16 broadcast64bit(size_t v) { - return _mm512_set1_epi64(v); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline int& operator [](size_t index) { 
assert(index < 16); return i[index]; } - __forceinline const int& operator [](size_t index) const { assert(index < 16); return i[index]; } - - __forceinline unsigned int uint (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; } - __forceinline size_t& uint64_t(size_t index) const { assert(index < 8); return ((size_t*)i)[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf16 asBool(const vint16& a) { return _mm512_movepi32_mask(a); } - - __forceinline vint16 operator +(const vint16& a) { return a; } - __forceinline vint16 operator -(const vint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint16 operator +(const vint16& a, const vint16& b) { return _mm512_add_epi32(a, b); } - __forceinline vint16 operator +(const vint16& a, int b) { return a + vint16(b); } - __forceinline vint16 operator +(int a, const vint16& b) { return vint16(a) + b; } - - __forceinline vint16 operator -(const vint16& a, const vint16& b) { return _mm512_sub_epi32(a, b); } - __forceinline vint16 operator -(const vint16& a, int b) { return a - vint16(b); } - __forceinline vint16 operator -(int a, const vint16& b) { return vint16(a) - b; } - - __forceinline vint16 operator *(const vint16& a, const vint16& b) { return _mm512_mullo_epi32(a, b); } - __forceinline vint16 operator *(const vint16& a, int b) { return a * vint16(b); } - __forceinline vint16 operator *(int a, const vint16& b) { return vint16(a) * b; } - - __forceinline vint16 operator &(const vint16& a, const vint16& b) { return _mm512_and_epi32(a, b); } - __forceinline vint16 operator &(const vint16& a, int b) { return a 
& vint16(b); } - __forceinline vint16 operator &(int a, const vint16& b) { return vint16(a) & b; } - - __forceinline vint16 operator |(const vint16& a, const vint16& b) { return _mm512_or_epi32(a, b); } - __forceinline vint16 operator |(const vint16& a, int b) { return a | vint16(b); } - __forceinline vint16 operator |(int a, const vint16& b) { return vint16(a) | b; } - - __forceinline vint16 operator ^(const vint16& a, const vint16& b) { return _mm512_xor_epi32(a, b); } - __forceinline vint16 operator ^(const vint16& a, int b) { return a ^ vint16(b); } - __forceinline vint16 operator ^(int a, const vint16& b) { return vint16(a) ^ b; } - - __forceinline vint16 operator <<(const vint16& a, int n) { return _mm512_slli_epi32(a, n); } - __forceinline vint16 operator >>(const vint16& a, int n) { return _mm512_srai_epi32(a, n); } - - __forceinline vint16 operator <<(const vint16& a, const vint16& n) { return _mm512_sllv_epi32(a, n); } - __forceinline vint16 operator >>(const vint16& a, const vint16& n) { return _mm512_srav_epi32(a, n); } - - __forceinline vint16 sll (const vint16& a, int b) { return _mm512_slli_epi32(a, b); } - __forceinline vint16 sra (const vint16& a, int b) { return _mm512_srai_epi32(a, b); } - __forceinline vint16 srl (const vint16& a, int b) { return _mm512_srli_epi32(a, b); } - - __forceinline vint16 min(const vint16& a, const vint16& b) { return _mm512_min_epi32(a, b); } - __forceinline vint16 min(const vint16& a, int b) { return min(a,vint16(b)); } - __forceinline vint16 min(int a, const vint16& b) { return min(vint16(a),b); } - - __forceinline vint16 max(const vint16& a, const vint16& b) { return _mm512_max_epi32(a, b); } - __forceinline vint16 max(const vint16& a, int b) { return max(a,vint16(b)); } - __forceinline vint16 max(int a, const vint16& b) { return max(vint16(a),b); } - - __forceinline vint16 umin(const vint16& a, const vint16& b) { return _mm512_min_epu32(a, b); } - __forceinline vint16 umax(const vint16& a, const vint16& b) { return 
_mm512_max_epu32(a, b); } - - __forceinline vint16 mask_add(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } - __forceinline vint16 mask_sub(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); } - - __forceinline vint16 mask_and(const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_and_epi32(c,m,a,b); } - __forceinline vint16 mask_or (const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_or_epi32(c,m,a,b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint16& operator +=(vint16& a, const vint16& b) { return a = a + b; } - __forceinline vint16& operator +=(vint16& a, int b) { return a = a + b; } - - __forceinline vint16& operator -=(vint16& a, const vint16& b) { return a = a - b; } - __forceinline vint16& operator -=(vint16& a, int b) { return a = a - b; } - - __forceinline vint16& operator *=(vint16& a, const vint16& b) { return a = a * b; } - __forceinline vint16& operator *=(vint16& a, int b) { return a = a * b; } - - __forceinline vint16& operator &=(vint16& a, const vint16& b) { return a = a & b; } - __forceinline vint16& operator &=(vint16& a, int b) { return a = a & b; } - - __forceinline vint16& operator |=(vint16& a, const vint16& b) { return a = a | b; } - __forceinline vint16& operator |=(vint16& a, int b) { return a = a | b; } - - __forceinline vint16& operator <<=(vint16& a, int b) { return a = a << b; } - __forceinline vint16& operator >>=(vint16& a, int b) { return a = a >> b; } - - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline 
vboolf16 operator ==(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 operator ==(const vint16& a, int b) { return a == vint16(b); } - __forceinline vboolf16 operator ==(int a, const vint16& b) { return vint16(a) == b; } - - __forceinline vboolf16 operator !=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 operator !=(const vint16& a, int b) { return a != vint16(b); } - __forceinline vboolf16 operator !=(int a, const vint16& b) { return vint16(a) != b; } - - __forceinline vboolf16 operator < (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 operator < (const vint16& a, int b) { return a < vint16(b); } - __forceinline vboolf16 operator < (int a, const vint16& b) { return vint16(a) < b; } - - __forceinline vboolf16 operator >=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 operator >=(const vint16& a, int b) { return a >= vint16(b); } - __forceinline vboolf16 operator >=(int a, const vint16& b) { return vint16(a) >= b; } - - __forceinline vboolf16 operator > (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 operator > (const vint16& a, int b) { return a > vint16(b); } - __forceinline vboolf16 operator > (int a, const vint16& b) { return vint16(a) > b; } - - __forceinline vboolf16 operator <=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } - __forceinline vboolf16 operator <=(const vint16& a, int b) { return a <= vint16(b); } - __forceinline vboolf16 operator <=(int a, const vint16& b) { return vint16(a) <= b; } - - __forceinline vboolf16 eq(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 ne(const vint16& a, const vint16& b) { return 
_mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 lt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 ge(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 gt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 le(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } - __forceinline vboolf16 uint_le(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } - __forceinline vboolf16 uint_gt(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } - - __forceinline vboolf16 eq(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 ne(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 lt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 ge(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LE); } - __forceinline vboolf16 uint_le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); } - __forceinline vboolf16 uint_gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); } - - - __forceinline vint16 select(const vboolf16& m, const vint16& t, const 
vint16& f) { - return _mm512_mask_or_epi32(f,m,t,t); - } - - __forceinline void xchg(const vboolf16& m, vint16& a, vint16& b) { - const vint16 c = a; a = select(m,b,a); b = select(m,c,b); - } - - __forceinline vboolf16 test(const vboolf16& m, const vint16& a, const vint16& b) { - return _mm512_mask_test_epi32_mask(m,a,b); - } - - __forceinline vboolf16 test(const vint16& a, const vint16& b) { - return _mm512_test_epi32_mask(a,b); - } - - //////////////////////////////////////////////////////////////////////////////// - // Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint16 unpacklo(const vint16& a, const vint16& b) { return _mm512_unpacklo_epi32(a, b); } - __forceinline vint16 unpackhi(const vint16& a, const vint16& b) { return _mm512_unpackhi_epi32(a, b); } - - template<int i> - __forceinline vint16 shuffle(const vint16& v) { - return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vint16 shuffle(const vint16& v) { - return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template<int i> - __forceinline vint16 shuffle4(const vint16& v) { - return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vint16 shuffle4(const vint16& v) { - return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template<int i> - __forceinline vint16 align_shift_right(const vint16& a, const vint16& b) { - return _mm512_alignr_epi32(a, b, i); - }; - - __forceinline int toScalar(const vint16& v) { - return _mm_cvtsi128_si32(_mm512_castsi512_si128(v)); - } - - template<int i> __forceinline vint16 insert4(const vint16& a, const vint4& b) { 
return _mm512_inserti32x4(a, b, i); } - - __forceinline size_t extract64bit(const vint16& v) { - return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); - } - - template<int N, int i> - vint<N> extractN(const vint16& v); - - template<> __forceinline vint4 extractN<4,0>(const vint16& v) { return _mm512_castsi512_si128(v); } - template<> __forceinline vint4 extractN<4,1>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 1); } - template<> __forceinline vint4 extractN<4,2>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 2); } - template<> __forceinline vint4 extractN<4,3>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 3); } - - template<> __forceinline vint8 extractN<8,0>(const vint16& v) { return _mm512_castsi512_si256(v); } - template<> __forceinline vint8 extractN<8,1>(const vint16& v) { return _mm512_extracti32x8_epi32(v, 1); } - - template<int i> __forceinline vint4 extract4 (const vint16& v) { return _mm512_extracti32x4_epi32(v, i); } - template<> __forceinline vint4 extract4<0>(const vint16& v) { return _mm512_castsi512_si128(v); } - - template<int i> __forceinline vint8 extract8 (const vint16& v) { return _mm512_extracti32x8_epi32(v, i); } - template<> __forceinline vint8 extract8<0>(const vint16& v) { return _mm512_castsi512_si256(v); } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint16 vreduce_min2(vint16 x) { return min(x, shuffle<1,0,3,2>(x)); } - __forceinline vint16 vreduce_min4(vint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } - __forceinline vint16 vreduce_min8(vint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } - __forceinline vint16 vreduce_min (vint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } - - __forceinline vint16 vreduce_max2(vint16 x) { return max(x, shuffle<1,0,3,2>(x)); } - __forceinline vint16 
vreduce_max4(vint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } - __forceinline vint16 vreduce_max8(vint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } - __forceinline vint16 vreduce_max (vint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } - - __forceinline vint16 vreduce_and2(vint16 x) { return x & shuffle<1,0,3,2>(x); } - __forceinline vint16 vreduce_and4(vint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } - __forceinline vint16 vreduce_and8(vint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); } - __forceinline vint16 vreduce_and (vint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); } - - __forceinline vint16 vreduce_or2(vint16 x) { return x | shuffle<1,0,3,2>(x); } - __forceinline vint16 vreduce_or4(vint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } - __forceinline vint16 vreduce_or8(vint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); } - __forceinline vint16 vreduce_or (vint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); } - - __forceinline vint16 vreduce_add2(vint16 x) { return x + shuffle<1,0,3,2>(x); } - __forceinline vint16 vreduce_add4(vint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } - __forceinline vint16 vreduce_add8(vint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } - __forceinline vint16 vreduce_add (vint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } - - __forceinline int reduce_min(const vint16& v) { return toScalar(vreduce_min(v)); } - __forceinline int reduce_max(const vint16& v) { return toScalar(vreduce_max(v)); } - __forceinline int reduce_and(const vint16& v) { return toScalar(vreduce_and(v)); } - __forceinline int reduce_or (const vint16& v) { return toScalar(vreduce_or (v)); } - __forceinline int reduce_add(const vint16& v) { return toScalar(vreduce_add(v)); } - - //////////////////////////////////////////////////////////////////////////////// - /// 
Memory load and store operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint16 conflict(const vint16& index) - { - return _mm512_conflict_epi32(index); - } - - __forceinline vint16 conflict(const vboolf16& mask, vint16& dest, const vint16& index) - { - return _mm512_mask_conflict_epi32(dest,mask,index); - } - - __forceinline vint16 convert_uint32_t(const __m512& f) { - return _mm512_cvtps_epu32(f); - } - - __forceinline vint16 permute(vint16 v, vint16 index) { - return _mm512_permutexvar_epi32(index,v); - } - - __forceinline vint16 reverse(const vint16 &a) { - return permute(a,vint16(reverse_step)); - } - - __forceinline vint16 prefix_sum(const vint16& a) - { - const vint16 z(zero); - vint16 v = a; - v = v + align_shift_right<16-1>(v,z); - v = v + align_shift_right<16-2>(v,z); - v = v + align_shift_right<16-4>(v,z); - v = v + align_shift_right<16-8>(v,z); - return v; - } - - __forceinline vint16 reverse_prefix_sum(const vint16& a) - { - const vint16 z(zero); - vint16 v = a; - v = v + align_shift_right<1>(z,v); - v = v + align_shift_right<2>(z,v); - v = v + align_shift_right<4>(z,v); - v = v + align_shift_right<8>(z,v); - return v; - } - - /* this should use a vbool8 and a vint8_64...*/ - template<int scale = 1, int hint = _MM_HINT_T0> - __forceinline void gather_prefetch64(const void* base_addr, const vbool16& mask, const vint16& offset) - { -#if defined(__AVX512PF__) - _mm512_mask_prefetch_i64gather_pd(offset, mask, base_addr, scale, hint); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vint16& v) - { - cout << "<" << v[0]; - for (int i=1; i<16; i++) cout << ", " << v[i]; - cout << ">"; - return cout; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vint4_sse2.h 
b/thirdparty/embree-aarch64/common/simd/vint4_sse2.h deleted file mode 100644 index 96f105a7c5..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vint4_sse2.h +++ /dev/null @@ -1,681 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../math/math.h" - -namespace embree -{ - /* 4-wide SSE integer type */ - template<> - struct vint<4> - { - ALIGNED_STRUCT_(16); - - typedef vboolf4 Bool; - typedef vint4 Int; - typedef vfloat4 Float; - - enum { size = 4 }; // number of SIMD elements - union { __m128i v; int i[4]; }; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint() {} - __forceinline vint(const vint4& a) { v = a.v; } - __forceinline vint4& operator =(const vint4& a) { v = a.v; return *this; } - - __forceinline vint(__m128i a) : v(a) {} - __forceinline operator const __m128i&() const { return v; } - __forceinline operator __m128i&() { return v; } - - __forceinline vint(int a) : v(_mm_set1_epi32(a)) {} - __forceinline vint(int a, int b, int c, int d) : v(_mm_set_epi32(d, c, b, a)) {} - - __forceinline explicit vint(__m128 a) : v(_mm_cvtps_epi32(a)) {} -#if defined(__AVX512VL__) - __forceinline explicit vint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} -#else - __forceinline explicit vint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {} -#endif - - __forceinline vint(long long a, long long b) : v(_mm_set_epi64x(b,a)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint(ZeroTy) : v(_mm_setzero_si128()) {} - __forceinline vint(OneTy) : v(_mm_set_epi32(1, 1, 1, 1)) {} - __forceinline vint(PosInfTy) : v(_mm_set_epi32(pos_inf, pos_inf, pos_inf, 
pos_inf)) {} - __forceinline vint(NegInfTy) : v(_mm_set_epi32(neg_inf, neg_inf, neg_inf, neg_inf)) {} - __forceinline vint(StepTy) : v(_mm_set_epi32(3, 2, 1, 0)) {} - __forceinline vint(ReverseStepTy) : v(_mm_set_epi32(0, 1, 2, 3)) {} - - __forceinline vint(TrueTy) { v = _mm_cmpeq_epi32(v,v); } - __forceinline vint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {} - - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline vint4 load (const void* a) { return _mm_load_si128((__m128i*)a); } - static __forceinline vint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); } - - static __forceinline void store (void* ptr, const vint4& v) { _mm_store_si128((__m128i*)ptr,v); } - static __forceinline void storeu(void* ptr, const vint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } - -#if defined(__AVX512VL__) - - static __forceinline vint4 compact(const vboolf4& mask, vint4 &v) { - return _mm_mask_compress_epi32(v, mask, v); - } - static __forceinline vint4 compact(const vboolf4& mask, vint4 &a, const vint4& b) { - return _mm_mask_compress_epi32(a, mask, b); - } - - static __forceinline vint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); } - static __forceinline vint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); } - - static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_store_epi32 (ptr,mask,v); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); } -#elif defined(__AVX__) - static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } - static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return 
_mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } - - static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } -#else - static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); } - static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); } - - static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { store (ptr,select(mask,i,load (ptr))); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } -#endif - - -#if defined(__aarch64__) - static __forceinline vint4 load(const uint8_t* ptr) { - return _mm_load4epu8_epi32(((__m128i*)ptr)); - } - static __forceinline vint4 loadu(const uint8_t* ptr) { - return _mm_load4epu8_epi32(((__m128i*)ptr)); - } -#elif defined(__SSE4_1__) - static __forceinline vint4 load(const uint8_t* ptr) { - return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); - } - - static __forceinline vint4 loadu(const uint8_t* ptr) { - return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); - } -#else - - static __forceinline vint4 load(const uint8_t* ptr) { - return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); - } - - static __forceinline vint4 loadu(const uint8_t* ptr) { - return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); - } - -#endif - - static __forceinline vint4 load(const unsigned short* ptr) { -#if defined(__aarch64__) - return __m128i(vmovl_u16(vld1_u16(ptr))); -#elif defined (__SSE4_1__) - return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); -#else - return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); -#endif - } - - static __forceinline void store(uint8_t* ptr, const vint4& v) { -#if 
defined(__aarch64__) - int32x4_t x = v; - uint16x4_t y = vqmovn_u32(uint32x4_t(x)); - uint8x8_t z = vqmovn_u16(vcombine_u16(y, y)); - vst1_lane_u32((uint32_t *)ptr,uint32x2_t(z), 0); -#elif defined(__SSE4_1__) - __m128i x = v; - x = _mm_packus_epi32(x, x); - x = _mm_packus_epi16(x, x); - *(int*)ptr = _mm_cvtsi128_si32(x); -#else - for (size_t i=0;i<4;i++) - ptr[i] = (uint8_t)v[i]; -#endif - } - - static __forceinline void store(unsigned short* ptr, const vint4& v) { -#if defined(__aarch64__) - uint32x4_t x = uint32x4_t(v.v); - uint16x4_t y = vqmovn_u32(x); - vst1_u16(ptr, y); -#else - for (size_t i=0;i<4;i++) - ptr[i] = (unsigned short)v[i]; -#endif - } - - static __forceinline vint4 load_nt(void* ptr) { -#if defined(__aarch64__) || defined(__SSE4_1__) - return _mm_stream_load_si128((__m128i*)ptr); -#else - return _mm_load_si128((__m128i*)ptr); -#endif - } - - static __forceinline void store_nt(void* ptr, const vint4& v) { -#if !defined(__aarch64__) && defined(__SSE4_1__) - _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); -#else - _mm_store_si128((__m128i*)ptr,v); -#endif - } - - template<int scale = 4> - static __forceinline vint4 gather(const int* ptr, const vint4& index) { -#if defined(__AVX2__) && !defined(__aarch64__) - return _mm_i32gather_epi32(ptr, index, scale); -#else - return vint4( - *(int*)(((int8_t*)ptr)+scale*index[0]), - *(int*)(((int8_t*)ptr)+scale*index[1]), - *(int*)(((int8_t*)ptr)+scale*index[2]), - *(int*)(((int8_t*)ptr)+scale*index[3])); -#endif - } - - template<int scale = 4> - static __forceinline vint4 gather(const vboolf4& mask, const int* ptr, const vint4& index) { - vint4 r = zero; -#if defined(__AVX512VL__) - return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); -#elif defined(__AVX2__) && !defined(__aarch64__) - return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale); -#else - if (likely(mask[0])) r[0] = *(int*)(((int8_t*)ptr)+scale*index[0]); - if (likely(mask[1])) r[1] = *(int*)(((int8_t*)ptr)+scale*index[1]); - if 
(likely(mask[2])) r[2] = *(int*)(((int8_t*)ptr)+scale*index[2]); - if (likely(mask[3])) r[3] = *(int*)(((int8_t*)ptr)+scale*index[3]); - return r; -#endif - } - - template<int scale = 4> - static __forceinline void scatter(void* ptr, const vint4& index, const vint4& v) - { -#if defined(__AVX512VL__) - _mm_i32scatter_epi32((int*)ptr, index, v, scale); -#else - *(int*)(((int8_t*)ptr)+scale*index[0]) = v[0]; - *(int*)(((int8_t*)ptr)+scale*index[1]) = v[1]; - *(int*)(((int8_t*)ptr)+scale*index[2]) = v[2]; - *(int*)(((int8_t*)ptr)+scale*index[3]) = v[3]; -#endif - } - - template<int scale = 4> - static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vint4& v) - { -#if defined(__AVX512VL__) - _mm_mask_i32scatter_epi32((int*)ptr, mask, index, v, scale); -#else - if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*index[0]) = v[0]; - if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*index[1]) = v[1]; - if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*index[2]) = v[2]; - if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*index[3]) = v[3]; -#endif - } - -#if defined(__x86_64__) || defined(__aarch64__) - static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const int& operator [](size_t index) const { assert(index < 4); return i[index]; } - __forceinline int& operator [](size_t index) { assert(index < 4); return i[index]; } - - friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) { -#if defined(__AVX512VL__) - return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t); -#elif defined(__aarch64__) - return _mm_castps_si128(_mm_blendv_ps((__m128)f.v,(__m128) t.v, (__m128)m.v)); -#elif defined(__SSE4_1__) - return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), 
_mm_castsi128_ps(t), m)); -#else - return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); -#endif - } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboolf4 asBool(const vint4& a) { return _mm_movepi32_mask(a); } -#else - __forceinline vboolf4 asBool(const vint4& a) { return _mm_castsi128_ps(a); } -#endif - - __forceinline vint4 operator +(const vint4& a) { return a; } - __forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } -#if defined(__aarch64__) - __forceinline vint4 abs(const vint4& a) { return vabsq_s32(a.v); } -#elif defined(__SSSE3__) - __forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint4 operator +(const vint4& a, const vint4& b) { return _mm_add_epi32(a, b); } - __forceinline vint4 operator +(const vint4& a, int b) { return a + vint4(b); } - __forceinline vint4 operator +(int a, const vint4& b) { return vint4(a) + b; } - - __forceinline vint4 operator -(const vint4& a, const vint4& b) { return _mm_sub_epi32(a, b); } - __forceinline vint4 operator -(const vint4& a, int b) { return a - vint4(b); } - __forceinline vint4 operator -(int a, const vint4& b) { return vint4(a) - b; } - -#if (defined(__aarch64__)) || defined(__SSE4_1__) - __forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); } -#else - __forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } -#endif - __forceinline vint4 operator *(const vint4& a, int b) { return a * vint4(b); } - __forceinline vint4 operator *(int a, 
const vint4& b) { return vint4(a) * b; } - - __forceinline vint4 operator &(const vint4& a, const vint4& b) { return _mm_and_si128(a, b); } - __forceinline vint4 operator &(const vint4& a, int b) { return a & vint4(b); } - __forceinline vint4 operator &(int a, const vint4& b) { return vint4(a) & b; } - - __forceinline vint4 operator |(const vint4& a, const vint4& b) { return _mm_or_si128(a, b); } - __forceinline vint4 operator |(const vint4& a, int b) { return a | vint4(b); } - __forceinline vint4 operator |(int a, const vint4& b) { return vint4(a) | b; } - - __forceinline vint4 operator ^(const vint4& a, const vint4& b) { return _mm_xor_si128(a, b); } - __forceinline vint4 operator ^(const vint4& a, int b) { return a ^ vint4(b); } - __forceinline vint4 operator ^(int a, const vint4& b) { return vint4(a) ^ b; } - - __forceinline vint4 operator <<(const vint4& a, const int n) { return _mm_slli_epi32(a, n); } - __forceinline vint4 operator >>(const vint4& a, const int n) { return _mm_srai_epi32(a, n); } - - __forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); } - __forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); } - __forceinline vint4 srl (const vint4& a, int b) { return _mm_srli_epi32(a, b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint4& operator +=(vint4& a, const vint4& b) { return a = a + b; } - __forceinline vint4& operator +=(vint4& a, int b) { return a = a + b; } - - __forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; } - __forceinline vint4& operator -=(vint4& a, int b) { return a = a - b; } - -#if (defined(__aarch64__)) || defined(__SSE4_1__) - __forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; } - __forceinline vint4& operator *=(vint4& a, int b) { return a = a * b; } -#endif 
- - __forceinline vint4& operator &=(vint4& a, const vint4& b) { return a = a & b; } - __forceinline vint4& operator &=(vint4& a, int b) { return a = a & b; } - - __forceinline vint4& operator |=(vint4& a, const vint4& b) { return a = a | b; } - __forceinline vint4& operator |=(vint4& a, int b) { return a = a | b; } - - __forceinline vint4& operator <<=(vint4& a, int b) { return a = a << b; } - __forceinline vint4& operator >>=(vint4& a, int b) { return a = a >> b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } -#else - __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } - __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return !(a == b); } - __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmplt_epi32(a, b)); } - __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return !(a < b); } - __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epi32(a, b)); } - __forceinline vboolf4 operator 
<=(const vint4& a, const vint4& b) { return !(a > b); } -#endif - - __forceinline vboolf4 operator ==(const vint4& a, int b) { return a == vint4(b); } - __forceinline vboolf4 operator ==(int a, const vint4& b) { return vint4(a) == b; } - - __forceinline vboolf4 operator !=(const vint4& a, int b) { return a != vint4(b); } - __forceinline vboolf4 operator !=(int a, const vint4& b) { return vint4(a) != b; } - - __forceinline vboolf4 operator < (const vint4& a, int b) { return a < vint4(b); } - __forceinline vboolf4 operator < (int a, const vint4& b) { return vint4(a) < b; } - - __forceinline vboolf4 operator >=(const vint4& a, int b) { return a >= vint4(b); } - __forceinline vboolf4 operator >=(int a, const vint4& b) { return vint4(a) >= b; } - - __forceinline vboolf4 operator > (const vint4& a, int b) { return a > vint4(b); } - __forceinline vboolf4 operator > (int a, const vint4& b) { return vint4(a) > b; } - - __forceinline vboolf4 operator <=(const vint4& a, int b) { return a <= vint4(b); } - __forceinline vboolf4 operator <=(int a, const vint4& b) { return vint4(a) <= b; } - - __forceinline vboolf4 eq(const vint4& a, const vint4& b) { return a == b; } - __forceinline vboolf4 ne(const vint4& a, const vint4& b) { return a != b; } - __forceinline vboolf4 lt(const vint4& a, const vint4& b) { return a < b; } - __forceinline vboolf4 ge(const vint4& a, const vint4& b) { return a >= b; } - __forceinline vboolf4 gt(const vint4& a, const vint4& b) { return a > b; } - __forceinline vboolf4 le(const vint4& a, const vint4& b) { return a <= b; } - -#if defined(__AVX512VL__) - __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); } - __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); } - __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, 
_MM_CMPINT_LT); } - __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); } - __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); } - __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); } -#else - __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a == b); } - __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a != b); } - __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a < b); } - __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a >= b); } - __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a > b); } - __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a <= b); } -#endif - - template<int mask> - __forceinline vint4 select(const vint4& t, const vint4& f) { -#if defined(__SSE4_1__) - return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); -#else - return select(vboolf4(mask), t, f); -#endif - } - - -#if defined(__aarch64__) || defined(__SSE4_1__) - __forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); } - __forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); } - - __forceinline vint4 umin(const vint4& a, const vint4& b) { return _mm_min_epu32(a, b); } - __forceinline vint4 umax(const vint4& a, const vint4& b) { return _mm_max_epu32(a, b); } - -#else - __forceinline vint4 min(const vint4& a, const vint4& b) { return select(a < b,a,b); } - __forceinline vint4 max(const vint4& a, const vint4& b) { return select(a < b,b,a); } -#endif - - __forceinline vint4 
min(const vint4& a, int b) { return min(a,vint4(b)); } - __forceinline vint4 min(int a, const vint4& b) { return min(vint4(a),b); } - __forceinline vint4 max(const vint4& a, int b) { return max(a,vint4(b)); } - __forceinline vint4 max(int a, const vint4& b) { return max(vint4(a),b); } - - //////////////////////////////////////////////////////////////////////////////// - // Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } - __forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } - -#if defined(__aarch64__) - template<int i0, int i1, int i2, int i3> - __forceinline vint4 shuffle(const vint4& v) { - return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); - } - template<int i0, int i1, int i2, int i3> - __forceinline vint4 shuffle(const vint4& a, const vint4& b) { - return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); - } -#else - template<int i0, int i1, int i2, int i3> - __forceinline vint4 shuffle(const vint4& v) { - return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); - } - template<int i0, int i1, int i2, int i3> - __forceinline vint4 shuffle(const vint4& a, const vint4& b) { - return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); - } -#endif -#if defined(__SSE3__) - template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } - template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } - template<> __forceinline vint4 shuffle<0, 1, 0, 1>(const 
vint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); } -#endif - - template<int i> - __forceinline vint4 shuffle(const vint4& v) { - return shuffle<i,i,i,i>(v); - } - -#if defined(__aarch64__) - template<int src> __forceinline int extract(const vint4& b); - template<int dst> __forceinline vint4 insert(const vint4& a, const int b); -#elif defined(__SSE4_1__) - template<int src> __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); } - template<int dst> __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); } -#else - template<int src> __forceinline int extract(const vint4& b) { return b[src&3]; } - template<int dst> __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; } -#endif - -#if defined(__aarch64__) - template<> __forceinline int extract<0>(const vint4& b) { - return b.v[0]; - } - template<> __forceinline int extract<1>(const vint4& b) { - return b.v[1]; - } - template<> __forceinline int extract<2>(const vint4& b) { - return b.v[2]; - } - template<> __forceinline int extract<3>(const vint4& b) { - return b.v[3]; - } - template<> __forceinline vint4 insert<0>(const vint4& a, int b) - { - vint4 c = a; - c[0] = b; - return c; - } - template<> __forceinline vint4 insert<1>(const vint4& a, int b) - { - vint4 c = a; - c[1] = b; - return c; - } - template<> __forceinline vint4 insert<2>(const vint4& a, int b) - { - vint4 c = a; - c[2] = b; - return c; - } - template<> __forceinline vint4 insert<3>(const vint4& a, int b) - { - vint4 c = a; - c[3] = b; - return c; - } - - __forceinline int toScalar(const vint4& v) { - return v[0]; - } - - __forceinline size_t toSizeT(const vint4& v) { - uint64x2_t x = uint64x2_t(v.v); - return x[0]; - } -#else - template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); } - - __forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); } - - __forceinline size_t toSizeT(const 
vint4& v) { -#if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround - return toScalar(v); -#elif defined(__ARM_NEON) - // FIXME(LTE): Do we need a swap(i.e. use lane 1)? - return vgetq_lane_u64(*(reinterpret_cast<const uint64x2_t *>(&v)), 0); -#else - return _mm_cvtsi128_si64(v); -#endif - } -#endif - -#if defined(__AVX512VL__) - - __forceinline vint4 permute(const vint4 &a, const vint4 &index) { - return _mm_castps_si128(_mm_permutevar_ps(_mm_castsi128_ps(a),index)); - } - - template<int i> - __forceinline vint4 align_shift_right(const vint4& a, const vint4& b) { - return _mm_alignr_epi32(a, b, i); - } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) || defined(__SSE4_1__) - -#if defined(__aarch64__) - __forceinline vint4 vreduce_min(const vint4& v) { int h = vminvq_s32(v); return vdupq_n_s32(h); } - __forceinline vint4 vreduce_max(const vint4& v) { int h = vmaxvq_s32(v); return vdupq_n_s32(h); } - __forceinline vint4 vreduce_add(const vint4& v) { int h = vaddvq_s32(v); return vdupq_n_s32(h); } - - __forceinline int reduce_min(const vint4& v) { return vminvq_s32(v); } - __forceinline int reduce_max(const vint4& v) { return vmaxvq_s32(v); } - __forceinline int reduce_add(const vint4& v) { return vaddvq_s32(v); } -#else - __forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } - __forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } - __forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } - - __forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); } - __forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); } - __forceinline int 
reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); } -#endif - - __forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); } - __forceinline size_t select_max(const vint4& v) { return bsf(movemask(v == vreduce_max(v))); } - - __forceinline size_t select_min(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } - __forceinline size_t select_max(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } - -#else - - __forceinline int reduce_min(const vint4& v) { return min(v[0],v[1],v[2],v[3]); } - __forceinline int reduce_max(const vint4& v) { return max(v[0],v[1],v[2],v[3]); } - __forceinline int reduce_add(const vint4& v) { return v[0]+v[1]+v[2]+v[3]; } - -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Sorting networks - //////////////////////////////////////////////////////////////////////////////// - -#if (defined(__aarch64__)) || defined(__SSE4_1__) - - __forceinline vint4 usort_ascending(const vint4& v) - { - const vint4 a0 = v; - const vint4 b0 = shuffle<1,0,3,2>(a0); - const vint4 c0 = umin(a0,b0); - const vint4 d0 = umax(a0,b0); - const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); - const vint4 b1 = shuffle<2,3,0,1>(a1); - const vint4 c1 = umin(a1,b1); - const vint4 d1 = umax(a1,b1); - const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); - const vint4 b2 = shuffle<0,2,1,3>(a2); - const vint4 c2 = umin(a2,b2); - const vint4 d2 = umax(a2,b2); - const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); - return a3; - } - - __forceinline vint4 usort_descending(const vint4& v) - { - const vint4 a0 = v; - const vint4 b0 = shuffle<1,0,3,2>(a0); - const vint4 c0 = umax(a0,b0); - const vint4 d0 = umin(a0,b0); - const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); - const vint4 b1 = shuffle<2,3,0,1>(a1); - const 
vint4 c1 = umax(a1,b1); - const vint4 d1 = umin(a1,b1); - const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); - const vint4 b2 = shuffle<0,2,1,3>(a2); - const vint4 c2 = umax(a2,b2); - const vint4 d2 = umin(a2,b2); - const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); - return a3; - } - -#else - - __forceinline vint4 usort_ascending(const vint4& v) - { - const vint4 a0 = v-vint4(0x80000000); - const vint4 b0 = shuffle<1,0,3,2>(a0); - const vint4 c0 = min(a0,b0); - const vint4 d0 = max(a0,b0); - const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); - const vint4 b1 = shuffle<2,3,0,1>(a1); - const vint4 c1 = min(a1,b1); - const vint4 d1 = max(a1,b1); - const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); - const vint4 b2 = shuffle<0,2,1,3>(a2); - const vint4 c2 = min(a2,b2); - const vint4 d2 = max(a2,b2); - const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); - return a3+vint4(0x80000000); - } - - __forceinline vint4 usort_descending(const vint4& v) - { - const vint4 a0 = v-vint4(0x80000000); - const vint4 b0 = shuffle<1,0,3,2>(a0); - const vint4 c0 = max(a0,b0); - const vint4 d0 = min(a0,b0); - const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); - const vint4 b1 = shuffle<2,3,0,1>(a1); - const vint4 c1 = max(a1,b1); - const vint4 d1 = min(a1,b1); - const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); - const vint4 b2 = shuffle<0,2,1,3>(a2); - const vint4 c2 = max(a2,b2); - const vint4 d2 = min(a2,b2); - const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); - return a3+vint4(0x80000000); - } - -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vint4& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; - } -} - diff --git a/thirdparty/embree-aarch64/common/simd/vint8_avx.h b/thirdparty/embree-aarch64/common/simd/vint8_avx.h deleted file 
mode 100644 index 25a771284d..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vint8_avx.h +++ /dev/null @@ -1,464 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX integer type */ - template<> - struct vint<8> - { - ALIGNED_STRUCT_(32); - - typedef vboolf8 Bool; - typedef vint8 Int; - typedef vfloat8 Float; - - enum { size = 8 }; // number of SIMD elements - union { // data - __m256i v; - struct { __m128i vl,vh; }; - int i[8]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint() {} - __forceinline vint(const vint8& a) { v = a.v; } - __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; } - - __forceinline vint(__m256i a) : v(a) {} - __forceinline operator const __m256i&() const { return v; } - __forceinline operator __m256i&() { return v; } - - __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} - __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} - __forceinline vint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {} - - __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} - __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {} - __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} - __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} - __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int vh) : v(_mm256_set_epi32(vh, g, f, e, d, c, b, a)) {} - - __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} - - //////////////////////////////////////////////////////////////////////////////// 
- /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint(ZeroTy) : v(_mm256_setzero_si256()) {} - __forceinline vint(OneTy) : v(_mm256_set_epi32(1,1,1,1,1,1,1,1)) {} - __forceinline vint(PosInfTy) : v(_mm256_set_epi32(pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf)) {} - __forceinline vint(NegInfTy) : v(_mm256_set_epi32(neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf)) {} - __forceinline vint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} - __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {} - __forceinline vint(UndefinedTy) : v(_mm256_undefined_si256()) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline vint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); } - static __forceinline vint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); } - - static __forceinline vint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } - static __forceinline vint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } - - static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } - static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } - -#if !defined(__aarch64__) - static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } -#else - static 
__forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } -#endif - - static __forceinline void store_nt(void* ptr, const vint8& v) { - _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); - } - - static __forceinline vint8 load(const uint8_t* ptr) { - vint4 il = vint4::load(ptr+0); - vint4 ih = vint4::load(ptr+4); - return vint8(il,ih); - } - - static __forceinline vint8 loadu(const uint8_t* ptr) { - vint4 il = vint4::loadu(ptr+0); - vint4 ih = vint4::loadu(ptr+4); - return vint8(il,ih); - } - - static __forceinline vint8 load(const unsigned short* ptr) { - vint4 il = vint4::load(ptr+0); - vint4 ih = vint4::load(ptr+4); - return vint8(il,ih); - } - - static __forceinline vint8 loadu(const unsigned short* ptr) { - vint4 il = vint4::loadu(ptr+0); - vint4 ih = vint4::loadu(ptr+4); - return vint8(il,ih); - } - - static __forceinline void store(uint8_t* ptr, const vint8& i) { - vint4 il(i.vl); - vint4 ih(i.vh); - vint4::store(ptr + 0,il); - vint4::store(ptr + 4,ih); - } - - static __forceinline void store(unsigned short* ptr, const vint8& v) { - for (size_t i=0;i<8;i++) - ptr[i] = (unsigned short)v[i]; - } - - template<int scale = 4> - static __forceinline vint8 gather(const int* ptr, const vint8& index) { - return vint8( - *(int*)(((int8_t*)ptr)+scale*index[0]), - *(int*)(((int8_t*)ptr)+scale*index[1]), - *(int*)(((int8_t*)ptr)+scale*index[2]), - *(int*)(((int8_t*)ptr)+scale*index[3]), - *(int*)(((int8_t*)ptr)+scale*index[4]), - *(int*)(((int8_t*)ptr)+scale*index[5]), - *(int*)(((int8_t*)ptr)+scale*index[6]), - *(int*)(((int8_t*)ptr)+scale*index[7])); - } - - template<int scale = 4> - static __forceinline vint8 gather(const vboolf8& mask, const int* ptr, const vint8& index) { - vint8 r = zero; - if (likely(mask[0])) 
r[0] = *(int*)(((int8_t*)ptr)+scale*index[0]); - if (likely(mask[1])) r[1] = *(int*)(((int8_t*)ptr)+scale*index[1]); - if (likely(mask[2])) r[2] = *(int*)(((int8_t*)ptr)+scale*index[2]); - if (likely(mask[3])) r[3] = *(int*)(((int8_t*)ptr)+scale*index[3]); - if (likely(mask[4])) r[4] = *(int*)(((int8_t*)ptr)+scale*index[4]); - if (likely(mask[5])) r[5] = *(int*)(((int8_t*)ptr)+scale*index[5]); - if (likely(mask[6])) r[6] = *(int*)(((int8_t*)ptr)+scale*index[6]); - if (likely(mask[7])) r[7] = *(int*)(((int8_t*)ptr)+scale*index[7]); - return r; - } - - template<int scale = 4> - static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v) - { - *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; - *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; - *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; - *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; - *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; - *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; - *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; - *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; - } - - template<int scale = 4> - static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v) - { - if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; - if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; - if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; - if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; - if (likely(mask[4])) *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; - if (likely(mask[5])) *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; - if (likely(mask[6])) *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; - if (likely(mask[7])) *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; - } - - - static __forceinline vint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - 
//////////////////////////////////////////////////////////////////////////////// - - __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; } - __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } - - __forceinline vint8 operator +(const vint8& a) { return a; } - __forceinline vint8 operator -(const vint8& a) { return vint8(_mm_sub_epi32(_mm_setzero_si128(), a.vl), _mm_sub_epi32(_mm_setzero_si128(), a.vh)); } - __forceinline vint8 abs (const vint8& a) { return vint8(_mm_abs_epi32(a.vl), _mm_abs_epi32(a.vh)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8 operator +(const vint8& a, const vint8& b) { return vint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); } - __forceinline vint8 operator +(const vint8& a, int b) { return a + vint8(b); } - __forceinline vint8 operator +(int a, const vint8& b) { return vint8(a) + b; } - - __forceinline vint8 operator -(const vint8& a, const vint8& b) { return vint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); } - __forceinline vint8 operator -(const vint8& a, int b) { return a - vint8(b); } - __forceinline vint8 operator -(int a, const vint8& b) { return vint8(a) - b; } - - __forceinline vint8 operator *(const vint8& a, const vint8& b) { return vint8(_mm_mullo_epi32(a.vl, b.vl), _mm_mullo_epi32(a.vh, b.vh)); } - __forceinline vint8 operator *(const vint8& a, int b) { return a * vint8(b); } - __forceinline vint8 operator *(int a, const vint8& b) { return vint8(a) * b; } - - __forceinline vint8 operator 
&(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - __forceinline vint8 operator &(const vint8& a, int b) { return a & vint8(b); } - __forceinline vint8 operator &(int a, const vint8& b) { return vint8(a) & b; } - - __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - __forceinline vint8 operator |(const vint8& a, int b) { return a | vint8(b); } - __forceinline vint8 operator |(int a, const vint8& b) { return vint8(a) | b; } - - __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - __forceinline vint8 operator ^(const vint8& a, int b) { return a ^ vint8(b); } - __forceinline vint8 operator ^(int a, const vint8& b) { return vint8(a) ^ b; } - - __forceinline vint8 operator <<(const vint8& a, int n) { return vint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); } - __forceinline vint8 operator >>(const vint8& a, int n) { return vint8(_mm_srai_epi32(a.vl, n), _mm_srai_epi32(a.vh, n)); } - - __forceinline vint8 sll (const vint8& a, int b) { return vint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); } - __forceinline vint8 sra (const vint8& a, int b) { return vint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); } - __forceinline vint8 srl (const vint8& a, int b) { return vint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); } - - __forceinline vint8 min(const vint8& a, const vint8& b) { return vint8(_mm_min_epi32(a.vl, b.vl), _mm_min_epi32(a.vh, b.vh)); } - __forceinline vint8 min(const vint8& a, int b) { return min(a,vint8(b)); } - __forceinline vint8 min(int a, const vint8& b) { return min(vint8(a),b); } - - __forceinline vint8 max(const vint8& a, const vint8& b) { return vint8(_mm_max_epi32(a.vl, b.vl), _mm_max_epi32(a.vh, b.vh)); } - __forceinline vint8 max(const 
vint8& a, int b) { return max(a,vint8(b)); } - __forceinline vint8 max(int a, const vint8& b) { return max(vint8(a),b); } - - __forceinline vint8 umin(const vint8& a, const vint8& b) { return vint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); } - __forceinline vint8 umin(const vint8& a, int b) { return umin(a,vint8(b)); } - __forceinline vint8 umin(int a, const vint8& b) { return umin(vint8(a),b); } - - __forceinline vint8 umax(const vint8& a, const vint8& b) { return vint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); } - __forceinline vint8 umax(const vint8& a, int b) { return umax(a,vint8(b)); } - __forceinline vint8 umax(int a, const vint8& b) { return umax(vint8(a),b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8& operator +=(vint8& a, const vint8& b) { return a = a + b; } - __forceinline vint8& operator +=(vint8& a, int b) { return a = a + b; } - - __forceinline vint8& operator -=(vint8& a, const vint8& b) { return a = a - b; } - __forceinline vint8& operator -=(vint8& a, int b) { return a = a - b; } - - __forceinline vint8& operator *=(vint8& a, const vint8& b) { return a = a * b; } - __forceinline vint8& operator *=(vint8& a, int b) { return a = a * b; } - - __forceinline vint8& operator &=(vint8& a, const vint8& b) { return a = a & b; } - __forceinline vint8& operator &=(vint8& a, int b) { return a = a & b; } - - __forceinline vint8& operator |=(vint8& a, const vint8& b) { return a = a | b; } - __forceinline vint8& operator |=(vint8& a, int b) { return a = a | b; } - - __forceinline vint8& operator <<=(vint8& a, int b) { return a = a << b; } - __forceinline vint8& operator >>=(vint8& a, int b) { return a = a >> b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - 
//////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)), - _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); } - __forceinline vboolf8 operator ==(const vint8& a, int b) { return a == vint8(b); } - __forceinline vboolf8 operator ==(int a, const vint8& b) { return vint8(a) == b; } - - __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); } - __forceinline vboolf8 operator !=(const vint8& a, int b) { return a != vint8(b); } - __forceinline vboolf8 operator !=(int a, const vint8& b) { return vint8(a) != b; } - - __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epi32 (a.vl, b.vl)), - _mm_castsi128_ps(_mm_cmplt_epi32 (a.vh, b.vh))); } - __forceinline vboolf8 operator < (const vint8& a, int b) { return a < vint8(b); } - __forceinline vboolf8 operator < (int a, const vint8& b) { return vint8(a) < b; } - - __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a < b); } - __forceinline vboolf8 operator >=(const vint8& a, int b) { return a >= vint8(b); } - __forceinline vboolf8 operator >=(int a, const vint8& b) { return vint8(a) >= b; } - - __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epi32 (a.vl, b.vl)), - _mm_castsi128_ps(_mm_cmpgt_epi32 (a.vh, b.vh))); } - __forceinline vboolf8 operator > (const vint8& a, int b) { return a > vint8(b); } - __forceinline vboolf8 operator > (int a, const vint8& b) { return vint8(a) > b; } - - __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a > b); } - __forceinline vboolf8 operator <=(const vint8& a, int b) { return a <= vint8(b); } - __forceinline vboolf8 operator <=(int a, const vint8& b) { return vint8(a) <= b; } - - __forceinline vboolf8 eq(const vint8& a, const vint8& b) { 
return a == b; } - __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; } - __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a < b; } - __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; } - __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a > b; } - __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; } - - __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); } - __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); } - __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a < b); } - __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); } - __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a > b); } - __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); } - - __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { - return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); - } - - __forceinline vint8 notand(const vboolf8& m, const vint8& f) { - return _mm256_castps_si256(_mm256_andnot_ps(m, _mm256_castsi256_ps(f))); - } - - - //////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - - template<int i> - __forceinline vint8 shuffle(const vint8& v) { - 
return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); - } - - template<int i0, int i1> - __forceinline vint8 shuffle4(const vint8& v) { - return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); - } - - template<int i0, int i1> - __forceinline vint8 shuffle4(const vint8& a, const vint8& b) { - return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vint8 shuffle(const vint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vint8 shuffle(const vint8& a, const vint8& b) { - return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } - - __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } - template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); } - template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); } - template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); } - - __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } - - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - 
//////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8 vreduce_min2(const vint8& v) { return min(v,shuffle<1,0,3,2>(v)); } - __forceinline vint8 vreduce_min4(const vint8& v) { vint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } - __forceinline vint8 vreduce_min (const vint8& v) { vint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } - - __forceinline vint8 vreduce_max2(const vint8& v) { return max(v,shuffle<1,0,3,2>(v)); } - __forceinline vint8 vreduce_max4(const vint8& v) { vint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } - __forceinline vint8 vreduce_max (const vint8& v) { vint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } - - __forceinline vint8 vreduce_add2(const vint8& v) { return v + shuffle<1,0,3,2>(v); } - __forceinline vint8 vreduce_add4(const vint8& v) { vint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } - __forceinline vint8 vreduce_add (const vint8& v) { vint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } - - __forceinline int reduce_min(const vint8& v) { return toScalar(vreduce_min(v)); } - __forceinline int reduce_max(const vint8& v) { return toScalar(vreduce_max(v)); } - __forceinline int reduce_add(const vint8& v) { return toScalar(vreduce_add(v)); } - - __forceinline size_t select_min(const vint8& v) { return bsf(movemask(v == vreduce_min(v))); } - __forceinline size_t select_max(const vint8& v) { return bsf(movemask(v == vreduce_max(v))); } - - __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } - __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } - - //////////////////////////////////////////////////////////////////////////////// - /// Sorting networks - 
//////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8 usort_ascending(const vint8& v) - { - const vint8 a0 = v; - const vint8 b0 = shuffle<1,0,3,2>(a0); - const vint8 c0 = umin(a0,b0); - const vint8 d0 = umax(a0,b0); - const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0); - const vint8 b1 = shuffle<2,3,0,1>(a1); - const vint8 c1 = umin(a1,b1); - const vint8 d1 = umax(a1,b1); - const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1); - const vint8 b2 = shuffle<1,0,3,2>(a2); - const vint8 c2 = umin(a2,b2); - const vint8 d2 = umax(a2,b2); - const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2); - const vint8 b3 = shuffle4<1,0>(a3); - const vint8 c3 = umin(a3,b3); - const vint8 d3 = umax(a3,b3); - const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3); - const vint8 b4 = shuffle<2,3,0,1>(a4); - const vint8 c4 = umin(a4,b4); - const vint8 d4 = umax(a4,b4); - const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4); - const vint8 b5 = shuffle<1,0,3,2>(a5); - const vint8 c5 = umin(a5,b5); - const vint8 d5 = umax(a5,b5); - const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5); - return a6; - } - - __forceinline vint8 usort_descending(const vint8& v) - { - const vint8 a0 = v; - const vint8 b0 = shuffle<1,0,3,2>(a0); - const vint8 c0 = umax(a0,b0); - const vint8 d0 = umin(a0,b0); - const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0); - const vint8 b1 = shuffle<2,3,0,1>(a1); - const vint8 c1 = umax(a1,b1); - const vint8 d1 = umin(a1,b1); - const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1); - const vint8 b2 = shuffle<1,0,3,2>(a2); - const vint8 c2 = umax(a2,b2); - const vint8 d2 = umin(a2,b2); - const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2); - const vint8 b3 = shuffle4<1,0>(a3); - const vint8 c3 = umax(a3,b3); - const vint8 d3 = umin(a3,b3); - const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3); - const vint8 b4 = shuffle<2,3,0,1>(a4); - const vint8 c4 = umax(a4,b4); - const vint8 d4 = umin(a4,b4); - const vint8 a5 = 
select(0x33 /* 0b00110011 */,c4,d4); - const vint8 b5 = shuffle<1,0,3,2>(a5); - const vint8 c5 = umax(a5,b5); - const vint8 d5 = umin(a5,b5); - const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5); - return a6; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vint8_avx2.h b/thirdparty/embree-aarch64/common/simd/vint8_avx2.h deleted file mode 100644 index 4937d972cf..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vint8_avx2.h +++ /dev/null @@ -1,512 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX integer type */ - template<> - struct vint<8> - { - ALIGNED_STRUCT_(32); - - typedef vboolf8 Bool; - typedef vint8 Int; - typedef vfloat8 Float; - - enum { size = 8 }; // number of SIMD elements - union { // data - __m256i v; - int i[8]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint() {} - __forceinline vint(const vint8& a) { v = a.v; } - __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; } - - __forceinline vint(__m256i a) : v(a) {} - __forceinline operator const __m256i&() const { return v; } - __forceinline operator __m256i&() { return v; } - - __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} - __forceinline vint(const vint4& a, const vint4& b) : 
v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} - __forceinline vint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} - - __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} - __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {} - __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} - __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} - __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} - - __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} - -#if defined(__AVX512VL__) - __forceinline explicit vint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {} -#else - __forceinline explicit vint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {} -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint(ZeroTy) : v(_mm256_setzero_si256()) {} - __forceinline vint(OneTy) : v(_mm256_set1_epi32(1)) {} - __forceinline vint(PosInfTy) : v(_mm256_set1_epi32(pos_inf)) {} - __forceinline vint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {} - __forceinline vint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} - __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {} - __forceinline vint(UndefinedTy) : v(_mm256_undefined_si256()) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline vint8 load(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } - static __forceinline vint8 loadu(const uint8_t* ptr) { return 
_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } - static __forceinline vint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } - static __forceinline vint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } - - static __forceinline vint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); } - static __forceinline vint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); } - - static __forceinline void store (void* ptr, const vint8& v) { _mm256_store_si256((__m256i*)ptr,v); } - static __forceinline void storeu(void* ptr, const vint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); } - -#if defined(__AVX512VL__) - - static __forceinline vint8 compact(const vboolf8& mask, vint8 &v) { - return _mm256_mask_compress_epi32(v, mask, v); - } - static __forceinline vint8 compact(const vboolf8& mask, vint8 &a, const vint8& b) { - return _mm256_mask_compress_epi32(a, mask, b); - } - - static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); } - static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); } - - static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); } -#else - static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } - static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } - - static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } - static 
__forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } -#endif - - static __forceinline vint8 load_nt(void* ptr) { - return _mm256_stream_load_si256((__m256i*)ptr); - } - - static __forceinline void store_nt(void* ptr, const vint8& v) { - _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); - } - - static __forceinline void store(uint8_t* ptr, const vint8& i) - { - for (size_t j=0; j<8; j++) - ptr[j] = i[j]; - } - - static __forceinline void store(unsigned short* ptr, const vint8& v) { - for (size_t i=0;i<8;i++) - ptr[i] = (unsigned short)v[i]; - } - - template<int scale = 4> - static __forceinline vint8 gather(const int *const ptr, const vint8& index) { - return _mm256_i32gather_epi32(ptr, index, scale); - } - - template<int scale = 4> - static __forceinline vint8 gather(const vboolf8& mask, const int *const ptr, const vint8& index) { - vint8 r = zero; -#if defined(__AVX512VL__) - return _mm256_mmask_i32gather_epi32(r, mask, index, ptr, scale); -#else - return _mm256_mask_i32gather_epi32(r, ptr, index, mask, scale); -#endif - } - - template<int scale = 4> - static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v) - { -#if defined(__AVX512VL__) - _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale); -#else - *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; - *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; - *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; - *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; - *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; - *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; - *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; - *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; -#endif - } - - template<int scale = 4> - static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v) - { -#if defined(__AVX512VL__) - _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale); -#else - if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; 
- if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; - if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; - if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; - if (likely(mask[4])) *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; - if (likely(mask[5])) *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; - if (likely(mask[6])) *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; - if (likely(mask[7])) *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; -#endif - } - - static __forceinline vint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; } - __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_movepi32_mask(a); } -#else - static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } -#endif - - __forceinline vint8 operator +(const vint8& a) { return a; } - __forceinline vint8 operator -(const vint8& a) { return _mm256_sub_epi32(_mm256_setzero_si256(), a); } - __forceinline vint8 abs (const vint8& a) { return _mm256_abs_epi32(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8 operator +(const vint8& a, const vint8& b) { return _mm256_add_epi32(a, b); } - __forceinline vint8 operator +(const vint8& a, int b) { return a + vint8(b); } - 
__forceinline vint8 operator +(int a, const vint8& b) { return vint8(a) + b; } - - __forceinline vint8 operator -(const vint8& a, const vint8& b) { return _mm256_sub_epi32(a, b); } - __forceinline vint8 operator -(const vint8& a, int b) { return a - vint8(b); } - __forceinline vint8 operator -(int a, const vint8& b) { return vint8(a) - b; } - - __forceinline vint8 operator *(const vint8& a, const vint8& b) { return _mm256_mullo_epi32(a, b); } - __forceinline vint8 operator *(const vint8& a, int b) { return a * vint8(b); } - __forceinline vint8 operator *(int a, const vint8& b) { return vint8(a) * b; } - - __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_and_si256(a, b); } - __forceinline vint8 operator &(const vint8& a, int b) { return a & vint8(b); } - __forceinline vint8 operator &(int a, const vint8& b) { return vint8(a) & b; } - - __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_or_si256(a, b); } - __forceinline vint8 operator |(const vint8& a, int b) { return a | vint8(b); } - __forceinline vint8 operator |(int a, const vint8& b) { return vint8(a) | b; } - - __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_xor_si256(a, b); } - __forceinline vint8 operator ^(const vint8& a, int b) { return a ^ vint8(b); } - __forceinline vint8 operator ^(int a, const vint8& b) { return vint8(a) ^ b; } - - __forceinline vint8 operator <<(const vint8& a, int n) { return _mm256_slli_epi32(a, n); } - __forceinline vint8 operator >>(const vint8& a, int n) { return _mm256_srai_epi32(a, n); } - - __forceinline vint8 operator <<(const vint8& a, const vint8& n) { return _mm256_sllv_epi32(a, n); } - __forceinline vint8 operator >>(const vint8& a, const vint8& n) { return _mm256_srav_epi32(a, n); } - - __forceinline vint8 sll(const vint8& a, int b) { return _mm256_slli_epi32(a, b); } - __forceinline vint8 sra(const vint8& a, int b) { return _mm256_srai_epi32(a, b); } - __forceinline vint8 srl(const 
vint8& a, int b) { return _mm256_srli_epi32(a, b); } - - __forceinline vint8 sll(const vint8& a, const vint8& b) { return _mm256_sllv_epi32(a, b); } - __forceinline vint8 sra(const vint8& a, const vint8& b) { return _mm256_srav_epi32(a, b); } - __forceinline vint8 srl(const vint8& a, const vint8& b) { return _mm256_srlv_epi32(a, b); } - - __forceinline vint8 min(const vint8& a, const vint8& b) { return _mm256_min_epi32(a, b); } - __forceinline vint8 min(const vint8& a, int b) { return min(a,vint8(b)); } - __forceinline vint8 min(int a, const vint8& b) { return min(vint8(a),b); } - - __forceinline vint8 max(const vint8& a, const vint8& b) { return _mm256_max_epi32(a, b); } - __forceinline vint8 max(const vint8& a, int b) { return max(a,vint8(b)); } - __forceinline vint8 max(int a, const vint8& b) { return max(vint8(a),b); } - - __forceinline vint8 umin(const vint8& a, const vint8& b) { return _mm256_min_epu32(a, b); } - __forceinline vint8 umax(const vint8& a, const vint8& b) { return _mm256_max_epu32(a, b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8& operator +=(vint8& a, const vint8& b) { return a = a + b; } - __forceinline vint8& operator +=(vint8& a, int b) { return a = a + b; } - - __forceinline vint8& operator -=(vint8& a, const vint8& b) { return a = a - b; } - __forceinline vint8& operator -=(vint8& a, int b) { return a = a - b; } - - __forceinline vint8& operator *=(vint8& a, const vint8& b) { return a = a * b; } - __forceinline vint8& operator *=(vint8& a, int b) { return a = a * b; } - - __forceinline vint8& operator &=(vint8& a, const vint8& b) { return a = a & b; } - __forceinline vint8& operator &=(vint8& a, int b) { return a = a & b; } - - __forceinline vint8& operator |=(vint8& a, const vint8& b) { return a = a | b; } - __forceinline vint8& operator |=(vint8& a, int b) { 
return a = a | b; } - - __forceinline vint8& operator <<=(vint8& a, const int b) { return a = a << b; } - __forceinline vint8& operator >>=(vint8& a, const int b) { return a = a >> b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } - static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } - static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } - static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } - static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } - static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } - - static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { - return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t); - } -#else - static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); } - static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); } - static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b, a)); } - static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a < b); } - static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, b)); } - static __forceinline vboolf8 operator <=(const vint8& a, 
const vint8& b) { return !(a > b); } - - static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { - return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); - } -#endif - - template<int mask> - __forceinline vint8 select(const vint8& t, const vint8& f) { - return _mm256_blend_epi32(f, t, mask); - } - - __forceinline vboolf8 operator ==(const vint8& a, int b) { return a == vint8(b); } - __forceinline vboolf8 operator ==(int a, const vint8& b) { return vint8(a) == b; } - - __forceinline vboolf8 operator !=(const vint8& a, int b) { return a != vint8(b); } - __forceinline vboolf8 operator !=(int a, const vint8& b) { return vint8(a) != b; } - - __forceinline vboolf8 operator < (const vint8& a, int b) { return a < vint8(b); } - __forceinline vboolf8 operator < (int a, const vint8& b) { return vint8(a) < b; } - - __forceinline vboolf8 operator >=(const vint8& a, int b) { return a >= vint8(b); } - __forceinline vboolf8 operator >=(int a, const vint8& b) { return vint8(a) >= b; } - - __forceinline vboolf8 operator > (const vint8& a, int b) { return a > vint8(b); } - __forceinline vboolf8 operator > (int a, const vint8& b) { return vint8(a) > b; } - - __forceinline vboolf8 operator <=(const vint8& a, int b) { return a <= vint8(b); } - __forceinline vboolf8 operator <=(int a, const vint8& b) { return vint8(a) <= b; } - - __forceinline vboolf8 eq(const vint8& a, const vint8& b) { return a == b; } - __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; } - __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a < b; } - __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; } - __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a > b; } - __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; } - -#if defined(__AVX512VL__) - static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return 
_mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); } - static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); } - static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); } - static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); } - static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); } - static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); } -#else - static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); } - static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); } - static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a < b); } - static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); } - static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a > b); } - static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_unpacklo_epi32(a, b); } - __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_unpackhi_epi32(a, b); } - - template<int i> - __forceinline vint8 
shuffle(const vint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); - } - - template<int i0, int i1> - __forceinline vint8 shuffle4(const vint8& v) { - return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); - } - - template<int i0, int i1> - __forceinline vint8 shuffle4(const vint8& a, const vint8& b) { - return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vint8 shuffle(const vint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vint8 shuffle(const vint8& a, const vint8& b) { - return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } - - __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } - - template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); } - template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); } - template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); } - - __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } - -#if !defined(__aarch64__) - -__forceinline vint8 permute(const vint8& v, const __m256i& 
index) { - return _mm256_permutevar8x32_epi32(v, index); - } - - __forceinline vint8 shuffle(const vint8& v, const __m256i& index) { - return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index)); - } - - - - template<int i> - static __forceinline vint8 align_shift_right(const vint8& a, const vint8& b) { -#if defined(__AVX512VL__) - return _mm256_alignr_epi32(a, b, i); -#else - return _mm256_alignr_epi8(a, b, 4*i); -#endif - } - -#endif - - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8 vreduce_min2(const vint8& v) { return min(v,shuffle<1,0,3,2>(v)); } - __forceinline vint8 vreduce_min4(const vint8& v) { vint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } - __forceinline vint8 vreduce_min (const vint8& v) { vint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } - - __forceinline vint8 vreduce_max2(const vint8& v) { return max(v,shuffle<1,0,3,2>(v)); } - __forceinline vint8 vreduce_max4(const vint8& v) { vint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } - __forceinline vint8 vreduce_max (const vint8& v) { vint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } - - __forceinline vint8 vreduce_add2(const vint8& v) { return v + shuffle<1,0,3,2>(v); } - __forceinline vint8 vreduce_add4(const vint8& v) { vint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } - __forceinline vint8 vreduce_add (const vint8& v) { vint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } - - __forceinline int reduce_min(const vint8& v) { return toScalar(vreduce_min(v)); } - __forceinline int reduce_max(const vint8& v) { return toScalar(vreduce_max(v)); } - __forceinline int reduce_add(const vint8& v) { return toScalar(vreduce_add(v)); } - - __forceinline size_t select_min(const vint8& v) { return bsf(movemask(v == vreduce_min(v))); } - __forceinline 
size_t select_max(const vint8& v) { return bsf(movemask(v == vreduce_max(v))); } - - __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } - __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } - - - __forceinline vint8 assign(const vint4& a) { return _mm256_castsi128_si256(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Sorting networks - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8 usort_ascending(const vint8& v) - { - const vint8 a0 = v; - const vint8 b0 = shuffle<1,0,3,2>(a0); - const vint8 c0 = umin(a0,b0); - const vint8 d0 = umax(a0,b0); - const vint8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); - const vint8 b1 = shuffle<2,3,0,1>(a1); - const vint8 c1 = umin(a1,b1); - const vint8 d1 = umax(a1,b1); - const vint8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); - const vint8 b2 = shuffle<1,0,3,2>(a2); - const vint8 c2 = umin(a2,b2); - const vint8 d2 = umax(a2,b2); - const vint8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); - const vint8 b3 = shuffle4<1,0>(a3); - const vint8 c3 = umin(a3,b3); - const vint8 d3 = umax(a3,b3); - const vint8 a4 = select<0xf /* 0b00001111 */>(c3,d3); - const vint8 b4 = shuffle<2,3,0,1>(a4); - const vint8 c4 = umin(a4,b4); - const vint8 d4 = umax(a4,b4); - const vint8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); - const vint8 b5 = shuffle<1,0,3,2>(a5); - const vint8 c5 = umin(a5,b5); - const vint8 d5 = umax(a5,b5); - const vint8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); - return a6; - } - - __forceinline vint8 usort_descending(const vint8& v) - { - const vint8 a0 = v; - const vint8 b0 = shuffle<1,0,3,2>(a0); - const vint8 c0 = umax(a0,b0); - const vint8 d0 = umin(a0,b0); - const vint8 a1 = 
select<0x99 /* 0b10011001 */>(c0,d0); - const vint8 b1 = shuffle<2,3,0,1>(a1); - const vint8 c1 = umax(a1,b1); - const vint8 d1 = umin(a1,b1); - const vint8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); - const vint8 b2 = shuffle<1,0,3,2>(a2); - const vint8 c2 = umax(a2,b2); - const vint8 d2 = umin(a2,b2); - const vint8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); - const vint8 b3 = shuffle4<1,0>(a3); - const vint8 c3 = umax(a3,b3); - const vint8 d3 = umin(a3,b3); - const vint8 a4 = select<0xf /* 0b00001111 */>(c3,d3); - const vint8 b4 = shuffle<2,3,0,1>(a4); - const vint8 c4 = umax(a4,b4); - const vint8 d4 = umin(a4,b4); - const vint8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); - const vint8 b5 = shuffle<1,0,3,2>(a5); - const vint8 c5 = umax(a5,b5); - const vint8 d5 = umin(a5,b5); - const vint8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); - return a6; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vllong4_avx2.h b/thirdparty/embree-aarch64/common/simd/vllong4_avx2.h deleted file mode 100644 index de3ebc16a7..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vllong4_avx2.h +++ /dev/null @@ -1,358 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 4-wide AVX2 64-bit long long type */ - template<> - struct vllong<4> - { - ALIGNED_STRUCT_(32); - - typedef vboold4 Bool; - - enum { size = 4 }; // number of SIMD elements - union { // data - __m256i v; - long long i[4]; - }; - - //////////////////////////////////////////////////////////////////////////////// - 
/// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong() {} - __forceinline vllong(const vllong4& t) { v = t.v; } - __forceinline vllong4& operator =(const vllong4& f) { v = f.v; return *this; } - - __forceinline vllong(const __m256i& t) { v = t; } - __forceinline operator __m256i() const { return v; } - __forceinline operator __m256d() const { return _mm256_castsi256_pd(v); } - - - __forceinline vllong(long long i) { - v = _mm256_set1_epi64x(i); - } - - __forceinline vllong(long long a, long long b, long long c, long long d) { - v = _mm256_set_epi64x(d,c,b,a); - } - - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong(ZeroTy) : v(_mm256_setzero_si256()) {} - __forceinline vllong(OneTy) : v(_mm256_set1_epi64x(1)) {} - __forceinline vllong(StepTy) : v(_mm256_set_epi64x(3,2,1,0)) {} - __forceinline vllong(ReverseStepTy) : v(_mm256_set_epi64x(0,1,2,3)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline void store_nt(void* __restrict__ ptr, const vllong4& a) { - _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(a)); - } - - static __forceinline vllong4 loadu(const void* addr) - { - return _mm256_loadu_si256((__m256i*)addr); - } - - static __forceinline vllong4 load(const vllong4* addr) { - return _mm256_load_si256((__m256i*)addr); - } - - static __forceinline vllong4 load(const long long* addr) { - return _mm256_load_si256((__m256i*)addr); - } - - static __forceinline void store(void* ptr, const vllong4& v) { - _mm256_store_si256((__m256i*)ptr,v); - } - - static __forceinline void storeu(void* ptr, const vllong4& v) { - 
_mm256_storeu_si256((__m256i*)ptr,v); - } - - static __forceinline void storeu(const vboold4& mask, long long* ptr, const vllong4& f) { -#if defined(__AVX512VL__) - _mm256_mask_storeu_epi64(ptr,mask,f); -#else - _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f)); -#endif - } - - static __forceinline void store(const vboold4& mask, void* ptr, const vllong4& f) { -#if defined(__AVX512VL__) - _mm256_mask_store_epi64(ptr,mask,f); -#else - _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f)); -#endif - } - - static __forceinline vllong4 broadcast64bit(size_t v) { - return _mm256_set1_epi64x(v); - } - - static __forceinline size_t extract64bit(const vllong4& v) - { - return _mm_cvtsi128_si64(_mm256_castsi256_si128(v)); - } - - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline long long& operator [](size_t index) { assert(index < 4); return i[index]; } - __forceinline const long long& operator [](size_t index) const { assert(index < 4); return i[index]; } - - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong4 select(const vboold4& m, const vllong4& t, const vllong4& f) { - #if defined(__AVX512VL__) - return _mm256_mask_blend_epi64(m, f, t); - #else - return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(f), _mm256_castsi256_pd(t), m)); - #endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboold4 asBool(const vllong4& a) { return _mm256_movepi64_mask(a); } -#else - __forceinline vboold4 asBool(const vllong4& a) { return _mm256_castsi256_pd(a); } 
-#endif - - __forceinline vllong4 operator +(const vllong4& a) { return a; } - __forceinline vllong4 operator -(const vllong4& a) { return _mm256_sub_epi64(_mm256_setzero_si256(), a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong4 operator +(const vllong4& a, const vllong4& b) { return _mm256_add_epi64(a, b); } - __forceinline vllong4 operator +(const vllong4& a, long long b) { return a + vllong4(b); } - __forceinline vllong4 operator +(long long a, const vllong4& b) { return vllong4(a) + b; } - - __forceinline vllong4 operator -(const vllong4& a, const vllong4& b) { return _mm256_sub_epi64(a, b); } - __forceinline vllong4 operator -(const vllong4& a, long long b) { return a - vllong4(b); } - __forceinline vllong4 operator -(long long a, const vllong4& b) { return vllong4(a) - b; } - - /* only low 32bit part */ - __forceinline vllong4 operator *(const vllong4& a, const vllong4& b) { return _mm256_mul_epi32(a, b); } - __forceinline vllong4 operator *(const vllong4& a, long long b) { return a * vllong4(b); } - __forceinline vllong4 operator *(long long a, const vllong4& b) { return vllong4(a) * b; } - - __forceinline vllong4 operator &(const vllong4& a, const vllong4& b) { return _mm256_and_si256(a, b); } - __forceinline vllong4 operator &(const vllong4& a, long long b) { return a & vllong4(b); } - __forceinline vllong4 operator &(long long a, const vllong4& b) { return vllong4(a) & b; } - - __forceinline vllong4 operator |(const vllong4& a, const vllong4& b) { return _mm256_or_si256(a, b); } - __forceinline vllong4 operator |(const vllong4& a, long long b) { return a | vllong4(b); } - __forceinline vllong4 operator |(long long a, const vllong4& b) { return vllong4(a) | b; } - - __forceinline vllong4 operator ^(const vllong4& a, const vllong4& b) { return _mm256_xor_si256(a, b); } - __forceinline 
vllong4 operator ^(const vllong4& a, long long b) { return a ^ vllong4(b); } - __forceinline vllong4 operator ^(long long a, const vllong4& b) { return vllong4(a) ^ b; } - - __forceinline vllong4 operator <<(const vllong4& a, long long n) { return _mm256_slli_epi64(a, (int)n); } - //__forceinline vllong4 operator >>(const vllong4& a, long long n) { return _mm256_srai_epi64(a, n); } - - __forceinline vllong4 operator <<(const vllong4& a, const vllong4& n) { return _mm256_sllv_epi64(a, n); } - //__forceinline vllong4 operator >>(const vllong4& a, const vllong4& n) { return _mm256_srav_epi64(a, n); } - //__forceinline vllong4 sra(const vllong4& a, long long b) { return _mm256_srai_epi64(a, b); } - - __forceinline vllong4 srl(const vllong4& a, long long b) { return _mm256_srli_epi64(a, (int)b); } - - //__forceinline vllong4 min(const vllong4& a, const vllong4& b) { return _mm256_min_epi64(a, b); } - //__forceinline vllong4 min(const vllong4& a, long long b) { return min(a,vllong4(b)); } - //__forceinline vllong4 min(long long a, const vllong4& b) { return min(vllong4(a),b); } - - //__forceinline vllong4 max(const vllong4& a, const vllong4& b) { return _mm256_max_epi64(a, b); } - //__forceinline vllong4 max(const vllong4& a, long long b) { return max(a,vllong4(b)); } - //__forceinline vllong4 max(long long a, const vllong4& b) { return max(vllong4(a),b); } - -#if defined(__AVX512VL__) - __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_and_epi64(c,m,a,b); } - __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_or_epi64(c,m,a,b); } -#else - __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a & b, c); } - __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a | b, c); } -#endif - - 
//////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong4& operator +=(vllong4& a, const vllong4& b) { return a = a + b; } - __forceinline vllong4& operator +=(vllong4& a, long long b) { return a = a + b; } - - __forceinline vllong4& operator -=(vllong4& a, const vllong4& b) { return a = a - b; } - __forceinline vllong4& operator -=(vllong4& a, long long b) { return a = a - b; } - - __forceinline vllong4& operator *=(vllong4& a, const vllong4& b) { return a = a * b; } - __forceinline vllong4& operator *=(vllong4& a, long long b) { return a = a * b; } - - __forceinline vllong4& operator &=(vllong4& a, const vllong4& b) { return a = a & b; } - __forceinline vllong4& operator &=(vllong4& a, long long b) { return a = a & b; } - - __forceinline vllong4& operator |=(vllong4& a, const vllong4& b) { return a = a | b; } - __forceinline vllong4& operator |=(vllong4& a, long long b) { return a = a | b; } - - __forceinline vllong4& operator <<=(vllong4& a, long long b) { return a = a << b; } - //__forceinline vllong4& operator >>=(vllong4& a, long long b) { return a = a >> b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboold4 operator 
> (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } -#else - __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmpeq_epi64(a,b); } - __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return !(a == b); } - __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(a,b); } - __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(b,a); } - __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return !(a < b); } - __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return !(a > b); } -#endif - - __forceinline vboold4 operator ==(const vllong4& a, long long b) { return a == vllong4(b); } - __forceinline vboold4 operator ==(long long a, const vllong4& b) { return vllong4(a) == b; } - - __forceinline vboold4 operator !=(const vllong4& a, long long b) { return a != vllong4(b); } - __forceinline vboold4 operator !=(long long a, const vllong4& b) { return vllong4(a) != b; } - - __forceinline vboold4 operator > (const vllong4& a, long long b) { return a > vllong4(b); } - __forceinline vboold4 operator > (long long a, const vllong4& b) { return vllong4(a) > b; } - - __forceinline vboold4 operator < (const vllong4& a, long long b) { return a < vllong4(b); } - __forceinline vboold4 operator < (long long a, const vllong4& b) { return vllong4(a) < b; } - - __forceinline vboold4 operator >=(const vllong4& a, long long b) { return a >= vllong4(b); } - __forceinline vboold4 operator >=(long long a, const vllong4& b) { return vllong4(a) >= b; } - - __forceinline vboold4 operator <=(const vllong4& a, long long b) { return a <= vllong4(b); } - __forceinline vboold4 operator <=(long long a, const vllong4& b) { return vllong4(a) <= b; } - - 
__forceinline vboold4 eq(const vllong4& a, const vllong4& b) { return a == b; } - __forceinline vboold4 ne(const vllong4& a, const vllong4& b) { return a != b; } - __forceinline vboold4 lt(const vllong4& a, const vllong4& b) { return a < b; } - __forceinline vboold4 ge(const vllong4& a, const vllong4& b) { return a >= b; } - __forceinline vboold4 gt(const vllong4& a, const vllong4& b) { return a > b; } - __forceinline vboold4 le(const vllong4& a, const vllong4& b) { return a <= b; } - -#if defined(__AVX512VL__) - __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_EQ); } - __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_NE); } - __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LT); } - __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GE); } - __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GT); } - __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LE); } -#else - __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a == b); } - __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a != b); } - __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a < b); } - __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a >= b); } - __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a > b); } - __forceinline 
vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a <= b); } -#endif - - __forceinline void xchg(const vboold4& m, vllong4& a, vllong4& b) { - const vllong4 c = a; a = select(m,b,a); b = select(m,c,b); - } - - __forceinline vboold4 test(const vllong4& a, const vllong4& b) { -#if defined(__AVX512VL__) - return _mm256_test_epi64_mask(a,b); -#else - return _mm256_testz_si256(a,b); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - // Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - template<int i0, int i1> - __forceinline vllong4 shuffle(const vllong4& v) { - return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(v), (i1 << 3) | (i0 << 2) | (i1 << 1) | i0)); - } - - template<int i> - __forceinline vllong4 shuffle(const vllong4& v) { - return shuffle<i, i>(v); - } - - template<int i0, int i1> - __forceinline vllong4 shuffle2(const vllong4& v) { - return _mm256_castpd_si256(_mm256_permute2f128_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(v), (i1 << 4) | i0)); - } - - __forceinline long long toScalar(const vllong4& v) { - return _mm_cvtsi128_si64(_mm256_castsi256_si128(v)); - } - -#if defined(__AVX512VL__) - __forceinline vllong4 permute(const vllong4& a, const __m256i& index) { - // workaround for GCC 7.x -#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) - return _mm256_permutex2var_epi64(a,index,a); -#else - return _mm256_permutexvar_epi64(index,a); -#endif - } - - __forceinline vllong4 permutex2var(const vllong4& index, const vllong4& a, const vllong4& b) { - return _mm256_permutex2var_epi64(a,index,b); - } - -#endif - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - - __forceinline vllong4 vreduce_and2(const vllong4& x) { return x & 
shuffle<1,0>(x); } - __forceinline vllong4 vreduce_and (const vllong4& y) { const vllong4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); } - - __forceinline vllong4 vreduce_or2(const vllong4& x) { return x | shuffle<1,0>(x); } - __forceinline vllong4 vreduce_or (const vllong4& y) { const vllong4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); } - - __forceinline vllong4 vreduce_add2(const vllong4& x) { return x + shuffle<1,0>(x); } - __forceinline vllong4 vreduce_add (const vllong4& y) { const vllong4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); } - - __forceinline long long reduce_add(const vllong4& a) { return toScalar(vreduce_add(a)); } - __forceinline long long reduce_or (const vllong4& a) { return toScalar(vreduce_or(a)); } - __forceinline long long reduce_and(const vllong4& a) { return toScalar(vreduce_and(a)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vllong4& v) - { - cout << "<" << v[0]; - for (size_t i=1; i<4; i++) cout << ", " << v[i]; - cout << ">"; - return cout; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vllong8_avx512.h b/thirdparty/embree-aarch64/common/simd/vllong8_avx512.h deleted file mode 100644 index 76dddd8991..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vllong8_avx512.h +++ /dev/null @@ -1,381 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX-512 64-bit long long type */ - template<> - struct vllong<8> - { - ALIGNED_STRUCT_(64); - - typedef vboold8 Bool; - - enum { size = 8 }; // number of SIMD elements - union { // data - __m512i v; - long long i[8]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - 
//////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong() {} - __forceinline vllong(const vllong8& t) { v = t.v; } - __forceinline vllong8& operator =(const vllong8& f) { v = f.v; return *this; } - - __forceinline vllong(const __m512i& t) { v = t; } - __forceinline operator __m512i() const { return v; } - __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } - - __forceinline vllong(long long i) { - v = _mm512_set1_epi64(i); - } - - __forceinline vllong(long long a, long long b, long long c, long long d) { - v = _mm512_set4_epi64(d,c,b,a); - } - - __forceinline vllong(long long a0, long long a1, long long a2, long long a3, - long long a4, long long a5, long long a6, long long a7) - { - v = _mm512_set_epi64(a7,a6,a5,a4,a3,a2,a1,a0); - } - - __forceinline vllong(const vllong<4>& i) { - v = _mm512_broadcast_i64x4(i); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong(ZeroTy) : v(_mm512_setzero_epi32()) {} - __forceinline vllong(OneTy) : v(_mm512_set1_epi64(1)) {} - __forceinline vllong(StepTy) : v(_mm512_set_epi64(7,6,5,4,3,2,1,0)) {} - __forceinline vllong(ReverseStepTy) : v(_mm512_setr_epi64(7,6,5,4,3,2,1,0)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline void store_nt(void* __restrict__ ptr, const vllong8& a) { - _mm512_stream_si512((__m512i*)ptr,a); - } - - static __forceinline vllong8 loadu(const void* addr) { - return _mm512_loadu_si512(addr); - } - - static __forceinline vllong8 load(const vllong8* addr) { - return _mm512_load_si512(addr); - } - - static __forceinline vllong8 load(const long long* addr) { - return _mm512_load_si512(addr); - } - - static 
__forceinline vllong8 load(const uint8_t* ptr) { - return _mm512_cvtepu8_epi64(*(__m128i*)ptr); - } - - static __forceinline void store(void* ptr, const vllong8& v) { - _mm512_store_si512(ptr,v); - } - - static __forceinline void storeu(void* ptr, const vllong8& v) { - _mm512_storeu_si512(ptr,v); - } - - static __forceinline void storeu(const vboold8& mask, long long* ptr, const vllong8& f) { - _mm512_mask_storeu_epi64(ptr,mask,f); - } - - static __forceinline void store(const vboold8& mask, void* addr, const vllong8& v2) { - _mm512_mask_store_epi64(addr,mask,v2); - } - - /* pass by value to avoid compiler generating inefficient code */ - static __forceinline void storeu_compact(const vboold8 mask, void* addr, const vllong8& reg) { - _mm512_mask_compressstoreu_epi64(addr,mask,reg); - } - - static __forceinline vllong8 compact64bit(const vboold8& mask, vllong8& v) { - return _mm512_mask_compress_epi64(v,mask,v); - } - - static __forceinline vllong8 compact64bit(const vboold8& mask, vllong8& dest, const vllong8& source) { - return _mm512_mask_compress_epi64(dest,mask,source); - } - - static __forceinline vllong8 compact(const vboold8& mask, vllong8& v) { - return _mm512_mask_compress_epi64(v,mask,v); - } - - static __forceinline vllong8 compact(const vboold8& mask, const vllong8& a, vllong8& b) { - return _mm512_mask_compress_epi64(a,mask,b); - } - - static __forceinline vllong8 expand(const vboold8& mask, const vllong8& a, vllong8& b) { - return _mm512_mask_expand_epi64(b,mask,a); - } - - static __forceinline vllong8 broadcast64bit(size_t v) { - return _mm512_set1_epi64(v); - } - - static __forceinline size_t extract64bit(const vllong8& v) - { - return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); - } - - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline long long& operator [](size_t index) { assert(index < 8); 
return i[index]; } - __forceinline const long long& operator [](size_t index) const { assert(index < 8); return i[index]; } - - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold8 asBool(const vllong8& a) { return _mm512_movepi64_mask(a); } - - __forceinline vllong8 operator +(const vllong8& a) { return a; } - __forceinline vllong8 operator -(const vllong8& a) { return _mm512_sub_epi64(_mm512_setzero_epi32(), a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong8 operator +(const vllong8& a, const vllong8& b) { return _mm512_add_epi64(a, b); } - __forceinline vllong8 operator +(const vllong8& a, long long b) { return a + vllong8(b); } - __forceinline vllong8 operator +(long long a, const vllong8& b) { return vllong8(a) + b; } - - __forceinline vllong8 operator -(const vllong8& a, const vllong8& b) { return _mm512_sub_epi64(a, b); } - __forceinline vllong8 operator -(const vllong8& a, long long b) { return a - vllong8(b); } - __forceinline vllong8 operator -(long long a, const vllong8& b) { return vllong8(a) - b; } - - __forceinline vllong8 operator *(const vllong8& a, const vllong8& b) { return _mm512_mullo_epi64(a, b); } - __forceinline vllong8 operator *(const vllong8& a, long long b) { return a * vllong8(b); } - __forceinline vllong8 operator *(long long a, const vllong8& b) { return vllong8(a) * b; } - - __forceinline vllong8 operator &(const vllong8& a, const vllong8& b) { return _mm512_and_epi64(a, b); } - __forceinline vllong8 operator &(const vllong8& a, long long b) { return a & vllong8(b); } - __forceinline vllong8 operator &(long long a, const vllong8& b) { return vllong8(a) & b; } - - __forceinline vllong8 operator |(const 
vllong8& a, const vllong8& b) { return _mm512_or_epi64(a, b); } - __forceinline vllong8 operator |(const vllong8& a, long long b) { return a | vllong8(b); } - __forceinline vllong8 operator |(long long a, const vllong8& b) { return vllong8(a) | b; } - - __forceinline vllong8 operator ^(const vllong8& a, const vllong8& b) { return _mm512_xor_epi64(a, b); } - __forceinline vllong8 operator ^(const vllong8& a, long long b) { return a ^ vllong8(b); } - __forceinline vllong8 operator ^(long long a, const vllong8& b) { return vllong8(a) ^ b; } - - __forceinline vllong8 operator <<(const vllong8& a, long long n) { return _mm512_slli_epi64(a, n); } - __forceinline vllong8 operator >>(const vllong8& a, long long n) { return _mm512_srai_epi64(a, n); } - - __forceinline vllong8 operator <<(const vllong8& a, const vllong8& n) { return _mm512_sllv_epi64(a, n); } - __forceinline vllong8 operator >>(const vllong8& a, const vllong8& n) { return _mm512_srav_epi64(a, n); } - - __forceinline vllong8 sll (const vllong8& a, long long b) { return _mm512_slli_epi64(a, b); } - __forceinline vllong8 sra (const vllong8& a, long long b) { return _mm512_srai_epi64(a, b); } - __forceinline vllong8 srl (const vllong8& a, long long b) { return _mm512_srli_epi64(a, b); } - - __forceinline vllong8 min(const vllong8& a, const vllong8& b) { return _mm512_min_epi64(a, b); } - __forceinline vllong8 min(const vllong8& a, long long b) { return min(a,vllong8(b)); } - __forceinline vllong8 min(long long a, const vllong8& b) { return min(vllong8(a),b); } - - __forceinline vllong8 max(const vllong8& a, const vllong8& b) { return _mm512_max_epi64(a, b); } - __forceinline vllong8 max(const vllong8& a, long long b) { return max(a,vllong8(b)); } - __forceinline vllong8 max(long long a, const vllong8& b) { return max(vllong8(a),b); } - - __forceinline vllong8 mask_add(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_add_epi64(c,m,a,b); } - __forceinline vllong8 
mask_sub(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_sub_epi64(c,m,a,b); } - - __forceinline vllong8 mask_and(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_and_epi64(c,m,a,b); } - __forceinline vllong8 mask_or (const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_or_epi64(c,m,a,b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong8& operator +=(vllong8& a, const vllong8& b) { return a = a + b; } - __forceinline vllong8& operator +=(vllong8& a, long long b) { return a = a + b; } - - __forceinline vllong8& operator -=(vllong8& a, const vllong8& b) { return a = a - b; } - __forceinline vllong8& operator -=(vllong8& a, long long b) { return a = a - b; } - - __forceinline vllong8& operator *=(vllong8& a, const vllong8& b) { return a = a * b; } - __forceinline vllong8& operator *=(vllong8& a, long long b) { return a = a * b; } - - __forceinline vllong8& operator &=(vllong8& a, const vllong8& b) { return a = a & b; } - __forceinline vllong8& operator &=(vllong8& a, long long b) { return a = a & b; } - - __forceinline vllong8& operator |=(vllong8& a, const vllong8& b) { return a = a | b; } - __forceinline vllong8& operator |=(vllong8& a, long long b) { return a = a | b; } - - __forceinline vllong8& operator <<=(vllong8& a, long long b) { return a = a << b; } - __forceinline vllong8& operator >>=(vllong8& a, long long b) { return a = a >> b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold8 operator ==(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } - 
__forceinline vboold8 operator ==(const vllong8& a, long long b) { return a == vllong8(b); } - __forceinline vboold8 operator ==(long long a, const vllong8& b) { return vllong8(a) == b; } - - __forceinline vboold8 operator !=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboold8 operator !=(const vllong8& a, long long b) { return a != vllong8(b); } - __forceinline vboold8 operator !=(long long a, const vllong8& b) { return vllong8(a) != b; } - - __forceinline vboold8 operator < (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboold8 operator < (const vllong8& a, long long b) { return a < vllong8(b); } - __forceinline vboold8 operator < (long long a, const vllong8& b) { return vllong8(a) < b; } - - __forceinline vboold8 operator >=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboold8 operator >=(const vllong8& a, long long b) { return a >= vllong8(b); } - __forceinline vboold8 operator >=(long long a, const vllong8& b) { return vllong8(a) >= b; } - - __forceinline vboold8 operator > (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboold8 operator > (const vllong8& a, long long b) { return a > vllong8(b); } - __forceinline vboold8 operator > (long long a, const vllong8& b) { return vllong8(a) > b; } - - __forceinline vboold8 operator <=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } - __forceinline vboold8 operator <=(const vllong8& a, long long b) { return a <= vllong8(b); } - __forceinline vboold8 operator <=(long long a, const vllong8& b) { return vllong8(a) <= b; } - - __forceinline vboold8 eq(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboold8 ne(const vllong8& a, const vllong8& b) { return 
_mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboold8 lt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboold8 ge(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboold8 gt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboold8 le(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } - - __forceinline vboold8 eq(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_EQ); } - __forceinline vboold8 ne(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_NE); } - __forceinline vboold8 lt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LT); } - __forceinline vboold8 ge(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GE); } - __forceinline vboold8 gt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GT); } - __forceinline vboold8 le(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LE); } - - __forceinline vllong8 select(const vboold8& m, const vllong8& t, const vllong8& f) { - return _mm512_mask_or_epi64(f,m,t,t); - } - - __forceinline void xchg(const vboold8& m, vllong8& a, vllong8& b) { - const vllong8 c = a; a = select(m,b,a); b = select(m,c,b); - } - - __forceinline vboold8 test(const vboold8& m, const vllong8& a, const vllong8& b) { - return _mm512_mask_test_epi64_mask(m,a,b); - } - - __forceinline vboold8 test(const vllong8& a, const vllong8& b) { - return _mm512_test_epi64_mask(a,b); - } - - //////////////////////////////////////////////////////////////////////////////// - // 
Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - template<int i0, int i1> - __forceinline vllong8 shuffle(const vllong8& v) { - return _mm512_castpd_si512(_mm512_permute_pd(_mm512_castsi512_pd(v), (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0)); - } - - template<int i> - __forceinline vllong8 shuffle(const vllong8& v) { - return shuffle<i, i>(v); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vllong8 shuffle(const vllong8& v) { - return _mm512_permutex_epi64(v, _MM_SHUFFLE(i3, i2, i1, i0)); - } - - template<int i0, int i1> - __forceinline vllong8 shuffle4(const vllong8& v) { - return _mm512_shuffle_i64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2)); - } - - template<int i> - __forceinline vllong8 shuffle4(const vllong8& v) { - return shuffle4<i, i>(v); - } - - template<int i> - __forceinline vllong8 align_shift_right(const vllong8& a, const vllong8& b) { - return _mm512_alignr_epi64(a, b, i); - }; - - __forceinline long long toScalar(const vllong8& v) { - return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); - } - - __forceinline vllong8 zeroExtend32Bit(const __m512i& a) { - return _mm512_cvtepu32_epi64(_mm512_castsi512_si256(a)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong8 vreduce_min2(vllong8 x) { return min(x, shuffle<1,0,3,2>(x)); } - __forceinline vllong8 vreduce_min4(vllong8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } - __forceinline vllong8 vreduce_min (vllong8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); } - - __forceinline vllong8 vreduce_max2(vllong8 x) { return max(x, shuffle<1,0,3,2>(x)); } - __forceinline vllong8 vreduce_max4(vllong8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } - __forceinline 
vllong8 vreduce_max (vllong8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); } - - __forceinline vllong8 vreduce_and2(vllong8 x) { return x & shuffle<1,0,3,2>(x); } - __forceinline vllong8 vreduce_and4(vllong8 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } - __forceinline vllong8 vreduce_and (vllong8 x) { x = vreduce_and4(x); return x & shuffle4<1,0>(x); } - - __forceinline vllong8 vreduce_or2(vllong8 x) { return x | shuffle<1,0,3,2>(x); } - __forceinline vllong8 vreduce_or4(vllong8 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } - __forceinline vllong8 vreduce_or (vllong8 x) { x = vreduce_or4(x); return x | shuffle4<1,0>(x); } - - __forceinline vllong8 vreduce_add2(vllong8 x) { return x + shuffle<1,0,3,2>(x); } - __forceinline vllong8 vreduce_add4(vllong8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } - __forceinline vllong8 vreduce_add (vllong8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); } - - __forceinline long long reduce_min(const vllong8& v) { return toScalar(vreduce_min(v)); } - __forceinline long long reduce_max(const vllong8& v) { return toScalar(vreduce_max(v)); } - __forceinline long long reduce_and(const vllong8& v) { return toScalar(vreduce_and(v)); } - __forceinline long long reduce_or (const vllong8& v) { return toScalar(vreduce_or (v)); } - __forceinline long long reduce_add(const vllong8& v) { return toScalar(vreduce_add(v)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Memory load and store operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong8 permute(const vllong8& v, const vllong8& index) { - return _mm512_permutexvar_epi64(index,v); - } - - __forceinline vllong8 reverse(const vllong8& a) { - return permute(a,vllong8(reverse_step)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - 
//////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vllong8& v) - { - cout << "<" << v[0]; - for (size_t i=1; i<8; i++) cout << ", " << v[i]; - cout << ">"; - return cout; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vuint16_avx512.h b/thirdparty/embree-aarch64/common/simd/vuint16_avx512.h deleted file mode 100644 index 39752611bb..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vuint16_avx512.h +++ /dev/null @@ -1,443 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 16-wide AVX-512 unsigned integer type */ - template<> - struct vuint<16> - { - ALIGNED_STRUCT_(64); - - typedef vboolf16 Bool; - typedef vuint16 UInt; - typedef vfloat16 Float; - - enum { size = 16 }; // number of SIMD elements - union { // data - __m512i v; - unsigned int i[16]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint() {} - __forceinline vuint(const vuint16& t) { v = t.v; } - __forceinline vuint16& operator =(const vuint16& f) { v = f.v; return *this; } - - __forceinline vuint(const __m512i& t) { v = t; } - __forceinline operator __m512i() const { return v; } - __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } - - __forceinline vuint(unsigned int i) { - v = _mm512_set1_epi32(i); - } - - __forceinline vuint(const vuint4& i) { - v = _mm512_broadcast_i32x4(i); - } - - __forceinline vuint(const vuint8& i) { - v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i)))); - } - - __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) { - v = _mm512_set4_epi32(d,c,b,a); - } - - __forceinline vuint(unsigned int a0 , 
unsigned int a1 , unsigned int a2 , unsigned int a3, - unsigned int a4 , unsigned int a5 , unsigned int a6 , unsigned int a7, - unsigned int a8 , unsigned int a9 , unsigned int a10, unsigned int a11, - unsigned int a12, unsigned int a13, unsigned int a14, unsigned int a15) - { - v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0); - } - - __forceinline explicit vuint(const __m512& f) { - v = _mm512_cvtps_epu32(f); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint(ZeroTy) : v(_mm512_setzero_epi32()) {} - __forceinline vuint(OneTy) : v(_mm512_set1_epi32(1)) {} - __forceinline vuint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} - __forceinline vuint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline void store_nt(void* __restrict__ ptr, const vuint16& a) { - _mm512_stream_si512((__m512i*)ptr,a); - } - - static __forceinline vuint16 loadu(const void* addr) - { - return _mm512_loadu_si512(addr); - } - - static __forceinline vuint16 loadu(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } - static __forceinline vuint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); } - - static __forceinline vuint16 load(const vuint16* addr) { - return _mm512_load_si512(addr); - } - - static __forceinline vuint16 load(const unsigned int* addr) { - return _mm512_load_si512(addr); - } - - static __forceinline vuint16 load(unsigned short* ptr) { return _mm512_cvtepu16_epi32(*(__m256i*)ptr); } - - - static __forceinline void store(void* ptr, const vuint16& 
v) { - _mm512_store_si512(ptr,v); - } - - static __forceinline void storeu(void* ptr, const vuint16& v) { - _mm512_storeu_si512(ptr,v); - } - - static __forceinline void storeu(const vboolf16& mask, void* ptr, const vuint16& f) { - _mm512_mask_storeu_epi32(ptr,mask,f); - } - - static __forceinline void store(const vboolf16& mask, void* addr, const vuint16& v2) { - _mm512_mask_store_epi32(addr,mask,v2); - } - - /* pass by value to avoid compiler generating inefficient code */ - static __forceinline void storeu_compact(const vboolf16 mask, void* addr, const vuint16 reg) { - _mm512_mask_compressstoreu_epi32(addr,mask,reg); - } - - static __forceinline void storeu_compact_single(const vboolf16 mask, void* addr, vuint16 reg) { - //_mm512_mask_compressstoreu_epi32(addr,mask,reg); - *(float*)addr = mm512_cvtss_f32(_mm512_mask_compress_ps(_mm512_castsi512_ps(reg),mask,_mm512_castsi512_ps(reg))); - } - - static __forceinline vuint16 compact64bit(const vboolf16& mask, vuint16& v) { - return _mm512_mask_compress_epi64(v,mask,v); - } - - static __forceinline vuint16 compact(const vboolf16& mask, vuint16& v) { - return _mm512_mask_compress_epi32(v,mask,v); - } - - static __forceinline vuint16 compact(const vboolf16& mask, const vuint16& a, vuint16& b) { - return _mm512_mask_compress_epi32(a,mask,b); - } - - static __forceinline vuint16 expand(const vboolf16& mask, const vuint16& a, vuint16& b) { - return _mm512_mask_expand_epi32(b,mask,a); - } - - template<int scale = 4> - static __forceinline vuint16 gather(const unsigned int* ptr, const vint16& index) { - return _mm512_i32gather_epi32(index,ptr,scale); - } - - template<int scale = 4> - static __forceinline vuint16 gather(const vboolf16& mask, const unsigned int* ptr, const vint16& index) { - return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale); - } - - template<int scale = 4> - static __forceinline vuint16 gather(const vboolf16& mask, vuint16& dest, const unsigned int* ptr, const vint16& index) { 
- return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale); - } - - template<int scale = 4> - static __forceinline void scatter(unsigned int* ptr, const vint16& index, const vuint16& v) { - _mm512_i32scatter_epi32((int*)ptr,index,v,scale); - } - - template<int scale = 4> - static __forceinline void scatter(const vboolf16& mask, unsigned int* ptr, const vint16& index, const vuint16& v) { - _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale); - } - - static __forceinline vuint16 broadcast64bit(size_t v) { - return _mm512_set1_epi64(v); - } - - static __forceinline size_t extract64bit(const vuint16& v) - { - return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline unsigned int& operator [](size_t index) { assert(index < 16); return i[index]; } - __forceinline const unsigned int& operator [](size_t index) const { assert(index < 16); return i[index]; } - - __forceinline unsigned int uint (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; } - __forceinline size_t& uint64_t(size_t index) const { assert(index < 8); return ((size_t*)i)[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf16 asBool(const vuint16& a) { return _mm512_movepi32_mask(a); } - - __forceinline vuint16 operator +(const vuint16& a) { return a; } - __forceinline vuint16 operator -(const vuint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint16 operator +(const vuint16& 
a, const vuint16& b) { return _mm512_add_epi32(a, b); } - __forceinline vuint16 operator +(const vuint16& a, unsigned int b) { return a + vuint16(b); } - __forceinline vuint16 operator +(unsigned int a, const vuint16& b) { return vuint16(a) + b; } - - __forceinline vuint16 operator -(const vuint16& a, const vuint16& b) { return _mm512_sub_epi32(a, b); } - __forceinline vuint16 operator -(const vuint16& a, unsigned int b) { return a - vuint16(b); } - __forceinline vuint16 operator -(unsigned int a, const vuint16& b) { return vuint16(a) - b; } - - __forceinline vuint16 operator *(const vuint16& a, const vuint16& b) { return _mm512_mul_epu32(a, b); } - __forceinline vuint16 operator *(const vuint16& a, unsigned int b) { return a * vuint16(b); } - __forceinline vuint16 operator *(unsigned int a, const vuint16& b) { return vuint16(a) * b; } - - __forceinline vuint16 operator &(const vuint16& a, const vuint16& b) { return _mm512_and_epi32(a, b); } - __forceinline vuint16 operator &(const vuint16& a, unsigned int b) { return a & vuint16(b); } - __forceinline vuint16 operator &(unsigned int a, const vuint16& b) { return vuint16(a) & b; } - - __forceinline vuint16 operator |(const vuint16& a, const vuint16& b) { return _mm512_or_epi32(a, b); } - __forceinline vuint16 operator |(const vuint16& a, unsigned int b) { return a | vuint16(b); } - __forceinline vuint16 operator |(unsigned int a, const vuint16& b) { return vuint16(a) | b; } - - __forceinline vuint16 operator ^(const vuint16& a, const vuint16& b) { return _mm512_xor_epi32(a, b); } - __forceinline vuint16 operator ^(const vuint16& a, unsigned int b) { return a ^ vuint16(b); } - __forceinline vuint16 operator ^(unsigned int a, const vuint16& b) { return vuint16(a) ^ b; } - - __forceinline vuint16 operator <<(const vuint16& a, unsigned int n) { return _mm512_slli_epi32(a, n); } - __forceinline vuint16 operator >>(const vuint16& a, unsigned int n) { return _mm512_srli_epi32(a, n); } - - __forceinline vuint16 operator 
<<(const vuint16& a, const vuint16& n) { return _mm512_sllv_epi32(a, n); } - __forceinline vuint16 operator >>(const vuint16& a, const vuint16& n) { return _mm512_srlv_epi32(a, n); } - - __forceinline vuint16 sll (const vuint16& a, unsigned int b) { return _mm512_slli_epi32(a, b); } - __forceinline vuint16 sra (const vuint16& a, unsigned int b) { return _mm512_srai_epi32(a, b); } - __forceinline vuint16 srl (const vuint16& a, unsigned int b) { return _mm512_srli_epi32(a, b); } - - __forceinline vuint16 min(const vuint16& a, const vuint16& b) { return _mm512_min_epu32(a, b); } - __forceinline vuint16 min(const vuint16& a, unsigned int b) { return min(a,vuint16(b)); } - __forceinline vuint16 min(unsigned int a, const vuint16& b) { return min(vuint16(a),b); } - - __forceinline vuint16 max(const vuint16& a, const vuint16& b) { return _mm512_max_epu32(a, b); } - __forceinline vuint16 max(const vuint16& a, unsigned int b) { return max(a,vuint16(b)); } - __forceinline vuint16 max(unsigned int a, const vuint16& b) { return max(vuint16(a),b); } - - __forceinline vuint16 mask_add(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } - __forceinline vuint16 mask_sub(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); } - - __forceinline vuint16 mask_and(const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_and_epi32(c,m,a,b); } - __forceinline vuint16 mask_or (const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_or_epi32(c,m,a,b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint16& operator +=(vuint16& a, const vuint16& b) { return a = a + b; } - __forceinline vuint16& operator +=(vuint16& a, unsigned int b) { return 
a = a + b; } - - __forceinline vuint16& operator -=(vuint16& a, const vuint16& b) { return a = a - b; } - __forceinline vuint16& operator -=(vuint16& a, unsigned int b) { return a = a - b; } - - __forceinline vuint16& operator *=(vuint16& a, const vuint16& b) { return a = a * b; } - __forceinline vuint16& operator *=(vuint16& a, unsigned int b) { return a = a * b; } - - __forceinline vuint16& operator &=(vuint16& a, const vuint16& b) { return a = a & b; } - __forceinline vuint16& operator &=(vuint16& a, unsigned int b) { return a = a & b; } - - __forceinline vuint16& operator |=(vuint16& a, const vuint16& b) { return a = a | b; } - __forceinline vuint16& operator |=(vuint16& a, unsigned int b) { return a = a | b; } - - __forceinline vuint16& operator <<=(vuint16& a, unsigned int b) { return a = a << b; } - __forceinline vuint16& operator >>=(vuint16& a, unsigned int b) { return a = a >> b; } - - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf16 operator ==(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 operator ==(const vuint16& a, unsigned int b) { return a == vuint16(b); } - __forceinline vboolf16 operator ==(unsigned int a, const vuint16& b) { return vuint16(a) == b; } - - __forceinline vboolf16 operator !=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 operator !=(const vuint16& a, unsigned int b) { return a != vuint16(b); } - __forceinline vboolf16 operator !=(unsigned int a, const vuint16& b) { return vuint16(a) != b; } - - __forceinline vboolf16 operator < (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 operator < (const vuint16& a, unsigned int b) { return a < vuint16(b); } - 
__forceinline vboolf16 operator < (unsigned int a, const vuint16& b) { return vuint16(a) < b; } - - __forceinline vboolf16 operator >=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 operator >=(const vuint16& a, unsigned int b) { return a >= vuint16(b); } - __forceinline vboolf16 operator >=(unsigned int a, const vuint16& b) { return vuint16(a) >= b; } - - __forceinline vboolf16 operator > (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 operator > (const vuint16& a, unsigned int b) { return a > vuint16(b); } - __forceinline vboolf16 operator > (unsigned int a, const vuint16& b) { return vuint16(a) > b; } - - __forceinline vboolf16 operator <=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } - __forceinline vboolf16 operator <=(const vuint16& a, unsigned int b) { return a <= vuint16(b); } - __forceinline vboolf16 operator <=(unsigned int a, const vuint16& b) { return vuint16(a) <= b; } - - __forceinline vboolf16 eq(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 ne(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 lt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 ge(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 gt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 le(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } - - __forceinline vboolf16 eq(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 ne(const vboolf16 mask, const vuint16& a, const 
vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 lt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 ge(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 gt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 le(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); } - - - __forceinline vuint16 select(const vboolf16& m, const vuint16& t, const vuint16& f) { - return _mm512_mask_or_epi32(f,m,t,t); - } - - __forceinline void xchg(const vboolf16& m, vuint16& a, vuint16& b) { - const vuint16 c = a; a = select(m,b,a); b = select(m,c,b); - } - - __forceinline vboolf16 test(const vboolf16& m, const vuint16& a, const vuint16& b) { - return _mm512_mask_test_epi32_mask(m,a,b); - } - - __forceinline vboolf16 test(const vuint16& a, const vuint16& b) { - return _mm512_test_epi32_mask(a,b); - } - - //////////////////////////////////////////////////////////////////////////////// - // Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - template<int i> - __forceinline vuint16 shuffle(const vuint16& v) { - return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vuint16 shuffle(const vuint16& v) { - return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template<int i> - __forceinline vuint16 shuffle4(const vuint16& v) { - return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v) ,_MM_SHUFFLE(i, i, i, i))); - } - - template<int 
i0, int i1, int i2, int i3> - __forceinline vuint16 shuffle4(const vuint16& v) { - return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template<int i> - __forceinline vuint16 align_shift_right(const vuint16& a, const vuint16& b) { - return _mm512_alignr_epi32(a, b, i); - }; - - __forceinline unsigned int toScalar(const vuint16& v) { - return _mm_cvtsi128_si32(_mm512_castsi512_si128(v)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint16 vreduce_min2(vuint16 x) { return min(x, shuffle<1,0,3,2>(x)); } - __forceinline vuint16 vreduce_min4(vuint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } - __forceinline vuint16 vreduce_min8(vuint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } - __forceinline vuint16 vreduce_min (vuint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } - - __forceinline vuint16 vreduce_max2(vuint16 x) { return max(x, shuffle<1,0,3,2>(x)); } - __forceinline vuint16 vreduce_max4(vuint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } - __forceinline vuint16 vreduce_max8(vuint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } - __forceinline vuint16 vreduce_max (vuint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } - - __forceinline vuint16 vreduce_and2(vuint16 x) { return x & shuffle<1,0,3,2>(x); } - __forceinline vuint16 vreduce_and4(vuint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } - __forceinline vuint16 vreduce_and8(vuint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); } - __forceinline vuint16 vreduce_and (vuint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); } - - __forceinline vuint16 vreduce_or2(vuint16 x) { return x | shuffle<1,0,3,2>(x); } - 
__forceinline vuint16 vreduce_or4(vuint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } - __forceinline vuint16 vreduce_or8(vuint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); } - __forceinline vuint16 vreduce_or (vuint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); } - - __forceinline vuint16 vreduce_add2(vuint16 x) { return x + shuffle<1,0,3,2>(x); } - __forceinline vuint16 vreduce_add4(vuint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } - __forceinline vuint16 vreduce_add8(vuint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } - __forceinline vuint16 vreduce_add (vuint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } - - __forceinline unsigned int reduce_min(const vuint16& v) { return toScalar(vreduce_min(v)); } - __forceinline unsigned int reduce_max(const vuint16& v) { return toScalar(vreduce_max(v)); } - __forceinline unsigned int reduce_and(const vuint16& v) { return toScalar(vreduce_and(v)); } - __forceinline unsigned int reduce_or (const vuint16& v) { return toScalar(vreduce_or (v)); } - __forceinline unsigned int reduce_add(const vuint16& v) { return toScalar(vreduce_add(v)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Memory load and store operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint16 permute(vuint16 v, vuint16 index) { - return _mm512_permutexvar_epi32(index,v); - } - - __forceinline vuint16 reverse(const vuint16& a) { - return permute(a,vuint16(reverse_step)); - } - - __forceinline vuint16 prefix_sum(const vuint16& a) - { - const vuint16 z(zero); - vuint16 v = a; - v = v + align_shift_right<16-1>(v,z); - v = v + align_shift_right<16-2>(v,z); - v = v + align_shift_right<16-4>(v,z); - v = v + align_shift_right<16-8>(v,z); - return v; - } - - __forceinline vuint16 reverse_prefix_sum(const vuint16& a) - { - const vuint16 z(zero); - vuint16 v = a; - v = 
v + align_shift_right<1>(z,v); - v = v + align_shift_right<2>(z,v); - v = v + align_shift_right<4>(z,v); - v = v + align_shift_right<8>(z,v); - return v; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vuint16& v) - { - cout << "<" << v[0]; - for (int i=1; i<16; i++) cout << ", " << v[i]; - cout << ">"; - return cout; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vuint4_sse2.h b/thirdparty/embree-aarch64/common/simd/vuint4_sse2.h deleted file mode 100644 index a3f393ebf2..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vuint4_sse2.h +++ /dev/null @@ -1,499 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../math/math.h" - -namespace embree -{ - /* 4-wide SSE integer type */ - template<> - struct vuint<4> - { - ALIGNED_STRUCT_(16); - - typedef vboolf4 Bool; - typedef vuint4 Int; - typedef vfloat4 Float; - - enum { size = 4 }; // number of SIMD elements - union { __m128i v; unsigned int i[4]; }; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint() {} - __forceinline vuint(const vuint4& a) { v = a.v; } - __forceinline vuint4& operator =(const vuint4& a) { v = a.v; return *this; } - - __forceinline vuint(const __m128i a) : v(a) {} - __forceinline operator const __m128i&() const { return v; } - __forceinline operator __m128i&() { return v; } - - - __forceinline vuint(unsigned int a) : v(_mm_set1_epi32(a)) {} - __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm_set_epi32(d, c, b, a)) {} - -#if defined(__AVX512VL__) - __forceinline 
explicit vuint(__m128 a) : v(_mm_cvtps_epu32(a)) {} -#endif - -#if defined(__AVX512VL__) - __forceinline explicit vuint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} -#else - __forceinline explicit vuint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {} -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint(ZeroTy) : v(_mm_setzero_si128()) {} - __forceinline vuint(OneTy) : v(_mm_set1_epi32(1)) {} - __forceinline vuint(PosInfTy) : v(_mm_set1_epi32(unsigned(pos_inf))) {} - __forceinline vuint(StepTy) : v(_mm_set_epi32(3, 2, 1, 0)) {} - __forceinline vuint(TrueTy) { v = _mm_cmpeq_epi32(v,v); } - __forceinline vuint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline vuint4 load (const void* a) { return _mm_load_si128((__m128i*)a); } - static __forceinline vuint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); } - - static __forceinline void store (void* ptr, const vuint4& v) { _mm_store_si128((__m128i*)ptr,v); } - static __forceinline void storeu(void* ptr, const vuint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } - -#if defined(__AVX512VL__) - static __forceinline vuint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); } - static __forceinline vuint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); } - - static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_store_epi32 (ptr,mask,v); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); } -#elif defined(__AVX__) - 
static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } - static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } - - static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } -#else - static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); } - static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); } - - static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { store (ptr,select(mask,i,load (ptr))); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } -#endif - -#if defined(__aarch64__) - static __forceinline vuint4 load(const uint8_t* ptr) { - return _mm_load4epu8_epi32(((__m128i*)ptr)); - } - static __forceinline vuint4 loadu(const uint8_t* ptr) { - return _mm_load4epu8_epi32(((__m128i*)ptr)); - } -#elif defined(__SSE4_1__) - static __forceinline vuint4 load(const uint8_t* ptr) { - return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); - } - - static __forceinline vuint4 loadu(const uint8_t* ptr) { - return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); - } - -#endif - - static __forceinline vuint4 load(const unsigned short* ptr) { -#if defined(__aarch64__) - return _mm_load4epu16_epi32(((__m128i*)ptr)); -#elif defined (__SSE4_1__) - return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); -#else - return vuint4(ptr[0],ptr[1],ptr[2],ptr[3]); -#endif - } - - static __forceinline void store_uint8(uint8_t* ptr, const vuint4& 
v) { -#if defined(__aarch64__) - uint32x4_t x = uint32x4_t(v.v); - uint16x4_t y = vqmovn_u32(x); - uint8x8_t z = vqmovn_u16(vcombine_u16(y, y)); - vst1_lane_u32((uint32_t *)ptr, uint32x2_t(z), 0); -#elif defined(__SSE4_1__) - __m128i x = v; - x = _mm_packus_epi32(x, x); - x = _mm_packus_epi16(x, x); - *(unsigned*)ptr = _mm_cvtsi128_si32(x); -#else - for (size_t i=0;i<4;i++) - ptr[i] = (uint8_t)v[i]; -#endif - } - - static __forceinline void store_uint8(unsigned short* ptr, const vuint4& v) { -#if defined(__aarch64__) - uint32x4_t x = (uint32x4_t)v.v; - uint16x4_t y = vqmovn_u32(x); - vst1_u16(ptr, y); -#else - for (size_t i=0;i<4;i++) - ptr[i] = (unsigned short)v[i]; -#endif - } - - static __forceinline vuint4 load_nt(void* ptr) { -#if (defined(__aarch64__)) || defined(__SSE4_1__) - return _mm_stream_load_si128((__m128i*)ptr); -#else - return _mm_load_si128((__m128i*)ptr); -#endif - } - - static __forceinline void store_nt(void* ptr, const vuint4& v) { -#if !defined(__aarch64__) && defined(__SSE4_1__) - _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); -#else - _mm_store_si128((__m128i*)ptr,v); -#endif - } - - template<int scale = 4> - static __forceinline vuint4 gather(const unsigned int* ptr, const vint4& index) { -#if defined(__AVX2__) && !defined(__aarch64__) - return _mm_i32gather_epi32((const int*)ptr, index, scale); -#else - return vuint4( - *(unsigned int*)(((int8_t*)ptr)+scale*index[0]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[1]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[2]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[3])); -#endif - } - - template<int scale = 4> - static __forceinline vuint4 gather(const vboolf4& mask, const unsigned int* ptr, const vint4& index) { - vuint4 r = zero; -#if defined(__AVX512VL__) - return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); -#elif defined(__AVX2__) && !defined(__aarch64__) - return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale); -#else - if (likely(mask[0])) r[0] = 
*(unsigned int*)(((int8_t*)ptr)+scale*index[0]); - if (likely(mask[1])) r[1] = *(unsigned int*)(((int8_t*)ptr)+scale*index[1]); - if (likely(mask[2])) r[2] = *(unsigned int*)(((int8_t*)ptr)+scale*index[2]); - if (likely(mask[3])) r[3] = *(unsigned int*)(((int8_t*)ptr)+scale*index[3]); - return r; -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const unsigned int& operator [](size_t index) const { assert(index < 4); return i[index]; } - __forceinline unsigned int& operator [](size_t index) { assert(index < 4); return i[index]; } - - friend __forceinline vuint4 select(const vboolf4& m, const vuint4& t, const vuint4& f) { -#if defined(__AVX512VL__) - return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t); -#elif defined(__SSE4_1__) - return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); -#else - return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); -#endif - } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboolf4 asBool(const vuint4& a) { return _mm_movepi32_mask(a); } -#else - __forceinline vboolf4 asBool(const vuint4& a) { return _mm_castsi128_ps(a); } -#endif - - __forceinline vuint4 operator +(const vuint4& a) { return a; } - __forceinline vuint4 operator -(const vuint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint4 operator +(const vuint4& a, const vuint4& b) { return _mm_add_epi32(a, b); } - __forceinline vuint4 operator +(const vuint4& a, 
unsigned int b) { return a + vuint4(b); } - __forceinline vuint4 operator +(unsigned int a, const vuint4& b) { return vuint4(a) + b; } - - __forceinline vuint4 operator -(const vuint4& a, const vuint4& b) { return _mm_sub_epi32(a, b); } - __forceinline vuint4 operator -(const vuint4& a, unsigned int b) { return a - vuint4(b); } - __forceinline vuint4 operator -(unsigned int a, const vuint4& b) { return vuint4(a) - b; } - -//#if defined(__SSE4_1__) -// __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return _mm_mullo_epu32(a, b); } -//#else -// __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return vuint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } -//#endif -// __forceinline vuint4 operator *(const vuint4& a, unsigned int b) { return a * vuint4(b); } -// __forceinline vuint4 operator *(unsigned int a, const vuint4& b) { return vuint4(a) * b; } - - __forceinline vuint4 operator &(const vuint4& a, const vuint4& b) { return _mm_and_si128(a, b); } - __forceinline vuint4 operator &(const vuint4& a, unsigned int b) { return a & vuint4(b); } - __forceinline vuint4 operator &(unsigned int a, const vuint4& b) { return vuint4(a) & b; } - - __forceinline vuint4 operator |(const vuint4& a, const vuint4& b) { return _mm_or_si128(a, b); } - __forceinline vuint4 operator |(const vuint4& a, unsigned int b) { return a | vuint4(b); } - __forceinline vuint4 operator |(unsigned int a, const vuint4& b) { return vuint4(a) | b; } - - __forceinline vuint4 operator ^(const vuint4& a, const vuint4& b) { return _mm_xor_si128(a, b); } - __forceinline vuint4 operator ^(const vuint4& a, unsigned int b) { return a ^ vuint4(b); } - __forceinline vuint4 operator ^(unsigned int a, const vuint4& b) { return vuint4(a) ^ b; } - - __forceinline vuint4 operator <<(const vuint4& a, unsigned int n) { return _mm_slli_epi32(a, n); } - __forceinline vuint4 operator >>(const vuint4& a, unsigned int n) { return _mm_srli_epi32(a, n); } - - __forceinline vuint4 sll (const 
vuint4& a, unsigned int b) { return _mm_slli_epi32(a, b); } - __forceinline vuint4 sra (const vuint4& a, unsigned int b) { return _mm_srai_epi32(a, b); } - __forceinline vuint4 srl (const vuint4& a, unsigned int b) { return _mm_srli_epi32(a, b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint4& operator +=(vuint4& a, const vuint4& b) { return a = a + b; } - __forceinline vuint4& operator +=(vuint4& a, unsigned int b) { return a = a + b; } - - __forceinline vuint4& operator -=(vuint4& a, const vuint4& b) { return a = a - b; } - __forceinline vuint4& operator -=(vuint4& a, unsigned int b) { return a = a - b; } - -//#if defined(__SSE4_1__) -// __forceinline vuint4& operator *=(vuint4& a, const vuint4& b) { return a = a * b; } -// __forceinline vuint4& operator *=(vuint4& a, unsigned int b) { return a = a * b; } -//#endif - - __forceinline vuint4& operator &=(vuint4& a, const vuint4& b) { return a = a & b; } - __forceinline vuint4& operator &=(vuint4& a, unsigned int b) { return a = a & b; } - - __forceinline vuint4& operator |=(vuint4& a, const vuint4& b) { return a = a | b; } - __forceinline vuint4& operator |=(vuint4& a, unsigned int b) { return a = a | b; } - - __forceinline vuint4& operator <<=(vuint4& a, unsigned int b) { return a = a << b; } - __forceinline vuint4& operator >>=(vuint4& a, unsigned int b) { return a = a >> b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } - 
//__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } - //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } - //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } - //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } -#else - __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } - __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return !(a == b); } - //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmplt_epu32(a, b)); } - //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return !(a < b); } - //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epu32(a, b)); } - //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return !(a > b); } -#endif - - __forceinline vboolf4 operator ==(const vuint4& a, unsigned int b) { return a == vuint4(b); } - __forceinline vboolf4 operator ==(unsigned int a, const vuint4& b) { return vuint4(a) == b; } - - __forceinline vboolf4 operator !=(const vuint4& a, unsigned int b) { return a != vuint4(b); } - __forceinline vboolf4 operator !=(unsigned int a, const vuint4& b) { return vuint4(a) != b; } - - //__forceinline vboolf4 operator < (const vuint4& a, unsigned int b) { return a < vuint4(b); } - //__forceinline vboolf4 operator < (unsigned int a, const vuint4& b) { return vuint4(a) < b; } - - //__forceinline vboolf4 operator >=(const vuint4& a, unsigned int b) { return a >= vuint4(b); } - //__forceinline vboolf4 operator >=(unsigned int a, const vuint4& b) { return vuint4(a) >= b; } - - //__forceinline vboolf4 operator > (const vuint4& a, unsigned 
int b) { return a > vuint4(b); } - //__forceinline vboolf4 operator > (unsigned int a, const vuint4& b) { return vuint4(a) > b; } - - //__forceinline vboolf4 operator <=(const vuint4& a, unsigned int b) { return a <= vuint4(b); } - //__forceinline vboolf4 operator <=(unsigned int a, const vuint4& b) { return vuint4(a) <= b; } - - __forceinline vboolf4 eq(const vuint4& a, const vuint4& b) { return a == b; } - __forceinline vboolf4 ne(const vuint4& a, const vuint4& b) { return a != b; } - //__forceinline vboolf4 lt(const vuint4& a, const vuint4& b) { return a < b; } - //__forceinline vboolf4 ge(const vuint4& a, const vuint4& b) { return a >= b; } - //__forceinline vboolf4 gt(const vuint4& a, const vuint4& b) { return a > b; } - //__forceinline vboolf4 le(const vuint4& a, const vuint4& b) { return a <= b; } - -#if defined(__AVX512VL__) - __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); } - __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); } - //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); } - //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); } - //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); } - //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); } -#else - __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a == b); } - __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a != b); } - //__forceinline vboolf4 lt(const vboolf4& mask, 
const vuint4& a, const vuint4& b) { return mask & (a < b); } - //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a >= b); } - //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a > b); } - //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a <= b); } -#endif - - template<int mask> - __forceinline vuint4 select(const vuint4& t, const vuint4& f) { -#if defined(__SSE4_1__) - return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); -#else - return select(vboolf4(mask), t, f); -#endif - } - -/*#if defined(__SSE4_1__) - __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return _mm_min_epu32(a, b); } - __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return _mm_max_epu32(a, b); } - -#else - __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return select(a < b,a,b); } - __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return select(a < b,b,a); } -#endif - - __forceinline vuint4 min(const vuint4& a, unsigned int b) { return min(a,vuint4(b)); } - __forceinline vuint4 min(unsigned int a, const vuint4& b) { return min(vuint4(a),b); } - __forceinline vuint4 max(const vuint4& a, unsigned int b) { return max(a,vuint4(b)); } - __forceinline vuint4 max(unsigned int a, const vuint4& b) { return max(vuint4(a),b); }*/ - - //////////////////////////////////////////////////////////////////////////////// - // Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } - __forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } - -#if defined(__aarch64__) - 
template<int i0, int i1, int i2, int i3> - __forceinline vuint4 shuffle(const vuint4& v) { - return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); - } - template<int i0, int i1, int i2, int i3> - __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) { - return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); - } -#else - template<int i0, int i1, int i2, int i3> - __forceinline vuint4 shuffle(const vuint4& v) { - return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); - } - template<int i0, int i1, int i2, int i3> - __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) { - return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); - } -#endif -#if defined(__SSE3__) - template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } - template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } - template<> __forceinline vuint4 shuffle<0, 1, 0, 1>(const vuint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); } -#endif - - template<int i> - __forceinline vuint4 shuffle(const vuint4& v) { - return shuffle<i,i,i,i>(v); - } - -#if defined(__aarch64__) - template<int src> __forceinline unsigned int extract(const vuint4& b); - template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b); -#elif defined(__SSE4_1__) - template<int src> __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b, src); } - template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a, b, dst); } -#else - template<int src> __forceinline unsigned int extract(const vuint4& b) { return b[src&3]; } - template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) 
{ vuint4 c = a; c[dst&3] = b; return c; } -#endif - -#if defined(__aarch64__) - template<> __forceinline unsigned int extract<0>(const vuint4& b) { - return b[0]; - } - template<> __forceinline unsigned int extract<1>(const vuint4& b) { - return b[1]; - } - template<> __forceinline unsigned int extract<2>(const vuint4& b) { - return b[2]; - } - template<> __forceinline unsigned int extract<3>(const vuint4& b) { - return b[3]; - } - - template<> __forceinline vuint4 insert<0>(const vuint4& a, unsigned b){ - vuint4 c = a; - c[0] = b; - return c; - } - template<> __forceinline vuint4 insert<1>(const vuint4& a, unsigned b){ - vuint4 c = a; - c[1] = b; - return c; - } - template<> __forceinline vuint4 insert<2>(const vuint4& a, unsigned b){ - vuint4 c = a; - c[2] = b; - return c; - } - template<> __forceinline vuint4 insert<3>(const vuint4& a, unsigned b){ - vuint4 c = a; - c[3] = b; - return c; - } - - __forceinline unsigned int toScalar(const vuint4& v) { - return v[0]; - } -#else - template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b); } - - __forceinline unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - -#if 0 -#if defined(__SSE4_1__) - - __forceinline vuint4 vreduce_min(const vuint4& v) { vuint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } - __forceinline vuint4 vreduce_max(const vuint4& v) { vuint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } - __forceinline vuint4 vreduce_add(const vuint4& v) { vuint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } - - __forceinline unsigned int reduce_min(const vuint4& v) { return toScalar(vreduce_min(v)); } - __forceinline unsigned int reduce_max(const vuint4& v) { return toScalar(vreduce_max(v)); } - 
__forceinline unsigned int reduce_add(const vuint4& v) { return toScalar(vreduce_add(v)); } - - __forceinline size_t select_min(const vuint4& v) { return bsf(movemask(v == vreduce_min(v))); } - __forceinline size_t select_max(const vuint4& v) { return bsf(movemask(v == vreduce_max(v))); } - - //__forceinline size_t select_min(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } - //__forceinline size_t select_max(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } - -#else - - __forceinline unsigned int reduce_min(const vuint4& v) { return min(v[0],v[1],v[2],v[3]); } - __forceinline unsigned int reduce_max(const vuint4& v) { return max(v[0],v[1],v[2],v[3]); } - __forceinline unsigned int reduce_add(const vuint4& v) { return v[0]+v[1]+v[2]+v[3]; } - -#endif -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vuint4& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; - } -} - diff --git a/thirdparty/embree-aarch64/common/simd/vuint8_avx.h b/thirdparty/embree-aarch64/common/simd/vuint8_avx.h deleted file mode 100644 index d4e86ae92d..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vuint8_avx.h +++ /dev/null @@ -1,379 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX integer type */ - template<> - struct vuint<8> - { - ALIGNED_STRUCT_(32); - - typedef vboolf8 Bool; - typedef vuint8 Int; - typedef vfloat8 Float; - - enum { size = 8 }; // number of SIMD elements - union { // data - __m256i v; - struct { __m128i vl,vh; }; - unsigned 
int i[8]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint() {} - __forceinline vuint(const vuint8& a) { v = a.v; } - __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; } - - __forceinline vuint(__m256i a) : v(a) {} - __forceinline operator const __m256i&() const { return v; } - __forceinline operator __m256i&() { return v; } - - __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} - __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} - __forceinline vuint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {} - - __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} - __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {} - __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} - __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} - __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int vh) : v(_mm256_set_epi32(vh, g, f, e, d, c, b, a)) {} - - __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint(ZeroTy) : v(_mm256_setzero_si256()) {} - __forceinline vuint(OneTy) : v(_mm256_set1_epi32(1)) {} - __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(0xFFFFFFFF)) {} - __forceinline vuint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 
0)) {} - __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline vuint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); } - static __forceinline vuint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); } - - static __forceinline vuint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } - static __forceinline vuint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } - - static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } - static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } - -#if !defined(__aarch64__) - static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } -#else - static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } -#endif - static __forceinline void store_nt(void* ptr, const vuint8& v) { - _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); - } - - static __forceinline vuint8 load(const uint8_t* ptr) { - vuint4 il = vuint4::load(ptr+0); - vuint4 ih = vuint4::load(ptr+4); - return vuint8(il,ih); - } - - static 
__forceinline vuint8 loadu(const uint8_t* ptr) { - vuint4 il = vuint4::loadu(ptr+0); - vuint4 ih = vuint4::loadu(ptr+4); - return vuint8(il,ih); - } - - static __forceinline vuint8 load(const unsigned short* ptr) { - vuint4 il = vuint4::load(ptr+0); - vuint4 ih = vuint4::load(ptr+4); - return vuint8(il,ih); - } - - static __forceinline vuint8 loadu(const unsigned short* ptr) { - vuint4 il = vuint4::loadu(ptr+0); - vuint4 ih = vuint4::loadu(ptr+4); - return vuint8(il,ih); - } - - static __forceinline void store(uint8_t* ptr, const vuint8& i) { - vuint4 il(i.vl); - vuint4 ih(i.vh); - vuint4::store(ptr + 0,il); - vuint4::store(ptr + 4,ih); - } - - static __forceinline void store(unsigned short* ptr, const vuint8& v) { - for (size_t i=0;i<8;i++) - ptr[i] = (unsigned short)v[i]; - } - - template<int scale = 4> - static __forceinline vuint8 gather(const unsigned int* ptr, const vint8& index) { - return vuint8( - *(unsigned int*)(((int8_t*)ptr)+scale*index[0]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[1]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[2]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[3]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[4]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[5]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[6]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[7])); - } - - template<int scale = 4> - static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int* ptr, const vint8& index) { - vuint8 r = zero; - if (likely(mask[0])) r[0] = *(unsigned int*)(((int8_t*)ptr)+scale*index[0]); - if (likely(mask[1])) r[1] = *(unsigned int*)(((int8_t*)ptr)+scale*index[1]); - if (likely(mask[2])) r[2] = *(unsigned int*)(((int8_t*)ptr)+scale*index[2]); - if (likely(mask[3])) r[3] = *(unsigned int*)(((int8_t*)ptr)+scale*index[3]); - if (likely(mask[4])) r[4] = *(unsigned int*)(((int8_t*)ptr)+scale*index[4]); - if (likely(mask[5])) r[5] = *(unsigned int*)(((int8_t*)ptr)+scale*index[5]); - if (likely(mask[6])) r[6] = *(unsigned 
int*)(((int8_t*)ptr)+scale*index[6]); - if (likely(mask[7])) r[7] = *(unsigned int*)(((int8_t*)ptr)+scale*index[7]); - return r; - } - - template<int scale = 4> - static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v) - { - *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; - *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; - *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; - *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; - *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; - *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; - *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; - *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; - } - - template<int scale = 4> - static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v) - { - if (likely(mask[0])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; - if (likely(mask[1])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; - if (likely(mask[2])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; - if (likely(mask[3])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; - if (likely(mask[4])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; - if (likely(mask[5])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; - if (likely(mask[6])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; - if (likely(mask[7])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; - } - - - static __forceinline vuint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; } - __forceinline unsigned int& operator [](size_t index) { assert(index < 8); return i[index]; } - }; - - 
//////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); } - - __forceinline vuint8 operator +(const vuint8& a) { return a; } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return vuint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); } - __forceinline vuint8 operator +(const vuint8& a, unsigned int b) { return a + vuint8(b); } - __forceinline vuint8 operator +(unsigned int a, const vuint8& b) { return vuint8(a) + b; } - - __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return vuint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); } - __forceinline vuint8 operator -(const vuint8& a, unsigned int b) { return a - vuint8(b); } - __forceinline vuint8 operator -(unsigned int a, const vuint8& b) { return vuint8(a) - b; } - - //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return vuint8(_mm_mullo_epu32(a.vl, b.vl), _mm_mullo_epu32(a.vh, b.vh)); } - //__forceinline vuint8 operator *(const vuint8& a, unsigned int b) { return a * vuint8(b); } - //__forceinline vuint8 operator *(unsigned int a, const vuint8& b) { return vuint8(a) * b; } - - __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - __forceinline vuint8 operator &(const vuint8& a, unsigned int b) { return a & vuint8(b); } - __forceinline vuint8 operator &(unsigned int a, const vuint8& b) { return vuint8(a) & b; } - - __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_or_ps 
(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - __forceinline vuint8 operator |(const vuint8& a, unsigned int b) { return a | vuint8(b); } - __forceinline vuint8 operator |(unsigned int a, const vuint8& b) { return vuint8(a) | b; } - - __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - __forceinline vuint8 operator ^(const vuint8& a, unsigned int b) { return a ^ vuint8(b); } - __forceinline vuint8 operator ^(unsigned int a, const vuint8& b) { return vuint8(a) ^ b; } - - __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return vuint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); } - __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return vuint8(_mm_srai_epi32(a.vl, n), _mm_srli_epi32(a.vh, n)); } - - __forceinline vuint8 sll (const vuint8& a, unsigned int b) { return vuint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); } - __forceinline vuint8 sra (const vuint8& a, unsigned int b) { return vuint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); } - __forceinline vuint8 srl (const vuint8& a, unsigned int b) { return vuint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); } - - __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return vuint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); } - __forceinline vuint8 min(const vuint8& a, unsigned int b) { return min(a,vuint8(b)); } - __forceinline vuint8 min(unsigned int a, const vuint8& b) { return min(vuint8(a),b); } - - __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return vuint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); } - __forceinline vuint8 max(const vuint8& a, unsigned int b) { return max(a,vuint8(b)); } - __forceinline vuint8 max(unsigned int a, const vuint8& b) { return max(vuint8(a),b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - 
//////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; } - __forceinline vuint8& operator +=(vuint8& a, unsigned int b) { return a = a + b; } - - __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; } - __forceinline vuint8& operator -=(vuint8& a, unsigned int b) { return a = a - b; } - - //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; } - //__forceinline vuint8& operator *=(vuint8& a, unsigned int b) { return a = a * b; } - - __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; } - __forceinline vuint8& operator &=(vuint8& a, unsigned int b) { return a = a & b; } - - __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; } - __forceinline vuint8& operator |=(vuint8& a, unsigned int b) { return a = a | b; } - - __forceinline vuint8& operator <<=(vuint8& a, unsigned int b) { return a = a << b; } - __forceinline vuint8& operator >>=(vuint8& a, unsigned int b) { return a = a >> b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)), - _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); } - __forceinline vboolf8 operator ==(const vuint8& a, unsigned int b) { return a == vuint8(b); } - __forceinline vboolf8 operator ==(unsigned int a, const vuint8& b) { return vuint8(a) == b; } - - __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); } - __forceinline vboolf8 operator !=(const vuint8& a, unsigned int b) { return a != vuint8(b); } - __forceinline vboolf8 operator !=(unsigned int a, const vuint8& b) { return vuint8(a) != b; } - 
- //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epu32 (a.vl, b.vl)), - // _mm_castsi128_ps(_mm_cmplt_epu32 (a.vh, b.vh))); } - //__forceinline vboolf8 operator < (const vuint8& a, unsigned int b) { return a < vuint8(b); } - //__forceinline vboolf8 operator < (unsigned int a, const vuint8& b) { return vuint8(a) < b; } - - //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a < b); } - //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int b) { return a >= vuint8(b); } - //__forceinline vboolf8 operator >=(unsigned int a, const vuint8& b) { return vuint8(a) >= b; } - - //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epu32 (a.vl, b.vl)), - // _mm_castsi128_ps(_mm_cmpgt_epu32 (a.vh, b.vh))); } - //__forceinline vboolf8 operator > (const vuint8& a, unsigned int b) { return a > vuint8(b); } - //__forceinline vboolf8 operator > (unsigned int a, const vuint8& b) { return vuint8(a) > b; } - - //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a > b); } - //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int b) { return a <= vuint8(b); } - //__forceinline vboolf8 operator <=(unsigned int a, const vuint8& b) { return vuint8(a) <= b; } - - __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; } - __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; } - - __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); } - __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); } - - __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { - return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); - } - - __forceinline vuint8 notand(const vboolf8& m, const vuint8& f) { 
- return _mm256_castps_si256(_mm256_andnot_ps(m, _mm256_castsi256_ps(f))); - } - - - //////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - - template<int i> - __forceinline vuint8 shuffle(const vuint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); - } - - template<int i0, int i1> - __forceinline vuint8 shuffle4(const vuint8& v) { - return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); - } - - template<int i0, int i1> - __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) { - return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vuint8 shuffle(const vuint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) { - return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return 
_mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } - - __forceinline vuint8 broadcast(const unsigned int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } - template<int i> __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); } - template<int i> __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); } - template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); } - - __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } - - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); } - //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } - //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } - - //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); } - //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } - //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } - - __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + shuffle<1,0,3,2>(v); } - __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } - __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } - - //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); } - //__forceinline int 
reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); } - __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); } - - //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); } - //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); } - - //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } - //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vuint8_avx2.h b/thirdparty/embree-aarch64/common/simd/vuint8_avx2.h deleted file mode 100644 index b2a965448d..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vuint8_avx2.h +++ /dev/null @@ -1,439 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX integer type */ - template<> - struct vuint<8> - { - ALIGNED_STRUCT_(32); - - typedef vboolf8 Bool; - typedef vuint8 Int; - typedef vfloat8 Float; - - enum { size = 8 }; // number of SIMD elements - union { // data - __m256i v; - unsigned int i[8]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - 
//////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint() {} - __forceinline vuint(const vuint8& a) { v = a.v; } - __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; } - - __forceinline vuint(__m256i a) : v(a) {} - __forceinline operator const __m256i&() const { return v; } - __forceinline operator __m256i&() { return v; } - - __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} - __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} - __forceinline vuint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} - - __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} - __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {} - __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} - __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} - __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} - - __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} - -#if defined(__AVX512VL__) - __forceinline explicit vuint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {} -#else - __forceinline explicit vuint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {} -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint(ZeroTy) : v(_mm256_setzero_si256()) {} - __forceinline vuint(OneTy) : v(_mm256_set1_epi32(1)) {} - __forceinline vuint(PosInfTy) : 
v(_mm256_set1_epi32(pos_inf)) {} - __forceinline vuint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {} - __forceinline vuint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} - __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline vuint8 load(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } - static __forceinline vuint8 loadu(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } - static __forceinline vuint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } - static __forceinline vuint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } - - static __forceinline vuint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); } - static __forceinline vuint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); } - - static __forceinline void store (void* ptr, const vuint8& v) { _mm256_store_si256((__m256i*)ptr,v); } - static __forceinline void storeu(void* ptr, const vuint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); } - -#if defined(__AVX512VL__) - - static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &v) { - return _mm256_mask_compress_epi32(v, mask, v); - } - static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &a, const vuint8& b) { - return _mm256_mask_compress_epi32(a, mask, b); - } - - static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); } - static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); } - - static __forceinline void store (const vboolf8& 
mask, void* ptr, const vuint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); } -#else - static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } - static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } - - static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } -#endif - - static __forceinline vuint8 load_nt(void* ptr) { - return _mm256_stream_load_si256((__m256i*)ptr); - } - - static __forceinline void store_nt(void* ptr, const vuint8& v) { - _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); - } - - static __forceinline void store(uint8_t* ptr, const vuint8& i) - { - for (size_t j=0; j<8; j++) - ptr[j] = i[j]; - } - - static __forceinline void store(unsigned short* ptr, const vuint8& v) { - for (size_t i=0;i<8;i++) - ptr[i] = (unsigned short)v[i]; - } - - template<int scale = 4> - static __forceinline vuint8 gather(const unsigned int *const ptr, const vint8& index) { - return _mm256_i32gather_epi32((const int*) ptr, index, scale); - } - - template<int scale = 4> - static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int *const ptr, const vint8& index) { - vuint8 r = zero; -#if defined(__AVX512VL__) - return _mm256_mmask_i32gather_epi32(r, mask, index, (const int*) ptr, scale); -#else - return _mm256_mask_i32gather_epi32(r, (const int*) ptr, index, mask, scale); -#endif - } - - template<int scale = 4> - static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v) - { -#if defined(__AVX512VL__) - _mm256_i32scatter_epi32((int*)ptr, ofs, v, 
scale); -#else - *(unsigned int*)(((int8_t*)ptr) + scale * ofs[0]) = v[0]; - *(unsigned int*)(((int8_t*)ptr) + scale * ofs[1]) = v[1]; - *(unsigned int*)(((int8_t*)ptr) + scale * ofs[2]) = v[2]; - *(unsigned int*)(((int8_t*)ptr) + scale * ofs[3]) = v[3]; - *(unsigned int*)(((int8_t*)ptr) + scale * ofs[4]) = v[4]; - *(unsigned int*)(((int8_t*)ptr) + scale * ofs[5]) = v[5]; - *(unsigned int*)(((int8_t*)ptr) + scale * ofs[6]) = v[6]; - *(unsigned int*)(((int8_t*)ptr) + scale * ofs[7]) = v[7]; -#endif - } - - template<int scale = 4> - static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v) - { -#if defined(__AVX512VL__) - _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale); -#else - if (likely(mask[0])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; - if (likely(mask[1])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; - if (likely(mask[2])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; - if (likely(mask[3])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; - if (likely(mask[4])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; - if (likely(mask[5])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; - if (likely(mask[6])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; - if (likely(mask[7])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; -#endif - } - - static __forceinline vuint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; } - __forceinline unsigned int& operator [](size_t index) { assert(index < 8); return i[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - 
//////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_movepi32_mask(a); } -#else - __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); } -#endif - - __forceinline vuint8 operator +(const vuint8& a) { return a; } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return _mm256_add_epi32(a, b); } - __forceinline vuint8 operator +(const vuint8& a, unsigned int b) { return a + vuint8(b); } - __forceinline vuint8 operator +(unsigned int a, const vuint8& b) { return vuint8(a) + b; } - - __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return _mm256_sub_epi32(a, b); } - __forceinline vuint8 operator -(const vuint8& a, unsigned int b) { return a - vuint8(b); } - __forceinline vuint8 operator -(unsigned int a, const vuint8& b) { return vuint8(a) - b; } - - //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return _mm256_mullo_epu32(a, b); } - //__forceinline vuint8 operator *(const vuint8& a, unsigned int b) { return a * vuint8(b); } - //__forceinline vuint8 operator *(unsigned int a, const vuint8& b) { return vuint8(a) * b; } - - __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_and_si256(a, b); } - __forceinline vuint8 operator &(const vuint8& a, unsigned int b) { return a & vuint8(b); } - __forceinline vuint8 operator &(unsigned int a, const vuint8& b) { return vuint8(a) & b; } - - __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_or_si256(a, b); } - __forceinline vuint8 operator |(const vuint8& a, unsigned int b) { return a | vuint8(b); } - __forceinline vuint8 operator |(unsigned int a, const vuint8& b) { return 
vuint8(a) | b; } - - __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_xor_si256(a, b); } - __forceinline vuint8 operator ^(const vuint8& a, unsigned int b) { return a ^ vuint8(b); } - __forceinline vuint8 operator ^(unsigned int a, const vuint8& b) { return vuint8(a) ^ b; } - - __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return _mm256_slli_epi32(a, n); } - __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return _mm256_srli_epi32(a, n); } - - __forceinline vuint8 operator <<(const vuint8& a, const vuint8& n) { return _mm256_sllv_epi32(a, n); } - __forceinline vuint8 operator >>(const vuint8& a, const vuint8& n) { return _mm256_srlv_epi32(a, n); } - - __forceinline vuint8 sll(const vuint8& a, unsigned int b) { return _mm256_slli_epi32(a, b); } - __forceinline vuint8 sra(const vuint8& a, unsigned int b) { return _mm256_srai_epi32(a, b); } - __forceinline vuint8 srl(const vuint8& a, unsigned int b) { return _mm256_srli_epi32(a, b); } - - __forceinline vuint8 sll(const vuint8& a, const vuint8& b) { return _mm256_sllv_epi32(a, b); } - __forceinline vuint8 sra(const vuint8& a, const vuint8& b) { return _mm256_srav_epi32(a, b); } - __forceinline vuint8 srl(const vuint8& a, const vuint8& b) { return _mm256_srlv_epi32(a, b); } - - __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return _mm256_min_epu32(a, b); } - __forceinline vuint8 min(const vuint8& a, unsigned int b) { return min(a,vuint8(b)); } - __forceinline vuint8 min(unsigned int a, const vuint8& b) { return min(vuint8(a),b); } - - __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return _mm256_max_epu32(a, b); } - __forceinline vuint8 max(const vuint8& a, unsigned int b) { return max(a,vuint8(b)); } - __forceinline vuint8 max(unsigned int a, const vuint8& b) { return max(vuint8(a),b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - 
//////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; } - __forceinline vuint8& operator +=(vuint8& a, unsigned int b) { return a = a + b; } - - __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; } - __forceinline vuint8& operator -=(vuint8& a, unsigned int b) { return a = a - b; } - - //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; } - //__forceinline vuint8& operator *=(vuint8& a, unsigned int b) { return a = a * b; } - - __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; } - __forceinline vuint8& operator &=(vuint8& a, unsigned int b) { return a = a & b; } - - __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; } - __forceinline vuint8& operator |=(vuint8& a, unsigned int b) { return a = a | b; } - - __forceinline vuint8& operator <<=(vuint8& a, const unsigned int b) { return a = a << b; } - __forceinline vuint8& operator >>=(vuint8& a, const unsigned int b) { return a = a >> b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } - 
__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } - - __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { - return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t); - } -#else - __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); } - __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); } - //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(b, a)); } - //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a < b); } - //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(a, b)); } - //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a > b); } - - __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { - return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); - } -#endif - - template<int mask> - __forceinline vuint8 select(const vuint8& t, const vuint8& f) { - return _mm256_blend_epi32(f, t, mask); - } - - __forceinline vboolf8 operator ==(const vuint8& a, unsigned int b) { return a == vuint8(b); } - __forceinline vboolf8 operator ==(unsigned int a, const vuint8& b) { return vuint8(a) == b; } - - __forceinline vboolf8 operator !=(const vuint8& a, unsigned int b) { return a != vuint8(b); } - __forceinline vboolf8 operator !=(unsigned int a, const vuint8& b) { return vuint8(a) != b; } - - //__forceinline vboolf8 operator < (const vuint8& a, unsigned int b) { return a < vuint8(b); } - //__forceinline vboolf8 operator < (unsigned int a, const vuint8& b) { return vuint8(a) < b; } - - //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int b) { return a >= vuint8(b); } - //__forceinline vboolf8 operator 
>=(unsigned int a, const vuint8& b) { return vuint8(a) >= b; } - - //__forceinline vboolf8 operator > (const vuint8& a, unsigned int b) { return a > vuint8(b); } - //__forceinline vboolf8 operator > (unsigned int a, const vuint8& b) { return vuint8(a) > b; } - - //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int b) { return a <= vuint8(b); } - //__forceinline vboolf8 operator <=(unsigned int a, const vuint8& b) { return vuint8(a) <= b; } - - __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; } - __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; } - //__forceinline vboolf8 lt(const vuint8& a, const vuint8& b) { return a < b; } - //__forceinline vboolf8 ge(const vuint8& a, const vuint8& b) { return a >= b; } - //__forceinline vboolf8 gt(const vuint8& a, const vuint8& b) { return a > b; } - //__forceinline vboolf8 le(const vuint8& a, const vuint8& b) { return a <= b; } - -#if defined(__AVX512VL__) - __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); } - __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); } - __forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); } - __forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); } - __forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); } - __forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); } -#else - __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); } - __forceinline 
vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); } - //__forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a < b); } - //__forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a >= b); } - //__forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a > b); } - //__forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a <= b); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_unpacklo_epi32(a, b); } - __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_unpackhi_epi32(a, b); } - - template<int i> - __forceinline vuint8 shuffle(const vuint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); - } - - template<int i0, int i1> - __forceinline vuint8 shuffle4(const vuint8& v) { - return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); - } - - template<int i0, int i1> - __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) { - return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vuint8 shuffle(const vuint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template<int i0, int i1, int i2, int i3> - __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) { - return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return 
_mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } - - __forceinline vuint8 broadcast(const unsigned int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } - - template<int i> __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); } - template<int i> __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); } - template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); } - - __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } - -#if !defined(__aarch64__) - - __forceinline vuint8 permute(const vuint8& v, const __m256i& index) { - return _mm256_permutevar8x32_epi32(v, index); - } - - __forceinline vuint8 shuffle(const vuint8& v, const __m256i& index) { - return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index)); - } - - template<int i> - __forceinline vuint8 align_shift_right(const vuint8& a, const vuint8& b) { -#if defined(__AVX512VL__) - return _mm256_alignr_epi32(a, b, i); -#else - return _mm256_alignr_epi8(a, b, 4*i); -#endif - } - -#endif - - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); } - //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } - //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = 
vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } - - //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); } - //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } - //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } - - __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + shuffle<1,0,3,2>(v); } - __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } - __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } - - //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); } - //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); } - __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); } - - //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); } - //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); } - - //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } - //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } - - __forceinline vuint8 assign(const vuint4& a) { return _mm256_castsi128_si256(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", 
" << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/sys/alloc.cpp b/thirdparty/embree-aarch64/common/sys/alloc.cpp deleted file mode 100644 index 12f143f131..0000000000 --- a/thirdparty/embree-aarch64/common/sys/alloc.cpp +++ /dev/null @@ -1,327 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "alloc.h" -#include "intrinsics.h" -#include "sysinfo.h" -#include "mutex.h" - -//////////////////////////////////////////////////////////////////////////////// -/// All Platforms -//////////////////////////////////////////////////////////////////////////////// - -namespace embree -{ - void* alignedMalloc(size_t size, size_t align) - { - if (size == 0) - return nullptr; - - assert((align & (align-1)) == 0); - void* ptr = _mm_malloc(size,align); - - if (size != 0 && ptr == nullptr) - // -- GODOT start -- - // throw std::bad_alloc(); - abort(); - // -- GODOT end -- - - return ptr; - } - - void alignedFree(void* ptr) - { - if (ptr) - _mm_free(ptr); - } - - static bool huge_pages_enabled = false; - static MutexSys os_init_mutex; - - __forceinline bool isHugePageCandidate(const size_t bytes) - { - if (!huge_pages_enabled) - return false; - - /* use huge pages only when memory overhead is low */ - const size_t hbytes = (bytes+PAGE_SIZE_2M-1) & ~size_t(PAGE_SIZE_2M-1); - return 66*(hbytes-bytes) < bytes; // at most 1.5% overhead - } -} - -//////////////////////////////////////////////////////////////////////////////// -/// Windows Platform -//////////////////////////////////////////////////////////////////////////////// - -#ifdef _WIN32 - -#define WIN32_LEAN_AND_MEAN -#include <windows.h> -#include <malloc.h> - -namespace embree -{ - bool win_enable_selockmemoryprivilege (bool verbose) - { - HANDLE hToken; - if (!OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY | TOKEN_ADJUST_PRIVILEGES, &hToken)) { - if (verbose) std::cout << "WARNING: OpenProcessToken failed 
while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl; - return false; - } - - TOKEN_PRIVILEGES tp; - tp.PrivilegeCount = 1; - tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - - if (!LookupPrivilegeValueW(nullptr, L"SeLockMemoryPrivilege", &tp.Privileges[0].Luid)) { - if (verbose) std::cout << "WARNING: LookupPrivilegeValue failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl; - return false; - } - - SetLastError(ERROR_SUCCESS); - if (!AdjustTokenPrivileges(hToken, FALSE, &tp, sizeof(tp), nullptr, 0)) { - if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed while trying to enable SeLockMemoryPrivilege" << std::endl; - return false; - } - - if (GetLastError() == ERROR_NOT_ALL_ASSIGNED) { - if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed to enable SeLockMemoryPrivilege: Add SeLockMemoryPrivilege for current user and run process in elevated mode (Run as administrator)." << std::endl; - return false; - } - - return true; - } - - bool os_init(bool hugepages, bool verbose) - { - Lock<MutexSys> lock(os_init_mutex); - - if (!hugepages) { - huge_pages_enabled = false; - return true; - } - - if (GetLargePageMinimum() != PAGE_SIZE_2M) { - huge_pages_enabled = false; - return false; - } - - huge_pages_enabled = true; - return true; - } - - void* os_malloc(size_t bytes, bool& hugepages) - { - if (bytes == 0) { - hugepages = false; - return nullptr; - } - - /* try direct huge page allocation first */ - if (isHugePageCandidate(bytes)) - { - int flags = MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES; - char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); - if (ptr != nullptr) { - hugepages = true; - return ptr; - } - } - - /* fall back to 4k pages */ - int flags = MEM_COMMIT | MEM_RESERVE; - char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); - // -- GODOT start -- - // if (ptr == nullptr) throw std::bad_alloc(); - if (ptr == nullptr) abort(); - // -- GODOT 
end -- - hugepages = false; - return ptr; - } - - size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) - { - if (hugepages) // decommitting huge pages seems not to work under Windows - return bytesOld; - - const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; - bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1); - bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1); - if (bytesNew >= bytesOld) - return bytesOld; - - if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) - // -- GODOT start -- - // throw std::bad_alloc(); - abort(); - // -- GODOT end -- - - return bytesNew; - } - - void os_free(void* ptr, size_t bytes, bool hugepages) - { - if (bytes == 0) - return; - - if (!VirtualFree(ptr,0,MEM_RELEASE)) - // -- GODOT start -- - // throw std::bad_alloc(); - abort(); - // -- GODOT end -- - } - - void os_advise(void *ptr, size_t bytes) - { - } -} - -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Unix Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__UNIX__) - -#include <sys/mman.h> -#include <errno.h> -#include <stdlib.h> -#include <string.h> -#include <sstream> - -#if defined(__MACOSX__) -#include <mach/vm_statistics.h> -#endif - -namespace embree -{ - bool os_init(bool hugepages, bool verbose) - { - Lock<MutexSys> lock(os_init_mutex); - - if (!hugepages) { - huge_pages_enabled = false; - return true; - } - -#if defined(__LINUX__) - - int hugepagesize = 0; - - std::ifstream file; - file.open("/proc/meminfo",std::ios::in); - if (!file.is_open()) { - if (verbose) std::cout << "WARNING: Could not open /proc/meminfo. Huge page support cannot get enabled!" 
<< std::endl; - huge_pages_enabled = false; - return false; - } - - std::string line; - while (getline(file,line)) - { - std::stringstream sline(line); - while (!sline.eof() && sline.peek() == ' ') sline.ignore(); - std::string tag; getline(sline,tag,' '); - while (!sline.eof() && sline.peek() == ' ') sline.ignore(); - std::string val; getline(sline,val,' '); - while (!sline.eof() && sline.peek() == ' ') sline.ignore(); - std::string unit; getline(sline,unit,' '); - if (tag == "Hugepagesize:" && unit == "kB") { - hugepagesize = std::stoi(val)*1024; - break; - } - } - - if (hugepagesize != PAGE_SIZE_2M) - { - if (verbose) std::cout << "WARNING: Only 2MB huge pages supported. Huge page support cannot get enabled!" << std::endl; - huge_pages_enabled = false; - return false; - } -#endif - - huge_pages_enabled = true; - return true; - } - - void* os_malloc(size_t bytes, bool& hugepages) - { - if (bytes == 0) { - hugepages = false; - return nullptr; - } - - /* try direct huge page allocation first */ - if (isHugePageCandidate(bytes)) - { -#if defined(__MACOSX__) - void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); - if (ptr != MAP_FAILED) { - hugepages = true; - return ptr; - } -#elif defined(MAP_HUGETLB) - void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_HUGETLB, -1, 0); - if (ptr != MAP_FAILED) { - hugepages = true; - return ptr; - } -#endif - } - - /* fallback to 4k pages */ - void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); - // -- GODOT start -- - // if (ptr == MAP_FAILED) throw std::bad_alloc(); - if (ptr == MAP_FAILED) abort(); - // -- GODOT end -- - hugepages = false; - - /* advise huge page hint for THP */ - os_advise(ptr,bytes); - return ptr; - } - - size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) - { - const size_t pageSize = hugepages ? 
PAGE_SIZE_2M : PAGE_SIZE_4K; - bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1); - bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1); - if (bytesNew >= bytesOld) - return bytesOld; - - if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) - // -- GODOT start -- - // throw std::bad_alloc(); - abort(); - // -- GODOT end -- - - return bytesNew; - } - - void os_free(void* ptr, size_t bytes, bool hugepages) - { - if (bytes == 0) - return; - - /* for hugepages we need to also align the size */ - const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; - bytes = (bytes+pageSize-1) & ~(pageSize-1); - if (munmap(ptr,bytes) == -1) - // -- GODOT start -- - // throw std::bad_alloc(); - abort(); - // -- GODOT end -- - } - - /* hint for transparent huge pages (THP) */ - void os_advise(void* pptr, size_t bytes) - { -#if defined(MADV_HUGEPAGE) - madvise(pptr,bytes,MADV_HUGEPAGE); -#endif - } -} - -#endif diff --git a/thirdparty/embree-aarch64/common/sys/alloc.h b/thirdparty/embree-aarch64/common/sys/alloc.h deleted file mode 100644 index 5898ecda70..0000000000 --- a/thirdparty/embree-aarch64/common/sys/alloc.h +++ /dev/null @@ -1,164 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "platform.h" -#include <vector> -#include <set> - -namespace embree -{ -#define ALIGNED_STRUCT_(align) \ - void* operator new(size_t size) { return alignedMalloc(size,align); } \ - void operator delete(void* ptr) { alignedFree(ptr); } \ - void* operator new[](size_t size) { return alignedMalloc(size,align); } \ - void operator delete[](void* ptr) { alignedFree(ptr); } - -#define ALIGNED_CLASS_(align) \ - public: \ - ALIGNED_STRUCT_(align) \ - private: - - /*! aligned allocation */ - void* alignedMalloc(size_t size, size_t align); - void alignedFree(void* ptr); - - /*! 
allocator that performs aligned allocations */ - template<typename T, size_t alignment> - struct aligned_allocator - { - typedef T value_type; - typedef T* pointer; - typedef const T* const_pointer; - typedef T& reference; - typedef const T& const_reference; - typedef std::size_t size_type; - typedef std::ptrdiff_t difference_type; - - __forceinline pointer allocate( size_type n ) { - return (pointer) alignedMalloc(n*sizeof(value_type),alignment); - } - - __forceinline void deallocate( pointer p, size_type n ) { - return alignedFree(p); - } - - __forceinline void construct( pointer p, const_reference val ) { - new (p) T(val); - } - - __forceinline void destroy( pointer p ) { - p->~T(); - } - }; - - /*! allocates pages directly from OS */ - bool win_enable_selockmemoryprivilege(bool verbose); - bool os_init(bool hugepages, bool verbose); - void* os_malloc (size_t bytes, bool& hugepages); - size_t os_shrink (void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages); - void os_free (void* ptr, size_t bytes, bool hugepages); - void os_advise (void* ptr, size_t bytes); - - /*! allocator that performs OS allocations */ - template<typename T> - struct os_allocator - { - typedef T value_type; - typedef T* pointer; - typedef const T* const_pointer; - typedef T& reference; - typedef const T& const_reference; - typedef std::size_t size_type; - typedef std::ptrdiff_t difference_type; - - __forceinline os_allocator () - : hugepages(false) {} - - __forceinline pointer allocate( size_type n ) { - return (pointer) os_malloc(n*sizeof(value_type),hugepages); - } - - __forceinline void deallocate( pointer p, size_type n ) { - return os_free(p,n*sizeof(value_type),hugepages); - } - - __forceinline void construct( pointer p, const_reference val ) { - new (p) T(val); - } - - __forceinline void destroy( pointer p ) { - p->~T(); - } - - bool hugepages; - }; - - /*! 
allocator for IDs */ - template<typename T, size_t max_id> - struct IDPool - { - typedef T value_type; - - IDPool () - : nextID(0) {} - - T allocate() - { - /* return ID from list */ - if (!IDs.empty()) - { - T id = *IDs.begin(); - IDs.erase(IDs.begin()); - return id; - } - - /* allocate new ID */ - else - { - if (size_t(nextID)+1 > max_id) - return -1; - - return nextID++; - } - } - - /* adds an ID provided by the user */ - bool add(T id) - { - if (id > max_id) - return false; - - /* check if ID should be in IDs set */ - if (id < nextID) { - auto p = IDs.find(id); - if (p == IDs.end()) return false; - IDs.erase(p); - return true; - } - - /* otherwise increase ID set */ - else - { - for (T i=nextID; i<id; i++) { - IDs.insert(i); - } - nextID = id+1; - return true; - } - } - - void deallocate( T id ) - { - assert(id < nextID); - MAYBE_UNUSED auto done = IDs.insert(id).second; - assert(done); - } - - private: - std::set<T> IDs; //!< stores deallocated IDs to be reused - T nextID; //!< next ID to use when IDs vector is empty - }; -} - diff --git a/thirdparty/embree-aarch64/common/sys/array.h b/thirdparty/embree-aarch64/common/sys/array.h deleted file mode 100644 index 77722a39f6..0000000000 --- a/thirdparty/embree-aarch64/common/sys/array.h +++ /dev/null @@ -1,222 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "platform.h" -#include "alloc.h" - -namespace embree -{ - /*! 
static array with static size */ - template<typename T, size_t N> - class array_t - { - public: - - /********************** Iterators ****************************/ - - __forceinline T* begin() const { return items; }; - __forceinline T* end () const { return items+N; }; - - - /********************** Capacity ****************************/ - - __forceinline bool empty () const { return N == 0; } - __forceinline size_t size () const { return N; } - __forceinline size_t max_size () const { return N; } - - - /******************** Element access **************************/ - - __forceinline T& operator[](size_t i) { assert(i < N); return items[i]; } - __forceinline const T& operator[](size_t i) const { assert(i < N); return items[i]; } - - __forceinline T& at(size_t i) { assert(i < N); return items[i]; } - __forceinline const T& at(size_t i) const { assert(i < N); return items[i]; } - - __forceinline T& front() const { assert(N > 0); return items[0]; }; - __forceinline T& back () const { assert(N > 0); return items[N-1]; }; - - __forceinline T* data() { return items; }; - __forceinline const T* data() const { return items; }; - - private: - T items[N]; - }; - - /*! 
static array with dynamic size */ - template<typename T, size_t N> - class darray_t - { - public: - - __forceinline darray_t () : M(0) {} - - __forceinline darray_t (const T& v) : M(0) { - for (size_t i=0; i<N; i++) items[i] = v; - } - - /********************** Iterators ****************************/ - - __forceinline T* begin() const { return items; }; - __forceinline T* end () const { return items+M; }; - - - /********************** Capacity ****************************/ - - __forceinline bool empty () const { return M == 0; } - __forceinline size_t size () const { return M; } - __forceinline size_t capacity () const { return N; } - __forceinline size_t max_size () const { return N; } - - void resize(size_t new_size) { - assert(new_size < max_size()); - M = new_size; - } - - /******************** Modifiers **************************/ - - __forceinline void push_back(const T& v) - { - assert(M+1 < max_size()); - items[M++] = v; - } - - __forceinline void pop_back() - { - assert(!empty()); - M--; - } - - __forceinline void clear() { - M = 0; - } - - /******************** Element access **************************/ - - __forceinline T& operator[](size_t i) { assert(i < M); return items[i]; } - __forceinline const T& operator[](size_t i) const { assert(i < M); return items[i]; } - - __forceinline T& at(size_t i) { assert(i < M); return items[i]; } - __forceinline const T& at(size_t i) const { assert(i < M); return items[i]; } - - __forceinline T& front() const { assert(M > 0); return items[0]; }; - __forceinline T& back () const { assert(M > 0); return items[M-1]; }; - - __forceinline T* data() { return items; }; - __forceinline const T* data() const { return items; }; - - private: - size_t M; - T items[N]; - }; - - /*! 
dynamic sized array that is allocated on the stack */ -#define dynamic_large_stack_array(Ty,Name,N,max_stack_bytes) StackArray<Ty,max_stack_bytes> Name(N) - template<typename Ty, size_t max_stack_bytes> - struct __aligned(64) StackArray - { - __forceinline StackArray (const size_t N) - : N(N) - { - if (N*sizeof(Ty) <= max_stack_bytes) - data = &arr[0]; - else - data = (Ty*) alignedMalloc(N*sizeof(Ty),64); - } - - __forceinline ~StackArray () { - if (data != &arr[0]) alignedFree(data); - } - - __forceinline operator Ty* () { return data; } - __forceinline operator const Ty* () const { return data; } - - __forceinline Ty& operator[](const int i) { assert(i>=0 && i<N); return data[i]; } - __forceinline const Ty& operator[](const int i) const { assert(i>=0 && i<N); return data[i]; } - - __forceinline Ty& operator[](const unsigned i) { assert(i<N); return data[i]; } - __forceinline const Ty& operator[](const unsigned i) const { assert(i<N); return data[i]; } - -#if defined(__X86_64__) || defined(__aarch64__) - __forceinline Ty& operator[](const size_t i) { assert(i<N); return data[i]; } - __forceinline const Ty& operator[](const size_t i) const { assert(i<N); return data[i]; } -#endif - - private: - Ty arr[max_stack_bytes/sizeof(Ty)]; - Ty* data; - size_t N; - - private: - StackArray (const StackArray& other) DELETED; // do not implement - StackArray& operator= (const StackArray& other) DELETED; // do not implement - - }; - - /*! 
dynamic sized array that is allocated on the stack */ - template<typename Ty, size_t max_stack_elements, size_t max_total_elements> - struct __aligned(64) DynamicStackArray - { - __forceinline DynamicStackArray () - : data(&arr[0]) {} - - __forceinline ~DynamicStackArray () - { - if (!isStackAllocated()) - delete[] data; - } - - __forceinline bool isStackAllocated() const { - return data == &arr[0]; - } - - __forceinline size_t size() const - { - if (isStackAllocated()) return max_stack_elements; - else return max_total_elements; - } - - __forceinline void resize(size_t M) - { - assert(M <= max_total_elements); - if (likely(M <= max_stack_elements)) return; - if (likely(!isStackAllocated())) return; - - data = new Ty[max_total_elements]; - - for (size_t i=0; i<max_stack_elements; i++) - data[i] = arr[i]; - } - - __forceinline operator Ty* () { return data; } - __forceinline operator const Ty* () const { return data; } - - __forceinline Ty& operator[](const int i) { assert(i>=0 && i<max_total_elements); resize(i+1); return data[i]; } - __forceinline Ty& operator[](const unsigned i) { assert(i<max_total_elements); resize(i+1); return data[i]; } - -#if defined(__X86_64__) || defined(__aarch64__) - __forceinline Ty& operator[](const size_t i) { assert(i<max_total_elements); resize(i+1); return data[i]; } -#endif - - __forceinline DynamicStackArray (const DynamicStackArray& other) - : data(&arr[0]) - { - for (size_t i=0; i<other.size(); i++) - this->operator[] (i) = other[i]; - } - - DynamicStackArray& operator= (const DynamicStackArray& other) - { - for (size_t i=0; i<other.size(); i++) - this->operator[] (i) = other[i]; - - return *this; - } - - private: - Ty arr[max_stack_elements]; - Ty* data; - }; -} diff --git a/thirdparty/embree-aarch64/common/sys/atomic.h b/thirdparty/embree-aarch64/common/sys/atomic.h deleted file mode 100644 index ebfb8552c3..0000000000 --- a/thirdparty/embree-aarch64/common/sys/atomic.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2009-2020 
Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include <atomic> -#include "intrinsics.h" - -namespace embree -{ -/* compiler memory barriers */ -#if defined(__INTEL_COMPILER) -//#define __memory_barrier() __memory_barrier() -#elif defined(__GNUC__) || defined(__clang__) -# define __memory_barrier() asm volatile("" ::: "memory") -#elif defined(_MSC_VER) -# define __memory_barrier() _ReadWriteBarrier() -#endif - - template <typename T> - struct atomic : public std::atomic<T> - { - atomic () {} - - atomic (const T& a) - : std::atomic<T>(a) {} - - atomic (const atomic<T>& a) { - this->store(a.load()); - } - - atomic& operator=(const atomic<T>& other) { - this->store(other.load()); - return *this; - } - }; - - template<typename T> - __forceinline void atomic_min(std::atomic<T>& aref, const T& bref) - { - const T b = bref.load(); - while (true) { - T a = aref.load(); - if (a <= b) break; - if (aref.compare_exchange_strong(a,b)) break; - } - } - - template<typename T> - __forceinline void atomic_max(std::atomic<T>& aref, const T& bref) - { - const T b = bref.load(); - while (true) { - T a = aref.load(); - if (a >= b) break; - if (aref.compare_exchange_strong(a,b)) break; - } - } -} diff --git a/thirdparty/embree-aarch64/common/sys/barrier.cpp b/thirdparty/embree-aarch64/common/sys/barrier.cpp deleted file mode 100644 index 0061d18db2..0000000000 --- a/thirdparty/embree-aarch64/common/sys/barrier.cpp +++ /dev/null @@ -1,289 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "barrier.h" -#include "condition.h" -#include "regression.h" -#include "thread.h" - -#if defined (__WIN32__) - -#define WIN32_LEAN_AND_MEAN -#include <windows.h> - -namespace embree -{ - struct BarrierSysImplementation - { - __forceinline BarrierSysImplementation (size_t N) - : i(0), enterCount(0), exitCount(0), barrierSize(0) - { - events[0] = CreateEvent(nullptr, TRUE, FALSE, nullptr); - events[1] = CreateEvent(nullptr, 
TRUE, FALSE, nullptr); - init(N); - } - - __forceinline ~BarrierSysImplementation () - { - CloseHandle(events[0]); - CloseHandle(events[1]); - } - - __forceinline void init(size_t N) - { - barrierSize = N; - enterCount.store(N); - exitCount.store(N); - } - - __forceinline void wait() - { - /* every thread entering the barrier decrements this count */ - size_t i0 = i; - size_t cnt0 = enterCount--; - - /* all threads except the last one are wait in the barrier */ - if (cnt0 > 1) - { - if (WaitForSingleObject(events[i0], INFINITE) != WAIT_OBJECT_0) - THROW_RUNTIME_ERROR("WaitForSingleObjects failed"); - } - - /* the last thread starts all threads waiting at the barrier */ - else - { - i = 1-i; - enterCount.store(barrierSize); - if (SetEvent(events[i0]) == 0) - THROW_RUNTIME_ERROR("SetEvent failed"); - } - - /* every thread leaving the barrier decrements this count */ - size_t cnt1 = exitCount--; - - /* the last thread that left the barrier resets the event again */ - if (cnt1 == 1) - { - exitCount.store(barrierSize); - if (ResetEvent(events[i0]) == 0) - THROW_RUNTIME_ERROR("ResetEvent failed"); - } - } - - public: - HANDLE events[2]; - atomic<size_t> i; - atomic<size_t> enterCount; - atomic<size_t> exitCount; - size_t barrierSize; - }; -} - -#else - -namespace embree -{ - struct BarrierSysImplementation - { - __forceinline BarrierSysImplementation (size_t N) - : count(0), barrierSize(0) - { - init(N); - } - - __forceinline void init(size_t N) - { - assert(count == 0); - count = 0; - barrierSize = N; - } - - __forceinline void wait() - { - mutex.lock(); - count++; - - if (count == barrierSize) { - count = 0; - cond.notify_all(); - mutex.unlock(); - return; - } - - cond.wait(mutex); - mutex.unlock(); - return; - } - - public: - MutexSys mutex; - ConditionSys cond; - volatile size_t count; - volatile size_t barrierSize; - }; -} - -#endif - -namespace embree -{ - BarrierSys::BarrierSys (size_t N) { - opaque = new BarrierSysImplementation(N); - } - - 
BarrierSys::~BarrierSys () { - delete (BarrierSysImplementation*) opaque; - } - - void BarrierSys::init(size_t count) { - ((BarrierSysImplementation*) opaque)->init(count); - } - - void BarrierSys::wait() { - ((BarrierSysImplementation*) opaque)->wait(); - } - - LinearBarrierActive::LinearBarrierActive (size_t N) - : count0(nullptr), count1(nullptr), mode(0), flag0(0), flag1(0), threadCount(0) - { - if (N == 0) N = getNumberOfLogicalThreads(); - init(N); - } - - LinearBarrierActive::~LinearBarrierActive() - { - delete[] count0; - delete[] count1; - } - - void LinearBarrierActive::init(size_t N) - { - if (threadCount != N) { - threadCount = N; - if (count0) delete[] count0; count0 = new unsigned char[N]; - if (count1) delete[] count1; count1 = new unsigned char[N]; - } - mode = 0; - flag0 = 0; - flag1 = 0; - for (size_t i=0; i<N; i++) count0[i] = 0; - for (size_t i=0; i<N; i++) count1[i] = 0; - } - - void LinearBarrierActive::wait (const size_t threadIndex) - { - if (mode == 0) - { - if (threadIndex == 0) - { - for (size_t i=0; i<threadCount; i++) - count1[i] = 0; - - for (size_t i=1; i<threadCount; i++) - { - while (likely(count0[i] == 0)) - pause_cpu(); - } - mode = 1; - flag1 = 0; - __memory_barrier(); - flag0 = 1; - } - else - { - count0[threadIndex] = 1; - { - while (likely(flag0 == 0)) - pause_cpu(); - } - - } - } - else - { - if (threadIndex == 0) - { - for (size_t i=0; i<threadCount; i++) - count0[i] = 0; - - for (size_t i=1; i<threadCount; i++) - { - while (likely(count1[i] == 0)) - pause_cpu(); - } - - mode = 0; - flag0 = 0; - __memory_barrier(); - flag1 = 1; - } - else - { - count1[threadIndex] = 1; - { - while (likely(flag1 == 0)) - pause_cpu(); - } - } - } - } - - struct barrier_sys_regression_test : public RegressionTest - { - BarrierSys barrier; - std::atomic<size_t> threadID; - std::atomic<size_t> numFailed; - std::vector<size_t> threadResults; - - barrier_sys_regression_test() - : RegressionTest("barrier_sys_regression_test"), threadID(0), 
numFailed(0) - { - registerRegressionTest(this); - } - - static void thread_alloc(barrier_sys_regression_test* This) - { - size_t tid = This->threadID++; - for (size_t j=0; j<1000; j++) - { - This->barrier.wait(); - This->threadResults[tid] = tid; - This->barrier.wait(); - } - } - - bool run () - { - threadID.store(0); - numFailed.store(0); - - size_t numThreads = getNumberOfLogicalThreads(); - threadResults.resize(numThreads); - barrier.init(numThreads+1); - - /* create threads */ - std::vector<thread_t> threads; - for (size_t i=0; i<numThreads; i++) - threads.push_back(createThread((thread_func)thread_alloc,this)); - - /* run test */ - for (size_t i=0; i<1000; i++) - { - for (size_t i=0; i<numThreads; i++) threadResults[i] = 0; - barrier.wait(); - barrier.wait(); - for (size_t i=0; i<numThreads; i++) numFailed += threadResults[i] != i; - } - - /* destroy threads */ - for (size_t i=0; i<numThreads; i++) - join(threads[i]); - - return numFailed == 0; - } - }; - - barrier_sys_regression_test barrier_sys_regression_test; -} - - diff --git a/thirdparty/embree-aarch64/common/sys/barrier.h b/thirdparty/embree-aarch64/common/sys/barrier.h deleted file mode 100644 index 89607b8685..0000000000 --- a/thirdparty/embree-aarch64/common/sys/barrier.h +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "intrinsics.h" -#include "sysinfo.h" -#include "atomic.h" - -namespace embree -{ - /*! system barrier using operating system */ - class BarrierSys - { - public: - - /*! construction / destruction */ - BarrierSys (size_t N = 0); - ~BarrierSys (); - - private: - /*! class in non-copyable */ - BarrierSys (const BarrierSys& other) DELETED; // do not implement - BarrierSys& operator= (const BarrierSys& other) DELETED; // do not implement - - public: - /*! intializes the barrier with some number of threads */ - void init(size_t count); - - /*! 
lets calling thread wait in barrier */ - void wait(); - - private: - void* opaque; - }; - - /*! fast active barrier using atomitc counter */ - struct BarrierActive - { - public: - BarrierActive () - : cntr(0) {} - - void reset() { - cntr.store(0); - } - - void wait (size_t numThreads) - { - cntr++; - while (cntr.load() != numThreads) - pause_cpu(); - } - - private: - std::atomic<size_t> cntr; - }; - - /*! fast active barrier that does not require initialization to some number of threads */ - struct BarrierActiveAutoReset - { - public: - BarrierActiveAutoReset () - : cntr0(0), cntr1(0) {} - - void wait (size_t threadCount) - { - cntr0.fetch_add(1); - while (cntr0 != threadCount) pause_cpu(); - cntr1.fetch_add(1); - while (cntr1 != threadCount) pause_cpu(); - cntr0.fetch_add(-1); - while (cntr0 != 0) pause_cpu(); - cntr1.fetch_add(-1); - while (cntr1 != 0) pause_cpu(); - } - - private: - std::atomic<size_t> cntr0; - std::atomic<size_t> cntr1; - }; - - class LinearBarrierActive - { - public: - - /*! construction and destruction */ - LinearBarrierActive (size_t threadCount = 0); - ~LinearBarrierActive(); - - private: - /*! class in non-copyable */ - LinearBarrierActive (const LinearBarrierActive& other) DELETED; // do not implement - LinearBarrierActive& operator= (const LinearBarrierActive& other) DELETED; // do not implement - - public: - /*! intializes the barrier with some number of threads */ - void init(size_t threadCount); - - /*! 
thread with threadIndex waits in the barrier */ - void wait (const size_t threadIndex); - - private: - volatile unsigned char* count0; - volatile unsigned char* count1; - volatile unsigned int mode; - volatile unsigned int flag0; - volatile unsigned int flag1; - volatile size_t threadCount; - }; -} - diff --git a/thirdparty/embree-aarch64/common/sys/condition.cpp b/thirdparty/embree-aarch64/common/sys/condition.cpp deleted file mode 100644 index 0e7ca7af39..0000000000 --- a/thirdparty/embree-aarch64/common/sys/condition.cpp +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "condition.h" - -#if defined(__WIN32__) && !defined(PTHREADS_WIN32) - -#define WIN32_LEAN_AND_MEAN -#include <windows.h> - -namespace embree -{ - struct ConditionImplementation - { - __forceinline ConditionImplementation () { - InitializeConditionVariable(&cond); - } - - __forceinline ~ConditionImplementation () { - } - - __forceinline void wait(MutexSys& mutex_in) { - SleepConditionVariableCS(&cond, (LPCRITICAL_SECTION)mutex_in.mutex, INFINITE); - } - - __forceinline void notify_all() { - WakeAllConditionVariable(&cond); - } - - public: - CONDITION_VARIABLE cond; - }; -} -#endif - -#if defined(__UNIX__) || defined(PTHREADS_WIN32) -#include <pthread.h> -namespace embree -{ - struct ConditionImplementation - { - __forceinline ConditionImplementation () { - pthread_cond_init(&cond,nullptr); - } - - __forceinline ~ConditionImplementation() { - pthread_cond_destroy(&cond); - } - - __forceinline void wait(MutexSys& mutex) { - pthread_cond_wait(&cond, (pthread_mutex_t*)mutex.mutex); - } - - __forceinline void notify_all() { - pthread_cond_broadcast(&cond); - } - - public: - pthread_cond_t cond; - }; -} -#endif - -namespace embree -{ - ConditionSys::ConditionSys () { - cond = new ConditionImplementation; - } - - ConditionSys::~ConditionSys() { - delete (ConditionImplementation*) cond; - } - - void ConditionSys::wait(MutexSys& 
mutex) { - ((ConditionImplementation*) cond)->wait(mutex); - } - - void ConditionSys::notify_all() { - ((ConditionImplementation*) cond)->notify_all(); - } -} diff --git a/thirdparty/embree-aarch64/common/sys/condition.h b/thirdparty/embree-aarch64/common/sys/condition.h deleted file mode 100644 index 7a3a05aa81..0000000000 --- a/thirdparty/embree-aarch64/common/sys/condition.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "mutex.h" - -namespace embree -{ - class ConditionSys - { - public: - ConditionSys(); - ~ConditionSys(); - void wait( class MutexSys& mutex ); - void notify_all(); - - template<typename Predicate> - __forceinline void wait( class MutexSys& mutex, const Predicate& pred ) - { - while (!pred()) wait(mutex); - } - - private: - ConditionSys (const ConditionSys& other) DELETED; // do not implement - ConditionSys& operator= (const ConditionSys& other) DELETED; // do not implement - - protected: - void* cond; - }; -} diff --git a/thirdparty/embree-aarch64/common/sys/filename.cpp b/thirdparty/embree-aarch64/common/sys/filename.cpp deleted file mode 100644 index 86182c1afb..0000000000 --- a/thirdparty/embree-aarch64/common/sys/filename.cpp +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "filename.h" -#include "sysinfo.h" - -namespace embree -{ -#ifdef __WIN32__ - const char path_sep = '\\'; -#else - const char path_sep = '/'; -#endif - - /*! create an empty filename */ - FileName::FileName () {} - - /*! create a valid filename from a string */ - FileName::FileName (const char* in) { - filename = in; - for (size_t i=0; i<filename.size(); i++) - if (filename[i] == '\\' || filename[i] == '/') - filename[i] = path_sep; - while (!filename.empty() && filename[filename.size()-1] == path_sep) - filename.resize(filename.size()-1); - } - - /*! 
create a valid filename from a string */ - FileName::FileName (const std::string& in) { - filename = in; - for (size_t i=0; i<filename.size(); i++) - if (filename[i] == '\\' || filename[i] == '/') - filename[i] = path_sep; - while (!filename.empty() && filename[filename.size()-1] == path_sep) - filename.resize(filename.size()-1); - } - - /*! returns path to home folder */ - FileName FileName::homeFolder() - { -#ifdef __WIN32__ - const char* home = getenv("UserProfile"); -#else - const char* home = getenv("HOME"); -#endif - if (home) return home; - return ""; - } - - /*! returns path to executable */ - FileName FileName::executableFolder() { - return FileName(getExecutableFileName()).path(); - } - - /*! returns the path */ - FileName FileName::path() const { - size_t pos = filename.find_last_of(path_sep); - if (pos == std::string::npos) return FileName(); - return filename.substr(0,pos); - } - - /*! returns the basename */ - std::string FileName::base() const { - size_t pos = filename.find_last_of(path_sep); - if (pos == std::string::npos) return filename; - return filename.substr(pos+1); - } - - /*! returns the extension */ - std::string FileName::ext() const { - size_t pos = filename.find_last_of('.'); - if (pos == std::string::npos) return ""; - return filename.substr(pos+1); - } - - /*! returns the extension */ - FileName FileName::dropExt() const { - size_t pos = filename.find_last_of('.'); - if (pos == std::string::npos) return filename; - return filename.substr(0,pos); - } - - /*! returns the basename without extension */ - std::string FileName::name() const { - size_t start = filename.find_last_of(path_sep); - if (start == std::string::npos) start = 0; else start++; - size_t end = filename.find_last_of('.'); - if (end == std::string::npos || end < start) end = filename.size(); - return filename.substr(start, end - start); - } - - /*! 
replaces the extension */ - FileName FileName::setExt(const std::string& ext) const { - size_t start = filename.find_last_of(path_sep); - if (start == std::string::npos) start = 0; else start++; - size_t end = filename.find_last_of('.'); - if (end == std::string::npos || end < start) return FileName(filename+ext); - return FileName(filename.substr(0,end)+ext); - } - - /*! adds the extension */ - FileName FileName::addExt(const std::string& ext) const { - return FileName(filename+ext); - } - - /*! concatenates two filenames to this/other */ - FileName FileName::operator +( const FileName& other ) const { - if (filename == "") return FileName(other); - else return FileName(filename + path_sep + other.filename); - } - - /*! concatenates two filenames to this/other */ - FileName FileName::operator +( const std::string& other ) const { - return operator+(FileName(other)); - } - - /*! removes the base from a filename (if possible) */ - FileName FileName::operator -( const FileName& base ) const { - size_t pos = filename.find_first_of(base); - if (pos == std::string::npos) return *this; - return FileName(filename.substr(pos+1)); - } - - /*! == operator */ - bool operator== (const FileName& a, const FileName& b) { - return a.filename == b.filename; - } - - /*! != operator */ - bool operator!= (const FileName& a, const FileName& b) { - return a.filename != b.filename; - } - - /*! output operator */ - std::ostream& operator<<(std::ostream& cout, const FileName& filename) { - return cout << filename.filename; - } -} diff --git a/thirdparty/embree-aarch64/common/sys/filename.h b/thirdparty/embree-aarch64/common/sys/filename.h deleted file mode 100644 index 58f881b14d..0000000000 --- a/thirdparty/embree-aarch64/common/sys/filename.h +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "platform.h" - -namespace embree -{ - /*! Convenience class for handling file names and paths. 
*/ - class FileName - { - public: - - /*! create an empty filename */ - FileName (); - - /*! create a valid filename from a string */ - FileName (const char* filename); - - /*! create a valid filename from a string */ - FileName (const std::string& filename); - - /*! returns path to home folder */ - static FileName homeFolder(); - - /*! returns path to executable */ - static FileName executableFolder(); - - /*! auto convert into a string */ - operator std::string() const { return filename; } - - /*! returns a string of the filename */ - const std::string str() const { return filename; } - - /*! returns a c-string of the filename */ - const char* c_str() const { return filename.c_str(); } - - /*! returns the path of a filename */ - FileName path() const; - - /*! returns the file of a filename */ - std::string base() const; - - /*! returns the base of a filename without extension */ - std::string name() const; - - /*! returns the file extension */ - std::string ext() const; - - /*! drops the file extension */ - FileName dropExt() const; - - /*! replaces the file extension */ - FileName setExt(const std::string& ext = "") const; - - /*! adds file extension */ - FileName addExt(const std::string& ext = "") const; - - /*! concatenates two filenames to this/other */ - FileName operator +( const FileName& other ) const; - - /*! concatenates two filenames to this/other */ - FileName operator +( const std::string& other ) const; - - /*! removes the base from a filename (if possible) */ - FileName operator -( const FileName& base ) const; - - /*! == operator */ - friend bool operator==(const FileName& a, const FileName& b); - - /*! != operator */ - friend bool operator!=(const FileName& a, const FileName& b); - - /*! 
output operator */ - friend embree_ostream operator<<(embree_ostream cout, const FileName& filename); - - private: - std::string filename; - }; -} diff --git a/thirdparty/embree-aarch64/common/sys/intrinsics.h b/thirdparty/embree-aarch64/common/sys/intrinsics.h deleted file mode 100644 index 44cdbd8f0f..0000000000 --- a/thirdparty/embree-aarch64/common/sys/intrinsics.h +++ /dev/null @@ -1,559 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "platform.h" - -#if defined(__WIN32__) -#include <intrin.h> -#endif - -#if defined(__ARM_NEON) -#include "../math/SSE2NEON.h" -#if defined(NEON_AVX2_EMULATION) -#include "../math/AVX2NEON.h" -#endif -#else -#include <immintrin.h> -#endif - -#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER) - #if !defined(_tzcnt_u32) - #define _tzcnt_u32 __tzcnt_u32 - #endif - #if !defined(_tzcnt_u64) - #define _tzcnt_u64 __tzcnt_u64 - #endif -#endif - -#if defined(__aarch64__) -#if !defined(_lzcnt_u32) - #define _lzcnt_u32 __builtin_clz -#endif -#if !defined(_lzcnt_u32) - #define _lzcnt_u32 __builtin_clzll -#endif -#else -#if defined(__LZCNT__) - #if !defined(_lzcnt_u32) - #define _lzcnt_u32 __lzcnt32 - #endif - #if !defined(_lzcnt_u64) - #define _lzcnt_u64 __lzcnt64 - #endif -#endif -#endif - -#if defined(__WIN32__) -# ifndef NOMINMAX -# define NOMINMAX -# endif -# include <windows.h> -#endif - -/* normally defined in pmmintrin.h, but we always need this */ -#if !defined(_MM_SET_DENORMALS_ZERO_MODE) -#define _MM_DENORMALS_ZERO_ON (0x0040) -#define _MM_DENORMALS_ZERO_OFF (0x0000) -#define _MM_DENORMALS_ZERO_MASK (0x0040) -#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) -#endif - -namespace embree -{ - -//////////////////////////////////////////////////////////////////////////////// -/// Windows Platform -//////////////////////////////////////////////////////////////////////////////// - -#if 
defined(__WIN32__) - - __forceinline size_t read_tsc() - { - LARGE_INTEGER li; - QueryPerformanceCounter(&li); - return (size_t)li.QuadPart; - } - - __forceinline int bsf(int v) { -#if defined(__AVX2__) && !defined(__aarch64__) - return _tzcnt_u32(v); -#else - unsigned long r = 0; _BitScanForward(&r,v); return r; -#endif - } - - __forceinline unsigned bsf(unsigned v) { -#if defined(__AVX2__) && !defined(__aarch64__) - return _tzcnt_u32(v); -#else - unsigned long r = 0; _BitScanForward(&r,v); return r; -#endif - } - -#if defined(__X86_64__) - __forceinline size_t bsf(size_t v) { -#if defined(__AVX2__) - return _tzcnt_u64(v); -#else - unsigned long r = 0; _BitScanForward64(&r,v); return r; -#endif - } -#endif - - __forceinline int bscf(int& v) - { - int i = bsf(v); - v &= v-1; - return i; - } - - __forceinline unsigned bscf(unsigned& v) - { - unsigned i = bsf(v); - v &= v-1; - return i; - } - -#if defined(__X86_64__) - __forceinline size_t bscf(size_t& v) - { - size_t i = bsf(v); - v &= v-1; - return i; - } -#endif - - __forceinline int bsr(int v) { -#if defined(__AVX2__) && !defined(__aarch64__) - return 31 - _lzcnt_u32(v); -#else - unsigned long r = 0; _BitScanReverse(&r,v); return r; -#endif - } - - __forceinline unsigned bsr(unsigned v) { -#if defined(__AVX2__) && !defined(__aarch64__) - return 31 - _lzcnt_u32(v); -#else - unsigned long r = 0; _BitScanReverse(&r,v); return r; -#endif - } - -#if defined(__X86_64__) - __forceinline size_t bsr(size_t v) { -#if defined(__AVX2__) - return 63 -_lzcnt_u64(v); -#else - unsigned long r = 0; _BitScanReverse64(&r, v); return r; -#endif - } -#endif - - __forceinline int lzcnt(const int x) - { -#if defined(__AVX2__) && !defined(__aarch64__) - return _lzcnt_u32(x); -#else - if (unlikely(x == 0)) return 32; - return 31 - bsr(x); -#endif - } - - __forceinline int btc(int v, int i) { - long r = v; _bittestandcomplement(&r,i); return r; - } - - __forceinline int bts(int v, int i) { - long r = v; _bittestandset(&r,i); return r; - } 
- - __forceinline int btr(int v, int i) { - long r = v; _bittestandreset(&r,i); return r; - } - -#if defined(__X86_64__) - - __forceinline size_t btc(size_t v, size_t i) { - size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r; - } - - __forceinline size_t bts(size_t v, size_t i) { - __int64 r = v; _bittestandset64(&r,i); return r; - } - - __forceinline size_t btr(size_t v, size_t i) { - __int64 r = v; _bittestandreset64(&r,i); return r; - } - -#endif - - __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) { - return _InterlockedCompareExchange((volatile long*)p,v,c); - } - -//////////////////////////////////////////////////////////////////////////////// -/// Unix Platform -//////////////////////////////////////////////////////////////////////////////// - -#else - -#if defined(__i386__) && defined(__PIC__) - - __forceinline void __cpuid(int out[4], int op) - { - asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" - "cpuid\n\t" - "xchg{l}\t{%%}ebx, %1\n\t" - : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "0"(op)); - } - - __forceinline void __cpuid_count(int out[4], int op1, int op2) - { - asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" - "cpuid\n\t" - "xchg{l}\t{%%}ebx, %1\n\t" - : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3]) - : "0" (op1), "2" (op2)); - } - -#else - - __forceinline void __cpuid(int out[4], int op) { -#if defined(__ARM_NEON) - if (op == 0) { // Get CPU name - out[0] = 0x41524d20; - out[1] = 0x41524d20; - out[2] = 0x41524d20; - out[3] = 0x41524d20; - } -#else - asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op)); -#endif - } - -#if !defined(__ARM_NEON) - __forceinline void __cpuid_count(int out[4], int op1, int op2) { - asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2)); - } -#endif - -#endif - - __forceinline uint64_t read_tsc() { -#if defined(__ARM_NEON) - return 0; // FIXME(LTE): mimic rdtsc -#else - 
uint32_t high,low; - asm volatile ("rdtsc" : "=d"(high), "=a"(low)); - return (((uint64_t)high) << 32) + (uint64_t)low; -#endif - } - - __forceinline int bsf(int v) { -#if defined(__ARM_NEON) - return __builtin_ctz(v); -#else -#if defined(__AVX2__) - return _tzcnt_u32(v); -#else - int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; -#endif -#endif - } - -#if defined(__X86_64__) || defined(__aarch64__) - __forceinline unsigned bsf(unsigned v) - { -#if defined(__ARM_NEON) - return __builtin_ctz(v); -#else -#if defined(__AVX2__) - return _tzcnt_u32(v); -#else - unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; -#endif -#endif - } -#endif - - __forceinline size_t bsf(size_t v) { -#if defined(__AVX2__) && !defined(__aarch64__) -#if defined(__X86_64__) - return _tzcnt_u64(v); -#else - return _tzcnt_u32(v); -#endif -#elif defined(__ARM_NEON) - return __builtin_ctzl(v); -#else - size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; -#endif - } - - __forceinline int bscf(int& v) - { - int i = bsf(v); - v &= v-1; - return i; - } - -#if defined(__X86_64__) || defined(__aarch64__) - __forceinline unsigned int bscf(unsigned int& v) - { - unsigned int i = bsf(v); - v &= v-1; - return i; - } -#endif - - __forceinline size_t bscf(size_t& v) - { - size_t i = bsf(v); - v &= v-1; - return i; - } - - __forceinline int bsr(int v) { -#if defined(__AVX2__) && !defined(__aarch64__) - return 31 - _lzcnt_u32(v); -#elif defined(__ARM_NEON) - return __builtin_clz(v)^31; -#else - int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; -#endif - } - -#if defined(__X86_64__) || defined(__aarch64__) - __forceinline unsigned bsr(unsigned v) { -#if defined(__AVX2__) - return 31 - _lzcnt_u32(v); -#elif defined(__ARM_NEON) - return __builtin_clz(v)^31; -#else - unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; -#endif - } -#endif - - __forceinline size_t bsr(size_t v) { -#if defined(__AVX2__) && !defined(__aarch64__) -#if defined(__X86_64__) - return 63 - 
_lzcnt_u64(v); -#else - return 31 - _lzcnt_u32(v); -#endif -#elif defined(__aarch64__) - return (sizeof(v) * 8 - 1) - __builtin_clzl(v); -#else - size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; -#endif - } - - __forceinline int lzcnt(const int x) - { -#if defined(__AVX2__) && !defined(__aarch64__) - return _lzcnt_u32(x); -#else - if (unlikely(x == 0)) return 32; - return 31 - bsr(x); -#endif - } - - __forceinline size_t blsr(size_t v) { -#if defined(__AVX2__) && !defined(__aarch64__) -#if defined(__INTEL_COMPILER) - return _blsr_u64(v); -#else -#if defined(__X86_64__) - return __blsr_u64(v); -#else - return __blsr_u32(v); -#endif -#endif -#else - return v & (v-1); -#endif - } - - __forceinline int btc(int v, int i) { -#if defined(__aarch64__) - // _bittestandcomplement(long *a, long b) { - // unsigned char x = (*a >> b) & 1; - // *a = *a ^ (1 << b); - // return x; - - // We only need `*a` - return (v ^ (1 << i)); -#else - int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; -#endif - } - - __forceinline int bts(int v, int i) { -#if defined(__aarch64__) - // _bittestandset(long *a, long b) { - // unsigned char x = (*a >> b) & 1; - // *a = *a | (1 << b); - // return x; - return (v | (v << i)); -#else - int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; -#endif - } - - __forceinline int btr(int v, int i) { -#if defined(__aarch64__) - // _bittestandreset(long *a, long b) { - // unsigned char x = (*a >> b) & 1; - // *a = *a & ~(1 << b); - // return x; - return (v & ~(v << i)); -#else - int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; -#endif - } - - __forceinline size_t btc(size_t v, size_t i) { -#if defined(__aarch64__) - return (v ^ (1 << i)); -#else - size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; -#endif - } - - __forceinline size_t bts(size_t v, size_t i) { -#if defined(__aarch64__) - return (v | (v << i)); -#else - size_t r = 0; asm ("bts 
%1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; -#endif - } - - __forceinline size_t btr(size_t v, size_t i) { -#if defined(__ARM_NEON) - return (v & ~(v << i)); -#else - size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; -#endif - } - - __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) { - return __sync_val_compare_and_swap(value, comparand, input); - } - -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// All Platforms -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__clang__) || defined(__GNUC__) -#if !defined(_mm_undefined_ps) - __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); } -#endif -#if !defined(_mm_undefined_si128) - __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); } -#endif -#if !defined(_mm256_undefined_ps) && defined(__AVX__) - __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); } -#endif -#if !defined(_mm256_undefined_si256) && defined(__AVX__) - __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); } -#endif -#if !defined(_mm512_undefined_ps) && defined(__AVX512F__) - __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); } -#endif -#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__) - __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); } -#endif -#endif - -#if defined(__SSE4_2__) || defined(__ARM_NEON) - - __forceinline int popcnt(int in) { - return _mm_popcnt_u32(in); - } - - __forceinline unsigned popcnt(unsigned in) { - return _mm_popcnt_u32(in); - } - -#if defined(__X86_64__) || defined(__ARM_NEON) - __forceinline size_t popcnt(size_t in) { - return _mm_popcnt_u64(in); - } -#endif - -#endif - - __forceinline uint64_t rdtsc() - { - int dummy[4]; - __cpuid(dummy,0); - uint64_t clock = read_tsc(); - __cpuid(dummy,0); - return 
clock; - } - - __forceinline void pause_cpu(const size_t N = 8) - { - for (size_t i=0; i<N; i++) - _mm_pause(); - } - - /* prefetches */ - __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); } - __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); } - __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); } - __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); } - __forceinline void prefetchEX (const void* ptr) { -#if defined(__INTEL_COMPILER) - _mm_prefetch((const char*)ptr,_MM_HINT_ET0); -#else - _mm_prefetch((const char*)ptr,_MM_HINT_T0); -#endif - } - - __forceinline void prefetchL1EX(const void* ptr) { - prefetchEX(ptr); - } - - __forceinline void prefetchL2EX(const void* ptr) { - prefetchEX(ptr); - } -#if defined(__AVX2__) && !defined(__aarch64__) - __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); } - __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); } -#if defined(__X86_64__) - __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); } - __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); } -#endif -#endif - -#if defined(__AVX512F__) -#if defined(__INTEL_COMPILER) - __forceinline float mm512_cvtss_f32(__m512 v) { - return _mm512_cvtss_f32(v); - } - __forceinline int mm512_mask2int(__mmask16 k1) { - return _mm512_mask2int(k1); - } - __forceinline __mmask16 mm512_int2mask(int mask) { - return _mm512_int2mask(mask); - } -#else - __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3 - return _mm_cvtss_f32(_mm512_castps512_ps128(v)); - } - __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3 - return (int)k1; - } - __forceinline __mmask16 mm512_int2mask(int mask) { // 
FIXME: _mm512_int2mask not yet supported by GCC 6.3 - return (__mmask16)mask; - } -#endif -#endif -} diff --git a/thirdparty/embree-aarch64/common/sys/library.cpp b/thirdparty/embree-aarch64/common/sys/library.cpp deleted file mode 100644 index 899267a1e4..0000000000 --- a/thirdparty/embree-aarch64/common/sys/library.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "library.h" -#include "sysinfo.h" -#include "filename.h" - -//////////////////////////////////////////////////////////////////////////////// -/// Windows Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__WIN32__) - -#define WIN32_LEAN_AND_MEAN -#include <windows.h> - -namespace embree -{ - /* opens a shared library */ - lib_t openLibrary(const std::string& file) - { - std::string fullName = file+".dll"; - FileName executable = getExecutableFileName(); - HANDLE handle = LoadLibrary((executable.path() + fullName).c_str()); - return lib_t(handle); - } - - /* returns address of a symbol from the library */ - void* getSymbol(lib_t lib, const std::string& sym) { - return reinterpret_cast<void *>(GetProcAddress(HMODULE(lib),sym.c_str())); - } - - /* closes the shared library */ - void closeLibrary(lib_t lib) { - FreeLibrary(HMODULE(lib)); - } -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Unix Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__UNIX__) - -#include <dlfcn.h> - -namespace embree -{ - /* opens a shared library */ - lib_t openLibrary(const std::string& file) - { -#if defined(__MACOSX__) - std::string fullName = "lib"+file+".dylib"; -#else - std::string fullName = "lib"+file+".so"; -#endif - void* lib = dlopen(fullName.c_str(), RTLD_NOW); - if (lib) return lib_t(lib); - FileName executable = getExecutableFileName(); - lib = dlopen((executable.path() + 
fullName).c_str(),RTLD_NOW); - if (lib == nullptr) { - const char* error = dlerror(); - if (error) { - THROW_RUNTIME_ERROR(error); - } else { - THROW_RUNTIME_ERROR("could not load library "+executable.str()); - } - } - return lib_t(lib); - } - - /* returns address of a symbol from the library */ - void* getSymbol(lib_t lib, const std::string& sym) { - return dlsym(lib,sym.c_str()); - } - - /* closes the shared library */ - void closeLibrary(lib_t lib) { - dlclose(lib); - } -} -#endif diff --git a/thirdparty/embree-aarch64/common/sys/library.h b/thirdparty/embree-aarch64/common/sys/library.h deleted file mode 100644 index c2164e9fbe..0000000000 --- a/thirdparty/embree-aarch64/common/sys/library.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "platform.h" - -namespace embree -{ - /*! type for shared library */ - typedef struct opaque_lib_t* lib_t; - - /*! loads a shared library */ - lib_t openLibrary(const std::string& file); - - /*! returns address of a symbol from the library */ - void* getSymbol(lib_t lib, const std::string& sym); - - /*! 
unloads a shared library */ - void closeLibrary(lib_t lib); -} diff --git a/thirdparty/embree-aarch64/common/sys/mutex.cpp b/thirdparty/embree-aarch64/common/sys/mutex.cpp deleted file mode 100644 index 11779bc9b9..0000000000 --- a/thirdparty/embree-aarch64/common/sys/mutex.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "mutex.h" -#include "regression.h" - -#if defined(__WIN32__) && !defined(PTHREADS_WIN32) - -#define WIN32_LEAN_AND_MEAN -#include <windows.h> - -namespace embree -{ - MutexSys::MutexSys() { mutex = new CRITICAL_SECTION; InitializeCriticalSection((CRITICAL_SECTION*)mutex); } - MutexSys::~MutexSys() { DeleteCriticalSection((CRITICAL_SECTION*)mutex); delete (CRITICAL_SECTION*)mutex; } - void MutexSys::lock() { EnterCriticalSection((CRITICAL_SECTION*)mutex); } - bool MutexSys::try_lock() { return TryEnterCriticalSection((CRITICAL_SECTION*)mutex) != 0; } - void MutexSys::unlock() { LeaveCriticalSection((CRITICAL_SECTION*)mutex); } -} -#endif - -#if defined(__UNIX__) || defined(PTHREADS_WIN32) -#include <pthread.h> -namespace embree -{ - /*! 
system mutex using pthreads */ - MutexSys::MutexSys() - { - mutex = new pthread_mutex_t; - if (pthread_mutex_init((pthread_mutex_t*)mutex, nullptr) != 0) - THROW_RUNTIME_ERROR("pthread_mutex_init failed"); - } - - MutexSys::~MutexSys() - { - MAYBE_UNUSED bool ok = pthread_mutex_destroy((pthread_mutex_t*)mutex) == 0; - assert(ok); - delete (pthread_mutex_t*)mutex; - mutex = nullptr; - } - - void MutexSys::lock() - { - if (pthread_mutex_lock((pthread_mutex_t*)mutex) != 0) - THROW_RUNTIME_ERROR("pthread_mutex_lock failed"); - } - - bool MutexSys::try_lock() { - return pthread_mutex_trylock((pthread_mutex_t*)mutex) == 0; - } - - void MutexSys::unlock() - { - if (pthread_mutex_unlock((pthread_mutex_t*)mutex) != 0) - THROW_RUNTIME_ERROR("pthread_mutex_unlock failed"); - } -}; -#endif diff --git a/thirdparty/embree-aarch64/common/sys/mutex.h b/thirdparty/embree-aarch64/common/sys/mutex.h deleted file mode 100644 index 1164210f23..0000000000 --- a/thirdparty/embree-aarch64/common/sys/mutex.h +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "platform.h" -#include "intrinsics.h" -#include "atomic.h" - -namespace embree -{ - /*! system mutex */ - class MutexSys { - friend struct ConditionImplementation; - public: - MutexSys(); - ~MutexSys(); - - private: - MutexSys (const MutexSys& other) DELETED; // do not implement - MutexSys& operator= (const MutexSys& other) DELETED; // do not implement - - public: - void lock(); - bool try_lock(); - void unlock(); - - protected: - void* mutex; - }; - - /*! 
spinning mutex */ - class SpinLock - { - public: - - SpinLock () - : flag(false) {} - - __forceinline bool isLocked() { - return flag.load(); - } - - __forceinline void lock() - { - while (true) - { - while (flag.load()) - { - _mm_pause(); - _mm_pause(); - } - - bool expected = false; - if (flag.compare_exchange_strong(expected,true,std::memory_order_acquire)) - break; - } - } - - __forceinline bool try_lock() - { - bool expected = false; - if (flag.load() != expected) { - return false; - } - return flag.compare_exchange_strong(expected,true,std::memory_order_acquire); - } - - __forceinline void unlock() { - flag.store(false,std::memory_order_release); - } - - __forceinline void wait_until_unlocked() - { - while(flag.load()) - { - _mm_pause(); - _mm_pause(); - } - } - - public: - atomic<bool> flag; - }; - - /*! safe mutex lock and unlock helper */ - template<typename Mutex> class Lock { - public: - Lock (Mutex& mutex) : mutex(mutex), locked(true) { mutex.lock(); } - Lock (Mutex& mutex, bool locked) : mutex(mutex), locked(locked) {} - ~Lock() { if (locked) mutex.unlock(); } - __forceinline void lock() { assert(!locked); locked = true; mutex.lock(); } - __forceinline bool isLocked() const { return locked; } - protected: - Mutex& mutex; - bool locked; - }; -} diff --git a/thirdparty/embree-aarch64/common/sys/platform.h b/thirdparty/embree-aarch64/common/sys/platform.h deleted file mode 100644 index 737f14aa6e..0000000000 --- a/thirdparty/embree-aarch64/common/sys/platform.h +++ /dev/null @@ -1,387 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#define _CRT_SECURE_NO_WARNINGS - -#include <cstddef> -#include <cassert> -#include <cstdlib> -#include <cstdio> -#include <memory> -#include <stdexcept> -#include <iostream> -#include <iomanip> -#include <fstream> -#include <string> -#include <cstring> -#include <stdint.h> -#include <functional> - 
-//////////////////////////////////////////////////////////////////////////////// -/// detect platform -//////////////////////////////////////////////////////////////////////////////// - -/* detect 32 or 64 platform */ -#if defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) -#define __X86_64__ -#endif - -/* detect Linux platform */ -#if defined(linux) || defined(__linux__) || defined(__LINUX__) -# if !defined(__LINUX__) -# define __LINUX__ -# endif -# if !defined(__UNIX__) -# define __UNIX__ -# endif -#endif - -/* detect FreeBSD platform */ -#if defined(__FreeBSD__) || defined(__FREEBSD__) -# if !defined(__FREEBSD__) -# define __FREEBSD__ -# endif -# if !defined(__UNIX__) -# define __UNIX__ -# endif -#endif - -/* detect Windows 95/98/NT/2000/XP/Vista/7/8/10 platform */ -#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)) && !defined(__CYGWIN__) -# if !defined(__WIN32__) -# define __WIN32__ -# endif -#endif - -/* detect Cygwin platform */ -#if defined(__CYGWIN__) -# if !defined(__UNIX__) -# define __UNIX__ -# endif -#endif - -/* detect MAC OS X platform */ -#if defined(__APPLE__) || defined(MACOSX) || defined(__MACOSX__) -# if !defined(__MACOSX__) -# define __MACOSX__ -# endif -# if !defined(__UNIX__) -# define __UNIX__ -# endif -#endif - -/* try to detect other Unix systems */ -#if defined(__unix__) || defined (unix) || defined(__unix) || defined(_unix) -# if !defined(__UNIX__) -# define __UNIX__ -# endif -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Macros -//////////////////////////////////////////////////////////////////////////////// - -#ifdef __WIN32__ -#define dll_export __declspec(dllexport) -#define dll_import __declspec(dllimport) -#else -#define dll_export __attribute__ ((visibility ("default"))) -#define dll_import -#endif - -#ifdef __WIN32__ -#if !defined(__noinline) -#define __noinline __declspec(noinline) -#endif -//#define __forceinline __forceinline -//#define 
__restrict __restrict -#if defined(__INTEL_COMPILER) -#define __restrict__ __restrict -#else -#define __restrict__ //__restrict // causes issues with MSVC -#endif -#if !defined(__thread) -// NOTE: Require `-fms-extensions` for clang -#define __thread __declspec(thread) -#endif -#if !defined(__aligned) -#if defined(__MINGW32__) -#define __aligned(...) __attribute__((aligned(__VA_ARGS__))) -#else -#define __aligned(...) __declspec(align(__VA_ARGS__)) -#endif -#endif -//#define __FUNCTION__ __FUNCTION__ -#define debugbreak() __debugbreak() - -#else -#if !defined(__noinline) -#define __noinline __attribute__((noinline)) -#endif -#if !defined(__forceinline) -#define __forceinline inline __attribute__((always_inline)) -#endif -//#define __restrict __restrict -//#define __thread __thread -#if !defined(__aligned) -#define __aligned(...) __attribute__((aligned(__VA_ARGS__))) -#endif -#if !defined(__FUNCTION__) -#define __FUNCTION__ __PRETTY_FUNCTION__ -#endif -#define debugbreak() asm ("int $3") -#endif - -#if defined(__clang__) || defined(__GNUC__) - #define MAYBE_UNUSED __attribute__((unused)) -#else - #define MAYBE_UNUSED -#endif - -#if defined(_MSC_VER) && (_MSC_VER < 1900) // before VS2015 deleted functions are not supported properly - #define DELETED -#else - #define DELETED = delete -#endif - -// -- GODOT start -- -#ifndef likely -// -- GODOT end -- -#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) -#define likely(expr) (expr) -#define unlikely(expr) (expr) -#else -#define likely(expr) __builtin_expect((bool)(expr),true ) -#define unlikely(expr) __builtin_expect((bool)(expr),false) -#endif -// -- GODOT start -- -#endif -// -- GODOT end -- - -//////////////////////////////////////////////////////////////////////////////// -/// Error handling and debugging -//////////////////////////////////////////////////////////////////////////////// - -/* debug printing macros */ -#define STRING(x) #x -#define TOSTRING(x) STRING(x) -#define PING embree_cout << __FILE__ << " (" 
<< __LINE__ << "): " << __FUNCTION__ << embree_endl -#define PRINT(x) embree_cout << STRING(x) << " = " << (x) << embree_endl -#define PRINT2(x,y) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << embree_endl -#define PRINT3(x,y,z) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << embree_endl -#define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl - -#if defined(DEBUG) // only report file and line in debug mode - // -- GODOT start -- - // #define THROW_RUNTIME_ERROR(str) - // throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); - #define THROW_RUNTIME_ERROR(str) \ - printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); - // -- GODOT end -- -#else - // -- GODOT start -- - // #define THROW_RUNTIME_ERROR(str) - // throw std::runtime_error(str); - #define THROW_RUNTIME_ERROR(str) \ - abort(); - // -- GODOT end -- -#endif - -#define FATAL(x) THROW_RUNTIME_ERROR(x) -#define WARNING(x) { std::cerr << "Warning: " << x << embree_endl << std::flush; } - -#define NOT_IMPLEMENTED FATAL(std::string(__FUNCTION__) + " not implemented") - -//////////////////////////////////////////////////////////////////////////////// -/// Basic types -//////////////////////////////////////////////////////////////////////////////// - -/* default floating-point type */ -namespace embree { - typedef float real; -} - -/* windows does not have ssize_t */ -#if defined(__WIN32__) -#if defined(__X86_64__) || defined(__aarch64__) -typedef int64_t ssize_t; -#else -typedef int32_t ssize_t; -#endif -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Basic utility functions 
-//////////////////////////////////////////////////////////////////////////////// - -__forceinline std::string toString(long long value) { - return std::to_string(value); -} - -//////////////////////////////////////////////////////////////////////////////// -/// Disable some compiler warnings -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__INTEL_COMPILER) -//#pragma warning(disable:265 ) // floating-point operation result is out of range -//#pragma warning(disable:383 ) // value copied to temporary, reference to temporary used -//#pragma warning(disable:869 ) // parameter was never referenced -//#pragma warning(disable:981 ) // operands are evaluated in unspecified order -//#pragma warning(disable:1418) // external function definition with no prior declaration -//#pragma warning(disable:1419) // external declaration in primary source file -//#pragma warning(disable:1572) // floating-point equality and inequality comparisons are unreliable -//#pragma warning(disable:94 ) // the size of an array must be greater than zero -//#pragma warning(disable:1599) // declaration hides parameter -//#pragma warning(disable:424 ) // extra ";" ignored -#pragma warning(disable:2196) // routine is both "inline" and "noinline" -//#pragma warning(disable:177 ) // label was declared but never referenced -//#pragma warning(disable:114 ) // function was referenced but not defined -//#pragma warning(disable:819 ) // template nesting depth does not match the previous declaration of function -#pragma warning(disable:15335) // was not vectorized: vectorization possible but seems inefficient -#endif - -#if defined(_MSC_VER) -//#pragma warning(disable:4200) // nonstandard extension used : zero-sized array in struct/union -#pragma warning(disable:4800) // forcing value to bool 'true' or 'false' (performance warning) -//#pragma warning(disable:4267) // '=' : conversion from 'size_t' to 'unsigned long', possible loss of data -#pragma 
warning(disable:4244) // 'argument' : conversion from 'ssize_t' to 'unsigned int', possible loss of data -//#pragma warning(disable:4355) // 'this' : used in base member initializer list -//#pragma warning(disable:391 ) // '<=' : signed / unsigned mismatch -//#pragma warning(disable:4018) // '<' : signed / unsigned mismatch -//#pragma warning(disable:4305) // 'initializing' : truncation from 'double' to 'float' -//#pragma warning(disable:4068) // unknown pragma -//#pragma warning(disable:4146) // unary minus operator applied to unsigned type, result still unsigned -//#pragma warning(disable:4838) // conversion from 'unsigned int' to 'const int' requires a narrowing conversion) -//#pragma warning(disable:4227) // anachronism used : qualifiers on reference are ignored -#pragma warning(disable:4503) // decorated name length exceeded, name was truncated -#pragma warning(disable:4180) // qualifier applied to function type has no meaning; ignored -#pragma warning(disable:4258) // definition from the for loop is ignored; the definition from the enclosing scope is used - -# if _MSC_VER < 1910 // prior to Visual studio 2017 (V141) -# pragma warning(disable:4101) // warning C4101: 'x': unreferenced local variable // a compiler bug issues wrong warnings -# pragma warning(disable:4789) // buffer '' of size 8 bytes will be overrun; 32 bytes will be written starting at offset 0 -# endif - -#endif - -#if defined(__clang__) && !defined(__INTEL_COMPILER) -//#pragma clang diagnostic ignored "-Wunknown-pragmas" -//#pragma clang diagnostic ignored "-Wunused-variable" -//#pragma clang diagnostic ignored "-Wreorder" -//#pragma clang diagnostic ignored "-Wmicrosoft" -//#pragma clang diagnostic ignored "-Wunused-private-field" -//#pragma clang diagnostic ignored "-Wunused-local-typedef" -//#pragma clang diagnostic ignored "-Wunused-function" -//#pragma clang diagnostic ignored "-Wnarrowing" -//#pragma clang diagnostic ignored "-Wc++11-narrowing" -//#pragma clang diagnostic ignored 
"-Wdeprecated-register" -//#pragma clang diagnostic ignored "-Wdeprecated-declarations" -#endif - -#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) -#pragma GCC diagnostic ignored "-Wpragmas" -//#pragma GCC diagnostic ignored "-Wnarrowing" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -//#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -//#pragma GCC diagnostic ignored "-Warray-bounds" -#pragma GCC diagnostic ignored "-Wattributes" -#pragma GCC diagnostic ignored "-Wmisleading-indentation" -#pragma GCC diagnostic ignored "-Wsign-compare" -#pragma GCC diagnostic ignored "-Wparentheses" -#endif - -#if defined(__clang__) && defined(__WIN32__) -#pragma clang diagnostic ignored "-Wunused-parameter" -#pragma clang diagnostic ignored "-Wmicrosoft-cast" -#pragma clang diagnostic ignored "-Wmicrosoft-enum-value" -#pragma clang diagnostic ignored "-Wmicrosoft-include" -#pragma clang diagnostic ignored "-Wunused-function" -#pragma clang diagnostic ignored "-Wunknown-pragmas" -#endif - -/* disabling deprecated warning, please use only where use of deprecated Embree API functions is desired */ -#if defined(__WIN32__) && defined(__INTEL_COMPILER) -#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 1478)) // warning: function was declared deprecated -#define ENABLE_DEPRECATED_WARNING __pragma(warning (enable: 1478)) // warning: function was declared deprecated -#elif defined(__INTEL_COMPILER) -#define DISABLE_DEPRECATED_WARNING _Pragma("warning (disable: 1478)") // warning: function was declared deprecated -#define ENABLE_DEPRECATED_WARNING _Pragma("warning (enable : 1478)") // warning: function was declared deprecated -#elif defined(__clang__) -#define DISABLE_DEPRECATED_WARNING _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated -#define ENABLE_DEPRECATED_WARNING _Pragma("clang diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated -#elif 
defined(__GNUC__) -#define DISABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated -#define ENABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated -#elif defined(_MSC_VER) -#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 4996)) // warning: function was declared deprecated -#define ENABLE_DEPRECATED_WARNING __pragma(warning (enable : 4996)) // warning: function was declared deprecated -#endif - -/* embree output stream */ -#define embree_ostream std::ostream& -#define embree_cout std::cout -#define embree_cout_uniform std::cout -#define embree_endl std::endl - -//////////////////////////////////////////////////////////////////////////////// -/// Some macros for static profiling -//////////////////////////////////////////////////////////////////////////////// - -#if defined (__GNUC__) -#define IACA_SSC_MARK( MARK_ID ) \ -__asm__ __volatile__ ( \ - "\n\t movl $"#MARK_ID", %%ebx" \ - "\n\t .byte 0x64, 0x67, 0x90" \ - : : : "memory" ); - -#define IACA_UD_BYTES __asm__ __volatile__ ("\n\t .byte 0x0F, 0x0B"); - -#else -#define IACA_UD_BYTES {__asm _emit 0x0F \ - __asm _emit 0x0B} - -#define IACA_SSC_MARK(x) {__asm mov ebx, x\ - __asm _emit 0x64 \ - __asm _emit 0x67 \ - __asm _emit 0x90 } - -#define IACA_VC64_START __writegsbyte(111, 111); -#define IACA_VC64_END __writegsbyte(222, 222); - -#endif - -#define IACA_START {IACA_UD_BYTES \ - IACA_SSC_MARK(111)} -#define IACA_END {IACA_SSC_MARK(222) \ - IACA_UD_BYTES} - -namespace embree -{ - template<typename Closure> - struct OnScopeExitHelper - { - OnScopeExitHelper (const Closure f) : active(true), f(f) {} - ~OnScopeExitHelper() { if (active) f(); } - void deactivate() { active = false; } - bool active; - const Closure f; - }; - - template <typename Closure> - OnScopeExitHelper<Closure> OnScopeExit(const Closure f) { - return OnScopeExitHelper<Closure>(f); - } - -#define 
STRING_JOIN2(arg1, arg2) DO_STRING_JOIN2(arg1, arg2) -#define DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2 -#define ON_SCOPE_EXIT(code) \ - auto STRING_JOIN2(on_scope_exit_, __LINE__) = OnScopeExit([&](){code;}) - - template<typename Ty> - std::unique_ptr<Ty> make_unique(Ty* ptr) { - return std::unique_ptr<Ty>(ptr); - } - -} diff --git a/thirdparty/embree-aarch64/common/sys/ref.h b/thirdparty/embree-aarch64/common/sys/ref.h deleted file mode 100644 index 24648e6234..0000000000 --- a/thirdparty/embree-aarch64/common/sys/ref.h +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "atomic.h" - -namespace embree -{ - struct NullTy { - }; - - extern MAYBE_UNUSED NullTy null; - - class RefCount - { - public: - RefCount(int val = 0) : refCounter(val) {} - virtual ~RefCount() {}; - - virtual RefCount* refInc() { refCounter.fetch_add(1); return this; } - virtual void refDec() { if (refCounter.fetch_add(-1) == 1) delete this; } - private: - std::atomic<size_t> refCounter; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Reference to single object - //////////////////////////////////////////////////////////////////////////////// - - template<typename Type> - class Ref - { - public: - Type* ptr; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Ref() : ptr(nullptr) {} - __forceinline Ref(NullTy) : ptr(nullptr) {} - __forceinline Ref(const Ref& input) : ptr(input.ptr) { if (ptr) ptr->refInc(); } - __forceinline Ref(Ref&& input) : ptr(input.ptr) { input.ptr = nullptr; } - - __forceinline Ref(Type* const input) : ptr(input) - { - if (ptr) - ptr->refInc(); - } - - __forceinline ~Ref() - { - if (ptr) - ptr->refDec(); - } - - __forceinline Ref& operator =(const Ref& input) 
- { - if (input.ptr) - input.ptr->refInc(); - if (ptr) - ptr->refDec(); - ptr = input.ptr; - return *this; - } - - __forceinline Ref& operator =(Ref&& input) - { - if (ptr) - ptr->refDec(); - ptr = input.ptr; - input.ptr = nullptr; - return *this; - } - - __forceinline Ref& operator =(Type* const input) - { - if (input) - input->refInc(); - if (ptr) - ptr->refDec(); - ptr = input; - return *this; - } - - __forceinline Ref& operator =(NullTy) - { - if (ptr) - ptr->refDec(); - ptr = nullptr; - return *this; - } - - __forceinline operator bool() const { return ptr != nullptr; } - - __forceinline const Type& operator *() const { return *ptr; } - __forceinline Type& operator *() { return *ptr; } - __forceinline const Type* operator ->() const { return ptr; } - __forceinline Type* operator ->() { return ptr; } - - template<typename TypeOut> - __forceinline Ref<TypeOut> cast() { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); } - template<typename TypeOut> - __forceinline const Ref<TypeOut> cast() const { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); } - - template<typename TypeOut> - __forceinline Ref<TypeOut> dynamicCast() { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); } - template<typename TypeOut> - __forceinline const Ref<TypeOut> dynamicCast() const { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); } - }; - - template<typename Type> __forceinline bool operator < (const Ref<Type>& a, const Ref<Type>& b) { return a.ptr < b.ptr; } - - template<typename Type> __forceinline bool operator ==(const Ref<Type>& a, NullTy ) { return a.ptr == nullptr; } - template<typename Type> __forceinline bool operator ==(NullTy , const Ref<Type>& b) { return nullptr == b.ptr; } - template<typename Type> __forceinline bool operator ==(const Ref<Type>& a, const Ref<Type>& b) { return a.ptr == b.ptr; } - - template<typename Type> __forceinline bool operator !=(const Ref<Type>& a, NullTy ) { return a.ptr != nullptr; } - template<typename Type> __forceinline bool operator !=(NullTy 
, const Ref<Type>& b) { return nullptr != b.ptr; } - template<typename Type> __forceinline bool operator !=(const Ref<Type>& a, const Ref<Type>& b) { return a.ptr != b.ptr; } -} diff --git a/thirdparty/embree-aarch64/common/sys/regression.cpp b/thirdparty/embree-aarch64/common/sys/regression.cpp deleted file mode 100644 index d95ff8dfe0..0000000000 --- a/thirdparty/embree-aarch64/common/sys/regression.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "regression.h" - -namespace embree -{ - /* registerRegressionTest is invoked from static initializers, thus - * we cannot have the regression_tests variable as global static - * variable due to issues with static variable initialization - * order. */ - std::vector<RegressionTest*>& get_regression_tests() - { - static std::vector<RegressionTest*> regression_tests; - return regression_tests; - } - - void registerRegressionTest(RegressionTest* test) - { - get_regression_tests().push_back(test); - } - - RegressionTest* getRegressionTest(size_t index) - { - if (index >= get_regression_tests().size()) - return nullptr; - - return get_regression_tests()[index]; - } -} diff --git a/thirdparty/embree-aarch64/common/sys/regression.h b/thirdparty/embree-aarch64/common/sys/regression.h deleted file mode 100644 index 632f8d92cf..0000000000 --- a/thirdparty/embree-aarch64/common/sys/regression.h +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "platform.h" - -#include <vector> - -namespace embree -{ - /*! virtual interface for all regression tests */ - struct RegressionTest - { - RegressionTest (std::string name) : name(name) {} - virtual bool run() = 0; - std::string name; - }; - - /*! registers a regression test */ - void registerRegressionTest(RegressionTest* test); - - /*! 
run all regression tests */ - RegressionTest* getRegressionTest(size_t index); -} diff --git a/thirdparty/embree-aarch64/common/sys/string.cpp b/thirdparty/embree-aarch64/common/sys/string.cpp deleted file mode 100644 index 931244383e..0000000000 --- a/thirdparty/embree-aarch64/common/sys/string.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "string.h" - -#include <algorithm> -#include <ctype.h> - -namespace embree -{ - char to_lower(char c) { return char(tolower(int(c))); } - char to_upper(char c) { return char(toupper(int(c))); } - std::string toLowerCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_lower); return dst; } - std::string toUpperCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_upper); return dst; } - - Vec2f string_to_Vec2f ( std::string str ) - { - size_t next = 0; - const float x = std::stof(str,&next); str = str.substr(next+1); - const float y = std::stof(str,&next); - return Vec2f(x,y); - } - - Vec3f string_to_Vec3f ( std::string str ) - { - size_t next = 0; - const float x = std::stof(str,&next); str = str.substr(next+1); - const float y = std::stof(str,&next); str = str.substr(next+1); - const float z = std::stof(str,&next); - return Vec3f(x,y,z); - } - - Vec4f string_to_Vec4f ( std::string str ) - { - size_t next = 0; - const float x = std::stof(str,&next); str = str.substr(next+1); - const float y = std::stof(str,&next); str = str.substr(next+1); - const float z = std::stof(str,&next); str = str.substr(next+1); - const float w = std::stof(str,&next); - return Vec4f(x,y,z,w); - } -} diff --git a/thirdparty/embree-aarch64/common/sys/string.h b/thirdparty/embree-aarch64/common/sys/string.h deleted file mode 100644 index 2e9b0f88c3..0000000000 --- a/thirdparty/embree-aarch64/common/sys/string.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2009-2020 Intel 
Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "platform.h" -#include "../math/vec2.h" -#include "../math/vec3.h" -#include "../math/vec4.h" - -namespace embree -{ - class IOStreamStateRestorer - { - public: - IOStreamStateRestorer(std::ostream& iostream) - : iostream(iostream), flags(iostream.flags()), precision(iostream.precision()) { - } - - ~IOStreamStateRestorer() { - iostream.flags(flags); - iostream.precision(precision); - } - - private: - std::ostream& iostream; - std::ios::fmtflags flags; - std::streamsize precision; - }; - - std::string toLowerCase(const std::string& s); - std::string toUpperCase(const std::string& s); - - Vec2f string_to_Vec2f ( std::string str ); - Vec3f string_to_Vec3f ( std::string str ); - Vec4f string_to_Vec4f ( std::string str ); -} diff --git a/thirdparty/embree-aarch64/common/sys/sysinfo.cpp b/thirdparty/embree-aarch64/common/sys/sysinfo.cpp deleted file mode 100644 index 1d11436770..0000000000 --- a/thirdparty/embree-aarch64/common/sys/sysinfo.cpp +++ /dev/null @@ -1,676 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "sysinfo.h" -#include "intrinsics.h" -#include "string.h" -#include "ref.h" -#if defined(__FREEBSD__) -#include <sys/cpuset.h> -#include <pthread_np.h> -typedef cpuset_t cpu_set_t; -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// All Platforms -//////////////////////////////////////////////////////////////////////////////// - -namespace embree -{ - NullTy null; - - std::string getPlatformName() - { -#if defined(__LINUX__) && defined(__ANDROID__) && defined(__aarch64__) && defined(__ARM_NEON) - return "Android Linux (aarch64 / arm64)"; -#elif defined(__LINUX__) && defined(__ANDROID__) && defined(__X86_64__) - return "Android Linux (x64)"; -#elif defined(__LINUX__) && defined(__ANDROID__) && (defined(_X86_) || defined(__X86__) || defined(_M_IX86)) - return "Android Linux (x86)"; 
-#elif defined(__LINUX__) && !defined(__X86_64__) - return "Linux (32bit)"; -#elif defined(__LINUX__) && defined(__X86_64__) - return "Linux (64bit)"; -#elif defined(__FREEBSD__) && !defined(__X86_64__) - return "FreeBSD (32bit)"; -#elif defined(__FREEBSD__) && defined(__X86_64__) - return "FreeBSD (64bit)"; -#elif defined(__CYGWIN__) && !defined(__X86_64__) - return "Cygwin (32bit)"; -#elif defined(__CYGWIN__) && defined(__X86_64__) - return "Cygwin (64bit)"; -#elif defined(__WIN32__) && !defined(__X86_64__) - return "Windows (32bit)"; -#elif defined(__WIN32__) && defined(__X86_64__) - return "Windows (64bit)"; -#elif defined(TARGET_IPHONE_SIMULATOR) && defined(__X86_64__) - return "iOS Simulator (x64)"; -#elif defined(TARGET_OS_IPHONE) && defined(__aarch64__) && defined(__ARM_NEON) - return "iOS (aarch64 / arm64)"; -#elif defined(__MACOSX__) && !defined(__X86_64__) - return "Mac OS X (32bit)"; -#elif defined(__MACOSX__) && defined(__X86_64__) - return "Mac OS X (64bit)"; -#elif defined(__UNIX__) && defined(__aarch64__) - return "Unix (aarch64)"; -#elif defined(__UNIX__) && !defined(__X86_64__) - return "Unix (32bit)"; -#elif defined(__UNIX__) && defined(__X86_64__) - return "Unix (64bit)"; -#else - return "Unknown"; -#endif - } - - std::string getCompilerName() - { -#if defined(__INTEL_COMPILER) - int icc_mayor = __INTEL_COMPILER / 100 % 100; - int icc_minor = __INTEL_COMPILER % 100; - std::string version = "Intel Compiler "; - version += toString(icc_mayor); - version += "." + toString(icc_minor); -#if defined(__INTEL_COMPILER_UPDATE) - version += "." 
+ toString(__INTEL_COMPILER_UPDATE); -#endif - return version; -#elif defined(__clang__) - return "CLANG " __clang_version__; -#elif defined (__GNUC__) - return "GCC " __VERSION__; -#elif defined(_MSC_VER) - std::string version = toString(_MSC_FULL_VER); - version.insert(4,"."); - version.insert(9,"."); - version.insert(2,"."); - return "Visual C++ Compiler " + version; -#else - return "Unknown Compiler"; -#endif - } - - std::string getCPUVendor() - { - int cpuinfo[4]; - __cpuid (cpuinfo, 0); - int name[4]; - name[0] = cpuinfo[1]; - name[1] = cpuinfo[3]; - name[2] = cpuinfo[2]; - name[3] = 0; - return (char*)name; - } - - CPU getCPUModel() - { - if (getCPUVendor() != "GenuineIntel") - return CPU::UNKNOWN; - - int out[4]; - __cpuid(out, 0); - if (out[0] < 1) return CPU::UNKNOWN; - __cpuid(out, 1); - - /* please see CPUID documentation for these formulas */ - uint32_t family_ID = (out[0] >> 8) & 0x0F; - uint32_t extended_family_ID = (out[0] >> 20) & 0xFF; - - uint32_t model_ID = (out[0] >> 4) & 0x0F; - uint32_t extended_model_ID = (out[0] >> 16) & 0x0F; - - uint32_t DisplayFamily = family_ID; - if (family_ID == 0x0F) - DisplayFamily += extended_family_ID; - - uint32_t DisplayModel = model_ID; - if (family_ID == 0x06 || family_ID == 0x0F) - DisplayModel += extended_model_ID << 4; - - uint32_t DisplayFamily_DisplayModel = (DisplayFamily << 8) + (DisplayModel << 0); - - // Data from Intel® 64 and IA-32 Architectures, Volume 4, Chapter 2, Table 2-1 (CPUID Signature Values of DisplayFamily_DisplayModel) - if (DisplayFamily_DisplayModel == 0x067D) return CPU::CORE_ICE_LAKE; - if (DisplayFamily_DisplayModel == 0x067E) return CPU::CORE_ICE_LAKE; - if (DisplayFamily_DisplayModel == 0x068C) return CPU::CORE_TIGER_LAKE; - if (DisplayFamily_DisplayModel == 0x06A5) return CPU::CORE_COMET_LAKE; - if (DisplayFamily_DisplayModel == 0x06A6) return CPU::CORE_COMET_LAKE; - if (DisplayFamily_DisplayModel == 0x0666) return CPU::CORE_CANNON_LAKE; - if (DisplayFamily_DisplayModel == 
0x068E) return CPU::CORE_KABY_LAKE; - if (DisplayFamily_DisplayModel == 0x069E) return CPU::CORE_KABY_LAKE; - if (DisplayFamily_DisplayModel == 0x066A) return CPU::XEON_ICE_LAKE; - if (DisplayFamily_DisplayModel == 0x066C) return CPU::XEON_ICE_LAKE; - if (DisplayFamily_DisplayModel == 0x0655) return CPU::XEON_SKY_LAKE; - if (DisplayFamily_DisplayModel == 0x064E) return CPU::CORE_SKY_LAKE; - if (DisplayFamily_DisplayModel == 0x065E) return CPU::CORE_SKY_LAKE; - if (DisplayFamily_DisplayModel == 0x0656) return CPU::XEON_BROADWELL; - if (DisplayFamily_DisplayModel == 0x064F) return CPU::XEON_BROADWELL; - if (DisplayFamily_DisplayModel == 0x0647) return CPU::CORE_BROADWELL; - if (DisplayFamily_DisplayModel == 0x063D) return CPU::CORE_BROADWELL; - if (DisplayFamily_DisplayModel == 0x063F) return CPU::XEON_HASWELL; - if (DisplayFamily_DisplayModel == 0x063C) return CPU::CORE_HASWELL; - if (DisplayFamily_DisplayModel == 0x0645) return CPU::CORE_HASWELL; - if (DisplayFamily_DisplayModel == 0x0646) return CPU::CORE_HASWELL; - if (DisplayFamily_DisplayModel == 0x063E) return CPU::XEON_IVY_BRIDGE; - if (DisplayFamily_DisplayModel == 0x063A) return CPU::CORE_IVY_BRIDGE; - if (DisplayFamily_DisplayModel == 0x062D) return CPU::SANDY_BRIDGE; - if (DisplayFamily_DisplayModel == 0x062F) return CPU::SANDY_BRIDGE; - if (DisplayFamily_DisplayModel == 0x062A) return CPU::SANDY_BRIDGE; - if (DisplayFamily_DisplayModel == 0x062E) return CPU::NEHALEM; - if (DisplayFamily_DisplayModel == 0x0625) return CPU::NEHALEM; - if (DisplayFamily_DisplayModel == 0x062C) return CPU::NEHALEM; - if (DisplayFamily_DisplayModel == 0x061E) return CPU::NEHALEM; - if (DisplayFamily_DisplayModel == 0x061F) return CPU::NEHALEM; - if (DisplayFamily_DisplayModel == 0x061A) return CPU::NEHALEM; - if (DisplayFamily_DisplayModel == 0x061D) return CPU::NEHALEM; - if (DisplayFamily_DisplayModel == 0x0617) return CPU::CORE2; - if (DisplayFamily_DisplayModel == 0x060F) return CPU::CORE2; - if 
(DisplayFamily_DisplayModel == 0x060E) return CPU::CORE1; - - if (DisplayFamily_DisplayModel == 0x0685) return CPU::XEON_PHI_KNIGHTS_MILL; - if (DisplayFamily_DisplayModel == 0x0657) return CPU::XEON_PHI_KNIGHTS_LANDING; - - return CPU::UNKNOWN; - } - - std::string stringOfCPUModel(CPU model) - { - switch (model) { - case CPU::XEON_ICE_LAKE : return "Xeon Ice Lake"; - case CPU::CORE_ICE_LAKE : return "Core Ice Lake"; - case CPU::CORE_TIGER_LAKE : return "Core Tiger Lake"; - case CPU::CORE_COMET_LAKE : return "Core Comet Lake"; - case CPU::CORE_CANNON_LAKE : return "Core Cannon Lake"; - case CPU::CORE_KABY_LAKE : return "Core Kaby Lake"; - case CPU::XEON_SKY_LAKE : return "Xeon Sky Lake"; - case CPU::CORE_SKY_LAKE : return "Core Sky Lake"; - case CPU::XEON_PHI_KNIGHTS_MILL : return "Xeon Phi Knights Mill"; - case CPU::XEON_PHI_KNIGHTS_LANDING: return "Xeon Phi Knights Landing"; - case CPU::XEON_BROADWELL : return "Xeon Broadwell"; - case CPU::CORE_BROADWELL : return "Core Broadwell"; - case CPU::XEON_HASWELL : return "Xeon Haswell"; - case CPU::CORE_HASWELL : return "Core Haswell"; - case CPU::XEON_IVY_BRIDGE : return "Xeon Ivy Bridge"; - case CPU::CORE_IVY_BRIDGE : return "Core Ivy Bridge"; - case CPU::SANDY_BRIDGE : return "Sandy Bridge"; - case CPU::NEHALEM : return "Nehalem"; - case CPU::CORE2 : return "Core2"; - case CPU::CORE1 : return "Core"; - case CPU::ARM : return "Arm"; - case CPU::UNKNOWN : return "Unknown CPU"; - } - return "Unknown CPU (error)"; - } - -#if !defined(__ARM_NEON) - /* constants to access destination registers of CPUID instruction */ - static const int EAX = 0; - static const int EBX = 1; - static const int ECX = 2; - static const int EDX = 3; - - /* cpuid[eax=1].ecx */ - static const int CPU_FEATURE_BIT_SSE3 = 1 << 0; - static const int CPU_FEATURE_BIT_SSSE3 = 1 << 9; - static const int CPU_FEATURE_BIT_FMA3 = 1 << 12; - static const int CPU_FEATURE_BIT_SSE4_1 = 1 << 19; - static const int CPU_FEATURE_BIT_SSE4_2 = 1 << 20; - //static const 
int CPU_FEATURE_BIT_MOVBE = 1 << 22; - static const int CPU_FEATURE_BIT_POPCNT = 1 << 23; - //static const int CPU_FEATURE_BIT_XSAVE = 1 << 26; - static const int CPU_FEATURE_BIT_OXSAVE = 1 << 27; - static const int CPU_FEATURE_BIT_AVX = 1 << 28; - static const int CPU_FEATURE_BIT_F16C = 1 << 29; - static const int CPU_FEATURE_BIT_RDRAND = 1 << 30; - - /* cpuid[eax=1].edx */ - static const int CPU_FEATURE_BIT_SSE = 1 << 25; - static const int CPU_FEATURE_BIT_SSE2 = 1 << 26; - - /* cpuid[eax=0x80000001].ecx */ - static const int CPU_FEATURE_BIT_LZCNT = 1 << 5; - - /* cpuid[eax=7,ecx=0].ebx */ - static const int CPU_FEATURE_BIT_BMI1 = 1 << 3; - static const int CPU_FEATURE_BIT_AVX2 = 1 << 5; - static const int CPU_FEATURE_BIT_BMI2 = 1 << 8; - static const int CPU_FEATURE_BIT_AVX512F = 1 << 16; // AVX512F (foundation) - static const int CPU_FEATURE_BIT_AVX512DQ = 1 << 17; // AVX512DQ (doubleword and quadword instructions) - static const int CPU_FEATURE_BIT_AVX512PF = 1 << 26; // AVX512PF (prefetch gather/scatter instructions) - static const int CPU_FEATURE_BIT_AVX512ER = 1 << 27; // AVX512ER (exponential and reciprocal instructions) - static const int CPU_FEATURE_BIT_AVX512CD = 1 << 28; // AVX512CD (conflict detection instructions) - static const int CPU_FEATURE_BIT_AVX512BW = 1 << 30; // AVX512BW (byte and word instructions) - static const int CPU_FEATURE_BIT_AVX512VL = 1 << 31; // AVX512VL (vector length extensions) - static const int CPU_FEATURE_BIT_AVX512IFMA = 1 << 21; // AVX512IFMA (integer fused multiple-add instructions) - - /* cpuid[eax=7,ecx=0].ecx */ - static const int CPU_FEATURE_BIT_AVX512VBMI = 1 << 1; // AVX512VBMI (vector bit manipulation instructions) -#endif - -#if !defined(__ARM_NEON) - __noinline int64_t get_xcr0() - { - // https://github.com/opencv/opencv/blob/master/modules/core/src/system.cpp#L466 -#if defined (__WIN32__) && defined(_XCR_XFEATURE_ENABLED_MASK) - int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32 - 
xcr0 = _xgetbv(0); - return xcr0; -#else - int xcr0 = 0; - __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); - return xcr0; -#endif - } -#endif - - int getCPUFeatures() - { -#if defined(__ARM_NEON) - int cpu_features = CPU_FEATURE_NEON|CPU_FEATURE_SSE|CPU_FEATURE_SSE2; -#if defined(NEON_AVX2_EMULATION) - cpu_features |= CPU_FEATURE_SSE3|CPU_FEATURE_SSSE3|CPU_FEATURE_SSE42; - cpu_features |= CPU_FEATURE_XMM_ENABLED; - cpu_features |= CPU_FEATURE_YMM_ENABLED; - cpu_features |= CPU_FEATURE_SSE41 | CPU_FEATURE_RDRAND | CPU_FEATURE_F16C; - cpu_features |= CPU_FEATURE_POPCNT; - cpu_features |= CPU_FEATURE_AVX; - cpu_features |= CPU_FEATURE_AVX2; - cpu_features |= CPU_FEATURE_FMA3; - cpu_features |= CPU_FEATURE_LZCNT; - cpu_features |= CPU_FEATURE_BMI1; - cpu_features |= CPU_FEATURE_BMI2; - cpu_features |= CPU_FEATURE_NEON_2X; - - - -#endif - return cpu_features; - -#else - /* cache CPU features access */ - static int cpu_features = 0; - if (cpu_features) - return cpu_features; - - /* get number of CPUID leaves */ - int cpuid_leaf0[4]; - __cpuid(cpuid_leaf0, 0x00000000); - unsigned nIds = cpuid_leaf0[EAX]; - - /* get number of extended CPUID leaves */ - int cpuid_leafe[4]; - __cpuid(cpuid_leafe, 0x80000000); - unsigned nExIds = cpuid_leafe[EAX]; - - /* get CPUID leaves for EAX = 1,7, and 0x80000001 */ - int cpuid_leaf_1[4] = { 0,0,0,0 }; - int cpuid_leaf_7[4] = { 0,0,0,0 }; - int cpuid_leaf_e1[4] = { 0,0,0,0 }; - if (nIds >= 1) __cpuid (cpuid_leaf_1,0x00000001); -#if _WIN32 -#if _MSC_VER && (_MSC_FULL_VER < 160040219) -#else - if (nIds >= 7) __cpuidex(cpuid_leaf_7,0x00000007,0); -#endif -#else - if (nIds >= 7) __cpuid_count(cpuid_leaf_7,0x00000007,0); -#endif - if (nExIds >= 0x80000001) __cpuid(cpuid_leaf_e1,0x80000001); - - /* detect if OS saves XMM, YMM, and ZMM states */ - bool xmm_enabled = true; - bool ymm_enabled = false; - bool zmm_enabled = false; - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_OXSAVE) { - int64_t xcr0 = get_xcr0(); - xmm_enabled = ((xcr0 & 0x02) == 
0x02); /* checks if xmm are enabled in XCR0 */ - ymm_enabled = xmm_enabled && ((xcr0 & 0x04) == 0x04); /* checks if ymm state are enabled in XCR0 */ - zmm_enabled = ymm_enabled && ((xcr0 & 0xE0) == 0xE0); /* checks if OPMASK state, upper 256-bit of ZMM0-ZMM15 and ZMM16-ZMM31 state are enabled in XCR0 */ - } - if (xmm_enabled) cpu_features |= CPU_FEATURE_XMM_ENABLED; - if (ymm_enabled) cpu_features |= CPU_FEATURE_YMM_ENABLED; - if (zmm_enabled) cpu_features |= CPU_FEATURE_ZMM_ENABLED; - - if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE ) cpu_features |= CPU_FEATURE_SSE; - if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE2 ) cpu_features |= CPU_FEATURE_SSE2; - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE3 ) cpu_features |= CPU_FEATURE_SSE3; - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSSE3 ) cpu_features |= CPU_FEATURE_SSSE3; - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_1) cpu_features |= CPU_FEATURE_SSE41; - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_2) cpu_features |= CPU_FEATURE_SSE42; - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_POPCNT) cpu_features |= CPU_FEATURE_POPCNT; - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_AVX ) cpu_features |= CPU_FEATURE_AVX; - - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_F16C ) cpu_features |= CPU_FEATURE_F16C; - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_RDRAND) cpu_features |= CPU_FEATURE_RDRAND; - if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX2 ) cpu_features |= CPU_FEATURE_AVX2; - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_FMA3 ) cpu_features |= CPU_FEATURE_FMA3; - if (cpuid_leaf_e1[ECX] & CPU_FEATURE_BIT_LZCNT) cpu_features |= CPU_FEATURE_LZCNT; - if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI1 ) cpu_features |= CPU_FEATURE_BMI1; - if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI2 ) cpu_features |= CPU_FEATURE_BMI2; - - if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512F ) cpu_features |= CPU_FEATURE_AVX512F; - if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512DQ ) cpu_features |= CPU_FEATURE_AVX512DQ; - if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512PF ) cpu_features 
|= CPU_FEATURE_AVX512PF; - if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512ER ) cpu_features |= CPU_FEATURE_AVX512ER; - if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512CD ) cpu_features |= CPU_FEATURE_AVX512CD; - if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512BW ) cpu_features |= CPU_FEATURE_AVX512BW; - if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512IFMA) cpu_features |= CPU_FEATURE_AVX512IFMA; - if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512VL ) cpu_features |= CPU_FEATURE_AVX512VL; - if (cpuid_leaf_7[ECX] & CPU_FEATURE_BIT_AVX512VBMI) cpu_features |= CPU_FEATURE_AVX512VBMI; - - return cpu_features; -#endif - } - - std::string stringOfCPUFeatures(int features) - { - std::string str; - if (features & CPU_FEATURE_XMM_ENABLED) str += "XMM "; - if (features & CPU_FEATURE_YMM_ENABLED) str += "YMM "; - if (features & CPU_FEATURE_ZMM_ENABLED) str += "ZMM "; - if (features & CPU_FEATURE_SSE ) str += "SSE "; - if (features & CPU_FEATURE_SSE2 ) str += "SSE2 "; - if (features & CPU_FEATURE_SSE3 ) str += "SSE3 "; - if (features & CPU_FEATURE_SSSE3 ) str += "SSSE3 "; - if (features & CPU_FEATURE_SSE41 ) str += "SSE4.1 "; - if (features & CPU_FEATURE_SSE42 ) str += "SSE4.2 "; - if (features & CPU_FEATURE_POPCNT) str += "POPCNT "; - if (features & CPU_FEATURE_AVX ) str += "AVX "; - if (features & CPU_FEATURE_F16C ) str += "F16C "; - if (features & CPU_FEATURE_RDRAND) str += "RDRAND "; - if (features & CPU_FEATURE_AVX2 ) str += "AVX2 "; - if (features & CPU_FEATURE_FMA3 ) str += "FMA3 "; - if (features & CPU_FEATURE_LZCNT ) str += "LZCNT "; - if (features & CPU_FEATURE_BMI1 ) str += "BMI1 "; - if (features & CPU_FEATURE_BMI2 ) str += "BMI2 "; - if (features & CPU_FEATURE_AVX512F) str += "AVX512F "; - if (features & CPU_FEATURE_AVX512DQ) str += "AVX512DQ "; - if (features & CPU_FEATURE_AVX512PF) str += "AVX512PF "; - if (features & CPU_FEATURE_AVX512ER) str += "AVX512ER "; - if (features & CPU_FEATURE_AVX512CD) str += "AVX512CD "; - if (features & CPU_FEATURE_AVX512BW) str += "AVX512BW 
"; - if (features & CPU_FEATURE_AVX512VL) str += "AVX512VL "; - if (features & CPU_FEATURE_AVX512IFMA) str += "AVX512IFMA "; - if (features & CPU_FEATURE_AVX512VBMI) str += "AVX512VBMI "; - if (features & CPU_FEATURE_NEON) str += "NEON "; - if (features & CPU_FEATURE_NEON_2X) str += "2xNEON "; - return str; - } - - std::string stringOfISA (int isa) - { - if (isa == SSE) return "SSE"; - if (isa == SSE2) return "SSE2"; - if (isa == SSE3) return "SSE3"; - if (isa == SSSE3) return "SSSE3"; - if (isa == SSE41) return "SSE4.1"; - if (isa == SSE42) return "SSE4.2"; - if (isa == AVX) return "AVX"; - if (isa == AVX2) return "AVX2"; - if (isa == AVX512KNL) return "AVX512KNL"; - if (isa == AVX512SKX) return "AVX512SKX"; - if (isa == NEON) return "NEON"; - if (isa == NEON_2X) return "2xNEON"; - return "UNKNOWN"; - } - - bool hasISA(int features, int isa) { - return (features & isa) == isa; - } - - std::string supportedTargetList (int features) - { - std::string v; - if (hasISA(features,SSE)) v += "SSE "; - if (hasISA(features,SSE2)) v += "SSE2 "; - if (hasISA(features,SSE3)) v += "SSE3 "; - if (hasISA(features,SSSE3)) v += "SSSE3 "; - if (hasISA(features,SSE41)) v += "SSE4.1 "; - if (hasISA(features,SSE42)) v += "SSE4.2 "; - if (hasISA(features,AVX)) v += "AVX "; - if (hasISA(features,AVXI)) v += "AVXI "; - if (hasISA(features,AVX2)) v += "AVX2 "; - if (hasISA(features,AVX512KNL)) v += "AVX512KNL "; - if (hasISA(features,AVX512SKX)) v += "AVX512SKX "; - if (hasISA(features,NEON)) v += "NEON "; - if (hasISA(features,NEON_2X)) v += "2xNEON "; - return v; - } -} - -//////////////////////////////////////////////////////////////////////////////// -/// Windows Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__WIN32__) - -#define WIN32_LEAN_AND_MEAN -#include <windows.h> -#include <psapi.h> - -namespace embree -{ - std::string getExecutableFileName() { - char filename[1024]; - if (!GetModuleFileName(nullptr, filename, 
sizeof(filename))) - return std::string(); - return std::string(filename); - } - - unsigned int getNumberOfLogicalThreads() - { - static int nThreads = -1; - if (nThreads != -1) return nThreads; - - typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)(); - typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD); - HMODULE hlib = LoadLibrary("Kernel32"); - GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount"); - GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc) GetProcAddress(hlib, "GetActiveProcessorCount"); - - if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount) - { - int groups = pGetActiveProcessorGroupCount(); - int totalProcessors = 0; - for (int i = 0; i < groups; i++) - totalProcessors += pGetActiveProcessorCount(i); - nThreads = totalProcessors; - } - else - { - SYSTEM_INFO sysinfo; - GetSystemInfo(&sysinfo); - nThreads = sysinfo.dwNumberOfProcessors; - } - assert(nThreads); - return nThreads; - } - - int getTerminalWidth() - { - HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE); - if (handle == INVALID_HANDLE_VALUE) return 80; - CONSOLE_SCREEN_BUFFER_INFO info; - memset(&info,0,sizeof(info)); - GetConsoleScreenBufferInfo(handle, &info); - return info.dwSize.X; - } - - double getSeconds() - { - LARGE_INTEGER freq, val; - QueryPerformanceFrequency(&freq); - QueryPerformanceCounter(&val); - return (double)val.QuadPart / (double)freq.QuadPart; - } - - void sleepSeconds(double t) { - Sleep(DWORD(1000.0*t)); - } - - size_t getVirtualMemoryBytes() - { - PROCESS_MEMORY_COUNTERS info; - GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ); - return (size_t)info.QuotaPeakPagedPoolUsage; - } - - size_t getResidentMemoryBytes() - { - PROCESS_MEMORY_COUNTERS info; - GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ); - return (size_t)info.WorkingSetSize; - } -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -/// Linux Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__LINUX__) - -#include <stdio.h> -#include <unistd.h> - -namespace embree -{ - std::string getExecutableFileName() - { - std::string pid = "/proc/" + toString(getpid()) + "/exe"; - char buf[4096]; - memset(buf,0,sizeof(buf)); - if (readlink(pid.c_str(), buf, sizeof(buf)-1) == -1) - return std::string(); - return std::string(buf); - } - - size_t getVirtualMemoryBytes() - { - size_t virt, resident, shared; - std::ifstream buffer("/proc/self/statm"); - buffer >> virt >> resident >> shared; - return virt*sysconf(_SC_PAGE_SIZE); - } - - size_t getResidentMemoryBytes() - { - size_t virt, resident, shared; - std::ifstream buffer("/proc/self/statm"); - buffer >> virt >> resident >> shared; - return resident*sysconf(_SC_PAGE_SIZE); - } -} - -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// FreeBSD Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined (__FreeBSD__) - -#include <sys/sysctl.h> - -namespace embree -{ - std::string getExecutableFileName() - { - const int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1 }; - char buf[4096]; - memset(buf,0,sizeof(buf)); - size_t len = sizeof(buf)-1; - if (sysctl(mib, 4, buf, &len, 0x0, 0) == -1) - return std::string(); - return std::string(buf); - } - - size_t getVirtualMemoryBytes() { - return 0; - } - - size_t getResidentMemoryBytes() { - return 0; - } -} - -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Mac OS X Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__MACOSX__) - -#include <mach-o/dyld.h> - -namespace embree -{ - std::string getExecutableFileName() - { - char buf[4096]; - uint32_t size = sizeof(buf); - if 
(_NSGetExecutablePath(buf, &size) != 0) - return std::string(); - return std::string(buf); - } - - size_t getVirtualMemoryBytes() { - return 0; - } - - size_t getResidentMemoryBytes() { - return 0; - } -} - -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Unix Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__UNIX__) - -#include <unistd.h> -#include <sys/ioctl.h> -#include <sys/time.h> -#include <pthread.h> - -namespace embree -{ - unsigned int getNumberOfLogicalThreads() - { - static int nThreads = -1; - if (nThreads != -1) return nThreads; - -#if defined(__MACOSX__) || defined(__ANDROID__) - nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container - assert(nThreads); -#else - cpu_set_t set; - if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0) - nThreads = CPU_COUNT(&set); -#endif - - assert(nThreads); - return nThreads; - } - - int getTerminalWidth() - { - struct winsize info; - if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &info) < 0) return 80; - return info.ws_col; - } - - double getSeconds() { - struct timeval tp; gettimeofday(&tp,nullptr); - return double(tp.tv_sec) + double(tp.tv_usec)/1E6; - } - - void sleepSeconds(double t) { - usleep(1000000.0*t); - } -} -#endif - diff --git a/thirdparty/embree-aarch64/common/sys/sysinfo.h b/thirdparty/embree-aarch64/common/sys/sysinfo.h deleted file mode 100644 index 8e313a59b3..0000000000 --- a/thirdparty/embree-aarch64/common/sys/sysinfo.h +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#define CACHELINE_SIZE 64 - -#if !defined(PAGE_SIZE) - #define PAGE_SIZE 4096 -#endif - -#define PAGE_SIZE_2M (2*1024*1024) -#define PAGE_SIZE_4K (4*1024) - -#include "platform.h" - -/* define isa namespace and ISA bitvector */ -#if defined (__AVX512VL__) -# define isa avx512skx -# define ISA AVX512SKX -# define 
ISA_STR "AVX512SKX" -#elif defined (__AVX512F__) -# define isa avx512knl -# define ISA AVX512KNL -# define ISA_STR "AVX512KNL" -#elif defined (__AVX2__) -# define isa avx2 -# define ISA AVX2 -# define ISA_STR "AVX2" -#elif defined(__AVXI__) -# define isa avxi -# define ISA AVXI -# define ISA_STR "AVXI" -#elif defined(__AVX__) -# define isa avx -# define ISA AVX -# define ISA_STR "AVX" -#elif defined (__SSE4_2__) -# define isa sse42 -# define ISA SSE42 -# define ISA_STR "SSE4.2" -//#elif defined (__SSE4_1__) // we demote this to SSE2, MacOSX code compiles with SSE41 by default with XCode 11 -//# define isa sse41 -//# define ISA SSE41 -//# define ISA_STR "SSE4.1" -//#elif defined(__SSSE3__) // we demote this to SSE2, MacOSX code compiles with SSSE3 by default with ICC -//# define isa ssse3 -//# define ISA SSSE3 -//# define ISA_STR "SSSE3" -//#elif defined(__SSE3__) // we demote this to SSE2, MacOSX code compiles with SSE3 by default with clang -//# define isa sse3 -//# define ISA SSE3 -//# define ISA_STR "SSE3" -#elif defined(__SSE2__) || defined(__SSE3__) || defined(__SSSE3__) -# define isa sse2 -# define ISA SSE2 -# define ISA_STR "SSE2" -#elif defined(__SSE__) -# define isa sse -# define ISA SSE -# define ISA_STR "SSE" -#elif defined(__ARM_NEON) -// NOTE(LTE): Use sse2 for `isa` for the compatibility at the moment. -#define isa sse2 -#define ISA NEON -#define ISA_STR "NEON" -#else -#error Unknown ISA -#endif - -namespace embree -{ - enum class CPU - { - XEON_ICE_LAKE, - CORE_ICE_LAKE, - CORE_TIGER_LAKE, - CORE_COMET_LAKE, - CORE_CANNON_LAKE, - CORE_KABY_LAKE, - XEON_SKY_LAKE, - CORE_SKY_LAKE, - XEON_PHI_KNIGHTS_MILL, - XEON_PHI_KNIGHTS_LANDING, - XEON_BROADWELL, - CORE_BROADWELL, - XEON_HASWELL, - CORE_HASWELL, - XEON_IVY_BRIDGE, - CORE_IVY_BRIDGE, - SANDY_BRIDGE, - NEHALEM, - CORE2, - CORE1, - ARM, - UNKNOWN, - }; - - /*! get the full path to the running executable */ - std::string getExecutableFileName(); - - /*! 
return platform name */ - std::string getPlatformName(); - - /*! get the full name of the compiler */ - std::string getCompilerName(); - - /*! return the name of the CPU */ - std::string getCPUVendor(); - - /*! get microprocessor model */ - CPU getCPUModel(); - - /*! converts CPU model into string */ - std::string stringOfCPUModel(CPU model); - - /*! CPU features */ - static const int CPU_FEATURE_SSE = 1 << 0; - static const int CPU_FEATURE_SSE2 = 1 << 1; - static const int CPU_FEATURE_SSE3 = 1 << 2; - static const int CPU_FEATURE_SSSE3 = 1 << 3; - static const int CPU_FEATURE_SSE41 = 1 << 4; - static const int CPU_FEATURE_SSE42 = 1 << 5; - static const int CPU_FEATURE_POPCNT = 1 << 6; - static const int CPU_FEATURE_AVX = 1 << 7; - static const int CPU_FEATURE_F16C = 1 << 8; - static const int CPU_FEATURE_RDRAND = 1 << 9; - static const int CPU_FEATURE_AVX2 = 1 << 10; - static const int CPU_FEATURE_FMA3 = 1 << 11; - static const int CPU_FEATURE_LZCNT = 1 << 12; - static const int CPU_FEATURE_BMI1 = 1 << 13; - static const int CPU_FEATURE_BMI2 = 1 << 14; - static const int CPU_FEATURE_AVX512F = 1 << 16; - static const int CPU_FEATURE_AVX512DQ = 1 << 17; - static const int CPU_FEATURE_AVX512PF = 1 << 18; - static const int CPU_FEATURE_AVX512ER = 1 << 19; - static const int CPU_FEATURE_AVX512CD = 1 << 20; - static const int CPU_FEATURE_AVX512BW = 1 << 21; - static const int CPU_FEATURE_AVX512VL = 1 << 22; - static const int CPU_FEATURE_AVX512IFMA = 1 << 23; - static const int CPU_FEATURE_AVX512VBMI = 1 << 24; - static const int CPU_FEATURE_XMM_ENABLED = 1 << 25; - static const int CPU_FEATURE_YMM_ENABLED = 1 << 26; - static const int CPU_FEATURE_ZMM_ENABLED = 1 << 27; - static const int CPU_FEATURE_NEON = 1 << 28; - static const int CPU_FEATURE_NEON_2X = 1 << 29; - - /*! get CPU features */ - int getCPUFeatures(); - - /*! convert CPU features into a string */ - std::string stringOfCPUFeatures(int features); - - /*! 
creates a string of all supported targets that are supported */ - std::string supportedTargetList (int isa); - - /*! ISAs */ - static const int SSE = CPU_FEATURE_SSE | CPU_FEATURE_XMM_ENABLED; - static const int SSE2 = SSE | CPU_FEATURE_SSE2; - static const int SSE3 = SSE2 | CPU_FEATURE_SSE3; - static const int SSSE3 = SSE3 | CPU_FEATURE_SSSE3; - static const int SSE41 = SSSE3 | CPU_FEATURE_SSE41; - static const int SSE42 = SSE41 | CPU_FEATURE_SSE42 | CPU_FEATURE_POPCNT; - static const int AVX = SSE42 | CPU_FEATURE_AVX | CPU_FEATURE_YMM_ENABLED; - static const int AVXI = AVX | CPU_FEATURE_F16C | CPU_FEATURE_RDRAND; - static const int AVX2 = AVXI | CPU_FEATURE_AVX2 | CPU_FEATURE_FMA3 | CPU_FEATURE_BMI1 | CPU_FEATURE_BMI2 | CPU_FEATURE_LZCNT; - static const int AVX512KNL = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512PF | CPU_FEATURE_AVX512ER | CPU_FEATURE_AVX512CD | CPU_FEATURE_ZMM_ENABLED; - static const int AVX512SKX = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512DQ | CPU_FEATURE_AVX512CD | CPU_FEATURE_AVX512BW | CPU_FEATURE_AVX512VL | CPU_FEATURE_ZMM_ENABLED; - static const int NEON = CPU_FEATURE_NEON | CPU_FEATURE_SSE | CPU_FEATURE_SSE2; - static const int NEON_2X = CPU_FEATURE_NEON_2X | AVX2; - - /*! converts ISA bitvector into a string */ - std::string stringOfISA(int features); - - /*! return the number of logical threads of the system */ - unsigned int getNumberOfLogicalThreads(); - - /*! returns the size of the terminal window in characters */ - int getTerminalWidth(); - - /*! returns performance counter in seconds */ - double getSeconds(); - - /*! sleeps the specified number of seconds */ - void sleepSeconds(double t); - - /*! returns virtual address space occupied by process */ - size_t getVirtualMemoryBytes(); - - /*! 
returns resident memory required by process */ - size_t getResidentMemoryBytes(); -} diff --git a/thirdparty/embree-aarch64/common/sys/thread.cpp b/thirdparty/embree-aarch64/common/sys/thread.cpp deleted file mode 100644 index f9ea5b7d96..0000000000 --- a/thirdparty/embree-aarch64/common/sys/thread.cpp +++ /dev/null @@ -1,429 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "thread.h" -#include "sysinfo.h" -#include "string.h" - -#include <iostream> -#if defined(__ARM_NEON) -#include "../math/SSE2NEON.h" -#else -#include <xmmintrin.h> -#endif - -#if defined(PTHREADS_WIN32) -#pragma comment (lib, "pthreadVC.lib") -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Windows Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__WIN32__) - -#define WIN32_LEAN_AND_MEAN -#include <windows.h> - -namespace embree -{ - /*! set the affinity of a given thread */ - void setAffinity(HANDLE thread, ssize_t affinity) - { - typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)(); - typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD); - typedef BOOL (WINAPI *SetThreadGroupAffinityFunc)(HANDLE, const GROUP_AFFINITY *, PGROUP_AFFINITY); - typedef BOOL (WINAPI *SetThreadIdealProcessorExFunc)(HANDLE, PPROCESSOR_NUMBER, PPROCESSOR_NUMBER); - HMODULE hlib = LoadLibrary("Kernel32"); - GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount"); - GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc)GetProcAddress(hlib, "GetActiveProcessorCount"); - SetThreadGroupAffinityFunc pSetThreadGroupAffinity = (SetThreadGroupAffinityFunc)GetProcAddress(hlib, "SetThreadGroupAffinity"); - SetThreadIdealProcessorExFunc pSetThreadIdealProcessorEx = (SetThreadIdealProcessorExFunc)GetProcAddress(hlib, 
"SetThreadIdealProcessorEx"); - if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount && pSetThreadGroupAffinity && pSetThreadIdealProcessorEx) - { - int groups = pGetActiveProcessorGroupCount(); - int totalProcessors = 0, group = 0, number = 0; - for (int i = 0; i<groups; i++) { - int processors = pGetActiveProcessorCount(i); - if (totalProcessors + processors > affinity) { - group = i; - number = (int)affinity - totalProcessors; - break; - } - totalProcessors += processors; - } - - GROUP_AFFINITY groupAffinity; - groupAffinity.Group = (WORD)group; - groupAffinity.Mask = (KAFFINITY)(uint64_t(1) << number); - groupAffinity.Reserved[0] = 0; - groupAffinity.Reserved[1] = 0; - groupAffinity.Reserved[2] = 0; - if (!pSetThreadGroupAffinity(thread, &groupAffinity, nullptr)) - WARNING("SetThreadGroupAffinity failed"); // on purpose only a warning - - PROCESSOR_NUMBER processorNumber; - processorNumber.Group = group; - processorNumber.Number = number; - processorNumber.Reserved = 0; - if (!pSetThreadIdealProcessorEx(thread, &processorNumber, nullptr)) - WARNING("SetThreadIdealProcessorEx failed"); // on purpose only a warning - } - else - { - if (!SetThreadAffinityMask(thread, DWORD_PTR(uint64_t(1) << affinity))) - WARNING("SetThreadAffinityMask failed"); // on purpose only a warning - if (SetThreadIdealProcessor(thread, (DWORD)affinity) == (DWORD)-1) - WARNING("SetThreadIdealProcessor failed"); // on purpose only a warning - } - } - - /*! 
set affinity of the calling thread */ - void setAffinity(ssize_t affinity) { - setAffinity(GetCurrentThread(), affinity); - } - - struct ThreadStartupData - { - public: - ThreadStartupData (thread_func f, void* arg) - : f(f), arg(arg) {} - public: - thread_func f; - void* arg; - }; - - DWORD WINAPI threadStartup(LPVOID ptr) - { - ThreadStartupData* parg = (ThreadStartupData*) ptr; - _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); - parg->f(parg->arg); - delete parg; - parg = nullptr; - return 0; - } - -#if !defined(PTHREADS_WIN32) - - /*! creates a hardware thread running on specific core */ - thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID) - { - HANDLE thread = CreateThread(nullptr, stack_size, threadStartup, new ThreadStartupData(f,arg), 0, nullptr); - if (thread == nullptr) FATAL("CreateThread failed"); - if (threadID >= 0) setAffinity(thread, threadID); - return thread_t(thread); - } - - /*! the thread calling this function gets yielded */ - void yield() { - SwitchToThread(); - } - - /*! waits until the given thread has terminated */ - void join(thread_t tid) { - WaitForSingleObject(HANDLE(tid), INFINITE); - CloseHandle(HANDLE(tid)); - } - - /*! creates thread local storage */ - tls_t createTls() { - return tls_t(size_t(TlsAlloc())); - } - - /*! set the thread local storage pointer */ - void setTls(tls_t tls, void* const ptr) { - TlsSetValue(DWORD(size_t(tls)), ptr); - } - - /*! return the thread local storage pointer */ - void* getTls(tls_t tls) { - return TlsGetValue(DWORD(size_t(tls))); - } - - /*! 
destroys thread local storage identifier */ - void destroyTls(tls_t tls) { - TlsFree(DWORD(size_t(tls))); - } -#endif -} - -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Linux Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__LINUX__) - -#include <fstream> -#include <sstream> -#include <algorithm> - -#if defined(__ANDROID__) -#include <pthread.h> -#endif - -namespace embree -{ - static MutexSys mutex; - static std::vector<size_t> threadIDs; - -#if !defined(__ANDROID__) // TODO(LTE): Implement for Android target - /* changes thread ID mapping such that we first fill up all thread on one core */ - size_t mapThreadID(size_t threadID) - { - Lock<MutexSys> lock(mutex); - - if (threadIDs.size() == 0) - { - /* parse thread/CPU topology */ - for (size_t cpuID=0;;cpuID++) - { - std::fstream fs; - std::string cpu = std::string("/sys/devices/system/cpu/cpu") + std::to_string((long long)cpuID) + std::string("/topology/thread_siblings_list"); - fs.open (cpu.c_str(), std::fstream::in); - if (fs.fail()) break; - - int i; - while (fs >> i) - { - if (std::none_of(threadIDs.begin(),threadIDs.end(),[&] (int id) { return id == i; })) - threadIDs.push_back(i); - if (fs.peek() == ',') - fs.ignore(); - } - fs.close(); - } - -#if 0 - for (size_t i=0;i<threadIDs.size();i++) - std::cout << i << " -> " << threadIDs[i] << std::endl; -#endif - - /* verify the mapping and do not use it if the mapping has errors */ - for (size_t i=0;i<threadIDs.size();i++) { - for (size_t j=0;j<threadIDs.size();j++) { - if (i != j && threadIDs[i] == threadIDs[j]) { - threadIDs.clear(); - } - } - } - } - - /* re-map threadIDs if mapping is available */ - size_t ID = threadID; - if (threadID < threadIDs.size()) - ID = threadIDs[threadID]; - - /* find correct thread to affinitize to */ - cpu_set_t set; - if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0) - { - for (int i=0, j=0; 
i<CPU_SETSIZE; i++) - { - if (!CPU_ISSET(i,&set)) continue; - - if (j == ID) { - ID = i; - break; - } - j++; - } - } - - return ID; - } -#endif - - /*! set affinity of the calling thread */ - void setAffinity(ssize_t affinity) - { -#if defined(__ANDROID__) - // TODO(LTE): Implement -#else - cpu_set_t cset; - CPU_ZERO(&cset); - size_t threadID = mapThreadID(affinity); - CPU_SET(threadID, &cset); - - pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset); -#endif - } -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// FreeBSD Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__FreeBSD__) - -#include <pthread_np.h> - -namespace embree -{ - /*! set affinity of the calling thread */ - void setAffinity(ssize_t affinity) - { - cpuset_t cset; - CPU_ZERO(&cset); - CPU_SET(affinity, &cset); - - pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset); - } -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// MacOSX Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__MACOSX__) - -#include <mach/thread_act.h> -#include <mach/thread_policy.h> -#include <mach/mach_init.h> - -namespace embree -{ - /*! 
set affinity of the calling thread */ - void setAffinity(ssize_t affinity) - { - thread_affinity_policy ap; - ap.affinity_tag = affinity; - if (thread_policy_set(mach_thread_self(),THREAD_AFFINITY_POLICY,(thread_policy_t)&ap,THREAD_AFFINITY_POLICY_COUNT) != KERN_SUCCESS) - WARNING("setting thread affinity failed"); // on purpose only a warning - } -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Unix Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__UNIX__) || defined(PTHREADS_WIN32) - -#include <pthread.h> -#include <sched.h> - -#if defined(__USE_NUMA__) -#include <numa.h> -#endif - -namespace embree -{ - struct ThreadStartupData - { - public: - ThreadStartupData (thread_func f, void* arg, int affinity) - : f(f), arg(arg), affinity(affinity) {} - public: - thread_func f; - void* arg; - ssize_t affinity; - }; - - static void* threadStartup(ThreadStartupData* parg) - { - _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); - - /*! Mac OS X does not support setting affinity at thread creation time */ -#if defined(__MACOSX__) - if (parg->affinity >= 0) - setAffinity(parg->affinity); -#endif - - parg->f(parg->arg); - delete parg; - parg = nullptr; - return nullptr; - } - - /*! 
creates a hardware thread running on specific core */ - thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID) - { - /* set stack size */ - pthread_attr_t attr; - pthread_attr_init(&attr); - if (stack_size > 0) pthread_attr_setstacksize (&attr, stack_size); - - /* create thread */ - pthread_t* tid = new pthread_t; - if (pthread_create(tid,&attr,(void*(*)(void*))threadStartup,new ThreadStartupData(f,arg,threadID)) != 0) { - pthread_attr_destroy(&attr); - delete tid; - FATAL("pthread_create failed"); - } - pthread_attr_destroy(&attr); - - /* set affinity */ -#if defined(__LINUX__) && !defined(__ANDROID__) - if (threadID >= 0) { - cpu_set_t cset; - CPU_ZERO(&cset); - threadID = mapThreadID(threadID); - CPU_SET(threadID, &cset); - pthread_setaffinity_np(*tid, sizeof(cset), &cset); - } -#elif defined(__FreeBSD__) - if (threadID >= 0) { - cpuset_t cset; - CPU_ZERO(&cset); - CPU_SET(threadID, &cset); - pthread_setaffinity_np(*tid, sizeof(cset), &cset); - } -#endif - - return thread_t(tid); - } - - /*! the thread calling this function gets yielded */ - void yield() { - sched_yield(); - } - - /*! waits until the given thread has terminated */ - void join(thread_t tid) { - if (pthread_join(*(pthread_t*)tid, nullptr) != 0) - FATAL("pthread_join failed"); - delete (pthread_t*)tid; - } - - /*! creates thread local storage */ - tls_t createTls() - { - pthread_key_t* key = new pthread_key_t; - if (pthread_key_create(key,nullptr) != 0) { - delete key; - FATAL("pthread_key_create failed"); - } - - return tls_t(key); - } - - /*! return the thread local storage pointer */ - void* getTls(tls_t tls) - { - assert(tls); - return pthread_getspecific(*(pthread_key_t*)tls); - } - - /*! set the thread local storage pointer */ - void setTls(tls_t tls, void* const ptr) - { - assert(tls); - if (pthread_setspecific(*(pthread_key_t*)tls, ptr) != 0) - FATAL("pthread_setspecific failed"); - } - - /*! 
destroys thread local storage identifier */ - void destroyTls(tls_t tls) - { - assert(tls); - if (pthread_key_delete(*(pthread_key_t*)tls) != 0) - FATAL("pthread_key_delete failed"); - delete (pthread_key_t*)tls; - } -} - -#endif diff --git a/thirdparty/embree-aarch64/common/sys/thread.h b/thirdparty/embree-aarch64/common/sys/thread.h deleted file mode 100644 index 45da6e6a70..0000000000 --- a/thirdparty/embree-aarch64/common/sys/thread.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "platform.h" -#include "mutex.h" -#include "alloc.h" -#include "vector.h" -#include <vector> - -namespace embree -{ - /*! type for thread */ - typedef struct opaque_thread_t* thread_t; - - /*! signature of thread start function */ - typedef void (*thread_func)(void*); - - /*! creates a hardware thread running on specific logical thread */ - thread_t createThread(thread_func f, void* arg, size_t stack_size = 0, ssize_t threadID = -1); - - /*! set affinity of the calling thread */ - void setAffinity(ssize_t affinity); - - /*! the thread calling this function gets yielded */ - void yield(); - - /*! waits until the given thread has terminated */ - void join(thread_t tid); - - /*! type for handle to thread local storage */ - typedef struct opaque_tls_t* tls_t; - - /*! creates thread local storage */ - tls_t createTls(); - - /*! set the thread local storage pointer */ - void setTls(tls_t tls, void* const ptr); - - /*! return the thread local storage pointer */ - void* getTls(tls_t tls); - - /*! 
destroys thread local storage identifier */ - void destroyTls(tls_t tls); -} diff --git a/thirdparty/embree-aarch64/common/sys/vector.h b/thirdparty/embree-aarch64/common/sys/vector.h deleted file mode 100644 index e41794de7c..0000000000 --- a/thirdparty/embree-aarch64/common/sys/vector.h +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "alloc.h" -#include <algorithm> - -namespace embree -{ - template<typename T, typename allocator> - class vector_t - { - public: - typedef T value_type; - typedef T* iterator; - typedef const T* const_iterator; - - __forceinline vector_t () - : size_active(0), size_alloced(0), items(nullptr) {} - - __forceinline explicit vector_t (size_t sz) - : size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); } - - template<typename M> - __forceinline explicit vector_t (M alloc, size_t sz) - : alloc(alloc), size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); } - - __forceinline ~vector_t() { - clear(); - } - - __forceinline vector_t (const vector_t& other) - { - size_active = other.size_active; - size_alloced = other.size_alloced; - items = alloc.allocate(size_alloced); - for (size_t i=0; i<size_active; i++) - ::new (&items[i]) value_type(other.items[i]); - } - - __forceinline vector_t (vector_t&& other) - : alloc(std::move(other.alloc)) - { - size_active = other.size_active; other.size_active = 0; - size_alloced = other.size_alloced; other.size_alloced = 0; - items = other.items; other.items = nullptr; - } - - __forceinline vector_t& operator=(const vector_t& other) - { - resize(other.size_active); - for (size_t i=0; i<size_active; i++) - items[i] = value_type(other.items[i]); - return *this; - } - - __forceinline vector_t& operator=(vector_t&& other) - { - clear(); - alloc = std::move(other.alloc); - size_active = other.size_active; other.size_active = 0; - size_alloced = other.size_alloced; 
other.size_alloced = 0; - items = other.items; other.items = nullptr; - return *this; - } - - /********************** Iterators ****************************/ - - __forceinline iterator begin() { return items; }; - __forceinline const_iterator begin() const { return items; }; - - __forceinline iterator end () { return items+size_active; }; - __forceinline const_iterator end () const { return items+size_active; }; - - - /********************** Capacity ****************************/ - - __forceinline bool empty () const { return size_active == 0; } - __forceinline size_t size () const { return size_active; } - __forceinline size_t capacity () const { return size_alloced; } - - - __forceinline void resize(size_t new_size) { - internal_resize(new_size,internal_grow_size(new_size)); - } - - __forceinline void reserve(size_t new_alloced) - { - /* do nothing if container already large enough */ - if (new_alloced <= size_alloced) - return; - - /* resize exact otherwise */ - internal_resize(size_active,new_alloced); - } - - __forceinline void shrink_to_fit() { - internal_resize(size_active,size_active); - } - - /******************** Element access **************************/ - - __forceinline T& operator[](size_t i) { assert(i < size_active); return items[i]; } - __forceinline const T& operator[](size_t i) const { assert(i < size_active); return items[i]; } - - __forceinline T& at(size_t i) { assert(i < size_active); return items[i]; } - __forceinline const T& at(size_t i) const { assert(i < size_active); return items[i]; } - - __forceinline T& front() const { assert(size_active > 0); return items[0]; }; - __forceinline T& back () const { assert(size_active > 0); return items[size_active-1]; }; - - __forceinline T* data() { return items; }; - __forceinline const T* data() const { return items; }; - - - /******************** Modifiers **************************/ - - __forceinline void push_back(const T& nt) - { - const T v = nt; // need local copy as input reference could 
point to this vector - internal_resize(size_active,internal_grow_size(size_active+1)); - ::new (&items[size_active++]) T(v); - } - - __forceinline void pop_back() - { - assert(!empty()); - size_active--; - alloc.destroy(&items[size_active]); - } - - __forceinline void clear() - { - /* destroy elements */ - for (size_t i=0; i<size_active; i++) - alloc.destroy(&items[i]); - - /* free memory */ - alloc.deallocate(items,size_alloced); - items = nullptr; - size_active = size_alloced = 0; - } - - /******************** Comparisons **************************/ - - friend bool operator== (const vector_t& a, const vector_t& b) - { - if (a.size() != b.size()) return false; - for (size_t i=0; i<a.size(); i++) - if (a[i] != b[i]) - return false; - return true; - } - - friend bool operator!= (const vector_t& a, const vector_t& b) { - return !(a==b); - } - - private: - - __forceinline void internal_resize_init(size_t new_active) - { - assert(size_active == 0); - assert(size_alloced == 0); - assert(items == nullptr); - if (new_active == 0) return; - items = alloc.allocate(new_active); - for (size_t i=0; i<new_active; i++) ::new (&items[i]) T(); - size_active = new_active; - size_alloced = new_active; - } - - __forceinline void internal_resize(size_t new_active, size_t new_alloced) - { - assert(new_active <= new_alloced); - - /* destroy elements */ - if (new_active < size_active) - { - for (size_t i=new_active; i<size_active; i++) - alloc.destroy(&items[i]); - size_active = new_active; - } - - /* only reallocate if necessary */ - if (new_alloced == size_alloced) { - for (size_t i=size_active; i<new_active; i++) ::new (&items[i]) T; - size_active = new_active; - return; - } - - /* reallocate and copy items */ - T* old_items = items; - items = alloc.allocate(new_alloced); - for (size_t i=0; i<size_active; i++) { - ::new (&items[i]) T(std::move(old_items[i])); - alloc.destroy(&old_items[i]); - } - - for (size_t i=size_active; i<new_active; i++) { - ::new (&items[i]) T; - } - - 
alloc.deallocate(old_items,size_alloced); - size_active = new_active; - size_alloced = new_alloced; - } - - __forceinline size_t internal_grow_size(size_t new_alloced) - { - /* do nothing if container already large enough */ - if (new_alloced <= size_alloced) - return size_alloced; - - /* resize to next power of 2 otherwise */ - size_t new_size_alloced = size_alloced; - while (new_size_alloced < new_alloced) { - new_size_alloced = std::max(size_t(1),2*new_size_alloced); - } - return new_size_alloced; - } - - private: - allocator alloc; - size_t size_active; // number of valid items - size_t size_alloced; // number of items allocated - T* items; // data array - }; - - /*! vector class that performs standard allocations */ - template<typename T> - using vector = vector_t<T,std::allocator<T>>; - - /*! vector class that performs aligned allocations */ - template<typename T> - using avector = vector_t<T,aligned_allocator<T,std::alignment_of<T>::value> >; - - /*! vector class that performs OS allocations */ - template<typename T> - using ovector = vector_t<T,os_allocator<T> >; -} diff --git a/thirdparty/embree-aarch64/common/tasking/taskscheduler.h b/thirdparty/embree-aarch64/common/tasking/taskscheduler.h deleted file mode 100644 index 9940e068d0..0000000000 --- a/thirdparty/embree-aarch64/common/tasking/taskscheduler.h +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#if defined(TASKING_INTERNAL) -# include "taskschedulerinternal.h" -#elif defined(TASKING_GCD) && defined(BUILD_IOS) -# include "taskschedulergcd.h" -#elif defined(TASKING_TBB) -# include "taskschedulertbb.h" -#elif defined(TASKING_PPL) -# include "taskschedulerppl.h" -#else -# error "no tasking system enabled" -#endif - diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h b/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h deleted file mode 100644 index d31f8bb478..0000000000 --- 
a/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h +++ /dev/null @@ -1,49 +0,0 @@ -#pragma once - -#include "../sys/platform.h" -#include "../sys/alloc.h" -#include "../sys/barrier.h" -#include "../sys/thread.h" -#include "../sys/mutex.h" -#include "../sys/condition.h" -#include "../sys/ref.h" - -#include <dispatch/dispatch.h> - -namespace embree -{ - struct TaskScheduler - { - /*! initializes the task scheduler */ - static void create(size_t numThreads, bool set_affinity, bool start_threads); - - /*! destroys the task scheduler again */ - static void destroy() {} - - /* returns the ID of the current thread */ - static __forceinline size_t threadID() - { - return threadIndex(); - } - - /* returns the index (0..threadCount-1) of the current thread */ - static __forceinline size_t threadIndex() - { - currentThreadIndex = (currentThreadIndex + 1) % GCDNumThreads; - return currentThreadIndex; - } - - /* returns the total number of threads */ - static __forceinline size_t threadCount() - { - return GCDNumThreads; - } - - private: - static size_t GCDNumThreads; - static size_t currentThreadIndex; - - }; - -}; - diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp deleted file mode 100644 index ebf656d1a0..0000000000 --- a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp +++ /dev/null @@ -1,426 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "taskschedulerinternal.h" -#include "../math/math.h" -#include "../sys/sysinfo.h" -#include <algorithm> - -namespace embree -{ - RTC_NAMESPACE_BEGIN - - static MutexSys g_mutex; - size_t TaskScheduler::g_numThreads = 0; - __thread TaskScheduler* TaskScheduler::g_instance = nullptr; - std::vector<Ref<TaskScheduler>> g_instance_vector; - __thread TaskScheduler::Thread* TaskScheduler::thread_local_thread = nullptr; - TaskScheduler::ThreadPool* 
TaskScheduler::threadPool = nullptr; - - template<typename Predicate, typename Body> - __forceinline void TaskScheduler::steal_loop(Thread& thread, const Predicate& pred, const Body& body) - { - while (true) - { - /*! some rounds that yield */ - for (size_t i=0; i<32; i++) - { - /*! some spinning rounds */ - const size_t threadCount = thread.threadCount(); - for (size_t j=0; j<1024; j+=threadCount) - { - if (!pred()) return; - if (thread.scheduler->steal_from_other_threads(thread)) { - i=j=0; - body(); - } - } - yield(); - } - } - } - - /*! run this task */ - void TaskScheduler::Task::run_internal (Thread& thread) // FIXME: avoid as many dll_exports as possible - { - /* try to run if not already stolen */ - if (try_switch_state(INITIALIZED,DONE)) - { - Task* prevTask = thread.task; - thread.task = this; - // -- GODOT start -- - // try { - // if (thread.scheduler->cancellingException == nullptr) - closure->execute(); - // } catch (...) { - // if (thread.scheduler->cancellingException == nullptr) - // thread.scheduler->cancellingException = std::current_exception(); - // } - // -- GODOT end -- - thread.task = prevTask; - add_dependencies(-1); - } - - /* steal until all dependencies have completed */ - steal_loop(thread, - [&] () { return dependencies>0; }, - [&] () { while (thread.tasks.execute_local_internal(thread,this)); }); - - /* now signal our parent task that we are finished */ - if (parent) - parent->add_dependencies(-1); - } - - /*! 
run this task */ - dll_export void TaskScheduler::Task::run (Thread& thread) { - run_internal(thread); - } - - bool TaskScheduler::TaskQueue::execute_local_internal(Thread& thread, Task* parent) - { - /* stop if we run out of local tasks or reach the waiting task */ - if (right == 0 || &tasks[right-1] == parent) - return false; - - /* execute task */ - size_t oldRight = right; - tasks[right-1].run_internal(thread); - if (right != oldRight) { - THROW_RUNTIME_ERROR("you have to wait for spawned subtasks"); - } - - /* pop task and closure from stack */ - right--; - if (tasks[right].stackPtr != size_t(-1)) - stackPtr = tasks[right].stackPtr; - - /* also move left pointer */ - if (left >= right) left.store(right.load()); - - return right != 0; - } - - dll_export bool TaskScheduler::TaskQueue::execute_local(Thread& thread, Task* parent) { - return execute_local_internal(thread,parent); - } - - bool TaskScheduler::TaskQueue::steal(Thread& thread) - { - size_t l = left; - size_t r = right; - if (l < r) - { - l = left++; - if (l >= r) - return false; - } - else - return false; - - if (!tasks[l].try_steal(thread.tasks.tasks[thread.tasks.right])) - return false; - - thread.tasks.right++; - return true; - } - - /* we steal from the left */ - size_t TaskScheduler::TaskQueue::getTaskSizeAtLeft() - { - if (left >= right) return 0; - return tasks[left].N; - } - - void threadPoolFunction(std::pair<TaskScheduler::ThreadPool*,size_t>* pair) - { - TaskScheduler::ThreadPool* pool = pair->first; - size_t threadIndex = pair->second; - delete pair; - pool->thread_loop(threadIndex); - } - - TaskScheduler::ThreadPool::ThreadPool(bool set_affinity) - : numThreads(0), numThreadsRunning(0), set_affinity(set_affinity), running(false) {} - - dll_export void TaskScheduler::ThreadPool::startThreads() - { - if (running) return; - setNumThreads(numThreads,true); - } - - void TaskScheduler::ThreadPool::setNumThreads(size_t newNumThreads, bool startThreads) - { - Lock<MutexSys> lock(g_mutex); - 
assert(newNumThreads); - newNumThreads = min(newNumThreads, (size_t) getNumberOfLogicalThreads()); - - // We are observing a few % gain by increasing number threads by 2 on aarch64. -#if defined(__aarch64__) && defined(BUILD_IOS) - numThreads = newNumThreads*2; -#else - numThreads = newNumThreads; -#endif - numThreads = newNumThreads; - if (!startThreads && !running) return; - running = true; - size_t numThreadsActive = numThreadsRunning; - - mutex.lock(); - numThreadsRunning = newNumThreads; - mutex.unlock(); - condition.notify_all(); - - /* start new threads */ - for (size_t t=numThreadsActive; t<numThreads; t++) - { - if (t == 0) continue; - auto pair = new std::pair<TaskScheduler::ThreadPool*,size_t>(this,t); - threads.push_back(createThread((thread_func)threadPoolFunction,pair,4*1024*1024,set_affinity ? t : -1)); - } - - /* stop some threads if we reduce the number of threads */ - for (ssize_t t=numThreadsActive-1; t>=ssize_t(numThreadsRunning); t--) { - if (t == 0) continue; - embree::join(threads.back()); - threads.pop_back(); - } - } - - TaskScheduler::ThreadPool::~ThreadPool() - { - /* leave all taskschedulers */ - mutex.lock(); - numThreadsRunning = 0; - mutex.unlock(); - condition.notify_all(); - - /* wait for threads to terminate */ - for (size_t i=0; i<threads.size(); i++) - embree::join(threads[i]); - } - - dll_export void TaskScheduler::ThreadPool::add(const Ref<TaskScheduler>& scheduler) - { - mutex.lock(); - schedulers.push_back(scheduler); - mutex.unlock(); - condition.notify_all(); - } - - dll_export void TaskScheduler::ThreadPool::remove(const Ref<TaskScheduler>& scheduler) - { - Lock<MutexSys> lock(mutex); - for (std::list<Ref<TaskScheduler> >::iterator it = schedulers.begin(); it != schedulers.end(); it++) { - if (scheduler == *it) { - schedulers.erase(it); - return; - } - } - } - - void TaskScheduler::ThreadPool::thread_loop(size_t globalThreadIndex) - { - while (globalThreadIndex < numThreadsRunning) - { - Ref<TaskScheduler> scheduler = 
NULL; - ssize_t threadIndex = -1; - { - Lock<MutexSys> lock(mutex); - condition.wait(mutex, [&] () { return globalThreadIndex >= numThreadsRunning || !schedulers.empty(); }); - if (globalThreadIndex >= numThreadsRunning) break; - scheduler = schedulers.front(); - threadIndex = scheduler->allocThreadIndex(); - } - scheduler->thread_loop(threadIndex); - } - } - - TaskScheduler::TaskScheduler() - : threadCounter(0), anyTasksRunning(0), hasRootTask(false) - { - threadLocal.resize(2*getNumberOfLogicalThreads()); // FIXME: this has to be 2x as in the compatibility join mode with rtcCommitScene the worker threads also join. When disallowing rtcCommitScene to join a build we can remove the 2x. - for (size_t i=0; i<threadLocal.size(); i++) - threadLocal[i].store(nullptr); - } - - TaskScheduler::~TaskScheduler() - { - assert(threadCounter == 0); - } - - dll_export size_t TaskScheduler::threadID() - { - Thread* thread = TaskScheduler::thread(); - if (thread) return thread->threadIndex; - else return 0; - } - - dll_export size_t TaskScheduler::threadIndex() - { - Thread* thread = TaskScheduler::thread(); - if (thread) return thread->threadIndex; - else return 0; - } - - dll_export size_t TaskScheduler::threadCount() { - return threadPool->size(); - } - - dll_export TaskScheduler* TaskScheduler::instance() - { - if (g_instance == NULL) { - Lock<MutexSys> lock(g_mutex); - g_instance = new TaskScheduler; - g_instance_vector.push_back(g_instance); - } - return g_instance; - } - - void TaskScheduler::create(size_t numThreads, bool set_affinity, bool start_threads) - { - if (!threadPool) threadPool = new TaskScheduler::ThreadPool(set_affinity); - threadPool->setNumThreads(numThreads,start_threads); - } - - void TaskScheduler::destroy() { - delete threadPool; threadPool = nullptr; - } - - dll_export ssize_t TaskScheduler::allocThreadIndex() - { - size_t threadIndex = threadCounter++; - assert(threadIndex < threadLocal.size()); - return threadIndex; - } - - void TaskScheduler::join() 
- { - mutex.lock(); - size_t threadIndex = allocThreadIndex(); - condition.wait(mutex, [&] () { return hasRootTask.load(); }); - mutex.unlock(); - // -- GODOT start -- - // std::exception_ptr except = thread_loop(threadIndex); - // if (except != nullptr) std::rethrow_exception(except); - thread_loop(threadIndex); - // -- GODOT end -- - } - - void TaskScheduler::reset() { - hasRootTask = false; - } - - void TaskScheduler::wait_for_threads(size_t threadCount) - { - while (threadCounter < threadCount-1) - pause_cpu(); - } - - dll_export TaskScheduler::Thread* TaskScheduler::thread() { - return thread_local_thread; - } - - dll_export TaskScheduler::Thread* TaskScheduler::swapThread(Thread* thread) - { - Thread* old = thread_local_thread; - thread_local_thread = thread; - return old; - } - - dll_export bool TaskScheduler::wait() - { - Thread* thread = TaskScheduler::thread(); - if (thread == nullptr) return true; - while (thread->tasks.execute_local_internal(*thread,thread->task)) {}; - return thread->scheduler->cancellingException == nullptr; - } - -// -- GODOT start -- -// std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex) - void TaskScheduler::thread_loop(size_t threadIndex) -// -- GODOT end -- - { - /* allocate thread structure */ - std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation - Thread& thread = *mthread; - threadLocal[threadIndex].store(&thread); - Thread* oldThread = swapThread(&thread); - - /* main thread loop */ - while (anyTasksRunning) - { - steal_loop(thread, - [&] () { return anyTasksRunning > 0; }, - [&] () { - anyTasksRunning++; - while (thread.tasks.execute_local_internal(thread,nullptr)); - anyTasksRunning--; - }); - } - threadLocal[threadIndex].store(nullptr); - swapThread(oldThread); - - /* remember exception to throw */ - // -- GODOT start -- - // std::exception_ptr except = nullptr; - // if (cancellingException != nullptr) except = cancellingException; - // -- GODOT end -- - /* wait 
for all threads to terminate */ - threadCounter--; -#if defined(__WIN32__) - size_t loopIndex = 1; -#endif -#define LOOP_YIELD_THRESHOLD (4096) - while (threadCounter > 0) { -#if defined(__WIN32__) - if ((loopIndex % LOOP_YIELD_THRESHOLD) == 0) - yield(); - else - _mm_pause(); - loopIndex++; -#else - yield(); -#endif - } - // -- GODOT start -- - // return except; - return; - // -- GODOT end -- - } - - bool TaskScheduler::steal_from_other_threads(Thread& thread) - { - const size_t threadIndex = thread.threadIndex; - const size_t threadCount = this->threadCounter; - - for (size_t i=1; i<threadCount; i++) - { - pause_cpu(32); - size_t otherThreadIndex = threadIndex+i; - if (otherThreadIndex >= threadCount) otherThreadIndex -= threadCount; - - Thread* othread = threadLocal[otherThreadIndex].load(); - if (!othread) - continue; - - if (othread->tasks.steal(thread)) - return true; - } - - return false; - } - - dll_export void TaskScheduler::startThreads() { - threadPool->startThreads(); - } - - dll_export void TaskScheduler::addScheduler(const Ref<TaskScheduler>& scheduler) { - threadPool->add(scheduler); - } - - dll_export void TaskScheduler::removeScheduler(const Ref<TaskScheduler>& scheduler) { - threadPool->remove(scheduler); - } - - RTC_NAMESPACE_END -} diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h deleted file mode 100644 index 8bd70b2b8c..0000000000 --- a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h +++ /dev/null @@ -1,386 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" -#include "../sys/alloc.h" -#include "../sys/barrier.h" -#include "../sys/thread.h" -#include "../sys/mutex.h" -#include "../sys/condition.h" -#include "../sys/ref.h" -#include "../sys/atomic.h" -#include "../math/range.h" -#include "../../include/embree3/rtcore.h" - -#include <list> - 
-namespace embree -{ - - /* The tasking system exports some symbols to be used by the tutorials. Thus we - hide is also in the API namespace when requested. */ - RTC_NAMESPACE_BEGIN - - struct TaskScheduler : public RefCount - { - ALIGNED_STRUCT_(64); - friend class Device; - - static const size_t TASK_STACK_SIZE = 4*1024; //!< task structure stack - static const size_t CLOSURE_STACK_SIZE = 512*1024; //!< stack for task closures - - struct Thread; - - /*! virtual interface for all tasks */ - struct TaskFunction { - virtual void execute() = 0; - }; - - /*! builds a task interface from a closure */ - template<typename Closure> - struct ClosureTaskFunction : public TaskFunction - { - Closure closure; - __forceinline ClosureTaskFunction (const Closure& closure) : closure(closure) {} - void execute() { closure(); }; - }; - - struct __aligned(64) Task - { - /*! states a task can be in */ - enum { DONE, INITIALIZED }; - - /*! switch from one state to another */ - __forceinline void switch_state(int from, int to) - { - __memory_barrier(); - MAYBE_UNUSED bool success = state.compare_exchange_strong(from,to); - assert(success); - } - - /*! try to switch from one state to another */ - __forceinline bool try_switch_state(int from, int to) { - __memory_barrier(); - return state.compare_exchange_strong(from,to); - } - - /*! increment/decrement dependency counter */ - void add_dependencies(int n) { - dependencies+=n; - } - - /*! initialize all tasks to DONE state by default */ - __forceinline Task() - : state(DONE) {} - - /*! construction of new task */ - __forceinline Task (TaskFunction* closure, Task* parent, size_t stackPtr, size_t N) - : dependencies(1), stealable(true), closure(closure), parent(parent), stackPtr(stackPtr), N(N) - { - if (parent) parent->add_dependencies(+1); - switch_state(DONE,INITIALIZED); - } - - /*! 
construction of stolen task, stealing thread will decrement initial dependency */ - __forceinline Task (TaskFunction* closure, Task* parent) - : dependencies(1), stealable(false), closure(closure), parent(parent), stackPtr(-1), N(1) - { - switch_state(DONE,INITIALIZED); - } - - /*! try to steal this task */ - bool try_steal(Task& child) - { - if (!stealable) return false; - if (!try_switch_state(INITIALIZED,DONE)) return false; - new (&child) Task(closure, this); - return true; - } - - /*! run this task */ - dll_export void run(Thread& thread); - - void run_internal(Thread& thread); - - public: - std::atomic<int> state; //!< state this task is in - std::atomic<int> dependencies; //!< dependencies to wait for - std::atomic<bool> stealable; //!< true if task can be stolen - TaskFunction* closure; //!< the closure to execute - Task* parent; //!< parent task to signal when we are finished - size_t stackPtr; //!< stack location where closure is stored - size_t N; //!< approximative size of task - }; - - struct TaskQueue - { - TaskQueue () - : left(0), right(0), stackPtr(0) {} - - __forceinline void* alloc(size_t bytes, size_t align = 64) - { - size_t ofs = bytes + ((align - stackPtr) & (align-1)); - if (stackPtr + ofs > CLOSURE_STACK_SIZE) - // -- GODOT start -- - // throw std::runtime_error("closure stack overflow"); - abort(); - // -- GODOT end -- - stackPtr += ofs; - return &stack[stackPtr-bytes]; - } - - template<typename Closure> - __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure) - { - if (right >= TASK_STACK_SIZE) - // -- GODOT start -- - // throw std::runtime_error("task stack overflow"); - abort(); - // -- GODOT end -- - - /* allocate new task on right side of stack */ - size_t oldStackPtr = stackPtr; - TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction<Closure>))) ClosureTaskFunction<Closure>(closure); - /* gcc 8 or later fails to compile without explicit .load() */ - new (&(tasks[right.load()])) 
Task(func,thread.task,oldStackPtr,size); - right++; - - /* also move left pointer */ - if (left >= right-1) left = right-1; - } - - dll_export bool execute_local(Thread& thread, Task* parent); - bool execute_local_internal(Thread& thread, Task* parent); - bool steal(Thread& thread); - size_t getTaskSizeAtLeft(); - - bool empty() { return right == 0; } - - public: - - /* task stack */ - Task tasks[TASK_STACK_SIZE]; - __aligned(64) std::atomic<size_t> left; //!< threads steal from left - __aligned(64) std::atomic<size_t> right; //!< new tasks are added to the right - - /* closure stack */ - __aligned(64) char stack[CLOSURE_STACK_SIZE]; - size_t stackPtr; - }; - - /*! thread local structure for each thread */ - struct Thread - { - ALIGNED_STRUCT_(64); - - Thread (size_t threadIndex, const Ref<TaskScheduler>& scheduler) - : threadIndex(threadIndex), task(nullptr), scheduler(scheduler) {} - - __forceinline size_t threadCount() { - return scheduler->threadCounter; - } - - size_t threadIndex; //!< ID of this thread - TaskQueue tasks; //!< local task queue - Task* task; //!< current active task - Ref<TaskScheduler> scheduler; //!< pointer to task scheduler - }; - - /*! pool of worker threads */ - struct ThreadPool - { - ThreadPool (bool set_affinity); - ~ThreadPool (); - - /*! starts the threads */ - dll_export void startThreads(); - - /*! sets number of threads to use */ - void setNumThreads(size_t numThreads, bool startThreads = false); - - /*! adds a task scheduler object for scheduling */ - dll_export void add(const Ref<TaskScheduler>& scheduler); - - /*! remove the task scheduler object again */ - dll_export void remove(const Ref<TaskScheduler>& scheduler); - - /*! returns number of threads of the thread pool */ - size_t size() const { return numThreads; } - - /*! 
main loop for all threads */ - void thread_loop(size_t threadIndex); - - private: - std::atomic<size_t> numThreads; - std::atomic<size_t> numThreadsRunning; - bool set_affinity; - std::atomic<bool> running; - std::vector<thread_t> threads; - - private: - MutexSys mutex; - ConditionSys condition; - std::list<Ref<TaskScheduler> > schedulers; - }; - - TaskScheduler (); - ~TaskScheduler (); - - /*! initializes the task scheduler */ - static void create(size_t numThreads, bool set_affinity, bool start_threads); - - /*! destroys the task scheduler again */ - static void destroy(); - - /*! lets new worker threads join the tasking system */ - void join(); - void reset(); - - /*! let a worker thread allocate a thread index */ - dll_export ssize_t allocThreadIndex(); - - /*! wait for some number of threads available (threadCount includes main thread) */ - void wait_for_threads(size_t threadCount); - - /*! thread loop for all worker threads */ - // -- GODOT start -- - // std::exception_ptr thread_loop(size_t threadIndex); - void thread_loop(size_t threadIndex); - // -- GODOT end -- - - /*! 
steals a task from a different thread */ - bool steal_from_other_threads(Thread& thread); - - template<typename Predicate, typename Body> - static void steal_loop(Thread& thread, const Predicate& pred, const Body& body); - - /* spawn a new task at the top of the threads task stack */ - template<typename Closure> - void spawn_root(const Closure& closure, size_t size = 1, bool useThreadPool = true) - { - if (useThreadPool) startThreads(); - - size_t threadIndex = allocThreadIndex(); - std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation - Thread& thread = *mthread; - assert(threadLocal[threadIndex].load() == nullptr); - threadLocal[threadIndex] = &thread; - Thread* oldThread = swapThread(&thread); - thread.tasks.push_right(thread,size,closure); - { - Lock<MutexSys> lock(mutex); - anyTasksRunning++; - hasRootTask = true; - condition.notify_all(); - } - - if (useThreadPool) addScheduler(this); - - while (thread.tasks.execute_local(thread,nullptr)); - anyTasksRunning--; - if (useThreadPool) removeScheduler(this); - - threadLocal[threadIndex] = nullptr; - swapThread(oldThread); - - /* remember exception to throw */ - std::exception_ptr except = nullptr; - if (cancellingException != nullptr) except = cancellingException; - - /* wait for all threads to terminate */ - threadCounter--; - while (threadCounter > 0) yield(); - cancellingException = nullptr; - - /* re-throw proper exception */ - if (except != nullptr) - std::rethrow_exception(except); - } - - /* spawn a new task at the top of the threads task stack */ - template<typename Closure> - static __forceinline void spawn(size_t size, const Closure& closure) - { - Thread* thread = TaskScheduler::thread(); - if (likely(thread != nullptr)) thread->tasks.push_right(*thread,size,closure); - else instance()->spawn_root(closure,size); - } - - /* spawn a new task at the top of the threads task stack */ - template<typename Closure> - static __forceinline void spawn(const Closure& 
closure) { - spawn(1,closure); - } - - /* spawn a new task set */ - template<typename Index, typename Closure> - static void spawn(const Index begin, const Index end, const Index blockSize, const Closure& closure) - { - spawn(end-begin, [=]() - { - if (end-begin <= blockSize) { - return closure(range<Index>(begin,end)); - } - const Index center = (begin+end)/2; - spawn(begin,center,blockSize,closure); - spawn(center,end ,blockSize,closure); - wait(); - }); - } - - /* work on spawned subtasks and wait until all have finished */ - dll_export static bool wait(); - - /* returns the ID of the current thread */ - dll_export static size_t threadID(); - - /* returns the index (0..threadCount-1) of the current thread */ - dll_export static size_t threadIndex(); - - /* returns the total number of threads */ - dll_export static size_t threadCount(); - - private: - - /* returns the thread local task list of this worker thread */ - dll_export static Thread* thread(); - - /* sets the thread local task list of this worker thread */ - dll_export static Thread* swapThread(Thread* thread); - - /*! returns the taskscheduler object to be used by the master thread */ - dll_export static TaskScheduler* instance(); - - /*! starts the threads */ - dll_export static void startThreads(); - - /*! adds a task scheduler object for scheduling */ - dll_export static void addScheduler(const Ref<TaskScheduler>& scheduler); - - /*! 
remove the task scheduler object again */ - dll_export static void removeScheduler(const Ref<TaskScheduler>& scheduler); - - private: - std::vector<atomic<Thread*>> threadLocal; - std::atomic<size_t> threadCounter; - std::atomic<size_t> anyTasksRunning; - std::atomic<bool> hasRootTask; - std::exception_ptr cancellingException; - MutexSys mutex; - ConditionSys condition; - - private: - static size_t g_numThreads; - static __thread TaskScheduler* g_instance; - static __thread Thread* thread_local_thread; - static ThreadPool* threadPool; - }; - - RTC_NAMESPACE_END - -#if defined(RTC_NAMESPACE) - using RTC_NAMESPACE::TaskScheduler; -#endif -} diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h b/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h deleted file mode 100644 index 776f98cdac..0000000000 --- a/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" -#include "../sys/alloc.h" -#include "../sys/barrier.h" -#include "../sys/thread.h" -#include "../sys/mutex.h" -#include "../sys/condition.h" -#include "../sys/ref.h" - -#if !defined(__WIN32__) -#error PPL tasking system only available under windows -#endif - -#include <ppl.h> - -namespace embree -{ - struct TaskScheduler - { - /*! initializes the task scheduler */ - static void create(size_t numThreads, bool set_affinity, bool start_threads); - - /*! destroys the task scheduler again */ - static void destroy(); - - /* returns the ID of the current thread */ - static __forceinline size_t threadID() { - return GetCurrentThreadId(); - } - - /* returns the index (0..threadCount-1) of the current thread */ - /* FIXME: threadIndex is NOT supported by PPL! 
*/ - static __forceinline size_t threadIndex() { - return 0; - } - - /* returns the total number of threads */ - static __forceinline size_t threadCount() { - return GetMaximumProcessorCount(ALL_PROCESSOR_GROUPS) + 1; - } - }; -}; diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h b/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h deleted file mode 100644 index 98dba26871..0000000000 --- a/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" -#include "../sys/alloc.h" -#include "../sys/barrier.h" -#include "../sys/thread.h" -#include "../sys/mutex.h" -#include "../sys/condition.h" -#include "../sys/ref.h" - -#if defined(__WIN32__) -# define NOMINMAX -#endif - -// We need to define these to avoid implicit linkage against -// tbb_debug.lib under Windows. When removing these lines debug build -// under Windows fails. -#define __TBB_NO_IMPLICIT_LINKAGE 1 -#define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1 -#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1 -#define TBB_PREVIEW_ISOLATED_TASK_GROUP 1 -#include "tbb/tbb.h" -#include "tbb/parallel_sort.h" - -namespace embree -{ - struct TaskScheduler - { - /*! initializes the task scheduler */ - static void create(size_t numThreads, bool set_affinity, bool start_threads); - - /*! 
destroys the task scheduler again */ - static void destroy(); - - /* returns the ID of the current thread */ - static __forceinline size_t threadID() - { - return threadIndex(); - } - - /* returns the index (0..threadCount-1) of the current thread */ - static __forceinline size_t threadIndex() - { -#if TBB_INTERFACE_VERSION >= 9100 - return tbb::this_task_arena::current_thread_index(); -#elif TBB_INTERFACE_VERSION >= 9000 - return tbb::task_arena::current_thread_index(); -#else - return 0; -#endif - } - - /* returns the total number of threads */ - static __forceinline size_t threadCount() { -#if TBB_INTERFACE_VERSION >= 9100 - return tbb::this_task_arena::max_concurrency(); -#else - return tbb::task_scheduler_init::default_num_threads(); -#endif - } - - }; - -}; |